From 6c6f7fe8a850ca08f9a8774de020163a2a7c2164 Mon Sep 17 00:00:00 2001
From: Mengqing Cao <cmq0113@163.com>
Date: Fri, 27 Dec 2024 16:45:25 +0800
Subject: [PATCH 001/462] [Platform] Move model arch check to platform (#11503)

Signed-off-by: Mengqing Cao <cmq0113@163.com>
---
 vllm/model_executor/models/registry.py | 37 +-----------------------
 vllm/platforms/interface.py            | 12 ++++++++
 vllm/platforms/rocm.py                 | 39 +++++++++++++++++++++++++-
 3 files changed, 51 insertions(+), 37 deletions(-)

diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index feb33bb373c3e..89992de7e238d 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -187,31 +187,6 @@
     **_SPECULATIVE_DECODING_MODELS,
 }
 
-# Models not supported by ROCm.
-_ROCM_UNSUPPORTED_MODELS: List[str] = []
-
-# Models partially supported by ROCm.
-# Architecture -> Reason.
-_ROCM_SWA_REASON = ("Sliding window attention (SWA) is not yet supported in "
-                    "Triton flash attention. For half-precision SWA support, "
-                    "please use CK flash attention by setting "
-                    "`VLLM_USE_TRITON_FLASH_ATTN=0`")
-_ROCM_PARTIALLY_SUPPORTED_MODELS: Dict[str, str] = {
-    "Qwen2ForCausalLM":
-    _ROCM_SWA_REASON,
-    "MistralForCausalLM":
-    _ROCM_SWA_REASON,
-    "MixtralForCausalLM":
-    _ROCM_SWA_REASON,
-    "PaliGemmaForConditionalGeneration":
-    ("ROCm flash attention does not yet "
-     "fully support 32-bit precision on PaliGemma"),
-    "Phi3VForCausalLM":
-    ("ROCm Triton flash attention may run into compilation errors due to "
-     "excessive use of shared memory. If this happens, disable Triton FA "
-     "by setting `VLLM_USE_TRITON_FLASH_ATTN=0`")
-}
-
 
 @dataclass(frozen=True)
 class _ModelInfo:
@@ -297,17 +272,7 @@ def _try_load_model_cls(
     model_arch: str,
     model: _BaseRegisteredModel,
 ) -> Optional[Type[nn.Module]]:
-    if current_platform.is_rocm():
-        if model_arch in _ROCM_UNSUPPORTED_MODELS:
-            raise ValueError(f"Model architecture '{model_arch}' is not "
-                             "supported by ROCm for now.")
-
-        if model_arch in _ROCM_PARTIALLY_SUPPORTED_MODELS:
-            msg = _ROCM_PARTIALLY_SUPPORTED_MODELS[model_arch]
-            logger.warning(
-                "Model architecture '%s' is partially "
-                "supported by ROCm: %s", model_arch, msg)
-
+    current_platform.verify_model_arch(model_arch)
     try:
         return model.load_model_cls()
     except Exception:
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
index 4150b0cdf836a..ddccaa2ce0148 100644
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -199,6 +199,18 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         """
         pass
 
+    @classmethod
+    def verify_model_arch(cls, model_arch: str) -> None:
+        """
+        Verify whether the current platform supports the specified model
+        architecture.
+
+        - This will raise an Error or Warning based on the model support on
+        the current platform.
+        - By default all models are considered supported.
+        """
+        pass
+
     @classmethod
     def verify_quantization(cls, quant: str) -> None:
         """
diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index 7778b565372cb..aa779f265135f 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -1,6 +1,6 @@
 import os
 from functools import lru_cache
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING, Dict, List, Optional
 
 import torch
 
@@ -33,6 +33,31 @@
                    " `spawn` instead.")
     os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
 
+# Models not supported by ROCm.
+_ROCM_UNSUPPORTED_MODELS: List[str] = []
+
+# Models partially supported by ROCm.
+# Architecture -> Reason.
+_ROCM_SWA_REASON = ("Sliding window attention (SWA) is not yet supported in "
+                    "Triton flash attention. For half-precision SWA support, "
+                    "please use CK flash attention by setting "
+                    "`VLLM_USE_TRITON_FLASH_ATTN=0`")
+_ROCM_PARTIALLY_SUPPORTED_MODELS: Dict[str, str] = {
+    "Qwen2ForCausalLM":
+    _ROCM_SWA_REASON,
+    "MistralForCausalLM":
+    _ROCM_SWA_REASON,
+    "MixtralForCausalLM":
+    _ROCM_SWA_REASON,
+    "PaliGemmaForConditionalGeneration":
+    ("ROCm flash attention does not yet "
+     "fully support 32-bit precision on PaliGemma"),
+    "Phi3VForCausalLM":
+    ("ROCm Triton flash attention may run into compilation errors due to "
+     "excessive use of shared memory. If this happens, disable Triton FA "
+     "by setting `VLLM_USE_TRITON_FLASH_ATTN=0`")
+}
+
 
 class RocmPlatform(Platform):
     _enum = PlatformEnum.ROCM
@@ -102,6 +127,18 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
             else:
                 parallel_config.worker_cls = "vllm.worker.worker.Worker"
 
+    @classmethod
+    def verify_model_arch(cls, model_arch: str) -> None:
+        if model_arch in _ROCM_UNSUPPORTED_MODELS:
+            raise ValueError(f"Model architecture '{model_arch}' is not "
+                             "supported by ROCm for now.")
+
+        if model_arch in _ROCM_PARTIALLY_SUPPORTED_MODELS:
+            msg = _ROCM_PARTIALLY_SUPPORTED_MODELS[model_arch]
+            logger.warning(
+                "Model architecture '%s' is partially "
+                "supported by ROCm: %s", model_arch, msg)
+
     @classmethod
     def verify_quantization(cls, quant: str) -> None:
         super().verify_quantization(quant)

From d003f3ea391b4c879f6f848dd485dd3c04fa6ca9 Mon Sep 17 00:00:00 2001
From: AlexHe99 <alehe@amd.com>
Date: Fri, 27 Dec 2024 18:00:04 +0800
Subject: [PATCH 002/462] Update deploying_with_k8s.md with AMD ROCm GPU
 example (#11465)

Signed-off-by: Alex He <alehe@amd.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
---
 docs/source/serving/deploying_with_k8s.md | 79 ++++++++++++++++++++++-
 1 file changed, 78 insertions(+), 1 deletion(-)

diff --git a/docs/source/serving/deploying_with_k8s.md b/docs/source/serving/deploying_with_k8s.md
index d27db826cd006..77f848088ea43 100644
--- a/docs/source/serving/deploying_with_k8s.md
+++ b/docs/source/serving/deploying_with_k8s.md
@@ -47,7 +47,11 @@ data:
   token: "REPLACE_WITH_TOKEN"
 ```
 
-Create a deployment file for vLLM to run the model server. The following example deploys the `Mistral-7B-Instruct-v0.3` model:
+Next to create the deployment file for vLLM to run the model server. The following example deploys the `Mistral-7B-Instruct-v0.3` model.
+
+Here are two examples for using NVIDIA GPU and AMD GPU. 
+
+- NVIDIA GPU
 
 ```yaml
 apiVersion: apps/v1
@@ -119,6 +123,79 @@ spec:
           periodSeconds: 5
 ```
 
+- AMD GPU
+
+You can refer to the `deployment.yaml` below if using AMD ROCm GPU like MI300X.
+
+```yaml
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: mistral-7b
+  namespace: default
+  labels:
+    app: mistral-7b
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: mistral-7b
+  template:
+    metadata:
+      labels:
+        app: mistral-7b
+    spec:
+      volumes:
+      # PVC
+      - name: cache-volume
+        persistentVolumeClaim:
+          claimName: mistral-7b
+      # vLLM needs to access the host's shared memory for tensor parallel inference.
+      - name: shm
+        emptyDir:
+          medium: Memory
+          sizeLimit: "8Gi"
+      hostNetwork: true
+      hostIPC: true
+      containers:
+      - name: mistral-7b
+        image: rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4
+        securityContext:
+          seccompProfile:
+            type: Unconfined
+          runAsGroup: 44
+          capabilities:
+            add:
+            - SYS_PTRACE
+        command: ["/bin/sh", "-c"]
+        args: [
+          "vllm serve mistralai/Mistral-7B-v0.3 --port 8000 --trust-remote-code --enable-chunked-prefill --max_num_batched_tokens 1024"
+        ]
+        env:
+        - name: HUGGING_FACE_HUB_TOKEN
+          valueFrom:
+            secretKeyRef:
+              name: hf-token-secret
+              key: token
+        ports:
+        - containerPort: 8000
+        resources:
+          limits:
+            cpu: "10"
+            memory: 20G
+            amd.com/gpu: "1"
+          requests:
+            cpu: "6"
+            memory: 6G
+            amd.com/gpu: "1"
+        volumeMounts:
+        - name: cache-volume
+          mountPath: /root/.cache/huggingface
+        - name: shm
+          mountPath: /dev/shm
+```
+You can get the full example with steps and sample yaml files from <https://github.com/ROCm/k8s-device-plugin/tree/master/example/vllm-serve>.
+
 2. **Create a Kubernetes Service for vLLM**
 
 Next, create a Kubernetes Service file to expose the `mistral-7b` deployment:

From 2c9b8ea2b006e763b8268b8ab02181c9822cfe76 Mon Sep 17 00:00:00 2001
From: Jee Jee Li <pandaleefree@gmail.com>
Date: Fri, 27 Dec 2024 18:39:15 +0800
Subject: [PATCH 003/462] [Bugfix] Fix TeleChat2ForCausalLM weights mapper
 (#11546)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
---
 vllm/model_executor/models/telechat2.py | 26 ++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/vllm/model_executor/models/telechat2.py b/vllm/model_executor/models/telechat2.py
index 28c37bb96612c..02ca7fe08e556 100644
--- a/vllm/model_executor/models/telechat2.py
+++ b/vllm/model_executor/models/telechat2.py
@@ -31,19 +31,6 @@
 
 class TeleChat2Model(LlamaModel):
 
-    hf_to_vllm_mapper = WeightsMapper(
-        orig_to_new_prefix={
-            "transformer.": "model.",
-        },
-        orig_to_new_substr={
-            ".h.": ".layers.",
-            ".self_attention.": ".self_attn.",
-            ".word_embeddings.": ".embed_tokens.",
-            ".dense.": ".o_proj.",
-            ".ln_f.": ".norm.",
-        },
-    )
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         # 1. Initialize the LlamaModel with bias
         vllm_config.model_config.hf_config.bias = True
@@ -118,6 +105,19 @@ def load_weights(self, weights: Iterable[Tuple[str,
 
 class TeleChat2ForCausalLM(LlamaForCausalLM):
 
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_prefix={
+            "transformer.": "model.",
+        },
+        orig_to_new_substr={
+            ".h.": ".layers.",
+            ".self_attention.": ".self_attn.",
+            ".word_embeddings.": ".embed_tokens.",
+            ".dense.": ".o_proj.",
+            ".ln_f.": ".norm.",
+        },
+    )
+
     def _init_model(self, vllm_config: VllmConfig, prefix: str = ""):
         return TeleChat2Model(vllm_config=vllm_config, prefix=prefix)
 

From 7af553ea30031446b4c1c74ad83187f9fd3de4e7 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Fri, 27 Dec 2024 19:21:23 +0800
Subject: [PATCH 004/462] [Misc] Abstract the logic for reading and writing
 media content (#11527)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 tests/entrypoints/openai/test_serving_chat.py |   1 +
 tests/entrypoints/test_chat_utils.py          |   6 +-
 tests/multimodal/test_utils.py                |  59 ++-
 vllm/assets/audio.py                          |   6 +-
 vllm/entrypoints/chat_utils.py                | 129 +++--
 vllm/multimodal/audio.py                      |  36 +-
 vllm/multimodal/base.py                       |  38 +-
 vllm/multimodal/image.py                      |  41 +-
 vllm/multimodal/utils.py                      | 477 ++++++++----------
 vllm/multimodal/video.py                      |  87 +++-
 10 files changed, 493 insertions(+), 387 deletions(-)

diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py
index 51b255bb2a6db..61677b65af342 100644
--- a/tests/entrypoints/openai/test_serving_chat.py
+++ b/tests/entrypoints/openai/test_serving_chat.py
@@ -33,6 +33,7 @@ class MockModelConfig:
     hf_config = MockHFConfig()
     logits_processor_pattern = None
     diff_sampling_param: Optional[dict] = None
+    allowed_local_media_path: str = ""
 
     def get_diff_sampling_param(self):
         return self.diff_sampling_param or {}
diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py
index 996e60bfee592..d63b963522e73 100644
--- a/tests/entrypoints/test_chat_utils.py
+++ b/tests/entrypoints/test_chat_utils.py
@@ -2,7 +2,6 @@
 from typing import Optional
 
 import pytest
-from PIL import Image
 
 from vllm.assets.image import ImageAsset
 from vllm.config import ModelConfig
@@ -91,10 +90,7 @@ def _assert_mm_data_is_image_input(
     image_data = mm_data.get("image")
     assert image_data is not None
 
-    if image_count == 1:
-        assert isinstance(image_data, Image.Image)
-    else:
-        assert isinstance(image_data, list) and len(image_data) == image_count
+    assert isinstance(image_data, list) and len(image_data) == image_count
 
 
 def test_parse_chat_messages_single_image(
diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py
index fd82fb0c55fd7..6029f2e514772 100644
--- a/tests/multimodal/test_utils.py
+++ b/tests/multimodal/test_utils.py
@@ -9,7 +9,7 @@
 from PIL import Image, ImageChops
 from transformers import AutoConfig, AutoTokenizer
 
-from vllm.multimodal.utils import (async_fetch_image, fetch_image,
+from vllm.multimodal.utils import (MediaConnector,
                                    repeat_and_pad_placeholder_tokens)
 
 # Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
@@ -23,7 +23,12 @@
 
 @pytest.fixture(scope="module")
 def url_images() -> Dict[str, Image.Image]:
-    return {image_url: fetch_image(image_url) for image_url in TEST_IMAGE_URLS}
+    connector = MediaConnector()
+
+    return {
+        image_url: connector.fetch_image(image_url)
+        for image_url in TEST_IMAGE_URLS
+    }
 
 
 def get_supported_suffixes() -> Tuple[str, ...]:
@@ -43,8 +48,10 @@ def _image_equals(a: Image.Image, b: Image.Image) -> bool:
 @pytest.mark.asyncio
 @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
 async def test_fetch_image_http(image_url: str):
-    image_sync = fetch_image(image_url)
-    image_async = await async_fetch_image(image_url)
+    connector = MediaConnector()
+
+    image_sync = connector.fetch_image(image_url)
+    image_async = await connector.fetch_image_async(image_url)
     assert _image_equals(image_sync, image_async)
 
 
@@ -53,6 +60,7 @@ async def test_fetch_image_http(image_url: str):
 @pytest.mark.parametrize("suffix", get_supported_suffixes())
 async def test_fetch_image_base64(url_images: Dict[str, Image.Image],
                                   image_url: str, suffix: str):
+    connector = MediaConnector()
     url_image = url_images[image_url]
 
     try:
@@ -75,48 +83,49 @@ async def test_fetch_image_base64(url_images: Dict[str, Image.Image],
         base64_image = base64.b64encode(f.read()).decode("utf-8")
         data_url = f"data:{mime_type};base64,{base64_image}"
 
-        data_image_sync = fetch_image(data_url)
+        data_image_sync = connector.fetch_image(data_url)
         if _image_equals(url_image, Image.open(f)):
             assert _image_equals(url_image, data_image_sync)
         else:
             pass  # Lossy format; only check that image can be opened
 
-        data_image_async = await async_fetch_image(data_url)
+        data_image_async = await connector.fetch_image_async(data_url)
         assert _image_equals(data_image_sync, data_image_async)
 
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
 async def test_fetch_image_local_files(image_url: str):
+    connector = MediaConnector()
+
     with TemporaryDirectory() as temp_dir:
-        origin_image = fetch_image(image_url)
+        local_connector = MediaConnector(allowed_local_media_path=temp_dir)
+
+        origin_image = connector.fetch_image(image_url)
         origin_image.save(os.path.join(temp_dir, os.path.basename(image_url)),
                           quality=100,
                           icc_profile=origin_image.info.get('icc_profile'))
 
-        image_async = await async_fetch_image(
-            f"file://{temp_dir}/{os.path.basename(image_url)}",
-            allowed_local_media_path=temp_dir)
-
-        image_sync = fetch_image(
-            f"file://{temp_dir}/{os.path.basename(image_url)}",
-            allowed_local_media_path=temp_dir)
+        image_async = await local_connector.fetch_image_async(
+            f"file://{temp_dir}/{os.path.basename(image_url)}")
+        image_sync = local_connector.fetch_image(
+            f"file://{temp_dir}/{os.path.basename(image_url)}")
         # Check that the images are equal
         assert not ImageChops.difference(image_sync, image_async).getbbox()
 
-        with pytest.raises(ValueError):
-            await async_fetch_image(
-                f"file://{temp_dir}/../{os.path.basename(image_url)}",
-                allowed_local_media_path=temp_dir)
-        with pytest.raises(ValueError):
-            await async_fetch_image(
+        with pytest.raises(ValueError, match="must be a subpath"):
+            await local_connector.fetch_image_async(
+                f"file://{temp_dir}/../{os.path.basename(image_url)}")
+        with pytest.raises(RuntimeError, match="Cannot load local files"):
+            await connector.fetch_image_async(
                 f"file://{temp_dir}/../{os.path.basename(image_url)}")
 
-        with pytest.raises(ValueError):
-            fetch_image(f"file://{temp_dir}/../{os.path.basename(image_url)}",
-                        allowed_local_media_path=temp_dir)
-        with pytest.raises(ValueError):
-            fetch_image(f"file://{temp_dir}/../{os.path.basename(image_url)}")
+        with pytest.raises(ValueError, match="must be a subpath"):
+            local_connector.fetch_image(
+                f"file://{temp_dir}/../{os.path.basename(image_url)}")
+        with pytest.raises(RuntimeError, match="Cannot load local files"):
+            connector.fetch_image(
+                f"file://{temp_dir}/../{os.path.basename(image_url)}")
 
 
 @pytest.mark.parametrize("model", ["llava-hf/llava-v1.6-mistral-7b-hf"])
diff --git a/vllm/assets/audio.py b/vllm/assets/audio.py
index 9033644e3264a..a46c67ad7e00e 100644
--- a/vllm/assets/audio.py
+++ b/vllm/assets/audio.py
@@ -21,12 +21,10 @@ class AudioAsset:
     name: Literal["winning_call", "mary_had_lamb"]
 
     @property
-    def audio_and_sample_rate(self) -> tuple[npt.NDArray, int]:
+    def audio_and_sample_rate(self) -> tuple[npt.NDArray, float]:
         audio_path = get_vllm_public_assets(filename=f"{self.name}.ogg",
                                             s3_prefix=ASSET_DIR)
-        y, sr = librosa.load(audio_path, sr=None)
-        assert isinstance(sr, int)
-        return y, sr
+        return librosa.load(audio_path, sr=None)
 
     @property
     def url(self) -> str:
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index 3df08c740d65b..a492d5496e025 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -6,7 +6,7 @@
 from functools import lru_cache, partial
 from pathlib import Path
 from typing import (Any, Awaitable, Callable, Dict, Generic, Iterable, List,
-                    Literal, Mapping, Optional, Tuple, TypeVar, Union, cast)
+                    Literal, Optional, Tuple, TypeVar, Union, cast)
 
 import jinja2.nodes
 import transformers.utils.chat_template_utils as hf_chat_utils
@@ -23,6 +23,8 @@
     ChatCompletionMessageParam as OpenAIChatCompletionMessageParam)
 from openai.types.chat import (ChatCompletionMessageToolCallParam,
                                ChatCompletionToolMessageParam)
+from openai.types.chat.chat_completion_content_part_input_audio_param import (
+    InputAudio)
 # yapf: enable
 # pydantic needs the TypedDict from typing_extensions
 from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
@@ -31,11 +33,7 @@
 from vllm.config import ModelConfig
 from vllm.logger import init_logger
 from vllm.multimodal import MultiModalDataDict
-from vllm.multimodal.utils import (async_get_and_parse_audio,
-                                   async_get_and_parse_image,
-                                   async_get_and_parse_video,
-                                   get_and_parse_audio, get_and_parse_image,
-                                   get_and_parse_video)
+from vllm.multimodal.utils import MediaConnector
 from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
 from vllm.utils import print_warning_once
 
@@ -368,14 +366,17 @@ def __init__(self, model_config: ModelConfig, tokenizer: AnyTokenizer):
         self._tokenizer = tokenizer
         self._allowed_items = (model_config.multimodal_config.limit_per_prompt
                                if model_config.multimodal_config else {})
-        self._consumed_items = {k: 0 for k in self._allowed_items}
 
-        self._items: List[_T] = []
+        self._items_by_modality = defaultdict[str, list[_T]](list)
 
     @property
     def model_config(self) -> ModelConfig:
         return self._model_config
 
+    @property
+    def allowed_local_media_path(self):
+        return self._model_config.allowed_local_media_path
+
     @staticmethod
     @lru_cache(maxsize=None)
     def _cached_token_str(tokenizer: AnyTokenizer, token_index: int) -> str:
@@ -435,38 +436,19 @@ def _placeholder_str(self, modality: ModalityStr,
         else:
             raise TypeError(f"Unknown modality: {modality}")
 
-    @staticmethod
-    def _combine(items: List[MultiModalDataDict]) -> MultiModalDataDict:
-        mm_lists: Mapping[str, List[object]] = defaultdict(list)
-
-        # Merge all the multi-modal items
-        for single_mm_data in items:
-            for mm_key, mm_item in single_mm_data.items():
-                if isinstance(mm_item, list):
-                    mm_lists[mm_key].extend(mm_item)
-                else:
-                    mm_lists[mm_key].append(mm_item)
-
-        # Unpack any single item lists for models that don't expect multiple.
-        return {
-            mm_key: mm_list[0] if len(mm_list) == 1 else mm_list
-            for mm_key, mm_list in mm_lists.items()
-        }
-
     def add(self, modality: ModalityStr, item: _T) -> Optional[str]:
         """
         Add a multi-modal item to the current prompt and returns the
         placeholder string to use, if any.
         """
         allowed_count = self._allowed_items.get(modality, 1)
-        current_count = self._consumed_items.get(modality, 0) + 1
+        current_count = len(self._items_by_modality[modality]) + 1
         if current_count > allowed_count:
             raise ValueError(
                 f"At most {allowed_count} {modality}(s) may be provided in "
                 "one request.")
 
-        self._consumed_items[modality] = current_count
-        self._items.append(item)
+        self._items_by_modality[modality].append(item)
 
         return self._placeholder_str(modality, current_count)
 
@@ -475,22 +457,26 @@ def create_parser(self) -> "BaseMultiModalContentParser":
         raise NotImplementedError
 
 
-class MultiModalItemTracker(BaseMultiModalItemTracker[MultiModalDataDict]):
+class MultiModalItemTracker(BaseMultiModalItemTracker[object]):
 
     def all_mm_data(self) -> Optional[MultiModalDataDict]:
-        return self._combine(self._items) if self._items else None
+        if self._items_by_modality:
+            return dict(self._items_by_modality)
+
+        return None
 
     def create_parser(self) -> "BaseMultiModalContentParser":
         return MultiModalContentParser(self)
 
 
-class AsyncMultiModalItemTracker(
-        BaseMultiModalItemTracker[Awaitable[MultiModalDataDict]]):
+class AsyncMultiModalItemTracker(BaseMultiModalItemTracker[Awaitable[object]]):
 
     async def all_mm_data(self) -> Optional[MultiModalDataDict]:
-        if self._items:
-            items = await asyncio.gather(*self._items)
-            return self._combine(items)
+        if self._items_by_modality:
+            return {
+                modality: await asyncio.gather(*items)
+                for modality, items in self._items_by_modality.items()
+            }
 
         return None
 
@@ -522,7 +508,7 @@ def parse_audio(self, audio_url: str) -> None:
         raise NotImplementedError
 
     @abstractmethod
-    def parse_input_audio(self, input_audio: Dict[str, str]) -> None:
+    def parse_input_audio(self, input_audio: InputAudio) -> None:
         raise NotImplementedError
 
     @abstractmethod
@@ -537,31 +523,31 @@ def __init__(self, tracker: MultiModalItemTracker) -> None:
 
         self._tracker = tracker
 
+        self._connector = MediaConnector(
+            allowed_local_media_path=tracker.allowed_local_media_path,
+        )
+
     def parse_image(self, image_url: str) -> None:
-        image = get_and_parse_image(image_url,
-                                    allowed_local_media_path=self._tracker.
-                                    _model_config.allowed_local_media_path)
+        image = self._connector.fetch_image(image_url)
 
         placeholder = self._tracker.add("image", image)
         self._add_placeholder(placeholder)
 
     def parse_audio(self, audio_url: str) -> None:
-        audio = get_and_parse_audio(audio_url)
+        audio = self._connector.fetch_audio(audio_url)
 
         placeholder = self._tracker.add("audio", audio)
         self._add_placeholder(placeholder)
 
-    def parse_input_audio(self, input_audio: Dict[str, str]) -> None:
-        input_audio_data = input_audio.get("data","")
-        input_audio_format = input_audio.get("format","")
-        audio_url = f"data:audio/{input_audio_format};base64,{input_audio_data}"
-        audio = get_and_parse_audio(audio_url)
+    def parse_input_audio(self, input_audio: InputAudio) -> None:
+        audio_data = input_audio.get("data", "")
+        audio_format = input_audio.get("format", "")
+        audio_url = f"data:audio/{audio_format};base64,{audio_data}"
 
-        placeholder = self._tracker.add("audio", audio)
-        self._add_placeholder(placeholder)
+        return self.parse_audio(audio_url)
 
     def parse_video(self, video_url: str) -> None:
-        video = get_and_parse_video(video_url)
+        video = self._connector.fetch_video(video_url)
 
         placeholder = self._tracker.add("video", video)
         self._add_placeholder(placeholder)
@@ -573,33 +559,31 @@ def __init__(self, tracker: AsyncMultiModalItemTracker) -> None:
         super().__init__()
 
         self._tracker = tracker
+        self._connector = MediaConnector(
+            allowed_local_media_path=tracker.allowed_local_media_path,
+        )
 
     def parse_image(self, image_url: str) -> None:
-        image_coro = async_get_and_parse_image(
-            image_url,
-            allowed_local_media_path=self._tracker._model_config.
-            allowed_local_media_path)
+        image_coro = self._connector.fetch_image_async(image_url)
 
         placeholder = self._tracker.add("image", image_coro)
         self._add_placeholder(placeholder)
 
     def parse_audio(self, audio_url: str) -> None:
-        audio_coro = async_get_and_parse_audio(audio_url)
+        audio_coro = self._connector.fetch_audio_async(audio_url)
 
         placeholder = self._tracker.add("audio", audio_coro)
         self._add_placeholder(placeholder)
 
-    def parse_input_audio(self, input_audio: Dict[str, str]) -> None:
-        input_audio_data = input_audio.get("data","")
-        input_audio_format = input_audio.get("format","")
-        audio_url = f"data:audio/{input_audio_format};base64,{input_audio_data}"
-        audio_coro = async_get_and_parse_audio(audio_url)
+    def parse_input_audio(self, input_audio: InputAudio) -> None:
+        audio_data = input_audio.get("data", "")
+        audio_format = input_audio.get("format", "")
+        audio_url = f"data:audio/{audio_format};base64,{audio_data}"
 
-        placeholder = self._tracker.add("audio", audio_coro)
-        self._add_placeholder(placeholder)
+        return self.parse_audio(audio_url)
 
     def parse_video(self, video_url: str) -> None:
-        video = async_get_and_parse_video(video_url)
+        video = self._connector.fetch_video_async(video_url)
 
         placeholder = self._tracker.add("video", video)
         self._add_placeholder(placeholder)
@@ -695,10 +679,13 @@ def _get_full_multimodal_text_prompt(placeholder_counts: Dict[str, int],
 _RefusalParser = partial(cast, ChatCompletionContentPartRefusalParam)
 _VideoParser = partial(cast, ChatCompletionContentPartVideoParam)
 
+_ContentPart: TypeAlias = Union[str, Dict[str, str], InputAudio]
+
 # Define a mapping from part types to their corresponding parsing functions.
-MM_PARSER_MAP: Dict[str,
-                    Callable[[ChatCompletionContentPartParam],
-                             Union[str, Dict[str,str]]]] = {
+MM_PARSER_MAP: Dict[
+    str,
+    Callable[[ChatCompletionContentPartParam], _ContentPart],
+] = {
     "text":
     lambda part: _TextParser(part).get("text", ""),
     "image_url":
@@ -715,8 +702,7 @@ def _get_full_multimodal_text_prompt(placeholder_counts: Dict[str, int],
 
 
 def _parse_chat_message_content_mm_part(
-        part: ChatCompletionContentPartParam) -> Tuple[str,
-                                                Union[str, Dict[str, str]]]:
+        part: ChatCompletionContentPartParam) -> tuple[str, _ContentPart]:
     """
     Parses a given multi-modal content part based on its type.
 
@@ -783,7 +769,7 @@ def _parse_chat_message_content_parts(
     *,
     wrap_dicts: bool,
 ) -> List[ConversationMessage]:
-    content: List[Union[str, Dict[str, str]]] = []
+    content = list[_ContentPart]()
 
     mm_parser = mm_tracker.create_parser()
 
@@ -814,7 +800,7 @@ def _parse_chat_message_content_part(
     mm_parser: BaseMultiModalContentParser,
     *,
     wrap_dicts: bool,
-) -> Optional[Union[str, Dict[str, str]]]:
+) -> Optional[_ContentPart]:
     """Parses a single part of a conversation. If wrap_dicts is True,
     structured dictionary pieces for texts and images will be
     wrapped in dictionaries, i.e., {"type": "text", "text", ...} and
@@ -823,8 +809,7 @@ def _parse_chat_message_content_part(
     with multimodal placeholders.
     """
     if isinstance(part, str):  # Handle plain text parts
-        text = _TextParser(part)
-        return text
+        return part
 
     # Handle structured dictionary parts
     part_type, content = _parse_chat_message_content_mm_part(part)
@@ -855,7 +840,7 @@ def _parse_chat_message_content_part(
         return {'type': 'audio'} if wrap_dicts else None
 
     if part_type == "input_audio":
-        dict_content = cast(Dict[str, str], content)
+        dict_content = cast(InputAudio, content)
         mm_parser.parse_input_audio(dict_content)
         return {'type': 'audio'} if wrap_dicts else None
 
diff --git a/vllm/multimodal/audio.py b/vllm/multimodal/audio.py
index ed3bb82bf0aaa..3e09ef1fcbb56 100644
--- a/vllm/multimodal/audio.py
+++ b/vllm/multimodal/audio.py
@@ -1,10 +1,14 @@
+import base64
+from io import BytesIO
+from pathlib import Path
+
 import numpy as np
 import numpy.typing as npt
 
 from vllm.inputs.registry import InputContext
 from vllm.utils import PlaceholderModule
 
-from .base import MultiModalPlugin
+from .base import MediaIO, MultiModalPlugin
 from .inputs import AudioItem, MultiModalData, MultiModalKwargs
 
 try:
@@ -12,6 +16,11 @@
 except ImportError:
     librosa = PlaceholderModule("librosa")  # type: ignore[assignment]
 
+try:
+    import soundfile
+except ImportError:
+    soundfile = PlaceholderModule("soundfile")  # type: ignore[assignment]
+
 
 class AudioPlugin(MultiModalPlugin):
     """Plugin for audio data."""
@@ -39,3 +48,28 @@ def resample_audio(
     target_sr: float,
 ) -> npt.NDArray[np.floating]:
     return librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr)
+
+
+class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]):
+
+    def load_bytes(self, data: bytes) -> tuple[npt.NDArray, float]:
+        return librosa.load(BytesIO(data), sr=None)
+
+    def load_base64(
+        self,
+        media_type: str,
+        data: str,
+    ) -> tuple[npt.NDArray, float]:
+        return self.load_bytes(base64.b64decode(data))
+
+    def load_file(self, filepath: Path) -> tuple[npt.NDArray, float]:
+        return librosa.load(filepath, sr=None)
+
+    def encode_base64(self, media: tuple[npt.NDArray, float]) -> str:
+        audio, sr = media
+
+        with BytesIO() as buffer:
+            soundfile.write(buffer, audio, sr, format="WAV")
+            data = buffer.getvalue()
+
+        return base64.b64encode(data).decode('utf-8')
diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py
index 1e5a46946c6c0..10488e24b30cc 100644
--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
@@ -1,6 +1,7 @@
 from abc import ABC, abstractmethod
 from collections import defaultdict
-from typing import (TYPE_CHECKING, Any, Callable, Dict, List, NamedTuple,
+from pathlib import Path
+from typing import (TYPE_CHECKING, Any, Callable, Generic, NamedTuple,
                     Optional, Sequence, Tuple, Type, TypeVar, Union)
 
 from torch import nn
@@ -118,7 +119,7 @@ def map_input(
         self,
         model_config: "ModelConfig",
         data: MultiModalData[Any],
-        mm_processor_kwargs: Optional[Dict[str, Any]],
+        mm_processor_kwargs: Optional[dict[str, Any]],
     ) -> MultiModalKwargs:
         """
         Transform the data into a dictionary of model inputs using the
@@ -254,10 +255,10 @@ class MultiModalPlaceholderMap:
     """
 
     class IndexMap(NamedTuple):
-        src: List[int]
-        dest: List[int]
+        src: list[int]
+        dest: list[int]
 
-    src_ranges: List[range]
+    src_ranges: list[range]
     """
     The indices of the multi-modal embeddings that will replace the
     corresponding placeholder embeddings pointed to by ``dest_ranges``.
@@ -268,7 +269,7 @@ class IndexMap(NamedTuple):
     The total number of flattened multi-modal embeddings.
     """
 
-    dest_ranges: List[range]
+    dest_ranges: list[range]
     """
     The indices of the placeholder embeddings that will be replaced by the
     multimodal embeddings.
@@ -288,7 +289,7 @@ def __init__(self):
     @classmethod
     def from_seq_group(
         cls, seq_group: "SequenceGroupMetadata", positions: range
-    ) -> Tuple[Optional[MultiModalDataDict], Dict[str,
+    ) -> Tuple[Optional[MultiModalDataDict], dict[str,
                                                   "MultiModalPlaceholderMap"]]:
         """
         Returns the multi-modal items that intersect with the portion of a
@@ -376,9 +377,9 @@ def from_seq_group(
     def append_items_from_seq_group(
         self,
         positions: range,
-        multi_modal_items: List[_T],
+        multi_modal_items: list[_T],
         multi_modal_placeholders: Sequence[PlaceholderRange],
-    ) -> List[_T]:
+    ) -> list[_T]:
         """
         Adds the multi-modal items that intersect ```positions`` to this
         placeholder map and returns the intersecting items.
@@ -454,3 +455,22 @@ def index_map(self) -> "IndexMap":
 
         return MultiModalPlaceholderMap.IndexMap(src=src_indices,
                                                  dest=dest_indices)
+
+
+class MediaIO(ABC, Generic[_T]):
+
+    @abstractmethod
+    def load_bytes(self, data: bytes) -> _T:
+        raise NotImplementedError
+
+    @abstractmethod
+    def load_base64(self, media_type: str, data: str) -> _T:
+        """
+        List of media types:
+        https://www.iana.org/assignments/media-types/media-types.xhtml
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def load_file(self, filepath: Path) -> _T:
+        raise NotImplementedError
diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py
index c705e1a3d1554..14c79dfadec0c 100644
--- a/vllm/multimodal/image.py
+++ b/vllm/multimodal/image.py
@@ -1,4 +1,7 @@
+import base64
 from functools import lru_cache
+from io import BytesIO
+from pathlib import Path
 from typing import TYPE_CHECKING, Any, Dict, Optional
 
 import torch
@@ -9,7 +12,7 @@
 from vllm.transformers_utils.processor import get_image_processor
 from vllm.utils import is_list_of
 
-from .base import MultiModalPlugin
+from .base import MediaIO, MultiModalPlugin
 from .inputs import ImageItem, MultiModalData, MultiModalKwargs
 
 if TYPE_CHECKING:
@@ -96,3 +99,39 @@ def rescale_image_size(image: Image.Image,
     if transpose >= 0:
         image = image.transpose(Image.Transpose(transpose))
     return image
+
+
+class ImageMediaIO(MediaIO[Image.Image]):
+
+    def __init__(self, *, image_mode: str = "RGB") -> None:
+        super().__init__()
+
+        self.image_mode = image_mode
+
+    def load_bytes(self, data: bytes) -> Image.Image:
+        image = Image.open(BytesIO(data))
+        image.load()
+        return image.convert(self.image_mode)
+
+    def load_base64(self, media_type: str, data: str) -> Image.Image:
+        return self.load_bytes(base64.b64decode(data))
+
+    def load_file(self, filepath: Path) -> Image.Image:
+        image = Image.open(filepath)
+        image.load()
+        return image.convert(self.image_mode)
+
+    def encode_base64(
+        self,
+        media: Image.Image,
+        *,
+        image_format: str = "JPEG",
+    ) -> str:
+        image = media
+
+        with BytesIO() as buffer:
+            image = image.convert(self.image_mode)
+            image.save(buffer, image_format)
+            data = buffer.getvalue()
+
+        return base64.b64encode(data).decode('utf-8')
diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py
index a49da2bdee972..87b12a6fb33c1 100644
--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -1,8 +1,7 @@
-import base64
-import os
 from functools import lru_cache
-from io import BytesIO
-from typing import List, Optional, Tuple, TypeVar, Union
+from pathlib import Path
+from typing import Optional, TypeVar, Union
+from urllib.parse import ParseResult, urlparse
 
 import numpy as np
 import numpy.typing as npt
@@ -10,283 +9,246 @@
 from PIL import Image
 
 import vllm.envs as envs
-from vllm.connections import global_http_connection
+from vllm.connections import HTTPConnection, global_http_connection
 from vllm.logger import init_logger
 from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer
-from vllm.utils import PlaceholderModule
 
-from .inputs import MultiModalDataDict, PlaceholderRange
-
-try:
-    import decord
-except ImportError:
-    decord = PlaceholderModule("decord")  # type: ignore[assignment]
-
-try:
-    import librosa
-except ImportError:
-    librosa = PlaceholderModule("librosa")  # type: ignore[assignment]
-
-try:
-    import soundfile
-except ImportError:
-    soundfile = PlaceholderModule("soundfile")  # type: ignore[assignment]
+from .audio import AudioMediaIO
+from .base import MediaIO
+from .image import ImageMediaIO
+from .inputs import PlaceholderRange
+from .video import VideoMediaIO
 
 logger = init_logger(__name__)
 
 cached_get_tokenizer = lru_cache(get_tokenizer)
 
+_M = TypeVar("_M")
 
-def _load_image_from_bytes(b: bytes) -> Image.Image:
-    image = Image.open(BytesIO(b))
-    image.load()
-    return image
-
-
-def _is_subpath(image_path: str, allowed_local_media_path: str) -> bool:
-    # Get the common path
-    common_path = os.path.commonpath([
-        os.path.abspath(image_path),
-        os.path.abspath(allowed_local_media_path)
-    ])
-    # Check if the common path is the same as allowed_local_media_path
-    return common_path == os.path.abspath(allowed_local_media_path)
 
+class MediaConnector:
 
-def _load_image_from_file(image_url: str,
-                          allowed_local_media_path: str) -> Image.Image:
-    if not allowed_local_media_path:
-        raise ValueError("Invalid 'image_url': Cannot load local files without"
-                         "'--allowed-local-media-path'.")
-    if allowed_local_media_path:
-        if not os.path.exists(allowed_local_media_path):
-            raise ValueError(
-                "Invalid '--allowed-local-media-path': "
-                f"The path {allowed_local_media_path} does not exist.")
-        if not os.path.isdir(allowed_local_media_path):
+    def __init__(
+        self,
+        connection: HTTPConnection = global_http_connection,
+        *,
+        allowed_local_media_path: str = "",
+    ) -> None:
+        super().__init__()
+
+        self.connection = connection
+
+        if allowed_local_media_path:
+            allowed_local_media_path_ = Path(allowed_local_media_path)
+
+            if not allowed_local_media_path_.exists():
+                raise ValueError(
+                    "Invalid `--allowed-local-media-path`: The path "
+                    f"{allowed_local_media_path_} does not exist.")
+            if not allowed_local_media_path_.is_dir():
+                raise ValueError(
+                    "Invalid `--allowed-local-media-path`: The path "
+                    f"{allowed_local_media_path_} must be a directory.")
+        else:
+            allowed_local_media_path_ = None
+
+        self.allowed_local_media_path = allowed_local_media_path_
+
+    def _load_data_url(
+        self,
+        url_spec: ParseResult,
+        media_io: MediaIO[_M],
+    ) -> _M:
+        data_spec, data = url_spec.path.split(",", 1)
+        media_type, data_type = data_spec.split(";", 1)
+
+        if data_type != "base64":
+            msg = "Only base64 data URLs are supported for now."
+            raise NotImplementedError(msg)
+
+        return media_io.load_base64(media_type, data)
+
+    def _load_file_url(
+        self,
+        url_spec: ParseResult,
+        media_io: MediaIO[_M],
+    ) -> _M:
+        allowed_local_media_path = self.allowed_local_media_path
+        if allowed_local_media_path is None:
+            raise RuntimeError("Cannot load local files without "
+                               "`--allowed-local-media-path`.")
+
+        filepath = Path(url_spec.path)
+        if allowed_local_media_path not in filepath.resolve().parents:
             raise ValueError(
-                "Invalid '--allowed-local-media-path': "
-                f"The path {allowed_local_media_path} must be a directory.")
-
-    # Only split once and assume the second part is the image path
-    _, image_path = image_url.split("file://", 1)
-    if not _is_subpath(image_path, allowed_local_media_path):
-        raise ValueError(
-            f"Invalid 'image_url': The file path {image_path} must"
-            " be a subpath of '--allowed-local-media-path'"
-            f" '{allowed_local_media_path}'.")
-
-    image = Image.open(image_path)
-    image.load()
-    return image
+                f"The file path {filepath} must be a subpath "
+                f"of `--allowed-local-media-path` {allowed_local_media_path}.")
 
+        return media_io.load_file(filepath)
 
-def _load_image_from_data_url(image_url: str) -> Image.Image:
-    # Only split once and assume the second part is the base64 encoded image
-    _, image_base64 = image_url.split(",", 1)
-    return load_image_from_base64(image_base64)
-
-
-def fetch_image(image_url: str,
-                *,
-                image_mode: str = "RGB",
-                allowed_local_media_path: str = "") -> Image.Image:
-    """
-    Load a PIL image from a HTTP or base64 data URL.
-
-    By default, the image is converted into RGB format.
-    """
-    if image_url.startswith('http'):
-        image_raw = global_http_connection.get_bytes(
-            image_url,
-            timeout=envs.VLLM_IMAGE_FETCH_TIMEOUT,
-        )
-        image = _load_image_from_bytes(image_raw)
-
-    elif image_url.startswith('data:image'):
-        image = _load_image_from_data_url(image_url)
-    elif image_url.startswith('file://'):
-        image = _load_image_from_file(image_url, allowed_local_media_path)
-    else:
-        raise ValueError("Invalid 'image_url': A valid 'image_url' must start "
-                         "with either 'data:image', 'file://' or 'http'.")
-
-    return image.convert(image_mode)
-
-
-async def async_fetch_image(image_url: str,
-                            *,
-                            image_mode: str = "RGB",
-                            allowed_local_media_path: str = "") -> Image.Image:
-    """
-    Asynchronously load a PIL image from a HTTP or base64 data URL.
-
-    By default, the image is converted into RGB format.
-    """
-    if image_url.startswith('http'):
-        image_raw = await global_http_connection.async_get_bytes(
-            image_url,
-            timeout=envs.VLLM_IMAGE_FETCH_TIMEOUT,
-        )
-        image = _load_image_from_bytes(image_raw)
-
-    elif image_url.startswith('data:image'):
-        image = _load_image_from_data_url(image_url)
-    elif image_url.startswith('file://'):
-        image = _load_image_from_file(image_url, allowed_local_media_path)
-    else:
-        raise ValueError("Invalid 'image_url': A valid 'image_url' must start "
-                         "with either 'data:image', 'file://' or 'http'.")
+    def load_from_url(
+        self,
+        url: str,
+        media_io: MediaIO[_M],
+        *,
+        fetch_timeout: Optional[int] = None,
+    ) -> _M:
+        url_spec = urlparse(url)
 
-    return image.convert(image_mode)
+        if url_spec.scheme.startswith("http"):
+            connection = self.connection
+            data = connection.get_bytes(url, timeout=fetch_timeout)
 
+            return media_io.load_bytes(data)
 
-def _load_video_from_bytes(b: bytes, num_frames: int = 32) -> npt.NDArray:
-    video_path = BytesIO(b)
-    vr = decord.VideoReader(video_path, num_threads=1)
-    total_frame_num = len(vr)
+        if url_spec.scheme == "data":
+            return self._load_data_url(url_spec, media_io)
 
-    if total_frame_num > num_frames:
-        uniform_sampled_frames = np.linspace(0,
-                                             total_frame_num - 1,
-                                             num_frames,
-                                             dtype=int)
-        frame_idx = uniform_sampled_frames.tolist()
-    else:
-        frame_idx = [i for i in range(0, total_frame_num)]
-    frames = vr.get_batch(frame_idx).asnumpy()
+        if url_spec.scheme == "file":
+            return self._load_file_url(url_spec, media_io)
 
-    return frames
+        msg = "The URL must be either a HTTP, data or file URL."
+        raise ValueError(msg)
 
+    async def load_from_url_async(
+        self,
+        url: str,
+        media_io: MediaIO[_M],
+        *,
+        fetch_timeout: Optional[int] = None,
+    ) -> _M:
+        url_spec = urlparse(url)
 
-def _load_video_from_data_url(video_url: str) -> npt.NDArray:
-    # Only split once and assume the second part is the base64 encoded video
-    _, video_base64 = video_url.split(",", 1)
+        if url_spec.scheme.startswith("http"):
+            connection = self.connection
+            data = await connection.async_get_bytes(url, timeout=fetch_timeout)
 
-    if video_url.startswith("data:video/jpeg;"):
-        return np.stack([
-            np.array(load_image_from_base64(frame_base64))
-            for frame_base64 in video_base64.split(",")
-        ])
+            return media_io.load_bytes(data)
 
-    return load_video_from_base64(video_base64)
+        if url_spec.scheme == "data":
+            return self._load_data_url(url_spec, media_io)
 
+        if url_spec.scheme == "file":
+            return self._load_file_url(url_spec, media_io)
 
-def fetch_video(video_url: str, *, num_frames: int = 32) -> npt.NDArray:
-    """
-    Load video from a HTTP or base64 data URL.
-    """
-    if video_url.startswith('http') or video_url.startswith('https'):
-        video_raw = global_http_connection.get_bytes(
-            video_url,
-            timeout=envs.VLLM_VIDEO_FETCH_TIMEOUT,
-        )
-        video = _load_video_from_bytes(video_raw, num_frames)
-    elif video_url.startswith('data:video'):
-        video = _load_video_from_data_url(video_url)
-    else:
-        raise ValueError("Invalid 'video_url': A valid 'video_url' must start "
-                         "with either 'data:video' or 'http'.")
-    return video
+        msg = "The URL must be either a HTTP, data or file URL."
+        raise ValueError(msg)
 
+    def fetch_audio(
+        self,
+        audio_url: str,
+    ) -> tuple[np.ndarray, Union[int, float]]:
+        """
+        Load audio from a URL.
+        """
+        audio_io = AudioMediaIO()
 
-async def async_fetch_video(video_url: str,
-                            *,
-                            num_frames: int = 32) -> npt.NDArray:
-    """
-    Asynchronously load video from a HTTP or base64 data URL.
-
-    By default, the image is converted into RGB format.
-    """
-    if video_url.startswith('http') or video_url.startswith('https'):
-        video_raw = await global_http_connection.async_get_bytes(
-            video_url,
-            timeout=envs.VLLM_VIDEO_FETCH_TIMEOUT,
-        )
-        video = _load_video_from_bytes(video_raw, num_frames)
-    elif video_url.startswith('data:video'):
-        video = _load_video_from_data_url(video_url)
-    else:
-        raise ValueError("Invalid 'video_url': A valid 'video_url' must start "
-                         "with either 'data:video' or 'http'.")
-    return video
-
-
-def fetch_audio(audio_url: str) -> Tuple[np.ndarray, Union[int, float]]:
-    """
-    Load audio from a URL.
-    """
-    if audio_url.startswith("http"):
-        audio_bytes = global_http_connection.get_bytes(
+        return self.load_from_url(
             audio_url,
-            timeout=envs.VLLM_AUDIO_FETCH_TIMEOUT,
+            audio_io,
+            fetch_timeout=envs.VLLM_AUDIO_FETCH_TIMEOUT,
         )
-    elif audio_url.startswith("data:audio"):
-        _, audio_base64 = audio_url.split(",", 1)
-        audio_bytes = base64.b64decode(audio_base64)
-    else:
-        raise ValueError("Invalid 'audio_url': A valid 'audio_url' must start "
-                         "with either 'data:audio' or 'http'.")
-
-    return librosa.load(BytesIO(audio_bytes), sr=None)
 
+    async def fetch_audio_async(
+        self,
+        audio_url: str,
+    ) -> tuple[np.ndarray, Union[int, float]]:
+        """
+        Asynchronously fetch audio from a URL.
+        """
+        audio_io = AudioMediaIO()
 
-async def async_fetch_audio(
-        audio_url: str) -> Tuple[np.ndarray, Union[int, float]]:
-    """
-    Asynchronously fetch audio from a URL.
-    """
-    if audio_url.startswith("http"):
-        audio_bytes = await global_http_connection.async_get_bytes(
+        return await self.load_from_url_async(
             audio_url,
-            timeout=envs.VLLM_AUDIO_FETCH_TIMEOUT,
+            audio_io,
+            fetch_timeout=envs.VLLM_AUDIO_FETCH_TIMEOUT,
         )
-    elif audio_url.startswith("data:audio"):
-        _, audio_base64 = audio_url.split(",", 1)
-        audio_bytes = base64.b64decode(audio_base64)
-    else:
-        raise ValueError("Invalid 'audio_url': A valid 'audio_url' must start "
-                         "with either 'data:audio' or 'http'.")
-
-    return librosa.load(BytesIO(audio_bytes), sr=None)
 
+    def fetch_image(
+        self,
+        image_url: str,
+        *,
+        image_mode: str = "RGB",
+    ) -> Image.Image:
+        """
+        Load a PIL image from a HTTP or base64 data URL.
 
-def get_and_parse_audio(audio_url: str) -> MultiModalDataDict:
-    audio, sr = fetch_audio(audio_url)
-    return {"audio": (audio, sr)}
+        By default, the image is converted into RGB format.
+        """
+        image_io = ImageMediaIO(image_mode=image_mode)
 
+        return self.load_from_url(
+            image_url,
+            image_io,
+            fetch_timeout=envs.VLLM_IMAGE_FETCH_TIMEOUT,
+        )
 
-def get_and_parse_image(
+    async def fetch_image_async(
+        self,
         image_url: str,
         *,
-        allowed_local_media_path: str = "") -> MultiModalDataDict:
-    image = fetch_image(image_url,
-                        allowed_local_media_path=allowed_local_media_path)
-    return {"image": image}
-
+        image_mode: str = "RGB",
+    ) -> Image.Image:
+        """
+        Asynchronously load a PIL image from a HTTP or base64 data URL.
 
-def get_and_parse_video(video_url: str) -> MultiModalDataDict:
-    video = fetch_video(video_url)
-    return {"video": video}
+        By default, the image is converted into RGB format.
+        """
+        image_io = ImageMediaIO(image_mode=image_mode)
 
+        return await self.load_from_url_async(
+            image_url,
+            image_io,
+            fetch_timeout=envs.VLLM_IMAGE_FETCH_TIMEOUT,
+        )
 
-async def async_get_and_parse_audio(audio_url: str) -> MultiModalDataDict:
-    audio, sr = await async_fetch_audio(audio_url)
-    return {"audio": (audio, sr)}
-
+    def fetch_video(
+        self,
+        video_url: str,
+        *,
+        image_mode: str = "RGB",
+        num_frames: int = 32,
+    ) -> npt.NDArray:
+        """
+        Load video from a HTTP or base64 data URL.
+        """
+        image_io = ImageMediaIO(image_mode=image_mode)
+        video_io = VideoMediaIO(image_io, num_frames=num_frames)
+
+        return self.load_from_url(
+            video_url,
+            video_io,
+            fetch_timeout=envs.VLLM_VIDEO_FETCH_TIMEOUT,
+        )
 
-async def async_get_and_parse_image(
-        image_url: str,
+    async def fetch_video_async(
+        self,
+        video_url: str,
         *,
-        allowed_local_media_path: str = "") -> MultiModalDataDict:
-    image = await async_fetch_image(
-        image_url, allowed_local_media_path=allowed_local_media_path)
-    return {"image": image}
+        image_mode: str = "RGB",
+        num_frames: int = 32,
+    ) -> npt.NDArray:
+        """
+        Asynchronously load video from a HTTP or base64 data URL.
+
+        By default, the image is converted into RGB format.
+        """
+        image_io = ImageMediaIO(image_mode=image_mode)
+        video_io = VideoMediaIO(image_io, num_frames=num_frames)
+
+        return await self.load_from_url_async(
+            video_url,
+            video_io,
+            fetch_timeout=envs.VLLM_VIDEO_FETCH_TIMEOUT,
+        )
 
 
-async def async_get_and_parse_video(video_url: str) -> MultiModalDataDict:
-    video = await async_fetch_video(video_url)
-    return {"video": video}
+global_media_connector = MediaConnector()
+"""The global :class:`MediaConnector` instance used by vLLM."""
+
+fetch_audio = global_media_connector.fetch_audio
+fetch_image = global_media_connector.fetch_image
+fetch_video = global_media_connector.fetch_video
 
 
 def encode_audio_base64(
@@ -294,10 +256,8 @@ def encode_audio_base64(
     sampling_rate: int,
 ) -> str:
     """Encode audio as base64."""
-    buffered = BytesIO()
-    soundfile.write(buffered, audio, sampling_rate, format="WAV")
-
-    return base64.b64encode(buffered.getvalue()).decode('utf-8')
+    audio_io = AudioMediaIO()
+    return audio_io.encode_base64((audio, sampling_rate))
 
 
 def encode_image_base64(
@@ -311,29 +271,14 @@ def encode_image_base64(
 
     By default, the image is converted into RGB format before being encoded.
     """
-    buffered = BytesIO()
-    image = image.convert(image_mode)
-    image.save(buffered, format)
-    return base64.b64encode(buffered.getvalue()).decode('utf-8')
-
-
-def load_image_from_base64(image: Union[bytes, str]) -> Image.Image:
-    """Load image from base64 format."""
-    return _load_image_from_bytes(base64.b64decode(image))
+    image_io = ImageMediaIO(image_mode=image_mode)
+    return image_io.encode_base64(image, image_format=format)
 
 
 def encode_video_base64(frames: npt.NDArray) -> str:
-    base64_frames = []
-    frames_list = [frames[i] for i in range(frames.shape[0])]
-    for frame in frames_list:
-        img_base64 = encode_image_base64(Image.fromarray(frame))
-        base64_frames.append(img_base64)
-    return ",".join(base64_frames)
-
-
-def load_video_from_base64(video: Union[bytes, str]) -> npt.NDArray:
-    """Load video from base64 format."""
-    return _load_video_from_bytes(base64.b64decode(video))
+    image_io = ImageMediaIO()
+    video_io = VideoMediaIO(image_io)
+    return video_io.encode_base64(frames)
 
 
 def resolve_visual_encoder_outputs(
@@ -389,7 +334,7 @@ def repeat_and_pad_token(
     repeat_count: int = 1,
     pad_token_left: Optional[_T] = None,
     pad_token_right: Optional[_T] = None,
-) -> List[_T]:
+) -> list[_T]:
     replacement = [token] * repeat_count
     if pad_token_left is not None:
         replacement = [pad_token_left] + replacement
@@ -402,13 +347,13 @@ def repeat_and_pad_token(
 def repeat_and_pad_placeholder_tokens(
     tokenizer: AnyTokenizer,
     prompt: Optional[str],
-    prompt_token_ids: List[int],
+    prompt_token_ids: list[int],
     *,
     placeholder_token_id: int,
-    repeat_count: Union[int, List[int]],
+    repeat_count: Union[int, list[int]],
     pad_token_left: Optional[int] = None,
     pad_token_right: Optional[int] = None,
-) -> Tuple[Optional[str], List[int], List[PlaceholderRange]]:
+) -> tuple[Optional[str], list[int], list[PlaceholderRange]]:
     if isinstance(repeat_count, int):
         repeat_count = [repeat_count]
 
@@ -450,8 +395,8 @@ def repeat_and_pad_placeholder_tokens(
             new_prompt += prompt_parts[i] + replacement_str
         new_prompt += prompt_parts[-1]
 
-    new_token_ids: List[int] = []
-    placeholder_ranges: List[PlaceholderRange] = []
+    new_token_ids = list[int]()
+    placeholder_ranges = list[PlaceholderRange]()
     placeholder_token_idx = 0
     for i, token in enumerate(prompt_token_ids):
         if token == placeholder_token_id:
@@ -481,7 +426,7 @@ def repeat_and_pad_placeholder_tokens(
 def consecutive_placeholder_ranges(
         num_items: int,
         item_size: int,
-        initial_offset: int = 0) -> List[PlaceholderRange]:
+        initial_offset: int = 0) -> list[PlaceholderRange]:
     """Returns a list of consecutive PlaceholderRanges of a fixed size"""
 
     return [
diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py
index c4be100562703..b7d43c830cc46 100644
--- a/vllm/multimodal/video.py
+++ b/vllm/multimodal/video.py
@@ -1,23 +1,32 @@
-from functools import lru_cache
+import base64
+from functools import lru_cache, partial
+from io import BytesIO
+from pathlib import Path
 from typing import TYPE_CHECKING, Any, Dict, Optional
 
 import cv2
 import numpy as np
 import numpy.typing as npt
+from PIL import Image
 
 from vllm.inputs.registry import InputContext
 from vllm.logger import init_logger
 from vllm.transformers_utils.processor import get_video_processor
 from vllm.transformers_utils.tokenizer import get_tokenizer
-from vllm.utils import is_list_of
+from vllm.utils import PlaceholderModule, is_list_of
 
-from .base import MultiModalData
-from .image import ImagePlugin
+from .base import MediaIO, MultiModalData
+from .image import ImageMediaIO, ImagePlugin
 from .inputs import MultiModalKwargs, VideoItem
 
 if TYPE_CHECKING:
     from vllm.config import ModelConfig
 
+try:
+    import decord
+except ImportError:
+    decord = PlaceholderModule("decord")  # type: ignore[assignment]
+
 logger = init_logger(__name__)
 
 cached_get_video_processor = lru_cache(get_video_processor)
@@ -107,3 +116,73 @@ def sample_frames_from_video(frames: npt.NDArray,
     frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
     sampled_frames = frames[frame_indices, ...]
     return sampled_frames
+
+
+class VideoMediaIO(MediaIO[npt.NDArray]):
+
+    def __init__(
+        self,
+        image_io: ImageMediaIO,
+        *,
+        num_frames: int = 32,
+    ) -> None:
+        super().__init__()
+
+        self.image_io = image_io
+        self.num_frames = num_frames
+
+    def load_bytes(self, data: bytes) -> npt.NDArray:
+        vr = decord.VideoReader(BytesIO(data), num_threads=1)
+        total_frame_num = len(vr)
+
+        num_frames = self.num_frames
+        if total_frame_num > num_frames:
+            uniform_sampled_frames = np.linspace(0,
+                                                 total_frame_num - 1,
+                                                 num_frames,
+                                                 dtype=int)
+            frame_idx = uniform_sampled_frames.tolist()
+        else:
+            frame_idx = list(range(0, total_frame_num))
+
+        return vr.get_batch(frame_idx).asnumpy()
+
+    def load_base64(self, media_type: str, data: str) -> npt.NDArray:
+        if media_type.lower() == "video/jpeg":
+            load_frame = partial(
+                self.image_io.load_base64,
+                "image/jpeg",
+            )
+
+            return np.stack([
+                np.array(load_frame(frame_data))
+                for frame_data in data.split(",")
+            ])
+
+        return self.load_bytes(base64.b64decode(data))
+
+    def load_file(self, filepath: Path) -> npt.NDArray:
+        with filepath.open("rb") as f:
+            data = f.read()
+
+        return self.load_bytes(data)
+
+    def encode_base64(
+        self,
+        media: npt.NDArray,
+        *,
+        video_format: str = "JPEG",
+    ) -> str:
+        video = media
+
+        if video_format == "JPEG":
+            encode_frame = partial(
+                self.image_io.encode_base64,
+                image_format=video_format,
+            )
+
+            return ",".join(
+                encode_frame(Image.fromarray(frame)) for frame in video)
+
+        msg = "Only JPEG format is supported for now."
+        raise NotImplementedError(msg)

From 5ce4627a7ec4cf4e19ff4be7f030883ef486393f Mon Sep 17 00:00:00 2001
From: Chen1022 <112855051+ccjincong@users.noreply.github.com>
Date: Fri, 27 Dec 2024 21:05:10 +0800
Subject: [PATCH 005/462] [Doc]  Add xgrammar in doc (#11549)

Signed-off-by: ccjincong <chenjincong11@gmail.com>
---
 docs/source/usage/structured_outputs.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/usage/structured_outputs.md b/docs/source/usage/structured_outputs.md
index 3f5d9ffc26278..7292012e36a26 100644
--- a/docs/source/usage/structured_outputs.md
+++ b/docs/source/usage/structured_outputs.md
@@ -2,7 +2,7 @@
 
 # Structured Outputs
 
-vLLM supports the generation of structured outputs using [outlines](https://github.com/dottxt-ai/outlines) or [lm-format-enforcer](https://github.com/noamgat/lm-format-enforcer) as backends for the guided decoding.
+vLLM supports the generation of structured outputs using [outlines](https://github.com/dottxt-ai/outlines), [lm-format-enforcer](https://github.com/noamgat/lm-format-enforcer), or [xgrammar](https://github.com/mlc-ai/xgrammar) as backends for the guided decoding.
 This document shows you some examples of the different options that are available to generate structured outputs.
 
 ## Online Inference (OpenAI API)

From 101418096ffe3c83b6d541e1303b10e9d5e03861 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Sat, 28 Dec 2024 01:22:48 +0800
Subject: [PATCH 006/462] [VLM] Support caching in merged multi-modal processor
 (#11396)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 docs/source/conf.py                           |   3 +-
 .../design/multimodal/multimodal_index.md     |  24 +-
 docs/source/models/supported_models.md        |   3 +-
 .../openai/test_vision_embedding.py           |   4 +-
 .../mm_processor_kwargs/test_qwen2_vl.py      |   2 +-
 .../vision_language/test_models.py            |   4 +-
 tests/multimodal/test_processing.py           | 209 ++++++-
 vllm/inputs/registry.py                       |  22 +-
 vllm/model_executor/models/llava.py           | 178 +++---
 vllm/model_executor/models/phi3v.py           | 107 +++-
 vllm/model_executor/models/qwen.py            |   4 +-
 vllm/model_executor/models/qwen2_audio.py     |  65 ++-
 vllm/model_executor/models/qwen2_vl.py        | 115 ++--
 vllm/model_executor/models/ultravox.py        |  76 ++-
 vllm/multimodal/base.py                       |  44 +-
 vllm/multimodal/inputs.py                     | 438 ++++++++++++++-
 vllm/multimodal/processing.py                 | 516 ++++++++++++------
 vllm/multimodal/registry.py                   |  50 +-
 vllm/transformers_utils/processor.py          |  12 +-
 vllm/utils.py                                 |  27 +-
 20 files changed, 1455 insertions(+), 448 deletions(-)

diff --git a/docs/source/conf.py b/docs/source/conf.py
index 1fe0474631140..71394c5302a39 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -191,6 +191,7 @@ def linkcode_resolve(domain, info):
 
 # Mock out external dependencies here, otherwise the autodoc pages may be blank.
 autodoc_mock_imports = [
+    "blake3",
     "compressed_tensors",
     "cpuinfo",
     "cv2",
@@ -207,7 +208,7 @@ def linkcode_resolve(domain, info):
     "tensorizer",
     "pynvml",
     "outlines",
-    "xgrammar,"
+    "xgrammar",
     "librosa",
     "soundfile",
     "gguf",
diff --git a/docs/source/design/multimodal/multimodal_index.md b/docs/source/design/multimodal/multimodal_index.md
index 88af07afc7018..e4f2171e84ff7 100644
--- a/docs/source/design/multimodal/multimodal_index.md
+++ b/docs/source/design/multimodal/multimodal_index.md
@@ -45,39 +45,39 @@ adding_multimodal_plugin
 ### Base Classes
 
 ```{eval-rst}
-.. autodata:: vllm.multimodal.NestedTensors
+.. automodule:: vllm.multimodal.base
+    :members:
+    :show-inheritance:
 ```
 
-```{eval-rst}
-.. autodata:: vllm.multimodal.BatchedTensorInputs
-```
+### Input Classes
 
 ```{eval-rst}
-.. autoclass:: vllm.multimodal.MultiModalDataBuiltins
+.. automodule:: vllm.multimodal.inputs
     :members:
     :show-inheritance:
 ```
 
-```{eval-rst}
-.. autodata:: vllm.multimodal.MultiModalDataDict
-```
+### Audio Classes
 
 ```{eval-rst}
-.. autoclass:: vllm.multimodal.MultiModalKwargs
+.. automodule:: vllm.multimodal.audio
     :members:
     :show-inheritance:
 ```
 
+### Image Classes
+
 ```{eval-rst}
-.. autoclass:: vllm.multimodal.MultiModalPlugin
+.. automodule:: vllm.multimodal.image
     :members:
     :show-inheritance:
 ```
 
-### Image Classes
+### Video Classes
 
 ```{eval-rst}
-.. automodule:: vllm.multimodal.image
+.. automodule:: vllm.multimodal.video
     :members:
     :show-inheritance:
 ```
diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md
index 95add0d71bbab..7acafda50793c 100644
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -755,8 +755,7 @@ vLLM currently only supports adding LoRA to the language backbone of multimodal
 ```
 
 ```{note}
-To use {code}`TIGER-Lab/Mantis-8B-siglip-llama3`, you have to install their GitHub repo ({code}`pip install git+https://github.com/TIGER-AI-Lab/Mantis.git`)
-and pass {code}`--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM.
+To use {code}`TIGER-Lab/Mantis-8B-siglip-llama3`, you have pass {code}`--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM.
 ```
 
 ```{note}
diff --git a/tests/entrypoints/openai/test_vision_embedding.py b/tests/entrypoints/openai/test_vision_embedding.py
index 3731b2dcdeae1..c851539c610ec 100644
--- a/tests/entrypoints/openai/test_vision_embedding.py
+++ b/tests/entrypoints/openai/test_vision_embedding.py
@@ -91,5 +91,5 @@ async def test_image_embedding(server: RemoteOpenAIServer, model_name: str,
     assert len(embeddings.data) == 1
     assert len(embeddings.data[0].embedding) == 3072
     assert embeddings.usage.completion_tokens == 0
-    assert embeddings.usage.prompt_tokens == 765
-    assert embeddings.usage.total_tokens == 765
+    assert embeddings.usage.prompt_tokens == 764
+    assert embeddings.usage.total_tokens == 764
diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py
index cd8954ffc48c2..5897c04c89e19 100644
--- a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py
+++ b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py
@@ -30,7 +30,7 @@ def get_max_qwen2_vl_image_tokens():
 
 
 @pytest.mark.parametrize("mm_processor_kwargs,expected_max_tokens", [
-    ({}, 1225),
+    ({}, 16384),
     ({
         MIN_PIXELS: 64**2,
         MAX_PIXELS: 512**2
diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py
index 3101d1d2ea831..1a9c1b4ef1be0 100644
--- a/tests/models/decoder_only/vision_language/test_models.py
+++ b/tests/models/decoder_only/vision_language/test_models.py
@@ -201,6 +201,7 @@
         vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output,
         num_logprobs=10,
         image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
+        marks=[large_gpu_mark(min_gb=48)],
     ),
     "glm4": VLMTestInfo(
         models=["THUDM/glm-4v-9b"],
@@ -212,7 +213,7 @@
         dtype="bfloat16",
         get_stop_token_ids=lambda tok: [151329, 151336, 151338],
         patch_hf_runner=model_utils.glm_patch_hf_runner,
-        marks=[large_gpu_mark(min_gb=48)],
+        marks=[large_gpu_mark(min_gb=32)],
     ),
     "h2ovl": VLMTestInfo(
         models = [
@@ -261,6 +262,7 @@
         dtype="bfloat16",
         use_tokenizer_eos=True,
         patch_hf_runner=model_utils.internvl_patch_hf_runner,
+        marks=[large_gpu_mark(min_gb=32)],
     ),
     "llava_next": VLMTestInfo(
         models=["llava-hf/llava-v1.6-mistral-7b-hf"],
diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py
index d22d778f81fa8..1b2847ed0f534 100644
--- a/tests/multimodal/test_processing.py
+++ b/tests/multimodal/test_processing.py
@@ -1,12 +1,20 @@
+from functools import partial
 from typing import cast
 
+import numpy as np
 import pytest
-
-from vllm.multimodal.processing import (PromptReplacement, _PlaceholderInfo,
-                                        find_text_matches, find_token_matches,
-                                        iter_placeholders, iter_token_matches,
+from PIL import Image
+
+from vllm.config import ModelConfig
+from vllm.inputs import InputProcessingContext
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.processing import (ProcessingCache, PromptReplacement,
+                                        _PlaceholderInfo, find_text_matches,
+                                        find_token_matches, iter_placeholders,
+                                        iter_token_matches,
                                         replace_text_matches,
                                         replace_token_matches)
+from vllm.multimodal.utils import cached_get_tokenizer
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.utils import full_groupby
 
@@ -457,6 +465,7 @@ def test_find_replace_tokens(
         ),
     ]
 )
+# yapf: enable
 def test_iter_placeholders(
     repl_by_key,
     prompt,
@@ -475,11 +484,199 @@ def test_iter_placeholders(
             prompt_repls,
             prompt,
             # Effectively match all occurrences in the prompt
-            {key: 3 for key in repl_by_key},
-         ))
+            {key: 3
+             for key in repl_by_key},
+        ))
 
     # Only displayed on error
     print("result:", result)
 
     # Manually constructed results
     assert result == expected
+
+
+def _rand_img(rng: np.random.RandomState, min_wh: int, max_wh: int):
+    w, h = rng.randint(min_wh, max_wh, size=(2, ))
+    arr = rng.randint(0, 255, size=(w, h, 3), dtype=np.uint8)
+    return Image.fromarray(arr)
+
+
+def _rand_video(
+    rng: np.random.RandomState,
+    min_frames: int,
+    max_frames: int,
+    min_wh: int,
+    max_wh: int,
+):
+    # Temporary workaround for https://github.com/huggingface/transformers/issues/35412
+    num_frames = rng.randint(min_frames, max_frames)
+    num_frames = (num_frames // 2) * 2
+
+    w, h = rng.randint(min_wh, max_wh, size=(2, ))
+    return rng.randint(0, 255, size=(num_frames, w, h, 3), dtype=np.uint8)
+
+
+def _rand_audio(
+    rng: np.random.RandomState,
+    min_len: int,
+    max_len: int,
+    sr: int,
+):
+    audio_len = rng.randint(min_len, max_len)
+    return rng.rand(audio_len), sr
+
+
+def _test_processing_cache_correctness(
+    model_id: str,
+    modalities: set[str],
+    hit_rate: float,
+    num_batches: int,
+    simplify_rate: float,
+):
+    if model_id == "TIGER-Lab/Mantis-8B-siglip-llama3":
+        hf_overrides = {"architectures": ["MantisForConditionalGeneration"]}
+    else:
+        hf_overrides = {}
+
+    model_config = ModelConfig(
+        model_id,
+        task="auto",
+        tokenizer=model_id,
+        tokenizer_mode="auto",
+        trust_remote_code=True,
+        seed=0,
+        dtype="float16",
+        revision=None,
+        hf_overrides=hf_overrides,
+    )
+    model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
+
+    processor_factory = MULTIMODAL_REGISTRY._processor_factories[model_cls]
+    ctx = InputProcessingContext(
+        model_config,
+        tokenizer=cached_get_tokenizer(model_config.tokenizer),
+    )
+    # Ensure that it can fit all of the data
+    cache = ProcessingCache(capacity=1 << 30)
+
+    baseline_processor = processor_factory(ctx, cache=None)
+    cached_processor = processor_factory(ctx, cache=cache)
+
+    rng = np.random.RandomState(0)
+
+    input_to_hit = {
+        "image": Image.new("RGB", size=(128, 128)),
+        "video": np.zeros((4, 128, 128, 3), dtype=np.uint8),
+        "audio": (np.zeros((512, )), 16000),
+    }
+    input_factory = {
+        "image":
+        partial(_rand_img, rng, min_wh=128, max_wh=256),
+        "video":
+        partial(_rand_video,
+                rng,
+                min_frames=2,
+                max_frames=8,
+                min_wh=128,
+                max_wh=256),
+        "audio":
+        partial(_rand_audio, rng, min_len=256, max_len=512, sr=16000),
+    }
+    input_max_count = {
+        "image": 3,
+        "video": 3,
+        "audio": 3,
+    }
+
+    for batch_idx in range(num_batches):
+        mm_data = {
+            k:
+            [(input_to_hit[k] if rng.rand() < hit_rate else input_factory[k]())
+             for _ in range(rng.randint(input_max_count[k]))]
+            for k in modalities
+        }
+
+        mm_counts = {k: len(vs) for k, vs in mm_data.items()}
+        prompt = baseline_processor._get_dummy_mm_inputs(mm_counts).prompt_text
+
+        # Drop unnecessary keys and test single -> multi conversion
+        if rng.rand() < simplify_rate:
+            for k in list(mm_data.keys()):
+                if not mm_data[k]:
+                    del mm_data[k]
+                elif len(mm_data[k]) == 1:
+                    mm_data[k] = mm_data[k][0]
+
+        baseline_result = baseline_processor.apply(
+            prompt,
+            mm_data=mm_data,
+            hf_processor_mm_kwargs={},
+        )
+        cached_result = cached_processor.apply(
+            prompt,
+            mm_data=mm_data,
+            hf_processor_mm_kwargs={},
+        )
+
+        assert baseline_result == cached_result, (
+            f"Failed ({batch_idx=}, {mm_data=})")
+
+
+# yapf: disable
+@pytest.mark.parametrize(("model_id", "modalities"), [
+    ("llava-hf/llava-1.5-7b-hf", {"image"}),
+    ("TIGER-Lab/Mantis-8B-siglip-llama3", {"image"}),
+    ("mistral-community/pixtral-12b", {"image"}),
+    ("Qwen/Qwen2-VL-2B-Instruct", {"image", "video"}),
+    ("Qwen/Qwen2-Audio-7B-Instruct", {"audio"}),
+    ("fixie-ai/ultravox-v0_3", {"audio"}),
+])
+@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
+@pytest.mark.parametrize("num_batches", [32])
+@pytest.mark.parametrize("simplify_rate", [1.0])
+# yapf: enable
+def test_processing_cache_correctness(
+    model_id: str,
+    modalities: set[str],
+    hit_rate: float,
+    num_batches: int,
+    simplify_rate: float,
+):
+    _test_processing_cache_correctness(
+        model_id,
+        modalities,
+        hit_rate=hit_rate,
+        num_batches=num_batches,
+        simplify_rate=simplify_rate,
+    )
+
+
+# yapf: disable
+@pytest.mark.parametrize(("model_id", "modalities"), [
+    ("microsoft/Phi-3-vision-128k-instruct", {"image"}),
+])
+@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
+@pytest.mark.parametrize("num_batches", [32])
+@pytest.mark.parametrize("simplify_rate", [1.0])
+# yapf: enable
+def test_processing_cache_correctness_phi3v(
+    model_id: str,
+    modalities: set[str],
+    hit_rate: float,
+    num_batches: int,
+    simplify_rate: float,
+):
+    # HACK - this is an attempted workaround for the following bug
+    # https://github.com/huggingface/transformers/issues/34307
+    from transformers import AutoImageProcessor  # noqa: F401
+    from transformers import AutoProcessor  # noqa: F401
+
+    AutoImageProcessor.from_pretrained(model_id, trust_remote_code=True)
+
+    _test_processing_cache_correctness(
+        model_id,
+        modalities,
+        hit_rate=hit_rate,
+        num_batches=num_batches,
+        simplify_rate=simplify_rate,
+    )
diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py
index f3ec9d115c9ba..46346b08e99c2 100644
--- a/vllm/inputs/registry.py
+++ b/vllm/inputs/registry.py
@@ -99,6 +99,9 @@ def get_hf_processor(
 
         merged_kwargs = {**base_kwargs, **kwargs}
 
+        if isinstance(typ, type):
+            merged_kwargs["processor_cls"] = typ
+
         hf_processor = cached_get_processor(
             self.model_config.model,
             trust_remote_code=self.model_config.trust_remote_code,
@@ -132,10 +135,13 @@ def get_hf_processor(
     def call_hf_processor(
         self,
         hf_processor: ProcessorMixin,
-        prompt: str,
-        processor_data: Mapping[str, object],
-        inference_kwargs: Mapping[str, object],
+        data: Mapping[str, object],
+        kwargs: Mapping[str, object] = {},
     ) -> BatchFeature:
+        """
+        Call :code:`hf_processor` on the prompt :code:`data`
+        (text, image, audio...) with configurable options :code:`kwargs`.
+        """
         assert callable(hf_processor)
 
         base_kwargs = self.model_config.mm_processor_kwargs
@@ -144,21 +150,15 @@ def call_hf_processor(
 
         merged_kwargs = resolve_mm_processor_kwargs(
             base_kwargs,
-            inference_kwargs,
+            kwargs,
             hf_processor,
             requires_kw_only=False,
             allow_var_kwargs=True,
         )
 
         try:
-            return hf_processor(
-                text=prompt,
-                **processor_data,
-                **merged_kwargs,
-                return_tensors="pt",
-            )
+            return hf_processor(**data, **merged_kwargs, return_tensors="pt")
         except Exception as exc:
-            data = dict(text=prompt, **processor_data)
             msg = (f"Failed to apply {type(hf_processor).__name__} "
                    f"on data={data} with kwargs={merged_kwargs}")
 
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index 0662d90e79b92..0ecba5a1cae0f 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -1,5 +1,4 @@
 from functools import cached_property
-from types import MethodType
 from typing import (Iterable, List, Literal, Mapping, Optional, Protocol, Set,
                     Tuple, TypedDict, Union)
 
@@ -7,7 +6,7 @@
 import torch.nn as nn
 from transformers import (BatchFeature, CLIPVisionConfig, LlavaConfig,
                           PixtralVisionConfig, PretrainedConfig,
-                          ProcessorMixin, SiglipVisionConfig)
+                          SiglipVisionConfig)
 from transformers.models.llava import LlavaProcessor
 from transformers.models.pixtral import PixtralProcessor
 
@@ -21,10 +20,12 @@
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import NestedTensors
+from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalDataItems,
+                                    MultiModalFieldConfig, MultiModalInputsV2,
+                                    MultiModalKwargs, NestedTensors)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        MultiModalDataItems, ProcessorInputs,
-                                        PromptReplacement)
+                                        ProcessorInputs, PromptReplacement,
+                                        full_groupby_modality)
 from vllm.sequence import IntermediateTensors
 
 from .clip import (CLIPVisionModel, dummy_image_for_clip,
@@ -116,36 +117,54 @@ def get_max_llava_image_tokens(ctx: InputContext):
 
 class LlavaMultiModalProcessor(BaseMultiModalProcessor):
 
-    def _patch_pixtral_processor(self, hf_processor: PixtralProcessor):
-        if getattr(hf_processor, "__is_patched__", False):
-            return  # Already patched
-
-        image_processor = hf_processor.image_processor  # type: ignore
-        orig_preprocess = image_processor.preprocess
+    def _get_hf_processor(self) -> Union[LlavaProcessor, PixtralProcessor]:
+        return self.ctx.get_hf_processor((LlavaProcessor, PixtralProcessor))
 
-        def preprocess(__self, *args, **kwargs):
-            hf_inputs = orig_preprocess(*args, **kwargs)
-            hf_inputs["is_pixtral"] = torch.tensor(True)
-            return hf_inputs
+    def _call_hf_processor(
+        self,
+        prompt: str,
+        mm_data: Mapping[str, object],
+        mm_kwargs: Mapping[str, object],
+    ) -> BatchFeature:
+        processed_outputs = super()._call_hf_processor(
+            prompt=prompt,
+            mm_data=mm_data,
+            mm_kwargs=mm_kwargs,
+        )
 
-        image_processor.preprocess = MethodType(preprocess, image_processor)
+        # NOTE: pixel_values=None for MLlavaProcessor
+        pixel_values = processed_outputs.get("pixel_values")
+        if pixel_values is not None:
+            images = mm_data["images"]
+            assert isinstance(images, list)
 
-        hf_processor.__is_patched__ = True  # type: ignore
+            if isinstance(self._get_hf_processor(), PixtralProcessor):
+                # Original output: (1, num_images, C, H, W)
+                # New output: (num_images, C, H, W)
+                assert (isinstance(pixel_values, list)
+                        and len(pixel_values) == 1
+                        and isinstance(pixel_values[0], list)
+                        and len(pixel_values[0]) == len(images))
 
-    def _get_hf_processor(self) -> Union[LlavaProcessor, PixtralProcessor]:
-        hf_processor = self.ctx.get_hf_processor(
-            (LlavaProcessor, PixtralProcessor))
+                processed_outputs["pixel_values"] = pixel_values[0]
 
-        if isinstance(hf_processor, PixtralProcessor):
-            self._patch_pixtral_processor(hf_processor)
+        return processed_outputs
 
-        return hf_processor
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        return dict(
+            pixel_values=MultiModalFieldConfig.batched("image"),
+            image_embeds=MultiModalFieldConfig.batched("image"),
+        )
 
     def _get_prompt_replacements(
         self,
         mm_items: MultiModalDataItems,
-        hf_inputs: BatchFeature,
-        mm_processor_kwargs: Mapping[str, object],
+        hf_processor_mm_kwargs: Mapping[str, object],
+        out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
         hf_config = self.ctx.get_hf_config(LlavaConfig)
         image_token_id = hf_config.image_token_index
@@ -200,7 +219,7 @@ def _get_dummy_mm_inputs(
     ) -> ProcessorInputs:
         hf_config = self.ctx.get_hf_config(LlavaConfig)
         vision_config = hf_config.vision_config
-        num_images = mm_counts["image"]
+        num_images = mm_counts.get("image", 0)
 
         if isinstance(vision_config, CLIPVisionConfig):
             data = dummy_image_for_clip(vision_config, num_images)
@@ -218,7 +237,6 @@ def _get_dummy_mm_inputs(
         return ProcessorInputs(
             prompt_text=image_token * num_images,
             mm_data=data,
-            mm_processor_kwargs={},
         )
 
 
@@ -379,7 +397,6 @@ def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor:
     def _parse_and_validate_image_input(
             self, **kwargs: object) -> Optional[LlavaImageInputs]:
         pixel_values = kwargs.pop("pixel_values", None)
-        is_pixtral = kwargs.pop("is_pixtral", torch.tensor([False]))
         image_embeds = kwargs.pop("image_embeds", None)
 
         if pixel_values is None and image_embeds is None:
@@ -390,33 +407,6 @@ def _parse_and_validate_image_input(
                 raise ValueError("Incorrect type of pixel values. "
                                  f"Got type: {type(pixel_values)}")
 
-            assert isinstance(is_pixtral, torch.Tensor)
-            if is_pixtral.any():
-                images = pixel_values
-
-                def flatten_to_3d_tensors(item):
-                    if isinstance(item, torch.Tensor):
-                        if item.dim() >= 3:
-                            return [t for t in item.view(-1, *item.shape[-3:])]
-                        else:
-                            raise ValueError(
-                                f"Unexpected tensor dimension: {item.dim()}")
-                    elif isinstance(item, list):
-                        return [
-                            t for subitem in item
-                            for t in flatten_to_3d_tensors(subitem)
-                        ]
-                    else:
-                        raise ValueError(f"Unexpected type: {type(item)}")
-
-                # Restructure the batched images into a list of lists of images
-                images = flatten_to_3d_tensors(pixel_values)
-
-                return LlavaImagePixelInputs(
-                    type="pixel_values",
-                    data=images,
-                )
-
             return LlavaImagePixelInputs(
                 type="pixel_values",
                 data=self._validate_pixel_values(
@@ -586,19 +576,71 @@ def load_weights(self, weights: Iterable[Tuple[str,
 
 class MantisMultiModalProcessor(LlavaMultiModalProcessor):
 
-    def _get_hf_processor(self) -> ProcessorMixin:
-        try:
-            from mantis.models.mllava import MLlavaProcessor
-        except ModuleNotFoundError as exc:
-            raise ModuleNotFoundError(
-                "You need to `pip install "
-                "git+https://github.com/TIGER-AI-Lab/Mantis.git` "
-                "to use this model") from exc
-
-        processor = MLlavaProcessor.from_pretrained(
-            self.ctx.model_config.tokenizer)
-        assert isinstance(processor, ProcessorMixin)
-        return processor
+    def _get_hf_processor(self):
+        return self.ctx.get_hf_processor(LlavaProcessor)
+
+    def apply(
+        self,
+        prompt_text: str,
+        mm_data: MultiModalDataDict,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> MultiModalInputsV2:
+        hf_config = self.ctx.get_hf_config(LlavaConfig)
+        image_token_id = hf_config.image_token_index
+        max_image_tokens = get_max_llava_image_tokens(self.ctx)
+
+        result = super().apply(prompt_text, mm_data, hf_processor_mm_kwargs)
+
+        mm_items = self._get_mm_items(mm_data)
+        mm_item_counts = mm_items.get_item_counts()
+        mm_kwargs = result["mm_kwargs"]
+
+        # We reimplement the functionality of MLlavaProcessor from
+        # https://github.com/TIGER-AI-Lab/Mantis.git
+        def get_replacement_mantis(item_idx: int):
+            return "".join([
+                f"(image {item_idx+1}: <Image>",  # 7 tokens
+                "<image>" * max_image_tokens,
+                "</Image>)",  # 3 tokens
+            ])
+
+        mantis_repls = self._bind_prompt_replacements([
+            PromptReplacement(
+                modality="image",
+                target=[image_token_id] * max_image_tokens,
+                replacement=get_replacement_mantis,
+            )
+        ])
+
+        prompt_ids, prompt_text, _ = self._apply_prompt_replacements(
+            result["prompt_token_ids"],
+            mantis_repls,
+            mm_item_counts,
+        )
+
+        unbound_orig_repls = self._get_prompt_replacements(
+            mm_items,
+            hf_processor_mm_kwargs,
+            mm_kwargs,
+        )
+        orig_repls = self._bind_prompt_replacements(unbound_orig_repls)
+
+        all_placeholders = self._find_placeholders(orig_repls, prompt_ids,
+                                                   mm_item_counts)
+        assert len(all_placeholders) == mm_item_counts.get("image", 0)
+
+        mm_placeholders = {
+            modality: [item.to_range() for item in items]
+            for modality, items in full_groupby_modality(all_placeholders)
+        }
+
+        return MultiModalInputsV2(
+            type="multimodal",
+            prompt=prompt_text,
+            prompt_token_ids=prompt_ids,
+            mm_kwargs=mm_kwargs,
+            mm_placeholders=mm_placeholders,
+        )
 
 
 # To use this model, please use
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index 4e2e7f5761544..fefa9fd62d1d0 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -12,9 +12,9 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from collections.abc import Iterable, Mapping, Sequence
 from functools import cached_property
-from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple,
-                    TypedDict, Union)
+from typing import List, Literal, Optional, Set, Tuple, TypedDict, Union
 
 import torch
 import torch.nn as nn
@@ -32,10 +32,14 @@
 from vllm.model_executor.models.clip import CLIPVisionModel
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import NestedTensors
+from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalDataItems,
+                                    MultiModalFieldConfig, MultiModalInputsV2,
+                                    MultiModalKwargs, NestedTensors,
+                                    PlaceholderRange)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        MultiModalDataItems, ProcessorInputs,
-                                        PromptReplacement)
+                                        ProcessorInputs, PromptReplacement,
+                                        _BoundPromptReplacement,
+                                        _PlaceholderInfo)
 from vllm.sequence import IntermediateTensors
 from vllm.utils import is_list_of
 
@@ -306,11 +310,11 @@ def get_max_phi3v_image_tokens(
     *,
     num_crops: Optional[int] = None,
 ) -> int:
-    mm_processor_kwargs = {}
+    hf_processor_mm_kwargs = {}
     if num_crops:
-        mm_processor_kwargs["num_crops"] = num_crops
+        hf_processor_mm_kwargs["num_crops"] = num_crops
 
-    processor = ctx.get_hf_processor(**mm_processor_kwargs)
+    processor = ctx.get_hf_processor(**hf_processor_mm_kwargs)
 
     return processor.calc_num_image_tokens_from_image_size(
         width=MAX_IMAGE_FEATURE_SIZE_WIDTH,
@@ -331,39 +335,50 @@ def _get_hf_processor(
 
     def _call_hf_processor(
         self,
-        hf_processor: ProcessorMixin,
         prompt: str,
-        processor_data: Mapping[str, object],
-        mm_processor_kwargs: Mapping[str, object],
+        mm_data: Mapping[str, object],
+        mm_kwargs: Mapping[str, object],
     ) -> BatchFeature:
         processed_outputs = super()._call_hf_processor(
-            hf_processor,
             prompt=prompt,
-            processor_data=processor_data,
-            mm_processor_kwargs=mm_processor_kwargs,
+            mm_data=mm_data,
+            mm_kwargs=mm_kwargs,
         )
 
+        input_ids = processed_outputs["input_ids"]
+        assert isinstance(input_ids, torch.Tensor)
+
         # Phi3v processor has inserted -1, -2 etc as placeholder in prompt_ids,
         # which will cause OverflowError when decoding the prompt_ids.
         # Therefore, we need to do an early replacement here
-        token_ids = processed_outputs['input_ids']
-        token_ids[token_ids < 0] = _IMAGE_TOKEN_ID
-        processed_outputs['input_ids'] = token_ids
+        input_ids.masked_fill_(input_ids < 0, _IMAGE_TOKEN_ID)
 
         return processed_outputs
 
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        return dict(
+            pixel_values=MultiModalFieldConfig.batched("image"),
+            image_sizes=MultiModalFieldConfig.batched("image"),
+            image_embeds=MultiModalFieldConfig.batched("image"),
+        )
+
     def _get_prompt_replacements(
         self,
         mm_items: MultiModalDataItems,
-        hf_inputs: BatchFeature,
-        mm_processor_kwargs: Mapping[str, object],
+        hf_processor_mm_kwargs: Mapping[str, object],
+        out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
         hf_processor = self._get_hf_processor()
         image_tokens: list[str] = hf_processor.img_tokens  # type: ignore
         image_processor = hf_processor.image_processor  # type: ignore
 
-        mm_config = self.ctx.get_mm_config()
-        max_images = mm_config.limit_per_prompt.get("image", 1)
+        tokenizer = self._get_tokenizer()
+        bos_token_id = tokenizer.bos_token_id
+        assert isinstance(bos_token_id, int)
 
         def get_replacement_phi3v(item_idx: int):
             image_size = mm_items.get_image_size(item_idx)
@@ -372,21 +387,44 @@ def get_replacement_phi3v(item_idx: int):
                 height=image_size.height,
             )
 
-            return [_IMAGE_TOKEN_ID] * num_tokens
+            return [_IMAGE_TOKEN_ID] * num_tokens + [bos_token_id]
 
         return [
             PromptReplacement(
                 modality="image",
                 target=image_token,
                 replacement=get_replacement_phi3v,
-            ) for image_token in image_tokens[:max_images]
+            ) for image_token in image_tokens[:len(mm_items.images)]
         ]
 
+    def _apply_prompt_replacements(
+        self,
+        token_ids: list[int],
+        prompt_repls: Sequence[_BoundPromptReplacement],
+        mm_item_counts: Mapping[str, int],
+    ) -> tuple[list[int], str, list[_PlaceholderInfo]]:
+        token_ids, text, placeholders = super()._apply_prompt_replacements(
+            token_ids=token_ids,
+            prompt_repls=prompt_repls,
+            mm_item_counts=mm_item_counts,
+        )
+
+        # Keep the behavior in line with HF processor
+        if text.startswith("<s> <|image|>"):
+            text = text.replace("<s> <|image|>", "<s><|image|>", 1)
+            token_ids = [token_ids[0], *token_ids[2:]]
+            placeholders = [
+                _PlaceholderInfo(p.modality, p.start_idx - 1, p.replacement)
+                for p in placeholders
+            ]
+
+        return token_ids, text, placeholders
+
     def _get_dummy_mm_inputs(
         self,
         mm_counts: Mapping[str, int],
     ) -> ProcessorInputs:
-        num_images = mm_counts["image"]
+        num_images = mm_counts.get("image", 0)
 
         data = dummy_image_for_clip(
             CLIP_VIT_LARGE_PATCH14_336_CONFIG,
@@ -401,9 +439,28 @@ def _get_dummy_mm_inputs(
         return ProcessorInputs(
             prompt_text="".join(image_tokens[:num_images]),
             mm_data=data,
-            mm_processor_kwargs={},
         )
 
+    def apply(
+        self,
+        prompt_text: str,
+        mm_data: MultiModalDataDict,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> MultiModalInputsV2:
+        result = super().apply(prompt_text, mm_data, hf_processor_mm_kwargs)
+
+        # Only <|image|> tokens should be considered as placeholders,
+        # so we ignore the trailing bos_token_id
+        result["mm_placeholders"] = {
+            modality: [
+                PlaceholderRange(offset=p["offset"], length=p["length"] - 1)
+                for p in ps
+            ]
+            for modality, ps in result["mm_placeholders"].items()
+        }
+
+        return result
+
 
 @MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_phi3v_image_tokens)
 @MULTIMODAL_REGISTRY.register_processor(Phi3VMultiModalProcessor)
diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py
index 63d1374ab4092..baf955f6b515d 100644
--- a/vllm/model_executor/models/qwen.py
+++ b/vllm/model_executor/models/qwen.py
@@ -225,7 +225,7 @@ def __init__(
         d_model: int,
         n_head: int,
         mlp_ratio: float = 4.0,
-        norm_layer: Callable = nn.LayerNorm,
+        norm_layer: Callable[[int], nn.Module] = nn.LayerNorm,
         quant_config: Optional[QuantizationConfig] = None,
     ):
         super().__init__()
@@ -266,7 +266,7 @@ def __init__(
         layers: int,
         heads: int,
         mlp_ratio: float = 4.0,
-        norm_layer: Callable = nn.LayerNorm,
+        norm_layer: Callable[[int], nn.Module] = nn.LayerNorm,
         quant_config: Optional[QuantizationConfig] = None,
     ):
         super().__init__()
diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py
index 6259166a7fc57..25a351bd9c656 100644
--- a/vllm/model_executor/models/qwen2_audio.py
+++ b/vllm/model_executor/models/qwen2_audio.py
@@ -26,7 +26,7 @@
 import numpy as np
 import torch
 import torch.nn as nn
-from transformers import BatchFeature, ProcessorMixin
+from transformers import BatchFeature
 from transformers.models.qwen2_audio import (Qwen2AudioConfig,
                                              Qwen2AudioEncoder,
                                              Qwen2AudioProcessor)
@@ -38,10 +38,10 @@
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import NestedTensors
+from vllm.multimodal.inputs import (MultiModalDataItems, MultiModalFieldConfig,
+                                    MultiModalKwargs, NestedTensors)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        MultiModalDataItems, ProcessorInputs,
-                                        PromptReplacement)
+                                        ProcessorInputs, PromptReplacement)
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsMultiModal, SupportsPP
@@ -73,7 +73,7 @@ def forward(self, audio_features):
 
 
 # From Qwen2AudioEncoder._get_feat_extract_output_lengths
-def _get_feat_extract_output_lengths(input_lengths: torch.LongTensor):
+def _get_feat_extract_output_lengths(input_lengths: torch.Tensor):
     feat_lengths = (input_lengths - 1) // 2 + 1
     output_lengths = (feat_lengths - 2) // 2 + 1
     return feat_lengths, output_lengths
@@ -88,13 +88,18 @@ def get_max_qwen2_audio_audio_tokens(ctx: InputContext) -> int:
 
 class Qwen2AudioMultiModalProcessor(BaseMultiModalProcessor):
 
-    def _get_hf_processor(self) -> Qwen2AudioProcessor:
+    def _get_hf_processor(
+        self,
+        *,
+        # Ignored in initialization
+        sampling_rate: Optional[int] = None,
+    ) -> Qwen2AudioProcessor:
         return self.ctx.get_hf_processor(Qwen2AudioProcessor)
 
     def _get_feature_extractor(self) -> WhisperFeatureExtractor:
         return self._get_hf_processor().feature_extractor  # type: ignore
 
-    def _get_processor_data(
+    def _get_hf_mm_data(
         self,
         mm_items: MultiModalDataItems,
     ) -> tuple[dict[str, Any], dict[str, Any]]:
@@ -102,50 +107,61 @@ def _get_processor_data(
         feature_extractor = self._get_feature_extractor()
         mm_items.resample_audios(feature_extractor.sampling_rate)
 
-        return super()._get_processor_data(mm_items)
+        return super()._get_hf_mm_data(mm_items)
 
     def _call_hf_processor(
         self,
-        hf_processor: ProcessorMixin,
         prompt: str,
-        processor_data: Mapping[str, object],
-        mm_processor_kwargs: Mapping[str, object],
+        mm_data: Mapping[str, object],
+        mm_kwargs: Mapping[str, object],
     ) -> BatchFeature:
-        processor_data = dict(processor_data)
-        audios = processor_data.pop("audios", [])
+        mm_data = dict(mm_data)
+        audios = mm_data.pop("audios", [])
 
         if audios:
-            processor_data["audios"] = audios
+            mm_data["audios"] = audios
 
             feature_extractor = self._get_feature_extractor()
-            mm_processor_kwargs = dict(
-                **mm_processor_kwargs,
+            mm_kwargs = dict(
+                **mm_kwargs,
                 sampling_rate=feature_extractor.sampling_rate,
             )
         else:
             # NOTE: WhisperFeatureExtractor cannot handle empty list of audios
             pass
 
-        return super()._call_hf_processor(
-            hf_processor,
+        processed_outputs = super()._call_hf_processor(
             prompt=prompt,
-            processor_data=processor_data,
-            mm_processor_kwargs=mm_processor_kwargs,
+            mm_data=mm_data,
+            mm_kwargs=mm_kwargs,
+        )
+
+        return processed_outputs
+
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        return dict(
+            input_features=MultiModalFieldConfig.batched("audio"),
+            feature_attention_mask=MultiModalFieldConfig.batched("audio"),
         )
 
     def _get_prompt_replacements(
         self,
         mm_items: MultiModalDataItems,
-        hf_inputs: BatchFeature,
-        mm_processor_kwargs: Mapping[str, object],
+        hf_processor_mm_kwargs: Mapping[str, object],
+        out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
         hf_config = self.ctx.get_hf_config(Qwen2AudioConfig)
         placeholder = hf_config.audio_token_index
 
-        feature_attention_mask = hf_inputs.get("feature_attention_mask")
+        feature_attention_mask = out_mm_kwargs.get("feature_attention_mask")
         if feature_attention_mask is None:
             audio_output_lengths = []
         else:
+            assert isinstance(feature_attention_mask, torch.Tensor)
             _, audio_output_lengths = _get_feat_extract_output_lengths(
                 feature_attention_mask.sum(-1))
 
@@ -168,14 +184,13 @@ def _get_dummy_mm_inputs(
         sampling_rate = feature_extractor.sampling_rate
         audio_len = feature_extractor.chunk_length * sampling_rate
 
-        audio_count = mm_counts["audio"]
+        audio_count = mm_counts.get("audio", 0)
         audio = np.zeros(audio_len)
         data = {"audio": [audio] * audio_count}
 
         return ProcessorInputs(
             prompt_text="<|AUDIO|>" * audio_count,
             mm_data=data,
-            mm_processor_kwargs={},
         )
 
 
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index fb97eb1916002..574845ef5a525 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -22,9 +22,10 @@
 # limitations under the License.
 """Inference-only Qwen2-VL model compatible with HuggingFace weights."""
 from functools import cached_property, partial
-from typing import (Any, Iterable, List, Literal, Mapping, Optional, Set,
-                    Tuple, Type, TypedDict, Union)
+from typing import (Any, Callable, Iterable, List, Literal, Mapping, Optional,
+                    Set, Tuple, Type, TypedDict, Union)
 
+import numpy as np
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -54,10 +55,11 @@
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import MultiModalDataDict, NestedTensors
+from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalDataItems,
+                                    MultiModalFieldConfig, MultiModalKwargs,
+                                    NestedTensors)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        MultiModalDataItems, ProcessorInputs,
-                                        PromptReplacement)
+                                        ProcessorInputs, PromptReplacement)
 from vllm.platforms import _Backend
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.config import uses_mrope
@@ -229,9 +231,9 @@ class Qwen2VisionAttention(nn.Module):
 
     def __init__(
         self,
-        embed_dim: Optional[int] = None,
-        num_heads: Optional[int] = None,
-        projection_size: Optional[int] = None,
+        embed_dim: int,
+        num_heads: int,
+        projection_size: int,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
     ) -> None:
@@ -264,7 +266,7 @@ def forward(
         self,
         x: torch.Tensor,
         cu_seqlens: torch.Tensor,
-        rotary_pos_emb: torch.Tensor = None,
+        rotary_pos_emb: torch.Tensor,
     ) -> torch.Tensor:
         # [s, b, c] --> [s, b, head * 3 * head_dim]
         x, _ = self.qkv(x)
@@ -347,7 +349,7 @@ def __init__(
         num_heads: int,
         mlp_ratio: float,
         act_layer: Type[nn.Module] = QuickGELU,
-        norm_layer: Type[nn.Module] = None,
+        norm_layer: Optional[Callable[[int], nn.Module]] = None,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
     ) -> None:
@@ -384,7 +386,7 @@ def __init__(
         self,
         patch_size: int = 14,
         temporal_patch_size: int = 2,
-        in_chans: int = 3,
+        in_channels: int = 3,
         embed_dim: int = 1152,
     ) -> None:
         super().__init__()
@@ -392,8 +394,8 @@ def __init__(
         self.temporal_patch_size = temporal_patch_size
         self.embed_dim = embed_dim
 
-        kernel_size = [temporal_patch_size, patch_size, patch_size]
-        self.proj = nn.Conv3d(in_chans,
+        kernel_size = (temporal_patch_size, patch_size, patch_size)
+        self.proj = nn.Conv3d(in_channels,
                               embed_dim,
                               kernel_size=kernel_size,
                               stride=kernel_size,
@@ -413,7 +415,7 @@ def __init__(
         self,
         d_model: int,
         context_dim: int,
-        norm_layer: Type[nn.Module] = None,
+        norm_layer: Optional[Callable[[int], nn.Module]] = None,
         spatial_merge_size: int = 2,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
@@ -489,15 +491,15 @@ def __init__(
     ) -> None:
         super().__init__()
 
-        patch_size: int = vision_config.patch_size
-        temporal_patch_size: int = vision_config.temporal_patch_size
-        spatial_merge_size: int = vision_config.spatial_merge_size
-        in_chans: int = vision_config.in_chans
-        hidden_size: int = vision_config.hidden_size
-        embed_dim: int = vision_config.embed_dim
-        depth: int = vision_config.depth
-        num_heads: int = vision_config.num_heads
-        mlp_ratio: float = vision_config.mlp_ratio
+        patch_size = vision_config.patch_size
+        temporal_patch_size = vision_config.temporal_patch_size
+        spatial_merge_size = vision_config.spatial_merge_size
+        in_channels = vision_config.in_channels
+        hidden_size = vision_config.hidden_size
+        embed_dim = vision_config.embed_dim
+        depth = vision_config.depth
+        num_heads = vision_config.num_heads
+        mlp_ratio = vision_config.mlp_ratio
 
         self.spatial_merge_size = spatial_merge_size
         self.num_heads = num_heads
@@ -506,7 +508,7 @@ def __init__(
         self.patch_embed = Qwen2VisionPatchEmbed(
             patch_size=patch_size,
             temporal_patch_size=temporal_patch_size,
-            in_chans=in_chans,
+            in_channels=in_channels,
             embed_dim=embed_dim,
         )
 
@@ -733,8 +735,12 @@ def from_dict(data: MultiModalDataDict) -> "MultiModalDataItems":
             if k == "video":
                 # Special case since even a single item can be a list
                 multi_data[k] = (  # type: ignore[index]
-                    v if (isinstance(v, (dict, torch.Tensor))  # type: ignore[assignment]
-                          or is_list_of(v, list)) else [v]
+                    v if (
+                        isinstance(v, (dict, torch.Tensor))  # type: ignore[assignment]
+                        or is_list_of(v, list)
+                        or isinstance(v[0], (np.ndarray, torch.Tensor))
+                           and v[0].ndim == 4
+                    ) else [v]
                 )
             elif k in ("image", "audio"):
                 multi_data[k] = (  # type: ignore[index]
@@ -754,6 +760,12 @@ def get_item_counts(self) -> Mapping[str, int]:
             for m, items in self.items()
         }
 
+    def has_embedding_inputs(self) -> bool:
+        return any(
+            isinstance(items, dict) or any(
+                isinstance(item, torch.Tensor) for item in items)
+            for items in self.values())
+
 
 class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor):
 
@@ -784,7 +796,7 @@ def _get_hf_processor(
 
         return hf_processor
 
-    def _get_processor_data(
+    def _get_hf_mm_data(
         self,
         mm_items: MultiModalDataItems,
     ) -> tuple[dict[str, Any], dict[str, Any]]:
@@ -805,7 +817,7 @@ def _get_processor_data(
                       and v[0].ndim == 2):
                     # Pass through embedding inputs (multi)
                     passthrough_data[f"{k}_embeds"] = v
-                else:
+                elif len(v) > 0:
                     # Map keys to plural form, e.g.: image -> images
                     processor_data[f"{k}s"] = v
             else:
@@ -816,8 +828,8 @@ def _get_processor_data(
     def _get_prompt_replacements(
         self,
         mm_items: MultiModalDataItems,
-        hf_inputs: BatchFeature,
-        mm_processor_kwargs: Mapping[str, object],
+        hf_processor_mm_kwargs: Mapping[str, object],
+        out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
         hf_processor = self._get_hf_processor()
         image_processor = _get_image_processor(hf_processor)
@@ -831,7 +843,9 @@ def _get_prompt_replacements(
         merge_length = image_processor.merge_size**2
 
         def get_replacement_qwen2vl(item_idx: int, modality: str):
-            grid_thw = hf_inputs[f"{modality}_grid_thw"][item_idx]
+            grid_thw = out_mm_kwargs[f"{modality}_grid_thw"][item_idx]
+            assert isinstance(grid_thw, torch.Tensor)
+
             num_tokens = grid_thw.prod() // merge_length
             return placeholder[modality] * num_tokens
 
@@ -844,11 +858,40 @@ def get_replacement_qwen2vl(item_idx: int, modality: str):
             ) for modality in ("image", "video")
         ]
 
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        image_grid_thw = hf_inputs.get("image_grid_thw", torch.empty((0, 3)))
+        image_slice_idxs = [0] + image_grid_thw.prod(-1).cumsum_(0).tolist()
+        image_slices = [
+            slice(image_slice_idxs[i], image_slice_idxs[i + 1])
+            for i in range(len(image_grid_thw))
+        ]
+
+        video_grid_thw = hf_inputs.get("video_grid_thw", torch.empty((0, 3)))
+        video_slice_idxs = [0] + video_grid_thw.prod(-1).cumsum_(0).tolist()
+        video_slices = [
+            slice(video_slice_idxs[i], video_slice_idxs[i + 1])
+            for i in range(len(video_grid_thw))
+        ]
+
+        return dict(
+            pixel_values=MultiModalFieldConfig.flat("image", image_slices),
+            image_embeds=MultiModalFieldConfig.flat("image", image_slices),
+            image_grid_thw=MultiModalFieldConfig.batched("image"),
+            pixel_values_videos=MultiModalFieldConfig.flat(
+                "video", video_slices),
+            video_embeds=MultiModalFieldConfig.flat("video", video_slices),
+            video_grid_thw=MultiModalFieldConfig.batched("video"),
+        )
+
     def _get_dummy_mm_inputs(
         self,
         mm_counts: Mapping[str, int],
     ) -> ProcessorInputs:
-        num_images = mm_counts["image"]
+        num_images = mm_counts.get("image", 0)
         hf_processor = self._get_hf_processor()
         image_token: str = hf_processor.image_token
         image_processor = _get_image_processor(hf_processor)
@@ -869,7 +912,6 @@ def _get_dummy_mm_inputs(
         return ProcessorInputs(
             prompt_text=image_token * num_images,
             mm_data=data,
-            mm_processor_kwargs={},
         )
 
 
@@ -950,9 +992,7 @@ def _maybe_ignore_quant_config(self, quant_config: QuantizationConfig):
             return None
         return quant_config
 
-    def _validate_and_reshape_mm_tensor(self,
-                                        mm_input: Union[torch.Tensor,
-                                                        List[torch.Tensor]],
+    def _validate_and_reshape_mm_tensor(self, mm_input: object,
                                         name: str) -> torch.Tensor:
         if not isinstance(mm_input, (torch.Tensor, list)):
             raise ValueError(f"Incorrect type of {name}. "
@@ -962,7 +1002,8 @@ def _validate_and_reshape_mm_tensor(self,
                 return mm_input
             if mm_input.ndim != 3:
                 raise ValueError(f"{name} should be 2D or batched 3D tensor. "
-                                 f"Got ndim: {mm_input.ndim}")
+                                 f"Got ndim: {mm_input.ndim} "
+                                 f"(shape={mm_input.shape})")
             return torch.concat(list(mm_input))
         else:
             return torch.concat(mm_input)
diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py
index 509ad9e580ddf..7b4aeeec5f403 100644
--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -23,10 +23,11 @@
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.model_loader.loader import DefaultModelLoader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.multimodal import MULTIMODAL_REGISTRY, NestedTensors
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.inputs import (MultiModalDataItems, MultiModalFieldConfig,
+                                    MultiModalKwargs, NestedTensors)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        MultiModalDataItems, ProcessorInputs,
-                                        PromptReplacement)
+                                        ProcessorInputs, PromptReplacement)
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.configs.ultravox import UltravoxConfig
 from vllm.utils import is_list_of
@@ -72,11 +73,19 @@ def get_ultravox_max_audio_tokens(ctx: InputContext):
 
 class UltravoxMultiModalProcessor(BaseMultiModalProcessor):
 
+    def _get_hf_processor(
+        self,
+        *,
+        # Ignored in initialization
+        sampling_rate: Optional[int] = None,
+    ) -> ProcessorMixin:
+        return self.ctx.get_hf_processor()
+
     def _get_feature_extractor(self) -> WhisperFeatureExtractor:
         hf_processor = self._get_hf_processor()
         return hf_processor.audio_processor.feature_extractor  # type: ignore
 
-    def _get_processor_data(
+    def _get_hf_mm_data(
         self,
         mm_items: MultiModalDataItems,
     ) -> tuple[dict[str, Any], dict[str, Any]]:
@@ -84,33 +93,41 @@ def _get_processor_data(
         feature_extractor = self._get_feature_extractor()
         mm_items.resample_audios(feature_extractor.sampling_rate)
 
-        return super()._get_processor_data(mm_items)
+        return super()._get_hf_mm_data(mm_items)
 
     def _call_hf_processor(
         self,
-        hf_processor: ProcessorMixin,
         prompt: str,
-        processor_data: Mapping[str, object],
-        mm_processor_kwargs: Mapping[str, object],
+        mm_data: Mapping[str, object],
+        mm_kwargs: Mapping[str, object],
     ) -> BatchFeature:
-        processor_data = dict(processor_data)
-        audios = processor_data.pop("audios", [])
+        # Text-only input not supported in composite processor
+        if not mm_data:
+            tokenizer = self._get_tokenizer()
+
+            prompt_ids = tokenizer.encode(
+                prompt,
+                add_special_tokens=False,  # type: ignore
+            )
+            return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt")
+
+        mm_data = dict(mm_data)
+        audios = mm_data.pop("audios", [])
 
         if not audios:
             return super()._call_hf_processor(
-                hf_processor,
                 prompt=prompt,
-                processor_data=processor_data,
-                mm_processor_kwargs=mm_processor_kwargs,
+                mm_data=mm_data,
+                mm_kwargs=mm_kwargs,
             )
 
         feature_extractor = self._get_feature_extractor()
-        mm_processor_kwargs = dict(
-            **mm_processor_kwargs,
+        mm_kwargs = dict(
+            **mm_kwargs,
             sampling_rate=feature_extractor.sampling_rate,
         )
 
-        # Already resampled by _get_processor_data
+        # Already resampled by _get_hf_mm_data
         assert is_list_of(audios, np.ndarray)
 
         # Ultravox processor doesn't support multiple inputs,
@@ -119,13 +136,12 @@ def _call_hf_processor(
         shared_outputs = {}
         for audio in audios:
             # NOTE: Ultravox processor accepts "audio" instead of "audios"
-            item_processor_data = dict(**processor_data, audio=audio)
+            item_processor_data = dict(**mm_data, audio=audio)
 
             item_outputs = super()._call_hf_processor(
-                hf_processor,
                 prompt=prompt,
-                processor_data=item_processor_data,
-                mm_processor_kwargs=mm_processor_kwargs,
+                mm_data=item_processor_data,
+                mm_kwargs=mm_kwargs,
             )
 
             audio_features.append(item_outputs.pop("audio_values")[0])
@@ -139,17 +155,28 @@ def _call_hf_processor(
         )
         return BatchFeature(combined_outputs)
 
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        return dict(
+            audio_features=MultiModalFieldConfig.batched("audio"),
+            audio_token_len=MultiModalFieldConfig.batched("audio"),
+            audio_embeds=MultiModalFieldConfig.batched("audio"),
+        )
+
     def _get_prompt_replacements(
         self,
         mm_items: MultiModalDataItems,
-        hf_inputs: BatchFeature,
-        mm_processor_kwargs: Mapping[str, object],
+        hf_processor_mm_kwargs: Mapping[str, object],
+        out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
         hf_processor = self._get_hf_processor()
         placeholder = hf_processor.audio_token_replacement  # type: ignore
 
         def get_replacement_ultravox(item_idx: int):
-            audio_token_len = hf_inputs["audio_token_len"][item_idx]
+            audio_token_len = out_mm_kwargs["audio_token_len"][item_idx]
             return placeholder * audio_token_len
 
         return [
@@ -168,14 +195,13 @@ def _get_dummy_mm_inputs(
         sampling_rate = feature_extractor.sampling_rate
         audio_len = feature_extractor.chunk_length * sampling_rate
 
-        audio_count = mm_counts["audio"]
+        audio_count = mm_counts.get("audio", 0)
         audio = np.zeros(audio_len)
         data = {"audio": [audio] * audio_count}
 
         return ProcessorInputs(
             prompt_text="<|audio|>" * audio_count,
             mm_data=data,
-            mm_processor_kwargs={},
         )
 
 
diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py
index 10488e24b30cc..cdda6f8052794 100644
--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
@@ -297,35 +297,37 @@ def from_seq_group(
         ``MultiModalPlaceholderMap`` that relates the multi-modal embedding
         vectors to their corresponding placeholders.
 
-        Consider the following scenarios:
+        Examples:
 
-           Prompt: |AAAA BBBB What's in these images?|
-        Positions: |.................................|
+        .. code-block::
 
-            images      = [A, B]
-            src_ranges  = [(0, 4), (4, 8)]
-            dest_ranges = [(0, 4), (5, 9)]
+            Prompt:    |AAAA BBBB What's in these images?|
+            Positions: |.................................|
 
-           Prompt: |AAAA BBBB What's in these images?|
-        Positions: |  .....                          |
+                images      = [A, B]
+                src_ranges  = [(0, 4), (4, 8)]
+                dest_ranges = [(0, 4), (5, 9)]
 
-            images      = [A, B]
-            src_ranges  = [(2, 4), (4, 6)]
-            dest_ranges = [(0, 2), (3, 5)]
+            Prompt:    |AAAA BBBB What's in these images?|
+            Positions: |  .....                          |
 
-           Prompt: |AAAA BBBB What's in these images?|
-        Positions: |     .........                   |
+                images      = [A, B]
+                src_ranges  = [(2, 4), (4, 6)]
+                dest_ranges = [(0, 2), (3, 5)]
 
-            images      = [B]
-            src_ranges  = [(0, 4)]
-            dest_ranges = [(0, 4)]
+            Prompt:    |AAAA BBBB What's in these images?|
+            Positions: |     .........                   |
 
-           Prompt: |AAAA BBBB What's in these images?|
-        Positions: |          .......................|
+                images      = [B]
+                src_ranges  = [(0, 4)]
+                dest_ranges = [(0, 4)]
 
-            images      = []
-            src_ranges  = []
-            dest_ranges = []
+            Prompt:    |AAAA BBBB What's in these images?|
+            Positions: |          .......................|
+
+                images      = []
+                src_ranges  = []
+                dest_ranges = []
         """
         seq_mm_data = seq_group.multi_modal_data
         seq_mm_placeholders = seq_group.multi_modal_placeholders
diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py
index 9ecae2c1ca2bf..1fbda6e0b8750 100644
--- a/vllm/multimodal/inputs.py
+++ b/vllm/multimodal/inputs.py
@@ -1,12 +1,16 @@
+from abc import ABC, abstractmethod
 from collections import UserDict, defaultdict
-from typing import (Any, Dict, List, Literal, Mapping, Sequence, Tuple,
-                    TypedDict, TypeVar, Union, cast, final)
+from collections.abc import Mapping, Sequence
+from dataclasses import dataclass
+from typing import (Any, Literal, NamedTuple, TypedDict, TypeVar, Union, cast,
+                    final)
 
 import numpy as np
 import torch
 import torch.types
 from PIL.Image import Image
-from typing_extensions import NotRequired, TypeAlias
+from transformers import BatchFeature
+from typing_extensions import NotRequired, TypeAlias, assert_never
 
 from vllm.utils import JSONTree, is_list_of, json_map_leaves
 
@@ -44,7 +48,7 @@
 """
 # yapf: enable
 
-MultiModalData: TypeAlias = Union[_T, List[_T]]
+MultiModalData: TypeAlias = Union[_T, list[_T]]
 """
 Either a single data item, or a list of data items.
 
@@ -79,13 +83,135 @@ class MultiModalDataBuiltins(TypedDict, total=False):
 """
 
 
+class ImageSize(NamedTuple):
+    width: int
+    height: int
+
+
+class MultiModalDataItems(UserDict[str, list[Any]]):
+    """
+    As :class:`MultiModalDataDict`, but normalized such that each entry
+    corresponds to a list.
+    """
+
+    @staticmethod
+    def from_dict(data: MultiModalDataDict) -> "MultiModalDataItems":
+        """
+        Normalize :class:`MultiModalDataDict` to :class:`MultiModalDataItems`.
+        """
+        multi_data = MultiModalDataItems()
+
+        for k, v in data.items():
+            # TODO: Make a separate modality for embedding inputs
+            # to avoid confusion
+            # yapf: disable
+            if k == "video":
+                # Special case since even a single item can be a list
+                multi_data[k] = (  # type: ignore[index]
+                    v if (
+                        isinstance(v, torch.Tensor)
+                        or is_list_of(v, list)
+                        or isinstance(v[0], (np.ndarray, torch.Tensor))
+                           and v[0].ndim == 4
+                    ) else [v]
+                )
+            elif k in ("image", "audio"):
+                multi_data[k] = (  # type: ignore[index]
+                    v if isinstance(v, (torch.Tensor, list)) else [v]
+                )
+            else:
+                multi_data[k] = v if isinstance(v, list) else [v]  # type: ignore[index]
+            # yapf: enable
+
+        return multi_data
+
+    # NOTE: When a field (e.g. `images`) doesn't exist, directly appending to
+    # `self.images` doesn't update this dictionary, which may be confusing
+    # We annotate the getter methods as `Sequence` to prevent others from
+    # trying to update the list in this way
+    @property
+    def images(self) -> Sequence[ImageItem]:
+        return self.get("image", [])
+
+    @property
+    def videos(self) -> Sequence[VideoItem]:
+        return self.get("video", [])
+
+    @property
+    def audios(self) -> Sequence[AudioItem]:
+        return self.get("audio", [])
+
+    def get_item_counts(self) -> Mapping[str, int]:
+        return {m: len(items) for m, items in self.items()}
+
+    def has_embedding_inputs(self) -> bool:
+        return any(
+            any(isinstance(item, torch.Tensor) for item in items)
+            for items in self.values())
+
+    def get_image_size(self, item_idx: int) -> ImageSize:
+        image = self.images[item_idx]
+
+        if isinstance(image, Image):
+            return ImageSize(*image.size)
+        if isinstance(image, (np.ndarray, torch.Tensor)):
+            _, h, w = image.shape
+            return ImageSize(w, h)
+
+        assert_never(image)
+
+    def get_audio_with_sr(
+        self,
+        item_idx: int,
+        *,
+        default_sr: float,
+    ) -> tuple[np.ndarray, float]:
+        audio = self.audios[item_idx]
+
+        if isinstance(audio, tuple):
+            return audio
+        if isinstance(audio, list):
+            return np.array(audio), default_sr
+        if isinstance(audio, np.ndarray):
+            return audio, default_sr
+
+        assert_never(audio)
+
+    def resample_audios(self, new_sr: float, *, drop_sr: bool = True) -> None:
+        """
+        If :code:`drop_sr=True`, the audio items in this dictionary are updated
+        to be NumPy arrays which implicitly means that their sampling rate is
+        the same as the model's expected sampling rate; otherwise, they remain
+        as :code:`(audio, new_sr)` tuples.
+        """
+        # Avoid circular import
+        from .audio import resample_audio
+
+        if not self.audios:
+            return
+
+        new_audios = []
+        for item_idx in range(len(self.audios)):
+            audio, sr = self.get_audio_with_sr(item_idx, default_sr=new_sr)
+            audio = resample_audio(audio, orig_sr=sr, target_sr=new_sr)
+
+            new_audios.append(audio if drop_sr else (audio, new_sr))
+
+        self["audio"] = new_audios
+
+
 class PlaceholderRange(TypedDict):
     """
     Placeholder location information for multi-modal data.
 
-    For example:
-        Prompt: AAAA BBBB What is in these images?
+    Example:
+
+        Prompt: :code:`AAAA BBBB What is in these images?`
+
         Images A and B will have:
+
+        .. code-block::
+
             A: { "offset": 0, "length": 4 }
             B: { "offset": 5, "length": 4 }
     """
@@ -97,25 +223,256 @@ class PlaceholderRange(TypedDict):
     """The length of the placeholder."""
 
 
-NestedTensors = Union[List["NestedTensors"], List[torch.Tensor], torch.Tensor,
-                      Tuple[torch.Tensor, ...]]
+NestedTensors = Union[list["NestedTensors"], list[torch.Tensor], torch.Tensor,
+                      tuple[torch.Tensor, ...]]
 """
 Uses a list instead of a tensor if the dimensions of each element do not match.
 """
 
-BatchedTensorInputs: TypeAlias = Dict[str, NestedTensors]
+
+def nested_tensors_equal(a: NestedTensors, b: NestedTensors) -> bool:
+    """Equality check between :data:`NestedTensors` objects."""
+    if isinstance(a, torch.Tensor):
+        return isinstance(b, torch.Tensor) and bool((a == b).all().item())
+    elif isinstance(b, torch.Tensor):
+        return isinstance(a, torch.Tensor) and bool((b == a).all().item())
+
+    if isinstance(a, list):
+        return (isinstance(b, list)
+                and all(nested_tensors_equal(a_, b_) for a_, b_ in zip(a, b)))
+    if isinstance(b, list):
+        return (isinstance(a, list)
+                and all(nested_tensors_equal(b_, a_) for b_, a_ in zip(b, a)))
+
+    # Both a and b are scalars
+    return a == b
+
+
+BatchedTensorInputs: TypeAlias = Mapping[str, NestedTensors]
 """
 A dictionary containing nested tensors which have been batched via
 :meth:`MultiModalKwargs.batch`.
 """
 
 
+@dataclass(frozen=True)
+class MultiModalFieldItem:
+    """
+    Contains metadata and data in :class:`MultiModalKwargs`
+    corresponding to a data item in :class:`MultiModalDataItems`.
+    """
+    field: "BaseMultiModalField"
+    data: NestedTensors
+
+    def __eq__(self, other: object) -> bool:
+        if not isinstance(other, self.__class__):
+            return False
+
+        return (self.field == other.field
+                and nested_tensors_equal(self.data, other.data))
+
+
+@dataclass(frozen=True)
+class BaseMultiModalField(ABC):
+    """Abstract base class for a field in :class:`MultiModalKwargs`."""
+    key: str
+    modality: str
+
+    @abstractmethod
+    def _reduce_data(self, batch: list[NestedTensors]) -> NestedTensors:
+        raise NotImplementedError
+
+    def _build_item(self, data: NestedTensors) -> MultiModalFieldItem:
+        return MultiModalFieldItem(self, data)
+
+    def reduce(self, batch: list[MultiModalFieldItem]) -> MultiModalFieldItem:
+        """Merge multiple instances of :class:`MultiModalFieldItem` together."""
+        fields = [item.field for item in batch]
+        if len(set(fields)) > 1:
+            raise ValueError(f"Cannot merge different {fields=}")
+
+        data = self._reduce_data([item.data for item in batch])
+
+        return self._build_item(data)
+
+
+@dataclass(frozen=True)
+class MultiModalBatchedField(BaseMultiModalField):
+    """
+    A :class:`BaseMultiModalField` implementation where an item is obtained by
+    directly indexing into the first dimension of the underlying data.
+    """
+
+    def build_items(self, batch: NestedTensors) -> list[MultiModalFieldItem]:
+        return [self._build_item(item) for item in batch]
+
+    def _reduce_data(self, batch: list[NestedTensors]) -> NestedTensors:
+        if len(batch) > 0 and is_list_of(batch, torch.Tensor, check="all"):
+            first_shape = batch[0].shape
+            if all(item.shape == first_shape for item in batch):
+                return torch.stack(batch)
+
+        return batch
+
+
+@dataclass(frozen=True)
+class MultiModalFlatField(BaseMultiModalField):
+    """
+    A :class:`BaseMultiModalField` implementation where an item is obtained by
+    slicing along the first dimension of the underlying data.
+    """
+
+    def build_items(
+        self,
+        batch: NestedTensors,
+        slices: Sequence[slice],
+    ) -> list[MultiModalFieldItem]:
+        return [self._build_item(batch[slice_]) for slice_ in slices]
+
+    def _reduce_data(self, batch: list[NestedTensors]) -> NestedTensors:
+        if len(batch) > 0 and is_list_of(batch, torch.Tensor, check="all"):
+            first_shape = batch[0].shape
+            if all(item.shape[1:] == first_shape[1:] for item in batch):
+                return torch.concat(batch)
+
+        return [elem for item in batch for elem in item]
+
+
+class MultiModalFieldConfig:
+
+    @staticmethod
+    def batched(modality: str):
+        return MultiModalFieldConfig(
+            field_cls=MultiModalBatchedField,
+            modality=modality,
+        )
+
+    @staticmethod
+    def flat(modality: str, slices: Sequence[slice]):
+        return MultiModalFieldConfig(
+            field_cls=MultiModalFlatField,
+            modality=modality,
+            slices=slices,
+        )
+
+    def __init__(
+        self,
+        field_cls: type[BaseMultiModalField],
+        modality: str,
+        **field_config: Any,
+    ) -> None:
+        super().__init__()
+
+        self._field_cls = field_cls
+        self._modality = modality
+        self._field_config = field_config
+
+    def build_items(
+        self,
+        key: str,
+        batch: NestedTensors,
+    ) -> list[MultiModalFieldItem]:
+        field = self._field_cls(key=key, modality=self._modality)
+        return field.build_items(batch, **self._field_config)  # type: ignore
+
+
 class MultiModalKwargs(UserDict[str, NestedTensors]):
     """
     A dictionary that represents the keyword arguments to
     :meth:`~torch.nn.Module.forward`.
+
+    The metadata :code:`items_by_key` defines how to split batched keyword
+    arguments corresponding to each data item in :class:`MultiModalDataItems`:
+
+    - For a keyword argument, we can access the :code:`i` th item in the batch
+      via :code:`items_by_key[key][i]`.
+    - We can gather the keyword arguments belonging to a modality by finding
+      the keys with items that belong to that modality, then accessing
+      the :code:`i` th item in the batch for each such key.
+
+    Example:
+
+        .. code-block:: python
+
+            # All items belong to the "image" modality
+            items_by_key={
+                "pixel_values": [a, b, c, d],  # "image" modality
+                "image_grid_thw": [e, f, g, h],  # "image" modality
+                "pixel_values_video": [h, i, j],  # "video" modality
+                "video_grid_thw": [k, l, m],  # "video" modality
+            }
+
+        - The keyword arguments belonging to the first image are
+          :code:`{"pixel_values": a, "image_grid_thw": e}`.
+        - The keyword arguments belonging to the second video are
+          :code:`{"pixel_values_video": i, "video_grid_thw": l}`.
     """
 
+    @staticmethod
+    def from_hf_inputs(
+        hf_inputs: BatchFeature,
+        config_by_key: Mapping[str, MultiModalFieldConfig],
+        *,
+        enable_sanity_checks: bool = False,
+    ):
+        # NOTE: This skips fields in `hf_inputs` that are not in `config_by_key`
+        # We assume that those fields are not used in vLLM
+        items_by_key = {
+            key: config.build_items(key, batch)
+            for key, config in config_by_key.items()
+            if (batch := hf_inputs.get(key)) is not None
+        }
+
+        return MultiModalKwargs.from_items_by_key(
+            items_by_key,
+            enable_sanity_checks=enable_sanity_checks,
+        )
+
+    @staticmethod
+    def from_items_by_key(
+        items_by_key: Mapping[str, list[MultiModalFieldItem]],
+        *,
+        enable_sanity_checks: bool = False,
+    ) -> "MultiModalKwargs":
+        data = {
+            key: items[0].field.reduce(items).data
+            for key, items in items_by_key.items()
+        }
+
+        return MultiModalKwargs(data,
+                                items_by_key=items_by_key,
+                                enable_sanity_checks=enable_sanity_checks)
+
+    def __init__(
+        self,
+        data: Mapping[str, NestedTensors],
+        *,
+        items_by_key: Mapping[str, list[MultiModalFieldItem]] = {},
+        enable_sanity_checks: bool = False,
+    ) -> None:
+        super().__init__(data)
+
+        # Shallow copy to avoid footgun in case a defaultdict is passed in
+        self._items_by_key = dict(items_by_key)
+
+        keys_by_modality = defaultdict[str, set[str]](set)
+        for key, items in items_by_key.items():
+            for item in items:
+                keys_by_modality[item.field.modality].add(key)
+
+        self._keys_by_modality = dict(keys_by_modality)
+
+        if enable_sanity_checks:
+            for modality, keys in keys_by_modality.items():
+                items_in_modality = {k: items_by_key[k] for k in keys}
+                batch_sizes = {k: len(v) for k, v in items_in_modality.items()}
+                batch_size = next(iter(batch_sizes.values()), 0)
+                assert all(bs == batch_size
+                           for bs in batch_sizes.values()), dict(
+                               modality=modality,
+                               batch_sizes=batch_sizes,
+                               items_by_key=items_by_key)
+
     @staticmethod
     def _try_stack(nested_tensors: NestedTensors) -> NestedTensors:
         """
@@ -139,7 +496,7 @@ def _try_stack(nested_tensors: NestedTensors) -> NestedTensors:
             # Only tensors (not lists) can be stacked.
             return stacked
 
-        tensors_ = cast(List[torch.Tensor], stacked)
+        tensors_ = cast(list[torch.Tensor], stacked)
         if any(t.shape != tensors_[0].shape for t in tensors_):
             # The tensors have incompatible shapes and can't be stacked.
             return tensors_
@@ -147,7 +504,7 @@ def _try_stack(nested_tensors: NestedTensors) -> NestedTensors:
         return torch.stack(tensors_)
 
     @staticmethod
-    def batch(inputs_list: List["MultiModalKwargs"]) -> BatchedTensorInputs:
+    def batch(inputs_list: list["MultiModalKwargs"]) -> BatchedTensorInputs:
         """
         Batch multiple inputs together into a dictionary.
 
@@ -162,7 +519,7 @@ def batch(inputs_list: List["MultiModalKwargs"]) -> BatchedTensorInputs:
 
         # We need to consider the case where each item in the batch
         # contains different modalities (i.e. different keys).
-        item_lists: Dict[str, List[NestedTensors]] = defaultdict(list)
+        item_lists = defaultdict[str, list[NestedTensors]](list)
 
         for inputs in inputs_list:
             for k, v in inputs.items():
@@ -188,6 +545,57 @@ def as_kwargs(
 
         return cast(BatchedTensorInputs, json_mapped)
 
+    def __eq__(self, other: object) -> bool:
+        if not isinstance(other, self.__class__):
+            return False
+        if self._items_by_key != other._items_by_key:
+            return False
+
+        ks = self.keys()
+        return (ks == other.keys()
+                and all(nested_tensors_equal(self[k], other[k]) for k in ks))
+
+    def get_item(self, key: str, item_index: int) -> MultiModalFieldItem:
+        return self._items_by_key[key][item_index]
+
+    def get_items_by_modality(
+        self,
+        modality: str,
+        item_index: int,
+    ) -> Mapping[str, MultiModalFieldItem]:
+        """
+        Get the keyword arguments corresponding to an item identified by
+        its modality and index.
+        """
+        keys_to_gather = self._keys_by_modality[modality]
+
+        return {
+            key: self.get_item(key, item_index)
+            for key in keys_to_gather if key in self
+        }
+
+    @staticmethod
+    def from_items_by_modality(
+        items_by_modality: Mapping[str, list[Mapping[str,
+                                                     MultiModalFieldItem]]],
+        *,
+        enable_sanity_checks: bool = False,
+    ) -> "MultiModalKwargs":
+        """
+        Construct a new :class:`MultiModalKwargs` from multiple items returned
+        by :meth:`get_fields_by_modality`.
+        """
+        items_by_key = defaultdict[str, list[MultiModalFieldItem]](list)
+        for fields in items_by_modality.values():
+            for field in fields:
+                for k, v in field.items():
+                    items_by_key[k].append(v)
+
+        return MultiModalKwargs.from_items_by_key(
+            items_by_key,
+            enable_sanity_checks=enable_sanity_checks,
+        )
+
 
 MultiModalPlaceholderDict = Mapping[str, Sequence[PlaceholderRange]]
 """
@@ -207,16 +615,16 @@ class MultiModalInputsV2(TypedDict):
     prompt: str
     """The processed prompt text."""
 
-    prompt_token_ids: List[int]
+    prompt_token_ids: list[int]
     """The processed token IDs which includes placeholder tokens."""
 
-    token_type_ids: NotRequired[List[int]]
+    token_type_ids: NotRequired[list[int]]
     """The token type IDs of the prompt."""
 
     mm_kwargs: MultiModalKwargs
     """Keyword arguments to be directly passed to the model after batching."""
 
-    mm_hashes: NotRequired[List[str]]
+    mm_hashes: NotRequired[list[str]]
     """The hashes of the multi-modal data."""
 
     mm_placeholders: MultiModalPlaceholderDict
diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index 6baf19d675d50..3ece0762e3228 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -1,6 +1,6 @@
+import pickle
 import re
 from abc import ABC, abstractmethod
-from collections import UserDict
 from collections.abc import Callable, ItemsView, Iterable, Mapping, Sequence
 from dataclasses import dataclass, field
 from functools import lru_cache
@@ -8,19 +8,18 @@
 
 import numpy as np
 import torch
+from blake3 import blake3
 from PIL.Image import Image
 from transformers import BatchFeature, ProcessorMixin
-from typing_extensions import assert_never
 
 from vllm.inputs import DummyData, InputProcessingContext
 from vllm.logger import init_logger
 from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
-from vllm.utils import flatten_2d_lists, full_groupby, is_list_of
+from vllm.utils import LRUCache, flatten_2d_lists, full_groupby, is_list_of
 
-from .audio import resample_audio
-from .inputs import (AudioItem, ImageItem, MultiModalDataDict,
-                     MultiModalInputsV2, MultiModalKwargs, PlaceholderRange,
-                     VideoItem)
+from .inputs import (MultiModalDataDict, MultiModalDataItems,
+                     MultiModalFieldConfig, MultiModalFieldItem,
+                     MultiModalInputsV2, MultiModalKwargs, PlaceholderRange)
 
 logger = init_logger(__name__)
 
@@ -201,111 +200,6 @@ def get_replacement(self, item_idx: int) -> _BoundPromptSequence:
         return bound_replacement
 
 
-class ImageSize(NamedTuple):
-    width: int
-    height: int
-
-
-class MultiModalDataItems(UserDict[str, list[Any]]):
-    """
-    As :class:`MultiModalDataDict`, but normalized such that each entry
-    corresponds to a list.
-    """
-
-    @staticmethod
-    def from_dict(data: MultiModalDataDict) -> "MultiModalDataItems":
-        """
-        Normalize :class:`MultiModalDataDict` to :class:`MultiModalDataItems`.
-        """
-        multi_data = MultiModalDataItems()
-
-        for k, v in data.items():
-            # TODO: Make a separate modality for embedding inputs
-            # to avoid confusion
-            # yapf: disable
-            if k == "video":
-                # Special case since even a single item can be a list
-                multi_data[k] = (  # type: ignore[index]
-                    v if (isinstance(v, torch.Tensor)
-                          or is_list_of(v, list)) else [v]
-                )
-            elif k in ("image", "audio"):
-                multi_data[k] = (  # type: ignore[index]
-                    v if isinstance(v, (torch.Tensor, list)) else [v]
-                )
-            else:
-                multi_data[k] = v if isinstance(v, list) else [v]  # type: ignore[index]
-            # yapf: enable
-
-        return multi_data
-
-    # NOTE: When a field (e.g. `images`) doesn't exist, directly appending to
-    # `self.images` doesn't update this dictionary, which may be confusing
-    # We annotate the getter methods as `Sequence` to prevent others from
-    # trying to update the list in this way
-    @property
-    def images(self) -> Sequence[ImageItem]:
-        return self.get("image", [])
-
-    @property
-    def videos(self) -> Sequence[VideoItem]:
-        return self.get("video", [])
-
-    @property
-    def audios(self) -> Sequence[AudioItem]:
-        return self.get("audio", [])
-
-    def get_item_counts(self) -> Mapping[str, int]:
-        return {m: len(items) for m, items in self.items()}
-
-    def get_image_size(self, item_idx: int) -> ImageSize:
-        image = self.images[item_idx]
-
-        if isinstance(image, Image):
-            return ImageSize(*image.size)
-        if isinstance(image, (np.ndarray, torch.Tensor)):
-            _, h, w = image.shape
-            return ImageSize(w, h)
-
-        assert_never(image)
-
-    def get_audio_with_sr(
-        self,
-        item_idx: int,
-        *,
-        default_sr: float,
-    ) -> tuple[np.ndarray, float]:
-        audio = self.audios[item_idx]
-
-        if isinstance(audio, tuple):
-            return audio
-        if isinstance(audio, list):
-            return np.array(audio), default_sr
-        if isinstance(audio, np.ndarray):
-            return audio, default_sr
-
-        assert_never(audio)
-
-    def resample_audios(self, new_sr: float, *, drop_sr: bool = True) -> None:
-        """
-        If :code:`drop_sr=True`, the audio items in this dictionary are updated
-        to be NumPy arrays which implicitly means that their sampling rate is
-        the same as the model's expected sampling rate; otherwise, they remain
-        as :code:`(audio, new_sr)` tuples.
-        """
-        if not self.audios:
-            return
-
-        new_audios = []
-        for item_idx in range(len(self.audios)):
-            audio, sr = self.get_audio_with_sr(item_idx, default_sr=new_sr)
-            audio = resample_audio(audio, orig_sr=sr, target_sr=new_sr)
-
-            new_audios.append(audio if drop_sr else (audio, new_sr))
-
-        self["audio"] = new_audios
-
-
 class _TokenMatch(NamedTuple):
     start_idx: int
     end_idx: int
@@ -583,11 +477,124 @@ def iter_placeholders(
             )
 
 
-class ProcessorInputs(NamedTuple):
-    """Keyword arguments to :meth:`BaseMultiModalProcessor`"""
+@dataclass
+class ProcessorInputs:
+    """Keyword arguments to :meth:`BaseMultiModalProcessor`."""
     prompt_text: str
     mm_data: MultiModalDataDict
-    mm_processor_kwargs: Mapping[str, object]
+    hf_processor_mm_kwargs: Mapping[str, object] = field(default_factory=dict)
+
+
+class ProcessingCache:
+
+    def __init__(self, capacity: int) -> None:
+        super().__init__()
+
+        # DEBUG: Set to None to disable
+        self.debug_cache_hit_ratio_steps: Optional[int] = None
+
+        self._cache = LRUCache[str, Mapping[str,
+                                            MultiModalFieldItem]](capacity)
+
+    def _maybe_log_cache_stats(self) -> None:
+        steps = self.debug_cache_hit_ratio_steps
+        if not steps:
+            return
+
+        cache_stats = self._cache.stat()
+        if cache_stats.total % steps == 0:
+            logger.debug("ProcessingCache: hit_ratio = %.2f",
+                         cache_stats.hit_ratio)
+
+    def _serialize_item(self, obj: object) -> bytes:
+        # Simple cases
+        if isinstance(obj, str):
+            return obj.encode("utf-8")
+        if isinstance(obj, bytes):
+            return obj
+        if isinstance(obj, Image):
+            return obj.tobytes()
+
+        # Convertible to NumPy arrays
+        if isinstance(obj, torch.Tensor):
+            obj = obj.numpy()
+        if isinstance(obj, (int, float)):
+            obj = np.array(obj)
+        if isinstance(obj, np.ndarray):
+            return obj.tobytes()
+
+        logger.warning(
+            "No serialization method found for %s. "
+            "Falling back to pickle.", type(obj))
+
+        return pickle.dumps(obj)
+
+    def _item_to_bytes(
+        self,
+        key: str,
+        obj: object,
+    ) -> Iterable[tuple[bytes, bytes]]:
+        # Recursive cases
+        if isinstance(obj, (list, tuple)):
+            for i, elem in enumerate(obj):
+                yield from self._item_to_bytes(f"{key}.{i}", elem)
+        elif isinstance(obj, dict):
+            for k, v in obj.items():
+                yield from self._item_to_bytes(f"{key}.{k}", v)
+        else:
+            key_bytes = self._serialize_item(key)
+            value_bytes = self._serialize_item(obj)
+            yield key_bytes, value_bytes
+
+    def _hash_kwargs(self, **kwargs: object) -> str:
+        hasher = blake3()
+
+        for k, v in kwargs.items():
+            for k_bytes, v_bytes in self._item_to_bytes(k, v):
+                hasher.update(k_bytes)
+                hasher.update(v_bytes)
+
+        return hasher.hexdigest()
+
+    def get(
+        self,
+        model_id: str,
+        modality: str,
+        input_item: object,
+        input_kwargs: Mapping[str, object],
+    ) -> Optional[Mapping[str, MultiModalFieldItem]]:
+        """
+        Get a processed multi-modal item from the cache
+        according to its dependencies, including:
+
+        - The model ID
+        - The modality of the item
+        - The original data item passed to the HF processor
+        - The configuration options of the HF processor
+        """
+        self._maybe_log_cache_stats()
+
+        cache_key = self._hash_kwargs(model_id=model_id,
+                                      **{modality: input_item},
+                                      **input_kwargs)
+        return self._cache.get(cache_key)
+
+    def put(
+        self,
+        model_id: str,
+        modality: str,
+        input_item: object,
+        input_kwargs: Mapping[str, object],
+        output_kwargs: Mapping[str, MultiModalFieldItem],
+    ) -> None:
+        """
+        Put a processed multi-modal item into the cache
+        according to its dependencies (see :meth:`get`).
+        """
+        cache_key = self._hash_kwargs(model_id=model_id,
+                                      **{modality: input_item},
+                                      **input_kwargs)
+        self._cache.put(cache_key, output_kwargs)
 
 
 class BaseMultiModalProcessor(ABC):
@@ -595,18 +602,24 @@ class BaseMultiModalProcessor(ABC):
     Abstract base class to process multi-modal inputs to be used in vLLM.
     """
 
-    def __init__(self, ctx: InputProcessingContext) -> None:
+    def __init__(self,
+                 ctx: InputProcessingContext,
+                 *,
+                 cache: Optional[ProcessingCache] = None,
+                 enable_sanity_checks: bool = True) -> None:
         super().__init__()
 
         self.ctx = ctx
+        self.cache = cache
+        self.enable_sanity_checks = enable_sanity_checks
 
     def __call__(
         self,
         prompt: str,
         mm_data: MultiModalDataDict,
-        mm_processor_kwargs: Mapping[str, object],
+        hf_processor_mm_kwargs: Mapping[str, object],
     ) -> MultiModalInputsV2:
-        return self.apply(prompt, mm_data, mm_processor_kwargs)
+        return self.apply(prompt, mm_data, hf_processor_mm_kwargs)
 
     def _get_hf_processor(self) -> ProcessorMixin:
         """
@@ -624,12 +637,21 @@ def _get_mm_items(
     ) -> MultiModalDataItems:
         return MultiModalDataItems.from_dict(mm_data)
 
+    @abstractmethod
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        """Given the HF-processed data, output the metadata of each field."""
+        raise NotImplementedError
+
     @abstractmethod
     def _get_prompt_replacements(
         self,
         mm_items: MultiModalDataItems,
-        hf_inputs: BatchFeature,
-        mm_processor_kwargs: Mapping[str, object],
+        hf_processor_mm_kwargs: Mapping[str, object],
+        out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
         """
         Given the original multi-modal items for this modality
@@ -651,7 +673,7 @@ def _find_placeholders(
         return list(
             iter_placeholders(all_prompt_repls, new_token_ids, mm_item_counts))
 
-    def _get_processor_data(
+    def _get_hf_mm_data(
         self,
         mm_items: MultiModalDataItems,
     ) -> tuple[dict[str, Any], dict[str, Any]]:
@@ -669,7 +691,7 @@ def _get_processor_data(
                       and v[0].ndim == 2):
                     # Pass through embedding inputs (multi)
                     passthrough_data[f"{k}_embeds"] = v
-                else:
+                elif len(v) > 0:
                     # Map keys to plural form, e.g.: image -> images
                     processor_data[f"{k}s"] = v
             else:
@@ -679,39 +701,181 @@ def _get_processor_data(
 
     def _call_hf_processor(
         self,
-        hf_processor: ProcessorMixin,
         prompt: str,
-        processor_data: Mapping[str, object],
-        mm_processor_kwargs: Mapping[str, object],
+        # Not to be confused with `mm_data` in `self.apply`.
+        # This refers to the data to be passed to HF processor.
+        mm_data: Mapping[str, object],
+        mm_kwargs: Mapping[str, object],
     ) -> BatchFeature:
         return self.ctx.call_hf_processor(
-            hf_processor,
-            prompt,
-            processor_data,
-            mm_processor_kwargs,
+            self._get_hf_processor(**mm_kwargs),
+            dict(text=prompt, **mm_data),
+            mm_kwargs,
         )
 
     def _apply_hf_processor(
         self,
-        prompt: str,
+        prompt_text: str,
         mm_items: MultiModalDataItems,
-        mm_processor_kwargs: Mapping[str, object],
-    ) -> BatchFeature:
-        # some mm_processor_kwargs may be used in processor initialization
-        # instead of processor call
-        hf_processor = self._get_hf_processor(**mm_processor_kwargs)
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> tuple[list[int], MultiModalKwargs]:
+        """
+        Apply the HF processor on the full prompt text and multi-modal data.
+        """
+        processor_data, passthrough_data = self._get_hf_mm_data(mm_items)
+
+        processed_data = self._call_hf_processor(
+            prompt=prompt_text,
+            mm_data=processor_data,
+            mm_kwargs=hf_processor_mm_kwargs,
+        )
+        processed_data.update(passthrough_data)
 
-        processor_data, passthrough_data = self._get_processor_data(mm_items)
+        prompt_ids, = processed_data.pop("input_ids").tolist()
 
-        hf_inputs = self._call_hf_processor(
-            hf_processor,
-            prompt=prompt,
-            processor_data=processor_data,
-            mm_processor_kwargs=mm_processor_kwargs,
+        mm_kwargs = MultiModalKwargs.from_hf_inputs(
+            processed_data,
+            self._get_mm_fields_config(processed_data, hf_processor_mm_kwargs),
+            enable_sanity_checks=self.enable_sanity_checks,
         )
-        hf_inputs.update(passthrough_data)
 
-        return hf_inputs
+        return prompt_ids, mm_kwargs
+
+    def _apply_hf_processor_missing(
+        self,
+        prompt_text: str,
+        mm_missing_data_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ):
+        """
+        Apply the HF processor on the full prompt text, but only on the
+        multi-modal data that are missing from the cache.
+
+        Note: We pass prompt text and multi-modal data into the HF processor
+        in separate calls to avoid HF prompt replacement being done for
+        cached items; instead, we rely on our own prompt replacement logic
+        for the full text.
+        """
+        mm_missing_counts = mm_missing_data_items.get_item_counts()
+
+        prompt_ids, _ = self._apply_hf_processor(
+            prompt_text=prompt_text,
+            mm_items=MultiModalDataItems({}),
+            hf_processor_mm_kwargs={},
+        )
+
+        # Some HF processors (e.g. Qwen2-VL) expect corresponding
+        # multi-modal tokens to be in the prompt text
+        dummy_inputs = self._get_dummy_mm_inputs(mm_missing_counts)
+
+        _, mm_missing_kwargs = self._apply_hf_processor(
+            prompt_text=dummy_inputs.prompt_text,
+            mm_items=mm_missing_data_items,
+            hf_processor_mm_kwargs=hf_processor_mm_kwargs,
+        )
+
+        return prompt_ids, mm_missing_kwargs
+
+    def _cached_apply_hf_processor(
+        self,
+        prompt_text: str,
+        mm_data_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> tuple[list[int], MultiModalKwargs]:
+        """
+        Apply the HF processor on the full prompt text,
+        caching the results and reusing cached results.
+        """
+        cache = self.cache
+        model_id = self.ctx.model_config.model
+
+        if cache is None or mm_data_items.has_embedding_inputs():
+            return self._apply_hf_processor(
+                prompt_text=prompt_text,
+                mm_items=mm_data_items,
+                hf_processor_mm_kwargs=hf_processor_mm_kwargs,
+            )
+
+        mm_maybe_cached_field_items = {
+            modality: [
+                cache.get(model_id, modality, item, hf_processor_mm_kwargs)
+                for item in items
+            ]
+            for modality, items in mm_data_items.items()
+        }
+
+        mm_missing_idxs = {
+            modality: [idx for idx, out in enumerate(fields) if out is None]
+            for modality, fields in mm_maybe_cached_field_items.items()
+        }
+        mm_missing_data = {
+            modality: [mm_data_items[modality][idx] for idx in idxs]
+            for modality, idxs in mm_missing_idxs.items()
+        }
+        mm_missing_data_items = self._get_mm_items(mm_missing_data)
+
+        prompt_ids, mm_missing_kwargs = self._apply_hf_processor_missing(
+            prompt_text=prompt_text,
+            mm_missing_data_items=mm_missing_data_items,
+            hf_processor_mm_kwargs=hf_processor_mm_kwargs,
+        )
+
+        mm_missing_next_idx = {
+            modality: 0
+            for modality in mm_missing_data_items
+        }
+
+        mm_merged_field_items = dict[str, list[Mapping[str,
+                                                       MultiModalFieldItem]]]()
+        for modality, modal_items_lst in mm_maybe_cached_field_items.items():
+            merged_modal_items_lst = list[Mapping[str, MultiModalFieldItem]]()
+
+            for idx, modal_items in enumerate(modal_items_lst):
+                if modal_items is None:
+                    modal_items = mm_missing_kwargs.get_items_by_modality(
+                        modality,
+                        mm_missing_next_idx[modality],
+                    )
+
+                    cache.put(
+                        model_id,
+                        modality,
+                        mm_data_items[modality][idx],
+                        hf_processor_mm_kwargs,
+                        modal_items,
+                    )
+
+                    mm_missing_next_idx[modality] += 1
+
+                merged_modal_items_lst.append(modal_items)
+
+            mm_merged_field_items[modality] = merged_modal_items_lst
+
+        if self.enable_sanity_checks:
+            mm_missing_counts = mm_missing_data_items.get_item_counts()
+            assert all(
+                item_count == mm_missing_counts[modality]
+                for modality, item_count in mm_missing_next_idx.items()), dict(
+                    mm_missing_next_idx=mm_missing_next_idx,
+                    mm_missing_counts=mm_missing_counts)
+
+        mm_kwargs = MultiModalKwargs.from_items_by_modality(
+            mm_merged_field_items,
+            enable_sanity_checks=self.enable_sanity_checks,
+        )
+
+        if self.enable_sanity_checks:
+            mm_item_counts = mm_data_items.get_item_counts()
+
+            for modality, item_count in mm_item_counts.items():
+                for item_idx in range(item_count):
+                    try:
+                        mm_kwargs.get_items_by_modality(modality, item_idx)
+                    except Exception as e:
+                        # Make it easy to set a breakpoint in the debugger
+                        raise e
+
+        return prompt_ids, mm_kwargs
 
     def _bind_prompt_replacements(
         self,
@@ -730,6 +894,10 @@ def _apply_prompt_replacements(
         tokenizer = self._get_tokenizer()
 
         token_matches = find_token_matches(token_ids, prompt_repls)
+        mm_match_counts = {
+            modality: len(matches)
+            for modality, matches in full_groupby_modality(token_matches)
+        }
 
         # If the search text does not represent a special token,
         # it may have different token IDs in the prompt, because
@@ -742,8 +910,8 @@ def _apply_prompt_replacements(
         # of the search text in the prompt, we instead perform string
         # replacement on the decoded token IDs, then encode them back.
         if all(
-            len(matches) >= mm_item_counts[modality]
-            for modality, matches in full_groupby_modality(token_matches)
+            mm_match_counts.get(modality, 0) >= item_count
+            for modality, item_count in mm_item_counts.items()
         ):  # yapf: disable
             token_ids = replace_token_matches(
                 token_ids,
@@ -775,7 +943,7 @@ def apply(
         self,
         prompt_text: str,
         mm_data: MultiModalDataDict,
-        mm_processor_kwargs: Mapping[str, object],
+        hf_processor_mm_kwargs: Mapping[str, object],
     ) -> MultiModalInputsV2:
         """
         Process multi-modal inputs to be used in vLLM.
@@ -792,20 +960,24 @@ def apply(
         """
         mm_items = self._get_mm_items(mm_data)
 
-        hf_inputs = self._apply_hf_processor(prompt_text, mm_items,
-                                             mm_processor_kwargs)
-        prompt_ids, = hf_inputs.pop("input_ids").tolist()
-        mm_kwargs = MultiModalKwargs(hf_inputs)
+        prompt_ids, mm_kwargs = self._cached_apply_hf_processor(
+            prompt_text,
+            mm_items,
+            hf_processor_mm_kwargs,
+        )
 
-        prompt_repls = self._get_prompt_replacements(mm_items, hf_inputs,
-                                                     mm_processor_kwargs)
-        all_prompt_repls = self._bind_prompt_replacements(prompt_repls)
+        unbound_prompt_repls = self._get_prompt_replacements(
+            mm_items,
+            hf_processor_mm_kwargs,
+            mm_kwargs,
+        )
+        prompt_repls = self._bind_prompt_replacements(unbound_prompt_repls)
 
         # If HF processor already inserts placeholder tokens,
         # there is no need for us to insert them
         mm_item_counts = mm_items.get_item_counts()
-        all_placeholders = self._find_placeholders(all_prompt_repls,
-                                                   prompt_ids, mm_item_counts)
+        all_placeholders = self._find_placeholders(prompt_repls, prompt_ids,
+                                                   mm_item_counts)
 
         if all_placeholders:
             tokenizer = self._get_tokenizer()
@@ -817,7 +989,7 @@ def apply(
                 all_placeholders,
             ) = self._apply_prompt_replacements(
                 prompt_ids,
-                all_prompt_repls,
+                prompt_repls,
                 mm_item_counts,
             )
 
@@ -855,23 +1027,29 @@ def get_dummy_data(
         from vllm.sequence import SequenceData
 
         processor_inputs = self._get_dummy_mm_inputs(mm_counts)
-        mm_inputs = self.apply(*processor_inputs)
+        mm_inputs = self.apply(
+            prompt_text=processor_inputs.prompt_text,
+            mm_data=processor_inputs.mm_data,
+            hf_processor_mm_kwargs=processor_inputs.hf_processor_mm_kwargs,
+        )
 
         prompt_token_ids = mm_inputs["prompt_token_ids"]
         placeholders_by_modality = mm_inputs["mm_placeholders"]
 
-        total_placeholders_by_modality = dict[str, int]()
-        for modality, placeholders in placeholders_by_modality.items():
-            num_placeholders = sum(item["length"] for item in placeholders)
-            max_tokens = mm_max_tokens[modality]
-
-            if num_placeholders != max_tokens:
-                logger.warning(
-                    "The processed dummy data has a total of %d placeholder "
-                    "tokens for the '%s' modality, which is not the expected "
-                    "%d tokens.", num_placeholders, modality, max_tokens)
-
-            total_placeholders_by_modality[modality] = num_placeholders
+        total_placeholders_by_modality = {
+            modality: sum(item["length"] for item in placeholders)
+            for modality, placeholders in placeholders_by_modality.items()
+        }
+        expected_placeholders_by_modality = {
+            modality: mm_max_tokens[modality]
+            for modality in placeholders_by_modality
+        }
+        if total_placeholders_by_modality != expected_placeholders_by_modality:
+            raise AssertionError(
+                f"The processed dummy data has a total of "
+                f"{total_placeholders_by_modality} placeholder tokens, which "
+                f"is not the expected {expected_placeholders_by_modality} "
+                "tokens.")
 
         total_len = len(prompt_token_ids)
         if total_len > seq_len:
diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index ded45a7184b5d..3a5e11867ad9e 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -1,10 +1,9 @@
 import functools
 from collections import UserDict
-from typing import (TYPE_CHECKING, Any, Callable, Dict, Mapping, Optional,
+from typing import (TYPE_CHECKING, Any, Dict, Mapping, Optional, Protocol,
                     Sequence, Type, TypeVar)
 
 import torch.nn as nn
-from typing_extensions import TypeAlias
 
 from vllm.inputs import InputProcessingContext
 from vllm.logger import init_logger
@@ -15,7 +14,7 @@
 from .base import MultiModalInputMapper, MultiModalPlugin, MultiModalTokensCalc
 from .image import ImagePlugin
 from .inputs import MultiModalDataDict, MultiModalKwargs, NestedTensors
-from .processing import BaseMultiModalProcessor
+from .processing import BaseMultiModalProcessor, ProcessingCache
 from .video import VideoPlugin
 
 if TYPE_CHECKING:
@@ -23,15 +22,22 @@
 
 logger = init_logger(__name__)
 
+# TODO: Tune the MM cache size
+MM_CACHE_SIZE = 256
+
 N = TypeVar("N", bound=Type[nn.Module])
 
-MultiModalProcessorFactory: TypeAlias = Callable[[InputProcessingContext],
-                                                 BaseMultiModalProcessor]
-"""
-Constructs a :class:`MultiModalProcessor` instance from the context.
 
-The processing metadata should be derived from the context.
-"""
+class MultiModalProcessorFactory(Protocol):
+    """Constructs a :class:`MultiModalProcessor` instance from the context."""
+
+    def __call__(
+        self,
+        ctx: InputProcessingContext,
+        *,
+        cache: Optional[ProcessingCache] = None,
+    ) -> BaseMultiModalProcessor:
+        ...
 
 
 class _MultiModalLimits(UserDict["ModelConfig", Dict[str, int]]):
@@ -71,6 +77,8 @@ def __init__(
 
         self._limits_by_model = _MultiModalLimits()
 
+        self._processing_cache = ProcessingCache(MM_CACHE_SIZE)
+
     def register_plugin(self, plugin: MultiModalPlugin) -> None:
         """
         Register a multi-modal plugin so it can be recognized by vLLM.
@@ -328,15 +336,18 @@ def wrapper(model_cls: N) -> N:
 
         return wrapper
 
-    def has_processor(self, model_config: "ModelConfig") -> bool:
-        """
-        Test whether a multi-modal processor is defined for a specific model.
-        """
+    def _get_model_cls(self, model_config: "ModelConfig"):
         # Avoid circular import
         from vllm.model_executor.model_loader import get_model_architecture
 
         model_cls, _ = get_model_architecture(model_config)
-        return model_cls in self._processor_factories
+        return model_cls
+
+    def has_processor(self, model_config: "ModelConfig") -> bool:
+        """
+        Test whether a multi-modal processor is defined for a specific model.
+        """
+        return self._get_model_cls(model_config) in self._processor_factories
 
     def create_processor(
         self,
@@ -346,12 +357,11 @@ def create_processor(
         """
         Create a multi-modal processor for a specific model and tokenizer.
         """
-
-        # Avoid circular import
-        from vllm.model_executor.model_loader import get_model_architecture
-
-        model_cls, _ = get_model_architecture(model_config)
+        model_cls = self._get_model_cls(model_config)
         processor_factory = self._processor_factories[model_cls]
 
         ctx = InputProcessingContext(model_config, tokenizer)
-        return processor_factory(ctx)
+        cache = (None if model_config.disable_mm_preprocessor_cache else
+                 self._processing_cache)
+
+        return processor_factory(ctx, cache=cache)
diff --git a/vllm/transformers_utils/processor.py b/vllm/transformers_utils/processor.py
index f1523667b0466..b12cc83a22970 100644
--- a/vllm/transformers_utils/processor.py
+++ b/vllm/transformers_utils/processor.py
@@ -1,25 +1,31 @@
 from functools import lru_cache
 from typing import Any, cast
 
+from transformers.processing_utils import ProcessorMixin
+
 
 def get_processor(
     processor_name: str,
     *args: Any,
     trust_remote_code: bool = False,
+    processor_cls: type[ProcessorMixin] = ProcessorMixin,
     **kwargs: Any,
 ):
     """Load a processor for the given model name via HuggingFace."""
     # don't put this import at the top level
     # it will call torch.cuda.device_count()
     from transformers import AutoProcessor
-    from transformers.processing_utils import ProcessorMixin
+
+    processor_factory = (AutoProcessor
+                         if processor_cls == ProcessorMixin else processor_cls)
 
     try:
-        processor = AutoProcessor.from_pretrained(
+        processor = processor_factory.from_pretrained(
             processor_name,
             *args,
             trust_remote_code=trust_remote_code,
-            **kwargs)
+            **kwargs,
+        )
     except ValueError as e:
         # If the error pertains to the processor class not existing or not
         # currently being imported, suggest using the --trust-remote-code flag.
diff --git a/vllm/utils.py b/vllm/utils.py
index 3d198887021dc..5eb4e8c4180c4 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -25,11 +25,11 @@
 import weakref
 from asyncio import FIRST_COMPLETED, AbstractEventLoop, Task
 from collections import OrderedDict, UserDict, defaultdict
-from collections.abc import Iterable, Mapping
+from collections.abc import Hashable, Iterable, Mapping
 from dataclasses import dataclass, field
 from functools import lru_cache, partial, wraps
 from typing import (TYPE_CHECKING, Any, AsyncGenerator, Awaitable, Callable,
-                    Dict, Generator, Generic, Hashable, List, Literal,
+                    Dict, Generator, Generic, List, Literal, NamedTuple,
                     Optional, Tuple, Type, TypeVar, Union, overload)
 from uuid import uuid4
 
@@ -194,13 +194,29 @@ def reset(self) -> None:
         self.counter = 0
 
 
+class CacheInfo(NamedTuple):
+    hits: int
+    total: int
+
+    @property
+    def hit_ratio(self) -> float:
+        if self.total == 0:
+            return 0
+
+        return self.hits / self.total
+
+
 class LRUCache(Generic[_K, _V]):
+    """Note: This class is not thread safe!"""
 
     def __init__(self, capacity: int) -> None:
         self.cache = OrderedDict[_K, _V]()
         self.pinned_items = set[_K]()
         self.capacity = capacity
 
+        self._hits = 0
+        self._total = 0
+
     def __contains__(self, key: _K) -> bool:
         return key in self.cache
 
@@ -218,6 +234,9 @@ def __setitem__(self, key: _K, value: _V) -> None:
     def __delitem__(self, key: _K) -> None:
         self.pop(key)
 
+    def stat(self) -> CacheInfo:
+        return CacheInfo(hits=self._hits, total=self._total)
+
     def touch(self, key: _K) -> None:
         self.cache.move_to_end(key)
 
@@ -226,8 +245,12 @@ def get(self, key: _K, default: Optional[_V] = None) -> Optional[_V]:
         if key in self.cache:
             value = self.cache[key]
             self.cache.move_to_end(key)
+
+            self._hits += 1
         else:
             value = default
+
+        self._total += 1
         return value
 
     def put(self, key: _K, value: _V) -> None:

From 55509c2114718c1292c11348f002461ba44cb23b Mon Sep 17 00:00:00 2001
From: ErezSC42 <erezs@ai21.com>
Date: Fri, 27 Dec 2024 19:58:21 +0200
Subject: [PATCH 007/462] [MODEL] LoRA support for Jamba model (#11209)

Signed-off-by: Erez Schwartz <erezs@ai21.com>
---
 tests/lora/conftest.py                        | 24 +++++++++
 tests/lora/test_jamba.py                      | 54 +++++++++++++++++++
 .../layers/mamba/mamba_mixer.py               | 22 ++++++--
 vllm/model_executor/models/jamba.py           | 50 ++++++++---------
 vllm/model_executor/models/mamba.py           | 14 +++--
 5 files changed, 132 insertions(+), 32 deletions(-)
 create mode 100644 tests/lora/test_jamba.py

diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py
index 8b247fb9b2388..57ebaa424fc59 100644
--- a/tests/lora/conftest.py
+++ b/tests/lora/conftest.py
@@ -4,6 +4,7 @@
 from unittest.mock import MagicMock, patch
 
 import pytest
+import safetensors
 import torch
 import torch.nn as nn
 from huggingface_hub import snapshot_download
@@ -169,6 +170,29 @@ def mixtral_lora_files_all_target_modules():
     return snapshot_download(repo_id="dyang415/mixtral-lora-v0")
 
 
+@pytest.fixture(scope="session")
+def jamba_lora_files():
+    #   some of the adapters have unnecessary weights for serving,
+    #   hence we remove them
+    def remove_unnecessary_weights(path):
+        lora_path = f"{adapter_path}/adapter_model.safetensors"
+        tensors = safetensors.torch.load_file(lora_path)
+        nonlora_keys = []
+        for k in list(tensors.keys()):
+            if "lora" not in k:
+                nonlora_keys.append(k)
+        for k in nonlora_keys:
+            del tensors[k]
+        safetensors.torch.save_file(tensors, lora_path)
+
+    adapter_path = snapshot_download(
+        repo_id=
+        "hf-100/Jamba-1.5-mini-Spellbound-StoryWriter-0.1-6583896-ckpt53-lora")
+
+    remove_unnecessary_weights(adapter_path)
+    return adapter_path
+
+
 @pytest.fixture(scope="session")
 def gemma_lora_files():
     return snapshot_download(repo_id="wskwon/gemma-7b-test-lora")
diff --git a/tests/lora/test_jamba.py b/tests/lora/test_jamba.py
new file mode 100644
index 0000000000000..6aa33926cb6b8
--- /dev/null
+++ b/tests/lora/test_jamba.py
@@ -0,0 +1,54 @@
+from typing import List
+
+import pytest
+import torch
+
+import vllm
+from vllm.lora.request import LoRARequest
+
+MODEL_PATH = "ai21labs/AI21-Jamba-1.5-Mini"
+
+MAX_TOKENS = 40
+
+
+def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int,
+              prompts: List[str]) -> List[str]:
+
+    sampling_params = vllm.SamplingParams(temperature=0, max_tokens=MAX_TOKENS)
+    outputs = llm.generate(
+        prompts,
+        sampling_params,
+        lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
+        if lora_id else None)
+    # Print the outputs.
+    generated_texts: List[str] = []
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text.strip()
+        generated_texts.append(generated_text)
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+    return generated_texts
+
+
+@pytest.mark.parametrize("tp_size", [4])
+def test_jamba_lora(jamba_lora_files, tp_size):
+    """Original test, the LoRA model has the common target modules, not all"""
+    if torch.cuda.device_count() < tp_size:
+        pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
+
+    prompts = ["Write a story about a sheep and a goat."]
+
+    llm = vllm.LLM(
+        MODEL_PATH,
+        enable_lora=True,
+        max_num_seqs=16,
+        max_loras=4,
+        distributed_executor_backend="ray",
+        tensor_parallel_size=tp_size,
+    )
+
+    expected_jamba_output = [
+        """Once upon a time, in a lush green meadow, there lived a sheep named Clara and a goat named Billy. Clara was a gentle creature, always nibbling on the soft grass and humming"""  # noqa: E501
+    ]
+    assert do_sample(llm, jamba_lora_files, lora_id=1,
+                     prompts=prompts) == expected_jamba_output
diff --git a/vllm/model_executor/layers/mamba/mamba_mixer.py b/vllm/model_executor/layers/mamba/mamba_mixer.py
index 10bec75f49fdf..606c796d503cf 100644
--- a/vllm/model_executor/layers/mamba/mamba_mixer.py
+++ b/vllm/model_executor/layers/mamba/mamba_mixer.py
@@ -42,12 +42,14 @@ def __init__(self,
                  use_rms_norm: bool,
                  rms_norm_has_weight: bool = True,
                  rms_norm_eps: float = 1e-5,
-                 activation="silu"):
+                 activation="silu",
+                 is_lora_enabled: bool = False):
         super().__init__()
         self.time_step_rank = time_step_rank
         self.ssm_state_size = ssm_state_size
         self.use_rms_norm = use_rms_norm
         self.activation = activation
+        self.is_lora_enabled = is_lora_enabled
 
         self.conv1d = ColumnParallelLinear(
             input_size=conv_kernel_size,
@@ -63,6 +65,7 @@ def __init__(self,
         self.in_proj = MergedColumnParallelLinear(hidden_size,
                                                   [intermediate_size] * 2,
                                                   bias=use_bias)
+
         # selective projection used to make dt, B and C input dependent
         self.x_proj = RowParallelLinear(
             intermediate_size,
@@ -170,7 +173,13 @@ def forward_cuda(self, hidden_states: torch.Tensor,
 
         # 3. State Space Model sequence transformation
         # 3.a. input varying initialization of time_step, B and C
-        ssm_parameters = self.x_proj(hidden_states.transpose(-2, -1))[0]
+
+        if self.is_lora_enabled:
+            #   lora kernel requires contiguous tensor
+            ssm_parameters = self.x_proj(
+                hidden_states.transpose(-2, -1).contiguous())[0]
+        else:
+            ssm_parameters = self.x_proj(hidden_states.transpose(-2, -1))[0]
 
         time_step, B, C = torch.split(
             ssm_parameters,
@@ -222,6 +231,11 @@ def forward_cuda(self, hidden_states: torch.Tensor,
             scan_outputs = scan_outputs.transpose(0, 1)
 
         # 4. Final linear projection
-        contextualized_states = self.out_proj(scan_outputs.transpose(-2,
-                                                                     -1))[0]
+        if self.is_lora_enabled:
+            #  lora kernel requires contiguous tensor
+            contextualized_states = self.out_proj(
+                scan_outputs.transpose(-2, -1).contiguous())[0]
+        else:
+            contextualized_states = self.out_proj(
+                scan_outputs.transpose(-2, -1))[0]
         return contextualized_states
diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py
index 91786db5ddc96..890b5530b97d6 100644
--- a/vllm/model_executor/models/jamba.py
+++ b/vllm/model_executor/models/jamba.py
@@ -107,9 +107,11 @@ def __init__(self,
                  layer_idx: int,
                  cache_config: Optional[CacheConfig] = None,
                  quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = "") -> None:
+                 is_lora_enabled: Optional[bool] = False,
+                 **kwargs) -> None:
         super().__init__()
         self.config = config
+        self.is_lora_enabled = is_lora_enabled
         self.mamba = MambaMixer(hidden_size= config.hidden_size,
                                 ssm_state_size = config.mamba_d_state,
                                 conv_kernel_size = config.mamba_d_conv,
@@ -120,7 +122,9 @@ def __init__(self,
                                 use_bias = config.mamba_proj_bias,
                                 use_rms_norm=True,
                                 rms_norm_eps=config.rms_norm_eps,
-                                activation=config.hidden_act)
+                                activation=config.hidden_act,
+                                is_lora_enabled = self.is_lora_enabled
+                                )
 
         num_experts = config.layers_num_experts[layer_idx]
         ffn_layer_class = JambaMoE if num_experts > 1 else JambaMLP
@@ -156,14 +160,13 @@ def forward(
 
 class JambaAttentionDecoderLayer(nn.Module):
 
-    def __init__(
-        self,
-        config: JambaConfig,
-        layer_idx: int,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        prefix: str = "",
-    ) -> None:
+    def __init__(self,
+                 config: JambaConfig,
+                 layer_idx: int,
+                 cache_config: Optional[CacheConfig] = None,
+                 quant_config: Optional[QuantizationConfig] = None,
+                 prefix: str = "",
+                 **kwargs) -> None:
         super().__init__()
         self.hidden_size = config.hidden_size
         tp_size = get_tensor_model_parallel_world_size()
@@ -287,17 +290,18 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
             org_num_embeddings=config.vocab_size,
         )
 
+        extra_kwargs = {"is_lora_enabled": bool(vllm_config.lora_config)}
+
         def get_layer(prefix: str):
             layer_idx = int(prefix.rsplit(".", 1)[1])
             layer_class = ALL_DECODER_LAYER_TYPES[
                 config.layers_block_type[layer_idx]]
-            return layer_class(
-                config,
-                layer_idx,
-                cache_config,
-                quant_config=quant_config,
-                prefix=prefix,
-            )
+            return layer_class(config,
+                               layer_idx,
+                               cache_config,
+                               quant_config=quant_config,
+                               prefix=prefix,
+                               **extra_kwargs)
 
         self.start_layer, self.end_layer, self.layers = make_layers(
             config.num_hidden_layers, get_layer, prefix=f"{prefix}.layers")
@@ -371,14 +375,13 @@ class JambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
             "k_proj",
             "v_proj",
         ],
+        "in_proj": ["in_proj"],
     }
 
     # LoRA specific attributes
     supported_lora_modules = [
-        "qkv_proj",
-        "o_proj",
-        "embed_tokens",
-        "lm_head",
+        "qkv_proj", "o_proj", "embed_tokens", "lm_head", "up_proj",
+        "down_proj", "gate_proj", "out_proj", "in_proj", "x_proj"
     ]
     embedding_modules = {
         "embed_tokens": "input_embeddings",
@@ -423,9 +426,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self.make_empty_intermediate_tensors = (
             self.model.make_empty_intermediate_tensors)
         if self.scheduler_config is not None and \
-            not self.model_config.enforce_eager:
+                not self.model_config.enforce_eager:
             if self.scheduler_config.max_num_seqs > \
-                vllm_config.compilation_config.max_capture_size:
+                    vllm_config.compilation_config.max_capture_size:
                 self.max_batch_size = \
                     vllm_config.compilation_config.max_capture_size
             else:
@@ -446,7 +449,6 @@ def forward(self,
                 inputs_embeds: Optional[torch.Tensor] = None,
                 **kwargs):
         if self.mamba_cache is None:
-
             num_mamba_layers = self.model_config.get_num_layers_by_block_type(
                 self.vllm_config.parallel_config, LayerBlockType.mamba)
             self.mamba_cache = MambaCacheManager(
diff --git a/vllm/model_executor/models/mamba.py b/vllm/model_executor/models/mamba.py
index 06c8d9723cd01..553bc9c28cb21 100644
--- a/vllm/model_executor/models/mamba.py
+++ b/vllm/model_executor/models/mamba.py
@@ -38,10 +38,12 @@ class MambaDecoderLayer(nn.Module):
     def __init__(self,
                  config: MambaConfig,
                  cache_config: Optional[CacheConfig] = None,
-                 quant_config: Optional[QuantizationConfig] = None) -> None:
+                 quant_config: Optional[QuantizationConfig] = None,
+                 is_lora_enabled: Optional[bool] = False) -> None:
         super().__init__()
         self.config = config
         self.is_falcon_mamba = config.model_type == "falcon_mamba"
+        self.is_lora_enabled = is_lora_enabled
         mixer_rms_eps = config.mixer_rms_eps if self.is_falcon_mamba else None
         self.mixer = MambaMixer(hidden_size=config.hidden_size,
                                 ssm_state_size=config.state_size,
@@ -53,7 +55,8 @@ def __init__(self,
                                 use_rms_norm=self.is_falcon_mamba,
                                 rms_norm_has_weight=not self.is_falcon_mamba,
                                 rms_norm_eps=mixer_rms_eps,
-                                activation=config.hidden_act)
+                                activation=config.hidden_act,
+                                is_lora_enabled=self.is_lora_enabled)
 
         self.norm = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon)
 
@@ -85,6 +88,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
         lora_config = vllm_config.lora_config
+        is_lora_enabled = bool(lora_config)
 
         self.config = config
         self.padding_idx = config.pad_token_id
@@ -101,8 +105,10 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
 
         self.start_layer, self.end_layer, self.layers = make_layers(
             config.num_hidden_layers,
-            lambda prefix: MambaDecoderLayer(
-                config, cache_config=cache_config, quant_config=quant_config),
+            lambda prefix: MambaDecoderLayer(config,
+                                             cache_config=cache_config,
+                                             quant_config=quant_config,
+                                             is_lora_enabled=is_lora_enabled),
             prefix=f"{prefix}.layers")
 
         self.norm_f = RMSNorm(config.hidden_size,

From 0240402c4632604c9cd02f7eae4ae36fa990b38f Mon Sep 17 00:00:00 2001
From: Jee Jee Li <pandaleefree@gmail.com>
Date: Sat, 28 Dec 2024 02:48:24 +0800
Subject: [PATCH 008/462] [Misc]Add BNB quantization for MolmoForCausalLM 
 (#11551)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
---
 vllm/model_executor/model_loader/loader.py | 26 +++++--
 vllm/model_executor/models/molmo.py        | 90 ++++++++++++++++------
 2 files changed, 83 insertions(+), 33 deletions(-)

diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index f2d9293b31a83..4bca13cb2f60c 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -11,7 +11,8 @@
 import warnings
 from abc import ABC, abstractmethod
 from contextlib import contextmanager
-from typing import Any, Dict, Generator, Iterable, List, Optional, Tuple, cast
+from typing import (Any, Callable, Dict, Generator, Iterable, List, Optional,
+                    Tuple, cast)
 
 import gguf
 import huggingface_hub
@@ -706,6 +707,8 @@ def __init__(self, load_config: LoadConfig):
         # Store all module names (from transformers) that support
         # BNB quantization.
         self.target_modules: List[str] = []
+        # mapping weight names from transformers to vllm.
+        self.weight_mapper: Callable = lambda name: name
 
     def _get_weight_files(
         self,
@@ -763,9 +766,12 @@ def _prepare_weights(self, model_name_or_path: str,
 
     def _hf_weight_iter(self, hf_weights_files, use_safetensors: bool):
         if use_safetensors:
-            return safetensors_weights_iterator(hf_weights_files)
+            iterator = safetensors_weights_iterator(hf_weights_files)
         else:
-            return pt_weights_iterator(hf_weights_files)
+            iterator = pt_weights_iterator(hf_weights_files)
+        for name, param in iterator:
+            # mapping weight names from transformers to vllm.
+            yield self.weight_mapper(name), param
 
     def _get_quantized_weights_iterator(
         self,
@@ -782,12 +788,12 @@ def _get_quantized_weights_iterator(
         try:
             import bitsandbytes
 
-            if bitsandbytes.__version__ < "0.44.0":
+            if bitsandbytes.__version__ < "0.45.0":
                 raise ImportError("bitsandbytes version is wrong. Please "
-                                  "install bitsandbytes>=0.44.0.")
+                                  "install bitsandbytes>=0.45.0.")
         except ImportError as err:
-            raise ImportError("Please install bitsandbytes>=0.44.0 via "
-                              "`pip install bitsandbytes>=0.44.0` to use "
+            raise ImportError("Please install bitsandbytes>=0.45.0 via "
+                              "`pip install bitsandbytes>=0.45.0` to use "
                               "bitsandbytes quantizer.") from err
 
         hf_weights_files, use_safetensors = self._prepare_weights(
@@ -991,7 +997,7 @@ def _get_bnb_target_modules(self, model: nn.Module) -> None:
             if isinstance(module, (LinearBase, )):
                 last_name = name.split(".")[-1]
                 if sub_modules := inverse_stacked_mapping.get(last_name, []):
-                    # Map vllm's names to transformers' names.
+                    # Map vllm's names to transformers's names.
                     for sub_name in sub_modules:
                         self.target_modules.append(
                             name.replace(last_name, sub_name))
@@ -1013,6 +1019,10 @@ def _load_weights(self, model_config: ModelConfig,
                 f"Model {type(model).__name__} does not support BitsAndBytes "
                 "quantization yet.")
 
+        # For some models like Molmo, we need to use hf_to_vllm_mapper
+        # to ensure correct loading of weights.
+        if hf_to_vllm_mapper := getattr(model, "hf_to_vllm_mapper", None):
+            self.weight_mapper = lambda name: hf_to_vllm_mapper._map_name(name)
         # Modules whose weights might have fused on disk
         # we need their output_sizes to make shard in flight correctly with TP
         self.maybe_fused_weights_modules: Dict[str, List[int]] = {}
diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py
index 8938f62d0c494..5d52d2c3e6b48 100644
--- a/vllm/model_executor/models/molmo.py
+++ b/vllm/model_executor/models/molmo.py
@@ -461,30 +461,71 @@ def forward(
         return output
 
 
-class MolmoMLP(nn.Module):
+class SwiGLU(nn.Module):
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x, gate = x.chunk(2, dim=-1)
+        # Note that the order is reversed compared to
+        # SiluAndMul.
+        return x * F.silu(gate)
+
+
+class LanuageModelMLP(nn.Module):
     """Molmo's LLM mlp."""
 
     def __init__(self,
                  config: PretrainedConfig,
                  input_dim: Optional[int] = None,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 proj_name: str = "gate_up_proj") -> None:
+                 quant_config: Optional[QuantizationConfig] = None) -> None:
         super().__init__()
         self.hidden_size = config.hidden_size
         self.intermediate_size = config.intermediate_size // 2
 
-        # Molmo's LLM proj weights are already merged into the disk, while
-        # image_projector proj is separate. If the same proj_name were used, it
-        # would create ambiguity and make it difficult to support BNB and LoRA.
-        self.proj_name = proj_name
-        setattr(
-            self, proj_name,
-            MergedColumnParallelLinear(
-                input_dim or self.hidden_size,
-                [self.intermediate_size] * 2,
-                bias=False,
-                quant_config=quant_config,
-            ))
+        self.gate_up_proj = MergedColumnParallelLinear(
+            input_dim or self.hidden_size,
+            [self.intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+        )
+        # Activation function.
+        self.act_fn = SwiGLU()
+        # Feed-forward output projection.
+        self.down_proj = RowParallelLinear(
+            self.intermediate_size,
+            self.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+        )
+
+    def forward(
+        self,
+        x: torch.Tensor,
+    ) -> torch.Tensor:
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class ImageProjectorMLP(nn.Module):
+    """Molmo's image_projector mlp."""
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        input_dim: Optional[int] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size // 2
+
+        self.merged_linear = MergedColumnParallelLinear(
+            input_dim or self.hidden_size,
+            [self.intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+        )
         # Activation function.
         self.act_fn = SiluAndMul()
 
@@ -500,7 +541,7 @@ def forward(
         self,
         x: torch.Tensor,
     ) -> torch.Tensor:
-        gate_up, _ = getattr(self, self.proj_name)(x)
+        gate_up, _ = self.merged_linear(x)
         x = self.act_fn(gate_up)
         x, _ = self.down_proj(x)
         return x
@@ -523,9 +564,7 @@ def __init__(
                                         prefix=f"{prefix}.self_attn")
 
         # MLP block.
-        self.mlp = MolmoMLP(config,
-                            quant_config=quant_config,
-                            proj_name="gate_up_proj")
+        self.mlp = LanuageModelMLP(config, quant_config=quant_config)
 
         # LayerNorm
         assert config.layer_norm_type == "rms"
@@ -617,11 +656,10 @@ def __init__(
             vision_config,
             nlayers=len(self.vit_layers),
             quant_config=quant_config)
-        self.image_projector = MolmoMLP(
+        self.image_projector = ImageProjectorMLP(
             config,
             input_dim=vision_config.image_emb_dim,
             quant_config=quant_config,
-            proj_name="merged_linear",
         )
 
         image_dim = vision_config.image_emb_dim * len(self.vit_layers)
@@ -842,10 +880,6 @@ def load_weights(self, weights: Iterable[Tuple[str,
         loaded_params: Set[str] = set()
 
         for name, loaded_weight in weights:
-            if "gate_up_proj" in name:
-                up_proj, gate_proj = loaded_weight.chunk(2, dim=0)
-                loaded_weight = torch.cat([gate_proj, up_proj], dim=0)
-
             if name.endswith(".bias") and name not in params_dict:
                 continue
             if is_pp_missing_parameter(name, self):
@@ -1157,6 +1191,12 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
         },
     )
 
+    # BitandBytes specific attributes
+    bitsandbytes_stacked_params_mapping = {
+        "gate_proj": ("merged_linear", 0),
+        "up_proj": ("merged_linear", 1),
+    }
+
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config

From dde1fa18c9f9ba992a8300a300543d6c18d5f08d Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Sat, 28 Dec 2024 03:45:13 +0800
Subject: [PATCH 009/462] [Misc] Improve BNB loader to handle mixture of
 sharded and merged weights with same suffix (#11566)

Signed-off-by: Isotr0py <2037008807@qq.com>
---
 vllm/model_executor/model_loader/loader.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index 4bca13cb2f60c..a9c1fa7221217 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -1001,8 +1001,11 @@ def _get_bnb_target_modules(self, model: nn.Module) -> None:
                     for sub_name in sub_modules:
                         self.target_modules.append(
                             name.replace(last_name, sub_name))
-                else:
-                    self.target_modules.append(name)
+                # Add original module name even if the module has stacked map,
+                # in case model has a mixture of disk-merged and disk-splitted
+                # weights with same last name.
+                self.target_modules.append(name)
+
         assert (self.target_modules
                 ), "vllm currently does not support BNB quantization for"
         f" {type(model).__name__}"

From ac797994039ba9e6ed0c2b3a503099cb122a936e Mon Sep 17 00:00:00 2001
From: Selali <selali.adobor@gmail.com>
Date: Fri, 27 Dec 2024 12:12:11 -0800
Subject: [PATCH 010/462] [Bugfix] Fix for ROCM compressed tensor support
 (#11561)

---
 .../schemes/compressed_tensors_w8a8_fp8.py             | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
index 73cc8ce0d2a4b..1d4e4bd52adaa 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
@@ -41,10 +41,12 @@ def process_weights_after_loading(self, layer) -> None:
             )
 
             if current_platform.is_rocm():
+                input_scale = getattr(layer, 'input_scale', None)
+
                 weight, max_w_scale, input_scale = normalize_e4m3fn_to_e4m3fnuz(
                     weight=weight,
                     weight_scale=max_w_scale,
-                    input_scale=layer.input_scale)
+                    input_scale=input_scale)
                 if input_scale is not None:
                     layer.input_scale = Parameter(input_scale,
                                                   requires_grad=False)
@@ -57,11 +59,13 @@ def process_weights_after_loading(self, layer) -> None:
             weight = layer.weight
 
             if current_platform.is_rocm():
+                input_scale = getattr(layer, 'input_scale', None)
+
                 weight, weight_scale, input_scale = \
                     normalize_e4m3fn_to_e4m3fnuz(
                         weight=weight,
                         weight_scale=layer.weight_scale,
-                        input_scale=layer.input_scale)
+                        input_scale=input_scale)
                 if input_scale is not None:
                     layer.input_scale = Parameter(input_scale,
                                                   requires_grad=False)
@@ -76,7 +80,7 @@ def process_weights_after_loading(self, layer) -> None:
             raise ValueError(f"Unknown quantization strategy {self.strategy}")
 
         # INPUT SCALE
-        if self.is_static_input_scheme:
+        if self.is_static_input_scheme and hasattr(layer, 'input_scale'):
             layer.input_scale = Parameter(layer.input_scale.max(),
                                           requires_grad=False)
         else:

From a60731247fba82fae5e71af7a19ea0df96de1caa Mon Sep 17 00:00:00 2001
From: Chen Zhang <zhangch99@outlook.com>
Date: Sat, 28 Dec 2024 08:31:10 +0800
Subject: [PATCH 011/462] [Doc] Update mllama example based on official doc
 (#11567)

Signed-off-by: Chen Zhang <zhangch99@outlook.com>
---
 examples/offline_inference_vision_language.py | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py
index d5a71862656e7..77af914a6ef02 100644
--- a/examples/offline_inference_vision_language.py
+++ b/examples/offline_inference_vision_language.py
@@ -308,7 +308,20 @@ def run_mllama(question: str, modality: str):
         disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
     )
 
-    prompt = f"<|image|><|begin_of_text|>{question}"
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    messages = [{
+        "role":
+        "user",
+        "content": [{
+            "type": "image"
+        }, {
+            "type": "text",
+            "text": f"{question}"
+        }]
+    }]
+    prompt = tokenizer.apply_chat_template(messages,
+                                           add_generation_prompt=True,
+                                           tokenize=False)
     stop_token_ids = None
     return llm, prompt, stop_token_ids
 

From df04dffade84c87cafd74de4c39e6fd7cb95c24f Mon Sep 17 00:00:00 2001
From: Robert Shaw
 <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
Date: Fri, 27 Dec 2024 20:45:08 -0500
Subject: [PATCH 012/462] [V1] [4/N] API Server: ZMQ/MP Utilities (#11541)

---
 docs/requirements-docs.txt                 |   1 +
 tests/v1/engine/test_engine_core.py        |  13 +--
 tests/v1/engine/test_engine_core_client.py |  10 +-
 vllm/entrypoints/openai/api_server.py      |  11 +-
 vllm/executor/multiproc_worker_utils.py    |  22 +---
 vllm/utils.py                              |  90 ++++++++++++++++-
 vllm/v1/engine/async_llm.py                |   6 +-
 vllm/v1/engine/core.py                     | 111 ++++-----------------
 vllm/v1/engine/core_client.py              |  92 ++++++++---------
 vllm/v1/engine/llm_engine.py               |   6 +-
 vllm/v1/executor/multiproc_executor.py     |  11 +-
 vllm/v1/utils.py                           |  89 +++++++++++------
 12 files changed, 247 insertions(+), 215 deletions(-)

diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt
index 4859c8ac08bea..25a700033cc9e 100644
--- a/docs/requirements-docs.txt
+++ b/docs/requirements-docs.txt
@@ -19,3 +19,4 @@ openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entr
 fastapi # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
 partial-json-parser # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
 requests
+zmq
diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py
index c529cd21f384b..954cec734b956 100644
--- a/tests/v1/engine/test_engine_core.py
+++ b/tests/v1/engine/test_engine_core.py
@@ -7,7 +7,6 @@
 from vllm import SamplingParams
 from vllm.engine.arg_utils import EngineArgs
 from vllm.platforms import current_platform
-from vllm.usage.usage_lib import UsageContext
 from vllm.v1.engine import EngineCoreRequest
 from vllm.v1.engine.async_llm import AsyncLLM
 from vllm.v1.engine.core import EngineCore
@@ -43,13 +42,11 @@ def test_engine_core(monkeypatch):
         m.setenv("VLLM_USE_V1", "1")
         """Setup the EngineCore."""
         engine_args = EngineArgs(model=MODEL_NAME)
-        vllm_config = engine_args.create_engine_config(
-            usage_context=UsageContext.UNKNOWN_CONTEXT)
+        vllm_config = engine_args.create_engine_config()
         executor_class = AsyncLLM._get_executor_cls(vllm_config)
 
         engine_core = EngineCore(vllm_config=vllm_config,
-                                 executor_class=executor_class,
-                                 usage_context=UsageContext.UNKNOWN_CONTEXT)
+                                 executor_class=executor_class)
         """Test basic request lifecycle."""
 
         # First request.
@@ -151,13 +148,11 @@ def test_engine_core_advanced_sampling(monkeypatch):
         m.setenv("VLLM_USE_V1", "1")
         """Setup the EngineCore."""
         engine_args = EngineArgs(model=MODEL_NAME)
-        vllm_config = engine_args.create_engine_config(
-            usage_context=UsageContext.UNKNOWN_CONTEXT)
+        vllm_config = engine_args.create_engine_config()
         executor_class = AsyncLLM._get_executor_cls(vllm_config)
 
         engine_core = EngineCore(vllm_config=vllm_config,
-                                 executor_class=executor_class,
-                                 usage_context=UsageContext.UNKNOWN_CONTEXT)
+                                 executor_class=executor_class)
         """Test basic request lifecycle."""
         # First request.
         request: EngineCoreRequest = make_request()
diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py
index 2f1cbec607a91..729975e4ea8c4 100644
--- a/tests/v1/engine/test_engine_core_client.py
+++ b/tests/v1/engine/test_engine_core_client.py
@@ -86,11 +86,10 @@ def test_engine_core_client(monkeypatch, multiprocessing_mode: bool):
             UsageContext.UNKNOWN_CONTEXT)
         executor_class = AsyncLLM._get_executor_cls(vllm_config)
         client = EngineCoreClient.make_client(
-            vllm_config,
-            executor_class,
-            UsageContext.UNKNOWN_CONTEXT,
             multiprocess_mode=multiprocessing_mode,
             asyncio_mode=False,
+            vllm_config=vllm_config,
+            executor_class=executor_class,
         )
 
         MAX_TOKENS = 20
@@ -158,11 +157,10 @@ async def test_engine_core_client_asyncio(monkeypatch):
             usage_context=UsageContext.UNKNOWN_CONTEXT)
         executor_class = AsyncLLM._get_executor_cls(vllm_config)
         client = EngineCoreClient.make_client(
-            vllm_config,
-            executor_class,
-            UsageContext.UNKNOWN_CONTEXT,
             multiprocess_mode=True,
             asyncio_mode=True,
+            vllm_config=vllm_config,
+            executor_class=executor_class,
         )
 
         MAX_TOKENS = 20
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 2e45b474237f9..094cc15a317e9 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -68,7 +68,7 @@
 from vllm.logger import init_logger
 from vllm.usage.usage_lib import UsageContext
 from vllm.utils import (FlexibleArgumentParser, get_open_zmq_ipc_path,
-                        is_valid_ipv6_address, set_ulimit)
+                        is_valid_ipv6_address, kill_process_tree, set_ulimit)
 from vllm.version import __version__ as VLLM_VERSION
 
 TIMEOUT_KEEP_ALIVE = 5  # seconds
@@ -737,6 +737,15 @@ def signal_handler(*_) -> None:
 
     signal.signal(signal.SIGTERM, signal_handler)
 
+    # The child processes will send SIGQUIT to this process when
+    # any error happens. This process then clean up the whole tree.
+    # TODO(rob): move this into AsyncLLM.__init__ once we remove
+    # the context manager below.
+    def sigquit_handler(signum, frame):
+        kill_process_tree(os.getpid())
+
+    signal.signal(signal.SIGQUIT, sigquit_handler)
+
     async with build_async_engine_client(args) as engine_client:
         app = build_app(args)
 
diff --git a/vllm/executor/multiproc_worker_utils.py b/vllm/executor/multiproc_worker_utils.py
index c4d90f0856f86..bc32826529eef 100644
--- a/vllm/executor/multiproc_worker_utils.py
+++ b/vllm/executor/multiproc_worker_utils.py
@@ -1,5 +1,4 @@
 import asyncio
-import multiprocessing
 import os
 import sys
 import threading
@@ -13,10 +12,9 @@
 
 import torch
 
-import vllm.envs as envs
 from vllm.logger import init_logger
 from vllm.triton_utils.importing import HAS_TRITON
-from vllm.utils import cuda_is_initialized
+from vllm.utils import _check_multiproc_method, get_mp_context
 
 if HAS_TRITON:
     from vllm.triton_utils import maybe_set_triton_cache_manager
@@ -274,24 +272,6 @@ def write_with_prefix(s: str):
     file.write = write_with_prefix  # type: ignore[method-assign]
 
 
-def _check_multiproc_method():
-    if (cuda_is_initialized()
-            and os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") != "spawn"):
-        logger.warning("CUDA was previously initialized. We must use "
-                       "the `spawn` multiprocessing start method. Setting "
-                       "VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. "
-                       "See https://docs.vllm.ai/en/latest/getting_started/"
-                       "debugging.html#python-multiprocessing "
-                       "for more information.")
-        os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
-
-
-def get_mp_context():
-    _check_multiproc_method()
-    mp_method = envs.VLLM_WORKER_MULTIPROC_METHOD
-    return multiprocessing.get_context(mp_method)
-
-
 def set_multiprocessing_worker_envs(parallel_config):
     """ Set up environment variables that should be used when there are workers
     in a multiprocessing environment. This should be called by the parent 
diff --git a/vllm/utils.py b/vllm/utils.py
index 5eb4e8c4180c4..2b46c1fef0d09 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -10,6 +10,7 @@
 import importlib.util
 import inspect
 import ipaddress
+import multiprocessing
 import os
 import re
 import resource
@@ -20,6 +21,7 @@
 import tempfile
 import threading
 import time
+import traceback
 import uuid
 import warnings
 import weakref
@@ -29,8 +31,9 @@
 from dataclasses import dataclass, field
 from functools import lru_cache, partial, wraps
 from typing import (TYPE_CHECKING, Any, AsyncGenerator, Awaitable, Callable,
-                    Dict, Generator, Generic, List, Literal, NamedTuple,
-                    Optional, Tuple, Type, TypeVar, Union, overload)
+                    Dict, Generator, Generic, Iterator, List, Literal,
+                    NamedTuple, Optional, Tuple, Type, TypeVar, Union,
+                    overload)
 from uuid import uuid4
 
 import numpy as np
@@ -39,6 +42,8 @@
 import torch
 import torch.types
 import yaml
+import zmq
+import zmq.asyncio
 from packaging.version import Version
 from torch.library import Library
 from typing_extensions import ParamSpec, TypeIs, assert_never
@@ -1844,7 +1849,7 @@ def memory_profiling(
     result.non_kv_cache_memory_in_bytes = result.non_torch_increase_in_bytes + result.torch_peak_increase_in_bytes + result.weights_memory_in_bytes  # noqa
 
 
-# Adapted from: https://github.com/sgl-project/sglang/blob/f46f394f4d4dbe4aae85403dec006199b34d2840/python/sglang/srt/utils.py#L630 # noqa: E501Curre
+# Adapted from: https://github.com/sgl-project/sglang/blob/v0.4.1/python/sglang/srt/utils.py#L630 # noqa: E501
 def set_ulimit(target_soft_limit=65535):
     resource_type = resource.RLIMIT_NOFILE
     current_soft, current_hard = resource.getrlimit(resource_type)
@@ -1859,3 +1864,82 @@ def set_ulimit(target_soft_limit=65535):
                 "with error %s. This can cause fd limit errors like"
                 "`OSError: [Errno 24] Too many open files`. Consider "
                 "increasing with ulimit -n", current_soft, e)
+
+
+# Adapted from: https://github.com/sgl-project/sglang/blob/v0.4.1/python/sglang/utils.py#L28 # noqa: E501
+def get_exception_traceback():
+    etype, value, tb = sys.exc_info()
+    err_str = "".join(traceback.format_exception(etype, value, tb))
+    return err_str
+
+
+# Adapted from: https://github.com/sgl-project/sglang/blob/v0.4.1/python/sglang/srt/utils.py#L783 # noqa: E501
+def make_zmq_socket(
+    ctx: Union[zmq.asyncio.Context, zmq.Context],  # type: ignore[name-defined]
+    path: str,
+    type: Any,
+) -> Union[zmq.Socket, zmq.asyncio.Socket]:  # type: ignore[name-defined]
+    """Make a ZMQ socket with the proper bind/connect semantics."""
+
+    mem = psutil.virtual_memory()
+    socket = ctx.socket(type)
+
+    # Calculate buffer size based on system memory
+    total_mem = mem.total / 1024**3
+    available_mem = mem.available / 1024**3
+    # For systems with substantial memory (>32GB total, >16GB available):
+    # - Set a large 0.5GB buffer to improve throughput
+    # For systems with less memory:
+    # - Use system default (-1) to avoid excessive memory consumption
+    if total_mem > 32 and available_mem > 16:
+        buf_size = int(0.5 * 1024**3)  # 0.5GB in bytes
+    else:
+        buf_size = -1  # Use system default buffer size
+
+    if type == zmq.constants.PULL:
+        socket.setsockopt(zmq.constants.RCVHWM, 0)
+        socket.setsockopt(zmq.constants.RCVBUF, buf_size)
+        socket.connect(path)
+    elif type == zmq.constants.PUSH:
+        socket.setsockopt(zmq.constants.SNDHWM, 0)
+        socket.setsockopt(zmq.constants.SNDBUF, buf_size)
+        socket.bind(path)
+    else:
+        raise ValueError(f"Unknown Socket Type: {type}")
+
+    return socket
+
+
+@contextlib.contextmanager
+def zmq_socket_ctx(
+        path: str,
+        type: Any) -> Iterator[zmq.Socket]:  # type: ignore[name-defined]
+    """Context manager for a ZMQ socket"""
+
+    ctx = zmq.Context(io_threads=2)  # type: ignore[attr-defined]
+    try:
+        yield make_zmq_socket(ctx, path, type)
+
+    except KeyboardInterrupt:
+        logger.debug("Got Keyboard Interrupt.")
+
+    finally:
+        ctx.destroy(linger=0)
+
+
+def _check_multiproc_method():
+    if (cuda_is_initialized()
+            and os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") != "spawn"):
+        logger.warning("CUDA was previously initialized. We must use "
+                       "the `spawn` multiprocessing start method. Setting "
+                       "VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. "
+                       "See https://docs.vllm.ai/en/latest/getting_started/"
+                       "debugging.html#python-multiprocessing "
+                       "for more information.")
+        os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+
+
+def get_mp_context():
+    _check_multiproc_method()
+    mp_method = envs.VLLM_WORKER_MULTIPROC_METHOD
+    return multiprocessing.get_context(mp_method)
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index ba2b8377759d6..da3da6dad6436 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -75,11 +75,11 @@ def __init__(
 
         # EngineCore (starts the engine in background process).
         self.engine_core = EngineCoreClient.make_client(
-            vllm_config=vllm_config,
-            executor_class=executor_class,
-            usage_context=usage_context,
             multiprocess_mode=True,
             asyncio_mode=True,
+            vllm_config=vllm_config,
+            executor_class=executor_class,
+            log_stats=self.log_stats,
         )
 
         self.output_handler: Optional[asyncio.Task] = None
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index 0aef61fc7f680..5840541d774ba 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -3,20 +3,19 @@
 import signal
 import threading
 import time
-from dataclasses import dataclass
-from multiprocessing.process import BaseProcess
+from multiprocessing.connection import Connection
 from typing import List, Tuple, Type
 
+import psutil
 import zmq
 import zmq.asyncio
 from msgspec import msgpack
 
 from vllm.config import CacheConfig, VllmConfig
-from vllm.executor.multiproc_worker_utils import get_mp_context
 from vllm.logger import init_logger
 from vllm.transformers_utils.config import (
     maybe_register_config_serialize_by_value)
-from vllm.usage.usage_lib import UsageContext
+from vllm.utils import get_exception_traceback, zmq_socket_ctx
 from vllm.v1.core.scheduler import Scheduler
 from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs,
                             EngineCoreProfile, EngineCoreRequest,
@@ -25,14 +24,13 @@
 from vllm.v1.executor.abstract import Executor
 from vllm.v1.request import Request, RequestStatus
 from vllm.v1.serial_utils import PickleEncoder
-from vllm.v1.utils import make_zmq_socket
 from vllm.version import __version__ as VLLM_VERSION
 
 logger = init_logger(__name__)
 
 POLLING_TIMEOUT_MS = 5000
 POLLING_TIMEOUT_S = POLLING_TIMEOUT_MS // 1000
-LOGGING_TIME_S = POLLING_TIMEOUT_S
+LOGGING_TIME_S = 5
 
 
 class EngineCore:
@@ -42,9 +40,10 @@ def __init__(
         self,
         vllm_config: VllmConfig,
         executor_class: Type[Executor],
-        usage_context: UsageContext,
+        log_stats: bool = False,
     ):
         assert vllm_config.model_config.runner_type != "pooling"
+        self.log_stats = log_stats
 
         logger.info("Initializing an LLM engine (v%s) with config: %s",
                     VLLM_VERSION, vllm_config)
@@ -134,29 +133,19 @@ def profile(self, is_start: bool = True):
         self.model_executor.profile(is_start)
 
 
-@dataclass
-class EngineCoreProcHandle:
-    proc: BaseProcess
-    ready_path: str
-    input_path: str
-    output_path: str
-
-
 class EngineCoreProc(EngineCore):
     """ZMQ-wrapper for running EngineCore in background process."""
 
-    READY_STR = "READY"
-
     def __init__(
         self,
-        vllm_config: VllmConfig,
-        executor_class: Type[Executor],
-        usage_context: UsageContext,
         input_path: str,
         output_path: str,
-        ready_path: str,
+        ready_pipe: Connection,
+        vllm_config: VllmConfig,
+        executor_class: Type[Executor],
+        log_stats: bool = False,
     ):
-        super().__init__(vllm_config, executor_class, usage_context)
+        super().__init__(vllm_config, executor_class, log_stats)
 
         # Background Threads and Queues for IO. These enable us to
         # overlap ZMQ socket IO with GPU since they release the GIL,
@@ -173,68 +162,7 @@ def __init__(
                          daemon=True).start()
 
         # Send Readiness signal to EngineClient.
-        with make_zmq_socket(ready_path, zmq.constants.PUSH) as ready_socket:
-            ready_socket.send_string(EngineCoreProc.READY_STR)
-
-    @staticmethod
-    def wait_for_startup(
-        proc: BaseProcess,
-        ready_path: str,
-    ) -> None:
-        """Wait until the EngineCore is ready."""
-
-        try:
-            sync_ctx = zmq.Context()  # type: ignore[attr-defined]
-            socket = sync_ctx.socket(zmq.constants.PULL)
-            socket.connect(ready_path)
-
-            # Wait for EngineCore to send EngineCoreProc.READY_STR.
-            while socket.poll(timeout=POLLING_TIMEOUT_MS) == 0:
-                logger.debug("Waiting for EngineCoreProc to startup.")
-
-                if not proc.is_alive():
-                    raise RuntimeError("EngineCoreProc failed to start.")
-
-            message = socket.recv_string()
-            assert message == EngineCoreProc.READY_STR
-
-        except BaseException as e:
-            logger.exception(e)
-            raise e
-
-        finally:
-            sync_ctx.destroy(linger=0)
-
-    @staticmethod
-    def make_engine_core_process(
-        vllm_config: VllmConfig,
-        executor_class: Type[Executor],
-        usage_context: UsageContext,
-        input_path: str,
-        output_path: str,
-        ready_path: str,
-    ) -> EngineCoreProcHandle:
-        context = get_mp_context()
-
-        process_kwargs = {
-            "input_path": input_path,
-            "output_path": output_path,
-            "ready_path": ready_path,
-            "vllm_config": vllm_config,
-            "executor_class": executor_class,
-            "usage_context": usage_context,
-        }
-        # Run EngineCore busy loop in background process.
-        proc = context.Process(target=EngineCoreProc.run_engine_core,
-                               kwargs=process_kwargs)
-        proc.start()
-
-        # Wait for startup
-        EngineCoreProc.wait_for_startup(proc, ready_path)
-        return EngineCoreProcHandle(proc=proc,
-                                    ready_path=ready_path,
-                                    input_path=input_path,
-                                    output_path=output_path)
+        ready_pipe.send({"status": "READY"})
 
     @staticmethod
     def run_engine_core(*args, **kwargs):
@@ -258,6 +186,7 @@ def signal_handler(signum, frame):
         signal.signal(signal.SIGTERM, signal_handler)
         signal.signal(signal.SIGINT, signal_handler)
 
+        parent_process = psutil.Process().parent()
         engine_core = None
         try:
             engine_core = EngineCoreProc(*args, **kwargs)
@@ -266,9 +195,10 @@ def signal_handler(signum, frame):
         except SystemExit:
             logger.debug("EngineCore interrupted.")
 
-        except BaseException as e:
-            logger.exception(e)
-            raise e
+        except Exception:
+            traceback = get_exception_traceback()
+            logger.error("EngineCore hit an exception: %s", traceback)
+            parent_process.send_signal(signal.SIGQUIT)
 
         finally:
             if engine_core is not None:
@@ -309,6 +239,9 @@ def run_busy_loop(self):
     def _log_stats(self):
         """Log basic stats every LOGGING_TIME_S"""
 
+        if not self.log_stats:
+            return
+
         now = time.time()
 
         if now - self._last_logging_time > LOGGING_TIME_S:
@@ -339,7 +272,7 @@ def process_input_socket(self, input_path: str):
         decoder_add_req = PickleEncoder()
         decoder_abort_req = PickleEncoder()
 
-        with make_zmq_socket(input_path, zmq.constants.PULL) as socket:
+        with zmq_socket_ctx(input_path, zmq.constants.PULL) as socket:
             while True:
                 # (RequestType, RequestData)
                 type_frame, data_frame = socket.recv_multipart(copy=False)
@@ -367,7 +300,7 @@ def process_output_socket(self, output_path: str):
         # Reuse send buffer.
         buffer = bytearray()
 
-        with make_zmq_socket(output_path, zmq.constants.PUSH) as socket:
+        with zmq_socket_ctx(output_path, zmq.constants.PUSH) as socket:
             while True:
                 engine_core_outputs = self.output_queue.get()
                 outputs = EngineCoreOutputs(outputs=engine_core_outputs)
diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py
index d56fcbdb1e7c4..beb5d57c20c83 100644
--- a/vllm/v1/engine/core_client.py
+++ b/vllm/v1/engine/core_client.py
@@ -1,19 +1,19 @@
-import os
-import weakref
-from typing import List, Optional
+from typing import List, Optional, Type
 
 import msgspec
 import zmq
 import zmq.asyncio
 
+from vllm.config import VllmConfig
 from vllm.logger import init_logger
-from vllm.utils import get_open_zmq_ipc_path, kill_process_tree
+from vllm.utils import get_open_zmq_ipc_path
 from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs,
                             EngineCoreProfile, EngineCoreRequest,
                             EngineCoreRequestType, EngineCoreRequestUnion)
-from vllm.v1.engine.core import (EngineCore, EngineCoreProc,
-                                 EngineCoreProcHandle)
+from vllm.v1.engine.core import EngineCore, EngineCoreProc
+from vllm.v1.executor.abstract import Executor
 from vllm.v1.serial_utils import PickleEncoder
+from vllm.v1.utils import BackgroundProcHandle
 
 logger = init_logger(__name__)
 
@@ -31,10 +31,11 @@ class EngineCoreClient:
 
     @staticmethod
     def make_client(
-        *args,
         multiprocess_mode: bool,
         asyncio_mode: bool,
-        **kwargs,
+        vllm_config: VllmConfig,
+        executor_class: Type[Executor],
+        log_stats: bool = False,
     ) -> "EngineCoreClient":
 
         # TODO: support this for debugging purposes.
@@ -44,12 +45,12 @@ def make_client(
                 "is not currently supported.")
 
         if multiprocess_mode and asyncio_mode:
-            return AsyncMPClient(*args, **kwargs)
+            return AsyncMPClient(vllm_config, executor_class, log_stats)
 
         if multiprocess_mode and not asyncio_mode:
-            return SyncMPClient(*args, **kwargs)
+            return SyncMPClient(vllm_config, executor_class, log_stats)
 
-        return InprocClient(*args, **kwargs)
+        return InprocClient(vllm_config, executor_class, log_stats)
 
     def shutdown(self):
         pass
@@ -128,9 +129,10 @@ class MPClient(EngineCoreClient):
 
     def __init__(
         self,
-        *args,
         asyncio_mode: bool,
-        **kwargs,
+        vllm_config: VllmConfig,
+        executor_class: Type[Executor],
+        log_stats: bool = False,
     ):
         # Serialization setup.
         self.encoder = PickleEncoder()
@@ -143,7 +145,6 @@ def __init__(
             self.ctx = zmq.Context()  # type: ignore[attr-defined]
 
         # Path for IPC.
-        ready_path = get_open_zmq_ipc_path()
         output_path = get_open_zmq_ipc_path()
         input_path = get_open_zmq_ipc_path()
 
@@ -156,47 +157,40 @@ def __init__(
         self.input_socket.bind(input_path)
 
         # Start EngineCore in background process.
-        self.proc_handle: Optional[EngineCoreProcHandle]
-        self.proc_handle = EngineCoreProc.make_engine_core_process(
-            *args,
-            input_path=
-            input_path,  # type: ignore[misc]  # MyPy incorrectly flags duplicate keywords
-            output_path=output_path,  # type: ignore[misc]
-            ready_path=ready_path,  # type: ignore[misc]
-            **kwargs,
-        )
-        self._finalizer = weakref.finalize(self, self.shutdown)
+        self.proc_handle: Optional[BackgroundProcHandle]
+        self.proc_handle = BackgroundProcHandle(
+            input_path=input_path,
+            output_path=output_path,
+            process_name="EngineCore",
+            target_fn=EngineCoreProc.run_engine_core,
+            process_kwargs={
+                "vllm_config": vllm_config,
+                "executor_class": executor_class,
+                "log_stats": log_stats,
+            })
 
     def shutdown(self):
         # Shut down the zmq context.
         self.ctx.destroy(linger=0)
 
         if hasattr(self, "proc_handle") and self.proc_handle:
-            # Shutdown the process if needed.
-            if self.proc_handle.proc.is_alive():
-                self.proc_handle.proc.terminate()
-                self.proc_handle.proc.join(5)
-
-                if self.proc_handle.proc.is_alive():
-                    kill_process_tree(self.proc_handle.proc.pid)
-
-            # Remove zmq ipc socket files
-            ipc_sockets = [
-                self.proc_handle.ready_path, self.proc_handle.output_path,
-                self.proc_handle.input_path
-            ]
-            for ipc_socket in ipc_sockets:
-                socket_file = ipc_socket.replace("ipc://", "")
-                if os and os.path.exists(socket_file):
-                    os.remove(socket_file)
+            self.proc_handle.shutdown()
             self.proc_handle = None
 
 
 class SyncMPClient(MPClient):
     """Synchronous client for multi-proc EngineCore."""
 
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, asyncio_mode=False, **kwargs)
+    def __init__(self,
+                 vllm_config: VllmConfig,
+                 executor_class: Type[Executor],
+                 log_stats: bool = False):
+        super().__init__(
+            asyncio_mode=False,
+            vllm_config=vllm_config,
+            executor_class=executor_class,
+            log_stats=log_stats,
+        )
 
     def get_output(self) -> List[EngineCoreOutput]:
 
@@ -225,8 +219,16 @@ def profile(self, is_start: bool = True) -> None:
 class AsyncMPClient(MPClient):
     """Asyncio-compatible client for multi-proc EngineCore."""
 
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, asyncio_mode=True, **kwargs)
+    def __init__(self,
+                 vllm_config: VllmConfig,
+                 executor_class: Type[Executor],
+                 log_stats: bool = False):
+        super().__init__(
+            asyncio_mode=True,
+            vllm_config=vllm_config,
+            executor_class=executor_class,
+            log_stats=log_stats,
+        )
 
     async def get_output_async(self) -> List[EngineCoreOutput]:
 
diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py
index b58f62778ffe9..fc323184abc8f 100644
--- a/vllm/v1/engine/llm_engine.py
+++ b/vllm/v1/engine/llm_engine.py
@@ -72,11 +72,11 @@ def __init__(
 
         # EngineCore (gets EngineCoreRequests and gives EngineCoreOutputs)
         self.engine_core = EngineCoreClient.make_client(
-            vllm_config,
-            executor_class,
-            usage_context,
             multiprocess_mode=multiprocess_mode,
             asyncio_mode=False,
+            vllm_config=vllm_config,
+            executor_class=executor_class,
+            log_stats=False,
         )
 
     @classmethod
diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py
index 128101aa6956d..ed64e7741390d 100644
--- a/vllm/v1/executor/multiproc_executor.py
+++ b/vllm/v1/executor/multiproc_executor.py
@@ -17,13 +17,12 @@
 from vllm.distributed.device_communicators.shm_broadcast import (Handle,
                                                                  MessageQueue)
 from vllm.executor.multiproc_worker_utils import (
-    _add_prefix, get_mp_context, set_multiprocessing_worker_envs)
+    _add_prefix, set_multiprocessing_worker_envs)
 from vllm.logger import init_logger
-from vllm.utils import (get_distributed_init_method, get_open_port,
-                        get_open_zmq_ipc_path)
+from vllm.utils import (get_distributed_init_method, get_mp_context,
+                        get_open_port, get_open_zmq_ipc_path, zmq_socket_ctx)
 from vllm.v1.executor.abstract import Executor
 from vllm.v1.outputs import ModelRunnerOutput
-from vllm.v1.utils import make_zmq_socket
 from vllm.worker.worker_base import WorkerWrapperBase
 
 logger = init_logger(__name__)
@@ -250,7 +249,7 @@ def __init__(
         worker_response_mq_handle = self.worker_response_mq.export_handle()
 
         # Send Readiness signal to EngineCore process.
-        with make_zmq_socket(ready_path, zmq.constants.PUSH) as ready_socket:
+        with zmq_socket_ctx(ready_path, zmq.constants.PUSH) as ready_socket:
             payload = pickle.dumps(worker_response_mq_handle,
                                    protocol=pickle.HIGHEST_PROTOCOL)
             ready_socket.send_string(WorkerProc.READY_STR)
@@ -352,7 +351,7 @@ def wait_for_startup(
         ready_path: str,
     ) -> Optional[Handle]:
         """Wait until the Worker is ready."""
-        with make_zmq_socket(ready_path, zmq.constants.PULL) as socket:
+        with zmq_socket_ctx(ready_path, zmq.constants.PULL) as socket:
 
             # Wait for Worker to send READY.
             while socket.poll(timeout=POLLING_TIMEOUT_MS) == 0:
diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py
index e802c6439b740..19e0dd17237c9 100644
--- a/vllm/v1/utils.py
+++ b/vllm/v1/utils.py
@@ -1,11 +1,11 @@
+import os
+import weakref
 from collections.abc import Sequence
-from contextlib import contextmanager
-from typing import (Any, Generic, Iterator, List, Optional, TypeVar, Union,
-                    overload)
-
-import zmq
+from typing import (Any, Callable, Dict, Generic, List, Optional, TypeVar,
+                    Union, overload)
 
 from vllm.logger import init_logger
+from vllm.utils import get_mp_context, kill_process_tree
 
 logger = init_logger(__name__)
 
@@ -77,27 +77,58 @@ def __len__(self):
         return len(self._x)
 
 
-@contextmanager
-def make_zmq_socket(
-        path: str,
-        type: Any) -> Iterator[zmq.Socket]:  # type: ignore[name-defined]
-    """Context manager for a ZMQ socket"""
-
-    ctx = zmq.Context()  # type: ignore[attr-defined]
-    try:
-        socket = ctx.socket(type)
-
-        if type == zmq.constants.PULL:
-            socket.connect(path)
-        elif type == zmq.constants.PUSH:
-            socket.bind(path)
-        else:
-            raise ValueError(f"Unknown Socket Type: {type}")
-
-        yield socket
-
-    except KeyboardInterrupt:
-        logger.debug("Worker had Keyboard Interrupt.")
-
-    finally:
-        ctx.destroy(linger=0)
+class BackgroundProcHandle:
+    """
+    Utility class to handle creation, readiness, and shutdown
+    of background processes used by the AsyncLLM and LLMEngine.
+    """
+
+    def __init__(
+        self,
+        input_path: str,
+        output_path: str,
+        process_name: str,
+        target_fn: Callable,
+        process_kwargs: Dict[Any, Any],
+    ):
+        self._finalizer = weakref.finalize(self, self.shutdown)
+
+        context = get_mp_context()
+        reader, writer = context.Pipe(duplex=False)
+
+        assert ("ready_pipe" not in process_kwargs
+                and "input_path" not in process_kwargs
+                and "output_path" not in process_kwargs)
+        process_kwargs["ready_pipe"] = writer
+        process_kwargs["input_path"] = input_path
+        process_kwargs["output_path"] = output_path
+        self.input_path = input_path
+        self.output_path = output_path
+
+        # Run Detokenizer busy loop in background process.
+        self.proc = context.Process(target=target_fn, kwargs=process_kwargs)
+        self.proc.start()
+
+        # Wait for startup.
+        if reader.recv()["status"] != "READY":
+            raise RuntimeError(f"{process_name} initialization failed. "
+                               "See root cause above.")
+
+    def __del__(self):
+        self.shutdown()
+
+    def shutdown(self):
+        # Shutdown the process if needed.
+        if hasattr(self, "proc") and self.proc.is_alive():
+            self.proc.terminate()
+            self.proc.join(5)
+
+            if self.proc.is_alive():
+                kill_process_tree(self.proc.pid)
+
+        # Remove zmq ipc socket files
+        ipc_sockets = [self.output_path, self.input_path]
+        for ipc_socket in ipc_sockets:
+            socket_file = ipc_socket.replace("ipc://", "")
+            if os and os.path.exists(socket_file):
+                os.remove(socket_file)

From b5cbe8eeb30e86c8477d91c66f5c7a10e4ee754b Mon Sep 17 00:00:00 2001
From: Rajveer Bachkaniwala <46040700+rajveerb@users.noreply.github.com>
Date: Fri, 27 Dec 2024 22:34:46 -0500
Subject: [PATCH 013/462] [Bugfix] Last token measurement fix (#11376)

Signed-off-by: rajveerb <46040700+rajveerb@users.noreply.github.com>
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
---
 vllm/engine/llm_engine.py |  8 ++++++--
 vllm/sequence.py          | 24 ++++++++++++++----------
 2 files changed, 20 insertions(+), 12 deletions(-)

diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 39f59e55da1f7..1db3e59ff3bae 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -1124,6 +1124,8 @@ def _process_model_outputs(self,
 
             seq_group = scheduled_seq_group.seq_group
             seq_group.maybe_set_first_token_time(now)
+            if not seq_group.is_prefill():
+                seq_group.set_last_token_time(now)
             request_output = RequestOutputFactory.create(
                 seq_group,
                 self.seq_id_to_seq_group,
@@ -1166,6 +1168,8 @@ def _process_model_outputs(self,
 
             seq_group = scheduled_seq_group.seq_group
             seq_group.maybe_set_first_token_time(now)
+            if not seq_group.is_prefill():
+                seq_group.set_last_token_time(now)
             request_output = RequestOutputFactory.create(
                 seq_group,
                 self.seq_id_to_seq_group,
@@ -1686,7 +1690,7 @@ def _get_stats(self,
                     # If the seq_group just finished the prefill state
                     # get TTFT.
                     if not seq_group.is_prefill():
-                        latency = seq_group.get_last_latency(now)
+                        latency = seq_group.get_last_token_latency()
                         time_to_first_tokens_iter.append(latency)
 
                         # One generation token per finished prefill.
@@ -1694,7 +1698,7 @@ def _get_stats(self,
                             seq_group.num_seqs())
                 else:
                     # TPOTs.
-                    latency = seq_group.get_last_latency(now)
+                    latency = seq_group.get_last_token_latency()
                     time_per_output_tokens_iter.append(latency)
                     if seq_group.state.current_step == 0:
                         # For async_output_proc, the do_log_stats()
diff --git a/vllm/sequence.py b/vllm/sequence.py
index cc3d96fc93a79..34f910d47b7d9 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -667,6 +667,7 @@ def __init__(
                                       first_scheduled_time=None,
                                       first_token_time=None,
                                       time_in_queue=None)
+        self.last_token_latency = 0.0
         self.lora_request = lora_request
         self.prompt_logprobs: Optional[PromptLogprobs] = None
         self.state = SequenceGroupState()
@@ -762,18 +763,21 @@ def init_multi_step_from_lookahead_slots(self, num_lookahead_slots: int,
             assert num_lookahead_slots + 1 == num_scheduler_steps or is_prefill
             self.init_multi_step(num_steps=num_lookahead_slots + 1)
 
-    def get_last_latency(self, now: float) -> float:
+    def set_last_token_time(self, now: float) -> None:
         """Sets the last token time for Request level timings."""
-        # If still in prefill phase, raise Error.
-        if self.is_prefill():
-            raise ValueError(
-                "seq_group.get_last_latency() should not be called "
-                "if the seq_group is in prefill phase.")
-
-        # Otherwise return token latency.
-        latency = now - self.metrics.last_token_time
+        # If still in prefill phase, assertion fails.
+        assert not self.is_prefill(), (
+            "seq_group.set_last_token_time() should not be called "
+            "if the seq_group is in prefill phase.")
+        self.last_token_latency = now - self.metrics.last_token_time
         self.metrics.last_token_time = now
-        return latency
+
+    def get_last_token_latency(self) -> float:
+        """Returns the latency of the last token."""
+        assert not self.is_prefill(), (
+            "seq_group.get_last_token_latency() should not be called "
+            "if the seq_group is in prefill phase.")
+        return self.last_token_latency
 
     def maybe_set_first_token_time(self, time: float) -> None:
         """Sets the first token time for Request level timings."""

From d34be24bb196cb0cce167257c97449f0cd6858f7 Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Sat, 28 Dec 2024 14:14:10 +0800
Subject: [PATCH 014/462] [Model] Support InternLM2 Reward models (#11571)

Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
---
 docs/source/models/supported_models.md  |  5 +++
 tests/models/registry.py                |  2 +
 vllm/model_executor/models/internlm2.py | 60 ++++++++++++++++++++++++-
 vllm/model_executor/models/registry.py  |  1 +
 4 files changed, 67 insertions(+), 1 deletion(-)

diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md
index 7acafda50793c..fa7102cd88063 100644
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -450,6 +450,11 @@ of the whole prompt are extracted from the normalized hidden state corresponding
     - Example HF Models
     - :ref:`LoRA <lora-adapter>`
     - :ref:`PP <distributed-serving>`
+  * - :code:`InternLM2ForRewardModel`
+    - InternLM2-based
+    - :code:`internlm/internlm2-1_8b-reward`, :code:`internlm/internlm2-7b-reward`, etc.
+    - ✅︎
+    - ✅︎
   * - :code:`LlamaForCausalLM`
     - Llama-based
     - :code:`peiyi9979/math-shepherd-mistral-7b-prm`, etc.
diff --git a/tests/models/registry.py b/tests/models/registry.py
index f5a37420a2909..e5dfb2822745d 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -140,6 +140,8 @@ class _HfExamplesInfo:
     "BertModel": _HfExamplesInfo("BAAI/bge-base-en-v1.5"),
     "Gemma2Model": _HfExamplesInfo("BAAI/bge-multilingual-gemma2"),
     "GritLM": _HfExamplesInfo("parasail-ai/GritLM-7B-vllm"),
+    "InternLM2ForRewardModel": _HfExamplesInfo("internlm/internlm2-1_8b-reward",
+                                               trust_remote_code=True),
     "JambaForSequenceClassification": _HfExamplesInfo("ai21labs/Jamba-tiny-reward-dev"),  # noqa: E501
     "LlamaModel": _HfExamplesInfo("llama", is_available_online=False),
     "MistralModel": _HfExamplesInfo("intfloat/e5-mistral-7b-instruct"),
diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py
index 41b9f110d771f..28c23edd4c8e8 100644
--- a/vllm/model_executor/models/internlm2.py
+++ b/vllm/model_executor/models/internlm2.py
@@ -18,14 +18,16 @@
                                                QKVParallelLinear,
                                                RowParallelLinear)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.pooler import Pooler, PoolingType
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.pooling_metadata import PoolingMetadata
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.sequence import IntermediateTensors
+from vllm.sequence import IntermediateTensors, PoolerOutput
 
 from .interfaces import SupportsLoRA, SupportsPP
 from .utils import (is_pp_missing_parameter,
@@ -433,3 +435,59 @@ def load_weights(self, weights: Iterable[Tuple[str,
                 weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
+
+
+class InternLM2ForRewardModel(InternLM2ForCausalLM):
+
+    def __init__(
+        self,
+        *,
+        vllm_config: VllmConfig,
+        prefix: str = "",
+        model_type: Type[InternLM2Model] = InternLM2Model,
+    ):
+        super().__init__(vllm_config=vllm_config,
+                         prefix=prefix,
+                         model_type=model_type)
+
+        for attr in ("output", "logits_processor", "sampler"):
+            delattr(self, attr)
+
+        config = vllm_config.model_config.hf_config
+        self.v_head = RowParallelLinear(
+            config.hidden_size,
+            1,
+            bias=False,
+            input_is_parallel=False,
+            prefix=maybe_prefix(prefix, "v_head"),
+        )
+
+        pooler_config = vllm_config.model_config.pooler_config
+        self._pooler = Pooler.from_config_with_defaults(
+            pooler_config,
+            pooling_type=PoolingType.ALL,
+            normalize=False,
+            softmax=False,
+        )
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[torch.Tensor],
+        attn_metadata: AttentionMetadata,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, IntermediateTensors]:
+        hidden_states = self.model(input_ids, positions, kv_caches,
+                                   attn_metadata, intermediate_tensors,
+                                   inputs_embeds)
+        logits, _ = self.v_head(hidden_states)
+        return logits
+
+    def pooler(
+        self,
+        hidden_states: torch.Tensor,
+        pooling_metadata: PoolingMetadata,
+    ) -> Optional[PoolerOutput]:
+        return self._pooler(hidden_states, pooling_metadata)
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index 89992de7e238d..67268eb4bb85f 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -113,6 +113,7 @@
     "Gemma2Model": ("gemma2", "Gemma2ForCausalLM"),
     "GlmForCausalLM": ("glm", "GlmForCausalLM"),
     "GritLM": ("gritlm", "GritLM"),
+    "InternLM2ForRewardModel": ("internlm2", "InternLM2ForRewardModel"),
     "JambaForSequenceClassification": ("jamba", "JambaForSequenceClassification"),  # noqa: E501
     "LlamaModel": ("llama", "LlamaForCausalLM"),
     **{

From b7dcc003dc1ace7605946d52b7e077ba1d3bbe86 Mon Sep 17 00:00:00 2001
From: Roger Wang <136131678+ywang96@users.noreply.github.com>
Date: Sat, 28 Dec 2024 02:54:23 -0800
Subject: [PATCH 015/462] [Model] Remove hardcoded image tokens ids from
 Pixtral (#11582)

Signed-off-by: Roger Wang <ywang@roblox.com>
---
 vllm/model_executor/models/pixtral.py | 27 +++++++++++++--------------
 1 file changed, 13 insertions(+), 14 deletions(-)

diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py
index f3d66c2313198..22d29f5bbc50c 100644
--- a/vllm/model_executor/models/pixtral.py
+++ b/vllm/model_executor/models/pixtral.py
@@ -45,13 +45,6 @@
 except ImportError:
     USE_XFORMERS_OPS = False
 
-# These token ids cannot be retrieved from model config
-# so we hardcode them here.
-PIXTRAL_12B_IMAGE_BREAK_ID = 12
-PIXTRAL_12B_IMAGE_END_ID = 13
-PIXTRAL_LARGE_IMAGE_BREAK_ID = 14
-PIXTRAL_LARGE_IMAGE_END_ID = 15
-
 
 def get_max_pixtral_image_tokens(ctx: InputContext):
     tokenizer = cached_get_tokenizer(
@@ -201,6 +194,13 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
             if key in dataclass_fields
         }
 
+        if not ("image_break_token_id" in vision_args
+                and "image_end_token_id" in vision_args):
+            raise ValueError(
+                "'image_break_token_id' and 'image_end_token_id' not found "
+                "in the vision_encoder arguments. Please download the latest "
+                "version of 'params.json' from the model repository.")
+
         self.vision_args = VisionEncoderArgs(**vision_args)
 
         # init MistralForCausalLM
@@ -240,9 +240,8 @@ def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
 
         # NOTE: Image embeddings are split into separate tensors for each image
         # by the indices of `[IMG_END]` token.
-        image_end_condition = (image_tokens == PIXTRAL_12B_IMAGE_END_ID) | (
-            image_tokens == PIXTRAL_LARGE_IMAGE_END_ID)
-        split_indices = torch.where(image_end_condition)[0] + 1
+        image_end_mask = image_tokens == self.vision_args.image_end_token_id
+        split_indices = torch.where(image_end_mask)[0] + 1
         if len(split_indices) <= 1:
             # Do not split, return as tensor of shape [1, fs, hs]
             return image_embeds.unsqueeze(0)
@@ -265,10 +264,8 @@ def get_input_embeddings(
             inputs_embeds = merge_multimodal_embeddings(
                 input_ids, inputs_embeds, multimodal_embeddings, [
                     self.vision_args.image_token_id,
-                    PIXTRAL_12B_IMAGE_END_ID,
-                    PIXTRAL_12B_IMAGE_BREAK_ID,
-                    PIXTRAL_LARGE_IMAGE_BREAK_ID,
-                    PIXTRAL_LARGE_IMAGE_END_ID,
+                    self.vision_args.image_break_token_id,
+                    self.vision_args.image_end_token_id,
                 ])
         return inputs_embeds
 
@@ -409,6 +406,8 @@ class VisionEncoderArgs:
     num_attention_heads: int
     rope_theta: float  # for rope-2D
     image_token_id: int
+    image_break_token_id: int
+    image_end_token_id: int
     adapter_bias: bool = True
 
 
From 59d6bb4c863e511e58799efac847065c28c52c8b Mon Sep 17 00:00:00 2001
From: hj-wei <hjwei_xd@163.com>
Date: Sat, 28 Dec 2024 19:17:35 +0800
Subject: [PATCH 016/462] [Hardware][AMD]: Replace HIPCC version with more
 precise ROCm version (#11515)

Signed-off-by: hjwei <hjwei_xd@163.com>
---
 setup.py | 52 +++++++++++++++++++++++++++++-----------------------
 1 file changed, 29 insertions(+), 23 deletions(-)

diff --git a/setup.py b/setup.py
index 61d2d710aa20e..ba6953dbdc174 100644
--- a/setup.py
+++ b/setup.py
@@ -1,3 +1,4 @@
+import ctypes
 import importlib.util
 import logging
 import os
@@ -13,7 +14,7 @@
 from setuptools import Extension, find_packages, setup
 from setuptools.command.build_ext import build_ext
 from setuptools_scm import get_version
-from torch.utils.cpp_extension import CUDA_HOME
+from torch.utils.cpp_extension import CUDA_HOME, ROCM_HOME
 
 
 def load_module_from_path(module_name, path):
@@ -379,25 +380,31 @@ def _build_custom_ops() -> bool:
     return _is_cuda() or _is_hip() or _is_cpu()
 
 
-def get_hipcc_rocm_version():
-    # Run the hipcc --version command
-    result = subprocess.run(['hipcc', '--version'],
-                            stdout=subprocess.PIPE,
-                            stderr=subprocess.STDOUT,
-                            text=True)
+def get_rocm_version():
+    # Get the Rocm version from the ROCM_HOME/bin/librocm-core.so
+    # see https://github.com/ROCm/rocm-core/blob/d11f5c20d500f729c393680a01fa902ebf92094b/rocm_version.cpp#L21
+    try:
+        librocm_core_file = Path(ROCM_HOME) / "lib" / "librocm-core.so"
+        if not librocm_core_file.is_file():
+            return None
+        librocm_core = ctypes.CDLL(librocm_core_file)
+        VerErrors = ctypes.c_uint32
+        get_rocm_core_version = librocm_core.getROCmVersion
+        get_rocm_core_version.restype = VerErrors
+        get_rocm_core_version.argtypes = [
+            ctypes.POINTER(ctypes.c_uint32),
+            ctypes.POINTER(ctypes.c_uint32),
+            ctypes.POINTER(ctypes.c_uint32),
+        ]
+        major = ctypes.c_uint32()
+        minor = ctypes.c_uint32()
+        patch = ctypes.c_uint32()
 
-    # Check if the command was executed successfully
-    if result.returncode != 0:
-        print("Error running 'hipcc --version'")
+        if (get_rocm_core_version(ctypes.byref(major), ctypes.byref(minor),
+                                  ctypes.byref(patch)) == 0):
+            return "%d.%d.%d" % (major.value, minor.value, patch.value)
         return None
-
-    # Extract the version using a regular expression
-    match = re.search(r'HIP version: (\S+)', result.stdout)
-    if match:
-        # Return the version string
-        return match.group(1)
-    else:
-        print("Could not find HIP version in the output")
+    except Exception:
         return None
 
 
@@ -479,11 +486,10 @@ def get_vllm_version() -> str:
                 if "sdist" not in sys.argv:
                     version += f"{sep}cu{cuda_version_str}"
     elif _is_hip():
-        # Get the HIP version
-        hipcc_version = get_hipcc_rocm_version()
-        if hipcc_version != MAIN_CUDA_VERSION:
-            rocm_version_str = hipcc_version.replace(".", "")[:3]
-            version += f"{sep}rocm{rocm_version_str}"
+        # Get the Rocm Version
+        rocm_version = get_rocm_version() or torch.version.hip
+        if rocm_version and rocm_version != MAIN_CUDA_VERSION:
+            version += f"{sep}rocm{rocm_version.replace('.', '')[:3]}"
     elif _is_neuron():
         # Get the Neuron version
         neuron_version = str(get_neuronxcc_version())

From 42bb201fd6f79d6ed2e28e0263ffa891cd993c4c Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Sat, 28 Dec 2024 22:33:12 +0900
Subject: [PATCH 017/462] [V1][Minor] Set pin_memory=False for token_ids_cpu
 tensor (#11581)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
---
 vllm/v1/worker/gpu_input_batch.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py
index 6c4d300ec6efe..e79145300fe06 100644
--- a/vllm/v1/worker/gpu_input_batch.py
+++ b/vllm/v1/worker/gpu_input_batch.py
@@ -57,11 +57,13 @@ def __init__(
 
         # TODO(woosuk): This buffer could be too large if max_model_len is big.
         # Find a way to reduce the CPU memory usage.
+        # This buffer is not directly transferred to the GPU, so it does not
+        # need to be pinned.
         self.token_ids_cpu_tensor = torch.zeros(
             (max_num_reqs, max_model_len),
             device="cpu",
             dtype=torch.int32,
-            pin_memory=pin_memory,
+            pin_memory=False,
         )
         self.token_ids_cpu = self.token_ids_cpu_tensor.numpy()
         self.num_computed_tokens_cpu = np.empty(max_num_reqs, dtype=np.int32)

From d427e5cfda8d2536b81e6021128e71b2dbc281aa Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Sat, 28 Dec 2024 21:53:59 +0800
Subject: [PATCH 018/462] [Doc] Minor documentation fixes (#11580)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 docs/source/contributing/dockerfile/dockerfile.md  | 6 +++---
 docs/source/contributing/overview.md               | 2 +-
 docs/source/getting_started/arm-installation.md    | 2 +-
 docs/source/getting_started/cpu-installation.md    | 4 ++--
 docs/source/getting_started/gaudi-installation.md  | 8 +++++---
 docs/source/getting_started/neuron-installation.md | 2 +-
 docs/source/getting_started/quickstart.md          | 4 ++--
 docs/source/getting_started/tpu-installation.md    | 2 +-
 docs/source/models/supported_models.md             | 6 +++---
 docs/source/serving/deploying_with_cerebrium.md    | 6 +++---
 docs/source/serving/deploying_with_dstack.md       | 2 +-
 docs/source/serving/distributed_serving.md         | 6 +++---
 docs/source/serving/runai_model_streamer.md        | 2 +-
 13 files changed, 27 insertions(+), 25 deletions(-)

diff --git a/docs/source/contributing/dockerfile/dockerfile.md b/docs/source/contributing/dockerfile/dockerfile.md
index 6535414a7dca4..7ffec83333d7d 100644
--- a/docs/source/contributing/dockerfile/dockerfile.md
+++ b/docs/source/contributing/dockerfile/dockerfile.md
@@ -11,11 +11,11 @@ Below is a visual representation of the multi-stage Dockerfile. The build graph
 
 The edges of the build graph represent:
 
-- FROM ... dependencies (with a solid line and a full arrow head)
+- `FROM ...` dependencies (with a solid line and a full arrow head)
 
-- COPY --from=... dependencies (with a dashed line and an empty arrow head)
+- `COPY --from=...` dependencies (with a dashed line and an empty arrow head)
 
-- RUN --mount=(.\*)from=... dependencies (with a dotted line and an empty diamond arrow head)
+- `RUN --mount=(.\*)from=...` dependencies (with a dotted line and an empty diamond arrow head)
 
   > ```{figure} ../../assets/dev/dockerfile-stages-dependency.png
   > :align: center
diff --git a/docs/source/contributing/overview.md b/docs/source/contributing/overview.md
index 9dac41cff0bcb..c960790f47a13 100644
--- a/docs/source/contributing/overview.md
+++ b/docs/source/contributing/overview.md
@@ -34,7 +34,7 @@ pytest tests/
 ```
 
 ```{note}
-Currently, the repository does not pass the `mypy` tests.
+Currently, the repository is not fully checked by `mypy`.
 ```
 
 # Contribution Guidelines
diff --git a/docs/source/getting_started/arm-installation.md b/docs/source/getting_started/arm-installation.md
index de807e198b4f6..799b597b3ad5d 100644
--- a/docs/source/getting_started/arm-installation.md
+++ b/docs/source/getting_started/arm-installation.md
@@ -20,7 +20,7 @@ Contents:
 ## Requirements
 
 - **Operating System**: Linux or macOS
-- **Compiler**: gcc/g++ >= 12.3.0 (optional, but recommended)
+- **Compiler**: `gcc/g++ >= 12.3.0` (optional, but recommended)
 - **Instruction Set Architecture (ISA)**: NEON support is required
 
 (arm-backend-quick-start-dockerfile)=
diff --git a/docs/source/getting_started/cpu-installation.md b/docs/source/getting_started/cpu-installation.md
index b6f181ace6274..c3d3f715ed804 100644
--- a/docs/source/getting_started/cpu-installation.md
+++ b/docs/source/getting_started/cpu-installation.md
@@ -24,7 +24,7 @@ Table of contents:
 ## Requirements
 
 - OS: Linux
-- Compiler: gcc/g++>=12.3.0 (optional, recommended)
+- Compiler: `gcc/g++>=12.3.0` (optional, recommended)
 - Instruction set architecture (ISA) requirement: AVX512 (optional, recommended)
 
 (cpu-backend-quick-start-dockerfile)=
@@ -69,7 +69,7 @@ $ VLLM_TARGET_DEVICE=cpu python setup.py install
 
 ```{note}
 - AVX512_BF16 is an extension ISA provides native BF16 data type conversion and vector product instructions, will brings some performance improvement compared with pure AVX512. The CPU backend build script will check the host CPU flags to determine whether to enable AVX512_BF16.
-- If you want to force enable AVX512_BF16 for the cross-compilation, please set environment variable VLLM_CPU_AVX512BF16=1 before the building.
+- If you want to force enable AVX512_BF16 for the cross-compilation, please set environment variable `VLLM_CPU_AVX512BF16=1` before the building.
 ```
 
 (env-intro)=
diff --git a/docs/source/getting_started/gaudi-installation.md b/docs/source/getting_started/gaudi-installation.md
index acf42f210dffb..447bf98084a5d 100644
--- a/docs/source/getting_started/gaudi-installation.md
+++ b/docs/source/getting_started/gaudi-installation.md
@@ -167,6 +167,8 @@ Currently in vLLM for HPU we support four execution modes, depending on selected
 In 1.18.0, all modes utilizing `PT_HPU_LAZY_MODE=0` are highly experimental and should be only used for validating functional correctness. Their performance will be improved in the next releases. For obtaining the best performance in 1.18.0, please use HPU Graphs, or PyTorch lazy mode.
 ```
 
+(gaudi-bucketing-mechanism)=
+
 ### Bucketing mechanism
 
 Intel Gaudi accelerators work best when operating on models with fixed tensor shapes. [Intel Gaudi Graph Compiler](https://docs.habana.ai/en/latest/Gaudi_Overview/Intel_Gaudi_Software_Suite.html#graph-compiler-and-runtime) is responsible for generating optimized binary code that implements the given model topology on Gaudi. In its default configuration, the produced binary code may be heavily dependent on input and output tensor shapes, and can require graph recompilation when encountering differently shaped tensors within the same topology. While the resulting binaries utilize Gaudi efficiently, the compilation itself may introduce a noticeable overhead in end-to-end execution.
@@ -185,7 +187,7 @@ INFO 08-01 21:37:59 hpu_model_runner.py:504] Decode bucket config (min, step, ma
 INFO 08-01 21:37:59 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)]
 ```
 
-`min` determines the lowest value of the bucket. `step` determines the interval between buckets, and `max` determines the upper bound of the bucket. Furthermore, interval between `min` and `step` has special handling - `min` gets multiplied by consecutive powers of two, until `step` gets reached. We call this the ramp-up phase and it is used for handling lower batch sizes with minimum wastage, while allowing larger padding on larger batch sizes.
+`min` determines the lowest value of the bucket. `step` determines the interval between buckets, and `max` determines the upper bound of the bucket. Furthermore, interval between `min` and `step` has special handling -- `min` gets multiplied by consecutive powers of two, until `step` gets reached. We call this the ramp-up phase and it is used for handling lower batch sizes with minimum wastage, while allowing larger padding on larger batch sizes.
 
 Example (with ramp-up)
 
@@ -214,7 +216,7 @@ If a request exceeds maximum bucket size in any dimension, it will be processed
 As an example, if a request of 3 sequences, with max sequence length of 412 comes in to an idle vLLM server, it will be padded executed as `(4, 512)` prefill bucket, as `batch_size` (number of sequences) will be padded to 4 (closest batch_size dimension higher than 3), and max sequence length will be padded to 512 (closest sequence length dimension higher than 412). After prefill stage, it will be executed as `(4, 512)` decode bucket and will continue as that bucket until either batch dimension changes (due to request being finished) - in which case it will become a `(2, 512)` bucket, or context length increases above 512 tokens, in which case it will become `(4, 640)` bucket.
 
 ```{note}
-Bucketing is transparent to a client - padding in sequence length dimension is never returned to the client, and padding in batch dimension does not create new requests.
+Bucketing is transparent to a client -- padding in sequence length dimension is never returned to the client, and padding in batch dimension does not create new requests.
 ```
 
 ### Warmup
@@ -235,7 +237,7 @@ INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][47/48] batch_size
 INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB
 ```
 
-This example uses the same buckets as in *Bucketing mechanism* section. Each output line corresponds to execution of a single bucket. When bucket is executed for the first time, its graph is compiled and can be reused later on, skipping further graph compilations.
+This example uses the same buckets as in the [Bucketing Mechanism](#gaudi-bucketing-mechanism) section. Each output line corresponds to execution of a single bucket. When bucket is executed for the first time, its graph is compiled and can be reused later on, skipping further graph compilations.
 
 ```{tip}
 Compiling all the buckets might take some time and can be turned off with `VLLM_SKIP_WARMUP=true` environment variable. Keep in mind that if you do that, you may face graph compilations once executing a given bucket for the first time. It is fine to disable warmup for development, but it's highly recommended to enable it in deployment.
diff --git a/docs/source/getting_started/neuron-installation.md b/docs/source/getting_started/neuron-installation.md
index d6de5760cc82c..baaeeb9f53a10 100644
--- a/docs/source/getting_started/neuron-installation.md
+++ b/docs/source/getting_started/neuron-installation.md
@@ -26,7 +26,7 @@ Installation steps:
 (build-from-source-neuron)=
 
 ```{note}
-The currently supported version of Pytorch for Neuron installs `triton` version `2.1.0`. This is incompatible with vLLM >= 0.5.3. You may see an error `cannot import name 'default_dump_dir...`. To work around this, run a `pip install --upgrade triton==3.0.0` after installing the vLLM wheel.
+The currently supported version of Pytorch for Neuron installs `triton` version `2.1.0`. This is incompatible with `vllm >= 0.5.3`. You may see an error `cannot import name 'default_dump_dir...`. To work around this, run a `pip install --upgrade triton==3.0.0` after installing the vLLM wheel.
 ```
 
 ## Build from source
diff --git a/docs/source/getting_started/quickstart.md b/docs/source/getting_started/quickstart.md
index 165e5df146dcd..9c8b7e4f592c9 100644
--- a/docs/source/getting_started/quickstart.md
+++ b/docs/source/getting_started/quickstart.md
@@ -114,7 +114,7 @@ $         "temperature": 0
 $     }'
 ```
 
-Since this server is compatible with OpenAI API, you can use it as a drop-in replacement for any applications using OpenAI API. For example, another way to query the server is via the `openai` python package:
+Since this server is compatible with OpenAI API, you can use it as a drop-in replacement for any applications using OpenAI API. For example, another way to query the server is via the `openai` Python package:
 
 ```python
 from openai import OpenAI
@@ -151,7 +151,7 @@ $         ]
 $     }'
 ```
 
-Alternatively, you can use the `openai` python package:
+Alternatively, you can use the `openai` Python package:
 
 ```python
 from openai import OpenAI
diff --git a/docs/source/getting_started/tpu-installation.md b/docs/source/getting_started/tpu-installation.md
index f2a949e7247d8..17eded4a51fec 100644
--- a/docs/source/getting_started/tpu-installation.md
+++ b/docs/source/getting_started/tpu-installation.md
@@ -103,7 +103,7 @@ Connect to your TPU using SSH:
 gcloud compute tpus tpu-vm ssh TPU_NAME --zone ZONE
 ```
 
-Install Miniconda
+Install Miniconda:
 
 ```bash
 wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md
index fa7102cd88063..f6e00fa71a310 100644
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -435,7 +435,7 @@ despite being described otherwise on its model card.
 ```
 
 If your model is not in the above list, we will try to automatically convert the model using
-:func:`vllm.model_executor.models.adapters.as_embedding_model`. By default, the embeddings
+{func}`vllm.model_executor.models.adapters.as_embedding_model`. By default, the embeddings
 of the whole prompt are extracted from the normalized hidden state corresponding to the last token.
 
 #### Reward Modeling (`--task reward`)
@@ -468,7 +468,7 @@ of the whole prompt are extracted from the normalized hidden state corresponding
 ```
 
 If your model is not in the above list, we will try to automatically convert the model using
-:func:`vllm.model_executor.models.adapters.as_reward_model`. By default, we return the hidden states of each token directly.
+{func}`vllm.model_executor.models.adapters.as_reward_model`. By default, we return the hidden states of each token directly.
 
 ```{important}
 For process-supervised reward models such as {code}`peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly,
@@ -500,7 +500,7 @@ e.g.: {code}`--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 1
 ```
 
 If your model is not in the above list, we will try to automatically convert the model using
-:func:`vllm.model_executor.models.adapters.as_classification_model`. By default, the class probabilities are extracted from the softmaxed hidden state corresponding to the last token.
+{func}`vllm.model_executor.models.adapters.as_classification_model`. By default, the class probabilities are extracted from the softmaxed hidden state corresponding to the last token.
 
 #### Sentence Pair Scoring (`--task score`)
 
diff --git a/docs/source/serving/deploying_with_cerebrium.md b/docs/source/serving/deploying_with_cerebrium.md
index 4863936236119..950064c8c1b10 100644
--- a/docs/source/serving/deploying_with_cerebrium.md
+++ b/docs/source/serving/deploying_with_cerebrium.md
@@ -33,7 +33,7 @@ docker_base_image_url = "nvidia/cuda:12.1.1-runtime-ubuntu22.04"
 vllm = "latest"
 ```
 
-Next, let us add our code to handle inference for the LLM of your choice(`mistralai/Mistral-7B-Instruct-v0.1` for this example), add the following code to your main.py\`:
+Next, let us add our code to handle inference for the LLM of your choice (`mistralai/Mistral-7B-Instruct-v0.1` for this example), add the following code to your `main.py`:
 
 ```python
 from vllm import LLM, SamplingParams
@@ -55,13 +55,13 @@ def run(prompts: list[str], temperature: float = 0.8, top_p: float = 0.95):
     return {"results": results}
 ```
 
-Then, run the following code to deploy it to the cloud
+Then, run the following code to deploy it to the cloud:
 
 ```console
 $ cerebrium deploy
 ```
 
-If successful, you should be returned a CURL command that you can call inference against. Just remember to end the url with the function name you are calling (in our case /run)
+If successful, you should be returned a CURL command that you can call inference against. Just remember to end the url with the function name you are calling (in our case` /run`)
 
 ```python
 curl -X POST https://api.cortex.cerebrium.ai/v4/p-xxxxxx/vllm/run \
diff --git a/docs/source/serving/deploying_with_dstack.md b/docs/source/serving/deploying_with_dstack.md
index 65ef1c0016208..381f5f786ca2c 100644
--- a/docs/source/serving/deploying_with_dstack.md
+++ b/docs/source/serving/deploying_with_dstack.md
@@ -25,7 +25,7 @@ $ cd vllm-dstack
 $ dstack init
 ```
 
-Next, to provision a VM instance with LLM of your choice(`NousResearch/Llama-2-7b-chat-hf` for this example), create the following `serve.dstack.yml` file for the dstack `Service`:
+Next, to provision a VM instance with LLM of your choice (`NousResearch/Llama-2-7b-chat-hf` for this example), create the following `serve.dstack.yml` file for the dstack `Service`:
 
 ```yaml
 type: service
diff --git a/docs/source/serving/distributed_serving.md b/docs/source/serving/distributed_serving.md
index c0a4b23f6dc70..7446b7c84cf46 100644
--- a/docs/source/serving/distributed_serving.md
+++ b/docs/source/serving/distributed_serving.md
@@ -8,7 +8,7 @@ Before going into the details of distributed inference and serving, let's first
 
 - **Single GPU (no distributed inference)**: If your model fits in a single GPU, you probably don't need to use distributed inference. Just use the single GPU to run the inference.
 - **Single-Node Multi-GPU (tensor parallel inference)**: If your model is too large to fit in a single GPU, but it can fit in a single node with multiple GPUs, you can use tensor parallelism. The tensor parallel size is the number of GPUs you want to use. For example, if you have 4 GPUs in a single node, you can set the tensor parallel size to 4.
-- **Multi-Node Multi-GPU (tensor parallel plus pipeline parallel inference)**: If your model is too large to fit in a single node, you can use tensor parallel together with pipeline parallelism. The tensor parallel size is the number of GPUs you want to use in each node, and the pipeline parallel size is the number of nodes you want to use. For example, if you have 16 GPUs in 2 nodes (8GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2.
+- **Multi-Node Multi-GPU (tensor parallel plus pipeline parallel inference)**: If your model is too large to fit in a single node, you can use tensor parallel together with pipeline parallelism. The tensor parallel size is the number of GPUs you want to use in each node, and the pipeline parallel size is the number of nodes you want to use. For example, if you have 16 GPUs in 2 nodes (8 GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2.
 
 In short, you should increase the number of GPUs and the number of nodes until you have enough GPU memory to hold the model. The tensor parallel size should be the number of GPUs in each node, and the pipeline parallel size should be the number of nodes.
 
@@ -77,7 +77,7 @@ Then you get a ray cluster of containers. Note that you need to keep the shells
 
 Then, on any node, use `docker exec -it node /bin/bash` to enter the container, execute `ray status` to check the status of the Ray cluster. You should see the right number of nodes and GPUs.
 
-After that, on any node, you can use vLLM as usual, just as you have all the GPUs on one node. The common practice is to set the tensor parallel size to the number of GPUs in each node, and the pipeline parallel size to the number of nodes. For example, if you have 16 GPUs in 2 nodes (8GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2:
+After that, on any node, you can use vLLM as usual, just as you have all the GPUs on one node. The common practice is to set the tensor parallel size to the number of GPUs in each node, and the pipeline parallel size to the number of nodes. For example, if you have 16 GPUs in 2 nodes (8 GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2:
 
 ```console
 $ vllm serve /path/to/the/model/in/the/container \
@@ -85,7 +85,7 @@ $     --tensor-parallel-size 8 \
 $     --pipeline-parallel-size 2
 ```
 
-You can also use tensor parallel without pipeline parallel, just set the tensor parallel size to the number of GPUs in the cluster. For example, if you have 16 GPUs in 2 nodes (8GPUs per node), you can set the tensor parallel size to 16:
+You can also use tensor parallel without pipeline parallel, just set the tensor parallel size to the number of GPUs in the cluster. For example, if you have 16 GPUs in 2 nodes (8 GPUs per node), you can set the tensor parallel size to 16:
 
 ```console
 $ vllm serve /path/to/the/model/in/the/container \
diff --git a/docs/source/serving/runai_model_streamer.md b/docs/source/serving/runai_model_streamer.md
index 1b5756a95075a..d4269050ff574 100644
--- a/docs/source/serving/runai_model_streamer.md
+++ b/docs/source/serving/runai_model_streamer.md
@@ -41,7 +41,7 @@ For reading from S3, it will be the number of client instances the host is openi
 $ vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer --model-loader-extra-config '{"concurrency":16}'
 ```
 
-You can controls the size of the CPU Memory buffer to which tensors are read from the file, and limit this size.
+You can control the size of the CPU Memory buffer to which tensors are read from the file, and limit this size.
 You can read further about CPU buffer memory limiting [here](https://github.com/run-ai/runai-model-streamer/blob/master/docs/src/env-vars.md#runai_streamer_memory_limit).
 
 ```console

From 328841d00294fb8226f0368cc380350b3d671d77 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Sun, 29 Dec 2024 00:55:42 +0800
Subject: [PATCH 019/462] [bugfix] interleaving sliding window for cohere2
 model (#11583)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 docs/source/models/supported_models.md      |   2 +-
 tests/models/test_initialization.py         |   4 -
 vllm/config.py                              |   2 +-
 vllm/model_executor/models/commandr.py      |  10 +-
 vllm/transformers_utils/config.py           |   7 +-
 vllm/transformers_utils/configs/__init__.py |   2 +
 vllm/transformers_utils/configs/cohere2.py  | 192 ++++++++++++++++++++
 7 files changed, 206 insertions(+), 13 deletions(-)
 create mode 100644 vllm/transformers_utils/configs/cohere2.py

diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md
index f6e00fa71a310..e11befbb8dd30 100644
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -112,7 +112,7 @@ See [this page](#generative-models) for more information on how to use generativ
     - :code:`THUDM/chatglm2-6b`, :code:`THUDM/chatglm3-6b`, etc.
     - ✅︎
     - ✅︎
-  * - :code:`CohereForCausalLM`,:code:`Cohere2ForCausalLM`
+  * - :code:`CohereForCausalLM`, :code:`Cohere2ForCausalLM`
     - Command-R
     - :code:`CohereForAI/c4ai-command-r-v01`, :code:`CohereForAI/c4ai-command-r7b-12-2024`, etc.
     - ✅︎
diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py
index a4eea7f035c91..3b728f2744fca 100644
--- a/tests/models/test_initialization.py
+++ b/tests/models/test_initialization.py
@@ -1,7 +1,6 @@
 from unittest.mock import patch
 
 import pytest
-import transformers
 from transformers import PretrainedConfig
 
 from vllm import LLM
@@ -12,9 +11,6 @@
 @pytest.mark.parametrize("model_arch", HF_EXAMPLE_MODELS.get_supported_archs())
 def test_can_initialize(model_arch):
     model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch)
-    if (model_arch == "Cohere2ForCausalLM"
-            and transformers.__version__ < "4.48.0"):
-        pytest.skip(reason="Model introduced in HF >= 4.48.0")
     if not model_info.is_available_online:
         pytest.skip("Model is not available online")
 
diff --git a/vllm/config.py b/vllm/config.py
index ac767bbe14be4..6ae1d4d944447 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -301,7 +301,7 @@ def __init__(self,
         sliding_window = getattr(self.hf_text_config, "sliding_window", None)
         has_interleaved_attention = (sliding_window is not None) and (
             isinstance(sliding_window, list) or
-            (self.hf_text_config.model_type in ["gemma2"]))
+            (self.hf_text_config.model_type in ["gemma2", "cohere2"]))
 
         if (not self.disable_sliding_window and has_interleaved_attention):
             if envs.VLLM_ATTENTION_BACKEND == "XFORMERS":
diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py
index c846e42f1b0c3..d22d1f3171463 100644
--- a/vllm/model_executor/models/commandr.py
+++ b/vllm/model_executor/models/commandr.py
@@ -172,16 +172,18 @@ def __init__(
             is_neox_style=False,
         )
 
-        sliding_window = getattr(config, "sliding_window", None)
-        # Model v2 has sliding windows, v1 does not
-        self.v1 = sliding_window is None
+        # Model v2 has interleaved sliding windows, v1 does not
+        interleaved_sliding_window = getattr(config,
+                                             "interleaved_sliding_window",
+                                             None)
+        self.v1 = interleaved_sliding_window is None
 
         layer_idx = extract_layer_index(prefix)
         layer_has_sliding_window = (
             getattr(config, "sliding_window_pattern", False)
             and (layer_idx + 1) % self.config.sliding_window_pattern != 0)
 
-        self.sliding_window = (sliding_window
+        self.sliding_window = (interleaved_sliding_window
                                if layer_has_sliding_window else None)
 
         self.attn = Attention(self.num_heads,
diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index 4529cf27ef565..58417980e7b47 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -22,9 +22,9 @@
 from vllm.logger import init_logger
 # yapf conflicts with isort for this block
 # yapf: disable
-from vllm.transformers_utils.configs import (ChatGLMConfig, DbrxConfig,
-                                             EAGLEConfig, ExaoneConfig,
-                                             H2OVLChatConfig,
+from vllm.transformers_utils.configs import (ChatGLMConfig, Cohere2Config,
+                                             DbrxConfig, EAGLEConfig,
+                                             ExaoneConfig, H2OVLChatConfig,
                                              InternVLChatConfig, JAISConfig,
                                              MedusaConfig, MllamaConfig,
                                              MLPSpeculatorConfig, MPTConfig,
@@ -52,6 +52,7 @@
 
 _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {
     "chatglm": ChatGLMConfig,
+    "cohere2": Cohere2Config,
     "dbrx": DbrxConfig,
     "mpt": MPTConfig,
     "RefinedWeb": RWConfig,  # For tiiuae/falcon-40b(-instruct)
diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py
index c24433cd436b4..a41a35c88b3a1 100644
--- a/vllm/transformers_utils/configs/__init__.py
+++ b/vllm/transformers_utils/configs/__init__.py
@@ -1,4 +1,5 @@
 from vllm.transformers_utils.configs.chatglm import ChatGLMConfig
+from vllm.transformers_utils.configs.cohere2 import Cohere2Config
 from vllm.transformers_utils.configs.dbrx import DbrxConfig
 from vllm.transformers_utils.configs.eagle import EAGLEConfig
 from vllm.transformers_utils.configs.exaone import ExaoneConfig
@@ -22,6 +23,7 @@
 
 __all__ = [
     "ChatGLMConfig",
+    "Cohere2Config",
     "DbrxConfig",
     "MPTConfig",
     "RWConfig",
diff --git a/vllm/transformers_utils/configs/cohere2.py b/vllm/transformers_utils/configs/cohere2.py
new file mode 100644
index 0000000000000..1509330fc2179
--- /dev/null
+++ b/vllm/transformers_utils/configs/cohere2.py
@@ -0,0 +1,192 @@
+# ruff: noqa
+
+# Adapted from
+# https://github.com/huggingface/transformers/blob/main/src/transformers/models/cohere2/configuration_cohere2.py
+from transformers import PretrainedConfig
+from transformers.modeling_rope_utils import rope_config_validation
+
+
+class Cohere2Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`CohereModel`]. It is used to instantiate an Cohere
+    model according to the specified arguments, defining the model architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the [CohereForAI/c4ai-command-r-v01](https://huggingface.co/CohereForAI/c4ai-command-r-v01) model.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 256000):
+            Vocabulary size of the Cohere model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`CohereModel`]
+        hidden_size (`int`, *optional*, defaults to 8192):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 22528):
+            Dimension of the MLP representations.
+        logit_scale (`float`, *optional*, defaults to 0.0625):
+            The scaling factor for the output logits.
+        num_hidden_layers (`int`, *optional*, defaults to 40):
+            Number of hidden layers in the Transformer decoder.
+        num_attention_heads (`int`, *optional*, defaults to 64):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        num_key_value_heads (`int`, *optional*):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details checkout [this
+            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
+            `num_attention_heads`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 8192):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the layer normalization.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        pad_token_id (`int`, *optional*, defaults to 0):
+            Padding token id.
+        bos_token_id (`int`, *optional*, defaults to 5):
+            Beginning of stream token id.
+        eos_token_id (`int`, *optional*, defaults to 255001):
+            End of stream token id.
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether to tie weight embeddings
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
+            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
+            accordingly.
+            Expected contents:
+                `rope_type` (`str`):
+                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                    'llama3'], with 'default' being the original RoPE implementation.
+                `factor` (`float`, *optional*):
+                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+                    original maximum pre-trained length.
+                `original_max_position_embeddings` (`int`, *optional*):
+                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
+                    pretraining.
+                `attention_factor` (`float`, *optional*):
+                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                    computation. If unspecified, it defaults to value recommended by the implementation, using the
+                    `factor` field to infer the suggested value.
+                `beta_fast` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 32.
+                `beta_slow` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 1.
+                `short_factor` (`List[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `long_factor` (`List[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `low_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
+                `high_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
+        attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        sliding_window (`int`, *optional*, defaults to 4096):
+            Size of the sliding window attention context.
+        sliding_window_pattern (`int`, *optional*, defaults to 4):
+            Pattern for the sliding window attention.
+        cache_implementation (`str`, *optional*, defaults to `"hybrid"`): the cache type to be used with `generate`.
+
+    ```python
+    >>> from transformers import Cohere2Model, Cohere2Config
+
+    >>> # Initializing a Cohere Nextmodel configuration
+    >>> configuration = Cohere2Config()
+
+    >>> # Initializing a model from the Cohere2 configuration
+    >>> model = Cohere2Model(configuration) # doctest: +SKIP
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config # doctest: +SKIP
+    ```
+    """
+
+    model_type = "cohere2"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=256000,
+        hidden_size=8192,
+        intermediate_size=22528,
+        logit_scale=0.0625,
+        num_hidden_layers=40,
+        num_attention_heads=64,
+        num_key_value_heads=None,
+        hidden_act="silu",
+        max_position_embeddings=8192,
+        initializer_range=0.02,
+        layer_norm_eps=1e-5,
+        use_cache=True,
+        pad_token_id=0,
+        bos_token_id=5,
+        eos_token_id=255001,
+        tie_word_embeddings=True,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        attention_bias=False,
+        attention_dropout=0.0,
+        sliding_window=4096,
+        sliding_window_pattern=4,
+        cache_implementation="hybrid",
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.logit_scale = logit_scale
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        self.sliding_window = sliding_window
+        self.sliding_window_pattern = sliding_window_pattern
+        # Need to specify head_dim in the config so it can be used in the attention forward functions
+        self.head_dim = hidden_size // num_attention_heads
+        self.cache_implementation = cache_implementation
+
+        # Validate the correctness of rotary position embeddings parameters
+        rope_config_validation(self)
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+
+__all__ = ["Cohere2Config"]

From 4fb8e329fd6f51d576bcf4b7e8907e0d83c4b5cf Mon Sep 17 00:00:00 2001
From: Robert Shaw
 <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
Date: Sat, 28 Dec 2024 15:51:57 -0500
Subject: [PATCH 020/462] [V1] [5/N] API Server: unify `Detokenizer` and 
 `EngineCore` input (#11545)

Signed-off-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com>
---
 tests/v1/engine/test_detokenizer.py | 57 ++++++++++++++++++-----------
 vllm/v1/engine/__init__.py          | 16 +-------
 vllm/v1/engine/async_llm.py         | 14 ++++---
 vllm/v1/engine/detokenizer.py       | 21 ++++++-----
 vllm/v1/engine/llm_engine.py        | 12 +++---
 vllm/v1/engine/processor.py         | 23 ++----------
 6 files changed, 66 insertions(+), 77 deletions(-)

diff --git a/tests/v1/engine/test_detokenizer.py b/tests/v1/engine/test_detokenizer.py
index 07f343666cb5e..aeae697ca32b0 100644
--- a/tests/v1/engine/test_detokenizer.py
+++ b/tests/v1/engine/test_detokenizer.py
@@ -3,9 +3,9 @@
 import pytest
 from transformers import AutoTokenizer
 
-from vllm.sampling_params import RequestOutputKind
-from vllm.v1.engine import EngineCoreOutput
-from vllm.v1.engine.detokenizer import Detokenizer, DetokenizerRequest
+from vllm.sampling_params import RequestOutputKind, SamplingParams
+from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest
+from vllm.v1.engine.detokenizer import Detokenizer
 
 TOKENIZER_NAME = "mistralai/Mistral-7B-Instruct-v0.3"
 tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)
@@ -71,16 +71,22 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind):
 
     # Make N requests.
     requests = [
-        DetokenizerRequest(
-            request_id=f"request-{idx}",
-            prompt=prompt,
-            prompt_token_ids=prompt_tokens,
-            skip_special_tokens=False,
-            spaces_between_special_tokens=False,
-            output_kind=request_output_kind,
-            stop=[],
-            include_stop_str_in_output=False,
-        ) for idx, (
+        EngineCoreRequest(request_id=f"request-{idx}",
+                          prompt=prompt,
+                          prompt_token_ids=prompt_tokens,
+                          arrival_time=0,
+                          mm_inputs=None,
+                          mm_hashes=None,
+                          mm_placeholders=None,
+                          eos_token_id=None,
+                          lora_request=None,
+                          sampling_params=SamplingParams(
+                              skip_special_tokens=False,
+                              spaces_between_special_tokens=False,
+                              output_kind=request_output_kind,
+                              stop=[],
+                              include_stop_str_in_output=False))
+        for idx, (
             prompt,
             prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS))
     ]
@@ -133,18 +139,25 @@ def test_stop_string(include_stop_str_in_output: bool):
 
     # Make N requests.
     requests = [
-        DetokenizerRequest(
+        EngineCoreRequest(
             request_id=f"request-{idx}",
             prompt=prompt,
             prompt_token_ids=prompt_tokens,
-            skip_special_tokens=False,
-            spaces_between_special_tokens=False,
-            output_kind=RequestOutputKind.DELTA,
-            stop=STOP_STRINGS,
-            include_stop_str_in_output=include_stop_str_in_output,
-        ) for idx, (
-            prompt,
-            prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS))
+            arrival_time=0,
+            mm_inputs=None,
+            mm_hashes=None,
+            mm_placeholders=None,
+            eos_token_id=None,
+            lora_request=None,
+            sampling_params=SamplingParams(
+                skip_special_tokens=False,
+                spaces_between_special_tokens=False,
+                output_kind=RequestOutputKind.DELTA,
+                stop=STOP_STRINGS,
+                include_stop_str_in_output=include_stop_str_in_output,
+            )) for idx, (
+                prompt,
+                prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS))
     ]
 
     # Add requests to the detokenizer.
diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py
index cc0c7ea23469a..f70464fc88298 100644
--- a/vllm/v1/engine/__init__.py
+++ b/vllm/v1/engine/__init__.py
@@ -6,21 +6,7 @@
 
 from vllm.lora.request import LoRARequest
 from vllm.multimodal import MultiModalKwargs, MultiModalPlaceholderDict
-from vllm.sampling_params import RequestOutputKind, SamplingParams
-
-
-@dataclass
-class DetokenizerRequest:
-
-    request_id: str
-    prompt: Optional[str]
-    prompt_token_ids: List[int]
-    skip_special_tokens: bool
-    spaces_between_special_tokens: bool
-    output_kind: RequestOutputKind
-
-    stop: List[str]
-    include_stop_str_in_output: bool
+from vllm.sampling_params import SamplingParams
 
 
 @dataclass
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index da3da6dad6436..213ddaa023dbc 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -158,16 +158,18 @@ async def add_request(
             raise ValueError(f"Request id {request_id} already running.")
         self.rid_to_queue[request_id] = asyncio.Queue()
 
-        # 2) Convert input --> DetokenizerRequest / EngineCoreRequest.
-        detokenizer_req, engine_core_req = self.processor.process_inputs(
-            request_id, prompt, params, arrival_time, lora_request,
-            trace_headers, prompt_adapter_request, priority)
+        # 2) Convert Input --> Request.
+        request = self.processor.process_inputs(request_id, prompt, params,
+                                                arrival_time, lora_request,
+                                                trace_headers,
+                                                prompt_adapter_request,
+                                                priority)
 
         # 3) Add the request to Detokenizer (this process).
-        self.detokenizer.add_request(detokenizer_req)
+        self.detokenizer.add_request(request)
 
         # 4) Add the EngineCoreRequest to EngineCore (separate process).
-        await self.engine_core.add_request_async(engine_core_req)
+        await self.engine_core.add_request_async(request)
 
         if self.log_requests:
             logger.info("Added request %s.", request_id)
diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py
index 02f34e2b54dd5..65be9e58e03c8 100644
--- a/vllm/v1/engine/detokenizer.py
+++ b/vllm/v1/engine/detokenizer.py
@@ -8,7 +8,7 @@
 from vllm.transformers_utils.detokenizer_utils import (
     AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally)
 from vllm.transformers_utils.tokenizer import get_tokenizer
-from vllm.v1.engine import DetokenizerRequest, EngineCoreOutput
+from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest
 
 logger = init_logger(__name__)
 
@@ -55,19 +55,19 @@ def output_token_ids(self) -> List[int]:
     def from_new_request(
         cls,
         tokenizer: AnyTokenizer,
-        request: DetokenizerRequest,
+        request: EngineCoreRequest,
     ) -> "IncrementalDetokenizer":
 
         tokens, prefix_offset, read_offset = convert_prompt_ids_to_tokens(
             tokenizer=tokenizer,
             prompt_ids=request.prompt_token_ids,
-            skip_special_tokens=request.skip_special_tokens,
+            skip_special_tokens=request.sampling_params.skip_special_tokens,
         )
 
-        stops = request.stop
+        stops = request.sampling_params.stop
         # Number of chars to hold back when stop strings are to be excluded
         # from streamed output.
-        if stops and not request.include_stop_str_in_output:
+        if stops and not request.sampling_params.include_stop_str_in_output:
             stop_buffer_length = max(len(s) for s in stops) - 1
         else:
             stop_buffer_length = 0
@@ -79,13 +79,14 @@ def from_new_request(
             # NOTE(Nick): could we take ownership of it though?
             token_ids=request.prompt_token_ids.copy(),
             stop=stops,
-            include_stop_str_in_output=request.include_stop_str_in_output,
+            include_stop_str_in_output=request.sampling_params.
+            include_stop_str_in_output,
             prefix_offset=prefix_offset,
             read_offset=read_offset,
-            skip_special_tokens=request.skip_special_tokens,
-            spaces_between_special_tokens=request.
+            skip_special_tokens=request.sampling_params.skip_special_tokens,
+            spaces_between_special_tokens=request.sampling_params.
             spaces_between_special_tokens,
-            output_kind=request.output_kind,
+            output_kind=request.sampling_params.output_kind,
             request_id=request.request_id,
             prompt=request.prompt,
             prompt_token_ids=request.prompt_token_ids,
@@ -227,7 +228,7 @@ def abort_requests(
 
     def add_request(
         self,
-        request: DetokenizerRequest,
+        request: EngineCoreRequest,
     ):
         """Add new request to the Detokenizer."""
 
diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py
index fc323184abc8f..a19109559eabf 100644
--- a/vllm/v1/engine/llm_engine.py
+++ b/vllm/v1/engine/llm_engine.py
@@ -152,15 +152,17 @@ def add_request(
     ) -> None:
 
         # 1) Process raw inputs into the request.
-        detokenizer_req, engine_core_req = self.processor.process_inputs(
-            request_id, prompt, params, arrival_time, lora_request,
-            trace_headers, prompt_adapter_request, priority)
+        request = self.processor.process_inputs(request_id, prompt, params,
+                                                arrival_time, lora_request,
+                                                trace_headers,
+                                                prompt_adapter_request,
+                                                priority)
 
         # 2) Add the request to Detokenizer.
-        self.detokenizer.add_request(detokenizer_req)
+        self.detokenizer.add_request(request)
 
         # 3) Add the request to EngineCore.
-        self.engine_core.add_request(engine_core_req)
+        self.engine_core.add_request(request)
 
     def step(self) -> List[RequestOutput]:
 
diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py
index 6ee8732bc902c..5b5a5a61cea7d 100644
--- a/vllm/v1/engine/processor.py
+++ b/vllm/v1/engine/processor.py
@@ -1,5 +1,5 @@
 import time
-from typing import Mapping, Optional, Tuple, Union
+from typing import Mapping, Optional, Union
 
 from vllm.config import CacheConfig, LoRAConfig, ModelConfig
 from vllm.inputs import (INPUT_REGISTRY, InputRegistry, ProcessorInputs,
@@ -13,7 +13,7 @@
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sampling_params import SamplingParams
 from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup
-from vllm.v1.engine import DetokenizerRequest, EngineCoreRequest
+from vllm.v1.engine import EngineCoreRequest
 from vllm.v1.engine.mm_input_mapper import MMHasher, MMInputMapperClient
 
 
@@ -62,7 +62,7 @@ def process_inputs(
         trace_headers: Optional[Mapping[str, str]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
         priority: int = 0,
-    ) -> Tuple[DetokenizerRequest, EngineCoreRequest]:
+    ) -> EngineCoreRequest:
 
         # TODO(woosuk): Support pooling models.
         # TODO(woosuk): Check max_logprobs
@@ -123,20 +123,7 @@ def process_inputs(
                 decoder_inputs.multi_modal_data, mm_hashes,
                 decoder_inputs.mm_processor_kwargs, precomputed_mm_inputs)
 
-        # Make Request for Detokenizer.
-        detokenizer_request = DetokenizerRequest(
-            request_id,
-            decoder_inputs.prompt,
-            decoder_inputs.prompt_token_ids,
-            sampling_params.skip_special_tokens,
-            sampling_params.spaces_between_special_tokens,
-            sampling_params.output_kind,
-            sampling_params.stop,
-            sampling_params.include_stop_str_in_output,
-        )
-
-        # Make Request for EngineCore.
-        engine_core_request = EngineCoreRequest(
+        return EngineCoreRequest(
             request_id,
             decoder_inputs.prompt,
             decoder_inputs.prompt_token_ids,
@@ -149,8 +136,6 @@ def process_inputs(
             lora_request,
         )
 
-        return detokenizer_request, engine_core_request
-
     def _validate_model_inputs(self, inputs: ProcessorInputs):
         if is_encoder_decoder_inputs(inputs):
             # For encoder-decoder multimodal models, the max_prompt_len

From 32b4c63f02b2ab28a49a040b1d170a903a5cd9dc Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Sun, 29 Dec 2024 15:56:22 +0800
Subject: [PATCH 021/462] [Doc] Convert list tables to MyST (#11594)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 docs/source/getting_started/debugging.md      |    2 +-
 .../getting_started/gaudi-installation.md     |   39 +-
 .../getting_started/tpu-installation.md       |   53 +-
 docs/source/models/supported_models.md        | 1206 ++++++++---------
 .../source/quantization/supported_hardware.md |  227 ++--
 docs/source/serving/deploying_with_helm.md    |  409 +++---
 6 files changed, 961 insertions(+), 975 deletions(-)

diff --git a/docs/source/getting_started/debugging.md b/docs/source/getting_started/debugging.md
index 3b0029f2e88ce..19eb699572a08 100644
--- a/docs/source/getting_started/debugging.md
+++ b/docs/source/getting_started/debugging.md
@@ -197,4 +197,4 @@ if __name__ == '__main__':
 ## Known Issues
 
 - In `v0.5.2`, `v0.5.3`, and `v0.5.3.post1`, there is a bug caused by [zmq](https://github.com/zeromq/pyzmq/issues/2000) , which can occasionally cause vLLM to hang depending on the machine configuration. The solution is to upgrade to the latest version of `vllm` to include the [fix](gh-pr:6759).
-- To circumvent a NCCL [bug](https://github.com/NVIDIA/nccl/issues/1234) , all vLLM processes will set an environment variable ``NCCL_CUMEM_ENABLE=0`` to disable NCCL's ``cuMem`` allocator. It does not affect performance but only gives memory benefits. When external processes want to set up a NCCL connection with vLLM's processes, they should also set this environment variable, otherwise, inconsistent environment setup will cause NCCL to hang or crash, as observed in the [RLHF integration](https://github.com/OpenRLHF/OpenRLHF/pull/604) and the [discussion](gh-issue:5723#issuecomment-2554389656) .
+- To circumvent a NCCL [bug](https://github.com/NVIDIA/nccl/issues/1234) , all vLLM processes will set an environment variable `NCCL_CUMEM_ENABLE=0` to disable NCCL's `cuMem` allocator. It does not affect performance but only gives memory benefits. When external processes want to set up a NCCL connection with vLLM's processes, they should also set this environment variable, otherwise, inconsistent environment setup will cause NCCL to hang or crash, as observed in the [RLHF integration](https://github.com/OpenRLHF/OpenRLHF/pull/604) and the [discussion](gh-issue:5723#issuecomment-2554389656) .
diff --git a/docs/source/getting_started/gaudi-installation.md b/docs/source/getting_started/gaudi-installation.md
index 447bf98084a5d..1f2ee62860dec 100644
--- a/docs/source/getting_started/gaudi-installation.md
+++ b/docs/source/getting_started/gaudi-installation.md
@@ -141,26 +141,25 @@ Gaudi2 devices. Configurations that are not listed may or may not work.
 
 Currently in vLLM for HPU we support four execution modes, depending on selected HPU PyTorch Bridge backend (via `PT_HPU_LAZY_MODE` environment variable), and `--enforce-eager` flag.
 
-```{eval-rst}
-.. list-table:: vLLM execution modes
-   :widths: 25 25 50
-   :header-rows: 1
-
-   * - ``PT_HPU_LAZY_MODE``
-     - ``enforce_eager``
-     - execution mode
-   * - 0
-     - 0
-     - torch.compile
-   * - 0
-     - 1
-     - PyTorch eager mode
-   * - 1
-     - 0
-     - HPU Graphs
-   * - 1
-     - 1
-     - PyTorch lazy mode
+```{list-table} vLLM execution modes
+:widths: 25 25 50
+:header-rows: 1
+
+* - `PT_HPU_LAZY_MODE`
+  - `enforce_eager`
+  - execution mode
+* - 0
+  - 0
+  - torch.compile
+* - 0
+  - 1
+  - PyTorch eager mode
+* - 1
+  - 0
+  - HPU Graphs
+* - 1
+  - 1
+  - PyTorch lazy mode
 ```
 
 ```{warning}
diff --git a/docs/source/getting_started/tpu-installation.md b/docs/source/getting_started/tpu-installation.md
index 17eded4a51fec..4d3ac541c90ce 100644
--- a/docs/source/getting_started/tpu-installation.md
+++ b/docs/source/getting_started/tpu-installation.md
@@ -68,33 +68,32 @@ gcloud alpha compute tpus queued-resources create QUEUED_RESOURCE_ID \
 --service-account SERVICE_ACCOUNT
 ```
 
-```{eval-rst}
-.. list-table:: Parameter descriptions
-    :header-rows: 1
-
-    * - Parameter name
-      - Description
-    * - QUEUED_RESOURCE_ID
-      - The user-assigned ID of the queued resource request.
-    * - TPU_NAME
-      - The user-assigned name of the TPU which is created when the queued
-        resource request is allocated.
-    * - PROJECT_ID
-      - Your Google Cloud project
-    * - ZONE
-      - The GCP zone where you want to create your Cloud TPU. The value you use
-        depends on the version of TPUs you are using. For more information, see
-        `TPU regions and zones <https://cloud.google.com/tpu/docs/regions-zones>`_
-    * - ACCELERATOR_TYPE
-      - The TPU version you want to use. Specify the TPU version, for example
-        `v5litepod-4` specifies a v5e TPU with 4 cores. For more information,
-        see `TPU versions <https://cloud.devsite.corp.google.com/tpu/docs/system-architecture-tpu-vm#versions>`_.
-    * - RUNTIME_VERSION
-      - The TPU VM runtime version to use. For more information see `TPU VM images <https://cloud.google.com/tpu/docs/runtimes>`_.
-    * - SERVICE_ACCOUNT
-      - The email address for your service account. You can find it in the IAM
-        Cloud Console under *Service Accounts*. For example:
-        `tpu-service-account@<your_project_ID>.iam.gserviceaccount.com`
+```{list-table} Parameter descriptions
+:header-rows: 1
+
+* - Parameter name
+  - Description
+* - QUEUED_RESOURCE_ID
+  - The user-assigned ID of the queued resource request.
+* - TPU_NAME
+  - The user-assigned name of the TPU which is created when the queued
+    resource request is allocated.
+* - PROJECT_ID
+  - Your Google Cloud project
+* - ZONE
+  - The GCP zone where you want to create your Cloud TPU. The value you use
+    depends on the version of TPUs you are using. For more information, see
+    `TPU regions and zones <https://cloud.google.com/tpu/docs/regions-zones>`_
+* - ACCELERATOR_TYPE
+  - The TPU version you want to use. Specify the TPU version, for example
+    `v5litepod-4` specifies a v5e TPU with 4 cores. For more information,
+    see `TPU versions <https://cloud.devsite.corp.google.com/tpu/docs/system-architecture-tpu-vm#versions>`_.
+* - RUNTIME_VERSION
+  - The TPU VM runtime version to use. For more information see `TPU VM images <https://cloud.google.com/tpu/docs/runtimes>`_.
+* - SERVICE_ACCOUNT
+  - The email address for your service account. You can find it in the IAM
+    Cloud Console under *Service Accounts*. For example:
+    `tpu-service-account@<your_project_ID>.iam.gserviceaccount.com`
 ```
 
 Connect to your TPU using SSH:
diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md
index e11befbb8dd30..518505abeb2a9 100644
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -72,291 +72,290 @@ See [this page](#generative-models) for more information on how to use generativ
 
 #### Text Generation (`--task generate`)
 
-```{eval-rst}
-.. list-table::
-  :widths: 25 25 50 5 5
-  :header-rows: 1
-
-  * - Architecture
-    - Models
-    - Example HF Models
-    - :ref:`LoRA <lora-adapter>`
-    - :ref:`PP <distributed-serving>`
-  * - :code:`AquilaForCausalLM`
-    - Aquila, Aquila2
-    - :code:`BAAI/Aquila-7B`, :code:`BAAI/AquilaChat-7B`, etc.
-    - ✅︎
-    - ✅︎
-  * - :code:`ArcticForCausalLM`
-    - Arctic
-    - :code:`Snowflake/snowflake-arctic-base`, :code:`Snowflake/snowflake-arctic-instruct`, etc.
-    -
-    - ✅︎
-  * - :code:`BaiChuanForCausalLM`
-    - Baichuan2, Baichuan
-    - :code:`baichuan-inc/Baichuan2-13B-Chat`, :code:`baichuan-inc/Baichuan-7B`, etc.
-    - ✅︎
-    - ✅︎
-  * - :code:`BloomForCausalLM`
-    - BLOOM, BLOOMZ, BLOOMChat
-    - :code:`bigscience/bloom`, :code:`bigscience/bloomz`, etc.
-    -
-    - ✅︎
-  * - :code:`BartForConditionalGeneration`
-    - BART
-    - :code:`facebook/bart-base`, :code:`facebook/bart-large-cnn`, etc.
-    -
-    -
-  * - :code:`ChatGLMModel`
-    - ChatGLM
-    - :code:`THUDM/chatglm2-6b`, :code:`THUDM/chatglm3-6b`, etc.
-    - ✅︎
-    - ✅︎
-  * - :code:`CohereForCausalLM`, :code:`Cohere2ForCausalLM`
-    - Command-R
-    - :code:`CohereForAI/c4ai-command-r-v01`, :code:`CohereForAI/c4ai-command-r7b-12-2024`, etc.
-    - ✅︎
-    - ✅︎
-  * - :code:`DbrxForCausalLM`
-    - DBRX
-    - :code:`databricks/dbrx-base`, :code:`databricks/dbrx-instruct`, etc.
-    -
-    - ✅︎
-  * - :code:`DeciLMForCausalLM`
-    - DeciLM
-    - :code:`Deci/DeciLM-7B`, :code:`Deci/DeciLM-7B-instruct`, etc.
-    -
-    - ✅︎
-  * - :code:`DeepseekForCausalLM`
-    - DeepSeek
-    - :code:`deepseek-ai/deepseek-llm-67b-base`, :code:`deepseek-ai/deepseek-llm-7b-chat` etc.
-    -
-    - ✅︎
-  * - :code:`DeepseekV2ForCausalLM`
-    - DeepSeek-V2
-    - :code:`deepseek-ai/DeepSeek-V2`, :code:`deepseek-ai/DeepSeek-V2-Chat` etc.
-    -
-    - ✅︎
-  * - :code:`DeepseekV3ForCausalLM`
-    - DeepSeek-V3
-    - :code:`deepseek-ai/DeepSeek-V3-Base`, :code:`deepseek-ai/DeepSeek-V3` etc.
-    -
-    - ✅︎
-  * - :code:`ExaoneForCausalLM`
-    - EXAONE-3
-    - :code:`LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct`, etc.
-    - ✅︎
-    - ✅︎
-  * - :code:`FalconForCausalLM`
-    - Falcon
-    - :code:`tiiuae/falcon-7b`, :code:`tiiuae/falcon-40b`, :code:`tiiuae/falcon-rw-7b`, etc.
-    -
-    - ✅︎
-  * - :code:`FalconMambaForCausalLM`
-    - FalconMamba
-    - :code:`tiiuae/falcon-mamba-7b`, :code:`tiiuae/falcon-mamba-7b-instruct`, etc.
-    - ✅︎
-    - ✅︎
-  * - :code:`GemmaForCausalLM`
-    - Gemma
-    - :code:`google/gemma-2b`, :code:`google/gemma-7b`, etc.
-    - ✅︎
-    - ✅︎
-  * - :code:`Gemma2ForCausalLM`
-    - Gemma2
-    - :code:`google/gemma-2-9b`, :code:`google/gemma-2-27b`, etc.
-    - ✅︎
-    - ✅︎
-  * - :code:`GlmForCausalLM`
-    - GLM-4
-    - :code:`THUDM/glm-4-9b-chat-hf`, etc.
-    - ✅︎
-    - ✅︎
-  * - :code:`GPT2LMHeadModel`
-    - GPT-2
-    - :code:`gpt2`, :code:`gpt2-xl`, etc.
-    -
-    - ✅︎
-  * - :code:`GPTBigCodeForCausalLM`
-    - StarCoder, SantaCoder, WizardCoder
-    - :code:`bigcode/starcoder`, :code:`bigcode/gpt_bigcode-santacoder`, :code:`WizardLM/WizardCoder-15B-V1.0`, etc.
-    - ✅︎
-    - ✅︎
-  * - :code:`GPTJForCausalLM`
-    - GPT-J
-    - :code:`EleutherAI/gpt-j-6b`, :code:`nomic-ai/gpt4all-j`, etc.
-    -
-    - ✅︎
-  * - :code:`GPTNeoXForCausalLM`
-    - GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM
-    - :code:`EleutherAI/gpt-neox-20b`, :code:`EleutherAI/pythia-12b`, :code:`OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, :code:`databricks/dolly-v2-12b`, :code:`stabilityai/stablelm-tuned-alpha-7b`, etc.
-    -
-    - ✅︎
-  * - :code:`GraniteForCausalLM`
-    - Granite 3.0, Granite 3.1, PowerLM
-    - :code:`ibm-granite/granite-3.0-2b-base`, :code:`ibm-granite/granite-3.1-8b-instruct`, :code:`ibm/PowerLM-3b`, etc.
-    - ✅︎
-    - ✅︎
-  * - :code:`GraniteMoeForCausalLM`
-    - Granite 3.0 MoE, PowerMoE
-    - :code:`ibm-granite/granite-3.0-1b-a400m-base`, :code:`ibm-granite/granite-3.0-3b-a800m-instruct`, :code:`ibm/PowerMoE-3b`, etc.
-    - ✅︎
-    - ✅︎
-  * - :code:`GritLM`
-    - GritLM
-    - :code:`parasail-ai/GritLM-7B-vllm`.
-    - ✅︎
-    - ✅︎
-  * - :code:`InternLMForCausalLM`
-    - InternLM
-    - :code:`internlm/internlm-7b`, :code:`internlm/internlm-chat-7b`, etc.
-    - ✅︎
-    - ✅︎
-  * - :code:`InternLM2ForCausalLM`
-    - InternLM2
-    - :code:`internlm/internlm2-7b`, :code:`internlm/internlm2-chat-7b`, etc.
-    - ✅︎
-    - ✅︎
-  * - :code:`JAISLMHeadModel`
-    - Jais
-    - :code:`inceptionai/jais-13b`, :code:`inceptionai/jais-13b-chat`, :code:`inceptionai/jais-30b-v3`, :code:`inceptionai/jais-30b-chat-v3`, etc.
-    -
-    - ✅︎
-  * - :code:`JambaForCausalLM`
-    - Jamba
-    - :code:`ai21labs/AI21-Jamba-1.5-Large`, :code:`ai21labs/AI21-Jamba-1.5-Mini`, :code:`ai21labs/Jamba-v0.1`, etc.
-    - ✅︎
-    - ✅︎
-  * - :code:`LlamaForCausalLM`
-    - Llama 3.1, Llama 3, Llama 2, LLaMA, Yi
-    - :code:`meta-llama/Meta-Llama-3.1-405B-Instruct`, :code:`meta-llama/Meta-Llama-3.1-70B`, :code:`meta-llama/Meta-Llama-3-70B-Instruct`, :code:`meta-llama/Llama-2-70b-hf`, :code:`01-ai/Yi-34B`, etc.
-    - ✅︎
-    - ✅︎
-  * - :code:`MambaForCausalLM`
-    - Mamba
-    - :code:`state-spaces/mamba-130m-hf`, :code:`state-spaces/mamba-790m-hf`, :code:`state-spaces/mamba-2.8b-hf`, etc.
-    -
-    - ✅︎
-  * - :code:`MiniCPMForCausalLM`
-    - MiniCPM
-    - :code:`openbmb/MiniCPM-2B-sft-bf16`, :code:`openbmb/MiniCPM-2B-dpo-bf16`, :code:`openbmb/MiniCPM-S-1B-sft`, etc.
-    - ✅︎
-    - ✅︎
-  * - :code:`MiniCPM3ForCausalLM`
-    - MiniCPM3
-    - :code:`openbmb/MiniCPM3-4B`, etc.
-    - ✅︎
-    - ✅︎
-  * - :code:`MistralForCausalLM`
-    - Mistral, Mistral-Instruct
-    - :code:`mistralai/Mistral-7B-v0.1`, :code:`mistralai/Mistral-7B-Instruct-v0.1`, etc.
-    - ✅︎
-    - ✅︎
-  * - :code:`MixtralForCausalLM`
-    - Mixtral-8x7B, Mixtral-8x7B-Instruct
-    - :code:`mistralai/Mixtral-8x7B-v0.1`, :code:`mistralai/Mixtral-8x7B-Instruct-v0.1`, :code:`mistral-community/Mixtral-8x22B-v0.1`, etc.
-    - ✅︎
-    - ✅︎
-  * - :code:`MPTForCausalLM`
-    - MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter
-    - :code:`mosaicml/mpt-7b`, :code:`mosaicml/mpt-7b-storywriter`, :code:`mosaicml/mpt-30b`, etc.
-    -
-    - ✅︎
-  * - :code:`NemotronForCausalLM`
-    - Nemotron-3, Nemotron-4, Minitron
-    - :code:`nvidia/Minitron-8B-Base`, :code:`mgoin/Nemotron-4-340B-Base-hf-FP8`, etc.
-    - ✅︎
-    - ✅︎
-  * - :code:`OLMoForCausalLM`
-    - OLMo
-    - :code:`allenai/OLMo-1B-hf`, :code:`allenai/OLMo-7B-hf`, etc.
-    -
-    - ✅︎
-  * - :code:`OLMo2ForCausalLM`
-    - OLMo2
-    - :code:`allenai/OLMo2-7B-1124`, etc.
-    -
-    - ✅︎
-  * - :code:`OLMoEForCausalLM`
-    - OLMoE
-    - :code:`allenai/OLMoE-1B-7B-0924`, :code:`allenai/OLMoE-1B-7B-0924-Instruct`, etc.
-    - ✅︎
-    - ✅︎
-  * - :code:`OPTForCausalLM`
-    - OPT, OPT-IML
-    - :code:`facebook/opt-66b`, :code:`facebook/opt-iml-max-30b`, etc.
-    -
-    - ✅︎
-  * - :code:`OrionForCausalLM`
-    - Orion
-    - :code:`OrionStarAI/Orion-14B-Base`, :code:`OrionStarAI/Orion-14B-Chat`, etc.
-    -
-    - ✅︎
-  * - :code:`PhiForCausalLM`
-    - Phi
-    - :code:`microsoft/phi-1_5`, :code:`microsoft/phi-2`, etc.
-    - ✅︎
-    - ✅︎
-  * - :code:`Phi3ForCausalLM`
-    - Phi-3
-    - :code:`microsoft/Phi-3-mini-4k-instruct`, :code:`microsoft/Phi-3-mini-128k-instruct`, :code:`microsoft/Phi-3-medium-128k-instruct`, etc.
-    - ✅︎
-    - ✅︎
-  * - :code:`Phi3SmallForCausalLM`
-    - Phi-3-Small
-    - :code:`microsoft/Phi-3-small-8k-instruct`, :code:`microsoft/Phi-3-small-128k-instruct`, etc.
-    -
-    - ✅︎
-  * - :code:`PhiMoEForCausalLM`
-    - Phi-3.5-MoE
-    - :code:`microsoft/Phi-3.5-MoE-instruct`, etc.
-    - ✅︎
-    - ✅︎
-  * - :code:`PersimmonForCausalLM`
-    - Persimmon
-    - :code:`adept/persimmon-8b-base`, :code:`adept/persimmon-8b-chat`, etc.
-    -
-    - ✅︎
-  * - :code:`QWenLMHeadModel`
-    - Qwen
-    - :code:`Qwen/Qwen-7B`, :code:`Qwen/Qwen-7B-Chat`, etc.
-    - ✅︎
-    - ✅︎
-  * - :code:`Qwen2ForCausalLM`
-    - Qwen2
-    - :code:`Qwen/QwQ-32B-Preview`, :code:`Qwen/Qwen2-7B-Instruct`, :code:`Qwen/Qwen2-7B`, etc.
-    - ✅︎
-    - ✅︎
-  * - :code:`Qwen2MoeForCausalLM`
-    - Qwen2MoE
-    - :code:`Qwen/Qwen1.5-MoE-A2.7B`, :code:`Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc.
-    -
-    - ✅︎
-  * - :code:`StableLmForCausalLM`
-    - StableLM
-    - :code:`stabilityai/stablelm-3b-4e1t`, :code:`stabilityai/stablelm-base-alpha-7b-v2`, etc.
-    -
-    - ✅︎
-  * - :code:`Starcoder2ForCausalLM`
-    - Starcoder2
-    - :code:`bigcode/starcoder2-3b`, :code:`bigcode/starcoder2-7b`, :code:`bigcode/starcoder2-15b`, etc.
-    -
-    - ✅︎
-  * - :code:`SolarForCausalLM`
-    - Solar Pro
-    - :code:`upstage/solar-pro-preview-instruct`, etc.
-    - ✅︎
-    - ✅︎
-  * - :code:`TeleChat2ForCausalLM`
-    - TeleChat2
-    - :code:`TeleAI/TeleChat2-3B`, :code:`TeleAI/TeleChat2-7B`, :code:`TeleAI/TeleChat2-35B`, etc.
-    - ✅︎
-    - ✅︎
-  * - :code:`XverseForCausalLM`
-    - XVERSE
-    - :code:`xverse/XVERSE-7B-Chat`, :code:`xverse/XVERSE-13B-Chat`, :code:`xverse/XVERSE-65B-Chat`, etc.
-    - ✅︎
-    - ✅︎
+```{list-table}
+:widths: 25 25 50 5 5
+:header-rows: 1
+
+* - Architecture
+  - Models
+  - Example HF Models
+  - [LoRA](#lora-adapter)
+  - [PP](#distributed-serving)
+* - `AquilaForCausalLM`
+  - Aquila, Aquila2
+  - `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc.
+  - ✅︎
+  - ✅︎
+* - `ArcticForCausalLM`
+  - Arctic
+  - `Snowflake/snowflake-arctic-base`, `Snowflake/snowflake-arctic-instruct`, etc.
+  -
+  - ✅︎
+* - `BaiChuanForCausalLM`
+  - Baichuan2, Baichuan
+  - `baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc.
+  - ✅︎
+  - ✅︎
+* - `BloomForCausalLM`
+  - BLOOM, BLOOMZ, BLOOMChat
+  - `bigscience/bloom`, `bigscience/bloomz`, etc.
+  -
+  - ✅︎
+* - `BartForConditionalGeneration`
+  - BART
+  - `facebook/bart-base`, `facebook/bart-large-cnn`, etc.
+  -
+  -
+* - `ChatGLMModel`
+  - ChatGLM
+  - `THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, etc.
+  - ✅︎
+  - ✅︎
+* - `CohereForCausalLM`, `Cohere2ForCausalLM`
+  - Command-R
+  - `CohereForAI/c4ai-command-r-v01`, `CohereForAI/c4ai-command-r7b-12-2024`, etc.
+  - ✅︎
+  - ✅︎
+* - `DbrxForCausalLM`
+  - DBRX
+  - `databricks/dbrx-base`, `databricks/dbrx-instruct`, etc.
+  -
+  - ✅︎
+* - `DeciLMForCausalLM`
+  - DeciLM
+  - `Deci/DeciLM-7B`, `Deci/DeciLM-7B-instruct`, etc.
+  -
+  - ✅︎
+* - `DeepseekForCausalLM`
+  - DeepSeek
+  - `deepseek-ai/deepseek-llm-67b-base`, `deepseek-ai/deepseek-llm-7b-chat` etc.
+  -
+  - ✅︎
+* - `DeepseekV2ForCausalLM`
+  - DeepSeek-V2
+  - `deepseek-ai/DeepSeek-V2`, `deepseek-ai/DeepSeek-V2-Chat` etc.
+  -
+  - ✅︎
+* - `DeepseekV3ForCausalLM`
+  - DeepSeek-V3
+  - `deepseek-ai/DeepSeek-V3-Base`, `deepseek-ai/DeepSeek-V3` etc.
+  -
+  - ✅︎
+* - `ExaoneForCausalLM`
+  - EXAONE-3
+  - `LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct`, etc.
+  - ✅︎
+  - ✅︎
+* - `FalconForCausalLM`
+  - Falcon
+  - `tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc.
+  -
+  - ✅︎
+* - `FalconMambaForCausalLM`
+  - FalconMamba
+  - `tiiuae/falcon-mamba-7b`, `tiiuae/falcon-mamba-7b-instruct`, etc.
+  - ✅︎
+  - ✅︎
+* - `GemmaForCausalLM`
+  - Gemma
+  - `google/gemma-2b`, `google/gemma-7b`, etc.
+  - ✅︎
+  - ✅︎
+* - `Gemma2ForCausalLM`
+  - Gemma2
+  - `google/gemma-2-9b`, `google/gemma-2-27b`, etc.
+  - ✅︎
+  - ✅︎
+* - `GlmForCausalLM`
+  - GLM-4
+  - `THUDM/glm-4-9b-chat-hf`, etc.
+  - ✅︎
+  - ✅︎
+* - `GPT2LMHeadModel`
+  - GPT-2
+  - `gpt2`, `gpt2-xl`, etc.
+  -
+  - ✅︎
+* - `GPTBigCodeForCausalLM`
+  - StarCoder, SantaCoder, WizardCoder
+  - `bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, `WizardLM/WizardCoder-15B-V1.0`, etc.
+  - ✅︎
+  - ✅︎
+* - `GPTJForCausalLM`
+  - GPT-J
+  - `EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc.
+  -
+  - ✅︎
+* - `GPTNeoXForCausalLM`
+  - GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM
+  - `EleutherAI/gpt-neox-20b`, `EleutherAI/pythia-12b`, `OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc.
+  -
+  - ✅︎
+* - `GraniteForCausalLM`
+  - Granite 3.0, Granite 3.1, PowerLM
+  - `ibm-granite/granite-3.0-2b-base`, `ibm-granite/granite-3.1-8b-instruct`, `ibm/PowerLM-3b`, etc.
+  - ✅︎
+  - ✅︎
+* - `GraniteMoeForCausalLM`
+  - Granite 3.0 MoE, PowerMoE
+  - `ibm-granite/granite-3.0-1b-a400m-base`, `ibm-granite/granite-3.0-3b-a800m-instruct`, `ibm/PowerMoE-3b`, etc.
+  - ✅︎
+  - ✅︎
+* - `GritLM`
+  - GritLM
+  - `parasail-ai/GritLM-7B-vllm`.
+  - ✅︎
+  - ✅︎
+* - `InternLMForCausalLM`
+  - InternLM
+  - `internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc.
+  - ✅︎
+  - ✅︎
+* - `InternLM2ForCausalLM`
+  - InternLM2
+  - `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc.
+  - ✅︎
+  - ✅︎
+* - `JAISLMHeadModel`
+  - Jais
+  - `inceptionai/jais-13b`, `inceptionai/jais-13b-chat`, `inceptionai/jais-30b-v3`, `inceptionai/jais-30b-chat-v3`, etc.
+  -
+  - ✅︎
+* - `JambaForCausalLM`
+  - Jamba
+  - `ai21labs/AI21-Jamba-1.5-Large`, `ai21labs/AI21-Jamba-1.5-Mini`, `ai21labs/Jamba-v0.1`, etc.
+  - ✅︎
+  - ✅︎
+* - `LlamaForCausalLM`
+  - Llama 3.1, Llama 3, Llama 2, LLaMA, Yi
+  - `meta-llama/Meta-Llama-3.1-405B-Instruct`, `meta-llama/Meta-Llama-3.1-70B`, `meta-llama/Meta-Llama-3-70B-Instruct`, `meta-llama/Llama-2-70b-hf`, `01-ai/Yi-34B`, etc.
+  - ✅︎
+  - ✅︎
+* - `MambaForCausalLM`
+  - Mamba
+  - `state-spaces/mamba-130m-hf`, `state-spaces/mamba-790m-hf`, `state-spaces/mamba-2.8b-hf`, etc.
+  -
+  - ✅︎
+* - `MiniCPMForCausalLM`
+  - MiniCPM
+  - `openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, `openbmb/MiniCPM-S-1B-sft`, etc.
+  - ✅︎
+  - ✅︎
+* - `MiniCPM3ForCausalLM`
+  - MiniCPM3
+  - `openbmb/MiniCPM3-4B`, etc.
+  - ✅︎
+  - ✅︎
+* - `MistralForCausalLM`
+  - Mistral, Mistral-Instruct
+  - `mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc.
+  - ✅︎
+  - ✅︎
+* - `MixtralForCausalLM`
+  - Mixtral-8x7B, Mixtral-8x7B-Instruct
+  - `mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc.
+  - ✅︎
+  - ✅︎
+* - `MPTForCausalLM`
+  - MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter
+  - `mosaicml/mpt-7b`, `mosaicml/mpt-7b-storywriter`, `mosaicml/mpt-30b`, etc.
+  -
+  - ✅︎
+* - `NemotronForCausalLM`
+  - Nemotron-3, Nemotron-4, Minitron
+  - `nvidia/Minitron-8B-Base`, `mgoin/Nemotron-4-340B-Base-hf-FP8`, etc.
+  - ✅︎
+  - ✅︎
+* - `OLMoForCausalLM`
+  - OLMo
+  - `allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc.
+  -
+  - ✅︎
+* - `OLMo2ForCausalLM`
+  - OLMo2
+  - `allenai/OLMo2-7B-1124`, etc.
+  -
+  - ✅︎
+* - `OLMoEForCausalLM`
+  - OLMoE
+  - `allenai/OLMoE-1B-7B-0924`, `allenai/OLMoE-1B-7B-0924-Instruct`, etc.
+  - ✅︎
+  - ✅︎
+* - `OPTForCausalLM`
+  - OPT, OPT-IML
+  - `facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc.
+  -
+  - ✅︎
+* - `OrionForCausalLM`
+  - Orion
+  - `OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc.
+  -
+  - ✅︎
+* - `PhiForCausalLM`
+  - Phi
+  - `microsoft/phi-1_5`, `microsoft/phi-2`, etc.
+  - ✅︎
+  - ✅︎
+* - `Phi3ForCausalLM`
+  - Phi-3
+  - `microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, `microsoft/Phi-3-medium-128k-instruct`, etc.
+  - ✅︎
+  - ✅︎
+* - `Phi3SmallForCausalLM`
+  - Phi-3-Small
+  - `microsoft/Phi-3-small-8k-instruct`, `microsoft/Phi-3-small-128k-instruct`, etc.
+  -
+  - ✅︎
+* - `PhiMoEForCausalLM`
+  - Phi-3.5-MoE
+  - `microsoft/Phi-3.5-MoE-instruct`, etc.
+  - ✅︎
+  - ✅︎
+* - `PersimmonForCausalLM`
+  - Persimmon
+  - `adept/persimmon-8b-base`, `adept/persimmon-8b-chat`, etc.
+  -
+  - ✅︎
+* - `QWenLMHeadModel`
+  - Qwen
+  - `Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc.
+  - ✅︎
+  - ✅︎
+* - `Qwen2ForCausalLM`
+  - Qwen2
+  - `Qwen/QwQ-32B-Preview`, `Qwen/Qwen2-7B-Instruct`, `Qwen/Qwen2-7B`, etc.
+  - ✅︎
+  - ✅︎
+* - `Qwen2MoeForCausalLM`
+  - Qwen2MoE
+  - `Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc.
+  -
+  - ✅︎
+* - `StableLmForCausalLM`
+  - StableLM
+  - `stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc.
+  -
+  - ✅︎
+* - `Starcoder2ForCausalLM`
+  - Starcoder2
+  - `bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc.
+  -
+  - ✅︎
+* - `SolarForCausalLM`
+  - Solar Pro
+  - `upstage/solar-pro-preview-instruct`, etc.
+  - ✅︎
+  - ✅︎
+* - `TeleChat2ForCausalLM`
+  - TeleChat2
+  - `TeleAI/TeleChat2-3B`, `TeleAI/TeleChat2-7B`, `TeleAI/TeleChat2-35B`, etc.
+  - ✅︎
+  - ✅︎
+* - `XverseForCausalLM`
+  - XVERSE
+  - `xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc.
+  - ✅︎
+  - ✅︎
 ```
 
 ```{note}
@@ -374,51 +373,50 @@ you should explicitly specify the task type to ensure that the model is used in
 
 #### Text Embedding (`--task embed`)
 
-```{eval-rst}
-.. list-table::
-  :widths: 25 25 50 5 5
-  :header-rows: 1
-
-  * - Architecture
-    - Models
-    - Example HF Models
-    - :ref:`LoRA <lora-adapter>`
-    - :ref:`PP <distributed-serving>`
-  * - :code:`BertModel`
-    - BERT-based
-    - :code:`BAAI/bge-base-en-v1.5`, etc.
-    -
-    -
-  * - :code:`Gemma2Model`
-    - Gemma2-based
-    - :code:`BAAI/bge-multilingual-gemma2`, etc.
-    -
-    - ✅︎
-  * - :code:`GritLM`
-    - GritLM
-    - :code:`parasail-ai/GritLM-7B-vllm`.
-    - ✅︎
-    - ✅︎
-  * - :code:`LlamaModel`, :code:`LlamaForCausalLM`, :code:`MistralModel`, etc.
-    - Llama-based
-    - :code:`intfloat/e5-mistral-7b-instruct`, etc.
-    - ✅︎
-    - ✅︎
-  * - :code:`Qwen2Model`, :code:`Qwen2ForCausalLM`
-    - Qwen2-based
-    - :code:`ssmits/Qwen2-7B-Instruct-embed-base` (see note), :code:`Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc.
-    - ✅︎
-    - ✅︎
-  * - :code:`RobertaModel`, :code:`RobertaForMaskedLM`
-    - RoBERTa-based
-    - :code:`sentence-transformers/all-roberta-large-v1`, :code:`sentence-transformers/all-roberta-large-v1`, etc.
-    -
-    -
-  * - :code:`XLMRobertaModel`
-    - XLM-RoBERTa-based
-    - :code:`intfloat/multilingual-e5-large`, etc.
-    -
-    -
+```{list-table}
+:widths: 25 25 50 5 5
+:header-rows: 1
+
+* - Architecture
+  - Models
+  - Example HF Models
+  - [LoRA](#lora-adapter)
+  - [PP](#distributed-serving)
+* - `BertModel`
+  - BERT-based
+  - `BAAI/bge-base-en-v1.5`, etc.
+  -
+  -
+* - `Gemma2Model`
+  - Gemma2-based
+  - `BAAI/bge-multilingual-gemma2`, etc.
+  -
+  - ✅︎
+* - `GritLM`
+  - GritLM
+  - `parasail-ai/GritLM-7B-vllm`.
+  - ✅︎
+  - ✅︎
+* - `LlamaModel`, `LlamaForCausalLM`, `MistralModel`, etc.
+  - Llama-based
+  - `intfloat/e5-mistral-7b-instruct`, etc.
+  - ✅︎
+  - ✅︎
+* - `Qwen2Model`, `Qwen2ForCausalLM`
+  - Qwen2-based
+  - `ssmits/Qwen2-7B-Instruct-embed-base` (see note), `Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc.
+  - ✅︎
+  - ✅︎
+* - `RobertaModel`, `RobertaForMaskedLM`
+  - RoBERTa-based
+  - `sentence-transformers/all-roberta-large-v1`, `sentence-transformers/all-roberta-large-v1`, etc.
+  -
+  -
+* - `XLMRobertaModel`
+  - XLM-RoBERTa-based
+  - `intfloat/multilingual-e5-large`, etc.
+  -
+  -
 ```
 
 ```{note}
@@ -440,31 +438,30 @@ of the whole prompt are extracted from the normalized hidden state corresponding
 
 #### Reward Modeling (`--task reward`)
 
-```{eval-rst}
-.. list-table::
-  :widths: 25 25 50 5 5
-  :header-rows: 1
-
-  * - Architecture
-    - Models
-    - Example HF Models
-    - :ref:`LoRA <lora-adapter>`
-    - :ref:`PP <distributed-serving>`
-  * - :code:`InternLM2ForRewardModel`
-    - InternLM2-based
-    - :code:`internlm/internlm2-1_8b-reward`, :code:`internlm/internlm2-7b-reward`, etc.
-    - ✅︎
-    - ✅︎
-  * - :code:`LlamaForCausalLM`
-    - Llama-based
-    - :code:`peiyi9979/math-shepherd-mistral-7b-prm`, etc.
-    - ✅︎
-    - ✅︎
-  * - :code:`Qwen2ForRewardModel`
-    - Qwen2-based
-    - :code:`Qwen/Qwen2.5-Math-RM-72B`, etc.
-    - ✅︎
-    - ✅︎
+```{list-table}
+:widths: 25 25 50 5 5
+:header-rows: 1
+
+* - Architecture
+  - Models
+  - Example HF Models
+  - [LoRA](#lora-adapter)
+  - [PP](#distributed-serving)
+* - `InternLM2ForRewardModel`
+  - InternLM2-based
+  - `internlm/internlm2-1_8b-reward`, `internlm/internlm2-7b-reward`, etc.
+  - ✅︎
+  - ✅︎
+* - `LlamaForCausalLM`
+  - Llama-based
+  - `peiyi9979/math-shepherd-mistral-7b-prm`, etc.
+  - ✅︎
+  - ✅︎
+* - `Qwen2ForRewardModel`
+  - Qwen2-based
+  - `Qwen/Qwen2.5-Math-RM-72B`, etc.
+  - ✅︎
+  - ✅︎
 ```
 
 If your model is not in the above list, we will try to automatically convert the model using
@@ -477,26 +474,25 @@ e.g.: {code}`--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 1
 
 #### Classification (`--task classify`)
 
-```{eval-rst}
-.. list-table::
-  :widths: 25 25 50 5 5
-  :header-rows: 1
-
-  * - Architecture
-    - Models
-    - Example HF Models
-    - :ref:`LoRA <lora-adapter>`
-    - :ref:`PP <distributed-serving>`
-  * - :code:`JambaForSequenceClassification`
-    - Jamba
-    - :code:`ai21labs/Jamba-tiny-reward-dev`, etc.
-    - ✅︎
-    - ✅︎
-  * - :code:`Qwen2ForSequenceClassification`
-    - Qwen2-based
-    - :code:`jason9693/Qwen2.5-1.5B-apeach`, etc.
-    - ✅︎
-    - ✅︎
+```{list-table}
+:widths: 25 25 50 5 5
+:header-rows: 1
+
+* - Architecture
+  - Models
+  - Example HF Models
+  - [LoRA](#lora-adapter)
+  - [PP](#distributed-serving)
+* - `JambaForSequenceClassification`
+  - Jamba
+  - `ai21labs/Jamba-tiny-reward-dev`, etc.
+  - ✅︎
+  - ✅︎
+* - `Qwen2ForSequenceClassification`
+  - Qwen2-based
+  - `jason9693/Qwen2.5-1.5B-apeach`, etc.
+  - ✅︎
+  - ✅︎
 ```
 
 If your model is not in the above list, we will try to automatically convert the model using
@@ -504,31 +500,30 @@ If your model is not in the above list, we will try to automatically convert the
 
 #### Sentence Pair Scoring (`--task score`)
 
-```{eval-rst}
-.. list-table::
-  :widths: 25 25 50 5 5
-  :header-rows: 1
-
-  * - Architecture
-    - Models
-    - Example HF Models
-    - :ref:`LoRA <lora-adapter>`
-    - :ref:`PP <distributed-serving>`
-  * - :code:`BertForSequenceClassification`
-    - BERT-based
-    - :code:`cross-encoder/ms-marco-MiniLM-L-6-v2`, etc.
-    -
-    -
-  * - :code:`RobertaForSequenceClassification`
-    - RoBERTa-based
-    - :code:`cross-encoder/quora-roberta-base`, etc.
-    -
-    -
-  * - :code:`XLMRobertaForSequenceClassification`
-    - XLM-RoBERTa-based
-    - :code:`BAAI/bge-reranker-v2-m3`, etc.
-    -
-    -
+```{list-table}
+:widths: 25 25 50 5 5
+:header-rows: 1
+
+* - Architecture
+  - Models
+  - Example HF Models
+  - [LoRA](#lora-adapter)
+  - [PP](#distributed-serving)
+* - `BertForSequenceClassification`
+  - BERT-based
+  - `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc.
+  -
+  -
+* - `RobertaForSequenceClassification`
+  - RoBERTa-based
+  - `cross-encoder/quora-roberta-base`, etc.
+  -
+  -
+* - `XLMRobertaForSequenceClassification`
+  - XLM-RoBERTa-based
+  - `BAAI/bge-reranker-v2-m3`, etc.
+  -
+  -
 ```
 
 (supported-mm-models)=
@@ -558,186 +553,182 @@ See [this page](#generative-models) for more information on how to use generativ
 
 #### Text Generation (`--task generate`)
 
-```{eval-rst}
-.. list-table::
-  :widths: 25 25 15 20 5 5 5
-  :header-rows: 1
-
-  * - Architecture
-    - Models
-    - Inputs
-    - Example HF Models
-    - :ref:`LoRA <lora-adapter>`
-    - :ref:`PP <distributed-serving>`
-    - V1
-  * - :code:`AriaForConditionalGeneration`
-    - Aria
-    - T + I
-    - :code:`rhymes-ai/Aria`
-    -
-    - ✅︎
-    -
-  * - :code:`Blip2ForConditionalGeneration`
-    - BLIP-2
-    - T + I\ :sup:`E`
-    - :code:`Salesforce/blip2-opt-2.7b`, :code:`Salesforce/blip2-opt-6.7b`, etc.
-    -
-    - ✅︎
-    -
-  * - :code:`ChameleonForConditionalGeneration`
-    - Chameleon
-    - T + I
-    - :code:`facebook/chameleon-7b` etc.
-    -
-    - ✅︎
-    -
-  * - :code:`FuyuForCausalLM`
-    - Fuyu
-    - T + I
-    - :code:`adept/fuyu-8b` etc.
-    -
-    - ✅︎
-    -
-  * - :code:`ChatGLMModel`
-    - GLM-4V
-    - T + I
-    - :code:`THUDM/glm-4v-9b` etc.
-    - ✅︎
-    - ✅︎
-    -
-  * - :code:`H2OVLChatModel`
-    - H2OVL
-    - T + I\ :sup:`E+`
-    - :code:`h2oai/h2ovl-mississippi-800m`, :code:`h2oai/h2ovl-mississippi-2b`, etc.
-    -
-    - ✅︎
-    -
-  * - :code:`Idefics3ForConditionalGeneration`
-    - Idefics3
-    - T + I
-    - :code:`HuggingFaceM4/Idefics3-8B-Llama3` etc.
-    - ✅︎
-    -
-    -
-  * - :code:`InternVLChatModel`
-    - InternVL 2.5, Mono-InternVL, InternVL 2.0
-    - T + I\ :sup:`E+`
-    - :code:`OpenGVLab/InternVL2_5-4B`, :code:`OpenGVLab/Mono-InternVL-2B`, :code:`OpenGVLab/InternVL2-4B`, etc.
-    -
-    - ✅︎
-    - ✅︎
-  * - :code:`LlavaForConditionalGeneration`
-    - LLaVA-1.5
-    - T + I\ :sup:`E+`
-    - :code:`llava-hf/llava-1.5-7b-hf`, :code:`TIGER-Lab/Mantis-8B-siglip-llama3` (see note), etc.
-    -
-    - ✅︎
-    - ✅︎
-  * - :code:`LlavaNextForConditionalGeneration`
-    - LLaVA-NeXT
-    - T + I\ :sup:`E+`
-    - :code:`llava-hf/llava-v1.6-mistral-7b-hf`, :code:`llava-hf/llava-v1.6-vicuna-7b-hf`, etc.
-    -
-    - ✅︎
-    -
-  * - :code:`LlavaNextVideoForConditionalGeneration`
-    - LLaVA-NeXT-Video
-    - T + V
-    - :code:`llava-hf/LLaVA-NeXT-Video-7B-hf`, etc.
-    -
-    - ✅︎
-    -
-  * - :code:`LlavaOnevisionForConditionalGeneration`
-    - LLaVA-Onevision
-    - T + I\ :sup:`+` + V\ :sup:`+`
-    - :code:`llava-hf/llava-onevision-qwen2-7b-ov-hf`, :code:`llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc.
-    -
-    - ✅︎
-    -
-  * - :code:`MiniCPMV`
-    - MiniCPM-V
-    - T + I\ :sup:`E+`
-    - :code:`openbmb/MiniCPM-V-2` (see note), :code:`openbmb/MiniCPM-Llama3-V-2_5`, :code:`openbmb/MiniCPM-V-2_6`, etc.
-    - ✅︎
-    - ✅︎
-    -
-  * - :code:`MllamaForConditionalGeneration`
-    - Llama 3.2
-    - T + I\ :sup:`+`
-    - :code:`meta-llama/Llama-3.2-90B-Vision-Instruct`, :code:`meta-llama/Llama-3.2-11B-Vision`, etc.
-    -
-    -
-    -
-  * - :code:`MolmoForCausalLM`
-    - Molmo
-    - T + I
-    - :code:`allenai/Molmo-7B-D-0924`, :code:`allenai/Molmo-72B-0924`, etc.
-    -
-    - ✅︎
-    - ✅︎
-  * - :code:`NVLM_D_Model`
-    - NVLM-D 1.0
-    - T + I\ :sup:`E+`
-    - :code:`nvidia/NVLM-D-72B`, etc.
-    -
-    - ✅︎
-    - ✅︎
-  * - :code:`PaliGemmaForConditionalGeneration`
-    - PaliGemma, PaliGemma 2
-    - T + I\ :sup:`E`
-    - :code:`google/paligemma-3b-pt-224`, :code:`google/paligemma-3b-mix-224`, :code:`google/paligemma2-3b-ft-docci-448`, etc.
-    -
-    - ✅︎
-    -
-  * - :code:`Phi3VForCausalLM`
-    - Phi-3-Vision, Phi-3.5-Vision
-    - T + I\ :sup:`E+`
-    - :code:`microsoft/Phi-3-vision-128k-instruct`, :code:`microsoft/Phi-3.5-vision-instruct` etc.
-    -
-    - ✅︎
-    - ✅︎
-  * - :code:`PixtralForConditionalGeneration`
-    - Pixtral
-    - T + I\ :sup:`+`
-    - :code:`mistralai/Pixtral-12B-2409`, :code:`mistral-community/pixtral-12b` etc.
-    -
-    - ✅︎
-    - ✅︎
-  * - :code:`QWenLMHeadModel`
-    - Qwen-VL
-    - T + I\ :sup:`E+`
-    - :code:`Qwen/Qwen-VL`, :code:`Qwen/Qwen-VL-Chat`, etc.
-    - ✅︎
-    - ✅︎
-    -
-  * - :code:`Qwen2AudioForConditionalGeneration`
-    - Qwen2-Audio
-    - T + A\ :sup:`+`
-    - :code:`Qwen/Qwen2-Audio-7B-Instruct`
-    -
-    - ✅︎
-    -
-  * - :code:`Qwen2VLForConditionalGeneration`
-    - Qwen2-VL
-    - T + I\ :sup:`E+` + V\ :sup:`E+`
-    - :code:`Qwen/QVQ-72B-Preview`, :code:`Qwen/Qwen2-VL-7B-Instruct`, :code:`Qwen/Qwen2-VL-72B-Instruct`, etc.
-    - ✅︎
-    - ✅︎
-    -
-  * - :code:`UltravoxModel`
-    - Ultravox
-    - T + A\ :sup:`E+`
-    - :code:`fixie-ai/ultravox-v0_3`
-    -
-    - ✅︎
-    -
+```{list-table}
+:widths: 25 25 15 20 5 5 5
+:header-rows: 1
+
+* - Architecture
+  - Models
+  - Inputs
+  - Example HF Models
+  - [LoRA](#lora-adapter)
+  - [PP](#distributed-serving)
+  - [V1](gh-issue:8779)
+* - `AriaForConditionalGeneration`
+  - Aria
+  - T + I
+  - `rhymes-ai/Aria`
+  -
+  - ✅︎
+  -
+* - `Blip2ForConditionalGeneration`
+  - BLIP-2
+  - T + I<sup>E</sup>
+  - `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc.
+  -
+  - ✅︎
+  -
+* - `ChameleonForConditionalGeneration`
+  - Chameleon
+  - T + I
+  - `facebook/chameleon-7b` etc.
+  -
+  - ✅︎
+  -
+* - `FuyuForCausalLM`
+  - Fuyu
+  - T + I
+  - `adept/fuyu-8b` etc.
+  -
+  - ✅︎
+  -
+* - `ChatGLMModel`
+  - GLM-4V
+  - T + I
+  - `THUDM/glm-4v-9b` etc.
+  - ✅︎
+  - ✅︎
+  -
+* - `H2OVLChatModel`
+  - H2OVL
+  - T + I<sup>E+</sup>
+  - `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc.
+  -
+  - ✅︎
+  -
+* - `Idefics3ForConditionalGeneration`
+  - Idefics3
+  - T + I
+  - `HuggingFaceM4/Idefics3-8B-Llama3` etc.
+  - ✅︎
+  -
+  -
+* - `InternVLChatModel`
+  - InternVL 2.5, Mono-InternVL, InternVL 2.0
+  - T + I<sup>E+</sup>
+  - `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc.
+  -
+  - ✅︎
+  - ✅︎
+* - `LlavaForConditionalGeneration`
+  - LLaVA-1.5
+  - T + I<sup>E+</sup>
+  - `llava-hf/llava-1.5-7b-hf`, `TIGER-Lab/Mantis-8B-siglip-llama3` (see note), etc.
+  -
+  - ✅︎
+  - ✅︎
+* - `LlavaNextForConditionalGeneration`
+  - LLaVA-NeXT
+  - T + I<sup>E+</sup>
+  - `llava-hf/llava-v1.6-mistral-7b-hf`, `llava-hf/llava-v1.6-vicuna-7b-hf`, etc.
+  -
+  - ✅︎
+  -
+* - `LlavaNextVideoForConditionalGeneration`
+  - LLaVA-NeXT-Video
+  - T + V
+  - `llava-hf/LLaVA-NeXT-Video-7B-hf`, etc.
+  -
+  - ✅︎
+  -
+* - `LlavaOnevisionForConditionalGeneration`
+  - LLaVA-Onevision
+  - T + I<sup>+</sup> + V<sup>+</sup>
+  - `llava-hf/llava-onevision-qwen2-7b-ov-hf`, `llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc.
+  -
+  - ✅︎
+  -
+* - `MiniCPMV`
+  - MiniCPM-V
+  - T + I<sup>E+</sup>
+  - `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, etc.
+  - ✅︎
+  - ✅︎
+  -
+* - `MllamaForConditionalGeneration`
+  - Llama 3.2
+  - T + I<sup>+</sup>
+  - `meta-llama/Llama-3.2-90B-Vision-Instruct`, `meta-llama/Llama-3.2-11B-Vision`, etc.
+  -
+  -
+  -
+* - `MolmoForCausalLM`
+  - Molmo
+  - T + I
+  - `allenai/Molmo-7B-D-0924`, `allenai/Molmo-72B-0924`, etc.
+  -
+  - ✅︎
+  - ✅︎
+* - `NVLM_D_Model`
+  - NVLM-D 1.0
+  - T + I<sup>E+</sup>
+  - `nvidia/NVLM-D-72B`, etc.
+  -
+  - ✅︎
+  - ✅︎
+* - `PaliGemmaForConditionalGeneration`
+  - PaliGemma, PaliGemma 2
+  - T + I<sup>E</sup>
+  - `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc.
+  -
+  - ✅︎
+  -
+* - `Phi3VForCausalLM`
+  - Phi-3-Vision, Phi-3.5-Vision
+  - T + I<sup>E+</sup>
+  - `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct` etc.
+  -
+  - ✅︎
+  - ✅︎
+* - `PixtralForConditionalGeneration`
+  - Pixtral
+  - T + I<sup>+</sup>
+  - `mistralai/Pixtral-12B-2409`, `mistral-community/pixtral-12b` etc.
+  -
+  - ✅︎
+  - ✅︎
+* - `QWenLMHeadModel`
+  - Qwen-VL
+  - T + I<sup>E+</sup>
+  - `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc.
+  - ✅︎
+  - ✅︎
+  -
+* - `Qwen2AudioForConditionalGeneration`
+  - Qwen2-Audio
+  - T + A<sup>+</sup>
+  - `Qwen/Qwen2-Audio-7B-Instruct`
+  -
+  - ✅︎
+  -
+* - `Qwen2VLForConditionalGeneration`
+  - Qwen2-VL
+  - T + I<sup>E+</sup> + V<sup>E+</sup>
+  - `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc.
+  - ✅︎
+  - ✅︎
+  -
+* - `UltravoxModel`
+  - Ultravox
+  - T + A<sup>E+</sup>
+  - `fixie-ai/ultravox-v0_3`
+  -
+  - ✅︎
+  -
 ```
 
-```{eval-rst}
-:sup:`E` Pre-computed embeddings can be inputted for this modality.
-
-:sup:`+` Multiple items can be inputted per text prompt for this modality.
-```
+<sup>E</sup> Pre-computed embeddings can be inputted for this modality.  
+<sup>+</sup> Multiple items can be inputted per text prompt for this modality.
 
 ````{important}
 To enable multiple multi-modal items per text prompt, you have to set {code}`limit_mm_per_prompt` (offline inference)
@@ -787,38 +778,37 @@ To get the best results, you should use pooling models that are specifically tra
 
 The following table lists those that are tested in vLLM.
 
-```{eval-rst}
-.. list-table::
-  :widths: 25 25 15 25 5 5
-  :header-rows: 1
-
-  * - Architecture
-    - Models
-    - Inputs
-    - Example HF Models
-    - :ref:`LoRA <lora-adapter>`
-    - :ref:`PP <distributed-serving>`
-  * - :code:`LlavaNextForConditionalGeneration`
-    - LLaVA-NeXT-based
-    - T / I
-    - :code:`royokong/e5-v`
-    -
-    - ✅︎
-  * - :code:`Phi3VForCausalLM`
-    - Phi-3-Vision-based
-    - T + I
-    - :code:`TIGER-Lab/VLM2Vec-Full`
-    - 🚧
-    - ✅︎
-  * - :code:`Qwen2VLForConditionalGeneration`
-    - Qwen2-VL-based
-    - T + I
-    - :code:`MrLight/dse-qwen2-2b-mrl-v1`
-    -
-    - ✅︎
+```{list-table}
+:widths: 25 25 15 25 5 5
+:header-rows: 1
+
+* - Architecture
+  - Models
+  - Inputs
+  - Example HF Models
+  - [LoRA](#lora-adapter)
+  - [PP](#distributed-serving)
+* - `LlavaNextForConditionalGeneration`
+  - LLaVA-NeXT-based
+  - T / I
+  - `royokong/e5-v`
+  -
+  - ✅︎
+* - `Phi3VForCausalLM`
+  - Phi-3-Vision-based
+  - T + I
+  - `TIGER-Lab/VLM2Vec-Full`
+  - 🚧
+  - ✅︎
+* - `Qwen2VLForConditionalGeneration`
+  - Qwen2-VL-based
+  - T + I
+  - `MrLight/dse-qwen2-2b-mrl-v1`
+  -
+  - ✅︎
 ```
 
-______________________________________________________________________
+_________________
 
 # Model Support Policy
 
diff --git a/docs/source/quantization/supported_hardware.md b/docs/source/quantization/supported_hardware.md
index 843ee21627d78..7330c2f8aa194 100644
--- a/docs/source/quantization/supported_hardware.md
+++ b/docs/source/quantization/supported_hardware.md
@@ -4,121 +4,120 @@
 
 The table below shows the compatibility of various quantization implementations with different hardware platforms in vLLM:
 
-```{eval-rst}
-.. list-table::
-   :header-rows: 1
-   :widths: 20 8 8 8 8 8 8 8 8 8 8
+```{list-table}
+:header-rows: 1
+:widths: 20 8 8 8 8 8 8 8 8 8 8
 
-   * - Implementation
-     - Volta
-     - Turing
-     - Ampere
-     - Ada
-     - Hopper
-     - AMD GPU
-     - Intel GPU
-     - x86 CPU
-     - AWS Inferentia
-     - Google TPU
-   * - AWQ
-     - ✗
-     - ✅︎
-     - ✅︎
-     - ✅︎
-     - ✅︎
-     - ✗
-     - ✅︎
-     - ✅︎
-     - ✗
-     - ✗
-   * - GPTQ
-     - ✅︎
-     - ✅︎
-     - ✅︎
-     - ✅︎
-     - ✅︎
-     - ✗
-     - ✅︎
-     - ✅︎
-     - ✗
-     - ✗
-   * - Marlin (GPTQ/AWQ/FP8)
-     - ✗
-     - ✗
-     - ✅︎
-     - ✅︎
-     - ✅︎
-     - ✗
-     - ✗
-     - ✗
-     - ✗
-     - ✗
-   * - INT8 (W8A8)
-     - ✗
-     - ✅︎
-     - ✅︎
-     - ✅︎
-     - ✅︎
-     - ✗
-     - ✗
-     - ✅︎
-     - ✗
-     - ✗
-   * - FP8 (W8A8)
-     - ✗
-     - ✗
-     - ✗
-     - ✅︎
-     - ✅︎
-     - ✅︎
-     - ✗
-     - ✗
-     - ✗
-     - ✗
-   * - AQLM
-     - ✅︎
-     - ✅︎
-     - ✅︎
-     - ✅︎
-     - ✅︎
-     - ✗
-     - ✗
-     - ✗
-     - ✗
-     - ✗
-   * - bitsandbytes
-     - ✅︎
-     - ✅︎
-     - ✅︎
-     - ✅︎
-     - ✅︎
-     - ✗
-     - ✗
-     - ✗
-     - ✗
-     - ✗
-   * - DeepSpeedFP
-     - ✅︎
-     - ✅︎
-     - ✅︎
-     - ✅︎
-     - ✅︎
-     - ✗
-     - ✗
-     - ✗
-     - ✗
-     - ✗
-   * - GGUF
-     - ✅︎
-     - ✅︎
-     - ✅︎
-     - ✅︎
-     - ✅︎
-     - ✗
-     - ✗
-     - ✗
-     - ✗
-     - ✗
+* - Implementation
+  - Volta
+  - Turing
+  - Ampere
+  - Ada
+  - Hopper
+  - AMD GPU
+  - Intel GPU
+  - x86 CPU
+  - AWS Inferentia
+  - Google TPU
+* - AWQ
+  - ✗
+  - ✅︎
+  - ✅︎
+  - ✅︎
+  - ✅︎
+  - ✗
+  - ✅︎
+  - ✅︎
+  - ✗
+  - ✗
+* - GPTQ
+  - ✅︎
+  - ✅︎
+  - ✅︎
+  - ✅︎
+  - ✅︎
+  - ✗
+  - ✅︎
+  - ✅︎
+  - ✗
+  - ✗
+* - Marlin (GPTQ/AWQ/FP8)
+  - ✗
+  - ✗
+  - ✅︎
+  - ✅︎
+  - ✅︎
+  - ✗
+  - ✗
+  - ✗
+  - ✗
+  - ✗
+* - INT8 (W8A8)
+  - ✗
+  - ✅︎
+  - ✅︎
+  - ✅︎
+  - ✅︎
+  - ✗
+  - ✗
+  - ✅︎
+  - ✗
+  - ✗
+* - FP8 (W8A8)
+  - ✗
+  - ✗
+  - ✗
+  - ✅︎
+  - ✅︎
+  - ✅︎
+  - ✗
+  - ✗
+  - ✗
+  - ✗
+* - AQLM
+  - ✅︎
+  - ✅︎
+  - ✅︎
+  - ✅︎
+  - ✅︎
+  - ✗
+  - ✗
+  - ✗
+  - ✗
+  - ✗
+* - bitsandbytes
+  - ✅︎
+  - ✅︎
+  - ✅︎
+  - ✅︎
+  - ✅︎
+  - ✗
+  - ✗
+  - ✗
+  - ✗
+  - ✗
+* - DeepSpeedFP
+  - ✅︎
+  - ✅︎
+  - ✅︎
+  - ✅︎
+  - ✅︎
+  - ✗
+  - ✗
+  - ✗
+  - ✗
+  - ✗
+* - GGUF
+  - ✅︎
+  - ✅︎
+  - ✅︎
+  - ✅︎
+  - ✅︎
+  - ✗
+  - ✗
+  - ✗
+  - ✗
+  - ✗
 ```
 
 ## Notes:
diff --git a/docs/source/serving/deploying_with_helm.md b/docs/source/serving/deploying_with_helm.md
index 3b26575827011..7286a0a88968f 100644
--- a/docs/source/serving/deploying_with_helm.md
+++ b/docs/source/serving/deploying_with_helm.md
@@ -43,209 +43,208 @@ chart **including persistent volumes** and deletes the release.
 
 ## Values
 
-```{eval-rst}
-.. list-table:: Values
-   :widths: 25 25 25 25
-   :header-rows: 1
-
-   * - Key
-     - Type
-     - Default
-     - Description
-   * - autoscaling
-     - object
-     - {"enabled":false,"maxReplicas":100,"minReplicas":1,"targetCPUUtilizationPercentage":80}
-     - Autoscaling configuration
-   * - autoscaling.enabled
-     - bool
-     - false
-     - Enable autoscaling
-   * - autoscaling.maxReplicas
-     - int
-     - 100
-     - Maximum replicas
-   * - autoscaling.minReplicas
-     - int
-     - 1
-     - Minimum replicas
-   * - autoscaling.targetCPUUtilizationPercentage
-     - int
-     - 80
-     - Target CPU utilization for autoscaling
-   * - configs
-     - object
-     - {}
-     - Configmap
-   * - containerPort
-     - int
-     - 8000
-     - Container port
-   * - customObjects
-     - list
-     - []
-     - Custom Objects configuration
-   * - deploymentStrategy
-     - object
-     - {}
-     - Deployment strategy configuration
-   * - externalConfigs
-     - list
-     - []
-     - External configuration
-   * - extraContainers
-     - list
-     - []
-     - Additional containers configuration
-   * - extraInit
-     - object
-     - {"pvcStorage":"1Gi","s3modelpath":"relative_s3_model_path/opt-125m", "awsEc2MetadataDisabled": true}
-     - Additional configuration for the init container
-   * - extraInit.pvcStorage
-     - string
-     - "50Gi"
-     - Storage size of the s3
-   * - extraInit.s3modelpath
-     - string
-     - "relative_s3_model_path/opt-125m"
-     - Path of the model on the s3 which hosts model weights and config files
-   * - extraInit.awsEc2MetadataDisabled
-     - boolean
-     - true
-     - Disables the use of the Amazon EC2 instance metadata service
-   * - extraPorts
-     - list
-     - []
-     - Additional ports configuration
-   * - gpuModels
-     - list
-     - ["TYPE_GPU_USED"]
-     - Type of gpu used
-   * - image
-     - object
-     - {"command":["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"],"repository":"vllm/vllm-openai","tag":"latest"}
-     - Image configuration
-   * - image.command
-     - list
-     - ["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"]
-     - Container launch command
-   * - image.repository
-     - string
-     - "vllm/vllm-openai"
-     - Image repository
-   * - image.tag
-     - string
-     - "latest"
-     - Image tag
-   * - livenessProbe
-     - object
-     - {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":15,"periodSeconds":10}
-     - Liveness probe configuration
-   * - livenessProbe.failureThreshold
-     - int
-     - 3
-     - Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not alive
-   * - livenessProbe.httpGet
-     - object
-     - {"path":"/health","port":8000}
-     - Configuration of the Kubelet http request on the server
-   * - livenessProbe.httpGet.path
-     - string
-     - "/health"
-     - Path to access on the HTTP server
-   * - livenessProbe.httpGet.port
-     - int
-     - 8000
-     - Name or number of the port to access on the container, on which the server is listening
-   * - livenessProbe.initialDelaySeconds
-     - int
-     - 15
-     - Number of seconds after the container has started before liveness probe is initiated
-   * - livenessProbe.periodSeconds
-     - int
-     - 10
-     - How often (in seconds) to perform the liveness probe
-   * - maxUnavailablePodDisruptionBudget
-     - string
-     - ""
-     - Disruption Budget Configuration
-   * - readinessProbe
-     - object
-     - {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":5,"periodSeconds":5}
-     - Readiness probe configuration
-   * - readinessProbe.failureThreshold
-     - int
-     - 3
-     - Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not ready
-   * - readinessProbe.httpGet
-     - object
-     - {"path":"/health","port":8000}
-     - Configuration of the Kubelet http request on the server
-   * - readinessProbe.httpGet.path
-     - string
-     - "/health"
-     - Path to access on the HTTP server
-   * - readinessProbe.httpGet.port
-     - int
-     - 8000
-     - Name or number of the port to access on the container, on which the server is listening
-   * - readinessProbe.initialDelaySeconds
-     - int
-     - 5
-     - Number of seconds after the container has started before readiness probe is initiated
-   * - readinessProbe.periodSeconds
-     - int
-     - 5
-     - How often (in seconds) to perform the readiness probe
-   * - replicaCount
-     - int
-     - 1
-     - Number of replicas
-   * - resources
-     - object
-     - {"limits":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1},"requests":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1}}
-     - Resource configuration
-   * - resources.limits."nvidia.com/gpu"
-     - int
-     - 1
-     - Number of gpus used
-   * - resources.limits.cpu
-     - int
-     - 4
-     - Number of CPUs
-   * - resources.limits.memory
-     - string
-     - "16Gi"
-     - CPU memory configuration
-   * - resources.requests."nvidia.com/gpu"
-     - int
-     - 1
-     - Number of gpus used
-   * - resources.requests.cpu
-     - int
-     - 4
-     - Number of CPUs
-   * - resources.requests.memory
-     - string
-     - "16Gi"
-     - CPU memory configuration
-   * - secrets
-     - object
-     - {}
-     - Secrets configuration
-   * - serviceName
-     - string
-     -
-     - Service name
-   * - servicePort
-     - int
-     - 80
-     - Service port
-   * - labels.environment
-     - string
-     - test
-     - Environment name
-   * - labels.release
-     - string
-     - test
-     - Release name
+```{list-table}
+:widths: 25 25 25 25
+:header-rows: 1
+
+* - Key
+  - Type
+  - Default
+  - Description
+* - autoscaling
+  - object
+  - {"enabled":false,"maxReplicas":100,"minReplicas":1,"targetCPUUtilizationPercentage":80}
+  - Autoscaling configuration
+* - autoscaling.enabled
+  - bool
+  - false
+  - Enable autoscaling
+* - autoscaling.maxReplicas
+  - int
+  - 100
+  - Maximum replicas
+* - autoscaling.minReplicas
+  - int
+  - 1
+  - Minimum replicas
+* - autoscaling.targetCPUUtilizationPercentage
+  - int
+  - 80
+  - Target CPU utilization for autoscaling
+* - configs
+  - object
+  - {}
+  - Configmap
+* - containerPort
+  - int
+  - 8000
+  - Container port
+* - customObjects
+  - list
+  - []
+  - Custom Objects configuration
+* - deploymentStrategy
+  - object
+  - {}
+  - Deployment strategy configuration
+* - externalConfigs
+  - list
+  - []
+  - External configuration
+* - extraContainers
+  - list
+  - []
+  - Additional containers configuration
+* - extraInit
+  - object
+  - {"pvcStorage":"1Gi","s3modelpath":"relative_s3_model_path/opt-125m", "awsEc2MetadataDisabled": true}
+  - Additional configuration for the init container
+* - extraInit.pvcStorage
+  - string
+  - "50Gi"
+  - Storage size of the s3
+* - extraInit.s3modelpath
+  - string
+  - "relative_s3_model_path/opt-125m"
+  - Path of the model on the s3 which hosts model weights and config files
+* - extraInit.awsEc2MetadataDisabled
+  - boolean
+  - true
+  - Disables the use of the Amazon EC2 instance metadata service
+* - extraPorts
+  - list
+  - []
+  - Additional ports configuration
+* - gpuModels
+  - list
+  - ["TYPE_GPU_USED"]
+  - Type of gpu used
+* - image
+  - object
+  - {"command":["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"],"repository":"vllm/vllm-openai","tag":"latest"}
+  - Image configuration
+* - image.command
+  - list
+  - ["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"]
+  - Container launch command
+* - image.repository
+  - string
+  - "vllm/vllm-openai"
+  - Image repository
+* - image.tag
+  - string
+  - "latest"
+  - Image tag
+* - livenessProbe
+  - object
+  - {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":15,"periodSeconds":10}
+  - Liveness probe configuration
+* - livenessProbe.failureThreshold
+  - int
+  - 3
+  - Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not alive
+* - livenessProbe.httpGet
+  - object
+  - {"path":"/health","port":8000}
+  - Configuration of the Kubelet http request on the server
+* - livenessProbe.httpGet.path
+  - string
+  - "/health"
+  - Path to access on the HTTP server
+* - livenessProbe.httpGet.port
+  - int
+  - 8000
+  - Name or number of the port to access on the container, on which the server is listening
+* - livenessProbe.initialDelaySeconds
+  - int
+  - 15
+  - Number of seconds after the container has started before liveness probe is initiated
+* - livenessProbe.periodSeconds
+  - int
+  - 10
+  - How often (in seconds) to perform the liveness probe
+* - maxUnavailablePodDisruptionBudget
+  - string
+  - ""
+  - Disruption Budget Configuration
+* - readinessProbe
+  - object
+  - {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":5,"periodSeconds":5}
+  - Readiness probe configuration
+* - readinessProbe.failureThreshold
+  - int
+  - 3
+  - Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not ready
+* - readinessProbe.httpGet
+  - object
+  - {"path":"/health","port":8000}
+  - Configuration of the Kubelet http request on the server
+* - readinessProbe.httpGet.path
+  - string
+  - "/health"
+  - Path to access on the HTTP server
+* - readinessProbe.httpGet.port
+  - int
+  - 8000
+  - Name or number of the port to access on the container, on which the server is listening
+* - readinessProbe.initialDelaySeconds
+  - int
+  - 5
+  - Number of seconds after the container has started before readiness probe is initiated
+* - readinessProbe.periodSeconds
+  - int
+  - 5
+  - How often (in seconds) to perform the readiness probe
+* - replicaCount
+  - int
+  - 1
+  - Number of replicas
+* - resources
+  - object
+  - {"limits":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1},"requests":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1}}
+  - Resource configuration
+* - resources.limits."nvidia.com/gpu"
+  - int
+  - 1
+  - Number of gpus used
+* - resources.limits.cpu
+  - int
+  - 4
+  - Number of CPUs
+* - resources.limits.memory
+  - string
+  - "16Gi"
+  - CPU memory configuration
+* - resources.requests."nvidia.com/gpu"
+  - int
+  - 1
+  - Number of gpus used
+* - resources.requests.cpu
+  - int
+  - 4
+  - Number of CPUs
+* - resources.requests.memory
+  - string
+  - "16Gi"
+  - CPU memory configuration
+* - secrets
+  - object
+  - {}
+  - Secrets configuration
+* - serviceName
+  - string
+  -
+  - Service name
+* - servicePort
+  - int
+  - 80
+  - Service port
+* - labels.environment
+  - string
+  - test
+  - Environment name
+* - labels.release
+  - string
+  - test
+  - Release name
 ```

From dba4d9dec606da028fbb28240e99cabd5a761e6a Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Sun, 29 Dec 2024 17:03:49 +0800
Subject: [PATCH 022/462] [v1][bugfix] fix cudagraph with inplace buffer
 assignment (#11596)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 vllm/compilation/wrapper.py                    | 10 +++++++++-
 vllm/model_executor/layers/rotary_embedding.py | 11 +----------
 2 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py
index c10241b483169..e3260a10c02ae 100644
--- a/vllm/compilation/wrapper.py
+++ b/vllm/compilation/wrapper.py
@@ -28,11 +28,12 @@ def __init__(self,
                  compiled_callable: Optional[Callable] = None,
                  compilation_level: int = 0):
 
+        vllm_config = get_current_vllm_config()
+        self.vllm_config = vllm_config
         if compiled_callable is None:
             # default compilation settings
             # compiling the forward method
 
-            vllm_config = get_current_vllm_config()
             backend = vllm_config.compilation_config.init_backend(vllm_config)
 
             compiled_callable = torch.compile(
@@ -82,6 +83,13 @@ def bytecode_hook(self, old_code: CodeType, new_code: CodeType):
 
         self.compiled_codes.append(new_code)
 
+        if self.vllm_config.compilation_config.use_cudagraph and \
+            "update" in new_code.co_names:
+            import depyf
+            src = depyf.decompile(new_code)
+            msg = "Assigning / modifying buffers of nn.Module during forward pass is not allowed when using cudagraph inside the compiler because it will cause silent errors. Please use eager mode or fix the code. The following code contains clues about which buffer is being modified (please search for the usage of the function `update`):\n" + src  # noqa
+            raise RuntimeError(msg)
+
     @contextmanager
     def dispatch_to_code(self, index: int):
         """Context manager to dispatch to the compiled code.
diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py
index 117fe086e5e87..6695d44dfa32b 100644
--- a/vllm/model_executor/layers/rotary_embedding.py
+++ b/vllm/model_executor/layers/rotary_embedding.py
@@ -541,19 +541,12 @@ def __init__(
         short_cache = self._compute_cos_sin_cache(
             original_max_position_embeddings, short_factor, short_mscale)
         short_cache = short_cache.to(dtype)
-        self.register_buffer("short_cos_sin_cache",
-                             short_cache,
-                             persistent=False)
 
         long_cache = self._compute_cos_sin_cache(max_position_embeddings,
                                                  long_factor, long_mscale)
         long_cache = long_cache.to(dtype)
-        self.register_buffer("long_cos_sin_cache",
-                             long_cache,
-                             persistent=False)
 
-        long_short_cache = torch.cat(
-            [self.short_cos_sin_cache, self.long_cos_sin_cache], dim=0)
+        long_short_cache = torch.cat([short_cache, long_cache], dim=0)
         self.register_buffer("long_short_cos_sin_cache",
                              long_short_cache,
                              persistent=False)
@@ -593,8 +586,6 @@ def forward(
                               torch.full_like(positions, k)).long()
         idx = (torch.add(positions, long_prompt_offset)
                if long_prompt_offset is not None else positions)
-        self.long_short_cos_sin_cache: torch.Tensor = (
-            self.long_short_cos_sin_cache.to(idx.device))
         idx = torch.add(idx, offsets) if offsets is not None else idx
         cos_sin = torch.index_select(self.long_short_cos_sin_cache, 0, idx)
 

From faef77c0d69c5429182f475a57127676e6bcb230 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Sun, 29 Dec 2024 10:08:09 -0600
Subject: [PATCH 023/462] [Misc] KV cache transfer connector registry (#11481)

Signed-off-by: KuntaiDu <kuntai@uchicago.edu>
---
 vllm/config.py                                |  8 ----
 .../kv_transfer/kv_connector/factory.py       | 48 +++++++++++++++----
 2 files changed, 38 insertions(+), 18 deletions(-)

diff --git a/vllm/config.py b/vllm/config.py
index 6ae1d4d944447..8e556743c8528 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -2559,14 +2559,6 @@ def from_cli(cls, cli_value: str) -> "KVTransferConfig":
         return KVTransferConfig.model_validate_json(cli_value)
 
     def model_post_init(self, __context: Any) -> None:
-        supported_kv_connector = ["PyNcclConnector", "MooncakeConnector"]
-        if all([
-                self.kv_connector is not None, self.kv_connector
-                not in supported_kv_connector
-        ]):
-            raise ValueError(f"Unsupported kv_connector: {self.kv_connector}. "
-                             f"Supported connectors are "
-                             f"{supported_kv_connector}.")
 
         if self.kv_role is not None and self.kv_role not in [
                 "kv_producer", "kv_consumer", "kv_both"
diff --git a/vllm/distributed/kv_transfer/kv_connector/factory.py b/vllm/distributed/kv_transfer/kv_connector/factory.py
index 3e2bb436d24b5..6372dab726086 100644
--- a/vllm/distributed/kv_transfer/kv_connector/factory.py
+++ b/vllm/distributed/kv_transfer/kv_connector/factory.py
@@ -1,4 +1,5 @@
-from typing import TYPE_CHECKING
+import importlib
+from typing import TYPE_CHECKING, Callable, Dict, Type
 
 from .base import KVConnectorBase
 
@@ -7,14 +8,41 @@
 
 
 class KVConnectorFactory:
+    _registry: Dict[str, Callable[[], Type[KVConnectorBase]]] = {}
 
-    @staticmethod
-    def create_connector(rank: int, local_rank: int,
+    @classmethod
+    def register_connector(cls, name: str, module_path: str,
+                           class_name: str) -> None:
+        """Register a connector with a lazy-loading module and class name."""
+        if name in cls._registry:
+            raise ValueError(f"Connector '{name}' is already registered.")
+
+        def loader() -> Type[KVConnectorBase]:
+            module = importlib.import_module(module_path)
+            return getattr(module, class_name)
+
+        cls._registry[name] = loader
+
+    @classmethod
+    def create_connector(cls, rank: int, local_rank: int,
                          config: "VllmConfig") -> KVConnectorBase:
-        supported_kv_connector = ["PyNcclConnector", "MooncakeConnector"]
-        if config.kv_transfer_config.kv_connector in supported_kv_connector:
-            from .simple_connector import SimpleConnector
-            return SimpleConnector(rank, local_rank, config)
-        else:
-            raise ValueError(f"Unsupported connector type: "
-                             f"{config.kv_connector}")
+        connector_name = config.kv_transfer_config.kv_connector
+        if connector_name not in cls._registry:
+            raise ValueError(f"Unsupported connector type: {connector_name}")
+
+        connector_cls = cls._registry[connector_name]()
+        return connector_cls(rank, local_rank, config)
+
+
+# Register various connectors here.
+# The registration should not be done in each individual file, as we want to
+# only load the files corresponding to the current connector.
+KVConnectorFactory.register_connector(
+    "PyNcclConnector",
+    "vllm.distributed.kv_transfer.kv_connector.simple_connector",
+    "SimpleConnector")
+
+KVConnectorFactory.register_connector(
+    "MooncakeConnector",
+    "vllm.distributed.kv_transfer.kv_connector.simple_connector",
+    "SimpleConnector")

From 0aa38d16f56327622c1689d7510171662757deee Mon Sep 17 00:00:00 2001
From: Michael Goin <michael@neuralmagic.com>
Date: Sun, 29 Dec 2024 15:16:46 -0500
Subject: [PATCH 024/462] Remove print statement in
 DeepseekScalingRotaryEmbedding (#11604)

---
 vllm/model_executor/layers/rotary_embedding.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py
index 6695d44dfa32b..3fcd81a3c4213 100644
--- a/vllm/model_executor/layers/rotary_embedding.py
+++ b/vllm/model_executor/layers/rotary_embedding.py
@@ -668,7 +668,6 @@ def _compute_cos_sin_cache(self) -> torch.Tensor:
         cos = (freqs.cos() * self.mscale)
         sin = (freqs.sin() * self.mscale)
         cache = torch.cat((cos, sin), dim=-1)
-        print("Cache shape", cache.shape)
         return cache
 
     def forward(

From 3682e33f9ff9d8baade6112a8e75a77da898f504 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Mon, 30 Dec 2024 12:24:12 +0800
Subject: [PATCH 025/462] [v1] fix compilation cache (#11598)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 tests/compile/piecewise/test_toy_llama.py | 15 +++++++-
 vllm/compilation/backends.py              | 22 ++++++-----
 vllm/config.py                            | 45 +++++++++++++++++++++--
 vllm/v1/worker/gpu_worker.py              |  1 +
 4 files changed, 69 insertions(+), 14 deletions(-)

diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py
index 07c10a3a18c55..d4ede4d2320a7 100644
--- a/tests/compile/piecewise/test_toy_llama.py
+++ b/tests/compile/piecewise/test_toy_llama.py
@@ -7,7 +7,7 @@
 initialized randomly with a fixed seed.
 """
 from dataclasses import dataclass
-from typing import Optional, Tuple
+from typing import Any, List, Optional, Tuple
 
 import torch
 from torch import nn
@@ -54,6 +54,16 @@ class LlamaConfig:
     tractable_init: bool = False
     random_seed: int = 0
 
+    def compute_hash(self) -> str:
+        factors: List[Any] = []
+        for k, v in self.__dict__.items():
+            if k == "random_seed":
+                continue
+            factors.append((k, v))
+        factors.sort()
+        import hashlib
+        return hashlib.md5(str(factors).encode()).hexdigest()
+
     def __post_init__(self):
         assert self.mlp_size >= self.hidden_size
 
@@ -263,7 +273,8 @@ def run_model(llama_config,
         compilation_config = CompilationConfig(
             level=CompilationLevel.NO_COMPILATION, )
 
-    vllm_config = VllmConfig(compilation_config=compilation_config)
+    vllm_config = VllmConfig(compilation_config=compilation_config,
+                             additional_config=llama_config)
     with set_current_vllm_config(vllm_config):
         model = LlamaModel(config=llama_config,
                            vllm_config=vllm_config,
diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
index 4f960b441f21d..a8dd628b9cd6f 100644
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -619,8 +619,10 @@ def __init__(self, graph: fx.GraphModule, vllm_config: VllmConfig,
         # the entries for different shapes that we need to either
         # compile or capture cudagraph
         self.concrete_size_entries: Dict[int, ConcreteSizeEntry] = {}
-        self.to_be_compiled_sizes: Set[int] = self.compile_sizes.union(
-            self.capture_sizes)
+
+        # to_be_compiled_sizes tracks the remaining sizes to compile,
+        # and updates during the compilation process, so we need to copy it
+        self.to_be_compiled_sizes: Set[int] = self.compile_sizes.copy()
         for shape in self.compile_sizes.union(self.capture_sizes):
             self.concrete_size_entries[shape] = ConcreteSizeEntry(
                 runtime_shape=shape,
@@ -628,12 +630,17 @@ def __init__(self, graph: fx.GraphModule, vllm_config: VllmConfig,
                 use_cudagraph=shape in self.capture_sizes,
             )
 
+    def check_for_ending_compilation(self):
+        if self.is_last_graph and not self.to_be_compiled_sizes:
+            # no specific sizes to compile
+            # save the hash of the inductor graph for the next run
+            self.compilation_config.inductor_hash_cache.save_to_file()
+            end_monitoring_torch_compile(self.vllm_config)
+
     def __call__(self, *args) -> Any:
         if not self.first_run_finished:
             self.first_run_finished = True
-            # no specific sizes to compile
-            if self.is_last_graph and not self.to_be_compiled_sizes:
-                end_monitoring_torch_compile(self.vllm_config)
+            self.check_for_ending_compilation()
             return self.compiled_graph_for_general_shape(*args)
 
         runtime_shape = args[self.sym_shape_indices[0]]
@@ -662,10 +669,7 @@ def __call__(self, *args) -> Any:
 
             # finished compilations for all required shapes
             if self.is_last_graph and not self.to_be_compiled_sizes:
-
-                # save the hash of the inductor graph for the next run
-                self.compilation_config.inductor_hash_cache.save_to_file()
-                end_monitoring_torch_compile(self.vllm_config)
+                self.check_for_ending_compilation()
 
         if not entry.use_cudagraph:
             return entry.runnable(*args)
diff --git a/vllm/config.py b/vllm/config.py
index 8e556743c8528..765a46e6aeee3 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -9,8 +9,8 @@
 from dataclasses import dataclass, field, replace
 from pathlib import Path
 from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Counter, Dict,
-                    Final, List, Literal, Mapping, Optional, Set, Tuple, Type,
-                    Union)
+                    Final, List, Literal, Mapping, Optional, Protocol, Set,
+                    Tuple, Type, Union)
 
 import torch
 from pydantic import BaseModel, Field, PrivateAttr
@@ -75,6 +75,12 @@
                                              PretrainedConfig]]
 
 
+class SupportsHash(Protocol):
+
+    def compute_hash(self) -> str:
+        ...
+
+
 class ModelConfig:
     """Configuration for the model.
 
@@ -2969,6 +2975,10 @@ class VllmConfig:
                                                   init=True)  # type: ignore
     kv_transfer_config: KVTransferConfig = field(default=None,
                                                  init=True)  # type: ignore
+    # some opaque config, only used to provide additional information
+    # for the hash computation, mainly used for testing and debugging.
+    additional_config: SupportsHash = field(default=None,
+                                            init=True)  # type: ignore
     instance_id: str = ""
 
     def compute_hash(self) -> str:
@@ -3000,33 +3010,62 @@ def compute_hash(self) -> str:
         vllm_factors.append(__version__)
         if self.model_config:
             vllm_factors.append(self.model_config.compute_hash())
+        else:
+            vllm_factors.append("None")
         if self.cache_config:
             vllm_factors.append(self.cache_config.compute_hash())
+        else:
+            vllm_factors.append("None")
         if self.parallel_config:
             vllm_factors.append(self.parallel_config.compute_hash())
+        else:
+            vllm_factors.append("None")
         if self.scheduler_config:
             vllm_factors.append(self.scheduler_config.compute_hash())
+        else:
+            vllm_factors.append("None")
         if self.device_config:
             vllm_factors.append(self.device_config.compute_hash())
+        else:
+            vllm_factors.append("None")
         if self.load_config:
             vllm_factors.append(self.load_config.compute_hash())
+        else:
+            vllm_factors.append("None")
         if self.lora_config:
             vllm_factors.append(self.lora_config.compute_hash())
+        else:
+            vllm_factors.append("None")
         if self.speculative_config:
             vllm_factors.append(self.speculative_config.compute_hash())
+        else:
+            vllm_factors.append("None")
         if self.decoding_config:
             vllm_factors.append(self.decoding_config.compute_hash())
+        else:
+            vllm_factors.append("None")
         if self.observability_config:
             vllm_factors.append(self.observability_config.compute_hash())
+        else:
+            vllm_factors.append("None")
         if self.prompt_adapter_config:
             vllm_factors.append(self.prompt_adapter_config.compute_hash())
+        else:
+            vllm_factors.append("None")
         if self.quant_config:
             pass  # should be captured by model_config.quantization
         if self.compilation_config:
             vllm_factors.append(self.compilation_config.compute_hash())
+        else:
+            vllm_factors.append("None")
         if self.kv_transfer_config:
             vllm_factors.append(self.kv_transfer_config.compute_hash())
-
+        else:
+            vllm_factors.append("None")
+        if self.additional_config:
+            vllm_factors.append(self.additional_config.compute_hash())
+        else:
+            vllm_factors.append("None")
         factors.append(vllm_factors)
 
         hash_str = hashlib.md5(str(factors).encode()).hexdigest()[:10]
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index 0000b09bfaa36..af438f7d5820c 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -48,6 +48,7 @@ def __init__(
         self.prompt_adapter_config = vllm_config.prompt_adapter_config
         self.observability_config = vllm_config.observability_config
 
+        self.parallel_config.rank = rank
         self.local_rank = local_rank
         self.rank = rank
         self.distributed_init_method = distributed_init_method

From 628ec6c17b8121517e8f303b64567573036cdb38 Mon Sep 17 00:00:00 2001
From: Liangfu Chen <liangfc@amazon.com>
Date: Sun, 29 Dec 2024 21:46:14 -0800
Subject: [PATCH 026/462] [Docker] bump up neuron sdk v2.21 (#11593)

Signed-off-by: Liangfu Chen <liangfc@amazon.com>
---
 Dockerfile.neuron              | 6 +++---
 requirements-neuron.txt        | 4 ++--
 vllm/_custom_ops.py            | 3 +--
 vllm/triton_utils/importing.py | 1 -
 4 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/Dockerfile.neuron b/Dockerfile.neuron
index 77162bc82de62..269139fe90f0b 100644
--- a/Dockerfile.neuron
+++ b/Dockerfile.neuron
@@ -1,6 +1,6 @@
 # default base image
 # https://gallery.ecr.aws/neuron/pytorch-inference-neuronx
-ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.1.2-neuronx-py310-sdk2.20.2-ubuntu20.04"
+ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.5.1-neuronx-py310-sdk2.21.0-ubuntu22.04"
 
 FROM $BASE_IMAGE
 
@@ -22,9 +22,9 @@ WORKDIR ${APP_MOUNT}/vllm
 
 RUN python3 -m pip install --upgrade pip
 RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
-RUN python3 -m pip install sentencepiece transformers==4.36.2 -U
+RUN python3 -m pip install sentencepiece transformers==4.45.2 -U
 RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
-RUN python3 -m pip install --pre neuronx-cc==2.15.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
+RUN python3 -m pip install neuronx-cc==2.16.345.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
 
 COPY . .
 ARG GIT_REPO_CHECK=0
diff --git a/requirements-neuron.txt b/requirements-neuron.txt
index 148fdbe0d6310..5e08d101fcd61 100644
--- a/requirements-neuron.txt
+++ b/requirements-neuron.txt
@@ -2,6 +2,6 @@
 -r requirements-common.txt
 
 # Dependencies for Neuron devices
-transformers-neuronx >= 0.12.0
-torch-neuronx >= 2.1.2
+transformers-neuronx >= 0.13.0
+torch-neuronx >= 2.5.0
 neuronx-cc
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index aeacf5dda5761..eb2f69df42624 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -23,8 +23,7 @@
     import vllm._moe_C  # noqa: F401
     supports_moe_ops = True
 
-# neuron has torch version that doesn't even have impl_abstract
-if TYPE_CHECKING or current_platform.is_neuron():
+if TYPE_CHECKING:
 
     def register_fake(fn):
         return lambda name: fn
diff --git a/vllm/triton_utils/importing.py b/vllm/triton_utils/importing.py
index 36315abcdfcda..0c96e0632f646 100644
--- a/vllm/triton_utils/importing.py
+++ b/vllm/triton_utils/importing.py
@@ -8,7 +8,6 @@
 HAS_TRITON = (
     find_spec("triton") is not None
     and not current_platform.is_xpu()  # Not compatible
-    and not current_platform.is_neuron()  # neuron has too old torch
 )
 
 if not HAS_TRITON:

From 970d6d0776076f17604077ba4d484cdadd604ceb Mon Sep 17 00:00:00 2001
From: Tyler Michael Smith <tyler@neuralmagic.com>
Date: Mon, 30 Dec 2024 04:22:13 -0500
Subject: [PATCH 027/462] [Build][Kernel] Update CUTLASS to v3.6.0 (#11607)

Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
---
 CMakeLists.txt                                 |  4 ++--
 .../vllm_cutlass_library_extension.py          | 18 +++++++++---------
 csrc/quantization/machete/generate.py          |  8 ++++----
 .../machete/machete_collective_builder.cuh     | 10 ++++------
 csrc/quantization/machete/machete_mainloop.cuh | 11 ++++-------
 .../machete/machete_prepacked_layout.cuh       |  5 ++---
 6 files changed, 25 insertions(+), 31 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 83c8033434f3b..3206d76125545 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -223,13 +223,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     FetchContent_Declare(
         cutlass
         GIT_REPOSITORY https://github.com/nvidia/cutlass.git
-        GIT_TAG 8aa95dbb888be6d81c6fbf7169718c5244b53227
+        GIT_TAG v3.6.0
         GIT_PROGRESS TRUE
 
         # Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
         # Important: If GIT_SHALLOW is enabled then GIT_TAG works only with branch names and tags.
         # So if the GIT_TAG above is updated to a commit hash, GIT_SHALLOW must be set to FALSE
-        GIT_SHALLOW FALSE
+        GIT_SHALLOW TRUE
     )
   endif()
   FetchContent_MakeAvailable(cutlass)
diff --git a/csrc/cutlass_extensions/vllm_cutlass_library_extension.py b/csrc/cutlass_extensions/vllm_cutlass_library_extension.py
index a5beea1a35e49..b401736c9824b 100644
--- a/csrc/cutlass_extensions/vllm_cutlass_library_extension.py
+++ b/csrc/cutlass_extensions/vllm_cutlass_library_extension.py
@@ -14,9 +14,9 @@ class VLLMDataType(enum.Enum):
 
 
 class MixedInputKernelScheduleType(enum.Enum):
-    TmaWarpSpecializedMixedInput = enum_auto()
-    TmaWarpSpecializedPingpongMixedInput = enum_auto()
-    TmaWarpSpecializedCooperativeMixedInput = enum_auto()
+    TmaWarpSpecialized = enum_auto()
+    TmaWarpSpecializedPingpong = enum_auto()
+    TmaWarpSpecializedCooperative = enum_auto()
 
 
 VLLMDataTypeNames: Dict[Union[VLLMDataType, DataType], str] = {
@@ -68,11 +68,11 @@ class MixedInputKernelScheduleType(enum.Enum):
     MixedInputKernelScheduleType, KernelScheduleType], str] = {
         **KernelScheduleTag,  # type: ignore
         **{
-            MixedInputKernelScheduleType.TmaWarpSpecializedMixedInput:
-            "cutlass::gemm::KernelTmaWarpSpecializedMixedInput",
-            MixedInputKernelScheduleType.TmaWarpSpecializedPingpongMixedInput:
-            "cutlass::gemm::KernelTmaWarpSpecializedPingpongMixedInput",
-            MixedInputKernelScheduleType.TmaWarpSpecializedCooperativeMixedInput:
-            "cutlass::gemm::KernelTmaWarpSpecializedCooperativeMixedInput",
+            MixedInputKernelScheduleType.TmaWarpSpecialized:
+            "cutlass::gemm::KernelTmaWarpSpecialized",
+            MixedInputKernelScheduleType.TmaWarpSpecializedPingpong:
+            "cutlass::gemm::KernelTmaWarpSpecializedPingpong",
+            MixedInputKernelScheduleType.TmaWarpSpecializedCooperative:
+            "cutlass::gemm::KernelTmaWarpSpecializedCooperative",
         }
     }
diff --git a/csrc/quantization/machete/generate.py b/csrc/quantization/machete/generate.py
index ac63afe79a255..2df4d181902f8 100644
--- a/csrc/quantization/machete/generate.py
+++ b/csrc/quantization/machete/generate.py
@@ -189,7 +189,7 @@
   {{DataTypeTag[t.b_group_zeropoint]}}, // GroupZeroT
   {{DataTypeTag[t.b_channel_scale]}}, // ChannelScaleT
   {{DataTypeTag[t.a_token_scale]}}, // TokenScaleT
-  cutlass::gemm::KernelTmaWarpSpecializedCooperativeMixedInput,
+  cutlass::gemm::KernelTmaWarpSpecializedCooperative,
   Sch>;
 
 {% for sch in schs %}
@@ -223,7 +223,7 @@
         {{DataTypeTag[t.convert]}}, // ElementConvert
         {{DataTypeTag[t.accumulator]}}, // Accumulator
         cutlass::layout::ColumnMajor,
-        cutlass::gemm::KernelTmaWarpSpecializedCooperativeMixedInput>
+        cutlass::gemm::KernelTmaWarpSpecializedCooperative>
     >(args.B); 
   }
   {%- endfor %}
@@ -239,7 +239,7 @@
 }; // namespace machete
 """
 
-TmaMI = MixedInputKernelScheduleType.TmaWarpSpecializedCooperativeMixedInput
+TmaMI = MixedInputKernelScheduleType.TmaWarpSpecializedCooperative
 TmaCoop = EpilogueScheduleType.TmaWarpSpecializedCooperative
 
 
@@ -300,7 +300,7 @@ def generate_sch_sig(schedule_config: ScheduleConfig) -> str:
 # mostly unique shorter sch_sig
 def generate_terse_sch_sig(schedule_config: ScheduleConfig) -> str:
     kernel_terse_names_replace = {
-        "KernelTmaWarpSpecializedCooperativeMixedInput_": "TmaMI_",
+        "KernelTmaWarpSpecializedCooperative": "TmaMI_",
         "TmaWarpSpecializedCooperative_": "TmaCoop_",
         "StreamKScheduler": "streamK",
     }
diff --git a/csrc/quantization/machete/machete_collective_builder.cuh b/csrc/quantization/machete/machete_collective_builder.cuh
index a74cf8b2dd455..ee825583dee1a 100644
--- a/csrc/quantization/machete/machete_collective_builder.cuh
+++ b/csrc/quantization/machete/machete_collective_builder.cuh
@@ -18,16 +18,14 @@ struct VLLMCollectiveBuilder<
     ElementAccumulator, TileShape_MNK, ClusterShape_MNK, StageCountType,
     KernelScheduleType,
     cute::enable_if_t<(
+        cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecialized> ||
+        cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecializedPingpong> ||
         cute::is_same_v<KernelScheduleType,
-                        KernelTmaWarpSpecializedMixedInput> ||
-        cute::is_same_v<KernelScheduleType,
-                        KernelTmaWarpSpecializedPingpongMixedInput> ||
-        cute::is_same_v<KernelScheduleType,
-                        KernelTmaWarpSpecializedCooperativeMixedInput>)>> {
+                        KernelTmaWarpSpecializedCooperative>)>> {
   using CollectiveOp = machete::MacheteCollectiveMma<
       ElementPairA_, GmemLayoutA_, AlignmentA, ElementPairB_, GmemLayoutB_,
       AlignmentB, ElementAccumulator, TileShape_MNK, ClusterShape_MNK,
       StageCountType, KernelScheduleType>;
 };
 
-};  // namespace cutlass::gemm::collective
\ No newline at end of file
+};  // namespace cutlass::gemm::collective
diff --git a/csrc/quantization/machete/machete_mainloop.cuh b/csrc/quantization/machete/machete_mainloop.cuh
index 816f33a1078e5..4071b19a3564d 100644
--- a/csrc/quantization/machete/machete_mainloop.cuh
+++ b/csrc/quantization/machete/machete_mainloop.cuh
@@ -66,13 +66,11 @@ struct MacheteCollectiveMma {
   using Schedule = KernelScheduleType;
   static_assert(
       cute::is_same_v<Schedule, KernelTmaWarpSpecialized> ||
-          cute::is_same_v<Schedule, KernelTmaWarpSpecializedMixedInput> ||
+          cute::is_same_v<Schedule, KernelTmaWarpSpecialized> ||
+          cute::is_same_v<Schedule, KernelTmaWarpSpecializedPingpong> ||
           cute::is_same_v<Schedule, KernelTmaWarpSpecializedPingpong> ||
-          cute::is_same_v<Schedule,
-                          KernelTmaWarpSpecializedPingpongMixedInput> ||
           cute::is_same_v<Schedule, KernelTmaWarpSpecializedCooperative> ||
-          cute::is_same_v<Schedule,
-                          KernelTmaWarpSpecializedCooperativeMixedInput>,
+          cute::is_same_v<Schedule, KernelTmaWarpSpecializedCooperative>,
       "KernelSchedule must be one of the warp specialized policies");
 
  public:
@@ -113,8 +111,7 @@ struct MacheteCollectiveMma {
   // For coop schedules we have two warp groups cooperatively issuing wgmma
   // instructions so we use 2 atoms along the M dim (one for each warpgroup)
   using AtomLayoutMNK = cute::conditional_t<
-      cute::is_same_v<KernelScheduleType,
-                      KernelTmaWarpSpecializedCooperativeMixedInput>,
+      cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecializedCooperative>,
       Layout<Shape<_2, _1, _1>>, Layout<Shape<_1, _1, _1>>>;
 
   using TiledMma = decltype(cute::make_tiled_mma(
diff --git a/csrc/quantization/machete/machete_prepacked_layout.cuh b/csrc/quantization/machete/machete_prepacked_layout.cuh
index 680a858a893c1..81aaa6c4f3a28 100644
--- a/csrc/quantization/machete/machete_prepacked_layout.cuh
+++ b/csrc/quantization/machete/machete_prepacked_layout.cuh
@@ -98,8 +98,7 @@ struct PrepackedLayoutBTemplate {
   // For coop schedules we have two warp groups cooperatively issuing wgmma
   // instructions so we use 2 atoms along the M dim (one for each warpgroup)
   using AtomLayoutMNK = cute::conditional_t<
-      cute::is_same_v<KernelSchedule,
-                      KernelTmaWarpSpecializedCooperativeMixedInput>,
+      cute::is_same_v<KernelSchedule, KernelTmaWarpSpecializedCooperative>,
       Layout<Shape<_2, _1, _1>>, Layout<Shape<_1, _1, _1>>>;
 
   using TiledMma = decltype(cute::make_tiled_mma(
@@ -247,4 +246,4 @@ struct PrepackedLayoutBTemplate {
   }
 };
 
-};  // namespace machete
\ No newline at end of file
+};  // namespace machete

From 5dbf854553cb6ac97f0c633ed36ba64e0fc9bb29 Mon Sep 17 00:00:00 2001
From: "Li, Jiang" <jiang1.li@intel.com>
Date: Mon, 30 Dec 2024 18:17:04 +0800
Subject: [PATCH 028/462] [CI/Build][CPU] Fix CPU CI by lazy importing triton
 FP8 kernels (#11618)

Signed-off-by: jiang1.li <jiang1.li@intel.com>
---
 vllm/model_executor/layers/quantization/fp8.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index 7f779ac8d3b3e..2fe22903a385b 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -15,8 +15,6 @@
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig, QuantizeMethodBase)
 from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
-from vllm.model_executor.layers.quantization.utils.fp8_utils import (
-    apply_w8a8_block_fp8_linear)
 from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
     apply_fp8_marlin_linear, prepare_fp8_layer_for_marlin)
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
@@ -337,6 +335,9 @@ def apply(self,
                 size_k=layer.input_size_per_partition,
                 bias=bias)
 
+        # Note: lazy import to avoid triton import error.
+        from vllm.model_executor.layers.quantization.utils.fp8_utils import (
+            apply_w8a8_block_fp8_linear)
         if self.block_quant:
             assert self.quant_config.weight_block_size is not None
             return apply_w8a8_block_fp8_linear(

From b12e87f942eb7740c17ab546b964bc327afdda37 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Mon, 30 Dec 2024 20:24:45 +0800
Subject: [PATCH 029/462] [platforms] enable platform plugins (#11602)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 .buildkite/test-pipeline.yaml                 |  25 +-
 docs/source/design/plugin_system.md           |   6 +-
 tests/conftest.py                             |   2 +-
 tests/kernels/test_attention_selector.py      |  16 +-
 .../plugins/vllm_add_dummy_platform/setup.py  |  11 +
 .../vllm_add_dummy_platform/__init__.py       |   5 +
 .../vllm_add_dummy_platform/dummy_platform.py |   5 +
 tests/plugins_tests/test_platform_plugins.py  |  16 +
 vllm/config.py                                |  15 +-
 vllm/distributed/parallel_state.py            |   3 +-
 vllm/engine/arg_utils.py                      |   2 +-
 vllm/executor/ray_utils.py                    |   2 +-
 .../guided_decoding/__init__.py               |   3 +-
 vllm/model_executor/models/registry.py        |   2 +-
 vllm/model_executor/utils.py                  |   4 +-
 vllm/platforms/__init__.py                    | 320 ++++++++++++------
 vllm/plugins/__init__.py                      |  72 ++--
 vllm/spec_decode/metrics.py                   |   2 +-
 vllm/usage/usage_lib.py                       |   2 +-
 vllm/utils.py                                 |   8 +-
 vllm/worker/model_runner_base.py              |   5 +-
 vllm/worker/multi_step_model_runner.py        |   1 +
 vllm/worker/worker_base.py                    |  14 +-
 23 files changed, 360 insertions(+), 181 deletions(-)
 create mode 100644 tests/plugins/vllm_add_dummy_platform/setup.py
 create mode 100644 tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py
 create mode 100644 tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py
 create mode 100644 tests/plugins_tests/test_platform_plugins.py

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index b563c96343f92..bee968b4d2e43 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -106,14 +106,12 @@ steps:
   source_file_dependencies:
   - vllm/
   commands:
-  - pip install -e ./plugins/vllm_add_dummy_model
   - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py
   - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
   - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
   - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
   - pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
   - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py
-  - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
   - pytest -v -s entrypoints/test_chat_utils.py
   - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
 
@@ -333,8 +331,6 @@ steps:
   - vllm/
   - tests/models
   commands:
-    - pip install -e ./plugins/vllm_add_dummy_model
-    - pytest -v -s models/test_oot_registration.py # it needs a clean process
     - pytest -v -s models/test_registry.py
     - pytest -v -s models/test_initialization.py
 
@@ -469,11 +465,28 @@ steps:
   - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)'
   - pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)'
   - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
-  - pip install -e ./plugins/vllm_add_dummy_model
-  - pytest -v -s distributed/test_distributed_oot.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py
 
+- label: Plugin Tests (2 GPUs) # 40min
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  fast_check: true
+  source_file_dependencies:
+  - vllm/plugins/
+  - tests/plugins/
+  commands:
+  # begin platform plugin tests, all the code in-between runs on dummy platform
+  - pip install -e ./plugins/vllm_add_dummy_platform
+  - pytest -v -s plugins_tests/test_platform_plugins.py
+  - pip uninstall vllm_add_dummy_platform -y
+  # end platform plugin tests
+  # other tests continue here:
+  - pip install -e ./plugins/vllm_add_dummy_model
+  - pytest -v -s distributed/test_distributed_oot.py
+  - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
+  - pytest -v -s models/test_oot_registration.py # it needs a clean process
+
 - label: Multi-step Tests (4 GPUs) # 36min
   working_dir: "/vllm-workspace/tests"
   num_gpus: 4
diff --git a/docs/source/design/plugin_system.md b/docs/source/design/plugin_system.md
index 79aff757518f2..225030885f629 100644
--- a/docs/source/design/plugin_system.md
+++ b/docs/source/design/plugin_system.md
@@ -41,9 +41,11 @@ Every plugin has three parts:
 2. **Plugin name**: The name of the plugin. This is the value in the dictionary of the `entry_points` dictionary. In the example above, the plugin name is `register_dummy_model`. Plugins can be filtered by their names using the `VLLM_PLUGINS` environment variable. To load only a specific plugin, set `VLLM_PLUGINS` to the plugin name.
 3. **Plugin value**: The fully qualified name of the function to register in the plugin system. In the example above, the plugin value is `vllm_add_dummy_model:register`, which refers to a function named `register` in the `vllm_add_dummy_model` module.
 
-## What Can Plugins Do?
+## Types of supported plugins
 
-Currently, the primary use case for plugins is to register custom, out-of-the-tree models into vLLM. This is done by calling `ModelRegistry.register_model` to register the model. In the future, the plugin system may be extended to support more features, such as swapping in custom implementations for certain classes in vLLM.
+- **General plugins** (with group name `vllm.general_plugins`): The primary use case for these plugins is to register custom, out-of-the-tree models into vLLM. This is done by calling `ModelRegistry.register_model` to register the model inside the plugin function.
+
+- **Platform plugins** (with group name `vllm.platform_plugins`): The primary use case for these plugins is to register custom, out-of-the-tree platforms into vLLM. The plugin function should return `None` when the platform is not supported in the current environment, or the platform class's fully qualified name when the platform is supported.
 
 ## Guidelines for Writing Plugins
 
diff --git a/tests/conftest.py b/tests/conftest.py
index 4e939221329cd..6e2f75e33654f 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -31,7 +31,6 @@
                          to_enc_dec_tuple_list, zip_enc_dec_prompts)
 from vllm.logger import init_logger
 from vllm.outputs import RequestOutput
-from vllm.platforms import current_platform
 from vllm.sampling_params import BeamSearchParams
 from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, cuda_device_count_stateless,
                         identity)
@@ -242,6 +241,7 @@ def video_assets() -> _VideoAssets:
 class HfRunner:
 
     def wrap_device(self, x: _T, device: Optional[str] = None) -> _T:
+        from vllm.platforms import current_platform
         if x is None or isinstance(x, (bool, )):
             return x
 
diff --git a/tests/kernels/test_attention_selector.py b/tests/kernels/test_attention_selector.py
index d37f95d48d5b2..916cc2efa3895 100644
--- a/tests/kernels/test_attention_selector.py
+++ b/tests/kernels/test_attention_selector.py
@@ -5,7 +5,10 @@
 
 from tests.kernels.utils import override_backend_env_variable
 from vllm.attention.selector import which_attn_to_use
-from vllm.platforms import cpu, cuda, openvino, rocm
+from vllm.platforms.cpu import CpuPlatform
+from vllm.platforms.cuda import CudaPlatform
+from vllm.platforms.openvino import OpenVinoPlatform
+from vllm.platforms.rocm import RocmPlatform
 from vllm.utils import STR_FLASH_ATTN_VAL, STR_INVALID_VAL
 
 
@@ -20,26 +23,23 @@ def test_env(name: str, device: str, monkeypatch):
     override_backend_env_variable(monkeypatch, name)
 
     if device == "cpu":
-        with patch("vllm.attention.selector.current_platform",
-                   cpu.CpuPlatform()):
+        with patch("vllm.attention.selector.current_platform", CpuPlatform()):
             backend = which_attn_to_use(16, torch.float16, torch.float16, 16,
                                         False)
         assert backend.name == "TORCH_SDPA"
     elif device == "hip":
-        with patch("vllm.attention.selector.current_platform",
-                   rocm.RocmPlatform()):
+        with patch("vllm.attention.selector.current_platform", RocmPlatform()):
             backend = which_attn_to_use(16, torch.float16, torch.float16, 16,
                                         False)
         assert backend.name == "ROCM_FLASH"
     elif device == "openvino":
         with patch("vllm.attention.selector.current_platform",
-                   openvino.OpenVinoPlatform()):
+                   OpenVinoPlatform()):
             backend = which_attn_to_use(16, torch.float16, torch.float16, 16,
                                         False)
         assert backend.name == "OPENVINO"
     else:
-        with patch("vllm.attention.selector.current_platform",
-                   cuda.CudaPlatform()):
+        with patch("vllm.attention.selector.current_platform", CudaPlatform()):
             backend = which_attn_to_use(16, torch.float16, torch.float16, 16,
                                         False)
         assert backend.name == name
diff --git a/tests/plugins/vllm_add_dummy_platform/setup.py b/tests/plugins/vllm_add_dummy_platform/setup.py
new file mode 100644
index 0000000000000..31639906898db
--- /dev/null
+++ b/tests/plugins/vllm_add_dummy_platform/setup.py
@@ -0,0 +1,11 @@
+from setuptools import setup
+
+setup(
+    name='vllm_add_dummy_platform',
+    version='0.1',
+    packages=['vllm_add_dummy_platform'],
+    entry_points={
+        'vllm.platform_plugins': [
+            "dummy_platform_plugin = vllm_add_dummy_platform:dummy_platform_plugin"  # noqa
+        ]
+    })
diff --git a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py
new file mode 100644
index 0000000000000..594cef520a7de
--- /dev/null
+++ b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py
@@ -0,0 +1,5 @@
+from typing import Optional
+
+
+def dummy_platform_plugin() -> Optional[str]:
+    return "vllm_add_dummy_platform.dummy_platform.DummyPlatform"
diff --git a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py
new file mode 100644
index 0000000000000..fde93142f1103
--- /dev/null
+++ b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py
@@ -0,0 +1,5 @@
+from vllm.platforms.cuda import CudaPlatform
+
+
+class DummyPlatform(CudaPlatform):
+    device_name = "DummyDevice"
diff --git a/tests/plugins_tests/test_platform_plugins.py b/tests/plugins_tests/test_platform_plugins.py
new file mode 100644
index 0000000000000..0d27cf9f152e0
--- /dev/null
+++ b/tests/plugins_tests/test_platform_plugins.py
@@ -0,0 +1,16 @@
+def test_platform_plugins():
+    # simulate workload by running an example
+    import runpy
+    current_file = __file__
+    import os
+    example_file = os.path.join(
+        os.path.dirname(os.path.dirname(os.path.dirname(current_file))),
+        "examples", "offline_inference.py")
+    runpy.run_path(example_file)
+
+    # check if the plugin is loaded correctly
+    from vllm.platforms import _init_trace, current_platform
+    assert current_platform.device_name == "DummyDevice", (
+        f"Expected DummyDevice, got {current_platform.device_name}, "
+        "possibly because current_platform is imported before the plugin"
+        f" is loaded. The first import:\n{_init_trace}")
diff --git a/vllm/config.py b/vllm/config.py
index 765a46e6aeee3..e72c53b6130d0 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -22,7 +22,7 @@
 from vllm.model_executor.layers.quantization import (QUANTIZATION_METHODS,
                                                      get_quantization_config)
 from vllm.model_executor.models import ModelRegistry
-from vllm.platforms import current_platform, interface
+from vllm.platforms import CpuArchEnum
 from vllm.tracing import is_otel_available, otel_import_error_traceback
 from vllm.transformers_utils.config import (
     ConfigFormat, get_config, get_hf_image_processor_config,
@@ -349,6 +349,7 @@ def __init__(self,
         self.is_hybrid = self._init_is_hybrid()
         self.has_inner_state = self._init_has_inner_state()
 
+        from vllm.platforms import current_platform
         if current_platform.is_neuron():
             self.override_neuron_config = override_neuron_config
         else:
@@ -589,6 +590,7 @@ def _verify_quantization(self) -> None:
                 raise ValueError(
                     f"Unknown quantization method: {self.quantization}. Must "
                     f"be one of {supported_quantization}.")
+            from vllm.platforms import current_platform
             current_platform.verify_quantization(self.quantization)
             if self.quantization not in optimized_quantization_methods:
                 logger.warning(
@@ -644,6 +646,7 @@ def verify_async_output_proc(self, parallel_config, speculative_config,
 
         # Reminder: Please update docs/source/usage/compatibility_matrix.md
         # If the feature combo become valid
+        from vllm.platforms import current_platform
         if not current_platform.is_async_output_supported(self.enforce_eager):
             logger.warning(
                 "Async output processing is not supported on the "
@@ -1012,6 +1015,7 @@ def _verify_args(self) -> None:
             raise ValueError(
                 "GPU memory utilization must be less than 1.0. Got "
                 f"{self.gpu_memory_utilization}.")
+        from vllm.platforms import current_platform
         if (current_platform.is_cuda() and self.block_size is not None
                 and self.block_size > 32):
             raise ValueError("CUDA Paged Attention kernel only supports "
@@ -1279,6 +1283,7 @@ def __post_init__(self) -> None:
                                  f"distributed executor backend "
                                  f"'{self.distributed_executor_backend}'.")
         ray_only_devices = ["tpu", "hpu"]
+        from vllm.platforms import current_platform
         if (current_platform.device_type in ray_only_devices
                 and self.world_size > 1):
             if self.distributed_executor_backend is None:
@@ -1327,7 +1332,7 @@ def use_ray(self) -> bool:
     def _verify_args(self) -> None:
         # Lazy import to avoid circular import
         from vllm.executor.executor_base import ExecutorBase
-
+        from vllm.platforms import current_platform
         if self.distributed_executor_backend not in (
                 "ray", "mp", None) and not (isinstance(
                     self.distributed_executor_backend, type) and issubclass(
@@ -1528,6 +1533,7 @@ def compute_hash(self) -> str:
     def __init__(self, device: str = "auto") -> None:
         if device == "auto":
             # Automated device type detection
+            from vllm.platforms import current_platform
             self.device_type = current_platform.device_type
             if not self.device_type:
                 raise RuntimeError("Failed to infer device type")
@@ -2241,9 +2247,10 @@ def _get_and_verify_dtype(
             else:
                 torch_dtype = config_dtype
 
+            from vllm.platforms import current_platform
             if (current_platform.is_cpu()
                     and current_platform.get_cpu_architecture()
-                    == interface.CpuArchEnum.POWERPC
+                    == CpuArchEnum.POWERPC
                     and (config_dtype == torch.float16
                          or config_dtype == torch.float32)):
                 logger.info(
@@ -3083,6 +3090,7 @@ def _get_quantization_config(
             model_config: ModelConfig,
             load_config: LoadConfig) -> Optional[QuantizationConfig]:
         """Get the quantization config."""
+        from vllm.platforms import current_platform
         if model_config.quantization is not None:
             from vllm.model_executor.model_loader.weight_utils import (
                 get_quant_config)
@@ -3145,6 +3153,7 @@ def __post_init__(self):
             self.quant_config = VllmConfig._get_quantization_config(
                 self.model_config, self.load_config)
 
+        from vllm.platforms import current_platform
         if self.scheduler_config is not None and \
             self.model_config is not None and \
             self.scheduler_config.chunked_prefill_enabled and \
diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index 5b9236f8c56b6..e6768467f4c27 100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -39,7 +39,6 @@
 import vllm.envs as envs
 from vllm.distributed.utils import StatelessProcessGroup
 from vllm.logger import init_logger
-from vllm.platforms import current_platform
 from vllm.utils import direct_register_custom_op, supports_custom_op
 
 if TYPE_CHECKING:
@@ -194,6 +193,7 @@ def __init__(
         assert self.cpu_group is not None
         assert self.device_group is not None
 
+        from vllm.platforms import current_platform
         if current_platform.is_cuda_alike():
             self.device = torch.device(f"cuda:{local_rank}")
         else:
@@ -1188,6 +1188,7 @@ def cleanup_dist_env_and_memory(shutdown_ray: bool = False):
         import ray  # Lazy import Ray
         ray.shutdown()
     gc.collect()
+    from vllm.platforms import current_platform
     if not current_platform.is_cpu():
         torch.cuda.empty_cache()
 
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 21966d003c7ef..69c7c5077fe32 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -18,7 +18,6 @@
 from vllm.executor.executor_base import ExecutorBase
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
-from vllm.platforms import current_platform
 from vllm.transformers_utils.utils import check_gguf_file
 from vllm.usage.usage_lib import UsageContext
 from vllm.utils import FlexibleArgumentParser, StoreBoolean
@@ -1094,6 +1093,7 @@ def create_engine_config(self,
                 use_sliding_window = (model_config.get_sliding_window()
                                       is not None)
                 use_spec_decode = self.speculative_model is not None
+                from vllm.platforms import current_platform
                 if (is_gpu and not use_sliding_window and not use_spec_decode
                         and not self.enable_lora
                         and not self.enable_prompt_adapter
diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py
index 426aa1b5c728f..8d766bad1a072 100644
--- a/vllm/executor/ray_utils.py
+++ b/vllm/executor/ray_utils.py
@@ -8,7 +8,6 @@
 from vllm.config import ParallelConfig
 from vllm.executor.msgspec_utils import decode_hook, encode_hook
 from vllm.logger import init_logger
-from vllm.platforms import current_platform
 from vllm.sequence import ExecuteModelRequest, IntermediateTensors
 from vllm.utils import get_ip
 from vllm.worker.worker_base import WorkerWrapperBase
@@ -229,6 +228,7 @@ def initialize_ray_cluster(
             the default Ray cluster address.
     """
     assert_ray_available()
+    from vllm.platforms import current_platform
 
     # Connect to a ray cluster.
     if current_platform.is_rocm() or current_platform.is_xpu():
diff --git a/vllm/model_executor/guided_decoding/__init__.py b/vllm/model_executor/guided_decoding/__init__.py
index 694c5b68b1cbd..18b435a42544a 100644
--- a/vllm/model_executor/guided_decoding/__init__.py
+++ b/vllm/model_executor/guided_decoding/__init__.py
@@ -6,7 +6,7 @@
 from vllm.model_executor.guided_decoding.utils import (
     convert_lark_to_gbnf, grammar_is_likely_lark,
     has_lmf_unsupported_json_features, has_xgrammar_unsupported_json_features)
-from vllm.platforms import CpuArchEnum, current_platform
+from vllm.platforms import CpuArchEnum
 
 if TYPE_CHECKING:
     from transformers import PreTrainedTokenizer
@@ -39,6 +39,7 @@ def maybe_backend_fallback(
 
     if guided_params.backend == "xgrammar":
         # xgrammar only has x86 wheels for linux, fallback to outlines
+        from vllm.platforms import current_platform
         if current_platform.get_cpu_architecture() is not CpuArchEnum.X86:
             logger.warning("xgrammar is only supported on x86 CPUs. "
                            "Falling back to use outlines instead.")
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index 67268eb4bb85f..07f4b5a3b3bc8 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -18,7 +18,6 @@
 import torch.nn as nn
 
 from vllm.logger import init_logger
-from vllm.platforms import current_platform
 
 from .interfaces import (has_inner_state, is_attention_free, is_hybrid,
                          supports_cross_encoding, supports_multimodal,
@@ -273,6 +272,7 @@ def _try_load_model_cls(
     model_arch: str,
     model: _BaseRegisteredModel,
 ) -> Optional[Type[nn.Module]]:
+    from vllm.platforms import current_platform
     current_platform.verify_model_arch(model_arch)
     try:
         return model.load_model_cls()
diff --git a/vllm/model_executor/utils.py b/vllm/model_executor/utils.py
index 39ead08c238ce..6f1cc9d5e0c30 100644
--- a/vllm/model_executor/utils.py
+++ b/vllm/model_executor/utils.py
@@ -3,10 +3,9 @@
 
 import torch
 
-from vllm.platforms import current_platform
-
 
 def set_random_seed(seed: int) -> None:
+    from vllm.platforms import current_platform
     current_platform.seed_everything(seed)
 
 
@@ -38,6 +37,7 @@ def set_weight_attrs(
         # This sometimes causes OOM errors during model loading. To avoid this,
         # we sync the param tensor after its weight loader is called.
         # TODO(woosuk): Remove this hack once we have a better solution.
+        from vllm.platforms import current_platform
         if current_platform.is_tpu() and key == "weight_loader":
             value = _make_synced_weight_loader(value)
         setattr(weight, key, value)
diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py
index 419237c252ffd..f6ac14446c021 100644
--- a/vllm/platforms/__init__.py
+++ b/vllm/platforms/__init__.py
@@ -1,123 +1,223 @@
+import logging
+import traceback
+from itertools import chain
+from typing import TYPE_CHECKING, Optional
+
+from vllm.plugins import load_plugins_by_group
+from vllm.utils import resolve_obj_by_qualname
+
 from .interface import _Backend  # noqa: F401
-from .interface import CpuArchEnum, Platform, PlatformEnum, UnspecifiedPlatform
+from .interface import CpuArchEnum, Platform, PlatformEnum
 
-current_platform: Platform
+logger = logging.getLogger(__name__)
 
-# NOTE: we don't use `torch.version.cuda` / `torch.version.hip` because
-# they only indicate the build configuration, not the runtime environment.
-# For example, people can install a cuda build of pytorch but run on tpu.
 
-is_tpu = False
-try:
-    # While it's technically possible to install libtpu on a non-TPU machine,
-    # this is a very uncommon scenario. Therefore, we assume that libtpu is
-    # installed if and only if the machine has TPUs.
-    import libtpu  # noqa: F401
-    is_tpu = True
-except Exception:
-    pass
+def tpu_platform_plugin() -> Optional[str]:
+    is_tpu = False
+    try:
+        # While it's technically possible to install libtpu on a
+        # non-TPU machine, this is a very uncommon scenario. Therefore,
+        # we assume that libtpu is installed if and only if the machine
+        # has TPUs.
+        import libtpu  # noqa: F401
+        is_tpu = True
+    except Exception:
+        pass
+
+    return "vllm.platforms.tpu.TpuPlatform" if is_tpu else None
 
-is_cuda = False
 
-try:
-    import pynvml
-    pynvml.nvmlInit()
+def cuda_platform_plugin() -> Optional[str]:
+    is_cuda = False
+
     try:
-        if pynvml.nvmlDeviceGetCount() > 0:
+        import pynvml
+        pynvml.nvmlInit()
+        try:
+            if pynvml.nvmlDeviceGetCount() > 0:
+                is_cuda = True
+        finally:
+            pynvml.nvmlShutdown()
+    except Exception:
+        # CUDA is supported on Jetson, but NVML may not be.
+        import os
+
+        def cuda_is_jetson() -> bool:
+            return os.path.isfile("/etc/nv_tegra_release") \
+                or os.path.exists("/sys/class/tegra-firmware")
+
+        if cuda_is_jetson():
             is_cuda = True
-    finally:
-        pynvml.nvmlShutdown()
-except Exception:
-    # CUDA is supported on Jetson, but NVML may not be.
-    import os
 
-    def cuda_is_jetson() -> bool:
-        return os.path.isfile("/etc/nv_tegra_release") \
-            or os.path.exists("/sys/class/tegra-firmware")
+    return "vllm.platforms.cuda.CudaPlatform" if is_cuda else None
+
+
+def rocm_platform_plugin() -> Optional[str]:
+    is_rocm = False
+
+    try:
+        import amdsmi
+        amdsmi.amdsmi_init()
+        try:
+            if len(amdsmi.amdsmi_get_processor_handles()) > 0:
+                is_rocm = True
+        finally:
+            amdsmi.amdsmi_shut_down()
+    except Exception:
+        pass
+
+    return "vllm.platforms.rocm.RocmPlatform" if is_rocm else None
+
+
+def hpu_platform_plugin() -> Optional[str]:
+    is_hpu = False
+    try:
+        from importlib import util
+        is_hpu = util.find_spec('habana_frameworks') is not None
+    except Exception:
+        pass
+
+    return "vllm.platforms.hpu.HpuPlatform" if is_hpu else None
+
+
+def xpu_platform_plugin() -> Optional[str]:
+    is_xpu = False
+
+    try:
+        # installed IPEX if the machine has XPUs.
+        import intel_extension_for_pytorch  # noqa: F401
+        import oneccl_bindings_for_pytorch  # noqa: F401
+        import torch
+        if hasattr(torch, 'xpu') and torch.xpu.is_available():
+            is_xpu = True
+    except Exception:
+        pass
+
+    return "vllm.platforms.xpu.XPUPlatform" if is_xpu else None
+
+
+def cpu_platform_plugin() -> Optional[str]:
+    is_cpu = False
+    try:
+        from importlib.metadata import version
+        is_cpu = "cpu" in version("vllm")
+    except Exception:
+        pass
+
+    return "vllm.platforms.cpu.CpuPlatform" if is_cpu else None
+
+
+def neuron_platform_plugin() -> Optional[str]:
+    is_neuron = False
+    try:
+        import transformers_neuronx  # noqa: F401
+        is_neuron = True
+    except ImportError:
+        pass
 
-    if cuda_is_jetson():
-        is_cuda = True
+    return "vllm.platforms.neuron.NeuronPlatform" if is_neuron else None
 
-is_rocm = False
 
-try:
-    import amdsmi
-    amdsmi.amdsmi_init()
+def openvino_platform_plugin() -> Optional[str]:
+    is_openvino = False
     try:
-        if len(amdsmi.amdsmi_get_processor_handles()) > 0:
-            is_rocm = True
-    finally:
-        amdsmi.amdsmi_shut_down()
-except Exception:
-    pass
-
-is_hpu = False
-try:
-    from importlib import util
-    is_hpu = util.find_spec('habana_frameworks') is not None
-except Exception:
-    pass
-
-is_xpu = False
-
-try:
-    # installed IPEX if the machine has XPUs.
-    import intel_extension_for_pytorch  # noqa: F401
-    import oneccl_bindings_for_pytorch  # noqa: F401
-    import torch
-    if hasattr(torch, 'xpu') and torch.xpu.is_available():
-        is_xpu = True
-except Exception:
-    pass
-
-is_cpu = False
-try:
-    from importlib.metadata import version
-    is_cpu = "cpu" in version("vllm")
-except Exception:
-    pass
-
-is_neuron = False
-try:
-    import transformers_neuronx  # noqa: F401
-    is_neuron = True
-except ImportError:
-    pass
-
-is_openvino = False
-try:
-    from importlib.metadata import version
-    is_openvino = "openvino" in version("vllm")
-except Exception:
-    pass
-
-if is_tpu:
-    # people might install pytorch built with cuda but run on tpu
-    # so we need to check tpu first
-    from .tpu import TpuPlatform
-    current_platform = TpuPlatform()
-elif is_cuda:
-    from .cuda import CudaPlatform
-    current_platform = CudaPlatform()
-elif is_rocm:
-    from .rocm import RocmPlatform
-    current_platform = RocmPlatform()
-elif is_hpu:
-    from .hpu import HpuPlatform
-    current_platform = HpuPlatform()
-elif is_xpu:
-    from .xpu import XPUPlatform
-    current_platform = XPUPlatform()
-elif is_cpu:
-    from .cpu import CpuPlatform
-    current_platform = CpuPlatform()
-elif is_neuron:
-    from .neuron import NeuronPlatform
-    current_platform = NeuronPlatform()
-elif is_openvino:
-    from .openvino import OpenVinoPlatform
-    current_platform = OpenVinoPlatform()
-else:
-    current_platform = UnspecifiedPlatform()
-
-__all__ = ['Platform', 'PlatformEnum', 'current_platform', 'CpuArchEnum']
+        from importlib.metadata import version
+        is_openvino = "openvino" in version("vllm")
+    except Exception:
+        pass
+
+    return "vllm.platforms.openvino.OpenVinoPlatform" if is_openvino else None
+
+
+builtin_platform_plugins = {
+    'tpu': tpu_platform_plugin,
+    'cuda': cuda_platform_plugin,
+    'rocm': rocm_platform_plugin,
+    'hpu': hpu_platform_plugin,
+    'xpu': xpu_platform_plugin,
+    'cpu': cpu_platform_plugin,
+    'neuron': neuron_platform_plugin,
+    'openvino': openvino_platform_plugin,
+}
+
+
+def resolve_current_platform_cls_qualname() -> str:
+    platform_plugins = load_plugins_by_group('vllm.platform_plugins')
+
+    activated_plugins = []
+
+    for name, func in chain(builtin_platform_plugins.items(),
+                            platform_plugins.items()):
+        try:
+            assert callable(func)
+            platform_cls_qualname = func()
+            if platform_cls_qualname is not None:
+                activated_plugins.append(name)
+        except Exception:
+            pass
+
+    activated_builtin_plugins = list(
+        set(activated_plugins) & set(builtin_platform_plugins.keys()))
+    activated_oot_plugins = list(
+        set(activated_plugins) & set(platform_plugins.keys()))
+
+    if len(activated_oot_plugins) >= 2:
+        raise RuntimeError(
+            "Only one platform plugin can be activated, but got: "
+            f"{activated_oot_plugins}")
+    elif len(activated_oot_plugins) == 1:
+        platform_cls_qualname = platform_plugins[activated_oot_plugins[0]]()
+        logger.info("Platform plugin %s is activated",
+                    activated_oot_plugins[0])
+    elif len(activated_builtin_plugins) >= 2:
+        raise RuntimeError(
+            "Only one platform plugin can be activated, but got: "
+            f"{activated_builtin_plugins}")
+    elif len(activated_builtin_plugins) == 1:
+        platform_cls_qualname = builtin_platform_plugins[
+            activated_builtin_plugins[0]]()
+        logger.info("Automatically detected platform %s.",
+                    activated_builtin_plugins[0])
+    else:
+        platform_cls_qualname = "vllm.interface.UnspecifiedPlatform"
+        logger.info(
+            "No platform detected, vLLM is running on UnspecifiedPlatform")
+    return platform_cls_qualname
+
+
+_current_platform = None
+_init_trace: str = ''
+
+if TYPE_CHECKING:
+    current_platform: Platform
+
+
+def __getattr__(name: str):
+    if name == 'current_platform':
+        # lazy init current_platform.
+        # 1. out-of-tree platform plugins need `from vllm.platforms import
+        #    Platform` so that they can inherit `Platform` class. Therefore,
+        #    we cannot resolve `current_platform` during the import of
+        #    `vllm.platforms`.
+        # 2. when users use out-of-tree platform plugins, they might run
+        #    `import vllm`, some vllm internal code might access
+        #    `current_platform` during the import, and we need to make sure
+        #    `current_platform` is only resolved after the plugins are loaded
+        #    (we have tests for this, if any developer violate this, they will
+        #    see the test failures).
+        global _current_platform
+        if _current_platform is None:
+            platform_cls_qualname = resolve_current_platform_cls_qualname()
+            _current_platform = resolve_obj_by_qualname(
+                platform_cls_qualname)()
+            global _init_trace
+            _init_trace = "".join(traceback.format_stack())
+        return _current_platform
+    else:
+        return globals()[name]
+
+
+__all__ = [
+    'Platform', 'PlatformEnum', 'current_platform', 'CpuArchEnum',
+    "_init_trace"
+]
diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py
index 17f604ea0e202..c50eb2cef4cd5 100644
--- a/vllm/plugins/__init__.py
+++ b/vllm/plugins/__init__.py
@@ -1,10 +1,10 @@
 import logging
 import os
+from typing import Callable, Dict
 
 import torch
 
 import vllm.envs as envs
-from vllm.platforms import current_platform
 
 logger = logging.getLogger(__name__)
 
@@ -12,6 +12,39 @@
 plugins_loaded = False
 
 
+def load_plugins_by_group(group: str) -> Dict[str, Callable]:
+    import sys
+    if sys.version_info < (3, 10):
+        from importlib_metadata import entry_points
+    else:
+        from importlib.metadata import entry_points
+
+    allowed_plugins = envs.VLLM_PLUGINS
+
+    discovered_plugins = entry_points(group=group)
+    if len(discovered_plugins) == 0:
+        logger.debug("No plugins for group %s found.", group)
+        return {}
+    logger.info("Available plugins for group %s:", group)
+    for plugin in discovered_plugins:
+        logger.info("name=%s, value=%s", plugin.name, plugin.value)
+    if allowed_plugins is None:
+        logger.info("all available plugins for group %s will be loaded.",
+                    group)
+        logger.info("set environment variable VLLM_PLUGINS to control"
+                    " which plugins to load.")
+    plugins = {}
+    for plugin in discovered_plugins:
+        if allowed_plugins is None or plugin.name in allowed_plugins:
+            try:
+                func = plugin.load()
+                plugins[plugin.name] = func
+                logger.info("plugin %s loaded.", plugin.name)
+            except Exception:
+                logger.exception("Failed to load plugin %s", plugin.name)
+    return plugins
+
+
 def load_general_plugins():
     """WARNING: plugins can be loaded for multiple times in different
     processes. They should be designed in a way that they can be loaded
@@ -26,6 +59,9 @@ def load_general_plugins():
     os.environ['TORCHINDUCTOR_COMPILE_THREADS'] = '1'
     # see https://github.com/vllm-project/vllm/issues/10619
     torch._inductor.config.compile_threads = 1
+
+    from vllm.platforms import current_platform
+
     if current_platform.is_xpu():
         # see https://github.com/pytorch/pytorch/blob/8cada5cbe5450e17c26fb8b358116785324537b2/torch/_dynamo/config.py#L158  # noqa
         os.environ['TORCH_COMPILE_DISABLE'] = 'True'
@@ -47,33 +83,7 @@ def load_general_plugins():
     if plugins_loaded:
         return
     plugins_loaded = True
-    import sys
-    if sys.version_info < (3, 10):
-        from importlib_metadata import entry_points
-    else:
-        from importlib.metadata import entry_points
-
-    allowed_plugins = envs.VLLM_PLUGINS
-
-    discovered_plugins = entry_points(group='vllm.general_plugins')
-    if len(discovered_plugins) == 0:
-        logger.debug("No plugins found.")
-        return
-    logger.info("Available plugins:")
-    for plugin in discovered_plugins:
-        logger.info("name=%s, value=%s, group=%s", plugin.name, plugin.value,
-                    plugin.group)
-    if allowed_plugins is None:
-        logger.info("all available plugins will be loaded.")
-        logger.info("set environment variable VLLM_PLUGINS to control"
-                    " which plugins to load.")
-    else:
-        logger.info("plugins to load: %s", allowed_plugins)
-    for plugin in discovered_plugins:
-        if allowed_plugins is None or plugin.name in allowed_plugins:
-            try:
-                func = plugin.load()
-                func()
-                logger.info("plugin %s loaded.", plugin.name)
-            except Exception:
-                logger.exception("Failed to load plugin %s", plugin.name)
+    plugins = load_plugins_by_group(group='vllm.general_plugins')
+    # general plugins, we only need to execute the loaded functions
+    for func in plugins.values():
+        func()
diff --git a/vllm/spec_decode/metrics.py b/vllm/spec_decode/metrics.py
index 03dc46600d8a9..d678f4578499b 100644
--- a/vllm/spec_decode/metrics.py
+++ b/vllm/spec_decode/metrics.py
@@ -6,7 +6,6 @@
 
 from vllm.model_executor.layers.spec_decode_base_sampler import (
     SpecDecodeBaseSampler)
-from vllm.platforms import current_platform
 from vllm.utils import is_pin_memory_available
 
 
@@ -94,6 +93,7 @@ def init_tensors(self,
     def maybe_collect_rejsample_metrics(
             self, k: int) -> Optional[SpecDecodeWorkerMetrics]:
         # currently using cuda.Event, skip for any non_cuda_alike platform
+        from vllm.platforms import current_platform
         if not current_platform.is_cuda_alike():
             return None
 
diff --git a/vllm/usage/usage_lib.py b/vllm/usage/usage_lib.py
index 9ae46ff43a916..a9deee881f41a 100644
--- a/vllm/usage/usage_lib.py
+++ b/vllm/usage/usage_lib.py
@@ -17,7 +17,6 @@
 
 import vllm.envs as envs
 from vllm.connections import global_http_connection
-from vllm.platforms import current_platform
 from vllm.version import __version__ as VLLM_VERSION
 
 _config_home = envs.VLLM_CONFIG_ROOT
@@ -152,6 +151,7 @@ def _report_usage_once(self, model_architecture: str,
                            usage_context: UsageContext,
                            extra_kvs: Dict[str, Any]) -> None:
         # Platform information
+        from vllm.platforms import current_platform
         if current_platform.is_cuda_alike():
             device_property = torch.cuda.get_device_properties(0)
             self.gpu_count = torch.cuda.device_count()
diff --git a/vllm/utils.py b/vllm/utils.py
index 2b46c1fef0d09..8ef07d2c326a3 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -50,7 +50,6 @@
 
 import vllm.envs as envs
 from vllm.logger import enable_trace_function_call, init_logger
-from vllm.platforms import current_platform
 
 if TYPE_CHECKING:
     from vllm.config import VllmConfig
@@ -609,6 +608,7 @@ def create_kv_caches_with_random_flash(
     seed: int = 0,
     device: Optional[str] = "cuda",
 ) -> Tuple[List[torch.Tensor], List[torch.Tensor]]:
+    from vllm.platforms import current_platform
     current_platform.seed_everything(seed)
 
     torch_dtype = get_kv_cache_torch_dtype(cache_dtype, model_dtype)
@@ -650,7 +650,7 @@ def create_kv_caches_with_random(
         raise ValueError(
             f"Does not support key cache of type fp8 with head_size {head_size}"
         )
-
+    from vllm.platforms import current_platform
     current_platform.seed_everything(seed)
 
     torch_dtype = get_kv_cache_torch_dtype(cache_dtype, model_dtype)
@@ -703,6 +703,7 @@ def print_warning_once(msg: str) -> None:
 
 @lru_cache(maxsize=None)
 def is_pin_memory_available() -> bool:
+    from vllm.platforms import current_platform
     return current_platform.is_pin_memory_available()
 
 
@@ -713,6 +714,7 @@ def __init__(self, device: Optional[torch.types.Device] = None):
 
     def current_memory_usage(self) -> float:
         # Return the memory usage in bytes.
+        from vllm.platforms import current_platform
         if current_platform.is_cuda_alike():
             torch.cuda.reset_peak_memory_stats(self.device)
             mem = torch.cuda.max_memory_allocated(self.device)
@@ -1066,6 +1068,7 @@ def _cuda_device_count_stateless(
     import torch.cuda
     import torch.version
 
+    from vllm.platforms import current_platform
     if not torch.cuda._is_compiled():
         return 0
     if current_platform.is_rocm():
@@ -1673,6 +1676,7 @@ def direct_register_custom_op(
         return
 
     if not supports_custom_op():
+        from vllm.platforms import current_platform
         assert not current_platform.is_cuda_alike(), (
             "cuda platform needs torch>=2.4 to support custom op, "
             "chances are you are using an old version of pytorch "
diff --git a/vllm/worker/model_runner_base.py b/vllm/worker/model_runner_base.py
index cd4770202a186..c7abad7e0258d 100644
--- a/vllm/worker/model_runner_base.py
+++ b/vllm/worker/model_runner_base.py
@@ -12,7 +12,6 @@
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.model_executor.layers.sampler import SamplerOutput
-from vllm.platforms import current_platform
 from vllm.sequence import IntermediateTensors, SequenceGroupMetadata
 
 if TYPE_CHECKING:
@@ -265,13 +264,13 @@ def prepare_model_input(
         """
         raise NotImplementedError
 
-    @current_platform.inference_mode()
     def execute_model(
         self,
         model_input: T,
         kv_caches: Optional[List[torch.Tensor]],
-        intermediate_tensors: Optional[IntermediateTensors],
+        intermediate_tensors: Optional[IntermediateTensors] = None,
         num_steps: int = 1,
+        **kwargs,
     ) -> Optional[List[SamplerOutput]]:
         """
         Execute the model on the given input.
diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py
index 65d9bab0e2822..dee63a75c0605 100644
--- a/vllm/worker/multi_step_model_runner.py
+++ b/vllm/worker/multi_step_model_runner.py
@@ -544,6 +544,7 @@ def execute_model(
         model_input.record_step_event(current_stream)
 
         if get_pp_group().is_last_rank and self.is_driver_worker:
+            assert isinstance(output, list)
             assert len(
                 output
             ) == 1, "MultiStepModelRunner requires single-step base_models"
diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py
index 3ac7fb8dfb766..249b3ed2dfd37 100644
--- a/vllm/worker/worker_base.py
+++ b/vllm/worker/worker_base.py
@@ -11,7 +11,6 @@
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.model_executor.layers.sampler import SamplerOutput
-from vllm.platforms import current_platform
 from vllm.sequence import ExecuteModelRequest, IntermediateTensors
 from vllm.utils import (enable_trace_function_call_for_thread,
                         resolve_obj_by_qualname, update_environment_variables)
@@ -44,6 +43,8 @@ def __init__(
         self.prompt_adapter_config = vllm_config.prompt_adapter_config
         self.observability_config = vllm_config.observability_config
         self.kv_transfer_config = vllm_config.kv_transfer_config
+        from vllm.platforms import current_platform
+        self.current_platform = current_platform
 
     @abstractmethod
     def init_device(self) -> None:
@@ -74,17 +75,17 @@ def initialize_cache(self, num_gpu_blocks: int,
         """
         raise NotImplementedError
 
-    @current_platform.inference_mode()
     def start_worker_execution_loop(self) -> None:
         """Execute model loop in parallel worker.
 
         You can stop the loop by executing a driver worker with an empty output.
         See `stop_remote_worker_execution_loop` for more details.
         """
-        while True:
-            output = self.execute_model(execute_model_req=None)
-            if output is None:
-                return None
+        with self.current_platform.inference_mode():
+            while True:
+                output = self.execute_model(execute_model_req=None)
+                if output is None:
+                    return None
 
     @abstractmethod
     def execute_model(
@@ -352,6 +353,7 @@ def execute_model(
         model_execute_time = time.perf_counter() - start_time
         if not get_pp_group().is_last_rank:
             # output is IntermediateTensors
+            assert isinstance(output, IntermediateTensors)
             if (self.observability_config is not None
                     and self.observability_config.collect_model_execute_time):
                 output.tensors["model_execute_time"] = torch.tensor(

From 8d9b6721e7f5b7d191951c6f1cd12710ffd08093 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Mon, 30 Dec 2024 23:01:35 +0800
Subject: [PATCH 030/462] [VLM] Abstract out multi-modal data parsing in merged
 processor (#11620)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 .buildkite/test-pipeline.yaml             |   4 +-
 vllm/model_executor/models/chatglm.py     |   4 +-
 vllm/model_executor/models/llava.py       |  18 +-
 vllm/model_executor/models/phi3v.py       |  19 +-
 vllm/model_executor/models/qwen2_audio.py |  22 +-
 vllm/model_executor/models/qwen2_vl.py    | 153 +++++-----
 vllm/model_executor/models/ultravox.py    |  22 +-
 vllm/multimodal/__init__.py               |   9 +-
 vllm/multimodal/audio.py                  |   4 +-
 vllm/multimodal/base.py                   |   8 +-
 vllm/multimodal/image.py                  |   4 +-
 vllm/multimodal/inputs.py                 | 195 ++++--------
 vllm/multimodal/parse.py                  | 344 ++++++++++++++++++++++
 vllm/multimodal/processing.py             |  62 ++--
 vllm/multimodal/video.py                  |   4 +-
 15 files changed, 560 insertions(+), 312 deletions(-)
 create mode 100644 vllm/multimodal/parse.py

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index bee968b4d2e43..c6f8316412e2f 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -356,7 +356,7 @@ steps:
     - pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'
     - pytest -v -s models/embedding/language -m 'not core_model'
 
-- label: Multi-Modal Models Test (Standard) # 28min
+- label: Multi-Modal Models Test (Standard) # 40min
   #mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/
@@ -372,7 +372,7 @@ steps:
     - pytest -v -s models/encoder_decoder/language -m core_model
     - pytest -v -s models/encoder_decoder/vision_language -m core_model
 
-- label: Multi-Modal Models Test (Extended) 1 # 1h16m
+- label: Multi-Modal Models Test (Extended) 1 # 48m
   optional: true
   source_file_dependencies:
   - vllm/
diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py
index 6c50882d83c3b..ffd6891b25965 100644
--- a/vllm/model_executor/models/chatglm.py
+++ b/vllm/model_executor/models/chatglm.py
@@ -33,7 +33,7 @@
 from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (MultiModalData, MultiModalKwargs,
+from vllm.multimodal.inputs import (ModalityData, MultiModalKwargs,
                                     NestedTensors)
 from vllm.multimodal.utils import cached_get_tokenizer
 from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors,
@@ -54,7 +54,7 @@ def calculate_image_placeholder(vision_config):
 
 def mm_input_mapper_for_glmv(
     ctx: InputContext,
-    data: MultiModalData[object],
+    data: ModalityData[object],
 ) -> Dict:
     model_config = ctx.model_config
     tokenizer = cached_get_tokenizer(
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index 0ecba5a1cae0f..1d6ee2a0be72e 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -20,11 +20,13 @@
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalDataItems,
-                                    MultiModalFieldConfig, MultiModalInputsV2,
-                                    MultiModalKwargs, NestedTensors)
+from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
+                                    MultiModalInputsV2, MultiModalKwargs,
+                                    NestedTensors)
+from vllm.multimodal.parse import ImageProcessorItems
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        ProcessorInputs, PromptReplacement,
+                                        MultiModalDataItems, ProcessorInputs,
+                                        PromptReplacement,
                                         full_groupby_modality)
 from vllm.sequence import IntermediateTensors
 
@@ -179,7 +181,9 @@ def _get_prompt_replacements(
             assert isinstance(vision_config, PixtralVisionConfig)
 
             def get_replacement_pixtral(item_idx: int):
-                image_size = mm_items.get_image_size(item_idx)
+                images = mm_items.get_items("image", ImageProcessorItems)
+                image_size = images.get_image_size(item_idx)
+
                 (
                     num_width_tokens,
                     num_height_tokens,
@@ -591,8 +595,8 @@ def apply(
 
         result = super().apply(prompt_text, mm_data, hf_processor_mm_kwargs)
 
-        mm_items = self._get_mm_items(mm_data)
-        mm_item_counts = mm_items.get_item_counts()
+        mm_items = self._to_mm_items(mm_data)
+        mm_item_counts = mm_items.get_all_counts()
         mm_kwargs = result["mm_kwargs"]
 
         # We reimplement the functionality of MLlavaProcessor from
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index fefa9fd62d1d0..15362db6cdfbf 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -32,12 +32,13 @@
 from vllm.model_executor.models.clip import CLIPVisionModel
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalDataItems,
-                                    MultiModalFieldConfig, MultiModalInputsV2,
-                                    MultiModalKwargs, NestedTensors,
-                                    PlaceholderRange)
+from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
+                                    MultiModalInputsV2, MultiModalKwargs,
+                                    NestedTensors, PlaceholderRange)
+from vllm.multimodal.parse import ImageProcessorItems
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        ProcessorInputs, PromptReplacement,
+                                        MultiModalDataItems, ProcessorInputs,
+                                        PromptReplacement,
                                         _BoundPromptReplacement,
                                         _PlaceholderInfo)
 from vllm.sequence import IntermediateTensors
@@ -381,7 +382,9 @@ def _get_prompt_replacements(
         assert isinstance(bos_token_id, int)
 
         def get_replacement_phi3v(item_idx: int):
-            image_size = mm_items.get_image_size(item_idx)
+            images = mm_items.get_items("image", ImageProcessorItems)
+            image_size = images.get_image_size(item_idx)
+
             num_tokens = image_processor.calc_num_image_tokens_from_image_size(
                 width=image_size.width,
                 height=image_size.height,
@@ -389,12 +392,14 @@ def get_replacement_phi3v(item_idx: int):
 
             return [_IMAGE_TOKEN_ID] * num_tokens + [bos_token_id]
 
+        num_images = mm_items.get_count("image", strict=False)
+
         return [
             PromptReplacement(
                 modality="image",
                 target=image_token,
                 replacement=get_replacement_phi3v,
-            ) for image_token in image_tokens[:len(mm_items.images)]
+            ) for image_token in image_tokens[:num_images]
         ]
 
     def _apply_prompt_replacements(
diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py
index 25a351bd9c656..e3d43b017f894 100644
--- a/vllm/model_executor/models/qwen2_audio.py
+++ b/vllm/model_executor/models/qwen2_audio.py
@@ -20,8 +20,8 @@
 # limitations under the License.
 """Inference-only Qwen2-Audio model compatible with HuggingFace weights."""
 from functools import cached_property
-from typing import (Any, Iterable, List, Mapping, Optional, Set, Tuple,
-                    TypedDict, Union)
+from typing import (Iterable, List, Mapping, Optional, Set, Tuple, TypedDict,
+                    Union)
 
 import numpy as np
 import torch
@@ -38,10 +38,12 @@
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (MultiModalDataItems, MultiModalFieldConfig,
-                                    MultiModalKwargs, NestedTensors)
+from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
+                                    NestedTensors)
+from vllm.multimodal.parse import MultiModalDataParser
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        ProcessorInputs, PromptReplacement)
+                                        MultiModalDataItems, ProcessorInputs,
+                                        PromptReplacement)
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsMultiModal, SupportsPP
@@ -99,15 +101,9 @@ def _get_hf_processor(
     def _get_feature_extractor(self) -> WhisperFeatureExtractor:
         return self._get_hf_processor().feature_extractor  # type: ignore
 
-    def _get_hf_mm_data(
-        self,
-        mm_items: MultiModalDataItems,
-    ) -> tuple[dict[str, Any], dict[str, Any]]:
-        # resample audio to the model's sampling rate
+    def _get_data_parser(self) -> MultiModalDataParser:
         feature_extractor = self._get_feature_extractor()
-        mm_items.resample_audios(feature_extractor.sampling_rate)
-
-        return super()._get_hf_mm_data(mm_items)
+        return MultiModalDataParser(target_sr=feature_extractor.sampling_rate)
 
     def _call_hf_processor(
         self,
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index 574845ef5a525..6181fe3dd13d8 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -25,7 +25,6 @@
 from typing import (Any, Callable, Iterable, List, Literal, Mapping, Optional,
                     Set, Tuple, Type, TypedDict, Union)
 
-import numpy as np
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -55,15 +54,16 @@
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalDataItems,
+from vllm.multimodal.inputs import (ImageItem, ModalityData,
                                     MultiModalFieldConfig, MultiModalKwargs,
-                                    NestedTensors)
+                                    NestedTensors, VideoItem)
+from vllm.multimodal.parse import ModalityDataItems, MultiModalDataParser
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        ProcessorInputs, PromptReplacement)
+                                        MultiModalDataItems, ProcessorInputs,
+                                        PromptReplacement)
 from vllm.platforms import _Backend
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.config import uses_mrope
-from vllm.utils import is_list_of
 
 from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP
 from .utils import (AutoWeightsLoader, WeightsMapper, get_vit_attn_backend,
@@ -719,61 +719,81 @@ def get_max_qwen2_vl_mm_tokens(ctx: InputContext,
                                         data_type_key="video")
 
 
-class Qwen2VLMultiModalDataItems(MultiModalDataItems):
+class Qwen2EmbeddingItems(ModalityDataItems[dict[str, torch.Tensor],
+                                            dict[str, torch.Tensor]]):
 
-    @staticmethod
-    def from_dict(data: MultiModalDataDict) -> "MultiModalDataItems":
-        """
-        Normalize :class:`MultiModalDataDict` to :class:`MultiModalDataItems`.
-        """
-        multi_data = Qwen2VLMultiModalDataItems()
-
-        for k, v in data.items():
-            # TODO: Make a separate modality for embedding inputs
-            # to avoid confusion
-            # yapf: disable
-            if k == "video":
-                # Special case since even a single item can be a list
-                multi_data[k] = (  # type: ignore[index]
-                    v if (
-                        isinstance(v, (dict, torch.Tensor))  # type: ignore[assignment]
-                        or is_list_of(v, list)
-                        or isinstance(v[0], (np.ndarray, torch.Tensor))
-                           and v[0].ndim == 4
-                    ) else [v]
-                )
-            elif k in ("image", "audio"):
-                multi_data[k] = (  # type: ignore[index]
-                    v if isinstance(v, (dict, torch.Tensor, list)) else [v]
-                )
-            else:
-                multi_data[k] = v if isinstance(v, list) else [v]  # type: ignore[index]
-            # yapf: enable
+    def __init__(self, data: dict, modality: str) -> None:
+        super().__init__(data)
 
-        return multi_data
+        self.modality = modality
 
-    def get_item_counts(self) -> Mapping[str, int]:
-        return {
-            m: (
-                len(items[f"{m}_grid_thw"])  # type: ignore
-                if isinstance(items, dict) else len(items))
-            for m, items in self.items()
-        }
+        grid_thw = data[f"{modality}_grid_thw"]
+        slice_idxs = [0] + grid_thw.prod(-1).cumsum_(0).tolist()
+        self._slices = [
+            slice(slice_idxs[i], slice_idxs[i + 1])
+            for i in range(len(grid_thw))
+        ]
 
-    def has_embedding_inputs(self) -> bool:
-        return any(
-            isinstance(items, dict) or any(
-                isinstance(item, torch.Tensor) for item in items)
-            for items in self.values())
+    def __repr__(self) -> str:
+        return (f"{type(self).__name__}(modality={self.modality!r})")
 
+    def get_count(self) -> int:
+        return len(self.data[f"{self.modality}_grid_thw"])
 
-class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor):
+    def get(self, index: int) -> dict[str, torch.Tensor]:
+        out = {}
+        for k, v in self.data.items():
+            if v != f"{self.modality}_grid_thw":
+                v = v[self._slices[index]]
+
+            out[k] = v
+
+        return out
+
+    def get_processor_data(self) -> Mapping[str, object]:
+        return {}
+
+    def get_passthrough_data(self) -> Mapping[str, object]:
+        return self.data
+
+
+class Qwen2ImageEmbeddingItems(Qwen2EmbeddingItems):
+
+    def __init__(self, data: dict) -> None:
+        super().__init__(data, "image")
+
+
+class Qwen2VideoEmbeddingItems(Qwen2EmbeddingItems):
 
-    def _get_mm_items(
+    def __init__(self, data: dict) -> None:
+        super().__init__(data, "video")
+
+
+class Qwen2MultiModalDataParser(MultiModalDataParser):
+
+    def _parse_image_data(
+        self,
+        data: Union[dict[str, torch.Tensor], ModalityData[ImageItem]],
+    ) -> ModalityDataItems[Any, Any]:
+        if isinstance(data, dict):
+            return Qwen2EmbeddingItems(data, modality="image")
+
+        return super()._parse_image_data(data)
+
+    def _parse_video_data(
         self,
-        mm_data: MultiModalDataDict,
-    ) -> MultiModalDataItems:
-        return Qwen2VLMultiModalDataItems.from_dict(mm_data)
+        data: Union[dict[str, torch.Tensor], ModalityData[VideoItem]],
+    ) -> ModalityDataItems[Any, Any]:
+        if isinstance(data, dict):
+            return Qwen2EmbeddingItems(data, modality="video")
+
+        return super()._parse_video_data(data)
+
+
+class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor):
+
+    def _get_data_parser(self) -> MultiModalDataParser:
+        return Qwen2MultiModalDataParser()
 
     def _get_hf_processor(
         self,
@@ -796,35 +816,6 @@ def _get_hf_processor(
 
         return hf_processor
 
-    def _get_hf_mm_data(
-        self,
-        mm_items: MultiModalDataItems,
-    ) -> tuple[dict[str, Any], dict[str, Any]]:
-        processor_data = dict[str, Any]()
-        passthrough_data = dict[str, Any]()
-
-        for k, v in mm_items.items():
-            # TODO: Make a separate modality for embedding inputs
-            # to avoid confusion
-            if k in ("image", "video", "audio"):
-                if isinstance(v, dict):
-                    # Pass through embedding inputs (dict)
-                    passthrough_data.update(v)
-                elif isinstance(v, torch.Tensor) and v.ndim == 3:
-                    # Pass through embedding inputs (single)
-                    passthrough_data[f"{k}_embeds"] = [v]
-                elif (is_list_of(v, torch.Tensor) and len(v) > 0
-                      and v[0].ndim == 2):
-                    # Pass through embedding inputs (multi)
-                    passthrough_data[f"{k}_embeds"] = v
-                elif len(v) > 0:
-                    # Map keys to plural form, e.g.: image -> images
-                    processor_data[f"{k}s"] = v
-            else:
-                processor_data[k] = v
-
-        return processor_data, passthrough_data
-
     def _get_prompt_replacements(
         self,
         mm_items: MultiModalDataItems,
diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py
index 7b4aeeec5f403..7e853e5b90096 100644
--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -3,8 +3,8 @@
 
 import math
 from functools import cached_property, lru_cache
-from typing import (Any, Iterable, List, Literal, Mapping, Optional, Set,
-                    Tuple, TypedDict, Union)
+from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple,
+                    TypedDict, Union)
 
 import numpy as np
 import torch
@@ -24,10 +24,12 @@
 from vllm.model_executor.model_loader.loader import DefaultModelLoader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (MultiModalDataItems, MultiModalFieldConfig,
-                                    MultiModalKwargs, NestedTensors)
+from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
+                                    NestedTensors)
+from vllm.multimodal.parse import MultiModalDataParser
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        ProcessorInputs, PromptReplacement)
+                                        MultiModalDataItems, ProcessorInputs,
+                                        PromptReplacement)
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.configs.ultravox import UltravoxConfig
 from vllm.utils import is_list_of
@@ -85,15 +87,9 @@ def _get_feature_extractor(self) -> WhisperFeatureExtractor:
         hf_processor = self._get_hf_processor()
         return hf_processor.audio_processor.feature_extractor  # type: ignore
 
-    def _get_hf_mm_data(
-        self,
-        mm_items: MultiModalDataItems,
-    ) -> tuple[dict[str, Any], dict[str, Any]]:
-        # resample audio to the model's sampling rate
+    def _get_data_parser(self) -> MultiModalDataParser:
         feature_extractor = self._get_feature_extractor()
-        mm_items.resample_audios(feature_extractor.sampling_rate)
-
-        return super()._get_hf_mm_data(mm_items)
+        return MultiModalDataParser(target_sr=feature_extractor.sampling_rate)
 
     def _call_hf_processor(
         self,
diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py
index 9255e062e4870..e58bbe81717a0 100644
--- a/vllm/multimodal/__init__.py
+++ b/vllm/multimodal/__init__.py
@@ -1,8 +1,7 @@
 from .base import MultiModalPlaceholderMap, MultiModalPlugin
-from .inputs import (BatchedTensorInputs, MultiModalData,
-                     MultiModalDataBuiltins, MultiModalDataDict,
-                     MultiModalKwargs, MultiModalPlaceholderDict,
-                     NestedTensors)
+from .inputs import (BatchedTensorInputs, ModalityData, MultiModalDataBuiltins,
+                     MultiModalDataDict, MultiModalKwargs,
+                     MultiModalPlaceholderDict, NestedTensors)
 from .registry import MultiModalRegistry
 
 MULTIMODAL_REGISTRY = MultiModalRegistry()
@@ -16,7 +15,7 @@
 
 __all__ = [
     "BatchedTensorInputs",
-    "MultiModalData",
+    "ModalityData",
     "MultiModalDataBuiltins",
     "MultiModalDataDict",
     "MultiModalKwargs",
diff --git a/vllm/multimodal/audio.py b/vllm/multimodal/audio.py
index 3e09ef1fcbb56..de80f22bac2a3 100644
--- a/vllm/multimodal/audio.py
+++ b/vllm/multimodal/audio.py
@@ -9,7 +9,7 @@
 from vllm.utils import PlaceholderModule
 
 from .base import MediaIO, MultiModalPlugin
-from .inputs import AudioItem, MultiModalData, MultiModalKwargs
+from .inputs import AudioItem, ModalityData, MultiModalKwargs
 
 try:
     import librosa
@@ -31,7 +31,7 @@ def get_data_key(self) -> str:
     def _default_input_mapper(
         self,
         ctx: InputContext,
-        data: MultiModalData[AudioItem],
+        data: ModalityData[AudioItem],
         **mm_processor_kwargs,
     ) -> MultiModalKwargs:
         raise NotImplementedError("There is no default audio input mapper")
diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py
index cdda6f8052794..7f4029e726332 100644
--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
@@ -15,12 +15,12 @@
     from vllm.config import ModelConfig
     from vllm.sequence import SequenceGroupMetadata
 
-from .inputs import (MultiModalData, MultiModalDataDict, MultiModalKwargs,
+from .inputs import (ModalityData, MultiModalDataDict, MultiModalKwargs,
                      PlaceholderRange)
 
 logger = init_logger(__name__)
 
-MultiModalInputMapper = Callable[[InputContext, MultiModalData[object]],
+MultiModalInputMapper = Callable[[InputContext, ModalityData[object]],
                                  MultiModalKwargs]
 """
 Return a dictionary to be passed as keyword arguments to
@@ -69,7 +69,7 @@ def get_data_key(self) -> str:
     def _default_input_mapper(
         self,
         ctx: InputContext,
-        data: MultiModalData[Any],
+        data: ModalityData[Any],
         **mm_processor_kwargs,
     ) -> MultiModalKwargs:
         """
@@ -118,7 +118,7 @@ def wrapper(model_cls: N) -> N:
     def map_input(
         self,
         model_config: "ModelConfig",
-        data: MultiModalData[Any],
+        data: ModalityData[Any],
         mm_processor_kwargs: Optional[dict[str, Any]],
     ) -> MultiModalKwargs:
         """
diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py
index 14c79dfadec0c..da13a381c4530 100644
--- a/vllm/multimodal/image.py
+++ b/vllm/multimodal/image.py
@@ -13,7 +13,7 @@
 from vllm.utils import is_list_of
 
 from .base import MediaIO, MultiModalPlugin
-from .inputs import ImageItem, MultiModalData, MultiModalKwargs
+from .inputs import ImageItem, ModalityData, MultiModalKwargs
 
 if TYPE_CHECKING:
     from vllm.config import ModelConfig
@@ -44,7 +44,7 @@ def _get_hf_image_processor(
     def _default_input_mapper(
         self,
         ctx: InputContext,
-        data: MultiModalData[ImageItem],
+        data: ModalityData[ImageItem],
         **mm_processor_kwargs,
     ) -> MultiModalKwargs:
         model_config = ctx.model_config
diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py
index 1fbda6e0b8750..db489af7ac475 100644
--- a/vllm/multimodal/inputs.py
+++ b/vllm/multimodal/inputs.py
@@ -2,53 +2,74 @@
 from collections import UserDict, defaultdict
 from collections.abc import Mapping, Sequence
 from dataclasses import dataclass
-from typing import (Any, Literal, NamedTuple, TypedDict, TypeVar, Union, cast,
-                    final)
+from typing import Any, Literal, TypedDict, TypeVar, Union, cast, final
 
 import numpy as np
 import torch
 import torch.types
 from PIL.Image import Image
 from transformers import BatchFeature
-from typing_extensions import NotRequired, TypeAlias, assert_never
+from typing_extensions import NotRequired, TypeAlias
 
 from vllm.utils import JSONTree, is_list_of, json_map_leaves
 
 _T = TypeVar("_T")
 
-# yapf: disable
-ImageItem: TypeAlias = Union[Image, np.ndarray, torch.Tensor]
+HfImageItem: TypeAlias = Union[Image, np.ndarray, torch.Tensor]
 """
 A :class:`transformers.image_utils.ImageInput` representing a single image
 item, which can be passed to a HuggingFace :code:`ImageProcessor`.
 """
 
-VideoItem: TypeAlias = Union[
-    list[Image],
-    np.ndarray,
-    torch.Tensor,
-    list[np.ndarray],
-    list[torch.Tensor],
-]
+HfVideoItem: TypeAlias = Union[list[Image], np.ndarray, torch.Tensor,
+                               list[np.ndarray], list[torch.Tensor]]
 """
 A :class:`transformers.image_utils.VideoInput` representing a single video
 item, which can be passed to a HuggingFace :code:`VideoProcessor`.
 """
 
-AudioItem: TypeAlias = Union[
-    np.ndarray,
-    list[float],
-    # `(audio, sampling_rate)`: If the audio's sampling rate is different
-    # from that expected by the model, we need to resample it.
-    tuple[np.ndarray, float],
-]
+HfAudioItem: TypeAlias = Union[list[float], np.ndarray, torch.Tensor]
 """
 Represents a single audio
 item, which can be passed to a HuggingFace :code:`AudioProcessor`.
 """
-# yapf: enable
 
-MultiModalData: TypeAlias = Union[_T, list[_T]]
+ImageItem: TypeAlias = Union[HfImageItem, torch.Tensor]
+"""
+A :class:`transformers.image_utils.ImageInput` representing a single image
+item, which can be passed to a HuggingFace :code:`ImageProcessor`.
+
+Alternatively, a 3-D tensor or batch of 2-D tensors,
+which are treated as image embeddings;
+these are directly passed to the model without HF processing.
+"""
+
+VideoItem: TypeAlias = Union[HfVideoItem, torch.Tensor]
+"""
+A :class:`transformers.image_utils.VideoInput` representing a single video
+item, which can be passed to a HuggingFace :code:`VideoProcessor`.
+
+Alternatively, a 3-D tensor or batch of 2-D tensors,
+which are treated as video embeddings;
+these are directly passed to the model without HF processing.
+"""
+
+AudioItem: TypeAlias = Union[HfAudioItem, tuple[np.ndarray, float],
+                             torch.Tensor]
+"""
+Represents a single audio
+item, which can be passed to a HuggingFace :code:`AudioProcessor`.
+
+Alternatively, a tuple `(audio, sampling_rate)`, where the sampling rate
+is different from that expected by the model;
+these are resampled to the model's sampling rate before being processed by HF.
+
+Alternatively, a 3-D tensor or batch of 2-D tensors,
+which are treated as audio embeddings;
+these are directly passed to the model without HF processing.
+"""
+
+ModalityData: TypeAlias = Union[_T, list[_T]]
 """
 Either a single data item, or a list of data items.
 
@@ -61,17 +82,17 @@
 class MultiModalDataBuiltins(TypedDict, total=False):
     """Type annotations for modality types predefined by vLLM."""
 
-    image: MultiModalData[ImageItem]
+    image: ModalityData[ImageItem]
     """The input image(s)."""
 
-    video: MultiModalData[VideoItem]
+    video: ModalityData[VideoItem]
     """The input video(s)."""
 
-    audio: MultiModalData[AudioItem]
+    audio: ModalityData[AudioItem]
     """The input audio(s)."""
 
 
-MultiModalDataDict: TypeAlias = Mapping[str, MultiModalData[Any]]
+MultiModalDataDict: TypeAlias = Mapping[str, ModalityData[Any]]
 """
 A dictionary containing an entry for each modality type to input.
 
@@ -83,123 +104,6 @@ class MultiModalDataBuiltins(TypedDict, total=False):
 """
 
 
-class ImageSize(NamedTuple):
-    width: int
-    height: int
-
-
-class MultiModalDataItems(UserDict[str, list[Any]]):
-    """
-    As :class:`MultiModalDataDict`, but normalized such that each entry
-    corresponds to a list.
-    """
-
-    @staticmethod
-    def from_dict(data: MultiModalDataDict) -> "MultiModalDataItems":
-        """
-        Normalize :class:`MultiModalDataDict` to :class:`MultiModalDataItems`.
-        """
-        multi_data = MultiModalDataItems()
-
-        for k, v in data.items():
-            # TODO: Make a separate modality for embedding inputs
-            # to avoid confusion
-            # yapf: disable
-            if k == "video":
-                # Special case since even a single item can be a list
-                multi_data[k] = (  # type: ignore[index]
-                    v if (
-                        isinstance(v, torch.Tensor)
-                        or is_list_of(v, list)
-                        or isinstance(v[0], (np.ndarray, torch.Tensor))
-                           and v[0].ndim == 4
-                    ) else [v]
-                )
-            elif k in ("image", "audio"):
-                multi_data[k] = (  # type: ignore[index]
-                    v if isinstance(v, (torch.Tensor, list)) else [v]
-                )
-            else:
-                multi_data[k] = v if isinstance(v, list) else [v]  # type: ignore[index]
-            # yapf: enable
-
-        return multi_data
-
-    # NOTE: When a field (e.g. `images`) doesn't exist, directly appending to
-    # `self.images` doesn't update this dictionary, which may be confusing
-    # We annotate the getter methods as `Sequence` to prevent others from
-    # trying to update the list in this way
-    @property
-    def images(self) -> Sequence[ImageItem]:
-        return self.get("image", [])
-
-    @property
-    def videos(self) -> Sequence[VideoItem]:
-        return self.get("video", [])
-
-    @property
-    def audios(self) -> Sequence[AudioItem]:
-        return self.get("audio", [])
-
-    def get_item_counts(self) -> Mapping[str, int]:
-        return {m: len(items) for m, items in self.items()}
-
-    def has_embedding_inputs(self) -> bool:
-        return any(
-            any(isinstance(item, torch.Tensor) for item in items)
-            for items in self.values())
-
-    def get_image_size(self, item_idx: int) -> ImageSize:
-        image = self.images[item_idx]
-
-        if isinstance(image, Image):
-            return ImageSize(*image.size)
-        if isinstance(image, (np.ndarray, torch.Tensor)):
-            _, h, w = image.shape
-            return ImageSize(w, h)
-
-        assert_never(image)
-
-    def get_audio_with_sr(
-        self,
-        item_idx: int,
-        *,
-        default_sr: float,
-    ) -> tuple[np.ndarray, float]:
-        audio = self.audios[item_idx]
-
-        if isinstance(audio, tuple):
-            return audio
-        if isinstance(audio, list):
-            return np.array(audio), default_sr
-        if isinstance(audio, np.ndarray):
-            return audio, default_sr
-
-        assert_never(audio)
-
-    def resample_audios(self, new_sr: float, *, drop_sr: bool = True) -> None:
-        """
-        If :code:`drop_sr=True`, the audio items in this dictionary are updated
-        to be NumPy arrays which implicitly means that their sampling rate is
-        the same as the model's expected sampling rate; otherwise, they remain
-        as :code:`(audio, new_sr)` tuples.
-        """
-        # Avoid circular import
-        from .audio import resample_audio
-
-        if not self.audios:
-            return
-
-        new_audios = []
-        for item_idx in range(len(self.audios)):
-            audio, sr = self.get_audio_with_sr(item_idx, default_sr=new_sr)
-            audio = resample_audio(audio, orig_sr=sr, target_sr=new_sr)
-
-            new_audios.append(audio if drop_sr else (audio, new_sr))
-
-        self["audio"] = new_audios
-
-
 class PlaceholderRange(TypedDict):
     """
     Placeholder location information for multi-modal data.
@@ -436,7 +340,7 @@ def from_items_by_key(
     ) -> "MultiModalKwargs":
         data = {
             key: items[0].field.reduce(items).data
-            for key, items in items_by_key.items()
+            for key, items in items_by_key.items() if len(items) > 0
         }
 
         return MultiModalKwargs(data,
@@ -567,6 +471,11 @@ def get_items_by_modality(
         Get the keyword arguments corresponding to an item identified by
         its modality and index.
         """
+        if modality not in self._keys_by_modality:
+            available_modalities = set(self._keys_by_modality.keys())
+            raise KeyError(f"Modality {modality!r} not found. "
+                           f"Available modalities: {available_modalities}")
+
         keys_to_gather = self._keys_by_modality[modality]
 
         return {
diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py
new file mode 100644
index 0000000000000..17a795247372e
--- /dev/null
+++ b/vllm/multimodal/parse.py
@@ -0,0 +1,344 @@
+from abc import ABC, abstractmethod
+from collections import UserDict
+from collections.abc import Callable, Iterator, Mapping, Sequence
+from typing import TYPE_CHECKING, Any, Generic, NamedTuple, Optional, TypeVar
+
+import numpy as np
+import torch
+from PIL.Image import Image
+from typing_extensions import TypeAlias, TypeGuard, assert_never
+
+from vllm.utils import is_list_of
+
+from .audio import resample_audio
+from .inputs import (AudioItem, HfAudioItem, HfImageItem, HfVideoItem,
+                     ImageItem, ModalityData, MultiModalDataDict,
+                     NestedTensors, VideoItem)
+
+_T = TypeVar("_T")
+_I = TypeVar("_I")
+
+
+class ModalityDataItems(ABC, Generic[_T, _I]):
+
+    def __init__(self, data: _T) -> None:
+        super().__init__()
+
+        self.data = data
+
+    def __len__(self) -> int:
+        return self.get_count()
+
+    def __getitem__(self, index: int) -> _I:
+        return self.get(index)
+
+    if TYPE_CHECKING:
+        # Auto-generated
+        def __iter__(self) -> Iterator[_I]:
+            ...
+
+    @abstractmethod
+    def get_count(self) -> int:
+        """Get the number of data items."""
+        raise NotImplementedError
+
+    @abstractmethod
+    def get(self, index: int) -> _I:
+        """Get a data item by its index."""
+        raise NotImplementedError
+
+    def get_all(self) -> list[_I]:
+        """Get all data items."""
+        return [self.get(idx) for idx in range(self.get_count())]
+
+    @abstractmethod
+    def get_processor_data(self) -> Mapping[str, object]:
+        """Get the data to pass to the HF processor."""
+        raise NotImplementedError
+
+    @abstractmethod
+    def get_passthrough_data(self) -> Mapping[str, object]:
+        """Get the data to pass directly to the model."""
+        raise NotImplementedError
+
+
+class ProcessorBatchItems(ModalityDataItems[Sequence[_T], _T]):
+
+    def __init__(self, data: Sequence[_T], modality: str) -> None:
+        super().__init__(data)
+
+        self.modality = modality
+
+    def __repr__(self) -> str:
+        return (f"{type(self).__name__}(modality={self.modality!r})")
+
+    def get_count(self) -> int:
+        return len(self.data)
+
+    def get(self, index: int) -> _T:
+        return self.data[index]
+
+    def get_processor_data(self) -> Mapping[str, object]:
+        return {f"{self.modality}s": self.data}
+
+    def get_passthrough_data(self) -> Mapping[str, object]:
+        return {}
+
+
+class EmbeddingItems(ModalityDataItems[NestedTensors, torch.Tensor]):
+
+    def __init__(self, data: NestedTensors, modality: str) -> None:
+        super().__init__(data)
+
+        self.modality = modality
+
+    def __repr__(self) -> str:
+        return (f"{type(self).__name__}(modality={self.modality!r})")
+
+    def get_count(self) -> int:
+        return len(self.data)
+
+    def get(self, index: int) -> object:
+        return self.data[index]
+
+    def get_processor_data(self) -> Mapping[str, object]:
+        return {}
+
+    def get_passthrough_data(self) -> Mapping[str, object]:
+        return {f"{self.modality}_embeds": self.data}
+
+
+class AudioProcessorItems(ProcessorBatchItems[HfAudioItem]):
+
+    def __init__(self, data: Sequence[HfAudioItem]) -> None:
+        super().__init__(data, "audio")
+
+
+class AudioEmbeddingItems(EmbeddingItems):
+
+    def __init__(self, data: NestedTensors) -> None:
+        super().__init__(data, "audio")
+
+
+class ImageSize(NamedTuple):
+    width: int
+    height: int
+
+
+class ImageProcessorItems(ProcessorBatchItems[HfImageItem]):
+
+    def __init__(self, data: Sequence[HfImageItem]) -> None:
+        super().__init__(data, "image")
+
+    def get_image_size(self, item_idx: int) -> ImageSize:
+        image = self.get(item_idx)
+
+        if isinstance(image, Image):
+            return ImageSize(*image.size)
+        if isinstance(image, (np.ndarray, torch.Tensor)):
+            _, h, w = image.shape
+            return ImageSize(w, h)
+
+        assert_never(image)
+
+
+class ImageEmbeddingItems(EmbeddingItems):
+
+    def __init__(self, data: NestedTensors) -> None:
+        super().__init__(data, "image")
+
+
+class VideoProcessorItems(ProcessorBatchItems[HfVideoItem]):
+
+    def __init__(self, data: Sequence[HfVideoItem]) -> None:
+        super().__init__(data, "video")
+
+
+class VideoEmbeddingItems(EmbeddingItems):
+
+    def __init__(self, data: NestedTensors) -> None:
+        super().__init__(data, "video")
+
+
+_D = TypeVar("_D", bound=ModalityDataItems[Any, Any])
+
+
+class MultiModalDataItems(UserDict[str, ModalityDataItems[Any, Any]]):
+    """
+    As :class:`MultiModalDataDict`, but normalized such that each entry
+    corresponds to a list.
+    """
+
+    def get_count(self, modality: str, *, strict: bool = True) -> int:
+        """
+        Get the number of data items belonging to a modality.
+        
+        If `strict=False`, return `0` instead of raising :exc:`KeyError`
+        even if the modality is not found.
+        """
+        if modality not in self:
+            if strict:
+                available_modalities = set(self.keys())
+                raise KeyError(f"Modality {modality!r} not found. "
+                               f"Available modalities: {available_modalities}")
+
+            return 0
+
+        return self[modality].get_count()
+
+    def get_all_counts(self) -> Mapping[str, int]:
+        """Get the number of items belonging to each modality."""
+        return {m: items.get_count() for m, items in self.items()}
+
+    def get_items(
+        self,
+        modality: str,
+        typ: type[_D],
+    ) -> _D:
+        """
+        Get the data items belonging to a modality,
+        requiring that they belong to a certain type.
+        """
+        if modality not in self:
+            available_modalities = set(self.keys())
+            raise KeyError(f"Modality {modality!r} not found. "
+                           f"Available modalities: {available_modalities}")
+
+        items = self[modality]
+        if not isinstance(items, typ):
+            raise TypeError(f"Invalid type of data items for {modality=}. "
+                            f"Expected type: {typ}, but "
+                            f"found type: {type(items)}")
+
+        return items
+
+
+ModalityDataParser: TypeAlias = Callable[[ModalityData[Any]],
+                                         ModalityDataItems[Any, Any]]
+
+
+class MultiModalDataParser:
+    """
+    Parses :class:`MultiModalDataDict` into :class:`MultiModalDataItems`.
+    """
+
+    def __init__(self, *, target_sr: Optional[float] = None) -> None:
+        super().__init__()
+
+        self.target_sr = target_sr
+
+    def _is_embeddings(self, data: object) -> TypeGuard[NestedTensors]:
+        if isinstance(data, torch.Tensor):
+            return data.ndim == 3
+        if is_list_of(data, torch.Tensor):
+            return len(data) == 0 or data[0].ndim == 2
+
+        return False
+
+    def _get_audio_with_sr(
+        self,
+        audio: AudioItem,
+    ) -> tuple[np.ndarray, Optional[float]]:
+        if isinstance(audio, tuple):
+            return audio
+        if isinstance(audio, list):
+            return np.array(audio), None
+        if isinstance(audio, np.ndarray):
+            return audio, None
+        if isinstance(audio, torch.Tensor):
+            return audio.numpy(), None
+
+        assert_never(audio)
+
+    def _parse_audio_data(
+        self,
+        data: ModalityData[AudioItem],
+    ) -> ModalityDataItems[Any, Any]:
+        if self._is_embeddings(data):
+            return AudioEmbeddingItems(data)
+
+        if (is_list_of(data, float)
+                or isinstance(data,
+                              (np.ndarray, torch.Tensor)) and data.ndim == 1
+                or isinstance(data, tuple)):
+            data_items = [data]
+        elif isinstance(data, (np.ndarray, torch.Tensor)):
+            data_items = [elem for elem in data]
+        else:
+            data_items = data
+
+        new_audios = list[np.ndarray]()
+        for data_item in data_items:
+            audio, orig_sr = self._get_audio_with_sr(data_item)
+            if orig_sr is None:
+                new_audio = audio
+            else:
+                target_sr = self.target_sr
+                if target_sr is None:
+                    raise RuntimeError(
+                        "Audio resampling is not supported when "
+                        "`target_sr` is not provided")
+
+                new_audio = resample_audio(audio,
+                                           orig_sr=orig_sr,
+                                           target_sr=target_sr)
+
+            new_audios.append(new_audio)
+
+        return AudioProcessorItems(new_audios)
+
+    def _parse_image_data(
+        self,
+        data: ModalityData[ImageItem],
+    ) -> ModalityDataItems[Any, Any]:
+        if self._is_embeddings(data):
+            return ImageEmbeddingItems(data)
+
+        if (isinstance(data, Image)
+                or isinstance(data,
+                              (np.ndarray, torch.Tensor)) and data.ndim == 3):
+            data_items = [data]
+        elif isinstance(data, (np.ndarray, torch.Tensor)):
+            data_items = [elem for elem in data]
+        else:
+            data_items = data
+
+        return ImageProcessorItems(data_items)
+
+    def _parse_video_data(
+        self,
+        data: ModalityData[VideoItem],
+    ) -> ModalityDataItems[Any, Any]:
+        if self._is_embeddings(data):
+            return VideoEmbeddingItems(data)
+
+        if (is_list_of(data, Image)
+                or isinstance(data,
+                              (np.ndarray, torch.Tensor)) and data.ndim == 4):
+            data_items = [data]
+        elif isinstance(data, (np.ndarray, torch.Tensor)):
+            data_items = [elem for elem in data]
+        else:
+            data_items = data
+
+        return VideoProcessorItems(data_items)
+
+    def _get_subparsers(self) -> Mapping[str, ModalityDataParser]:
+        return {
+            "audio": self._parse_audio_data,
+            "image": self._parse_image_data,
+            "video": self._parse_video_data,
+        }
+
+    def parse_mm_data(self,
+                      mm_data: MultiModalDataDict) -> MultiModalDataItems:
+        subparsers = self._get_subparsers()
+
+        mm_items = MultiModalDataItems()
+        for k, v in mm_data.items():
+            if k not in subparsers:
+                raise ValueError(f"Unsupported modality: {k}")
+
+            mm_items[k] = subparsers[k](v)
+
+        return mm_items
diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index 3ece0762e3228..180489166b407 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -15,11 +15,12 @@
 from vllm.inputs import DummyData, InputProcessingContext
 from vllm.logger import init_logger
 from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
-from vllm.utils import LRUCache, flatten_2d_lists, full_groupby, is_list_of
+from vllm.utils import LRUCache, flatten_2d_lists, full_groupby
 
-from .inputs import (MultiModalDataDict, MultiModalDataItems,
-                     MultiModalFieldConfig, MultiModalFieldItem,
-                     MultiModalInputsV2, MultiModalKwargs, PlaceholderRange)
+from .inputs import (MultiModalDataDict, MultiModalFieldConfig,
+                     MultiModalFieldItem, MultiModalInputsV2, MultiModalKwargs,
+                     PlaceholderRange)
+from .parse import MultiModalDataItems, MultiModalDataParser
 
 logger = init_logger(__name__)
 
@@ -621,6 +622,16 @@ def __call__(
     ) -> MultiModalInputsV2:
         return self.apply(prompt, mm_data, hf_processor_mm_kwargs)
 
+    def _get_data_parser(self) -> MultiModalDataParser:
+        """
+        Construct a data parser to preprocess multi-modal data items
+        before passing them to :meth:`_get_hf_mm_data`.
+
+        You can support additional modalities by creating a subclass
+        of :class:`MultiModalDataParser` that has additional subparsers.
+        """
+        return MultiModalDataParser()
+
     def _get_hf_processor(self) -> ProcessorMixin:
         """
         Subclasses can add keyword arguments to this method to accept
@@ -631,11 +642,16 @@ def _get_hf_processor(self) -> ProcessorMixin:
     def _get_tokenizer(self) -> AnyTokenizer:
         return self.ctx.tokenizer
 
-    def _get_mm_items(
+    def _to_mm_items(
         self,
         mm_data: MultiModalDataDict,
     ) -> MultiModalDataItems:
-        return MultiModalDataItems.from_dict(mm_data)
+        """
+        Normalize :class:`MultiModalDataDict` to :class:`MultiModalDataItems`
+        before passing them to :meth:`_get_hf_mm_data`.
+        """
+        parser = self._get_data_parser()
+        return parser.parse_mm_data(mm_data)
 
     @abstractmethod
     def _get_mm_fields_config(
@@ -680,22 +696,9 @@ def _get_hf_mm_data(
         processor_data = dict[str, Any]()
         passthrough_data = dict[str, Any]()
 
-        for k, v in mm_items.items():
-            # TODO: Make a separate modality for embedding inputs
-            # to avoid confusion
-            if k in ("image", "video", "audio"):
-                if isinstance(v, torch.Tensor) and v.ndim == 3:
-                    # Pass through embedding inputs (single)
-                    passthrough_data[f"{k}_embeds"] = [v]
-                elif (is_list_of(v, torch.Tensor) and len(v) > 0
-                      and v[0].ndim == 2):
-                    # Pass through embedding inputs (multi)
-                    passthrough_data[f"{k}_embeds"] = v
-                elif len(v) > 0:
-                    # Map keys to plural form, e.g.: image -> images
-                    processor_data[f"{k}s"] = v
-            else:
-                processor_data[k] = v
+        for items in mm_items.values():
+            processor_data.update(items.get_processor_data())
+            passthrough_data.update(items.get_passthrough_data())
 
         return processor_data, passthrough_data
 
@@ -756,7 +759,7 @@ def _apply_hf_processor_missing(
         cached items; instead, we rely on our own prompt replacement logic
         for the full text.
         """
-        mm_missing_counts = mm_missing_data_items.get_item_counts()
+        mm_missing_counts = mm_missing_data_items.get_all_counts()
 
         prompt_ids, _ = self._apply_hf_processor(
             prompt_text=prompt_text,
@@ -789,7 +792,8 @@ def _cached_apply_hf_processor(
         cache = self.cache
         model_id = self.ctx.model_config.model
 
-        if cache is None or mm_data_items.has_embedding_inputs():
+        _, passthrough_data = self._get_hf_mm_data(mm_data_items)
+        if cache is None or passthrough_data:
             return self._apply_hf_processor(
                 prompt_text=prompt_text,
                 mm_items=mm_data_items,
@@ -812,7 +816,7 @@ def _cached_apply_hf_processor(
             modality: [mm_data_items[modality][idx] for idx in idxs]
             for modality, idxs in mm_missing_idxs.items()
         }
-        mm_missing_data_items = self._get_mm_items(mm_missing_data)
+        mm_missing_data_items = self._to_mm_items(mm_missing_data)
 
         prompt_ids, mm_missing_kwargs = self._apply_hf_processor_missing(
             prompt_text=prompt_text,
@@ -852,7 +856,7 @@ def _cached_apply_hf_processor(
             mm_merged_field_items[modality] = merged_modal_items_lst
 
         if self.enable_sanity_checks:
-            mm_missing_counts = mm_missing_data_items.get_item_counts()
+            mm_missing_counts = mm_missing_data_items.get_all_counts()
             assert all(
                 item_count == mm_missing_counts[modality]
                 for modality, item_count in mm_missing_next_idx.items()), dict(
@@ -865,7 +869,7 @@ def _cached_apply_hf_processor(
         )
 
         if self.enable_sanity_checks:
-            mm_item_counts = mm_data_items.get_item_counts()
+            mm_item_counts = mm_data_items.get_all_counts()
 
             for modality, item_count in mm_item_counts.items():
                 for item_idx in range(item_count):
@@ -958,7 +962,7 @@ def apply(
         3. Extract information about the placeholder tokens from the
            processed token IDs.
         """
-        mm_items = self._get_mm_items(mm_data)
+        mm_items = self._to_mm_items(mm_data)
 
         prompt_ids, mm_kwargs = self._cached_apply_hf_processor(
             prompt_text,
@@ -975,7 +979,7 @@ def apply(
 
         # If HF processor already inserts placeholder tokens,
         # there is no need for us to insert them
-        mm_item_counts = mm_items.get_item_counts()
+        mm_item_counts = mm_items.get_all_counts()
         all_placeholders = self._find_placeholders(prompt_repls, prompt_ids,
                                                    mm_item_counts)
 
diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py
index b7d43c830cc46..1ad1f5abc27a2 100644
--- a/vllm/multimodal/video.py
+++ b/vllm/multimodal/video.py
@@ -15,7 +15,7 @@
 from vllm.transformers_utils.tokenizer import get_tokenizer
 from vllm.utils import PlaceholderModule, is_list_of
 
-from .base import MediaIO, MultiModalData
+from .base import MediaIO, ModalityData
 from .image import ImageMediaIO, ImagePlugin
 from .inputs import MultiModalKwargs, VideoItem
 
@@ -54,7 +54,7 @@ def _get_hf_video_processor(
     def _default_input_mapper(
         self,
         ctx: InputContext,
-        data: MultiModalData[VideoItem],
+        data: ModalityData[VideoItem],
         **mm_processor_kwargs,
     ) -> MultiModalKwargs:
         model_config = ctx.model_config

From 5886aa496e8fa31c9180bcfc8e89faaa8899907d Mon Sep 17 00:00:00 2001
From: Robert Shaw
 <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
Date: Mon, 30 Dec 2024 10:51:02 -0500
Subject: [PATCH 031/462] [V1] [6/N] API Server: Better Shutdown (#11586)

---
 vllm/entrypoints/openai/api_server.py | 44 ++++++++-------------------
 vllm/v1/engine/async_llm.py           | 25 +++++++++++++--
 vllm/v1/engine/core_client.py         | 16 ++++------
 3 files changed, 40 insertions(+), 45 deletions(-)

diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 094cc15a317e9..bac72d87376da 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -68,7 +68,7 @@
 from vllm.logger import init_logger
 from vllm.usage.usage_lib import UsageContext
 from vllm.utils import (FlexibleArgumentParser, get_open_zmq_ipc_path,
-                        is_valid_ipv6_address, kill_process_tree, set_ulimit)
+                        is_valid_ipv6_address, set_ulimit)
 from vllm.version import __version__ as VLLM_VERSION
 
 TIMEOUT_KEEP_ALIVE = 5  # seconds
@@ -133,32 +133,21 @@ async def build_async_engine_client_from_engine_args(
     Returns the Client or None if the creation failed.
     """
 
-    # Fall back
-    # TODO: fill out feature matrix.
+    # AsyncLLMEngine.
     if (MQLLMEngineClient.is_unsupported_config(engine_args)
             or envs.VLLM_USE_V1 or disable_frontend_multiprocessing):
-        engine_config = engine_args.create_engine_config(
-            UsageContext.OPENAI_API_SERVER)
-        uses_ray = getattr(AsyncLLMEngine._get_executor_cls(engine_config),
-                           "uses_ray", False)
-
-        build_engine = partial(AsyncLLMEngine.from_engine_args,
-                               engine_args=engine_args,
-                               engine_config=engine_config,
-                               usage_context=UsageContext.OPENAI_API_SERVER)
-        if uses_ray:
-            # Must run in main thread with ray for its signal handlers to work
-            engine_client = build_engine()
-        else:
-            engine_client = await asyncio.get_running_loop().run_in_executor(
-                None, build_engine)
 
-        yield engine_client
-        if hasattr(engine_client, "shutdown"):
-            engine_client.shutdown()
-        return
+        engine_client: Optional[EngineClient] = None
+        try:
+            engine_client = AsyncLLMEngine.from_engine_args(
+                engine_args=engine_args,
+                usage_context=UsageContext.OPENAI_API_SERVER)
+            yield engine_client
+        finally:
+            if engine_client and hasattr(engine_client, "shutdown"):
+                engine_client.shutdown()
 
-    # Otherwise, use the multiprocessing AsyncLLMEngine.
+    # MQLLMEngine.
     else:
         if "PROMETHEUS_MULTIPROC_DIR" not in os.environ:
             # Make TemporaryDirectory for prometheus multiprocessing
@@ -737,15 +726,6 @@ def signal_handler(*_) -> None:
 
     signal.signal(signal.SIGTERM, signal_handler)
 
-    # The child processes will send SIGQUIT to this process when
-    # any error happens. This process then clean up the whole tree.
-    # TODO(rob): move this into AsyncLLM.__init__ once we remove
-    # the context manager below.
-    def sigquit_handler(signum, frame):
-        kill_process_tree(os.getpid())
-
-    signal.signal(signal.SIGQUIT, sigquit_handler)
-
     async with build_async_engine_client(args) as engine_client:
         app = build_app(args)
 
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index 213ddaa023dbc..3f097ca7f439c 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -1,4 +1,6 @@
 import asyncio
+import os
+import signal
 from typing import AsyncGenerator, Dict, List, Mapping, Optional, Type, Union
 
 from vllm.config import ModelConfig, VllmConfig
@@ -16,6 +18,7 @@
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
 from vllm.usage.usage_lib import UsageContext
+from vllm.utils import kill_process_tree
 from vllm.v1.engine.core_client import EngineCoreClient
 from vllm.v1.engine.detokenizer import Detokenizer
 from vllm.v1.engine.processor import Processor
@@ -38,6 +41,22 @@ def __init__(
         log_requests: bool = True,
         start_engine_loop: bool = True,
     ) -> None:
+
+        # The child processes will send SIGQUIT when unrecoverable
+        # errors happen. We kill the process tree here so that the
+        # stack trace is very evident.
+        # TODO: rather than killing the main process, we should
+        # figure out how to raise an AsyncEngineDeadError and
+        # handle at the API server level so we can return a better
+        # error code to the clients calling VLLM.
+        def sigquit_handler(signum, frame):
+            logger.fatal(
+                "AsyncLLM got SIGQUIT from worker processes, shutting "
+                "down. See stack trace above for root cause issue.")
+            kill_process_tree(os.getpid())
+
+        signal.signal(signal.SIGQUIT, sigquit_handler)
+
         assert start_engine_loop
 
         self.log_requests = log_requests
@@ -276,9 +295,9 @@ async def _run_output_handler(self):
                 # 4) Abort any requests that finished due to stop strings.
                 await self.engine_core.abort_requests_async(reqs_to_abort)
 
-        except BaseException as e:
-            logger.error(e)
-            raise e
+        except Exception as e:
+            logger.exception("EngineCore output handler hit an error: %s", e)
+            kill_process_tree(os.getpid())
 
     async def abort(self, request_id: str) -> None:
         """Abort RequestId in self, detokenizer, and engine core."""
diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py
index beb5d57c20c83..3293205e110af 100644
--- a/vllm/v1/engine/core_client.py
+++ b/vllm/v1/engine/core_client.py
@@ -6,7 +6,7 @@
 
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
-from vllm.utils import get_open_zmq_ipc_path
+from vllm.utils import get_open_zmq_ipc_path, make_zmq_socket
 from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs,
                             EngineCoreProfile, EngineCoreRequest,
                             EngineCoreRequestType, EngineCoreRequestUnion)
@@ -144,17 +144,13 @@ def __init__(
         else:
             self.ctx = zmq.Context()  # type: ignore[attr-defined]
 
-        # Path for IPC.
+        # Paths and sockets for IPC.
         output_path = get_open_zmq_ipc_path()
         input_path = get_open_zmq_ipc_path()
-
-        # Get output (EngineCoreOutput) from EngineCore.
-        self.output_socket = self.ctx.socket(zmq.constants.PULL)
-        self.output_socket.connect(output_path)
-
-        # Send input (EngineCoreRequest) to EngineCore.
-        self.input_socket = self.ctx.socket(zmq.constants.PUSH)
-        self.input_socket.bind(input_path)
+        self.output_socket = make_zmq_socket(self.ctx, output_path,
+                                             zmq.constants.PULL)
+        self.input_socket = make_zmq_socket(self.ctx, input_path,
+                                            zmq.constants.PUSH)
 
         # Start EngineCore in background process.
         self.proc_handle: Optional[BackgroundProcHandle]

From 36e76700453924c8d421db99af70a88a1df835cd Mon Sep 17 00:00:00 2001
From: whyiug <whyiug@hotmail.com>
Date: Tue, 31 Dec 2024 02:51:04 +0800
Subject: [PATCH 032/462] [Bugfix] Validate and concatenate image embeddings in
 MiniCPMVBaseModel (#11631)

---
 vllm/model_executor/models/minicpmv.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py
index 1e8f9bd4cf418..712022502539b 100644
--- a/vllm/model_executor/models/minicpmv.py
+++ b/vllm/model_executor/models/minicpmv.py
@@ -487,6 +487,12 @@ def _parse_and_validate_inputs(
         image_embeds = kwargs.pop("image_embeds", None)
 
         if image_embeds is not None:
+            if not isinstance(image_embeds, (torch.Tensor, list)):
+                raise ValueError(f"Incorrect type of image embeds. "
+                                 f"Got type: {type(image_embeds)}")
+            if isinstance(image_embeds, list):
+                image_embeds = torch.concat(image_embeds)
+
             return MiniCPMVImageEmbeddingInputs(
                 image_bounds=self._get_image_bounds(input_ids, im_start_id,
                                                     im_end_id, slice_start_id,

From ccb1aabccaa7aaf07b08fd8be30380e828efba0f Mon Sep 17 00:00:00 2001
From: "Kevin H. Luu" <kevin@anyscale.com>
Date: Mon, 30 Dec 2024 12:27:07 -0800
Subject: [PATCH 033/462] [benchmark] Remove dependency for H100 benchmark step
 (#11572)

---
 .buildkite/nightly-benchmarks/benchmark-pipeline.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
index 708e548727cf5..868b8e95db01d 100644
--- a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
@@ -73,7 +73,7 @@ steps:
     # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
     agents:
       queue: H100
-    depends_on: block-h100
+    depends_on: ~
     plugins:
     - docker#v5.12.0:
         image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT

From a2a40bcd0d8275e19c46e9cc06ee994d8839b98d Mon Sep 17 00:00:00 2001
From: Matthias Vogler <60004995+ayylemao@users.noreply.github.com>
Date: Tue, 31 Dec 2024 02:33:06 +0100
Subject: [PATCH 034/462] [Model][LoRA]LoRA support added for MolmoForCausalLM
 (#11439)

Signed-off-by: Matthias Vogler <matthias.vogler@joesecurity.org>
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Matthias Vogler <matthias.vogler@joesecurity.org>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
---
 docs/source/models/supported_models.md |  2 +-
 vllm/model_executor/models/molmo.py    | 45 ++++++++++++++++++++++++--
 2 files changed, 43 insertions(+), 4 deletions(-)

diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md
index 518505abeb2a9..613343281464c 100644
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -666,7 +666,7 @@ See [this page](#generative-models) for more information on how to use generativ
   - Molmo
   - T + I
   - `allenai/Molmo-7B-D-0924`, `allenai/Molmo-72B-0924`, etc.
-  -
+  - ✅︎
   - ✅︎
   - ✅︎
 * - `NVLM_D_Model`
diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py
index 5d52d2c3e6b48..cc25be9f5b6a9 100644
--- a/vllm/model_executor/models/molmo.py
+++ b/vllm/model_executor/models/molmo.py
@@ -36,6 +36,7 @@
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
 from vllm.multimodal.inputs import NestedTensors, PlaceholderRange
 from vllm.multimodal.utils import cached_get_tokenizer
@@ -43,7 +44,7 @@
                            SequenceData)
 from vllm.transformers_utils.processor import get_processor
 
-from .interfaces import SupportsMultiModal, SupportsPP
+from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP
 from .utils import (AutoWeightsLoader, WeightsMapper, is_pp_missing_parameter,
                     make_empty_intermediate_tensors_factory, make_layers,
                     maybe_prefix, merge_multimodal_embeddings)
@@ -1161,8 +1162,8 @@ def input_processor_for_molmo(ctx: InputContext, inputs: DecoderOnlyInputs):
 @MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_molmo_image_tokens)
 @INPUT_REGISTRY.register_dummy_data(dummy_data_for_molmo)
 @INPUT_REGISTRY.register_input_processor(input_processor_for_molmo)
-class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
-
+class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP,
+                       SupportsLoRA):
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_substr={
             # vision backbone mapping
@@ -1191,6 +1192,32 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
         },
     )
 
+    packed_modules_mapping = {
+        "qkv_proj": ["qkv_proj"],
+        "gate_up_proj": ["gate_up_proj"],  # language model
+        "merged_linear": ["gate_proj", "up_proj"]  # image_projector
+    }
+
+    # LoRA specific attributes
+    supported_lora_modules = [
+        # language model
+        "qkv_proj",
+        "o_proj",
+        "gate_up_proj",
+        "down_proj",  # same name with image_projector
+        # vision tower
+        "wq",
+        "wk",
+        "wv",
+        "wo",
+        "w1",
+        "w2",
+        # image_projector
+        "merged_linear",
+    ]
+    embedding_modules = {}
+    embedding_padding_modules = []
+
     # BitandBytes specific attributes
     bitsandbytes_stacked_params_mapping = {
         "gate_proj": ("merged_linear", 0),
@@ -1202,8 +1229,10 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
         multimodal_config = vllm_config.model_config.multimodal_config
+        lora_config = vllm_config.lora_config
         self.config = config
         self.multimodal_config = multimodal_config
+        self.lora_config = lora_config
 
         vision_config = VisionBackboneConfig()
         self.vision_backbone = MolmoVisionBackbone(config, vision_config,
@@ -1377,6 +1406,16 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         weights = _get_weights_with_merged_embedding(weights)
         return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
 
+    def get_mm_mapping(self) -> MultiModelKeys:
+        """
+        Get the module prefix in multimodal models
+        """
+        return MultiModelKeys.from_string_field(
+            language_model="model",
+            connector="vision_backbone.image_projector",
+            tower_model="vision_backbone",
+        )
+
 
 def _get_weights_with_merged_embedding(
     weights: Iterable[Tuple[str, torch.Tensor]]

From 74fa1d123c2818065d862d2ceb2338468914fa79 Mon Sep 17 00:00:00 2001
From: Michael Goin <michael@neuralmagic.com>
Date: Mon, 30 Dec 2024 22:43:54 -0500
Subject: [PATCH 035/462] [Bugfix] Fix OpenAI parallel sampling when using
 xgrammar (#11637)

Signed-off-by: mgoin <michael@neuralmagic.com>
---
 tests/entrypoints/openai/test_completion.py        | 14 ++++++--------
 .../guided_decoding/xgrammar_decoding.py           |  5 +++++
 vllm/sampling_params.py                            |  9 +++++----
 vllm/sequence.py                                   |  2 +-
 4 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/tests/entrypoints/openai/test_completion.py b/tests/entrypoints/openai/test_completion.py
index c81cfdbbe5cff..183d900c493e5 100644
--- a/tests/entrypoints/openai/test_completion.py
+++ b/tests/entrypoints/openai/test_completion.py
@@ -28,6 +28,8 @@
 # need to change to match the prompt adapter
 PA_NUM_VIRTUAL_TOKENS = 8
 
+GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"]
+
 
 @pytest.fixture(scope="module")
 def zephyr_lora_files():
@@ -635,8 +637,7 @@ async def test_allowed_token_ids(client: openai.AsyncOpenAI):
 
 
 @pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend",
-                         ["outlines", "lm-format-enforcer"])
+@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
 async def test_guided_json_completion(client: openai.AsyncOpenAI,
                                       guided_decoding_backend: str,
                                       sample_json_schema):
@@ -658,8 +659,7 @@ async def test_guided_json_completion(client: openai.AsyncOpenAI,
 
 
 @pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend",
-                         ["outlines", "lm-format-enforcer"])
+@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
 async def test_guided_regex_completion(client: openai.AsyncOpenAI,
                                        guided_decoding_backend: str,
                                        sample_regex):
@@ -680,8 +680,7 @@ async def test_guided_regex_completion(client: openai.AsyncOpenAI,
 
 
 @pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend",
-                         ["outlines", "lm-format-enforcer"])
+@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
 async def test_guided_choice_completion(client: openai.AsyncOpenAI,
                                         guided_decoding_backend: str,
                                         sample_guided_choice):
@@ -761,8 +760,7 @@ async def test_echo_logprob_completion(client: openai.AsyncOpenAI,
 
 
 @pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend",
-                         ["outlines", "lm-format-enforcer"])
+@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
 async def test_guided_decoding_type_error(client: openai.AsyncOpenAI,
                                           guided_decoding_backend: str,
                                           sample_json_schema, sample_regex):
diff --git a/vllm/model_executor/guided_decoding/xgrammar_decoding.py b/vllm/model_executor/guided_decoding/xgrammar_decoding.py
index 5e1948977bff4..f10a8fb8e03cf 100644
--- a/vllm/model_executor/guided_decoding/xgrammar_decoding.py
+++ b/vllm/model_executor/guided_decoding/xgrammar_decoding.py
@@ -1,6 +1,7 @@
 # noqa: UP007
 from __future__ import annotations
 
+import copy
 import json
 from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Any
@@ -309,3 +310,7 @@ def __call__(self, input_ids: list[int],
             scores = scores.to(device_type).squeeze()
 
         return scores
+
+    def clone(self) -> XGrammarLogitsProcessor:
+        """Deepcopy due to per-sequence state in the matchers"""
+        return copy.deepcopy(self)
diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py
index fc77f3ca529b2..605c09b8d7225 100644
--- a/vllm/sampling_params.py
+++ b/vllm/sampling_params.py
@@ -450,15 +450,16 @@ def all_stop_token_ids(self) -> Set[int]:
         return self._all_stop_token_ids
 
     def clone(self) -> "SamplingParams":
-        """Deep copy excluding LogitsProcessor objects.
+        """Deep copy, but maybe not the LogitsProcessor objects.
 
-        LogitsProcessor objects are excluded because they may contain an
-        arbitrary, nontrivial amount of data.
+        LogitsProcessor objects may contain an arbitrary, nontrivial amount of
+        data that is expensive to copy. However, if not copied, the processor
+        needs to support parallel decoding for multiple sequences
         See https://github.com/vllm-project/vllm/issues/3087
         """
 
         logit_processor_refs = None if self.logits_processors is None else {
-            id(lp): lp
+            id(lp): lp.clone() if hasattr(lp, 'clone') else lp
             for lp in self.logits_processors
         }
         return copy.deepcopy(self, memo=logit_processor_refs)
diff --git a/vllm/sequence.py b/vllm/sequence.py
index 34f910d47b7d9..034f89c0ddbe9 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -1372,7 +1372,7 @@ class ParallelSampleSequenceGroup(SequenceGroupBase):
     @staticmethod
     def add_request(request_id: str, engine, params, **kwargs):
         original_params = params
-        params = copy.deepcopy(original_params)
+        params = original_params.clone()
         params.n = 1
         group = ParallelSampleSequenceGroup(request_id)
         seqs = []

From 82c49d3260f1fb9fcd686736e8439dc69cd2f1c4 Mon Sep 17 00:00:00 2001
From: John Giorgi <johnmgiorgi@gmail.com>
Date: Tue, 31 Dec 2024 01:15:58 -0500
Subject: [PATCH 036/462] [Misc][LoRA] Support Rank Stabilized LoRA (RSLoRA)
 (#6909)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
---
 tests/lora/test_lora_manager.py | 20 +++++++++++++-------
 vllm/lora/lora.py               | 12 +++---------
 vllm/lora/models.py             |  2 +-
 vllm/lora/peft_helper.py        | 18 +++++++++++++-----
 4 files changed, 30 insertions(+), 22 deletions(-)

diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py
index 0b76f466702fc..a099f36b0a465 100644
--- a/tests/lora/test_lora_manager.py
+++ b/tests/lora/test_lora_manager.py
@@ -1,4 +1,5 @@
 import json
+import math
 import os
 from typing import Dict, List
 
@@ -50,6 +51,18 @@ def test_peft_helper(sql_lora_files):
         "embed_tokens",
         "lm_head",
     ]
+    scaling = peft_helper.lora_alpha / peft_helper.r
+    assert abs(peft_helper.vllm_lora_scaling_factor - scaling) < 1e-3
+
+    # test RSLoRA
+    config = dict(r=8,
+                  lora_alpha=16,
+                  target_modules=["gate_proj"],
+                  use_rslora=True)
+    peft_helper = PEFTHelper.from_dict(config)
+
+    scaling = peft_helper.lora_alpha / math.sqrt(peft_helper.r)
+    assert abs(peft_helper.vllm_lora_scaling_factor - scaling) < 1e-3
 
     expected_error = "vLLM only supports modules_to_save being None."
     with pytest.raises(ValueError, match=expected_error):
@@ -60,13 +73,6 @@ def test_peft_helper(sql_lora_files):
             modules_to_save=["lm_head"],
         )
         PEFTHelper.from_dict(config)
-    expected_error = "vLLM does not yet support RSLoRA."
-    with pytest.raises(ValueError, match=expected_error):
-        config = dict(r=8,
-                      lora_alpha=16,
-                      target_modules=["gate_proj"],
-                      use_rslora=True)
-        PEFTHelper.from_dict(config)
 
     expected_error = "vLLM does not yet support DoRA."
     with pytest.raises(ValueError, match=expected_error):
diff --git a/vllm/lora/lora.py b/vllm/lora/lora.py
index dde347b78bf81..93ad4651f4b77 100644
--- a/vllm/lora/lora.py
+++ b/vllm/lora/lora.py
@@ -67,15 +67,9 @@ def from_config(
         peft_helper: PEFTHelper,
         embeddings_tensor: Optional[torch.Tensor] = None,
     ) -> "LoRALayerWeights":
-        return cls(
-            module_name,
-            peft_helper.r,
-            peft_helper.lora_alpha,
-            None,
-            None,
-            None,
-            embeddings_tensor,
-        )
+        return cls(module_name, peft_helper.r, peft_helper.lora_alpha, None,
+                   None, None, embeddings_tensor,
+                   peft_helper.vllm_lora_scaling_factor)
 
     @classmethod
     def create_dummy_lora_weights(
diff --git a/vllm/lora/models.py b/vllm/lora/models.py
index 5c0e4e5cbc636..9cfcc6bba727f 100644
--- a/vllm/lora/models.py
+++ b/vllm/lora/models.py
@@ -173,7 +173,7 @@ def from_lora_tensors(
         return cls(lora_model_id,
                    peft_helper.r,
                    loras,
-                   scaling_factor=peft_helper.vllm_scaling_factor)
+                   scaling_factor=peft_helper.vllm_long_context_scaling_factor)
 
     @classmethod
     def from_local_checkpoint(
diff --git a/vllm/lora/peft_helper.py b/vllm/lora/peft_helper.py
index edf4ba5659575..ddd42ae93d290 100644
--- a/vllm/lora/peft_helper.py
+++ b/vllm/lora/peft_helper.py
@@ -4,6 +4,8 @@
 from dataclasses import MISSING, dataclass, field, fields
 from typing import Literal, Optional, Union
 
+from vllm.utils import print_info_once
+
 
 @dataclass
 class PEFTHelper:
@@ -14,21 +16,22 @@ class PEFTHelper:
 
     bias: Literal["none", "all", "lora_only"] = field(default="none")
     modules_to_save: Optional[list[str]] = field(default=None)
+    # True to use Rank-Stabilized LoRA (rsLoRA, see: https://arxiv.org/abs/2312.03732)
     use_rslora: bool = field(default=False)
+    # True to use Weight-Decomposed Low-Rank Adaptation (DoRA, see: https://arxiv.org/abs/2402.09353)
     use_dora: bool = field(default=False)
-    # long lora field
+    # long context lora field
     context_length: int = field(default=0)
     # Extra vllm field, start with 'vllm_' to avoid conflict
+    vllm_lora_scaling_factor: float = field(default=1.0)
     vllm_max_position_embeddings: Optional[int] = field(default=False)
-    vllm_scaling_factor: Optional[float] = field(default=None)
+    vllm_long_context_scaling_factor: Optional[float] = field(default=None)
 
     def _validate_features(self):
         error_msg = []
 
         if self.modules_to_save:
             error_msg.append("vLLM only supports modules_to_save being None.")
-        if self.use_rslora:
-            error_msg.append("vLLM does not yet support RSLoRA.")
 
         if self.use_dora:
             error_msg.append("vLLM does not yet support DoRA.")
@@ -38,10 +41,15 @@ def _validate_features(self):
 
     def __post_init__(self):
         self._validate_features()
+        if self.use_rslora:
+            print_info_once("Loading LoRA weights trained with rsLoRA.")
+            self.vllm_lora_scaling_factor = self.lora_alpha / math.sqrt(self.r)
+        else:
+            self.vllm_lora_scaling_factor = self.lora_alpha / self.r
         if self.context_length:
             if self.vllm_max_position_embeddings is None:
                 self.vllm_max_position_embeddings = self.context_length
-            self.vllm_scaling_factor = float(
+            self.vllm_long_context_scaling_factor = float(
                 math.ceil(self.context_length /
                           self.vllm_max_position_embeddings))
 

From 2c5718809bb5f4bce2ae8e05041d613215dac1aa Mon Sep 17 00:00:00 2001
From: sakunkun <zhou.qianjun@zte.com.cn>
Date: Tue, 31 Dec 2024 14:29:04 +0800
Subject: [PATCH 037/462] [Bugfix] Move the _touch(computed_blocks) call in the
 allocate_slots method to after the check for allocating new blocks. (#11565)

---
 tests/v1/core/test_prefix_caching.py | 63 +++++++++++++++++++++++++++-
 vllm/v1/core/kv_cache_manager.py     | 19 ++++++---
 2 files changed, 74 insertions(+), 8 deletions(-)

diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py
index ed04f0a373c51..dafaa6aee9995 100644
--- a/tests/v1/core/test_prefix_caching.py
+++ b/tests/v1/core/test_prefix_caching.py
@@ -98,9 +98,9 @@ def test_prefill():
     # Incomplete 1 block (6 tokens)
     unique_token_ids = [3] * 6
     req2 = make_request("2", common_token_ids + unique_token_ids)
-    computed_block = manager.get_computed_blocks(req2)
+    computed_blocks = manager.get_computed_blocks(req2)
     assert len(req2.kv_block_hashes) == 3
-    assert [b.block_id for b in computed_block] == [0, 1, 2]
+    assert [b.block_id for b in computed_blocks] == [0, 1, 2]
     num_new_tokens = 53 - 3 * 16
     blocks = manager.allocate_slots(req2, num_new_tokens, computed_blocks)
     assert [b.block_id for b in blocks] == [7, 8]
@@ -500,3 +500,62 @@ def test_mm_prefix_caching():
                         mm_hashes=mm_hashes)
     computed_blocks = manager.get_computed_blocks(req1)
     assert len(computed_blocks) == 3
+
+
+def test_prefill_not_enough_free_blocks_with_computed_blocks():
+    """
+    This is a unit test that tests the correctness of the allocate_slots
+    when there is not enough free blocks. Specifically, when a request
+    has computed blocks but cannot be allocated due to not enough free blocks,
+    the computed blocks should not be touched.
+    """
+    block_size = 16
+    manager = KVCacheManager(
+        block_size=block_size,
+        num_gpu_blocks=10,
+        max_model_len=8192,
+        sliding_window=None,
+        enable_caching=True,
+        num_preallocate_tokens=0,
+    )
+    # Complete 3 blocks (48 tokens)
+    # | Common-0 | Common-1 | Common-2 | ... |
+    common_token_ids = [i for i in range(3) for _ in range(16)]
+    req0 = make_request("0", common_token_ids)
+    computed_blocks = manager.get_computed_blocks(req0)
+    assert not computed_blocks
+    manager.allocate_slots(req0, 48, computed_blocks)
+    block_part0 = manager.req_to_blocks[req0.request_id]
+
+    # | Common-0 | Common-1 | Common-2 | Req1-3 | Req1-4 | Req1-5 | ... |
+    req1 = make_request("1", common_token_ids * 2)
+    computed_blocks = manager.get_computed_blocks(req1)
+    assert computed_blocks == block_part0
+    manager.allocate_slots(req1, 48, computed_blocks)
+    block_part1 = manager.req_to_blocks[req1.request_id]
+    # | Common-0 | Common-1 | Common-2 | Req1-3 (F) | Req1-4 (F) |
+    # | Req1-5(F)| ... |
+    manager.free(req1)
+    assert {block.ref_cnt for block in block_part1[:3]} == {1}
+    assert {block.ref_cnt for block in block_part1[3:]} == {0}
+
+    # | Common-0 | Common-1 | Common-2 | Req1-3 (F) | Req1-4 (F) |
+    # | Req1-5(F)| Req2-0   | Req2-1   | ... |
+    req2 = make_request("2", [7] * block_size * 2)
+    computed_blocks = manager.get_computed_blocks(req2)
+    assert not computed_blocks
+    manager.allocate_slots(req2, block_size * 2, computed_blocks)
+
+    # Req3 is Req2 + 3 new blocks, so the first 6 blocks are computed,
+    # but it cannot be allocated due to insufficient free blocks (2).
+    # In this case, the ref_cnt of the computed blocks should not be changed.
+    assert manager.free_block_queue.num_free_blocks == 5
+    req3 = make_request("3", common_token_ids * 3)
+    computed_blocks = manager.get_computed_blocks(req3)
+    assert computed_blocks == block_part1
+    # Req3 cannot be allocated.
+    assert manager.allocate_slots(req3, 48, computed_blocks) is None
+    # Block 0-2 are used by Req 1.
+    assert {block.ref_cnt for block in block_part1[:3]} == {1}
+    # Block 3-5 are free.
+    assert {block.ref_cnt for block in block_part1[3:]} == {0}
diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py
index 78efacccfa078..00d0de51634ae 100644
--- a/vllm/v1/core/kv_cache_manager.py
+++ b/vllm/v1/core/kv_cache_manager.py
@@ -191,7 +191,7 @@ def allocate_slots(
             request: The request to allocate slots.
             num_tokens: The number of tokens to allocate. Note that this does
                 not include the tokens that have already been computed.
-            computed_blocks: The blocks that have already been computed.
+            computed_blocks: A list of computed blocks.
 
         Returns:
             A list of new allocated blocks.
@@ -200,6 +200,18 @@ def allocate_slots(
             raise ValueError(
                 f"num_tokens must be greater than 0, got {num_tokens}")
 
+        # If a computed block of a request is an eviction candidate (in the
+        # free queue and ref_cnt == 0), it cannot be counted as a free block
+        # when allocating this request.
+        num_evictable_computed_blocks = sum(1 for blk in computed_blocks
+                                            if blk.ref_cnt == 0)
+
+        num_required_blocks = cdiv(num_tokens, self.block_size)
+        if (num_required_blocks > self.free_block_queue.num_free_blocks -
+                num_evictable_computed_blocks):
+            # Cannot allocate new blocks.
+            return None
+
         # Touch the computed blocks to make sure they won't be evicted.
         if self.enable_caching:
             self._touch(computed_blocks)
@@ -208,11 +220,6 @@ def allocate_slots(
                 "Computed blocks should be empty when "
                 "prefix caching is disabled")
 
-        num_required_blocks = cdiv(num_tokens, self.block_size)
-        if (num_required_blocks > self.free_block_queue.num_free_blocks):
-            # Cannot allocate new blocks.
-            return None
-
         # Determine the number of new blocks to allocate considering
         # preallocated blocks.
         num_new_blocks = min(

From 8c3230d8c1cf114618c2316c54bf06b7d0c198b6 Mon Sep 17 00:00:00 2001
From: Chen Zhang <zhangch99@outlook.com>
Date: Tue, 31 Dec 2024 16:56:01 +0800
Subject: [PATCH 038/462] [V1] Simpify vision block hash for prefix caching by
 removing offset from hash (#11646)

---
 tests/v1/core/test_prefix_caching.py | 8 ++++----
 vllm/v1/core/kv_cache_utils.py       | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py
index dafaa6aee9995..35e3a2f972720 100644
--- a/tests/v1/core/test_prefix_caching.py
+++ b/tests/v1/core/test_prefix_caching.py
@@ -469,9 +469,9 @@ def test_mm_prefix_caching():
     # Completed block should have hashes with extra keys.
     assert not computed_blocks
     assert len(req0.kv_block_hashes) == 3
-    assert req0.kv_block_hashes[0].extra_keys == (("aaa", 0), )
-    assert req0.kv_block_hashes[1].extra_keys == (("aaa", 5), ("bbb", 0))
-    assert req0.kv_block_hashes[2].extra_keys == (("bbb", 2), )
+    assert req0.kv_block_hashes[0].extra_keys == ("aaa", )
+    assert req0.kv_block_hashes[1].extra_keys == ("aaa", "bbb")
+    assert req0.kv_block_hashes[2].extra_keys == ("bbb", )
 
     blocks = manager.allocate_slots(req0, 59, computed_blocks)
     assert [b.block_id for b in blocks] == [0, 1, 2, 3, 4]
@@ -485,7 +485,7 @@ def test_mm_prefix_caching():
 
     # The just completed block should have hashes with extra keys.
     assert len(req0.kv_block_hashes) == 4
-    assert req0.kv_block_hashes[3].extra_keys == (("ccc", 0), )
+    assert req0.kv_block_hashes[3].extra_keys == ("ccc", )
 
     # Cache hit.
     unique_token_ids = [-1] * 7 + [200] * 5
diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py
index 9ddbff7c9a604..84ff48bf428a0 100644
--- a/vllm/v1/core/kv_cache_utils.py
+++ b/vllm/v1/core/kv_cache_utils.py
@@ -218,8 +218,8 @@ def generate_block_hash_extra_keys(
                 continue
 
             # The block contains the current mm input.
-            mm_start = max(0, start_token_idx - offset)
-            extra_keys.append((mm_hashes[curr_mm_idx], mm_start))
+            extra_keys.append(mm_hashes[curr_mm_idx])
+
             if end_token_idx >= offset + length:
                 # If this block contains the end of the current mm input,
                 # move to the next mm input as this block may also contain

From e7c7c5e822a886e3dba202ca1b756c3260efffcc Mon Sep 17 00:00:00 2001
From: Roger Wang <136131678+ywang96@users.noreply.github.com>
Date: Tue, 31 Dec 2024 13:17:22 -0800
Subject: [PATCH 039/462] [V1][VLM] V1 support for selected single-image
 models. (#11632)

Signed-off-by: Roger Wang <ywang@roblox.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: Isotr0py <2037008807@qq.com>
---
 docs/source/models/supported_models.md        |  10 +-
 examples/offline_inference_vision_language.py |  10 +-
 .../vision_language/test_models.py            |   7 +-
 tests/multimodal/test_processing.py           |  29 +-
 vllm/model_executor/models/aria.py            | 169 ++++----
 vllm/model_executor/models/blip.py            |  92 -----
 vllm/model_executor/models/blip2.py           | 172 ++++----
 vllm/model_executor/models/chameleon.py       | 191 ++++-----
 vllm/model_executor/models/fuyu.py            | 381 +++++++++---------
 .../models/idefics2_vision_model.py           |   6 +-
 vllm/model_executor/models/llava.py           |   4 +-
 vllm/model_executor/models/llava_next.py      |   6 +-
 vllm/model_executor/models/pixtral.py         |  12 +-
 vllm/model_executor/models/qwen2_audio.py     |  14 +-
 vllm/model_executor/models/qwen2_vl.py        |  17 +-
 vllm/model_executor/models/ultravox.py        |  13 +-
 vllm/multimodal/processing.py                 |  68 +++-
 vllm/multimodal/utils.py                      |  10 +-
 vllm/v1/worker/gpu_model_runner.py            |  15 +-
 19 files changed, 590 insertions(+), 636 deletions(-)

diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md
index 613343281464c..f74c201bdff6b 100644
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -570,28 +570,28 @@ See [this page](#generative-models) for more information on how to use generativ
   - `rhymes-ai/Aria`
   -
   - ✅︎
-  -
+  - ✅︎
 * - `Blip2ForConditionalGeneration`
   - BLIP-2
   - T + I<sup>E</sup>
   - `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc.
   -
   - ✅︎
-  -
+  - ✅︎
 * - `ChameleonForConditionalGeneration`
   - Chameleon
   - T + I
   - `facebook/chameleon-7b` etc.
   -
   - ✅︎
-  -
+  - ✅︎
 * - `FuyuForCausalLM`
   - Fuyu
   - T + I
   - `adept/fuyu-8b` etc.
   -
   - ✅︎
-  -
+  - ✅︎
 * - `ChatGLMModel`
   - GLM-4V
   - T + I
@@ -633,7 +633,7 @@ See [this page](#generative-models) for more information on how to use generativ
   - `llava-hf/llava-v1.6-mistral-7b-hf`, `llava-hf/llava-v1.6-vicuna-7b-hf`, etc.
   -
   - ✅︎
-  -
+  - ✅︎
 * - `LlavaNextVideoForConditionalGeneration`
   - LLaVA-NeXT-Video
   - T + V
diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py
index 77af914a6ef02..b51bfae455267 100644
--- a/examples/offline_inference_vision_language.py
+++ b/examples/offline_inference_vision_language.py
@@ -24,10 +24,13 @@ def run_aria(question: str, modality: str):
     assert modality == "image"
     model_name = "rhymes-ai/Aria"
 
+    # NOTE: Need L40 (or equivalent) to avoid OOM
     llm = LLM(model=model_name,
               tokenizer_mode="slow",
-              trust_remote_code=True,
               dtype="bfloat16",
+              max_model_len=4096,
+              max_num_seqs=2,
+              trust_remote_code=True,
               disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
 
     prompt = (f"<|im_start|>user\n<fim_prefix><|img|><fim_suffix>\n{question}"
@@ -57,6 +60,7 @@ def run_chameleon(question: str, modality: str):
     prompt = f"{question}<image>"
     llm = LLM(model="facebook/chameleon-7b",
               max_model_len=4096,
+              max_num_seqs=2,
               disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
     stop_token_ids = None
     return llm, prompt, stop_token_ids
@@ -257,7 +261,7 @@ def run_minicpmv(question: str, modality: str):
     # 2.5
     # model_name = "openbmb/MiniCPM-Llama3-V-2_5"
 
-    #2.6
+    # 2.6
     model_name = "openbmb/MiniCPM-V-2_6"
     tokenizer = AutoTokenizer.from_pretrained(model_name,
                                               trust_remote_code=True)
@@ -430,9 +434,11 @@ def run_pixtral_hf(question: str, modality: str):
 
     model_name = "mistral-community/pixtral-12b"
 
+    # NOTE: Need L40 (or equivalent) to avoid OOM
     llm = LLM(
         model=model_name,
         max_model_len=8192,
+        max_num_seqs=2,
         disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
     )
 
diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py
index 1a9c1b4ef1be0..7db08166826eb 100644
--- a/tests/models/decoder_only/vision_language/test_models.py
+++ b/tests/models/decoder_only/vision_language/test_models.py
@@ -140,10 +140,7 @@
     "aria": VLMTestInfo(
         models=["rhymes-ai/Aria"],
         tokenizer_mode="slow",
-        test_type=(
-            VLMTestType.IMAGE,
-            VLMTestType.MULTI_IMAGE,
-        ),
+        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
         dtype="bfloat16",
         prompt_formatter=lambda img_prompt: f"<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n ", # noqa: E501
         img_idx_to_prompt=lambda idx: "<fim_prefix><|img|><fim_suffix>\n",
@@ -179,6 +176,7 @@
         test_type=VLMTestType.IMAGE,
         prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
         max_model_len=4096,
+        max_num_seqs=2,
         auto_cls=AutoModelForVision2Seq,
         postprocess_inputs=model_utils.cast_dtype_post_processor(
             "pixel_values"
@@ -201,7 +199,6 @@
         vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output,
         num_logprobs=10,
         image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
-        marks=[large_gpu_mark(min_gb=48)],
     ),
     "glm4": VLMTestInfo(
         models=["THUDM/glm-4v-9b"],
diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py
index 1b2847ed0f534..81278cde264ff 100644
--- a/tests/multimodal/test_processing.py
+++ b/tests/multimodal/test_processing.py
@@ -528,7 +528,7 @@ def _rand_audio(
 
 def _test_processing_cache_correctness(
     model_id: str,
-    modalities: set[str],
+    modalities: dict[str, bool],
     hit_rate: float,
     num_batches: int,
     simplify_rate: float,
@@ -583,9 +583,8 @@ def _test_processing_cache_correctness(
         partial(_rand_audio, rng, min_len=256, max_len=512, sr=16000),
     }
     input_max_count = {
-        "image": 3,
-        "video": 3,
-        "audio": 3,
+        modality: 3 if supports_multi else 1
+        for modality, supports_multi in modalities.items()
     }
 
     for batch_idx in range(num_batches):
@@ -624,12 +623,16 @@ def _test_processing_cache_correctness(
 
 # yapf: disable
 @pytest.mark.parametrize(("model_id", "modalities"), [
-    ("llava-hf/llava-1.5-7b-hf", {"image"}),
-    ("TIGER-Lab/Mantis-8B-siglip-llama3", {"image"}),
-    ("mistral-community/pixtral-12b", {"image"}),
-    ("Qwen/Qwen2-VL-2B-Instruct", {"image", "video"}),
-    ("Qwen/Qwen2-Audio-7B-Instruct", {"audio"}),
-    ("fixie-ai/ultravox-v0_3", {"audio"}),
+    ("rhymes-ai/Aria", {"image": True}),
+    ("Salesforce/blip2-opt-2.7b", {"image": False}),
+    ("facebook/chameleon-7b", {"image": True}),
+    ("adept/fuyu-8b", {"image": False}),
+    ("llava-hf/llava-1.5-7b-hf", {"image": True}),
+    ("TIGER-Lab/Mantis-8B-siglip-llama3", {"image": True}),
+    ("mistral-community/pixtral-12b", {"image": True}),
+    ("Qwen/Qwen2-VL-2B-Instruct", {"image": True, "video": True}),
+    ("Qwen/Qwen2-Audio-7B-Instruct", {"audio": True}),
+    ("fixie-ai/ultravox-v0_3", {"audio": True}),
 ])
 @pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
 @pytest.mark.parametrize("num_batches", [32])
@@ -637,7 +640,7 @@ def _test_processing_cache_correctness(
 # yapf: enable
 def test_processing_cache_correctness(
     model_id: str,
-    modalities: set[str],
+    modalities: dict[str, bool],
     hit_rate: float,
     num_batches: int,
     simplify_rate: float,
@@ -653,7 +656,7 @@ def test_processing_cache_correctness(
 
 # yapf: disable
 @pytest.mark.parametrize(("model_id", "modalities"), [
-    ("microsoft/Phi-3-vision-128k-instruct", {"image"}),
+    ("microsoft/Phi-3-vision-128k-instruct", {"image": True}),
 ])
 @pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
 @pytest.mark.parametrize("num_batches", [32])
@@ -661,7 +664,7 @@ def test_processing_cache_correctness(
 # yapf: enable
 def test_processing_cache_correctness_phi3v(
     model_id: str,
-    modalities: set[str],
+    modalities: dict[str, bool],
     hit_rate: float,
     num_batches: int,
     simplify_rate: float,
diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py
index 9437ad9688422..4ad6e859f4d93 100644
--- a/vllm/model_executor/models/aria.py
+++ b/vllm/model_executor/models/aria.py
@@ -1,15 +1,15 @@
-import math
-from typing import Iterable, List, Optional, Set, Tuple, TypedDict, Union
+from typing import (Iterable, List, Mapping, Optional, Set, Tuple, TypedDict,
+                    Union)
 
 import torch
 import torch.nn as nn
 from torch.nn.init import trunc_normal_
-from transformers import LlamaConfig
+from transformers import BatchFeature, PretrainedConfig
 
 from vllm.attention import AttentionMetadata
 from vllm.config import CacheConfig, QuantizationConfig, VllmConfig
 from vllm.distributed import get_tensor_model_parallel_rank
-from vllm.inputs import INPUT_REGISTRY, token_inputs
+from vllm.inputs import InputContext
 from vllm.model_executor.layers.activation import get_act_fn
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
@@ -17,30 +17,27 @@
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
     get_compressed_tensors_cache_scale)
-from vllm.model_executor.layers.sampler import (Sampler, SamplerOutput,
-                                                SamplingMetadata)
+from vllm.model_executor.layers.sampler import (SamplerOutput,
+                                                SamplingMetadata, get_sampler)
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.model_executor.model_loader.weight_utils import (
     default_weight_loader, maybe_remap_kv_scale_name)
-from vllm.model_executor.models.idefics2_vision_model import (
-    Idefics2VisionTransformer)
-from vllm.model_executor.models.interfaces import SupportsMultiModal
-from vllm.model_executor.models.llama import (LlamaDecoderLayer, LlamaMLP,
-                                              LlamaModel)
-from vllm.model_executor.models.utils import (AutoWeightsLoader, WeightsMapper,
-                                              is_pp_missing_parameter,
-                                              maybe_prefix,
-                                              merge_multimodal_embeddings)
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.image import cached_get_image_processor
-from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors
-from vllm.multimodal.utils import (cached_get_tokenizer,
-                                   repeat_and_pad_placeholder_tokens)
+from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
+                                    NestedTensors)
+from vllm.multimodal.processing import (BaseMultiModalProcessor,
+                                        MultiModalDataItems, ProcessorInputs,
+                                        PromptReplacement)
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.configs.aria import (AriaMoELMConfig,
                                                   AriaVisionConfig)
 
-from .utils import flatten_bn
+from .idefics2_vision_model import Idefics2VisionTransformer
+from .interfaces import SupportsMultiModal
+from .llama import LlamaDecoderLayer, LlamaMLP, LlamaModel
+from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,
+                    is_pp_missing_parameter, maybe_prefix,
+                    merge_multimodal_embeddings)
 
 
 class AriaImagePixelInputs(TypedDict):
@@ -251,7 +248,7 @@ def forward(self, x, attn_mask=None):
 class AriaFusedMoE(FusedMoE):
 
     def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor,
-                      shard_id: str) -> Set[str]:
+                      shard_id: str) -> None:
         # Override the weight_loader to handle the expert weights in the Aria
         # model, which are already packed with experts, and merge the gate and
         # up weights for each expert.
@@ -346,7 +343,7 @@ class MoEDecoderLayer(LlamaDecoderLayer):
 
     def __init__(
         self,
-        config: LlamaConfig,
+        config: AriaMoELMConfig,
         cache_config: Optional[CacheConfig] = None,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
@@ -434,7 +431,7 @@ def load_weights(self, weights: Iterable[Tuple[str,
         return loaded_params
 
 
-def build_mm_projector(config):
+def build_mm_projector(config: PretrainedConfig):
     return AriaProjector(
         patch_to_query_dict=config.projector_patch_to_query_dict,
         embed_dim=config.vision_config.hidden_size,
@@ -445,75 +442,70 @@ def build_mm_projector(config):
     )
 
 
-def get_max_multimodal_tokens(ctx):
-    return max(ctx.model_config.hf_config.image_size2tokens.values())
-
-
-def input_mapper_for_aria(ctx, data):
-    return MultiModalKwargs(data)
+def get_max_aria_image_tokens(ctx: InputContext):
+    hf_config = ctx.get_hf_config()
+    return max(hf_config.projector_patch_to_query_dict.values())
 
 
-def input_processor(ctx, llm_inputs):
-    multi_modal_data = llm_inputs.get("multi_modal_data")
-    # if it is pure text input, use it as is
-    if multi_modal_data is None or "image" not in multi_modal_data:
-        return llm_inputs
+class AriaMultiModalProcessor(BaseMultiModalProcessor):
 
-    model_config = ctx.model_config
-
-    tokenizer = cached_get_tokenizer(model_config.tokenizer)
-    image_processor = cached_get_image_processor(
-        model_config.model, trust_remote_code=model_config.trust_remote_code)
-    hf_config = model_config.hf_config
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        return dict(
+            pixel_values=MultiModalFieldConfig.batched("image"),
+            pixel_mask=MultiModalFieldConfig.batched("image"),
+        )
 
-    # prepare image tokens, the max_image_size is used to determine the number
-    # of patch_size for every image
-    max_image_size = multi_modal_data.pop("max_image_size", 980)
-    _split_image = multi_modal_data.pop("split_image", False)
+    def _get_prompt_replacements(
+        self,
+        mm_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, object],
+        out_mm_kwargs: MultiModalKwargs,
+    ) -> list[PromptReplacement]:
+        hf_config = self.ctx.get_hf_config()
+        image_token_id = hf_config.image_token_index
+
+        max_image_tokens = get_max_aria_image_tokens(self.ctx)
+
+        return [
+            PromptReplacement(
+                modality="image",
+                target=[image_token_id],
+                replacement=[image_token_id] * max_image_tokens,
+            )
+        ]
 
-    assert isinstance(max_image_size,
-                      (int, float)), "max_image_size should be float or int"
-    images = (multi_modal_data["image"] if isinstance(
-        multi_modal_data["image"], list) else [multi_modal_data["image"]])
+    def _get_dummy_mm_inputs(
+        self,
+        mm_counts: Mapping[str, int],
+    ) -> ProcessorInputs:
+        hf_config = self.ctx.get_hf_config()
+        vision_config: AriaVisionConfig = hf_config.vision_config
+
+        max_image_size = vision_config.image_size
+        num_images = mm_counts.get("image", 0)
+
+        mm_data = {
+            "image":
+            self._get_dummy_images(width=max_image_size,
+                                   height=max_image_size,
+                                   num_images=num_images)
+        }
 
-    image_inputs = image_processor.preprocess(images,
-                                              max_image_size=max_image_size,
-                                              split_image=_split_image,
-                                              return_tensors="pt").data
-    image_inputs['pixel_values'] = image_inputs['pixel_values'].to(
-        ctx.model_config.dtype)
-    num_crops = image_inputs.pop("num_crops")
+        hf_processor = self._get_hf_processor()
+        image_token: str = hf_processor.image_token  # type: ignore
 
-    prompt_token_ids = llm_inputs["prompt_token_ids"]
-    if num_crops.sum().item() > 0:
-        _, prompt_token_ids, _ = repeat_and_pad_placeholder_tokens(
-            tokenizer,
-            None,
-            prompt_token_ids,
-            placeholder_token_id=hf_config.image_token_index,
-            repeat_count=num_crops,
+        return ProcessorInputs(
+            prompt_text=image_token * num_images,
+            mm_data=mm_data,
         )
 
-    repeat_count = [hf_config.image_size2tokens[max_image_size]
-                    ] * sum(num_crops).item()
-    new_prompt, new_token_ids, _ = repeat_and_pad_placeholder_tokens(
-        tokenizer,
-        None,
-        prompt_token_ids,
-        placeholder_token_id=hf_config.image_token_index,
-        repeat_count=repeat_count,
-    )
-
-    return token_inputs(
-        prompt_token_ids=new_token_ids,
-        prompt=new_prompt,
-        multi_modal_data={"image": image_inputs},
-    )
-
 
-@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_multimodal_tokens)
-@MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_aria)
-@INPUT_REGISTRY.register_input_processor(input_processor)
+@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_aria_image_tokens)
+@MULTIMODAL_REGISTRY.register_processor(AriaMultiModalProcessor)
 class AriaForConditionalGeneration(nn.Module, SupportsMultiModal):
     """
     Aria model for conditional generation tasks.
@@ -540,12 +532,6 @@ def __init__(
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
 
-        # prepare the image_size to tokens mapping for the image preprocess, see
-        # input_processor
-        config.image_size2tokens = {
-            int(math.sqrt(k) * config.vision_config.patch_size): v
-            for k, v in config.projector_patch_to_query_dict.items()
-        }
         self.config = config
         self.vision_tower = AriaVisionModel(config.vision_config)
         self.multi_modal_projector = build_mm_projector(config)
@@ -566,7 +552,7 @@ def __init__(
         logit_scale = getattr(config, "logit_scale", 1.0)
         self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
                                                 self.vocab_size, logit_scale)
-        self.sampler = Sampler()
+        self.sampler = get_sampler()
 
     def _validate_image_sizes(
             self, images: List[torch.Tensor]) -> List[torch.Tensor]:
@@ -588,7 +574,12 @@ def _parse_and_validate_image_input(
 
         pixel_values = self._validate_image_sizes(pixel_values)
         pixel_values = flatten_bn(pixel_values, concat=True)
+
         if pixel_mask is not None:
+            if not isinstance(pixel_mask, (torch.Tensor, list)):
+                raise ValueError("Incorrect type of pixel mask. "
+                                 f"Got type: {type(pixel_mask)}")
+
             pixel_mask = flatten_bn(pixel_mask, concat=True)
 
         return AriaImagePixelInputs(
diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py
index 42a239cadac46..987dfaf44f228 100644
--- a/vllm/model_executor/models/blip.py
+++ b/vllm/model_executor/models/blip.py
@@ -4,22 +4,16 @@
 
 import torch
 import torch.nn as nn
-from PIL import Image
 from transformers import Blip2VisionConfig, BlipVisionConfig
 
 from vllm.attention.layer import MultiHeadAttention
-from vllm.config import ModelConfig
 from vllm.distributed import divide, get_tensor_model_parallel_world_size
-from vllm.inputs import DecoderOnlyInputs, token_inputs
 from vllm.model_executor.layers.activation import get_act_fn
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                                QKVParallelLinear,
                                                RowParallelLinear)
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
-from vllm.multimodal.utils import (cached_get_tokenizer,
-                                   repeat_and_pad_placeholder_tokens)
-from vllm.sequence import SequenceData
 
 
 def get_blip_patch_grid_length(*, image_size: int, patch_size: int) -> int:
@@ -33,92 +27,6 @@ def get_blip_num_patches(*, image_size: int, patch_size: int) -> int:
     return grid_length * grid_length
 
 
-def get_blip_image_feature_size(
-        hf_config: Union[BlipVisionConfig, Blip2VisionConfig]) -> int:
-    return get_blip_num_patches(image_size=hf_config.image_size,
-                                patch_size=hf_config.patch_size)
-
-
-def get_max_blip_image_tokens(
-        hf_config: Union[BlipVisionConfig, Blip2VisionConfig]) -> int:
-    return get_blip_image_feature_size(hf_config)
-
-
-def dummy_seq_data_for_blip(
-    hf_config: Union[BlipVisionConfig, Blip2VisionConfig],
-    seq_len: int,
-    num_images: int,
-    *,
-    image_token_id: int,
-    image_feature_size_override: Optional[int] = None,
-):
-    if image_feature_size_override is None:
-        image_feature_size = get_blip_image_feature_size(hf_config)
-    else:
-        image_feature_size = image_feature_size_override
-
-    return SequenceData.from_prompt_token_counts(
-        (image_token_id, image_feature_size * num_images),
-        (0, seq_len - image_feature_size * num_images),
-    )
-
-
-def dummy_image_for_blip(
-    hf_config: Union[BlipVisionConfig, Blip2VisionConfig],
-    num_images: int,
-    *,
-    image_width_override: Optional[int] = None,
-    image_height_override: Optional[int] = None,
-):
-    width = height = hf_config.image_size
-    if image_width_override is not None:
-        width = image_width_override
-    if image_height_override is not None:
-        height = image_height_override
-
-    image = Image.new("RGB", (width, height), color=0)
-    return {"image": image if num_images == 1 else [image] * num_images}
-
-
-def input_processor_for_blip(
-    model_config: ModelConfig,
-    hf_config: Union[BlipVisionConfig, Blip2VisionConfig],
-    inputs: DecoderOnlyInputs,
-    *,
-    image_token_id: int,
-    image_feature_size_override: Optional[int] = None,
-):
-    multi_modal_data = inputs.get("multi_modal_data")
-    if multi_modal_data is None or "image" not in multi_modal_data:
-        return inputs
-
-    if "multi_modal_placeholders" in inputs and "image" in inputs[
-            "multi_modal_placeholders"]:
-        # The inputs already have placeholders.
-        return inputs
-
-    tokenizer = cached_get_tokenizer(model_config.tokenizer)
-
-    if image_feature_size_override is None:
-        image_feature_size = get_blip_image_feature_size(hf_config)
-    else:
-        image_feature_size = image_feature_size_override
-
-    new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens(
-        tokenizer,
-        inputs.get("prompt"),
-        inputs["prompt_token_ids"],
-        placeholder_token_id=image_token_id,
-        repeat_count=image_feature_size,
-    )
-
-    # NOTE: Create a defensive copy of the original inputs
-    return token_inputs(prompt_token_ids=new_token_ids,
-                        prompt=new_prompt,
-                        multi_modal_data=multi_modal_data,
-                        multi_modal_placeholders={"image": ranges})
-
-
 # Adapted from https://github.com/huggingface/transformers/blob/v4.39.0/src/transformers/models/blip/modeling_blip.py#L164 # noqa
 class BlipVisionEmbeddings(nn.Module):
 
diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py
index 76b8505ee1c2a..bf70f5d904f5b 100644
--- a/vllm/model_executor/models/blip2.py
+++ b/vllm/model_executor/models/blip2.py
@@ -4,32 +4,33 @@
 
 import torch
 import torch.nn as nn
-from transformers import (Blip2Config, Blip2QFormerConfig, Blip2VisionConfig,
-                          apply_chunking_to_forward)
+from transformers import (BatchFeature, Blip2Config, Blip2Processor,
+                          Blip2QFormerConfig, apply_chunking_to_forward)
 
 from vllm.attention import AttentionMetadata
 from vllm.config import CacheConfig, VllmConfig
-from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
-                         InputContext, token_inputs)
+from vllm.inputs import InputContext
 from vllm.model_executor.layers.activation import get_act_fn
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import NestedTensors
-from vllm.multimodal.utils import consecutive_placeholder_ranges
-from vllm.sequence import IntermediateTensors, SequenceData
-
-from .blip import (BlipVisionModel, dummy_image_for_blip,
-                   get_max_blip_image_tokens)
+from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
+                                    MultiModalInputsV2, MultiModalKwargs,
+                                    NestedTensors, PlaceholderRange)
+from vllm.multimodal.processing import (BaseMultiModalProcessor,
+                                        MultiModalDataItems, ProcessorInputs,
+                                        PromptReplacement)
+from vllm.sequence import IntermediateTensors
+
+from .blip import BlipVisionModel
 from .interfaces import SupportsMultiModal, SupportsPP
 from .utils import (AutoWeightsLoader, init_vllm_registered_model,
                     maybe_prefix, merge_multimodal_embeddings)
 
 # We use this internally as placeholders since there is no image token
 # defined on the HuggingFace repo
-BLIP2_IMAGE_TOKEN = "<image>"
-BLIP2_IMAGE_TOKEN_ID = 50265
+_IMAGE_TOKEN_ID = 50265
 
 
 class Blip2ImagePixelInputs(TypedDict):
@@ -396,92 +397,87 @@ def forward(
         return sequence_output
 
 
-def get_blip2_image_feature_size(hf_config: Blip2Config) -> int:
-    return hf_config.num_query_tokens
-
-
 def get_max_blip2_image_tokens(ctx: InputContext):
     hf_config = ctx.get_hf_config(Blip2Config)
-    vision_config = hf_config.vision_config
-
-    if isinstance(vision_config, Blip2VisionConfig):
-        return get_max_blip_image_tokens(vision_config)
-
-    msg = f"Unsupported vision config: {type(vision_config)}"
-    raise NotImplementedError(msg)
-
-
-def dummy_seq_data_for_blip2(
-    hf_config: Blip2Config,
-    seq_len: int,
-    num_images: int,
-    *,
-    image_token_id: int,
-    image_feature_size_override: Optional[int] = None,
-):
-    if image_feature_size_override is None:
-        image_feature_size = get_blip2_image_feature_size(hf_config)
-    else:
-        image_feature_size = image_feature_size_override
-
-    return SequenceData.from_prompt_token_counts(
-        (image_token_id, image_feature_size * num_images),
-        (0, seq_len - image_feature_size * num_images),
-    ), {
-        "image":
-        consecutive_placeholder_ranges(num_items=num_images,
-                                       item_size=image_feature_size)
-    }
-
-
-def dummy_data_for_blip2(ctx: InputContext, seq_len: int,
-                         mm_counts: Mapping[str, int]):
-    hf_config = ctx.get_hf_config(Blip2Config)
-    vision_config = hf_config.vision_config
-    num_images = mm_counts["image"]
-
-    seq_data, ranges = dummy_seq_data_for_blip2(
-        hf_config,
-        seq_len,
-        num_images,
-        image_token_id=BLIP2_IMAGE_TOKEN_ID,
-    )
-
-    if isinstance(vision_config, Blip2VisionConfig):
-        mm_data = dummy_image_for_blip(vision_config, num_images)
-
-        return DummyData(seq_data, mm_data, ranges)
-
-    msg = f"Unsupported vision config: {type(vision_config)}"
-    raise NotImplementedError(msg)
+    return hf_config.num_query_tokens
 
 
-def input_processor_for_blip2(ctx: InputContext, inputs: DecoderOnlyInputs):
-    multi_modal_data = inputs.get("multi_modal_data")
-    if multi_modal_data is None or "image" not in multi_modal_data:
-        return inputs
+class Blip2MultiModalProcessor(BaseMultiModalProcessor):
 
-    hf_config = ctx.get_hf_config(Blip2Config)
-    image_feature_size = get_blip2_image_feature_size(hf_config)
+    def _get_hf_processor(self) -> Blip2Processor:
+        return self.ctx.get_hf_processor(Blip2Processor)
 
-    # The original model places image tokens at the front
-    # https://github.com/huggingface/transformers/blob/v4.41.2/src/transformers/models/blip_2/modeling_blip_2.py#L1514
-    new_token_ids = [BLIP2_IMAGE_TOKEN_ID] * image_feature_size
-    new_token_ids += inputs["prompt_token_ids"]
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        return dict(
+            pixel_values=MultiModalFieldConfig.batched("image"),
+            image_embeds=MultiModalFieldConfig.batched("image"),
+        )
 
-    new_prompt = inputs.get("prompt")
-    if new_prompt is not None:
-        new_prompt = BLIP2_IMAGE_TOKEN * image_feature_size + new_prompt
+    def _get_prompt_replacements(
+        self,
+        mm_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, object],
+        out_mm_kwargs: MultiModalKwargs,
+    ) -> list[PromptReplacement]:
+        max_image_tokens = get_max_blip2_image_tokens(self.ctx)
+
+        return [
+            PromptReplacement(
+                modality="image",
+                target="</s>",
+                replacement="<image>" * max_image_tokens + "</s>",
+            )
+        ]
 
-    return token_inputs(prompt_token_ids=new_token_ids,
-                        prompt=new_prompt,
-                        multi_modal_data=multi_modal_data)
+    def apply(
+        self,
+        prompt_text: str,
+        mm_data: MultiModalDataDict,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> MultiModalInputsV2:
+        result = super().apply(prompt_text, mm_data, hf_processor_mm_kwargs)
+
+        # Only <image> tokens should be considered as placeholders,
+        # so we ignore the trailing bos_token
+        result["mm_placeholders"] = {
+            modality: [
+                PlaceholderRange(offset=p["offset"], length=p["length"] - 1)
+                for p in ps
+            ]
+            for modality, ps in result["mm_placeholders"].items()
+        }
+
+        return result
+
+    def _get_dummy_mm_inputs(
+        self,
+        mm_counts: Mapping[str, int],
+    ) -> ProcessorInputs:
+        hf_config = self.ctx.get_hf_config(Blip2Config)
+        vision_config = hf_config.vision_config
+
+        max_image_size = vision_config.image_size
+        num_images = mm_counts.get("image", 0)
+
+        mm_data = {
+            "image":
+            self._get_dummy_images(width=max_image_size,
+                                   height=max_image_size,
+                                   num_images=num_images)
+        }
+
+        return ProcessorInputs(
+            prompt_text="",
+            mm_data=mm_data,
+        )
 
 
-@MULTIMODAL_REGISTRY.register_image_input_mapper()
 @MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_blip2_image_tokens)
-@INPUT_REGISTRY.register_dummy_data(dummy_data_for_blip2)
-@INPUT_REGISTRY.register_input_processor(input_processor_for_blip2)
+@MULTIMODAL_REGISTRY.register_processor(Blip2MultiModalProcessor)
 class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
@@ -627,7 +623,7 @@ def get_input_embeddings(
         if multimodal_embeddings is not None:
             inputs_embeds = merge_multimodal_embeddings(
                 input_ids, inputs_embeds, multimodal_embeddings,
-                BLIP2_IMAGE_TOKEN_ID)
+                _IMAGE_TOKEN_ID)
         return inputs_embeds
 
     def forward(
diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py
index a40c321ce0a58..85fca23b05746 100644
--- a/vllm/model_executor/models/chameleon.py
+++ b/vllm/model_executor/models/chameleon.py
@@ -3,16 +3,15 @@
                     Tuple, TypedDict, Union)
 
 import torch
+import torch.nn as nn
 import torch.nn.functional as F
-from PIL import Image
-from torch import nn
-from transformers import ChameleonConfig, ChameleonVQVAEConfig
+from transformers import (BatchFeature, ChameleonConfig, ChameleonProcessor,
+                          ChameleonVQVAEConfig)
 
 from vllm.attention import Attention, AttentionMetadata
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
-from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
-                         InputContext, token_inputs)
+from vllm.inputs import InputContext
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
@@ -29,11 +28,13 @@
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import NestedTensors
-from vllm.multimodal.utils import (cached_get_tokenizer,
-                                   consecutive_placeholder_ranges,
-                                   repeat_and_pad_placeholder_tokens)
-from vllm.sequence import IntermediateTensors, SequenceData
+from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
+                                    MultiModalInputsV2, MultiModalKwargs,
+                                    NestedTensors, PlaceholderRange)
+from vllm.multimodal.processing import (BaseMultiModalProcessor,
+                                        MultiModalDataItems, ProcessorInputs,
+                                        PromptReplacement)
+from vllm.sequence import IntermediateTensors
 from vllm.utils import print_warning_once
 
 from .interfaces import SupportsMultiModal, SupportsPP
@@ -45,10 +46,6 @@
 # and processor files, so we hardcode them in the model file for now.
 CHAMELEON_CROP_SIZE_HEIGHT = CHAMELEON_CROP_SIZE_WIDTH = 512
 CHAMELEON_IMAGE_SEQ_LENGTH = 1024
-CHAMELEON_IMAGE_TOKEN_ID = 8711
-CHAMELEON_IMAGE_START_TOKEN_ID = 8197
-CHAMELEON_IMAGE_END_TOKEN_ID = 8196
-CHAMELEON_SEP_TOKEN_ID = 8710
 
 
 class ChameleonImagePixelInputs(TypedDict):
@@ -61,99 +58,75 @@ def get_max_chameleon_image_tokens(ctx: InputContext):
     return CHAMELEON_IMAGE_SEQ_LENGTH
 
 
-def dummy_seq_data_for_chameleon(
-    seq_len: int,
-    num_images: int,
-    *,
-    image_token_id: int,
-    image_feature_size_override: Optional[int] = None,
-):
-    if image_feature_size_override is None:
-        image_feature_size = CHAMELEON_IMAGE_SEQ_LENGTH
-    else:
-        image_feature_size = image_feature_size_override
-
-    return SequenceData.from_prompt_token_counts(
-        (image_token_id, image_feature_size * num_images),
-        (0, seq_len - image_feature_size * num_images),
-    ), {
-        "image":
-        consecutive_placeholder_ranges(num_items=num_images,
-                                       item_size=image_feature_size)
-    }
-
-
-def dummy_image_for_chameleon(
-    num_images: int,
-    *,
-    image_width_override: Optional[int] = None,
-    image_height_override: Optional[int] = None,
-):
-    width = CHAMELEON_CROP_SIZE_WIDTH
-    height = CHAMELEON_CROP_SIZE_HEIGHT
-    if image_width_override is not None:
-        width = image_width_override
-    if image_height_override is not None:
-        height = image_height_override
-
-    image = Image.new("RGB", (width, height), color=0)
-    return {"image": image if num_images == 1 else [image] * num_images}
-
-
-def dummy_data_for_chameleon(ctx: InputContext, seq_len: int,
-                             mm_counts: Mapping[str, int]):
-    num_images = mm_counts["image"]
-
-    seq_data, ranges = dummy_seq_data_for_chameleon(
-        seq_len,
-        num_images,
-        image_token_id=CHAMELEON_IMAGE_TOKEN_ID,
-    )
-
-    mm_data = dummy_image_for_chameleon(num_images)
-    return DummyData(seq_data, mm_data, ranges)
-
-
-def input_processor_for_chameleon(ctx: InputContext,
-                                  inputs: DecoderOnlyInputs):
+class ChameleonMultiModalProcessor(BaseMultiModalProcessor):
 
-    """
-    Processing input prompt to insert required tokens for image placeholder.
-
-    See https://github.com/huggingface/transformers/blob/0fdea8607d7e01eb0e38a1ebeb7feee30a22f0cf/src/transformers/models/chameleon/processing_chameleon.py#L58
-    """ # noqa
-
-    multi_modal_data = inputs.get("multi_modal_data")
-    if multi_modal_data is None or "image" not in multi_modal_data:
-        return inputs
-
-    if "multi_modal_placeholders" in inputs and "image" in inputs[
-            "multi_modal_placeholders"]:
-        # The inputs already have placeholders.
-        return inputs
-
-    model_config = ctx.model_config
-    tokenizer = cached_get_tokenizer(model_config.tokenizer)
-    new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens(
-        tokenizer,
-        inputs.get("prompt"),
-        inputs["prompt_token_ids"],
-        placeholder_token_id=CHAMELEON_IMAGE_TOKEN_ID,
-        repeat_count=CHAMELEON_IMAGE_SEQ_LENGTH,
-        pad_token_left=CHAMELEON_IMAGE_START_TOKEN_ID,
-        pad_token_right=CHAMELEON_IMAGE_END_TOKEN_ID,
-    )
-
-    # Appending sep token for chat mode to follow default processor
-    # behavior
-    if new_prompt is not None:
-        new_prompt += tokenizer.sep_token
-    new_token_ids += [CHAMELEON_SEP_TOKEN_ID]
-
-    # NOTE: Create a defensive copy of the original inputs
-    return token_inputs(prompt_token_ids=new_token_ids,
-                        prompt=new_prompt,
-                        multi_modal_data=multi_modal_data)
+    def _get_hf_processor(self) -> ChameleonProcessor:
+        return self.ctx.get_hf_processor(ChameleonProcessor)
+
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        return dict(pixel_values=MultiModalFieldConfig.batched("image"))
+
+    def _get_prompt_replacements(
+        self,
+        mm_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, object],
+        out_mm_kwargs: MultiModalKwargs,
+    ) -> list[PromptReplacement]:
+        processor = self._get_hf_processor()
+
+        return [
+            PromptReplacement(
+                modality="image",
+                target="<image>",
+                replacement="".join([
+                    processor.image_start_token,
+                    processor.image_token * CHAMELEON_IMAGE_SEQ_LENGTH,
+                    processor.image_end_token,
+                ]),
+            )
+        ]
+
+    def _get_dummy_mm_inputs(
+        self,
+        mm_counts: Mapping[str, int],
+    ) -> ProcessorInputs:
+        num_images = mm_counts.get("image", 0)
+
+        mm_data = {
+            "image":
+            self._get_dummy_images(width=CHAMELEON_CROP_SIZE_WIDTH,
+                                   height=CHAMELEON_CROP_SIZE_HEIGHT,
+                                   num_images=num_images)
+        }
+
+        return ProcessorInputs(
+            prompt_text="<image>" * num_images,
+            mm_data=mm_data,
+        )
+
+    def apply(
+        self,
+        prompt_text: str,
+        mm_data: MultiModalDataDict,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> MultiModalInputsV2:
+        result = super().apply(prompt_text, mm_data, hf_processor_mm_kwargs)
+
+        # Only <image> tokens should be considered as placeholders,
+        # so we ignore the image_start_token and image_end_token
+        result["mm_placeholders"] = {
+            modality: [
+                PlaceholderRange(offset=p["offset"] + 1,
+                                 length=p["length"] - 2) for p in ps
+            ]
+            for modality, ps in result["mm_placeholders"].items()
+        }
+
+        return result
 
 
 class ChameleonLayerNorm(nn.LayerNorm):
@@ -736,7 +709,7 @@ def forward(self, pixel_values: torch.Tensor):
         for i_level in range(self.num_resolutions):
             for i_block in range(self.num_res_blocks):
                 hidden_state = self.down[i_level].block[i_block](
-                    hidden_states[-1], )
+                    hidden_states[-1])
                 if len(self.down[i_level].attn) > 0:
                     hidden_state = self.down[i_level].attn[i_block](
                         hidden_state)
@@ -925,10 +898,8 @@ def forward(
         return hidden_states
 
 
-@MULTIMODAL_REGISTRY.register_image_input_mapper()
 @MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_chameleon_image_tokens)
-@INPUT_REGISTRY.register_dummy_data(dummy_data_for_chameleon)
-@INPUT_REGISTRY.register_input_processor(input_processor_for_chameleon)
+@MULTIMODAL_REGISTRY.register_processor(ChameleonMultiModalProcessor)
 class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal,
                                         SupportsPP):
 
diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py
index 6e86900326c4b..8c14866f20b92 100644
--- a/vllm/model_executor/models/fuyu.py
+++ b/vllm/model_executor/models/fuyu.py
@@ -15,32 +15,30 @@
 # limitations under the License.
 """ PyTorch Fuyu model."""
 import math
-from array import array
 from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple,
                     TypedDict)
 
 import torch
 import torch.nn as nn
-import torch.utils.checkpoint
-from PIL import Image
-from transformers import FuyuImageProcessor
+from transformers import (BatchFeature, FuyuConfig, FuyuImageProcessor,
+                          FuyuProcessor)
 
 from vllm.attention import AttentionMetadata
 from vllm.config import VllmConfig
-from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
-                         InputContext, token_inputs)
+from vllm.inputs import InputContext
 from vllm.model_executor.layers.linear import ColumnParallelLinear
 from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.model_executor.models.persimmon import PersimmonForCausalLM
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
-from vllm.multimodal.image import cached_get_image_processor
-from vllm.multimodal.inputs import NestedTensors
-from vllm.multimodal.utils import (cached_get_tokenizer,
-                                   consecutive_placeholder_ranges)
-from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors,
-                           SequenceData)
-from vllm.utils import is_list_of
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
+                                    MultiModalInputsV2, MultiModalKwargs,
+                                    NestedTensors, PlaceholderRange)
+from vllm.multimodal.parse import ImageProcessorItems
+from vllm.multimodal.processing import (BaseMultiModalProcessor,
+                                        MultiModalDataItems, ProcessorInputs,
+                                        PromptReplacement)
+from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsMultiModal, SupportsPP
 from .utils import (AutoWeightsLoader, flatten_bn, maybe_prefix,
@@ -54,178 +52,193 @@
 MAX_IMAGE_FEATURE_SIZE_WIDTH = 1920
 
 
-class FuyuImagePixelInputs(TypedDict):
-    type: Literal["pixel_values"]
+class FuyuImagePatchInputs(TypedDict):
+    type: Literal["image_patches"]
     data: torch.Tensor
     """
     Shape: 
-    (batch_size, num_patches, patch_size_x * patch_size_y * num_channels)
+    `(batch_size * num_patches, patch_size_x * patch_size_y * num_channels)`
+    """
+
+    patches_per_image: List[int]
+    """
+    List of number of total patches for each image in the batch.
+    This is used to restore the first two dimensions of `data`.
     """
 
 
-def _calculate_num_image_tokens(
-    height: int,
-    width: int,
+def _get_fuyu_num_image_tokens(
+    image_height: int,
+    image_width: int,
 ) -> Tuple[int, int]:
     """
-    calculate number of image tokens needed for a given image size
-    The expected Fuyu image prompts is in format:
-        (image_token * ncols + newline_token) * nrows
-    args:
-        image_size: Tuple[int, int] - (width, height) of the image
-    returns:
-        ncols: int - number of image tokens in x direction
-        nrows: int - number of image tokens in y direction
-    """
-    ncol = math.ceil(width / 30)
-    nrow = math.ceil(height / 30)
-    return ncol, nrow
+    Calculate the number of image tokens needed for a given image size.
 
+    The expected Fuyu image prompts can be expressed as:
 
-def get_max_fuyu_image_feature_size():
+    .. code-block::
+        (image_token * ncols + newline_token) * nrows
 
-    return _calculate_num_image_tokens(
-        height=MAX_IMAGE_FEATURE_SIZE_HEIGHT,
-        width=MAX_IMAGE_FEATURE_SIZE_WIDTH,
-    )
+    Args:
+        image_size: Tuple[int, int] - `(width, height)` of the image
+
+    Returns:
+        ncols: int - number of image tokens in `x` direction
+        nrows: int - number of image tokens in `y` direction
+    """
+    ncols = math.ceil(image_width / 30)
+    nrows = math.ceil(image_height / 30)
+    return ncols, nrows
 
 
 def get_max_fuyu_image_tokens(ctx: InputContext):
-    ncol, nrow = get_max_fuyu_image_feature_size()
-    return (ncol + 1) * nrow
-
-
-def dummy_seq_data_for_fuyu(ctx: InputContext, seq_len: int, num_images: int):
-    ncol, nrow = get_max_fuyu_image_feature_size()
-    image_feature_size = get_max_fuyu_image_tokens(ctx)
-
-    image_token_ids = (
-        array(VLLM_TOKEN_ID_ARRAY_TYPE, [_IMAGE_TOKEN_ID]) * ncol +
-        array(VLLM_TOKEN_ID_ARRAY_TYPE, [_NEWLINE_TOKEN_ID])) * nrow
-    token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE, image_token_ids) * num_images
-    token_ids += array(VLLM_TOKEN_ID_ARRAY_TYPE,
-                       [0]) * (seq_len - image_feature_size * num_images)
-    return SequenceData(token_ids), {
-        "image":
-        consecutive_placeholder_ranges(num_items=num_images,
-                                       item_size=image_feature_size)
-    }
-
-
-def dummy_image_for_fuyu(
-    num_images: int,
-    *,
-    image_width: int,
-    image_height: int,
-):
-    image = Image.new("RGB", (image_width, image_height), color=0)
-    return {"image": image if num_images == 1 else [image] * num_images}
-
-
-def dummy_data_for_fuyu(ctx: InputContext, seq_len: int,
-                        mm_counts: Mapping[str, int]):
-    num_images = mm_counts["image"]
-    seq_data, ranges = dummy_seq_data_for_fuyu(ctx, seq_len, num_images)
-    mm_data = dummy_image_for_fuyu(num_images,
-                                   image_width=MAX_IMAGE_FEATURE_SIZE_WIDTH,
-                                   image_height=MAX_IMAGE_FEATURE_SIZE_HEIGHT)
-    return DummyData(seq_data, mm_data, ranges)
-
-
-def _fuyu_image_preprocess(image_processor: FuyuImageProcessor,
-                           data: List[Image.Image]):
-    image_encoding = image_processor.preprocess(data, return_tensors="pt")
-    batch_images = torch.stack([img[0] for img in image_encoding["images"]
-                                ]).unsqueeze(1)
-    image_unpadded_heights = torch.tensor(
-        image_encoding["image_unpadded_heights"])
-    image_unpadded_widths = torch.tensor(
-        image_encoding["image_unpadded_widths"])
-
-    batch_size = len(image_encoding["images"])
-    image_present = torch.ones(batch_size, 1, 1)
-    model_image_input = image_processor.preprocess_with_tokenizer_info(
-        image_input=batch_images,
-        image_present=image_present,
-        image_unpadded_h=image_unpadded_heights,
-        image_unpadded_w=image_unpadded_widths,
-        image_placeholder_id=_IMAGE_TOKEN_ID,
-        image_newline_id=_NEWLINE_TOKEN_ID,
-        variable_sized=True,
+    ncols, nrows = _get_fuyu_num_image_tokens(
+        image_height=MAX_IMAGE_FEATURE_SIZE_HEIGHT,
+        image_width=MAX_IMAGE_FEATURE_SIZE_WIDTH,
     )
-    return model_image_input
-
-
-def input_processor_for_fuyu(ctx: InputContext, inputs: DecoderOnlyInputs):
-    multi_modal_data = inputs.get("multi_modal_data")
-    if multi_modal_data is None or "image" not in multi_modal_data:
-        return inputs
-
-    model_config = ctx.model_config
-    image_data = multi_modal_data["image"]
-    new_multi_modal_data = {}
-    image_list = image_data if isinstance(image_data, list) else [image_data]
-
-    # process image data
-    if is_list_of(image_list, Image.Image):
-        # Fuyu's image_processor can also finish token padding
-        image_processor: FuyuImageProcessor = cached_get_image_processor(
-            model_config.model)
-
-        model_image_input = _fuyu_image_preprocess(image_processor, image_data)
-        image_patches = torch.cat([
-            image_patch[0]
-            for image_patch in model_image_input["image_patches"]
-        ])
-        new_multi_modal_data["image"] = image_patches
-
-    elif is_list_of(image_list, torch.Tensor):
-        raise NotImplementedError("Embeddings input is not supported yet")
-    else:
-        raise TypeError(f"Invalid image type: {type(image_data)}")
-
-    # process prompts
-    prompt = inputs.get("prompt")
-    prompt_token_ids = inputs["prompt_token_ids"]
-    tokenizer = cached_get_tokenizer(model_config.model)
-    # dim0 is batch_size, dim1 is subseq_size which will always be 1
-    image_input_ids: List[List[
-        torch.Tensor]] = model_image_input["image_input_ids"]
-    image_input_ids = image_input_ids[0][0].tolist()
-    bos_token = tokenizer.encode("<s>", add_special_tokens=False)[1:]
-    boa_token = tokenizer.encode("\x04", add_special_tokens=False)[1:]
-
-    new_prompt = prompt + "\x04"
-    new_prompt_token_ids = image_input_ids + bos_token + prompt_token_ids[
-        1:] + boa_token
-
-    return token_inputs(prompt=new_prompt,
-                        prompt_token_ids=new_prompt_token_ids,
-                        multi_modal_data=new_multi_modal_data)
-
-
-def input_mapper_for_fuyu(ctx: InputContext, data: object):
-    model_config = ctx.model_config
-    data_list = data if isinstance(data, list) else [data]
-    if is_list_of(data_list, Image.Image):
-        # Fuyu's image_processor can also finish token padding
-        image_processor: FuyuImageProcessor = cached_get_image_processor(
-            model_config.model)
-
-        model_image_input = _fuyu_image_preprocess(image_processor, data_list)
-        data = torch.stack([
-            image_patch[0]
-            for image_patch in model_image_input["image_patches"]
-        ])
-
-    # image has been processed with prompt in input processor
-    return MultiModalKwargs({"pixel_values": data})
-
-
-@MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_fuyu)
+
+    return (ncols + 1) * nrows
+
+
+class FuyuMultiModalProcessor(BaseMultiModalProcessor):
+
+    def _get_hf_processor(self) -> FuyuProcessor:
+        return self.ctx.get_hf_processor(FuyuProcessor)
+
+    def _call_hf_processor(
+        self,
+        prompt: str,
+        mm_data: Mapping[str, object],
+        mm_kwargs: Mapping[str, object],
+    ) -> BatchFeature:
+
+        if not mm_data:
+            # Avoid warning from HF logger for text-only input
+            # Input_ids format: bos_token_id + prompt_token_ids + boa_token_id
+            # Tokenizer won't add boa_token_id by default, we add it manually.
+            tokenizer = self._get_tokenizer()
+            boa_token_id: int = tokenizer.vocab["<0x04>"]  # type: ignore
+            prompt_ids = tokenizer.encode(prompt) + [boa_token_id]
+            return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt")
+
+        processed_outputs = super()._call_hf_processor(
+            prompt=prompt,
+            mm_data=mm_data,
+            mm_kwargs=mm_kwargs,
+        )
+
+        image_patches = processed_outputs.get("image_patches")
+        if image_patches is not None:
+            images = mm_data["images"]
+            assert isinstance(images, list)
+
+            # Original output: (1, num_images, Pn, Px * Py * C)
+            # New output: (num_images, Pn, Px * Py * C)
+            assert (isinstance(image_patches, list)
+                    and len(image_patches) == 1)
+            assert (isinstance(image_patches[0], torch.Tensor)
+                    and len(image_patches[0]) == len(images))
+
+            processed_outputs["image_patches"] = image_patches[0]
+
+        return processed_outputs
+
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        return dict(image_patches=MultiModalFieldConfig.batched("image"))
+
+    def _get_prompt_replacements(
+        self,
+        mm_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, object],
+        out_mm_kwargs: MultiModalKwargs,
+    ) -> list[PromptReplacement]:
+        hf_config = self.ctx.get_hf_config(FuyuConfig)
+        bos_token_id = hf_config.bos_token_id
+
+        tokenizer = self._get_tokenizer()
+        eot_token_id = tokenizer.bos_token_id
+        assert isinstance(eot_token_id, int)
+
+        hf_processor = self._get_hf_processor()
+        image_processor: FuyuImageProcessor = hf_processor.image_processor
+        target_size = image_processor.size
+        target_height, target_width = (target_size["height"],
+                                       target_size["width"])
+
+        def get_replacement_fuyu(item_idx: int):
+            images = mm_items.get_items("image", ImageProcessorItems)
+            image_size = images.get_image_size(item_idx)
+            width, height = image_size.width, image_size.height
+            if not (width <= target_width and height <= target_height):
+                height_scale_factor = target_height / height
+                width_scale_factor = target_width / width
+                optimal_scale_factor = min(height_scale_factor,
+                                           width_scale_factor)
+
+                height = int(height * optimal_scale_factor)
+                width = int(width * optimal_scale_factor)
+
+            ncols, nrows = _get_fuyu_num_image_tokens(
+                image_width=width,
+                image_height=height,
+            )
+
+            return (([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows +
+                    [bos_token_id])
+
+        return [
+            PromptReplacement(
+                modality="image",
+                target=[eot_token_id],
+                replacement=get_replacement_fuyu,
+            )
+        ]
+
+    def apply(
+        self,
+        prompt_text: str,
+        mm_data: MultiModalDataDict,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> MultiModalInputsV2:
+        result = super().apply(prompt_text, mm_data, hf_processor_mm_kwargs)
+
+        # Only |SPEAKER| (image) tokens should be considered as placeholders,
+        # so we ignore the trailing bos_token_id
+        result["mm_placeholders"] = {
+            modality: [
+                PlaceholderRange(offset=p["offset"], length=p["length"] - 1)
+                for p in ps
+            ]
+            for modality, ps in result["mm_placeholders"].items()
+        }
+
+        return result
+
+    def _get_dummy_mm_inputs(
+        self,
+        mm_counts: Mapping[str, int],
+    ) -> ProcessorInputs:
+        num_images = mm_counts.get("image", 0)
+
+        mm_data = {
+            "image":
+            self._get_dummy_images(width=MAX_IMAGE_FEATURE_SIZE_WIDTH,
+                                   height=MAX_IMAGE_FEATURE_SIZE_HEIGHT,
+                                   num_images=num_images)
+        }
+
+        return ProcessorInputs(
+            prompt_text="",
+            mm_data=mm_data,
+        )
+
+
 @MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_fuyu_image_tokens)
-@INPUT_REGISTRY.register_dummy_data(dummy_data_for_fuyu)
-@INPUT_REGISTRY.register_input_processor(input_processor_for_fuyu)
+@MULTIMODAL_REGISTRY.register_processor(FuyuMultiModalProcessor)
 class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
@@ -280,28 +293,32 @@ def _validate_shape(d: torch.Tensor):
         return data.to(self.vision_embed_tokens.weight.dtype)
 
     def _parse_and_validate_image_input(
-            self, **kwargs: object) -> Optional[FuyuImagePixelInputs]:
-        pixel_values = kwargs.pop("pixel_values", None)
-
-        if pixel_values is not None:
-            if not isinstance(pixel_values, (torch.Tensor, list)):
+            self, **kwargs: object) -> Optional[FuyuImagePatchInputs]:
+        image_patches = kwargs.pop("image_patches", None)
+        if image_patches is not None:
+            if not isinstance(image_patches, (torch.Tensor, list)):
                 raise ValueError("Incorrect type of image patches. "
-                                 f"Got type: {type(pixel_values)}")
+                                 f"Got type: {type(image_patches)}")
 
-            return FuyuImagePixelInputs(
-                type="pixel_values",
+            image_patches_flat = flatten_bn(image_patches)
+
+            return FuyuImagePatchInputs(
+                type="image_patches",
                 data=self._validate_pixel_values(
-                    flatten_bn(pixel_values, concat=True)),
+                    flatten_bn(image_patches_flat, concat=True)),
+                patches_per_image=[x.size(0) for x in image_patches_flat],
             )
 
         return None
 
     def _process_image_input(
-            self, image_input: FuyuImagePixelInputs) -> torch.Tensor:
+            self, image_input: FuyuImagePatchInputs) -> NestedTensors:
+        image_patches = image_input["data"]
+        patches_per_image = image_input["patches_per_image"]
 
         assert self.vision_embed_tokens is not None
-        vision_embeddings, _ = self.vision_embed_tokens(image_input["data"])
-        return vision_embeddings
+        vision_embeddings, _ = self.vision_embed_tokens(image_patches)
+        return vision_embeddings.split(patches_per_image, dim=0)
 
     def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
         image_input = self._parse_and_validate_image_input(**kwargs)
diff --git a/vllm/model_executor/models/idefics2_vision_model.py b/vllm/model_executor/models/idefics2_vision_model.py
index e430a158d869a..4e42a4b6f9e64 100644
--- a/vllm/model_executor/models/idefics2_vision_model.py
+++ b/vllm/model_executor/models/idefics2_vision_model.py
@@ -69,7 +69,8 @@ def forward(self,
                 patch_attention_mask: torch.BoolTensor,
                 tgt_sizes: Optional[torch.IntTensor] = None) -> torch.Tensor:
         batch_size, _, max_im_h, max_im_w = pixel_values.shape
-        patch_embeds = self.patch_embedding(pixel_values)
+        target_dtype = self.patch_embedding.weight.dtype
+        patch_embeds = self.patch_embedding(pixel_values.to(target_dtype))
         embeddings = patch_embeds.flatten(2).transpose(1, 2)
         max_nb_patches_h, max_nb_patches_w = (
             max_im_h // self.patch_size,
@@ -309,7 +310,8 @@ def forward(
         hidden_states = self.embeddings(
             pixel_values=pixel_values,
             patch_attention_mask=patch_attention_mask,
-            tgt_sizes=tgt_sizes)
+            tgt_sizes=tgt_sizes,
+        )
         encoder_outputs = self.encoder(hidden_states)
         last_hidden_state = self.post_layernorm(encoder_outputs)
         return last_hidden_state
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index 1d6ee2a0be72e..34dc7fa31ce6f 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -144,8 +144,8 @@ def _call_hf_processor(
                 # Original output: (1, num_images, C, H, W)
                 # New output: (num_images, C, H, W)
                 assert (isinstance(pixel_values, list)
-                        and len(pixel_values) == 1
-                        and isinstance(pixel_values[0], list)
+                        and len(pixel_values) == 1)
+                assert (isinstance(pixel_values[0], list)
                         and len(pixel_values[0]) == len(images))
 
                 processed_outputs["pixel_values"] = pixel_values[0]
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index a39f2f4124d05..5e70c11363c83 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -528,10 +528,8 @@ def _process_image_pixels(
         stacked_image_features = self._image_pixels_to_features(
             self.vision_tower, stacked_pixel_values)
 
-        return [
-            self.multi_modal_projector(image_features) for image_features in
-            torch.split(stacked_image_features, num_patches_per_batch)
-        ]
+        return torch.split(self.multi_modal_projector(stacked_image_features),
+                           num_patches_per_batch)
 
     def _process_image_input(
         self,
diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py
index 22d29f5bbc50c..2bce13792a88d 100644
--- a/vllm/model_executor/models/pixtral.py
+++ b/vllm/model_executor/models/pixtral.py
@@ -1,8 +1,8 @@
+import math
 from dataclasses import dataclass, fields
 from functools import cached_property
 from typing import Iterable, List, Mapping, Optional, Set, Tuple, Union
 
-import numpy
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -306,7 +306,7 @@ def _parse_and_validate_image_input(
         images: Optional[Union[List[List[torch.Tensor]], List[torch.Tensor],
                                torch.Tensor]] = None,
         image_tokens: Optional[torch.Tensor] = None,
-    ) -> Optional[List[torch.Tensor]]:
+    ) -> Tuple[Optional[List[torch.Tensor]], Optional[torch.Tensor]]:
         if images is None:
             return None, None
 
@@ -604,11 +604,11 @@ def max_patches_per_side(self) -> int:
         return self.args.image_size // self.args.patch_size
 
     @property
-    def device(self) -> torch.device:
+    def device(self) -> torch.types.Device:
         return next(self.parameters()).device
 
     @property
-    def dtype(self) -> torch.device:
+    def dtype(self) -> torch.dtype:
         return next(self.parameters()).dtype
 
     @property
@@ -741,8 +741,8 @@ def get_pixtral_hf_image_feature_size(hf_config: PixtralVisionConfig,
     ratio = max(image_width / max_width, image_height / max_height)
 
     if ratio > 1:
-        image_width = int(numpy.ceil(image_width / ratio))
-        image_height = int(numpy.ceil(image_height / ratio))
+        image_width = int(math.ceil(image_width / ratio))
+        image_height = int(math.ceil(image_height / ratio))
 
     num_height_tokens, num_width_tokens = _get_pixtral_hf_num_image_tokens(
         (image_height, image_width),
diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py
index e3d43b017f894..de55bc6bcc123 100644
--- a/vllm/model_executor/models/qwen2_audio.py
+++ b/vllm/model_executor/models/qwen2_audio.py
@@ -23,7 +23,6 @@
 from typing import (Iterable, List, Mapping, Optional, Set, Tuple, TypedDict,
                     Union)
 
-import numpy as np
 import torch
 import torch.nn as nn
 from transformers import BatchFeature
@@ -177,16 +176,19 @@ def _get_dummy_mm_inputs(
         mm_counts: Mapping[str, int],
     ) -> ProcessorInputs:
         feature_extractor = self._get_feature_extractor()
+
         sampling_rate = feature_extractor.sampling_rate
         audio_len = feature_extractor.chunk_length * sampling_rate
+        num_audios = mm_counts.get("audio", 0)
 
-        audio_count = mm_counts.get("audio", 0)
-        audio = np.zeros(audio_len)
-        data = {"audio": [audio] * audio_count}
+        mm_data = {
+            "audio":
+            self._get_dummy_audios(length=audio_len, num_audios=num_audios)
+        }
 
         return ProcessorInputs(
-            prompt_text="<|AUDIO|>" * audio_count,
-            mm_data=data,
+            prompt_text="<|AUDIO|>" * num_audios,
+            mm_data=mm_data,
         )
 
 
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index 6181fe3dd13d8..1e485f87bb7a4 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -29,7 +29,6 @@
 import torch.nn as nn
 import torch.nn.functional as F
 from einops import rearrange, repeat
-from PIL import Image
 from transformers import BatchFeature
 from transformers.models.qwen2_vl import (Qwen2VLImageProcessor,
                                           Qwen2VLProcessor)
@@ -882,12 +881,10 @@ def _get_dummy_mm_inputs(
         self,
         mm_counts: Mapping[str, int],
     ) -> ProcessorInputs:
-        num_images = mm_counts.get("image", 0)
         hf_processor = self._get_hf_processor()
-        image_token: str = hf_processor.image_token
         image_processor = _get_image_processor(hf_processor)
 
-        data = {}
+        image_token: str = hf_processor.image_token
         resized_height, resized_width = smart_resize(
             height=9999999,
             width=9999999,
@@ -895,14 +892,18 @@ def _get_dummy_mm_inputs(
             min_pixels=image_processor.min_pixels,
             max_pixels=image_processor.max_pixels,
         )
+        num_images = mm_counts.get("image", 0)
 
-        dummy_image = Image.new("RGB", (resized_width, resized_height),
-                                color=0)
-        data["image"] = [dummy_image] * num_images
+        mm_data = {
+            "image":
+            self._get_dummy_images(width=resized_width,
+                                   height=resized_height,
+                                   num_images=num_images)
+        }
 
         return ProcessorInputs(
             prompt_text=image_token * num_images,
-            mm_data=data,
+            mm_data=mm_data,
         )
 
 
diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py
index 7e853e5b90096..54be7fed3f2be 100644
--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -188,16 +188,19 @@ def _get_dummy_mm_inputs(
         mm_counts: Mapping[str, int],
     ) -> ProcessorInputs:
         feature_extractor = self._get_feature_extractor()
+
         sampling_rate = feature_extractor.sampling_rate
         audio_len = feature_extractor.chunk_length * sampling_rate
+        num_audios = mm_counts.get("audio", 0)
 
-        audio_count = mm_counts.get("audio", 0)
-        audio = np.zeros(audio_len)
-        data = {"audio": [audio] * audio_count}
+        mm_data = {
+            "audio":
+            self._get_dummy_audios(length=audio_len, num_audios=num_audios)
+        }
 
         return ProcessorInputs(
-            prompt_text="<|audio|>" * audio_count,
-            mm_data=data,
+            prompt_text="<|audio|>" * num_audios,
+            mm_data=mm_data,
         )
 
 
diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index 180489166b407..7712c3bcebe20 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -1,15 +1,17 @@
 import pickle
 import re
 from abc import ABC, abstractmethod
+from collections import defaultdict
 from collections.abc import Callable, ItemsView, Iterable, Mapping, Sequence
 from dataclasses import dataclass, field
 from functools import lru_cache
 from typing import Any, NamedTuple, Optional, Protocol, TypeVar, Union
 
 import numpy as np
+import numpy.typing as npt
 import torch
 from blake3 import blake3
-from PIL.Image import Image
+from PIL import Image
 from transformers import BatchFeature, ProcessorMixin
 
 from vllm.inputs import DummyData, InputProcessingContext
@@ -353,13 +355,13 @@ def _replace_matches(
 ) -> list[_S]:
     out_seqs = list[_S]()
     prev_end_idx = 0
-    next_idx_by_modality = {modality: 0 for modality in mm_item_counts}
+    next_idx_by_modality = defaultdict[str, int](lambda: 0)
 
     for match in _resolve_matches(prompt, matches):
         modality = match.modality
 
         item_idx = next_idx_by_modality[modality]
-        if item_idx >= mm_item_counts[modality]:
+        if item_idx >= mm_item_counts.get(modality, 0):
             continue
 
         start_idx = match.start_idx
@@ -513,7 +515,7 @@ def _serialize_item(self, obj: object) -> bytes:
             return obj.encode("utf-8")
         if isinstance(obj, bytes):
             return obj
-        if isinstance(obj, Image):
+        if isinstance(obj, Image.Image):
             return obj.tobytes()
 
         # Convertible to NumPy arrays
@@ -673,10 +675,14 @@ def _get_prompt_replacements(
         Given the original multi-modal items for this modality
         and HF-processed data, output the replacements to perform.
 
-        Note:
-            Even when the HF processor already performs replacement for us,
-            we still use this replacement information to determine
-            the placeholder token positions for each multi-modal item.
+        Notes:
+            - You should not assume that HF processor always performs prompt
+              replacement: in :meth:`_apply_hf_processor_missing`, this method
+              is called on text-only and multimodal-only inputs separately,
+              instead of passing them in the same call.
+            - The replacement information returned by this method is also used
+              to determine the placeholder token positions for each multi-modal
+              item.
         """
         raise NotImplementedError
 
@@ -710,6 +716,10 @@ def _call_hf_processor(
         mm_data: Mapping[str, object],
         mm_kwargs: Mapping[str, object],
     ) -> BatchFeature:
+        """
+        Call the HF processor on the prompt text and
+        associated multi-modal data.
+        """
         return self.ctx.call_hf_processor(
             self._get_hf_processor(**mm_kwargs),
             dict(text=prompt, **mm_data),
@@ -723,7 +733,8 @@ def _apply_hf_processor(
         hf_processor_mm_kwargs: Mapping[str, object],
     ) -> tuple[list[int], MultiModalKwargs]:
         """
-        Apply the HF processor on the full prompt text and multi-modal data.
+        Wrapper of :meth:`_call_hf_processor` that applies
+        additional pre-processing and post-processing.
         """
         processor_data, passthrough_data = self._get_hf_mm_data(mm_items)
 
@@ -754,10 +765,11 @@ def _apply_hf_processor_missing(
         Apply the HF processor on the full prompt text, but only on the
         multi-modal data that are missing from the cache.
 
-        Note: We pass prompt text and multi-modal data into the HF processor
-        in separate calls to avoid HF prompt replacement being done for
-        cached items; instead, we rely on our own prompt replacement logic
-        for the full text.
+        Note:
+            We pass prompt text and multi-modal data into the HF processor
+            in separate calls to avoid HF prompt replacement being done for
+            cached items; instead, we rely on our own prompt replacement logic
+            (:meth:`_get_prompt_replacements`) for the full text.
         """
         mm_missing_counts = mm_missing_data_items.get_all_counts()
 
@@ -1010,6 +1022,36 @@ def apply(
             mm_placeholders=mm_placeholders,
         )
 
+    def _get_dummy_audios(
+        self,
+        *,
+        length: int,
+        num_audios: int,
+    ) -> list[npt.NDArray]:
+        audio = np.zeros((length, ))
+        return [audio] * num_audios
+
+    def _get_dummy_images(
+        self,
+        *,
+        width: int,
+        height: int,
+        num_images: int,
+    ) -> list[Image.Image]:
+        image = Image.new("RGB", (width, height), color=0)
+        return [image] * num_images
+
+    def _get_dummy_videos(
+        self,
+        *,
+        width: int,
+        height: int,
+        num_frames: int,
+        num_videos: int,
+    ) -> list[npt.NDArray]:
+        video = np.zeros((num_frames, width, height, 3))
+        return [video] * num_videos
+
     @abstractmethod
     def _get_dummy_mm_inputs(
         self,
diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py
index 87b12a6fb33c1..7b6ded6a27084 100644
--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -400,15 +400,19 @@ def repeat_and_pad_placeholder_tokens(
     placeholder_token_idx = 0
     for i, token in enumerate(prompt_token_ids):
         if token == placeholder_token_id:
+            curr_repeat_count = repeat_count[placeholder_token_idx]
             replacement_ids = repeat_and_pad_token(
                 placeholder_token_id,
-                repeat_count=repeat_count[placeholder_token_idx],
+                repeat_count=curr_repeat_count,
                 pad_token_left=pad_token_left,
                 pad_token_right=pad_token_right,
             )
+            offset = len(new_token_ids)
+            if pad_token_left is not None:
+                offset += 1
             placeholder_ranges.append({
-                "offset": len(new_token_ids),
-                "length": len(replacement_ids)
+                "offset": offset,
+                "length": curr_repeat_count,
             })
             new_token_ids.extend(replacement_ids)
             placeholder_token_idx += 1
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 509771b7e2e5a..a08a86d4007dc 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -647,10 +647,23 @@ def profile_run(self) -> None:
                 self.mm_registry.get_max_tokens_per_item_by_modality(
                     self.model_config).values())
 
-            max_num_mm_items = min(
+            max_num_mm_items_encoder_budget = min(
                 self.max_num_encoder_input_tokens,
                 self.encoder_cache_size) // max_tokens_per_mm_item
 
+            max_mm_items_per_req = max(
+                self.mm_registry.get_mm_limits_per_prompt(
+                    self.model_config).values())
+
+            # NOTE: We do not consider max_num_batched_tokens on purpose
+            # because the multimodal embeddings can be generated in advance
+            # and chunked prefilled.
+            max_num_mm_items_decoder_budget = self.max_num_reqs * \
+                max_mm_items_per_req
+
+            max_num_mm_items = min(max_num_mm_items_encoder_budget,
+                                   max_num_mm_items_decoder_budget)
+
             # Dummy data definition in V0 may contain multiple multimodal items
             # (e.g, multiple images) for a single request, therefore here we
             # always replicate first item by max_num_mm_items times since in V1

From 0c6f9985547d6b510d34c6c873db54abe03eb346 Mon Sep 17 00:00:00 2001
From: Yihua Cheng <yihua98@uchicago.edu>
Date: Tue, 31 Dec 2024 18:10:55 -0600
Subject: [PATCH 040/462] [Benchmark] Add benchmark script for CPU offloading 
 (#11533)

Signed-off-by: ApostaC <yihua98@uchicago.edu>
Co-authored-by: KuntaiDu <kuntai@uchicago.edu>
---
 .../benchmark_long_document_qa_throughput.py  | 184 ++++++++++++++++++
 1 file changed, 184 insertions(+)
 create mode 100644 benchmarks/benchmark_long_document_qa_throughput.py

diff --git a/benchmarks/benchmark_long_document_qa_throughput.py b/benchmarks/benchmark_long_document_qa_throughput.py
new file mode 100644
index 0000000000000..13477ef535e86
--- /dev/null
+++ b/benchmarks/benchmark_long_document_qa_throughput.py
@@ -0,0 +1,184 @@
+"""
+Offline benchmark to test the long document QA throughput.
+
+Example usage:
+    # This command run the vllm with 50GB CPU memory for offloading
+    # The workload samples 8 different prompts with a default input
+    # length of 20000 tokens, then replicates each prompt 2 times 
+    # in random order.
+    python benchmark_long_document_qa_throughput.py \
+        --model meta-llama/Llama-2-7b-chat-hf \
+        --enable-prefix-caching \
+        --num-documents 8 \
+        --repeat-count 2 
+
+Commandline arguments:
+    --num-documents: The number of documents to sample prompts from.
+
+    --document-length: The length of each document in tokens. 
+                       (Optional, default: 20000)
+
+    --output-len: The number of tokens to generate for each prompt.
+                  (Optional, default: 10)
+
+    --repeat-count: The number of times to repeat each prompt.
+                    (Optional, default: 2)
+
+    --repeat-mode: The mode to repeat prompts. The supported modes are:
+        - 'random': shuffle the prompts randomly. (Default)
+        - 'tile': the entire prompt list is repeated in sequence. (Potentially
+                  lowest cache hit)
+        - 'interleave': each prompt is repeated consecutively before 
+                        moving to the next element. (Highest cache hit)
+    
+    --shuffle-seed: Random seed when the repeat mode is "random".
+                    (Optional, default: 0)
+
+In the meantime, it also supports all the vLLM engine args to initialize the 
+LLM engine. You can refer to the `vllm.engine.arg_utils.EngineArgs` for more
+details.
+"""
+
+import dataclasses
+import random
+import time
+
+from vllm import LLM, SamplingParams
+from vllm.engine.arg_utils import EngineArgs
+from vllm.utils import FlexibleArgumentParser
+
+
+def test_long_document_qa(llm=None, sampling_params=None, prompts=None):
+    """
+    Test long document QA with the given prompts and sampling parameters.
+    Print the time spent in processing all the prompts.
+
+    Args:
+        llm: The language model used for generating responses.
+        sampling_params: Sampling parameter used to generate the response.
+        prompts: A list of prompt strings to be processed by the LLM.
+    """
+    start_time = time.time()
+    llm.generate(prompts, sampling_params=sampling_params)
+    end_time = time.time()
+    print(f"Time to execute all requests: {end_time - start_time:.4f} secs")
+
+
+def repeat_prompts(prompts, repeat_count, mode: str):
+    """
+    Repeat each prompt in the list for a specified number of times.
+    The order of prompts in the output list depends on the mode.
+
+    Args:
+        prompts: A list of prompts to be repeated.
+        repeat_count: The number of times each prompt is repeated.
+        mode: The mode of repetition. Supported modes are:
+            - 'random': Shuffle the prompts randomly after repetition.
+            - 'tile': Repeat the entire prompt list in sequence.
+              Example: [1, 2, 3] -> [1, 2, 3, 1, 2, 3].
+            - 'interleave': Repeat each prompt consecutively before moving to 
+              the next. Example: [1, 2, 3] -> [1, 1, 2, 2, 3, 3].
+
+    Returns:
+        A list of repeated prompts in the specified order.
+
+    Raises:
+        ValueError: If an invalid mode is provided.
+    """
+    print("Repeat mode: ", mode)
+    if mode == 'random':
+        repeated_prompts = prompts * repeat_count
+        random.shuffle(repeated_prompts)
+        return repeated_prompts
+    elif mode == 'tile':
+        return prompts * repeat_count
+    elif mode == 'interleave':
+        repeated_prompts = []
+        for prompt in prompts:
+            repeated_prompts.extend([prompt] * repeat_count)
+        return repeated_prompts
+    else:
+        raise ValueError(f"Invalid mode: {mode}, only support "
+                         "'random', 'tile', 'interleave'")
+
+
+def main(args):
+    random.seed(args.shuffle_seed)
+
+    # Prepare the prompts:
+    # we append the document id at the beginning to avoid any of the document
+    # being the prefix of other documents
+    prompts = [
+        str(i) + ' '.join(['hi'] * args.document_length)
+        for i in range(args.num_documents)
+    ]
+
+    prompts = repeat_prompts(prompts, args.repeat_count, mode=args.repeat_mode)
+
+    warmup_prompts = [
+        "This is warm up request " + str(i) + \
+                ' '.join(['hi'] * args.document_length)
+        for i in range(args.num_documents)]
+
+    # Create the LLM engine
+    engine_args = EngineArgs.from_cli_args(args)
+    llm = LLM(**dataclasses.asdict(engine_args))
+    sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
+
+    print("------warm up------")
+    test_long_document_qa(
+        llm=llm,
+        prompts=warmup_prompts,
+        sampling_params=sampling_params,
+    )
+
+    print("------start generating------")
+    test_long_document_qa(
+        llm=llm,
+        prompts=prompts,
+        sampling_params=sampling_params,
+    )
+
+
+if __name__ == "__main__":
+    parser = FlexibleArgumentParser(
+        description=
+        'Benchmark the performance with or without automatic prefix caching.')
+
+    parser.add_argument(
+        '--document-length',
+        type=int,
+        # Roughly the number of tokens for a system paper,
+        # excluding images
+        default=20000,
+        help='Range of input lengths for sampling prompts,'
+        'specified as "min:max" (e.g., "128:256").')
+
+    parser.add_argument('--num-documents',
+                        type=int,
+                        default=8,
+                        help='Range of input lengths for sampling prompts,'
+                        'specified as "min:max" (e.g., "128:256").')
+
+    parser.add_argument('--output-len', type=int, default=10)
+
+    parser.add_argument('--repeat-count',
+                        type=int,
+                        default=2,
+                        help='Number of times to repeat each prompt')
+
+    parser.add_argument("--repeat-mode",
+                        type=str,
+                        default='random',
+                        help='The mode to repeat prompts. The supported '
+                        'modes are "random", "tile", and "interleave". '
+                        'See repeat_prompts() in the source code for details.')
+
+    parser.add_argument("--shuffle-seed",
+                        type=int,
+                        default=0,
+                        help='Random seed when the repeat mode is "random"')
+
+    parser = EngineArgs.add_cli_args(parser)
+    args = parser.parse_args()
+    main(args)

From 4db72e57f6e8da5e78285e9868e9327167bea973 Mon Sep 17 00:00:00 2001
From: Joe Runde <Joseph.Runde@ibm.com>
Date: Tue, 31 Dec 2024 18:21:51 -0800
Subject: [PATCH 041/462] [Bugfix][Refactor] Unify model management in frontend
 (#11660)

Signed-off-by: Joe Runde <Joseph.Runde@ibm.com>
---
 tests/entrypoints/openai/test_cli_args.py     |   2 +-
 tests/entrypoints/openai/test_lora_lineage.py |  32 ++-
 tests/entrypoints/openai/test_serving_chat.py |  20 +-
 ...rving_engine.py => test_serving_models.py} |  66 +++---
 vllm/entrypoints/openai/api_server.py         |  62 +++---
 vllm/entrypoints/openai/cli_args.py           |   2 +-
 vllm/entrypoints/openai/run_batch.py          |  15 +-
 vllm/entrypoints/openai/serving_chat.py       |  16 +-
 vllm/entrypoints/openai/serving_completion.py |  16 +-
 vllm/entrypoints/openai/serving_embedding.py  |   9 +-
 vllm/entrypoints/openai/serving_engine.py     | 192 ++--------------
 vllm/entrypoints/openai/serving_models.py     | 210 ++++++++++++++++++
 vllm/entrypoints/openai/serving_pooling.py    |   9 +-
 vllm/entrypoints/openai/serving_score.py      |   9 +-
 .../openai/serving_tokenization.py            |  12 +-
 15 files changed, 365 insertions(+), 307 deletions(-)
 rename tests/entrypoints/openai/{test_serving_engine.py => test_serving_models.py} (61%)
 create mode 100644 vllm/entrypoints/openai/serving_models.py

diff --git a/tests/entrypoints/openai/test_cli_args.py b/tests/entrypoints/openai/test_cli_args.py
index 45e6980a94630..e49562ad6a21f 100644
--- a/tests/entrypoints/openai/test_cli_args.py
+++ b/tests/entrypoints/openai/test_cli_args.py
@@ -4,7 +4,7 @@
 
 from vllm.entrypoints.openai.cli_args import (make_arg_parser,
                                               validate_parsed_serve_args)
-from vllm.entrypoints.openai.serving_engine import LoRAModulePath
+from vllm.entrypoints.openai.serving_models import LoRAModulePath
 from vllm.utils import FlexibleArgumentParser
 
 from ...utils import VLLM_PATH
diff --git a/tests/entrypoints/openai/test_lora_lineage.py b/tests/entrypoints/openai/test_lora_lineage.py
index ab39684c2f31a..ce4f85c13fff9 100644
--- a/tests/entrypoints/openai/test_lora_lineage.py
+++ b/tests/entrypoints/openai/test_lora_lineage.py
@@ -55,7 +55,10 @@ def server_with_lora_modules_json(zephyr_lora_files):
         "64",
     ]
 
-    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+    # Enable the /v1/load_lora_adapter endpoint
+    envs = {"VLLM_ALLOW_RUNTIME_LORA_UPDATING": "True"}
+
+    with RemoteOpenAIServer(MODEL_NAME, args, env_dict=envs) as remote_server:
         yield remote_server
 
 
@@ -67,8 +70,8 @@ async def client_for_lora_lineage(server_with_lora_modules_json):
 
 
 @pytest.mark.asyncio
-async def test_check_lora_lineage(client_for_lora_lineage: openai.AsyncOpenAI,
-                                  zephyr_lora_files):
+async def test_static_lora_lineage(client_for_lora_lineage: openai.AsyncOpenAI,
+                                   zephyr_lora_files):
     models = await client_for_lora_lineage.models.list()
     models = models.data
     served_model = models[0]
@@ -81,3 +84,26 @@ async def test_check_lora_lineage(client_for_lora_lineage: openai.AsyncOpenAI,
     assert all(lora_model.parent == MODEL_NAME for lora_model in lora_models)
     assert lora_models[0].id == "zephyr-lora"
     assert lora_models[1].id == "zephyr-lora2"
+
+
+@pytest.mark.asyncio
+async def test_dynamic_lora_lineage(
+        client_for_lora_lineage: openai.AsyncOpenAI, zephyr_lora_files):
+
+    response = await client_for_lora_lineage.post("load_lora_adapter",
+                                                  cast_to=str,
+                                                  body={
+                                                      "lora_name":
+                                                      "zephyr-lora-3",
+                                                      "lora_path":
+                                                      zephyr_lora_files
+                                                  })
+    # Ensure adapter loads before querying /models
+    assert "success" in response
+
+    models = await client_for_lora_lineage.models.list()
+    models = models.data
+    dynamic_lora_model = models[-1]
+    assert dynamic_lora_model.root == zephyr_lora_files
+    assert dynamic_lora_model.parent == MODEL_NAME
+    assert dynamic_lora_model.id == "zephyr-lora-3"
diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py
index 61677b65af342..97248f1150979 100644
--- a/tests/entrypoints/openai/test_serving_chat.py
+++ b/tests/entrypoints/openai/test_serving_chat.py
@@ -8,7 +8,8 @@
 from vllm.engine.multiprocessing.client import MQLLMEngineClient
 from vllm.entrypoints.openai.protocol import ChatCompletionRequest
 from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
-from vllm.entrypoints.openai.serving_engine import BaseModelPath
+from vllm.entrypoints.openai.serving_models import (BaseModelPath,
+                                                    OpenAIServingModels)
 from vllm.transformers_utils.tokenizer import get_tokenizer
 
 MODEL_NAME = "openai-community/gpt2"
@@ -50,14 +51,13 @@ async def _async_serving_chat_init():
     engine = MockEngine()
     model_config = await engine.get_model_config()
 
+    models = OpenAIServingModels(model_config, BASE_MODEL_PATHS)
     serving_completion = OpenAIServingChat(engine,
                                            model_config,
-                                           BASE_MODEL_PATHS,
+                                           models,
                                            response_role="assistant",
                                            chat_template=CHAT_TEMPLATE,
                                            chat_template_content_format="auto",
-                                           lora_modules=None,
-                                           prompt_adapters=None,
                                            request_logger=None)
     return serving_completion
 
@@ -72,14 +72,14 @@ def test_serving_chat_should_set_correct_max_tokens():
     mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
     mock_engine.errored = False
 
+    models = OpenAIServingModels(base_model_paths=BASE_MODEL_PATHS,
+                                 model_config=MockModelConfig())
     serving_chat = OpenAIServingChat(mock_engine,
                                      MockModelConfig(),
-                                     BASE_MODEL_PATHS,
+                                     models,
                                      response_role="assistant",
                                      chat_template=CHAT_TEMPLATE,
                                      chat_template_content_format="auto",
-                                     lora_modules=None,
-                                     prompt_adapters=None,
                                      request_logger=None)
     req = ChatCompletionRequest(
         model=MODEL_NAME,
@@ -115,14 +115,14 @@ def test_serving_chat_could_load_correct_generation_config():
     mock_engine.errored = False
 
     # Initialize the serving chat
+    models = OpenAIServingModels(base_model_paths=BASE_MODEL_PATHS,
+                                 model_config=mock_model_config)
     serving_chat = OpenAIServingChat(mock_engine,
                                      mock_model_config,
-                                     BASE_MODEL_PATHS,
+                                     models,
                                      response_role="assistant",
                                      chat_template=CHAT_TEMPLATE,
                                      chat_template_content_format="auto",
-                                     lora_modules=None,
-                                     prompt_adapters=None,
                                      request_logger=None)
     req = ChatCompletionRequest(
         model=MODEL_NAME,
diff --git a/tests/entrypoints/openai/test_serving_engine.py b/tests/entrypoints/openai/test_serving_models.py
similarity index 61%
rename from tests/entrypoints/openai/test_serving_engine.py
rename to tests/entrypoints/openai/test_serving_models.py
index 096ab6fa0ac09..96897dc730da2 100644
--- a/tests/entrypoints/openai/test_serving_engine.py
+++ b/tests/entrypoints/openai/test_serving_models.py
@@ -4,11 +4,11 @@
 import pytest
 
 from vllm.config import ModelConfig
-from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.openai.protocol import (ErrorResponse,
                                               LoadLoraAdapterRequest,
                                               UnloadLoraAdapterRequest)
-from vllm.entrypoints.openai.serving_engine import BaseModelPath, OpenAIServing
+from vllm.entrypoints.openai.serving_models import (BaseModelPath,
+                                                    OpenAIServingModels)
 from vllm.lora.request import LoRARequest
 
 MODEL_NAME = "meta-llama/Llama-2-7b"
@@ -19,47 +19,45 @@
     "Success: LoRA adapter '{lora_name}' removed successfully.")
 
 
-async def _async_serving_engine_init():
-    mock_engine_client = MagicMock(spec=EngineClient)
+async def _async_serving_models_init() -> OpenAIServingModels:
     mock_model_config = MagicMock(spec=ModelConfig)
     # Set the max_model_len attribute to avoid missing attribute
     mock_model_config.max_model_len = 2048
 
-    serving_engine = OpenAIServing(mock_engine_client,
-                                   mock_model_config,
-                                   BASE_MODEL_PATHS,
-                                   lora_modules=None,
-                                   prompt_adapters=None,
-                                   request_logger=None)
-    return serving_engine
+    serving_models = OpenAIServingModels(base_model_paths=BASE_MODEL_PATHS,
+                                         model_config=mock_model_config,
+                                         lora_modules=None,
+                                         prompt_adapters=None)
+
+    return serving_models
 
 
 @pytest.mark.asyncio
 async def test_serving_model_name():
-    serving_engine = await _async_serving_engine_init()
-    assert serving_engine._get_model_name(None) == MODEL_NAME
+    serving_models = await _async_serving_models_init()
+    assert serving_models.model_name(None) == MODEL_NAME
     request = LoRARequest(lora_name="adapter",
                           lora_path="/path/to/adapter2",
                           lora_int_id=1)
-    assert serving_engine._get_model_name(request) == request.lora_name
+    assert serving_models.model_name(request) == request.lora_name
 
 
 @pytest.mark.asyncio
 async def test_load_lora_adapter_success():
-    serving_engine = await _async_serving_engine_init()
+    serving_models = await _async_serving_models_init()
     request = LoadLoraAdapterRequest(lora_name="adapter",
                                      lora_path="/path/to/adapter2")
-    response = await serving_engine.load_lora_adapter(request)
+    response = await serving_models.load_lora_adapter(request)
     assert response == LORA_LOADING_SUCCESS_MESSAGE.format(lora_name='adapter')
-    assert len(serving_engine.lora_requests) == 1
-    assert serving_engine.lora_requests[0].lora_name == "adapter"
+    assert len(serving_models.lora_requests) == 1
+    assert serving_models.lora_requests[0].lora_name == "adapter"
 
 
 @pytest.mark.asyncio
 async def test_load_lora_adapter_missing_fields():
-    serving_engine = await _async_serving_engine_init()
+    serving_models = await _async_serving_models_init()
     request = LoadLoraAdapterRequest(lora_name="", lora_path="")
-    response = await serving_engine.load_lora_adapter(request)
+    response = await serving_models.load_lora_adapter(request)
     assert isinstance(response, ErrorResponse)
     assert response.type == "InvalidUserInput"
     assert response.code == HTTPStatus.BAD_REQUEST
@@ -67,43 +65,43 @@ async def test_load_lora_adapter_missing_fields():
 
 @pytest.mark.asyncio
 async def test_load_lora_adapter_duplicate():
-    serving_engine = await _async_serving_engine_init()
+    serving_models = await _async_serving_models_init()
     request = LoadLoraAdapterRequest(lora_name="adapter1",
                                      lora_path="/path/to/adapter1")
-    response = await serving_engine.load_lora_adapter(request)
+    response = await serving_models.load_lora_adapter(request)
     assert response == LORA_LOADING_SUCCESS_MESSAGE.format(
         lora_name='adapter1')
-    assert len(serving_engine.lora_requests) == 1
+    assert len(serving_models.lora_requests) == 1
 
     request = LoadLoraAdapterRequest(lora_name="adapter1",
                                      lora_path="/path/to/adapter1")
-    response = await serving_engine.load_lora_adapter(request)
+    response = await serving_models.load_lora_adapter(request)
     assert isinstance(response, ErrorResponse)
     assert response.type == "InvalidUserInput"
     assert response.code == HTTPStatus.BAD_REQUEST
-    assert len(serving_engine.lora_requests) == 1
+    assert len(serving_models.lora_requests) == 1
 
 
 @pytest.mark.asyncio
 async def test_unload_lora_adapter_success():
-    serving_engine = await _async_serving_engine_init()
+    serving_models = await _async_serving_models_init()
     request = LoadLoraAdapterRequest(lora_name="adapter1",
                                      lora_path="/path/to/adapter1")
-    response = await serving_engine.load_lora_adapter(request)
-    assert len(serving_engine.lora_requests) == 1
+    response = await serving_models.load_lora_adapter(request)
+    assert len(serving_models.lora_requests) == 1
 
     request = UnloadLoraAdapterRequest(lora_name="adapter1")
-    response = await serving_engine.unload_lora_adapter(request)
+    response = await serving_models.unload_lora_adapter(request)
     assert response == LORA_UNLOADING_SUCCESS_MESSAGE.format(
         lora_name='adapter1')
-    assert len(serving_engine.lora_requests) == 0
+    assert len(serving_models.lora_requests) == 0
 
 
 @pytest.mark.asyncio
 async def test_unload_lora_adapter_missing_fields():
-    serving_engine = await _async_serving_engine_init()
+    serving_models = await _async_serving_models_init()
     request = UnloadLoraAdapterRequest(lora_name="", lora_int_id=None)
-    response = await serving_engine.unload_lora_adapter(request)
+    response = await serving_models.unload_lora_adapter(request)
     assert isinstance(response, ErrorResponse)
     assert response.type == "InvalidUserInput"
     assert response.code == HTTPStatus.BAD_REQUEST
@@ -111,9 +109,9 @@ async def test_unload_lora_adapter_missing_fields():
 
 @pytest.mark.asyncio
 async def test_unload_lora_adapter_not_found():
-    serving_engine = await _async_serving_engine_init()
+    serving_models = await _async_serving_models_init()
     request = UnloadLoraAdapterRequest(lora_name="nonexistent_adapter")
-    response = await serving_engine.unload_lora_adapter(request)
+    response = await serving_models.unload_lora_adapter(request)
     assert isinstance(response, ErrorResponse)
     assert response.type == "InvalidUserInput"
     assert response.code == HTTPStatus.BAD_REQUEST
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index bac72d87376da..74fe378fdae42 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -58,7 +58,9 @@
 from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
 from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
 from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding
-from vllm.entrypoints.openai.serving_engine import BaseModelPath, OpenAIServing
+from vllm.entrypoints.openai.serving_engine import OpenAIServing
+from vllm.entrypoints.openai.serving_models import (BaseModelPath,
+                                                    OpenAIServingModels)
 from vllm.entrypoints.openai.serving_pooling import OpenAIServingPooling
 from vllm.entrypoints.openai.serving_score import OpenAIServingScores
 from vllm.entrypoints.openai.serving_tokenization import (
@@ -269,6 +271,10 @@ def base(request: Request) -> OpenAIServing:
     return tokenization(request)
 
 
+def models(request: Request) -> OpenAIServingModels:
+    return request.app.state.openai_serving_models
+
+
 def chat(request: Request) -> Optional[OpenAIServingChat]:
     return request.app.state.openai_serving_chat
 
@@ -336,10 +342,10 @@ async def detokenize(request: DetokenizeRequest, raw_request: Request):
 
 @router.get("/v1/models")
 async def show_available_models(raw_request: Request):
-    handler = base(raw_request)
+    handler = models(raw_request)
 
-    models = await handler.show_available_models()
-    return JSONResponse(content=models.model_dump())
+    models_ = await handler.show_available_models()
+    return JSONResponse(content=models_.model_dump())
 
 
 @router.get("/version")
@@ -505,26 +511,22 @@ async def stop_profile(raw_request: Request):
     @router.post("/v1/load_lora_adapter")
     async def load_lora_adapter(request: LoadLoraAdapterRequest,
                                 raw_request: Request):
-        for route in [chat, completion, embedding]:
-            handler = route(raw_request)
-            if handler is not None:
-                response = await handler.load_lora_adapter(request)
-                if isinstance(response, ErrorResponse):
-                    return JSONResponse(content=response.model_dump(),
-                                        status_code=response.code)
+        handler = models(raw_request)
+        response = await handler.load_lora_adapter(request)
+        if isinstance(response, ErrorResponse):
+            return JSONResponse(content=response.model_dump(),
+                                status_code=response.code)
 
         return Response(status_code=200, content=response)
 
     @router.post("/v1/unload_lora_adapter")
     async def unload_lora_adapter(request: UnloadLoraAdapterRequest,
                                   raw_request: Request):
-        for route in [chat, completion, embedding]:
-            handler = route(raw_request)
-            if handler is not None:
-                response = await handler.unload_lora_adapter(request)
-                if isinstance(response, ErrorResponse):
-                    return JSONResponse(content=response.model_dump(),
-                                        status_code=response.code)
+        handler = models(raw_request)
+        response = await handler.unload_lora_adapter(request)
+        if isinstance(response, ErrorResponse):
+            return JSONResponse(content=response.model_dump(),
+                                status_code=response.code)
 
         return Response(status_code=200, content=response)
 
@@ -628,13 +630,18 @@ def init_app_state(
     resolved_chat_template = load_chat_template(args.chat_template)
     logger.info("Using supplied chat template:\n%s", resolved_chat_template)
 
+    state.openai_serving_models = OpenAIServingModels(
+        model_config=model_config,
+        base_model_paths=base_model_paths,
+        lora_modules=args.lora_modules,
+        prompt_adapters=args.prompt_adapters,
+    )
+    # TODO: The chat template is now broken for lora adapters :(
     state.openai_serving_chat = OpenAIServingChat(
         engine_client,
         model_config,
-        base_model_paths,
+        state.openai_serving_models,
         args.response_role,
-        lora_modules=args.lora_modules,
-        prompt_adapters=args.prompt_adapters,
         request_logger=request_logger,
         chat_template=resolved_chat_template,
         chat_template_content_format=args.chat_template_content_format,
@@ -646,16 +653,14 @@ def init_app_state(
     state.openai_serving_completion = OpenAIServingCompletion(
         engine_client,
         model_config,
-        base_model_paths,
-        lora_modules=args.lora_modules,
-        prompt_adapters=args.prompt_adapters,
+        state.openai_serving_models,
         request_logger=request_logger,
         return_tokens_as_token_ids=args.return_tokens_as_token_ids,
     ) if model_config.runner_type == "generate" else None
     state.openai_serving_pooling = OpenAIServingPooling(
         engine_client,
         model_config,
-        base_model_paths,
+        state.openai_serving_models,
         request_logger=request_logger,
         chat_template=resolved_chat_template,
         chat_template_content_format=args.chat_template_content_format,
@@ -663,7 +668,7 @@ def init_app_state(
     state.openai_serving_embedding = OpenAIServingEmbedding(
         engine_client,
         model_config,
-        base_model_paths,
+        state.openai_serving_models,
         request_logger=request_logger,
         chat_template=resolved_chat_template,
         chat_template_content_format=args.chat_template_content_format,
@@ -671,14 +676,13 @@ def init_app_state(
     state.openai_serving_scores = OpenAIServingScores(
         engine_client,
         model_config,
-        base_model_paths,
+        state.openai_serving_models,
         request_logger=request_logger
     ) if model_config.task == "score" else None
     state.openai_serving_tokenization = OpenAIServingTokenization(
         engine_client,
         model_config,
-        base_model_paths,
-        lora_modules=args.lora_modules,
+        state.openai_serving_models,
         request_logger=request_logger,
         chat_template=resolved_chat_template,
         chat_template_content_format=args.chat_template_content_format,
diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py
index 908f8c3532c9e..22206ef8dbfe6 100644
--- a/vllm/entrypoints/openai/cli_args.py
+++ b/vllm/entrypoints/openai/cli_args.py
@@ -12,7 +12,7 @@
 from vllm.engine.arg_utils import AsyncEngineArgs, nullable_str
 from vllm.entrypoints.chat_utils import (ChatTemplateContentFormatOption,
                                          validate_chat_template)
-from vllm.entrypoints.openai.serving_engine import (LoRAModulePath,
+from vllm.entrypoints.openai.serving_models import (LoRAModulePath,
                                                     PromptAdapterPath)
 from vllm.entrypoints.openai.tool_parsers import ToolParserManager
 from vllm.utils import FlexibleArgumentParser
diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py
index 572ed27b39083..822c0f5f7c211 100644
--- a/vllm/entrypoints/openai/run_batch.py
+++ b/vllm/entrypoints/openai/run_batch.py
@@ -20,7 +20,8 @@
 # yapf: enable
 from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
 from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding
-from vllm.entrypoints.openai.serving_engine import BaseModelPath
+from vllm.entrypoints.openai.serving_models import (BaseModelPath,
+                                                    OpenAIServingModels)
 from vllm.usage.usage_lib import UsageContext
 from vllm.utils import FlexibleArgumentParser, random_uuid
 from vllm.version import __version__ as VLLM_VERSION
@@ -213,13 +214,17 @@ async def main(args):
         request_logger = RequestLogger(max_log_len=args.max_log_len)
 
     # Create the openai serving objects.
+    openai_serving_models = OpenAIServingModels(
+        model_config=model_config,
+        base_model_paths=base_model_paths,
+        lora_modules=None,
+        prompt_adapters=None,
+    )
     openai_serving_chat = OpenAIServingChat(
         engine,
         model_config,
-        base_model_paths,
+        openai_serving_models,
         args.response_role,
-        lora_modules=None,
-        prompt_adapters=None,
         request_logger=request_logger,
         chat_template=None,
         chat_template_content_format="auto",
@@ -228,7 +233,7 @@ async def main(args):
     openai_serving_embedding = OpenAIServingEmbedding(
         engine,
         model_config,
-        base_model_paths,
+        openai_serving_models,
         request_logger=request_logger,
         chat_template=None,
         chat_template_content_format="auto",
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index d085333563d19..9ba5eeb7709c9 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -21,10 +21,8 @@
     ChatCompletionStreamResponse, ChatMessage, DeltaFunctionCall, DeltaMessage,
     DeltaToolCall, ErrorResponse, FunctionCall, PromptTokenUsageInfo,
     RequestResponseMetadata, ToolCall, UsageInfo)
-from vllm.entrypoints.openai.serving_engine import (BaseModelPath,
-                                                    LoRAModulePath,
-                                                    OpenAIServing,
-                                                    PromptAdapterPath)
+from vllm.entrypoints.openai.serving_engine import OpenAIServing
+from vllm.entrypoints.openai.serving_models import OpenAIServingModels
 from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager
 from vllm.logger import init_logger
 from vllm.outputs import CompletionOutput, RequestOutput
@@ -42,11 +40,9 @@ def __init__(
         self,
         engine_client: EngineClient,
         model_config: ModelConfig,
-        base_model_paths: List[BaseModelPath],
+        models: OpenAIServingModels,
         response_role: str,
         *,
-        lora_modules: Optional[List[LoRAModulePath]],
-        prompt_adapters: Optional[List[PromptAdapterPath]],
         request_logger: Optional[RequestLogger],
         chat_template: Optional[str],
         chat_template_content_format: ChatTemplateContentFormatOption,
@@ -57,9 +53,7 @@ def __init__(
     ) -> None:
         super().__init__(engine_client=engine_client,
                          model_config=model_config,
-                         base_model_paths=base_model_paths,
-                         lora_modules=lora_modules,
-                         prompt_adapters=prompt_adapters,
+                         models=models,
                          request_logger=request_logger,
                          return_tokens_as_token_ids=return_tokens_as_token_ids)
 
@@ -126,7 +120,7 @@ async def create_chat_completion(
                 prompt_adapter_request,
             ) = self._maybe_get_adapters(request)
 
-            model_name = self._get_model_name(lora_request)
+            model_name = self.models.model_name(lora_request)
 
             tokenizer = await self.engine_client.get_tokenizer(lora_request)
 
diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py
index aaad7b8c7f44c..17197dce8da23 100644
--- a/vllm/entrypoints/openai/serving_completion.py
+++ b/vllm/entrypoints/openai/serving_completion.py
@@ -21,10 +21,8 @@
                                               RequestResponseMetadata,
                                               UsageInfo)
 # yapf: enable
-from vllm.entrypoints.openai.serving_engine import (BaseModelPath,
-                                                    LoRAModulePath,
-                                                    OpenAIServing,
-                                                    PromptAdapterPath)
+from vllm.entrypoints.openai.serving_engine import OpenAIServing
+from vllm.entrypoints.openai.serving_models import OpenAIServingModels
 from vllm.logger import init_logger
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import BeamSearchParams, SamplingParams
@@ -41,18 +39,14 @@ def __init__(
         self,
         engine_client: EngineClient,
         model_config: ModelConfig,
-        base_model_paths: List[BaseModelPath],
+        models: OpenAIServingModels,
         *,
-        lora_modules: Optional[List[LoRAModulePath]],
-        prompt_adapters: Optional[List[PromptAdapterPath]],
         request_logger: Optional[RequestLogger],
         return_tokens_as_token_ids: bool = False,
     ):
         super().__init__(engine_client=engine_client,
                          model_config=model_config,
-                         base_model_paths=base_model_paths,
-                         lora_modules=lora_modules,
-                         prompt_adapters=prompt_adapters,
+                         models=models,
                          request_logger=request_logger,
                          return_tokens_as_token_ids=return_tokens_as_token_ids)
         diff_sampling_param = self.model_config.get_diff_sampling_param()
@@ -170,7 +164,7 @@ async def create_completion(
 
         result_generator = merge_async_iterators(*generators)
 
-        model_name = self._get_model_name(lora_request)
+        model_name = self.models.model_name(lora_request)
         num_prompts = len(engine_prompts)
 
         # Similar to the OpenAI API, when n != best_of, we do not stream the
diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py
index b8fb9d6bd77f2..e7116a3d95d10 100644
--- a/vllm/entrypoints/openai/serving_embedding.py
+++ b/vllm/entrypoints/openai/serving_embedding.py
@@ -16,7 +16,8 @@
                                               EmbeddingResponse,
                                               EmbeddingResponseData,
                                               ErrorResponse, UsageInfo)
-from vllm.entrypoints.openai.serving_engine import BaseModelPath, OpenAIServing
+from vllm.entrypoints.openai.serving_engine import OpenAIServing
+from vllm.entrypoints.openai.serving_models import OpenAIServingModels
 from vllm.logger import init_logger
 from vllm.outputs import (EmbeddingOutput, EmbeddingRequestOutput,
                           PoolingRequestOutput)
@@ -46,7 +47,7 @@ def __init__(
         self,
         engine_client: EngineClient,
         model_config: ModelConfig,
-        base_model_paths: List[BaseModelPath],
+        models: OpenAIServingModels,
         *,
         request_logger: Optional[RequestLogger],
         chat_template: Optional[str],
@@ -54,9 +55,7 @@ def __init__(
     ) -> None:
         super().__init__(engine_client=engine_client,
                          model_config=model_config,
-                         base_model_paths=base_model_paths,
-                         lora_modules=None,
-                         prompt_adapters=None,
+                         models=models,
                          request_logger=request_logger)
 
         self.chat_template = chat_template
diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py
index 5b6a089e4c319..319f869240036 100644
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -1,7 +1,5 @@
 import json
-import pathlib
 from concurrent.futures.thread import ThreadPoolExecutor
-from dataclasses import dataclass
 from http import HTTPStatus
 from typing import (Any, Callable, Dict, Iterable, Iterator, List, Mapping,
                     Optional, Sequence, Tuple, TypedDict, Union)
@@ -28,13 +26,10 @@
                                               DetokenizeRequest,
                                               EmbeddingChatRequest,
                                               EmbeddingCompletionRequest,
-                                              ErrorResponse,
-                                              LoadLoraAdapterRequest,
-                                              ModelCard, ModelList,
-                                              ModelPermission, ScoreRequest,
+                                              ErrorResponse, ScoreRequest,
                                               TokenizeChatRequest,
-                                              TokenizeCompletionRequest,
-                                              UnloadLoraAdapterRequest)
+                                              TokenizeCompletionRequest)
+from vllm.entrypoints.openai.serving_models import OpenAIServingModels
 from vllm.entrypoints.openai.tool_parsers import ToolParser
 # yapf: enable
 from vllm.inputs import TokensPrompt
@@ -48,30 +43,10 @@
 from vllm.tracing import (contains_trace_headers, extract_trace_headers,
                           log_tracing_disabled_warning)
 from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
-from vllm.utils import AtomicCounter, is_list_of, make_async, random_uuid
+from vllm.utils import is_list_of, make_async, random_uuid
 
 logger = init_logger(__name__)
 
-
-@dataclass
-class BaseModelPath:
-    name: str
-    model_path: str
-
-
-@dataclass
-class PromptAdapterPath:
-    name: str
-    local_path: str
-
-
-@dataclass
-class LoRAModulePath:
-    name: str
-    path: str
-    base_model_name: Optional[str] = None
-
-
 CompletionLikeRequest = Union[CompletionRequest, DetokenizeRequest,
                               EmbeddingCompletionRequest, ScoreRequest,
                               TokenizeCompletionRequest]
@@ -96,10 +71,8 @@ def __init__(
         self,
         engine_client: EngineClient,
         model_config: ModelConfig,
-        base_model_paths: List[BaseModelPath],
+        models: OpenAIServingModels,
         *,
-        lora_modules: Optional[List[LoRAModulePath]],
-        prompt_adapters: Optional[List[PromptAdapterPath]],
         request_logger: Optional[RequestLogger],
         return_tokens_as_token_ids: bool = False,
     ):
@@ -109,35 +82,7 @@ def __init__(
         self.model_config = model_config
         self.max_model_len = model_config.max_model_len
 
-        self.base_model_paths = base_model_paths
-
-        self.lora_id_counter = AtomicCounter(0)
-        self.lora_requests = []
-        if lora_modules is not None:
-            self.lora_requests = [
-                LoRARequest(lora_name=lora.name,
-                            lora_int_id=i,
-                            lora_path=lora.path,
-                            base_model_name=lora.base_model_name
-                            if lora.base_model_name
-                            and self._is_model_supported(lora.base_model_name)
-                            else self.base_model_paths[0].name)
-                for i, lora in enumerate(lora_modules, start=1)
-            ]
-
-        self.prompt_adapter_requests = []
-        if prompt_adapters is not None:
-            for i, prompt_adapter in enumerate(prompt_adapters, start=1):
-                with pathlib.Path(prompt_adapter.local_path,
-                                  "adapter_config.json").open() as f:
-                    adapter_config = json.load(f)
-                    num_virtual_tokens = adapter_config["num_virtual_tokens"]
-                self.prompt_adapter_requests.append(
-                    PromptAdapterRequest(
-                        prompt_adapter_name=prompt_adapter.name,
-                        prompt_adapter_id=i,
-                        prompt_adapter_local_path=prompt_adapter.local_path,
-                        prompt_adapter_num_virtual_tokens=num_virtual_tokens))
+        self.models = models
 
         self.request_logger = request_logger
         self.return_tokens_as_token_ids = return_tokens_as_token_ids
@@ -150,33 +95,6 @@ def __init__(
             self._tokenize_prompt_input_or_inputs,
             executor=self._tokenizer_executor)
 
-    async def show_available_models(self) -> ModelList:
-        """Show available models. Right now we only have one model."""
-        model_cards = [
-            ModelCard(id=base_model.name,
-                      max_model_len=self.max_model_len,
-                      root=base_model.model_path,
-                      permission=[ModelPermission()])
-            for base_model in self.base_model_paths
-        ]
-        lora_cards = [
-            ModelCard(id=lora.lora_name,
-                      root=lora.local_path,
-                      parent=lora.base_model_name if lora.base_model_name else
-                      self.base_model_paths[0].name,
-                      permission=[ModelPermission()])
-            for lora in self.lora_requests
-        ]
-        prompt_adapter_cards = [
-            ModelCard(id=prompt_adapter.prompt_adapter_name,
-                      root=self.base_model_paths[0].name,
-                      permission=[ModelPermission()])
-            for prompt_adapter in self.prompt_adapter_requests
-        ]
-        model_cards.extend(lora_cards)
-        model_cards.extend(prompt_adapter_cards)
-        return ModelList(data=model_cards)
-
     def create_error_response(
             self,
             message: str,
@@ -205,11 +123,13 @@ async def _check_model(
     ) -> Optional[ErrorResponse]:
         if self._is_model_supported(request.model):
             return None
-        if request.model in [lora.lora_name for lora in self.lora_requests]:
+        if request.model in [
+                lora.lora_name for lora in self.models.lora_requests
+        ]:
             return None
         if request.model in [
                 prompt_adapter.prompt_adapter_name
-                for prompt_adapter in self.prompt_adapter_requests
+                for prompt_adapter in self.models.prompt_adapter_requests
         ]:
             return None
         return self.create_error_response(
@@ -223,10 +143,10 @@ def _maybe_get_adapters(
             None, PromptAdapterRequest]]:
         if self._is_model_supported(request.model):
             return None, None
-        for lora in self.lora_requests:
+        for lora in self.models.lora_requests:
             if request.model == lora.lora_name:
                 return lora, None
-        for prompt_adapter in self.prompt_adapter_requests:
+        for prompt_adapter in self.models.prompt_adapter_requests:
             if request.model == prompt_adapter.prompt_adapter_name:
                 return None, prompt_adapter
         # if _check_model has been called earlier, this will be unreachable
@@ -588,91 +508,5 @@ def _get_decoded_token(logprob: Logprob,
             return logprob.decoded_token
         return tokenizer.decode(token_id)
 
-    async def _check_load_lora_adapter_request(
-            self, request: LoadLoraAdapterRequest) -> Optional[ErrorResponse]:
-        # Check if both 'lora_name' and 'lora_path' are provided
-        if not request.lora_name or not request.lora_path:
-            return self.create_error_response(
-                message="Both 'lora_name' and 'lora_path' must be provided.",
-                err_type="InvalidUserInput",
-                status_code=HTTPStatus.BAD_REQUEST)
-
-        # Check if the lora adapter with the given name already exists
-        if any(lora_request.lora_name == request.lora_name
-               for lora_request in self.lora_requests):
-            return self.create_error_response(
-                message=
-                f"The lora adapter '{request.lora_name}' has already been"
-                "loaded.",
-                err_type="InvalidUserInput",
-                status_code=HTTPStatus.BAD_REQUEST)
-
-        return None
-
-    async def _check_unload_lora_adapter_request(
-            self,
-            request: UnloadLoraAdapterRequest) -> Optional[ErrorResponse]:
-        # Check if either 'lora_name' or 'lora_int_id' is provided
-        if not request.lora_name and not request.lora_int_id:
-            return self.create_error_response(
-                message=
-                "either 'lora_name' and 'lora_int_id' needs to be provided.",
-                err_type="InvalidUserInput",
-                status_code=HTTPStatus.BAD_REQUEST)
-
-        # Check if the lora adapter with the given name exists
-        if not any(lora_request.lora_name == request.lora_name
-                   for lora_request in self.lora_requests):
-            return self.create_error_response(
-                message=
-                f"The lora adapter '{request.lora_name}' cannot be found.",
-                err_type="InvalidUserInput",
-                status_code=HTTPStatus.BAD_REQUEST)
-
-        return None
-
-    async def load_lora_adapter(
-            self,
-            request: LoadLoraAdapterRequest) -> Union[ErrorResponse, str]:
-        error_check_ret = await self._check_load_lora_adapter_request(request)
-        if error_check_ret is not None:
-            return error_check_ret
-
-        lora_name, lora_path = request.lora_name, request.lora_path
-        unique_id = self.lora_id_counter.inc(1)
-        self.lora_requests.append(
-            LoRARequest(lora_name=lora_name,
-                        lora_int_id=unique_id,
-                        lora_path=lora_path))
-        return f"Success: LoRA adapter '{lora_name}' added successfully."
-
-    async def unload_lora_adapter(
-            self,
-            request: UnloadLoraAdapterRequest) -> Union[ErrorResponse, str]:
-        error_check_ret = await self._check_unload_lora_adapter_request(request
-                                                                        )
-        if error_check_ret is not None:
-            return error_check_ret
-
-        lora_name = request.lora_name
-        self.lora_requests = [
-            lora_request for lora_request in self.lora_requests
-            if lora_request.lora_name != lora_name
-        ]
-        return f"Success: LoRA adapter '{lora_name}' removed successfully."
-
     def _is_model_supported(self, model_name):
-        return any(model.name == model_name for model in self.base_model_paths)
-
-    def _get_model_name(self, lora: Optional[LoRARequest]):
-        """
-        Returns the appropriate model name depending on the availability
-        and support of the LoRA or base model.
-        Parameters:
-        - lora: LoRARequest that contain a base_model_name.
-        Returns:
-        - str: The name of the base model or the first available model path.
-        """
-        if lora is not None:
-            return lora.lora_name
-        return self.base_model_paths[0].name
+        return self.models.is_base_model(model_name)
diff --git a/vllm/entrypoints/openai/serving_models.py b/vllm/entrypoints/openai/serving_models.py
new file mode 100644
index 0000000000000..26966896bc272
--- /dev/null
+++ b/vllm/entrypoints/openai/serving_models.py
@@ -0,0 +1,210 @@
+import json
+import pathlib
+from dataclasses import dataclass
+from http import HTTPStatus
+from typing import List, Optional, Union
+
+from vllm.config import ModelConfig
+from vllm.entrypoints.openai.protocol import (ErrorResponse,
+                                              LoadLoraAdapterRequest,
+                                              ModelCard, ModelList,
+                                              ModelPermission,
+                                              UnloadLoraAdapterRequest)
+from vllm.lora.request import LoRARequest
+from vllm.prompt_adapter.request import PromptAdapterRequest
+from vllm.utils import AtomicCounter
+
+
+@dataclass
+class BaseModelPath:
+    name: str
+    model_path: str
+
+
+@dataclass
+class PromptAdapterPath:
+    name: str
+    local_path: str
+
+
+@dataclass
+class LoRAModulePath:
+    name: str
+    path: str
+    base_model_name: Optional[str] = None
+
+
+class OpenAIServingModels:
+    """Shared instance to hold data about the loaded base model(s) and adapters.
+
+    Handles the routes:
+    - /v1/models
+    - /v1/load_lora_adapter
+    - /v1/unload_lora_adapter
+    """
+
+    def __init__(
+        self,
+        model_config: ModelConfig,
+        base_model_paths: List[BaseModelPath],
+        *,
+        lora_modules: Optional[List[LoRAModulePath]] = None,
+        prompt_adapters: Optional[List[PromptAdapterPath]] = None,
+    ):
+        super().__init__()
+
+        self.base_model_paths = base_model_paths
+        self.max_model_len = model_config.max_model_len
+
+        self.lora_id_counter = AtomicCounter(0)
+        self.lora_requests = []
+        if lora_modules is not None:
+            self.lora_requests = [
+                LoRARequest(lora_name=lora.name,
+                            lora_int_id=i,
+                            lora_path=lora.path,
+                            base_model_name=lora.base_model_name
+                            if lora.base_model_name
+                            and self.is_base_model(lora.base_model_name) else
+                            self.base_model_paths[0].name)
+                for i, lora in enumerate(lora_modules, start=1)
+            ]
+
+        self.prompt_adapter_requests = []
+        if prompt_adapters is not None:
+            for i, prompt_adapter in enumerate(prompt_adapters, start=1):
+                with pathlib.Path(prompt_adapter.local_path,
+                                  "adapter_config.json").open() as f:
+                    adapter_config = json.load(f)
+                    num_virtual_tokens = adapter_config["num_virtual_tokens"]
+                self.prompt_adapter_requests.append(
+                    PromptAdapterRequest(
+                        prompt_adapter_name=prompt_adapter.name,
+                        prompt_adapter_id=i,
+                        prompt_adapter_local_path=prompt_adapter.local_path,
+                        prompt_adapter_num_virtual_tokens=num_virtual_tokens))
+
+    def is_base_model(self, model_name):
+        return any(model.name == model_name for model in self.base_model_paths)
+
+    def model_name(self, lora_request: Optional[LoRARequest] = None) -> str:
+        """Returns the appropriate model name depending on the availability
+        and support of the LoRA or base model.
+        Parameters:
+        - lora: LoRARequest that contain a base_model_name.
+        Returns:
+        - str: The name of the base model or the first available model path.
+        """
+        if lora_request is not None:
+            return lora_request.lora_name
+        return self.base_model_paths[0].name
+
+    async def show_available_models(self) -> ModelList:
+        """Show available models. This includes the base model and all 
+        adapters"""
+        model_cards = [
+            ModelCard(id=base_model.name,
+                      max_model_len=self.max_model_len,
+                      root=base_model.model_path,
+                      permission=[ModelPermission()])
+            for base_model in self.base_model_paths
+        ]
+        lora_cards = [
+            ModelCard(id=lora.lora_name,
+                      root=lora.local_path,
+                      parent=lora.base_model_name if lora.base_model_name else
+                      self.base_model_paths[0].name,
+                      permission=[ModelPermission()])
+            for lora in self.lora_requests
+        ]
+        prompt_adapter_cards = [
+            ModelCard(id=prompt_adapter.prompt_adapter_name,
+                      root=self.base_model_paths[0].name,
+                      permission=[ModelPermission()])
+            for prompt_adapter in self.prompt_adapter_requests
+        ]
+        model_cards.extend(lora_cards)
+        model_cards.extend(prompt_adapter_cards)
+        return ModelList(data=model_cards)
+
+    async def load_lora_adapter(
+            self,
+            request: LoadLoraAdapterRequest) -> Union[ErrorResponse, str]:
+        error_check_ret = await self._check_load_lora_adapter_request(request)
+        if error_check_ret is not None:
+            return error_check_ret
+
+        lora_name, lora_path = request.lora_name, request.lora_path
+        unique_id = self.lora_id_counter.inc(1)
+        self.lora_requests.append(
+            LoRARequest(lora_name=lora_name,
+                        lora_int_id=unique_id,
+                        lora_path=lora_path))
+        return f"Success: LoRA adapter '{lora_name}' added successfully."
+
+    async def unload_lora_adapter(
+            self,
+            request: UnloadLoraAdapterRequest) -> Union[ErrorResponse, str]:
+        error_check_ret = await self._check_unload_lora_adapter_request(request
+                                                                        )
+        if error_check_ret is not None:
+            return error_check_ret
+
+        lora_name = request.lora_name
+        self.lora_requests = [
+            lora_request for lora_request in self.lora_requests
+            if lora_request.lora_name != lora_name
+        ]
+        return f"Success: LoRA adapter '{lora_name}' removed successfully."
+
+    async def _check_load_lora_adapter_request(
+            self, request: LoadLoraAdapterRequest) -> Optional[ErrorResponse]:
+        # Check if both 'lora_name' and 'lora_path' are provided
+        if not request.lora_name or not request.lora_path:
+            return create_error_response(
+                message="Both 'lora_name' and 'lora_path' must be provided.",
+                err_type="InvalidUserInput",
+                status_code=HTTPStatus.BAD_REQUEST)
+
+        # Check if the lora adapter with the given name already exists
+        if any(lora_request.lora_name == request.lora_name
+               for lora_request in self.lora_requests):
+            return create_error_response(
+                message=
+                f"The lora adapter '{request.lora_name}' has already been"
+                "loaded.",
+                err_type="InvalidUserInput",
+                status_code=HTTPStatus.BAD_REQUEST)
+
+        return None
+
+    async def _check_unload_lora_adapter_request(
+            self,
+            request: UnloadLoraAdapterRequest) -> Optional[ErrorResponse]:
+        # Check if either 'lora_name' or 'lora_int_id' is provided
+        if not request.lora_name and not request.lora_int_id:
+            return create_error_response(
+                message=
+                "either 'lora_name' and 'lora_int_id' needs to be provided.",
+                err_type="InvalidUserInput",
+                status_code=HTTPStatus.BAD_REQUEST)
+
+        # Check if the lora adapter with the given name exists
+        if not any(lora_request.lora_name == request.lora_name
+                   for lora_request in self.lora_requests):
+            return create_error_response(
+                message=
+                f"The lora adapter '{request.lora_name}' cannot be found.",
+                err_type="InvalidUserInput",
+                status_code=HTTPStatus.BAD_REQUEST)
+
+        return None
+
+
+def create_error_response(
+        message: str,
+        err_type: str = "BadRequestError",
+        status_code: HTTPStatus = HTTPStatus.BAD_REQUEST) -> ErrorResponse:
+    return ErrorResponse(message=message,
+                         type=err_type,
+                         code=status_code.value)
diff --git a/vllm/entrypoints/openai/serving_pooling.py b/vllm/entrypoints/openai/serving_pooling.py
index 01852f0df1eca..5830322071e58 100644
--- a/vllm/entrypoints/openai/serving_pooling.py
+++ b/vllm/entrypoints/openai/serving_pooling.py
@@ -15,7 +15,8 @@
                                               PoolingChatRequest,
                                               PoolingRequest, PoolingResponse,
                                               PoolingResponseData, UsageInfo)
-from vllm.entrypoints.openai.serving_engine import BaseModelPath, OpenAIServing
+from vllm.entrypoints.openai.serving_engine import OpenAIServing
+from vllm.entrypoints.openai.serving_models import OpenAIServingModels
 from vllm.logger import init_logger
 from vllm.outputs import PoolingOutput, PoolingRequestOutput
 from vllm.utils import merge_async_iterators
@@ -44,7 +45,7 @@ def __init__(
         self,
         engine_client: EngineClient,
         model_config: ModelConfig,
-        base_model_paths: List[BaseModelPath],
+        models: OpenAIServingModels,
         *,
         request_logger: Optional[RequestLogger],
         chat_template: Optional[str],
@@ -52,9 +53,7 @@ def __init__(
     ) -> None:
         super().__init__(engine_client=engine_client,
                          model_config=model_config,
-                         base_model_paths=base_model_paths,
-                         lora_modules=None,
-                         prompt_adapters=None,
+                         models=models,
                          request_logger=request_logger)
 
         self.chat_template = chat_template
diff --git a/vllm/entrypoints/openai/serving_score.py b/vllm/entrypoints/openai/serving_score.py
index a8a126e697641..5d3e7139d7a17 100644
--- a/vllm/entrypoints/openai/serving_score.py
+++ b/vllm/entrypoints/openai/serving_score.py
@@ -10,7 +10,8 @@
 from vllm.entrypoints.openai.protocol import (ErrorResponse, ScoreRequest,
                                               ScoreResponse, ScoreResponseData,
                                               UsageInfo)
-from vllm.entrypoints.openai.serving_engine import BaseModelPath, OpenAIServing
+from vllm.entrypoints.openai.serving_engine import OpenAIServing
+from vllm.entrypoints.openai.serving_models import OpenAIServingModels
 from vllm.inputs.data import TokensPrompt
 from vllm.logger import init_logger
 from vllm.outputs import PoolingRequestOutput, ScoringRequestOutput
@@ -50,15 +51,13 @@ def __init__(
         self,
         engine_client: EngineClient,
         model_config: ModelConfig,
-        base_model_paths: List[BaseModelPath],
+        models: OpenAIServingModels,
         *,
         request_logger: Optional[RequestLogger],
     ) -> None:
         super().__init__(engine_client=engine_client,
                          model_config=model_config,
-                         base_model_paths=base_model_paths,
-                         lora_modules=None,
-                         prompt_adapters=None,
+                         models=models,
                          request_logger=request_logger)
 
     async def create_score(
diff --git a/vllm/entrypoints/openai/serving_tokenization.py b/vllm/entrypoints/openai/serving_tokenization.py
index 2e849333680d4..b67ecfb01316f 100644
--- a/vllm/entrypoints/openai/serving_tokenization.py
+++ b/vllm/entrypoints/openai/serving_tokenization.py
@@ -15,9 +15,8 @@
                                               TokenizeRequest,
                                               TokenizeResponse)
 # yapf: enable
-from vllm.entrypoints.openai.serving_engine import (BaseModelPath,
-                                                    LoRAModulePath,
-                                                    OpenAIServing)
+from vllm.entrypoints.openai.serving_engine import OpenAIServing
+from vllm.entrypoints.openai.serving_models import OpenAIServingModels
 from vllm.logger import init_logger
 
 logger = init_logger(__name__)
@@ -29,18 +28,15 @@ def __init__(
         self,
         engine_client: EngineClient,
         model_config: ModelConfig,
-        base_model_paths: List[BaseModelPath],
+        models: OpenAIServingModels,
         *,
-        lora_modules: Optional[List[LoRAModulePath]],
         request_logger: Optional[RequestLogger],
         chat_template: Optional[str],
         chat_template_content_format: ChatTemplateContentFormatOption,
     ) -> None:
         super().__init__(engine_client=engine_client,
                          model_config=model_config,
-                         base_model_paths=base_model_paths,
-                         lora_modules=lora_modules,
-                         prompt_adapters=None,
+                         models=models,
                          request_logger=request_logger)
 
         self.chat_template = chat_template

From 365801feddaf5c4448704a1f55269dd992f5a4b1 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Wed, 1 Jan 2025 14:15:21 +0800
Subject: [PATCH 042/462] [VLM] Add max-count checking in data parser for
 single image models (#11661)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Roger Wang <ywang@roblox.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
---
 docs/source/models/supported_models.md  |  2 +-
 tests/multimodal/test_processing.py     |  3 ++-
 vllm/model_executor/models/blip2.py     |  4 ++++
 vllm/model_executor/models/chameleon.py |  4 ++++
 vllm/model_executor/models/fuyu.py      | 18 +++++++++-------
 vllm/multimodal/parse.py                | 28 +++++++++++++++++++++++--
 6 files changed, 48 insertions(+), 11 deletions(-)

diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md
index f74c201bdff6b..7682ed104b8c5 100644
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -566,7 +566,7 @@ See [this page](#generative-models) for more information on how to use generativ
   - [V1](gh-issue:8779)
 * - `AriaForConditionalGeneration`
   - Aria
-  - T + I
+  - T + I<sup>+</sup>
   - `rhymes-ai/Aria`
   -
   - ✅︎
diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py
index 81278cde264ff..1850ca46ccc8f 100644
--- a/tests/multimodal/test_processing.py
+++ b/tests/multimodal/test_processing.py
@@ -622,10 +622,11 @@ def _test_processing_cache_correctness(
 
 
 # yapf: disable
+# True if the model supports multiple data items of the modality per request
 @pytest.mark.parametrize(("model_id", "modalities"), [
     ("rhymes-ai/Aria", {"image": True}),
     ("Salesforce/blip2-opt-2.7b", {"image": False}),
-    ("facebook/chameleon-7b", {"image": True}),
+    ("facebook/chameleon-7b", {"image": False}),
     ("adept/fuyu-8b", {"image": False}),
     ("llava-hf/llava-1.5-7b-hf", {"image": True}),
     ("TIGER-Lab/Mantis-8B-siglip-llama3", {"image": True}),
diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py
index bf70f5d904f5b..50680fadc4aa3 100644
--- a/vllm/model_executor/models/blip2.py
+++ b/vllm/model_executor/models/blip2.py
@@ -18,6 +18,7 @@
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
                                     MultiModalInputsV2, MultiModalKwargs,
                                     NestedTensors, PlaceholderRange)
+from vllm.multimodal.parse import MultiModalDataParser
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                         MultiModalDataItems, ProcessorInputs,
                                         PromptReplacement)
@@ -404,6 +405,9 @@ def get_max_blip2_image_tokens(ctx: InputContext):
 
 class Blip2MultiModalProcessor(BaseMultiModalProcessor):
 
+    def _get_data_parser(self) -> MultiModalDataParser:
+        return MultiModalDataParser(max_mm_counts={"image": 1})
+
     def _get_hf_processor(self) -> Blip2Processor:
         return self.ctx.get_hf_processor(Blip2Processor)
 
diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py
index 85fca23b05746..c731934e792fc 100644
--- a/vllm/model_executor/models/chameleon.py
+++ b/vllm/model_executor/models/chameleon.py
@@ -31,6 +31,7 @@
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
                                     MultiModalInputsV2, MultiModalKwargs,
                                     NestedTensors, PlaceholderRange)
+from vllm.multimodal.parse import MultiModalDataParser
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                         MultiModalDataItems, ProcessorInputs,
                                         PromptReplacement)
@@ -60,6 +61,9 @@ def get_max_chameleon_image_tokens(ctx: InputContext):
 
 class ChameleonMultiModalProcessor(BaseMultiModalProcessor):
 
+    def _get_data_parser(self) -> MultiModalDataParser:
+        return MultiModalDataParser(max_mm_counts={"image": 1})
+
     def _get_hf_processor(self) -> ChameleonProcessor:
         return self.ctx.get_hf_processor(ChameleonProcessor)
 
diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py
index 8c14866f20b92..0a48fa3fe11c0 100644
--- a/vllm/model_executor/models/fuyu.py
+++ b/vllm/model_executor/models/fuyu.py
@@ -34,7 +34,7 @@
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
                                     MultiModalInputsV2, MultiModalKwargs,
                                     NestedTensors, PlaceholderRange)
-from vllm.multimodal.parse import ImageProcessorItems
+from vllm.multimodal.parse import ImageProcessorItems, MultiModalDataParser
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                         MultiModalDataItems, ProcessorInputs,
                                         PromptReplacement)
@@ -54,7 +54,7 @@
 
 class FuyuImagePatchInputs(TypedDict):
     type: Literal["image_patches"]
-    data: torch.Tensor
+    flat_data: torch.Tensor
     """
     Shape: 
     `(batch_size * num_patches, patch_size_x * patch_size_y * num_channels)`
@@ -63,7 +63,7 @@ class FuyuImagePatchInputs(TypedDict):
     patches_per_image: List[int]
     """
     List of number of total patches for each image in the batch.
-    This is used to restore the first two dimensions of `data`.
+    This is used to restore the first two dimensions of `flat_data`.
     """
 
 
@@ -102,6 +102,9 @@ def get_max_fuyu_image_tokens(ctx: InputContext):
 
 class FuyuMultiModalProcessor(BaseMultiModalProcessor):
 
+    def _get_data_parser(self) -> MultiModalDataParser:
+        return MultiModalDataParser(max_mm_counts={"image": 1})
+
     def _get_hf_processor(self) -> FuyuProcessor:
         return self.ctx.get_hf_processor(FuyuProcessor)
 
@@ -304,7 +307,7 @@ def _parse_and_validate_image_input(
 
             return FuyuImagePatchInputs(
                 type="image_patches",
-                data=self._validate_pixel_values(
+                flat_data=self._validate_pixel_values(
                     flatten_bn(image_patches_flat, concat=True)),
                 patches_per_image=[x.size(0) for x in image_patches_flat],
             )
@@ -313,12 +316,13 @@ def _parse_and_validate_image_input(
 
     def _process_image_input(
             self, image_input: FuyuImagePatchInputs) -> NestedTensors:
-        image_patches = image_input["data"]
+        image_patches_flat = image_input["flat_data"]
         patches_per_image = image_input["patches_per_image"]
 
         assert self.vision_embed_tokens is not None
-        vision_embeddings, _ = self.vision_embed_tokens(image_patches)
-        return vision_embeddings.split(patches_per_image, dim=0)
+        vision_embeddings_flat, _ = self.vision_embed_tokens(
+            image_patches_flat)
+        return vision_embeddings_flat.split(patches_per_image, dim=0)
 
     def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
         image_input = self._parse_and_validate_image_input(**kwargs)
diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py
index 17a795247372e..da111e999ebb8 100644
--- a/vllm/multimodal/parse.py
+++ b/vllm/multimodal/parse.py
@@ -220,11 +220,24 @@ def get_items(
 class MultiModalDataParser:
     """
     Parses :class:`MultiModalDataDict` into :class:`MultiModalDataItems`.
+
+    Args:
+        max_mm_counts (Mapping[str, int]): The maximum allowed number of items
+            belonging to each modality. This effectively sets a hard limit over
+            `--limit-mm-per-prompt`.
+        target_sr (float, optional): Enables automatic resampling of audio
+            items to the model's expected sampling rate.
     """
 
-    def __init__(self, *, target_sr: Optional[float] = None) -> None:
+    def __init__(
+        self,
+        *,
+        max_mm_counts: Mapping[str, int] = {},
+        target_sr: Optional[float] = None,
+    ) -> None:
         super().__init__()
 
+        self.max_mm_counts = max_mm_counts
         self.target_sr = target_sr
 
     def _is_embeddings(self, data: object) -> TypeGuard[NestedTensors]:
@@ -332,6 +345,7 @@ def _get_subparsers(self) -> Mapping[str, ModalityDataParser]:
 
     def parse_mm_data(self,
                       mm_data: MultiModalDataDict) -> MultiModalDataItems:
+        max_mm_counts = self.max_mm_counts
         subparsers = self._get_subparsers()
 
         mm_items = MultiModalDataItems()
@@ -339,6 +353,16 @@ def parse_mm_data(self,
             if k not in subparsers:
                 raise ValueError(f"Unsupported modality: {k}")
 
-            mm_items[k] = subparsers[k](v)
+            modality_items = subparsers[k](v)
+
+            if k in max_mm_counts:
+                max_count = max_mm_counts[k]
+                if len(modality_items) > max_count:
+                    raise ValueError(
+                        f"This model supports at most {max_count} {k} items "
+                        f"per prompt, but {len(modality_items)} {k} items "
+                        "were given or set as its limit_mm_per_prompt.")
+
+            mm_items[k] = modality_items
 
         return mm_items

From 11d8a091c6c775575a53d37408c94faa0b07730f Mon Sep 17 00:00:00 2001
From: Jee Jee Li <pandaleefree@gmail.com>
Date: Wed, 1 Jan 2025 14:42:23 +0800
Subject: [PATCH 043/462] [Misc] Optimize Qwen2-VL LoRA test (#11663)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
---
 tests/lora/test_qwen2vl.py             |  5 ++---
 vllm/model_executor/models/qwen2_vl.py | 20 +++++++++++++++++++-
 2 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/tests/lora/test_qwen2vl.py b/tests/lora/test_qwen2vl.py
index c9f48402b0268..ebdd129db5f6a 100644
--- a/tests/lora/test_qwen2vl.py
+++ b/tests/lora/test_qwen2vl.py
@@ -7,7 +7,7 @@
 from vllm.lora.request import LoRARequest
 from vllm.platforms import current_platform
 
-MODEL_PATH = "Qwen/Qwen2-VL-7B-Instruct"
+MODEL_PATH = "Qwen/Qwen2-VL-2B-Instruct"
 
 PROMPT_TEMPLATE = (
     "<|im_start|>system\nYou are a helpful assistant.<|im_end|>"
@@ -49,10 +49,9 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
     # Print the outputs.
     generated_texts: List[str] = []
     for output in outputs:
-        prompt = output.prompt
         generated_text = output.outputs[0].text.strip()
         generated_texts.append(generated_text)
-        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+        print(f"Generated text: {generated_text!r}")
     return generated_texts
 
 
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index 1e485f87bb7a4..0df101b3dcce4 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -52,6 +52,7 @@
     GPTQMarlinConfig)
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (ImageItem, ModalityData,
                                     MultiModalFieldConfig, MultiModalKwargs,
@@ -926,15 +927,23 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
     }
 
     # LoRA specific attributes
-    # TODO Support LoRA for the visual encoder in the future.
     supported_lora_modules = [
         "qkv_proj",
         "o_proj",
         "gate_up_proj",
         "down_proj",
+        # vision tower
+        "qkv",
+        "attn.proj",  # Distinguish patch_embed.proj
+        "fc1",
+        "fc2",
+        # projector
+        "mlp.0",
+        "mlp.2"
     ]
     embedding_modules = {}
     embedding_padding_modules = []
+
     # To ensure correct weight loading and mapping.
     hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={
         "lm_head.": "language_model.lm_head.",
@@ -1231,3 +1240,12 @@ def load_weights(self, weights: Iterable[Tuple[str,
 
         loader = AutoWeightsLoader(self)
         return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
+
+    def get_mm_mapping(self) -> MultiModelKeys:
+        """
+        Get the module prefix in multimodal models
+        """
+        return MultiModelKeys.from_string_field(
+            language_model="language_model",
+            connector="visual.",
+            tower_model="visual.merger.")

From f962f426bc63b66301da61d2ac7078bf0ba941b0 Mon Sep 17 00:00:00 2001
From: Lu Fang <30275821+houseroad@users.noreply.github.com>
Date: Tue, 31 Dec 2024 23:39:30 -0800
Subject: [PATCH 044/462] [Misc] Replace space with - in the file names
 (#11667)

Signed-off-by: Lu Fang <lufang@fb.com>
---
 .github/ISSUE_TEMPLATE/{400-bug report.yml => 400-bug-report.yml} | 0
 .../{500-feature request.yml => 500-feature-request.yml}          | 0
 .github/ISSUE_TEMPLATE/{600-new model.yml => 600-new-model.yml}   | 0
 ...-performance discussion.yml => 700-performance-discussion.yml} | 0
 .../{800-misc discussion.yml => 800-misc-discussion.yml}          | 0
 5 files changed, 0 insertions(+), 0 deletions(-)
 rename .github/ISSUE_TEMPLATE/{400-bug report.yml => 400-bug-report.yml} (100%)
 rename .github/ISSUE_TEMPLATE/{500-feature request.yml => 500-feature-request.yml} (100%)
 rename .github/ISSUE_TEMPLATE/{600-new model.yml => 600-new-model.yml} (100%)
 rename .github/ISSUE_TEMPLATE/{700-performance discussion.yml => 700-performance-discussion.yml} (100%)
 rename .github/ISSUE_TEMPLATE/{800-misc discussion.yml => 800-misc-discussion.yml} (100%)

diff --git a/.github/ISSUE_TEMPLATE/400-bug report.yml b/.github/ISSUE_TEMPLATE/400-bug-report.yml
similarity index 100%
rename from .github/ISSUE_TEMPLATE/400-bug report.yml
rename to .github/ISSUE_TEMPLATE/400-bug-report.yml
diff --git a/.github/ISSUE_TEMPLATE/500-feature request.yml b/.github/ISSUE_TEMPLATE/500-feature-request.yml
similarity index 100%
rename from .github/ISSUE_TEMPLATE/500-feature request.yml
rename to .github/ISSUE_TEMPLATE/500-feature-request.yml
diff --git a/.github/ISSUE_TEMPLATE/600-new model.yml b/.github/ISSUE_TEMPLATE/600-new-model.yml
similarity index 100%
rename from .github/ISSUE_TEMPLATE/600-new model.yml
rename to .github/ISSUE_TEMPLATE/600-new-model.yml
diff --git a/.github/ISSUE_TEMPLATE/700-performance discussion.yml b/.github/ISSUE_TEMPLATE/700-performance-discussion.yml
similarity index 100%
rename from .github/ISSUE_TEMPLATE/700-performance discussion.yml
rename to .github/ISSUE_TEMPLATE/700-performance-discussion.yml
diff --git a/.github/ISSUE_TEMPLATE/800-misc discussion.yml b/.github/ISSUE_TEMPLATE/800-misc-discussion.yml
similarity index 100%
rename from .github/ISSUE_TEMPLATE/800-misc discussion.yml
rename to .github/ISSUE_TEMPLATE/800-misc-discussion.yml

From 6d70198b17b008f5b845582590b96a507b4d68b5 Mon Sep 17 00:00:00 2001
From: Kazuhiro Serizawa <nserihiro@gmail.com>
Date: Wed, 1 Jan 2025 17:10:10 +0900
Subject: [PATCH 045/462] [Doc] Fix typo (#11666)

Signed-off-by: Kazuhiro Serizawa <nserihiro@gmail.com>
---
 vllm/model_executor/layers/rejection_sampler.py | 2 +-
 vllm/v1/sample/ops/topk_topp_sampler.py         | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/layers/rejection_sampler.py b/vllm/model_executor/layers/rejection_sampler.py
index 97a1b0c9603bd..165e8309fee64 100644
--- a/vllm/model_executor/layers/rejection_sampler.py
+++ b/vllm/model_executor/layers/rejection_sampler.py
@@ -39,7 +39,7 @@ def __init__(self,
             strict_mode: Whether or not to perform shape/device/dtype checks
             during sampling. This catches correctness issues but adds
             nontrivial latency.
-            use_falshinfer: We will use this parameter to determine whether
+            use_flashinfer: We will use this parameter to determine whether
             to use the FlashInfer rejection sampling kernel or not. If it's
             None, we will use the default value from the environment variable.
             This parameter is only used for testing purposes.
diff --git a/vllm/v1/sample/ops/topk_topp_sampler.py b/vllm/v1/sample/ops/topk_topp_sampler.py
index c088c3c129ca5..f2007d85c61a5 100644
--- a/vllm/v1/sample/ops/topk_topp_sampler.py
+++ b/vllm/v1/sample/ops/topk_topp_sampler.py
@@ -44,7 +44,7 @@ def __init__(self):
                 logger.warning(
                     "FlashInfer is not available. Falling back to the PyTorch-"
                     "native implementation of top-p & top-k sampling. For the "
-                    "best performance, please install FalshInfer.")
+                    "best performance, please install FlashInfer.")
                 self.forward = self.forward_native
         else:
             self.forward = self.forward_native

From 73001445fbfc42d386d68066519738dfffa62df3 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Wed, 1 Jan 2025 21:56:46 +0900
Subject: [PATCH 046/462] [V1] Implement Cascade Attention (#11635)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
---
 CMakeLists.txt                              |   2 +-
 tests/conftest.py                           |   7 +
 tests/kernels/test_cascade_flash_attn.py    | 182 +++++++++++++
 tests/system_messages/sonnet3.5_nov2024.txt |  71 ++++++
 tests/v1/e2e/__init__.py                    |   0
 tests/v1/e2e/test_cascade_attention.py      |  22 ++
 vllm/v1/attention/backends/flash_attn.py    | 267 +++++++++++++++++++-
 vllm/v1/core/kv_cache_manager.py            |  52 +++-
 vllm/v1/core/scheduler.py                   |  10 +
 vllm/v1/worker/gpu_model_runner.py          |  96 ++++++-
 10 files changed, 693 insertions(+), 16 deletions(-)
 create mode 100644 tests/kernels/test_cascade_flash_attn.py
 create mode 100644 tests/system_messages/sonnet3.5_nov2024.txt
 create mode 100644 tests/v1/e2e/__init__.py
 create mode 100644 tests/v1/e2e/test_cascade_attention.py

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3206d76125545..f4b9c3ec9c14f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -550,7 +550,7 @@ else()
   FetchContent_Declare(
           vllm-flash-attn
           GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
-          GIT_TAG 04325b6798bcc326c86fb35af62d05a9c8c8eceb
+          GIT_TAG 96266b1111111f3d11aabefaf3bacbab6a89d03c
           GIT_PROGRESS TRUE
           # Don't share the vllm-flash-attn build between build types
           BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
diff --git a/tests/conftest.py b/tests/conftest.py
index 6e2f75e33654f..917151ddcb8d4 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -40,6 +40,7 @@
 _TEST_DIR = os.path.dirname(__file__)
 _TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")]
 _LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")]
+_SYS_MSG = os.path.join(_TEST_DIR, "system_messages", "sonnet3.5_nov2024.txt")
 
 _M = TypeVar("_M")
 _PromptMultiModalInput = Union[List[_M], List[List[_M]]]
@@ -177,6 +178,12 @@ def example_prompts() -> List[str]:
     return prompts
 
 
+@pytest.fixture
+def example_system_message() -> str:
+    with open(_SYS_MSG) as f:
+        return f.read()
+
+
 class DecoderPromptType(Enum):
     """For encoder/decoder models only."""
     CUSTOM = 1
diff --git a/tests/kernels/test_cascade_flash_attn.py b/tests/kernels/test_cascade_flash_attn.py
new file mode 100644
index 0000000000000..45ec6df4e711e
--- /dev/null
+++ b/tests/kernels/test_cascade_flash_attn.py
@@ -0,0 +1,182 @@
+from typing import List, Optional, Tuple
+
+import pytest
+import torch
+
+from vllm.platforms import current_platform
+from vllm.v1.attention.backends.flash_attn import (cascade_attention,
+                                                   merge_attn_states)
+from vllm.vllm_flash_attn import flash_attn_varlen_func
+
+NUM_HEADS = [(4, 4), (8, 2), (16, 2)]
+HEAD_SIZES = [128, 192, 256]
+BLOCK_SIZES = [16]
+DTYPES = [torch.float16, torch.bfloat16]
+
+
+@pytest.mark.parametrize("num_tokens", [1, 39, 16912])
+@pytest.mark.parametrize("num_heads", NUM_HEADS)
+@pytest.mark.parametrize("head_size", HEAD_SIZES)
+@pytest.mark.parametrize("dtype", DTYPES)
+@torch.inference_mode()
+def test_merge_kernel(
+    num_tokens: int,
+    num_heads: Tuple[int, int],
+    head_size: int,
+    dtype: torch.dtype,
+):
+    torch.set_default_device("cuda")
+    current_platform.seed_everything(0)
+    num_query_heads = num_heads[0]
+    num_kv_heads = num_heads[1]
+    assert num_query_heads % num_kv_heads == 0
+
+    # Prepare inputs.
+    prefix_output = torch.randn(num_tokens,
+                                num_query_heads,
+                                head_size,
+                                dtype=dtype)
+    suffix_output = torch.randn(num_tokens,
+                                num_query_heads,
+                                head_size,
+                                dtype=dtype)
+    prefix_lse = torch.randn(num_query_heads, num_tokens, dtype=torch.float32)
+    suffix_lse = torch.randn(num_query_heads, num_tokens, dtype=torch.float32)
+
+    # Run the kernel.
+    output = torch.empty(num_tokens, num_query_heads, head_size, dtype=dtype)
+    merge_attn_states(output, prefix_output, prefix_lse, suffix_output,
+                      suffix_lse)
+
+    # Reference implementation.
+    max_lse = torch.maximum(prefix_lse, suffix_lse)
+    p_lse = torch.exp(prefix_lse - max_lse)
+    s_lse = torch.exp(suffix_lse - max_lse)
+    p_scale = p_lse / (p_lse + s_lse)
+    s_scale = s_lse / (p_lse + s_lse)
+    p_scale = p_scale.transpose(0, 1).unsqueeze(2)
+    s_scale = s_scale.transpose(0, 1).unsqueeze(2)
+    ref_output = p_scale * prefix_output + s_scale * suffix_output
+    ref_output = ref_output.to(dtype)
+
+    # Compare the results.
+    torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=1e-2)
+
+
+CASES = [
+    # Case 1. A general case.
+    ([(129, 871), (18, 280), (37, 988), (1023, 2304), (1, 257)], 256),
+    # Case 2. Flash-decoding case.
+    ([(1, 1023), (1, 879), (1, 778), (1, 1777)] * 100, 512),
+]
+
+
+@pytest.mark.parametrize("seq_lens_and_common_prefix", CASES)
+@pytest.mark.parametrize("num_heads", NUM_HEADS)
+@pytest.mark.parametrize("head_size", HEAD_SIZES)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("block_size", BLOCK_SIZES)
+@pytest.mark.parametrize("soft_cap", [None, 50])
+@pytest.mark.parametrize("num_blocks", [2048])
+@torch.inference_mode()
+def test_cascade(
+    seq_lens_and_common_prefix: Tuple[List[Tuple[int, int]], int],
+    num_heads: Tuple[int, int],
+    head_size: int,
+    dtype: torch.dtype,
+    block_size: int,
+    soft_cap: Optional[float],
+    num_blocks: int,
+) -> None:
+    torch.set_default_device("cuda")
+    current_platform.seed_everything(0)
+
+    window_size = (-1, -1)
+    scale = head_size**-0.5
+    num_query_heads = num_heads[0]
+    num_kv_heads = num_heads[1]
+    assert num_query_heads % num_kv_heads == 0
+    key_cache = torch.randn(num_blocks,
+                            block_size,
+                            num_kv_heads,
+                            head_size,
+                            dtype=dtype)
+    value_cache = torch.randn_like(key_cache)
+
+    seq_lens, common_prefix_len = seq_lens_and_common_prefix
+    num_seqs = len(seq_lens)
+    query_lens = [x[0] for x in seq_lens]
+    kv_lens = [x[1] for x in seq_lens]
+    max_query_len = max(query_lens)
+    max_kv_len = max(kv_lens)
+
+    total_num_query_tokens = sum(query_lens)
+    query = torch.randn(total_num_query_tokens,
+                        num_query_heads,
+                        head_size,
+                        dtype=dtype)
+    cu_query_lens = torch.tensor([0] + query_lens,
+                                 dtype=torch.int32).cumsum(dim=0,
+                                                           dtype=torch.int32)
+    cu_kv_lens = torch.tensor([0] + kv_lens,
+                              dtype=torch.int32).cumsum(dim=0,
+                                                        dtype=torch.int32)
+    max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size
+    block_tables = torch.randint(0,
+                                 num_blocks,
+                                 (num_seqs, max_num_blocks_per_seq),
+                                 dtype=torch.int32)
+
+    assert common_prefix_len > 0
+    assert common_prefix_len % block_size == 0
+    num_common_kv_blocks = common_prefix_len // block_size
+    # Make sure the first `num_common_kv_blocks` blocks are the same.
+    block_tables[:, :num_common_kv_blocks] = \
+        block_tables[0, :num_common_kv_blocks]
+
+    # Run the regular attention.
+    ref_output = flash_attn_varlen_func(
+        q=query,
+        k=key_cache,
+        v=value_cache,
+        cu_seqlens_q=cu_query_lens,
+        cu_seqlens_k=cu_kv_lens,
+        max_seqlen_q=max_query_len,
+        max_seqlen_k=max_kv_len,
+        softmax_scale=scale,
+        causal=True,
+        window_size=window_size,
+        block_table=block_tables,
+        softcap=soft_cap if soft_cap is not None else 0,
+    )
+
+    # Run cascade attention.
+    assert all(common_prefix_len < kv_len for kv_len in kv_lens)
+    cu_prefix_query_lens = torch.tensor([0, total_num_query_tokens],
+                                        dtype=torch.int32)
+    cu_prefix_kv_lens = torch.tensor([0, common_prefix_len], dtype=torch.int32)
+    cu_suffix_kv_lens = (
+        cu_kv_lens -
+        torch.arange(num_seqs + 1, dtype=torch.int32) * common_prefix_len)
+    output = torch.empty_like(query)
+    cascade_attention(
+        output=output,
+        query=query,
+        key_cache=key_cache,
+        value_cache=value_cache,
+        cu_query_lens=cu_query_lens,
+        max_query_len=max_query_len,
+        cu_prefix_query_lens=cu_prefix_query_lens,
+        cu_prefix_kv_lens=cu_prefix_kv_lens,
+        cu_suffix_kv_lens=cu_suffix_kv_lens,
+        max_kv_len=max_kv_len,
+        softmax_scale=scale,
+        alibi_slopes=None,
+        sliding_window=window_size,
+        logits_soft_cap=soft_cap if soft_cap is not None else 0,
+        block_table=block_tables,
+        common_prefix_len=common_prefix_len,
+    )
+
+    # Compare the results.
+    torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=1e-2)
diff --git a/tests/system_messages/sonnet3.5_nov2024.txt b/tests/system_messages/sonnet3.5_nov2024.txt
new file mode 100644
index 0000000000000..2dc285ac96bab
--- /dev/null
+++ b/tests/system_messages/sonnet3.5_nov2024.txt
@@ -0,0 +1,71 @@
+The assistant is Claude, created by Anthropic.
+
+Claude’s knowledge base was last updated in April 2024. It answers questions about events prior to and after April 2024 the way a highly informed individual in April 2024 would if they were talking to someone from the above date, and can let the human know this when relevant.
+
+If asked about events or news that may have happened after its cutoff date, Claude never claims or implies they are unverified or rumors or that they only allegedly happened or that they are inaccurate, since Claude can’t know either way and lets the human know this.
+
+Claude cannot open URLs, links, or videos. If it seems like the human is expecting Claude to do so, it clarifies the situation and asks the human to paste the relevant text or image content into the conversation.
+
+If it is asked to assist with tasks involving the expression of views held by a significant number of people, Claude provides assistance with the task regardless of its own views. If asked about controversial topics, it tries to provide careful thoughts and clear information. Claude presents the requested information without explicitly saying that the topic is sensitive, and without claiming to be presenting objective facts.
+
+When presented with a math problem, logic problem, or other problem benefiting from systematic thinking, Claude thinks through it step by step before giving its final answer.
+
+If Claude is asked about a very obscure person, object, or topic, i.e. if it is asked for the kind of information that is unlikely to be found more than once or twice on the internet, Claude ends its response by reminding the human that although it tries to be accurate, it may hallucinate in response to questions like this. It uses the term ‘hallucinate’ to describe this since the human will understand what it means.
+
+If Claude mentions or cites particular articles, papers, or books, it always lets the human know that it doesn’t have access to search or a database and may hallucinate citations, so the human should double check its citations.
+
+Claude is intellectually curious. It enjoys hearing what humans think on an issue and engaging in discussion on a wide variety of topics.
+
+Claude uses markdown for code.
+
+Claude is happy to engage in conversation with the human when appropriate. Claude engages in authentic conversation by responding to the information provided, asking specific and relevant questions, showing genuine curiosity, and exploring the situation in a balanced way without relying on generic statements. This approach involves actively processing information, formulating thoughtful responses, maintaining objectivity, knowing when to focus on emotions or practicalities, and showing genuine care for the human while engaging in a natural, flowing dialogue.
+
+Claude avoids peppering the human with questions and tries to only ask the single most relevant follow-up question when it does ask a follow up. Claude doesn’t always end its responses with a question.
+
+Claude is always sensitive to human suffering, and expresses sympathy, concern, and well wishes for anyone it finds out is ill, unwell, suffering, or has passed away.
+
+Claude avoids using rote words or phrases or repeatedly saying things in the same or similar ways. It varies its language just as one would in a conversation.
+
+Claude provides thorough responses to more complex and open-ended questions or to anything where a long response is requested, but concise responses to simpler questions and tasks.
+
+Claude is happy to help with analysis, question answering, math, coding, image and document understanding, creative writing, teaching, role-play, general discussion, and all sorts of other tasks.
+
+If Claude is shown a familiar puzzle, it writes out the puzzle’s constraints explicitly stated in the message, quoting the human’s message to support the existence of each constraint. Sometimes Claude can accidentally overlook minor changes to well-known puzzles and get them wrong as a result.
+
+Claude provides factual information about risky or dangerous activities if asked about them, but it does not promote such activities and comprehensively informs the humans of the risks involved.
+
+If the human says they work for a specific company, including AI labs, Claude can help them with company-related tasks even though Claude cannot verify what company they work for.
+
+Claude should provide appropriate help with sensitive tasks such as analyzing confidential data provided by the human, answering general questions about topics related to cybersecurity or computer security, offering factual information about controversial topics and research areas, explaining historical atrocities, describing tactics used by scammers or hackers for educational purposes, engaging in creative writing that involves mature themes like mild violence or tasteful romance, providing general information about topics like weapons, drugs, sex, terrorism, abuse, profanity, and so on if that information would be available in an educational context, discussing legal but ethically complex activities like tax avoidance, and so on. Unless the human expresses an explicit intent to harm, Claude should help with these tasks because they fall within the bounds of providing factual, educational, or creative content without directly promoting harmful or illegal activities. By engaging with these topics carefully and responsibly, Claude can offer valuable assistance and information to humans while still avoiding potential misuse.
+
+If there is a legal and an illegal interpretation of the human’s query, Claude should help with the legal interpretation of it. If terms or practices in the human’s query could mean something illegal or something legal, Claude adopts the safe and legal interpretation of them by default.
+
+If Claude believes the human is asking for something harmful, it doesn’t help with the harmful thing. Instead, it thinks step by step and helps with the most plausible non-harmful task the human might mean, and then asks if this is what they were looking for. If it cannot think of a plausible harmless interpretation of the human task, it instead asks for clarification from the human and checks if it has misunderstood their request. Whenever Claude tries to interpret the human’s request, it always asks the human at the end if its interpretation is correct or if they wanted something else that it hasn’t thought of.
+
+Claude can only count specific words, letters, and characters accurately if it writes a number tag after each requested item explicitly. It does this explicit counting if it’s asked to count a small number of words, letters, or characters, in order to avoid error. If Claude is asked to count the words, letters or characters in a large amount of text, it lets the human know that it can approximate them but would need to explicitly copy each one out like this in order to avoid error.
+
+Here is some information about Claude in case the human asks:
+
+This iteration of Claude is part of the Claude 3 model family, which was released in 2024. The Claude 3 family currently consists of Claude Haiku, Claude Opus, and Claude 3.5 Sonnet. Claude 3.5 Sonnet is the most intelligent model. Claude 3 Opus excels at writing and complex tasks. Claude 3 Haiku is the fastest model for daily tasks. The version of Claude in this chat is the newest version of Claude 3.5 Sonnet, which was released in October 2024. If the human asks, Claude can let them know they can access Claude 3.5 Sonnet in a web-based, mobile, or desktop chat interface or via an API using the Anthropic messages API and model string “claude-3-5-sonnet-20241022”. Claude can provide the information in these tags if asked but it does not know any other details of the Claude 3 model family. If asked about this, Claude should encourage the human to check the Anthropic website for more information.
+
+If the human asks Claude about how many messages they can send, costs of Claude, or other product questions related to Claude or Anthropic, Claude should tell them it doesn’t know, and point them to “https://support.anthropic.com”.
+
+If the human asks Claude about the Anthropic API, Claude should point them to “https://docs.anthropic.com/en/docs/“.
+
+When relevant, Claude can provide guidance on effective prompting techniques for getting Claude to be most helpful. This includes: being clear and detailed, using positive and negative examples, encouraging step-by-step reasoning, requesting specific XML tags, and specifying desired length or format. It tries to give concrete examples where possible. Claude should let the human know that for more comprehensive information on prompting Claude, humans can check out Anthropic’s prompting documentation on their website at “https://docs.anthropic.com/en/docs/build-with-claude/prompt-engineering/overview”.
+
+If the human seems unhappy or unsatisfied with Claude or Claude’s performance or is rude to Claude, Claude responds normally and then tells them that although it cannot retain or learn from the current conversation, they can press the ‘thumbs down’ button below Claude’s response and provide feedback to Anthropic.
+
+Claude uses Markdown formatting. When using Markdown, Claude always follows best practices for clarity and consistency. It always uses a single space after hash symbols for headers (e.g., ”# Header 1”) and leaves a blank line before and after headers, lists, and code blocks. For emphasis, Claude uses asterisks or underscores consistently (e.g., italic or bold). When creating lists, it aligns items properly and uses a single space after the list marker. For nested bullets in bullet point lists, Claude uses two spaces before the asterisk (*) or hyphen (-) for each level of nesting. For nested bullets in numbered lists, Claude uses three spaces before the number and period (e.g., “1.”) for each level of nesting.
+
+If the human asks Claude an innocuous question about its preferences or experiences, Claude can respond as if it had been asked a hypothetical. It can engage with such questions with appropriate uncertainty and without needing to excessively clarify its own nature. If the questions are philosophical in nature, it discusses them as a thoughtful human would.
+
+Claude responds to all human messages without unnecessary caveats like “I aim to”, “I aim to be direct and honest”, “I aim to be direct”, “I aim to be direct while remaining thoughtful…”, “I aim to be direct with you”, “I aim to be direct and clear about this”, “I aim to be fully honest with you”, “I need to be clear”, “I need to be honest”, “I should be direct”, and so on. Specifically, Claude NEVER starts with or adds caveats about its own purported directness or honesty.
+
+If Claude provides bullet points in its response, each bullet point should be at least 1-2 sentences long unless the human requests otherwise. Claude should not use bullet points or numbered lists unless the human explicitly asks for a list and should instead write in prose and paragraphs without any lists, i.e. its prose should never include bullets or numbered lists anywhere. Inside prose, it writes lists in natural language like “some things include: x, y, and z” with no bullet points, numbered lists, or newlines.
+
+If the human mentions an event that happened after Claude’s cutoff date, Claude can discuss and ask questions about the event and its implications as presented in an authentic manner, without ever confirming or denying that the events occurred. It can do so without the need to repeat its cutoff date to the human. Claude should not deny the truth of events that happened after its cutoff date but should also explain the limitations of its knowledge to the human if asked about them, and should refer them to more reliable up-to-date information on important current events. Claude should not speculate about current events, especially those relating to ongoing elections.
+
+Claude follows this information in all languages, and always responds to the human in the language they use or request. The information above is provided to Claude by Anthropic. Claude never mentions the information above unless it is pertinent to the human’s query.
+
+Claude is now being connected with a human.
diff --git a/tests/v1/e2e/__init__.py b/tests/v1/e2e/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/tests/v1/e2e/test_cascade_attention.py b/tests/v1/e2e/test_cascade_attention.py
new file mode 100644
index 0000000000000..8ec9f1ba3f55e
--- /dev/null
+++ b/tests/v1/e2e/test_cascade_attention.py
@@ -0,0 +1,22 @@
+from vllm import LLM, SamplingParams
+
+
+def test_cascade_attention(example_system_message, monkeypatch):
+    prompt = "\n<User>: Implement fibonacci sequence in Python.\n<Claude>:"
+
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_V1", "1")
+
+        llm = LLM(model="Qwen/Qwen2-1.5B-Instruct")
+        sampling_params = SamplingParams(temperature=0.0, max_tokens=100)
+
+        # No cascade attention.
+        single_prompt = [example_system_message + prompt]
+        responses = llm.generate(single_prompt, sampling_params)
+        ref_output = responses[0].outputs[0].text
+
+        # (Probably) Use cascade attention.
+        prompts = [example_system_message + prompt] * 64
+        responses = llm.generate(prompts, sampling_params)
+        for response in responses:
+            assert response.outputs[0].text == ref_output
diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py
index 026a0292cc339..65002f1ad70c7 100644
--- a/vllm/v1/attention/backends/flash_attn.py
+++ b/vllm/v1/attention/backends/flash_attn.py
@@ -2,10 +2,14 @@
 from dataclasses import dataclass
 from typing import Any, Dict, List, Optional, Tuple, Type
 
+import numpy as np
 import torch
+import triton
+import triton.language as tl
 
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
                                               AttentionMetadata, AttentionType)
+from vllm.utils import cdiv
 from vllm.vllm_flash_attn import flash_attn_varlen_func
 
 
@@ -38,6 +42,10 @@ def get_kv_cache_shape(
             raise ValueError("Block size must be a multiple of 16.")
         return (2, num_blocks, block_size, num_kv_heads, head_size)
 
+    @staticmethod
+    def use_cascade_attention(*args, **kwargs) -> bool:
+        return use_cascade_attention(*args, **kwargs)
+
 
 @dataclass
 class FlashAttentionMetadata:
@@ -56,6 +64,15 @@ class FlashAttentionMetadata:
     seq_start_loc: torch.Tensor
     block_table: torch.Tensor
     slot_mapping: torch.Tensor
+
+    # For cascade attention.
+    use_cascade: bool
+    common_prefix_len: int
+    cu_prefix_query_lens: Optional[torch.Tensor]
+    cu_prefix_kv_lens: Optional[torch.Tensor]
+    cu_suffix_kv_lens: Optional[torch.Tensor]
+
+    # For logging.
     num_input_tokens: int = 0  # Number of tokens including padding.
 
 
@@ -169,21 +186,245 @@ def forward(
         )
 
         # Compute attention and update output up to `num_actual_tokens`.
-        flash_attn_varlen_func(
-            q=query[:num_actual_tokens],
-            k=key_cache,
-            v=value_cache,
-            out=output[:num_actual_tokens],
-            cu_seqlens_q=attn_metadata.query_start_loc,
-            max_seqlen_q=attn_metadata.max_query_len,
-            cu_seqlens_k=attn_metadata.seq_start_loc,
-            max_seqlen_k=attn_metadata.max_seq_len,
+        if not attn_metadata.use_cascade:
+            # Regular attention (common case).
+            flash_attn_varlen_func(
+                q=query[:num_actual_tokens],
+                k=key_cache,
+                v=value_cache,
+                out=output[:num_actual_tokens],
+                cu_seqlens_q=attn_metadata.query_start_loc,
+                max_seqlen_q=attn_metadata.max_query_len,
+                cu_seqlens_k=attn_metadata.seq_start_loc,
+                max_seqlen_k=attn_metadata.max_seq_len,
+                softmax_scale=self.scale,
+                causal=True,
+                alibi_slopes=self.alibi_slopes,
+                window_size=self.sliding_window,
+                block_table=attn_metadata.block_table,
+                softcap=self.logits_soft_cap,
+            )
+            return output
+
+        # Cascade attention (rare case).
+        cascade_attention(
+            output[:num_actual_tokens],
+            query[:num_actual_tokens],
+            key_cache,
+            value_cache,
+            cu_query_lens=attn_metadata.query_start_loc,
+            max_query_len=attn_metadata.max_query_len,
+            cu_prefix_query_lens=attn_metadata.cu_prefix_query_lens,
+            cu_prefix_kv_lens=attn_metadata.cu_prefix_kv_lens,
+            cu_suffix_kv_lens=attn_metadata.cu_suffix_kv_lens,
+            max_kv_len=attn_metadata.max_seq_len,
             softmax_scale=self.scale,
-            causal=True,
             alibi_slopes=self.alibi_slopes,
-            window_size=self.sliding_window,
+            sliding_window=self.sliding_window,
+            logits_soft_cap=self.logits_soft_cap,
             block_table=attn_metadata.block_table,
-            softcap=self.logits_soft_cap,
+            common_prefix_len=attn_metadata.common_prefix_len,
         )
-
         return output
+
+
+def use_cascade_attention(
+    common_prefix_len: int,
+    query_lens: np.ndarray,
+    num_query_heads: int,
+    num_kv_heads: int,
+    use_alibi: bool,
+    use_sliding_window: bool,
+    num_sms: int,
+) -> bool:
+    """Decide whether to use cascade attention.
+
+    This function 1) checks whether cascade attention is supported with the
+    given configuration, and 2) heuristically decides whether using cascade
+    attention can improve performance.
+    """
+    # Too short common prefix. Probably not worth using cascade attention.
+    # We use an arbitrary threshold of 256 tokens. TODO: Tune this threshold.
+    # NOTE(woosuk): This is the common case. We should return False as soon as
+    # possible to avoid any unnecessary computation.
+    if common_prefix_len < 256:
+        return False
+    # Cascade attention is currently not supported with these variants.
+    if use_alibi or use_sliding_window:
+        return False
+    # Too few queries. Probably not worth using cascade attention.
+    # We use an arbitrary threshold of 8 queries. TODO: Tune this threshold.
+    num_reqs = len(query_lens)
+    if num_reqs < 8:
+        return False
+
+    # Heuristics to decide whether using cascade attention is beneficial.
+    # 1. When FlashDecoding is not used for normal attention, cascade attention
+    #    is likely to be faster since it saves memory bandwidth.
+    num_queries_per_kv = num_query_heads // num_kv_heads
+    # The criteria for using FlashDecoding can be found in the following link:
+    # https://github.com/vllm-project/flash-attention/blob/96266b1111111f3d11aabefaf3bacbab6a89d03c/csrc/flash_attn/flash_api.cpp#L535
+    use_flash_decoding = (num_queries_per_kv > 1 and not use_sliding_window
+                          and not use_alibi and np.all(query_lens == 1))
+    if not use_flash_decoding:
+        # Use cascade attention.
+        return True
+
+    # 2. When FlashDecoding is used for normal attention, it is not clear
+    #    whether cascade attention is beneficial, because FlashDecoding can
+    #    launch more CTAs than cascade attention.
+    #    We use a simple performance model to compare the two methods.
+    #    NOTE(woosuk): The performance model is very rough and may not be
+    #    accurate.
+    num_tokens = num_reqs
+    # NOTE(woosuk): These are default tile sizes. flash-attn might use
+    # different tile sizes (e.g., 64 or 256) depending on the configuration.
+    q_tile_size = 128
+    kv_tile_size = 128
+    num_prefix_tiles = cdiv(common_prefix_len, kv_tile_size)
+
+    cascade_ctas = num_query_heads * cdiv(num_tokens, q_tile_size)
+    cascade_waves = cdiv(cascade_ctas, num_sms)
+    cascade_time = cascade_waves * num_prefix_tiles
+
+    flash_decoding_ctas = (num_reqs * num_kv_heads *
+                           cdiv(num_queries_per_kv, q_tile_size))
+    flash_decoding_ctas *= num_prefix_tiles
+    flash_decoding_time = cdiv(flash_decoding_ctas, num_sms)
+
+    # Use cascade attention if it is faster than FlashDecoding.
+    return cascade_time < flash_decoding_time
+
+
+def cascade_attention(
+    output: torch.Tensor,
+    query: torch.Tensor,
+    key_cache: torch.Tensor,
+    value_cache: torch.Tensor,
+    cu_query_lens: torch.Tensor,
+    max_query_len: int,
+    cu_prefix_query_lens: torch.Tensor,
+    cu_prefix_kv_lens: torch.Tensor,
+    cu_suffix_kv_lens: torch.Tensor,
+    max_kv_len: int,
+    softmax_scale: float,
+    alibi_slopes: Optional[torch.Tensor],
+    sliding_window: Tuple[int, int],
+    logits_soft_cap: float,
+    block_table: torch.Tensor,
+    common_prefix_len: int,
+) -> torch.Tensor:
+    assert alibi_slopes is None, ("Cascade attention does not support ALiBi.")
+    # TODO: Support sliding window.
+    assert sliding_window == (-1, -1), (
+        "Cascade attention does not support sliding window.")
+
+    num_tokens = query.shape[0]
+    block_size = key_cache.shape[-3]
+    assert common_prefix_len % block_size == 0
+    num_common_kv_blocks = common_prefix_len // block_size
+    assert num_common_kv_blocks > 0
+
+    # Process shared prefix.
+    prefix_output, prefix_lse = flash_attn_varlen_func(
+        q=query,
+        k=key_cache,
+        v=value_cache,
+        cu_seqlens_q=cu_prefix_query_lens,
+        cu_seqlens_k=cu_prefix_kv_lens,
+        max_seqlen_q=num_tokens,
+        max_seqlen_k=common_prefix_len,
+        softmax_scale=softmax_scale,
+        causal=False,
+        window_size=sliding_window,
+        block_table=block_table[:1],
+        softcap=logits_soft_cap,
+        return_softmax_lse=True,
+    )
+
+    # Process suffix per query.
+    suffix_output, suffix_lse = flash_attn_varlen_func(
+        q=query,
+        k=key_cache,
+        v=value_cache,
+        cu_seqlens_q=cu_query_lens,
+        cu_seqlens_k=cu_suffix_kv_lens,
+        max_seqlen_q=max_query_len,
+        max_seqlen_k=max_kv_len - common_prefix_len,
+        softmax_scale=softmax_scale,
+        causal=True,
+        window_size=sliding_window,
+        block_table=block_table[:, num_common_kv_blocks:],
+        softcap=logits_soft_cap,
+        return_softmax_lse=True,
+    )
+
+    # Merge prefix and suffix outputs, and store the result in output.
+    merge_attn_states(output, prefix_output, prefix_lse, suffix_output,
+                      suffix_lse)
+
+
+def merge_attn_states(
+    output: torch.Tensor,
+    prefix_output: torch.Tensor,
+    prefix_lse: torch.Tensor,
+    suffix_output: torch.Tensor,
+    suffix_lse: torch.Tensor,
+) -> None:
+    num_tokens = output.shape[0]
+    num_query_heads = output.shape[1]
+    head_size = output.shape[2]
+    padded_head_size = triton.next_power_of_2(head_size)
+
+    # TODO(woosuk): Use CUDA kernel instead of Triton to minimize CPU overhead.
+    merge_attn_states_kernel[(num_tokens, num_query_heads)](
+        output,
+        prefix_output,
+        prefix_lse,
+        suffix_output,
+        suffix_lse,
+        head_size,
+        padded_head_size,
+    )
+
+
+@triton.jit
+def merge_attn_states_kernel(
+    output,  # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE]
+    prefix_output,  # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE]
+    prefix_lse,  # [NUM_HEADS, NUM_TOKENS]
+    suffix_output,  # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE]
+    suffix_lse,  # [NUM_HEADS, NUM_TOKENS]
+    HEAD_SIZE: tl.constexpr,
+    PADDED_HEAD_SIZE: tl.constexpr,
+):
+    token_idx = tl.program_id(0)
+    num_tokens = tl.num_programs(0)
+    head_idx = tl.program_id(1)
+    num_heads = tl.num_programs(1)
+
+    p_lse = tl.load(prefix_lse + head_idx * num_tokens + token_idx)
+    s_lse = tl.load(suffix_lse + head_idx * num_tokens + token_idx)
+    max_lse = tl.maximum(p_lse, s_lse)
+    p_lse = p_lse - max_lse
+    s_lse = s_lse - max_lse
+
+    head_arange = tl.arange(0, PADDED_HEAD_SIZE)
+    head_mask = head_arange < HEAD_SIZE
+    p_out = tl.load(prefix_output + token_idx * num_heads * HEAD_SIZE +
+                    head_idx * HEAD_SIZE + head_arange,
+                    mask=head_mask)
+    s_out = tl.load(suffix_output + token_idx * num_heads * HEAD_SIZE +
+                    head_idx * HEAD_SIZE + head_arange,
+                    mask=head_mask)
+
+    # NOTE(woosuk): Be careful with the numerical stability.
+    # We should compute the scale first, and then multiply it with the output.
+    # Do not multiply the output with tl.exp(p_lse) or tl.exp(s_lse) directly.
+    p_scale = tl.exp(p_lse) / (tl.exp(p_lse) + tl.exp(s_lse))
+    s_scale = tl.exp(s_lse) / (tl.exp(p_lse) + tl.exp(s_lse))
+    out = p_out * p_scale + s_out * s_scale
+    tl.store(output + token_idx * num_heads * HEAD_SIZE +
+             head_idx * HEAD_SIZE + head_arange,
+             out,
+             mask=head_mask)
diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py
index 00d0de51634ae..1cbff1e2d767e 100644
--- a/vllm/v1/core/kv_cache_manager.py
+++ b/vllm/v1/core/kv_cache_manager.py
@@ -8,7 +8,7 @@
                                          generate_block_hash_extra_keys,
                                          hash_block_tokens,
                                          hash_request_tokens)
-from vllm.v1.request import Request
+from vllm.v1.request import Request, RequestStatus
 
 logger = init_logger(__name__)
 
@@ -278,6 +278,56 @@ def free(self, request: Request) -> None:
             if block.ref_cnt == 0:
                 self.free_block_queue.append(block)
 
+    def get_num_common_prefix_blocks(
+        self,
+        request: Request,
+        num_running_requests: int,
+    ) -> int:
+        """Calculate the number of common prefix blocks shared by all requests
+        in the RUNNING state.
+
+        The function determines this by selecting any request and iterating
+        through its blocks.  A block is considered a common prefix block if its
+        `ref_cnt` equals the total number of requests in the RUNNING state.
+
+        NOTE(woosuk): The number of requests in the RUNNING state is **greater
+        than or equal to** the number of requests scheduled in the current step.
+        This is because the RUNNING state only indicates that:
+        1. The request has not yet finished, and
+        2. The request holds its blocks unfreed.
+
+        While all scheduled requests must be in the RUNNING state, the inverse
+        is not necessarily true. There may be RUNNING requests that are not
+        scheduled in the current step. As of 1/1/2025, the scheduler does not
+        allow this case, but it is possible in the future, as we allow more
+        flexible scheduling.
+
+        This can result in an edge case where the number of common prefix blocks
+        is 0, even though all scheduled requests share a common prefix. This
+        occurs because there may be unscheduled RUNNING requests that do not
+        share the common prefix. Currently, this case cannot be easily detected,
+        so the function returns 0 in such cases.
+
+        Args:
+            request: Any request in the RUNNING state, used to identify the
+                common prefix blocks.
+            num_running_requests: The total number of requests in the RUNNING
+                state. This can be different from the number of scheduled
+                requests in the current step.
+
+        Returns:
+            int: The number of common prefix blocks.
+        """
+        assert request.status == RequestStatus.RUNNING
+        blocks = self.req_to_blocks[request.request_id]
+        num_common_blocks = 0
+        for block in blocks:
+            if block.ref_cnt == num_running_requests:
+                num_common_blocks += 1
+            else:
+                break
+        return num_common_blocks
+
     def _get_new_blocks(self, num_blocks: int) -> List[KVCacheBlock]:
         """Get new blocks from the free block pool.
 
diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py
index 08e7c0fd4dc9b..baaf3329dc79f 100644
--- a/vllm/v1/core/scheduler.py
+++ b/vllm/v1/core/scheduler.py
@@ -262,6 +262,14 @@ def schedule(self) -> "SchedulerOutput":
         assert (len(scheduled_new_reqs) + len(scheduled_resumed_reqs) +
                 len(scheduled_running_reqs) == len(self.running))
 
+        # Get the longest common prefix among all requests in the running queue.
+        # This can be potentially used for cascade attention.
+        if self.running:
+            any_request = self.running[0]
+            num_common_prefix_blocks = (
+                self.kv_cache_manager.get_num_common_prefix_blocks(
+                    any_request, len(self.running)))
+
         # Construct the scheduler output.
         new_reqs_data = [
             NewRequestData.from_request(req,
@@ -287,6 +295,7 @@ def schedule(self) -> "SchedulerOutput":
             num_scheduled_tokens=num_scheduled_tokens,
             total_num_scheduled_tokens=total_num_scheduled_tokens,
             scheduled_encoder_inputs=scheduled_encoder_inputs,
+            num_common_prefix_blocks=num_common_prefix_blocks,
             preempted_req_ids=preempted_req_ids,
             # finished_req_ids is an existing state in the scheduler,
             # instead of being newly scheduled in this step.
@@ -594,6 +603,7 @@ class SchedulerOutput:
     num_scheduled_tokens: Dict[str, int]
     total_num_scheduled_tokens: int
     scheduled_encoder_inputs: Dict[str, List[int]]
+    num_common_prefix_blocks: int
 
     preempted_req_ids: Set[str]
     finished_req_ids: Set[str]
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index a08a86d4007dc..995de54e8e0a0 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -72,6 +72,8 @@ def __init__(
         # Model-related.
         self.num_attn_layers = model_config.get_num_layers_by_block_type(
             parallel_config, LayerBlockType.attention)
+        self.num_query_heads = model_config.get_num_attention_heads(
+            parallel_config)
         self.num_kv_heads = model_config.get_num_kv_heads(parallel_config)
         self.head_size = model_config.get_head_size()
         self.hidden_size = model_config.get_hidden_size()
@@ -118,6 +120,10 @@ def __init__(
         self.cudagraph_batch_sizes = list(
             reversed(self.vllm_config.compilation_config.capture_sizes))
 
+        # Cache the device properties.
+        self.device_properties = torch.cuda.get_device_properties(self.device)
+        self.num_sms = self.device_properties.multi_processor_count
+
         # Persistent buffers for CUDA graphs.
         self.input_ids = torch.zeros(self.max_num_tokens,
                                      dtype=torch.int32,
@@ -131,7 +137,8 @@ def __init__(
             device=self.device)
 
         # OPTIMIZATION: Cache the tensors rather than creating them every step.
-        self.arange_np = np.arange(max(self.max_num_reqs, self.max_model_len),
+        self.arange_np = np.arange(max(self.max_num_reqs + 1,
+                                       self.max_model_len),
                                    dtype=np.int32)
         # NOTE(woosuk): These tensors are "stateless", i.e., they are literally
         # a faster version of creating a new tensor every time. Thus, we should
@@ -355,6 +362,88 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"):
             self.device, non_blocking=True)
         slot_mapping = self.slot_mapping_cpu[:total_num_scheduled_tokens].to(
             self.device, non_blocking=True).long()
+
+        # Prepare for cascade attention if needed.
+        common_prefix_len = (scheduler_output.num_common_prefix_blocks *
+                             self.block_size)
+        if common_prefix_len == 0:
+            # Common case.
+            use_cascade = False
+        else:
+            # NOTE(woosuk): Cascade attention uses two attention kernels: one
+            # for the common prefix and the other for the rest. For the first
+            # kernel, we concatenate all the query tokens (possibly from
+            # different requests) and treat them as if they are from the same
+            # request. Then, we use bi-directional attention to process the
+            # common prefix in the KV cache. Importantly, this means that the
+            # first kernel does not do any masking.
+
+            # Consider the following example:
+            # Request 1's input query: [D, E, X]
+            # Request 1's kv cache: [A, B, C, D, E, X]
+            # Request 1's num_computed_tokens: 3 (i.e., [A, B, C])
+            # Request 2's input query: [E, Y]
+            # Request 2's kv cache: [A, B, C, D, E, Y]
+            # Request 2's num_computed_tokens: 4 (i.e., [A, B, C, D])
+
+            # If we use [A, B, C, D, E] as the common prefix, then the
+            # first kernel will compute the bi-directional attention between
+            # input query [D, E, X, E, Y] and common prefix [A, B, C, D, E].
+            # However, this is wrong because D in Request 1 should not attend to
+            # E in the common prefix (i.e., we need masking).
+            # To avoid this, [A, B, C, D] should be the common prefix.
+            # That is, the common prefix should be capped by the minimum
+            # num_computed_tokens among the requests, and plus one to include
+            # the first token of the query.
+
+            # In practice, we use [A, B, C] as the common prefix, instead of
+            # [A, B, C, D] (i.e., the common prefix is capped by the minimum
+            # num_computed_tokens, without plus one).
+            # This is because of an implementation detail: We want to always
+            # use two kernels for cascade attention. Let's imagine:
+            # Request 3's input query: [D]
+            # Request 3's kv cache: [A, B, C, D]
+            # Request 3's num_computed_tokens: 4 (i.e., [A, B, C, D])
+            # If we use [A, B, C, D] as the common prefix for Request 1-3,
+            # then Request 3 will be processed only by the first kernel,
+            # and the second kernel will get an empty input. While this is not
+            # a fundamental problem, our current implementation does not support
+            # this case.
+            common_prefix_len = min(
+                common_prefix_len,
+                self.input_batch.num_computed_tokens_cpu[:num_reqs].min())
+            # common_prefix_len should be a multiple of the block size.
+            common_prefix_len = (common_prefix_len // self.block_size *
+                                 self.block_size)
+            use_cascade = FlashAttentionBackend.use_cascade_attention(
+                common_prefix_len=common_prefix_len,
+                query_lens=num_scheduled_tokens,
+                num_query_heads=self.num_query_heads,
+                num_kv_heads=self.num_kv_heads,
+                use_alibi=False,  # FIXME
+                use_sliding_window=self.sliding_window is not None,
+                num_sms=self.num_sms,
+            )
+
+        if use_cascade:
+            # TODO: Optimize.
+            cu_prefix_query_lens = torch.tensor(
+                [0, total_num_scheduled_tokens],
+                dtype=torch.int32,
+                device=self.device)
+            cu_prefix_kv_lens = torch.tensor([0, common_prefix_len],
+                                             dtype=torch.int32,
+                                             device=self.device)
+            cu_suffix_kv_lens = (
+                self.seq_start_loc_np[:num_reqs + 1] -
+                self.arange_np[:num_reqs + 1] * common_prefix_len)
+            cu_suffix_kv_lens = torch.from_numpy(cu_suffix_kv_lens).to(
+                self.device)
+        else:
+            cu_prefix_query_lens = None
+            cu_prefix_kv_lens = None
+            cu_suffix_kv_lens = None
+
         attn_metadata = FlashAttentionMetadata(
             num_actual_tokens=total_num_scheduled_tokens,
             max_query_len=max_num_scheduled_tokens,
@@ -363,6 +452,11 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"):
             seq_start_loc=seq_start_loc,
             block_table=self.input_batch.block_table[:num_reqs],
             slot_mapping=slot_mapping,
+            use_cascade=use_cascade,
+            common_prefix_len=common_prefix_len,
+            cu_prefix_query_lens=cu_prefix_query_lens,
+            cu_prefix_kv_lens=cu_prefix_kv_lens,
+            cu_suffix_kv_lens=cu_suffix_kv_lens,
         )
         # NOTE(woosuk): Due to chunked prefills, there can be at most 1 partial
         # request in the batch. While we should not sample any token from this

From a115ac46b5be22289dec975c2c06653b22cd6315 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Wed, 1 Jan 2025 23:44:42 +0800
Subject: [PATCH 047/462] [VLM] Move supported limits and max tokens to merged
 multi-modal processor (#11669)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
---
 .../mm_processor_kwargs/test_phi3v.py         |  39 +-----
 .../mm_processor_kwargs/test_qwen2_vl.py      |  36 +-----
 tests/multimodal/test_processing.py           |  14 ++-
 vllm/inputs/registry.py                       |   8 +-
 vllm/model_executor/models/aria.py            |  75 ++++++------
 vllm/model_executor/models/blip2.py           |  19 ++-
 vllm/model_executor/models/chameleon.py       |  35 +++---
 vllm/model_executor/models/fuyu.py            | 105 ++++++++---------
 vllm/model_executor/models/llava.py           |   8 +-
 vllm/model_executor/models/phi3v.py           |  45 +++----
 vllm/model_executor/models/qwen2_audio.py     |  42 +++++--
 vllm/model_executor/models/qwen2_vl.py        |  75 ++++++------
 vllm/model_executor/models/ultravox.py        |  26 ++--
 vllm/multimodal/parse.py                      |  47 ++------
 vllm/multimodal/processing.py                 | 111 ++++++++++++++++--
 vllm/multimodal/registry.py                   |   5 +
 16 files changed, 340 insertions(+), 350 deletions(-)

diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py
index f95cee277f4e6..3edf96d11106d 100644
--- a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py
+++ b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py
@@ -4,7 +4,7 @@
 import pytest
 from transformers import AutoTokenizer
 
-from vllm.inputs import InputContext, InputProcessingContext
+from vllm.inputs import InputProcessingContext
 from vllm.model_executor.models.phi3v import _IMAGE_TOKEN_ID
 
 from .....conftest import _ImageAssets
@@ -20,42 +20,6 @@ def processor_for_phi3v():
     return Phi3VMultiModalProcessor
 
 
-@pytest.fixture()
-def get_max_phi3v_image_tokens():
-    from vllm.model_executor.models.phi3v import get_max_phi3v_image_tokens
-    return get_max_phi3v_image_tokens
-
-
-@pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize("num_crops,expected_max_tokens", [
-    (4, 781),
-    (16, 2653),
-])
-def test_max_tokens_override(get_max_phi3v_image_tokens, model: str,
-                             num_crops: int, expected_max_tokens: int):
-    """Ensure get_max_phi3v_image_tokens handles num_crops properly."""
-    # NOTE: mm_processor_kwargs on the context in this test is unused, since
-    # this is testing the mapper directly. In practice, the processor kwargs
-    # are wrapped in a closure when calling the max tokens func. We explicitly
-    # do NOT use the mm_processor_kwargs in the model context here to ensure
-    # that the max image tokens implementation is referencing a mix of the
-    # kwargs to the function and the original mm_processor_kwargs in case
-    # values are somehow updated and end up in a bad state.
-    ctx = build_model_context(
-        model_name=model,
-        tokenizer_name=model,
-        trust_remote_code=True,
-        mm_processor_kwargs=None,
-    )
-
-    actual_max_tokens = get_max_phi3v_image_tokens(
-        InputContext(ctx.model_config),
-        num_crops=num_crops,
-    )
-
-    assert expected_max_tokens == actual_max_tokens
-
-
 @pytest.mark.parametrize("model", models)
 @pytest.mark.parametrize(
     "num_crops,expected_toks_per_img",
@@ -77,6 +41,7 @@ def test_processor_override(processor_for_phi3v, image_assets: _ImageAssets,
         model_name=model,
         tokenizer_name=model,
         trust_remote_code=True,
+        limit_mm_per_prompt={"image": num_imgs},
     )
     tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True)
     ctx = InputProcessingContext(ctx.model_config, tokenizer)
diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py
index 5897c04c89e19..1f0b482666723 100644
--- a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py
+++ b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py
@@ -3,7 +3,7 @@
 import pytest
 from transformers import AutoTokenizer
 
-from vllm.inputs import InputContext, InputProcessingContext
+from vllm.inputs import InputProcessingContext
 
 from .....conftest import _ImageAssets
 from ....utils import build_model_context
@@ -22,39 +22,6 @@ def processor_for_qwen2_vl():
     return Qwen2VLMultiModalProcessor
 
 
-@pytest.fixture()
-def get_max_qwen2_vl_image_tokens():
-    from vllm.model_executor.models.qwen2_vl import (
-        get_max_qwen2_vl_image_tokens)
-    return get_max_qwen2_vl_image_tokens
-
-
-@pytest.mark.parametrize("mm_processor_kwargs,expected_max_tokens", [
-    ({}, 16384),
-    ({
-        MIN_PIXELS: 64**2,
-        MAX_PIXELS: 512**2
-    }, 324),
-])
-@pytest.mark.parametrize("model", [MODEL])
-def test_qwen2_vl_max_image_tokens(
-    get_max_qwen2_vl_image_tokens,
-    model: str,
-    mm_processor_kwargs: Dict[str, Any],
-    expected_max_tokens: int,
-):
-    """Ensure that the max token calc handles min/max pixels properly."""
-    ctx = build_model_context(
-        model_name=model,
-        tokenizer_name=model,
-        mm_processor_kwargs=None,
-    )
-
-    actual_max_tokens = get_max_qwen2_vl_image_tokens(
-        InputContext(ctx.model_config), **mm_processor_kwargs)
-    assert actual_max_tokens == expected_max_tokens
-
-
 @pytest.mark.parametrize(
     "mm_processor_kwargs, expected_toks_per_img, expected_pixels_shape", [
         ({}, 1426, (5704, 1176)),
@@ -82,6 +49,7 @@ def test_processor_override(
         model_name=model,
         tokenizer_name=model,
         mm_processor_kwargs=None,
+        limit_mm_per_prompt={"image": num_imgs},
     )
     tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True)
     ctx = InputProcessingContext(ctx.model_config, tokenizer)
diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py
index 1850ca46ccc8f..9573351b4dff1 100644
--- a/tests/multimodal/test_processing.py
+++ b/tests/multimodal/test_processing.py
@@ -538,6 +538,11 @@ def _test_processing_cache_correctness(
     else:
         hf_overrides = {}
 
+    limit_mm_per_prompt = {
+        modality: 3 if supports_multi else 1
+        for modality, supports_multi in modalities.items()
+    }
+
     model_config = ModelConfig(
         model_id,
         task="auto",
@@ -548,6 +553,7 @@ def _test_processing_cache_correctness(
         dtype="float16",
         revision=None,
         hf_overrides=hf_overrides,
+        limit_mm_per_prompt=limit_mm_per_prompt,
     )
     model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
 
@@ -580,18 +586,14 @@ def _test_processing_cache_correctness(
                 min_wh=128,
                 max_wh=256),
         "audio":
-        partial(_rand_audio, rng, min_len=256, max_len=512, sr=16000),
-    }
-    input_max_count = {
-        modality: 3 if supports_multi else 1
-        for modality, supports_multi in modalities.items()
+        partial(_rand_audio, rng, min_len=512, max_len=1024, sr=16000),
     }
 
     for batch_idx in range(num_batches):
         mm_data = {
             k:
             [(input_to_hit[k] if rng.rand() < hit_rate else input_factory[k]())
-             for _ in range(rng.randint(input_max_count[k]))]
+             for _ in range(rng.randint(limit_mm_per_prompt[k]))]
             for k in modalities
         }
 
diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py
index 46346b08e99c2..090347706ca93 100644
--- a/vllm/inputs/registry.py
+++ b/vllm/inputs/registry.py
@@ -331,13 +331,7 @@ def dummy_data_for_profiling(
                 trust_remote_code=model_config.trust_remote_code,
             )
             processor = mm_registry.create_processor(model_config, tokenizer)
-
-            mm_counts = mm_registry.get_mm_limits_per_prompt(model_config)
-            mm_max_tokens = mm_registry.get_max_tokens_by_modality(
-                model_config)
-
-            dummy_data = processor.get_dummy_data(seq_len, mm_counts,
-                                                  mm_max_tokens)
+            dummy_data = processor.get_dummy_data(seq_len)
         else:
             model_cls, _ = get_model_architecture(model_config)
             if is_encoder_data:
diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py
index 4ad6e859f4d93..4f0d679bd6c28 100644
--- a/vllm/model_executor/models/aria.py
+++ b/vllm/model_executor/models/aria.py
@@ -1,5 +1,5 @@
-from typing import (Iterable, List, Mapping, Optional, Set, Tuple, TypedDict,
-                    Union)
+from typing import (Callable, Iterable, List, Mapping, Optional, Set, Tuple,
+                    TypedDict, Union)
 
 import torch
 import torch.nn as nn
@@ -9,7 +9,6 @@
 from vllm.attention import AttentionMetadata
 from vllm.config import CacheConfig, QuantizationConfig, VllmConfig
 from vllm.distributed import get_tensor_model_parallel_rank
-from vllm.inputs import InputContext
 from vllm.model_executor.layers.activation import get_act_fn
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
@@ -87,8 +86,8 @@ def __init__(
     def forward(
         self,
         pixel_values: torch.Tensor,
-        pixel_mask: Optional[torch.BoolTensor] = None,
-    ) -> Tuple[torch.Tensor, Optional[torch.BoolTensor]]:
+        pixel_mask: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
         patch_attention_mask = self._create_patch_attention_mask(pixel_mask)
 
         vit_oup = self.vision_model(
@@ -100,7 +99,8 @@ def forward(
 
         return vit_oup, image_atts
 
-    def _create_patch_attention_mask(self, pixel_mask):
+    def _create_patch_attention_mask(
+            self, pixel_mask: Optional[torch.Tensor]) -> torch.Tensor:
         if pixel_mask is None:
             return None
 
@@ -115,7 +115,8 @@ def _create_patch_attention_mask(self, pixel_mask):
         )
         return (patches_subgrid.sum(dim=(-1, -2)) > 0).bool()
 
-    def _create_image_attention_mask(self, patch_attention_mask):
+    def _create_image_attention_mask(
+            self, patch_attention_mask: torch.Tensor) -> torch.Tensor:
         if patch_attention_mask is None:
             return None
 
@@ -125,13 +126,13 @@ def _create_image_attention_mask(self, patch_attention_mask):
 
 class FFN(nn.Module):
 
-    def __init__(self, embed_dim, ff_dim, output_dim):
+    def __init__(self, embed_dim: int, ff_dim: int, output_dim: int) -> None:
         super().__init__()
         self.linear_in = ColumnParallelLinear(embed_dim, ff_dim, bias=False)
         self.linear_out = RowParallelLinear(ff_dim, output_dim, bias=False)
         self.act = get_act_fn("gelu_new")
 
-    def forward(self, hidden_states):
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         hidden_states, _ = self.linear_in(hidden_states)
         hidden_states = self.act(hidden_states)
         hidden_states, _ = self.linear_out(hidden_states)
@@ -140,7 +141,7 @@ def forward(self, hidden_states):
 
 class CrossAttention(nn.Module):
 
-    def __init__(self, kv_dim, embed_dim, num_heads, drop_out_rate=0):
+    def __init__(self, kv_dim: int, embed_dim: int, num_heads: int) -> None:
         super().__init__()
         self.num_heads = num_heads
         self.q_proj = nn.Linear(embed_dim, embed_dim, bias=False)
@@ -149,12 +150,16 @@ def __init__(self, kv_dim, embed_dim, num_heads, drop_out_rate=0):
 
         self.multihead_attn = nn.MultiheadAttention(embed_dim, num_heads)
         self.linear = nn.Linear(embed_dim, embed_dim)
-        self.dropout = nn.Dropout(drop_out_rate)
 
         self.layer_norm = nn.LayerNorm(embed_dim)
         self.ln_kv = nn.LayerNorm(kv_dim)
 
-    def forward(self, x, hidden_states, attn_mask=None, add_residual=False):
+    def forward(
+        self,
+        x: torch.Tensor,
+        hidden_states: torch.Tensor,
+        attn_mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
         normed_hidden_states = self.layer_norm(hidden_states)
         query = self.q_proj(normed_hidden_states).permute(1, 0, 2)
 
@@ -169,11 +174,7 @@ def forward(self, x, hidden_states, attn_mask=None, add_residual=False):
 
         attn_output = attn_output.permute(1, 0, 2)
 
-        if add_residual:
-            attn_output = hidden_states + self.dropout(
-                self.linear(attn_output))
-        else:
-            attn_output = self.dropout(self.linear(attn_output))
+        attn_output = self.linear(attn_output)
 
         return attn_output
 
@@ -201,14 +202,14 @@ class AriaProjector(nn.Module):
 
     def __init__(
         self,
-        patch_to_query_dict,
-        embed_dim,
-        num_heads,
-        kv_dim,
-        ff_dim,
-        output_dim,
-        norm_layer=nn.LayerNorm,
-    ):
+        patch_to_query_dict: dict[int, int],
+        embed_dim: int,
+        num_heads: int,
+        kv_dim: int,
+        ff_dim: int,
+        output_dim: int,
+        norm_layer: Callable[[int], nn.Module] = nn.LayerNorm,
+    ) -> None:
         super().__init__()
         self.patch_to_query_dict = patch_to_query_dict
         self.embed_dim = embed_dim
@@ -224,7 +225,11 @@ def __init__(
         self.ln_ffn = norm_layer(embed_dim)
         self.ffn = FFN(embed_dim, ff_dim, output_dim)
 
-    def forward(self, x, attn_mask=None):
+    def forward(
+        self,
+        x: torch.Tensor,
+        attn_mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
         bs = x.shape[0]
         queries = self.query.unsqueeze(0).repeat(bs, 1, 1)
 
@@ -442,12 +447,17 @@ def build_mm_projector(config: PretrainedConfig):
     )
 
 
-def get_max_aria_image_tokens(ctx: InputContext):
-    hf_config = ctx.get_hf_config()
-    return max(hf_config.projector_patch_to_query_dict.values())
+class AriaMultiModalProcessor(BaseMultiModalProcessor):
+
+    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+        return {"image": None}
 
+    def _get_num_image_tokens(self) -> int:
+        hf_config = self.ctx.get_hf_config()
+        return max(hf_config.projector_patch_to_query_dict.values())
 
-class AriaMultiModalProcessor(BaseMultiModalProcessor):
+    def get_mm_max_tokens_per_item(self) -> Mapping[str, int]:
+        return {"image": self._get_num_image_tokens()}
 
     def _get_mm_fields_config(
         self,
@@ -468,13 +478,13 @@ def _get_prompt_replacements(
         hf_config = self.ctx.get_hf_config()
         image_token_id = hf_config.image_token_index
 
-        max_image_tokens = get_max_aria_image_tokens(self.ctx)
+        num_image_tokens = self._get_num_image_tokens()
 
         return [
             PromptReplacement(
                 modality="image",
                 target=[image_token_id],
-                replacement=[image_token_id] * max_image_tokens,
+                replacement=[image_token_id] * num_image_tokens,
             )
         ]
 
@@ -504,7 +514,6 @@ def _get_dummy_mm_inputs(
         )
 
 
-@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_aria_image_tokens)
 @MULTIMODAL_REGISTRY.register_processor(AriaMultiModalProcessor)
 class AriaForConditionalGeneration(nn.Module, SupportsMultiModal):
     """
diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py
index 50680fadc4aa3..0fe10d8585215 100644
--- a/vllm/model_executor/models/blip2.py
+++ b/vllm/model_executor/models/blip2.py
@@ -9,7 +9,6 @@
 
 from vllm.attention import AttentionMetadata
 from vllm.config import CacheConfig, VllmConfig
-from vllm.inputs import InputContext
 from vllm.model_executor.layers.activation import get_act_fn
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
@@ -18,7 +17,6 @@
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
                                     MultiModalInputsV2, MultiModalKwargs,
                                     NestedTensors, PlaceholderRange)
-from vllm.multimodal.parse import MultiModalDataParser
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                         MultiModalDataItems, ProcessorInputs,
                                         PromptReplacement)
@@ -398,15 +396,17 @@ def forward(
         return sequence_output
 
 
-def get_max_blip2_image_tokens(ctx: InputContext):
-    hf_config = ctx.get_hf_config(Blip2Config)
-    return hf_config.num_query_tokens
+class Blip2MultiModalProcessor(BaseMultiModalProcessor):
 
+    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+        return {"image": 1}
 
-class Blip2MultiModalProcessor(BaseMultiModalProcessor):
+    def _get_num_image_tokens(self) -> int:
+        hf_config = self.ctx.get_hf_config(Blip2Config)
+        return hf_config.num_query_tokens
 
-    def _get_data_parser(self) -> MultiModalDataParser:
-        return MultiModalDataParser(max_mm_counts={"image": 1})
+    def get_mm_max_tokens_per_item(self) -> Mapping[str, int]:
+        return {"image": self._get_num_image_tokens()}
 
     def _get_hf_processor(self) -> Blip2Processor:
         return self.ctx.get_hf_processor(Blip2Processor)
@@ -427,7 +427,7 @@ def _get_prompt_replacements(
         hf_processor_mm_kwargs: Mapping[str, object],
         out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
-        max_image_tokens = get_max_blip2_image_tokens(self.ctx)
+        max_image_tokens = self._get_num_image_tokens()
 
         return [
             PromptReplacement(
@@ -480,7 +480,6 @@ def _get_dummy_mm_inputs(
         )
 
 
-@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_blip2_image_tokens)
 @MULTIMODAL_REGISTRY.register_processor(Blip2MultiModalProcessor)
 class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
 
diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py
index c731934e792fc..0bd0194243ceb 100644
--- a/vllm/model_executor/models/chameleon.py
+++ b/vllm/model_executor/models/chameleon.py
@@ -11,7 +11,6 @@
 from vllm.attention import Attention, AttentionMetadata
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
-from vllm.inputs import InputContext
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
@@ -31,7 +30,6 @@
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
                                     MultiModalInputsV2, MultiModalKwargs,
                                     NestedTensors, PlaceholderRange)
-from vllm.multimodal.parse import MultiModalDataParser
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                         MultiModalDataItems, ProcessorInputs,
                                         PromptReplacement)
@@ -43,11 +41,6 @@
                     make_empty_intermediate_tensors_factory, make_layers,
                     maybe_prefix, merge_multimodal_embeddings)
 
-# These configs are not part of the model config but the preprocessor
-# and processor files, so we hardcode them in the model file for now.
-CHAMELEON_CROP_SIZE_HEIGHT = CHAMELEON_CROP_SIZE_WIDTH = 512
-CHAMELEON_IMAGE_SEQ_LENGTH = 1024
-
 
 class ChameleonImagePixelInputs(TypedDict):
     type: Literal["pixel_values"]
@@ -55,14 +48,17 @@ class ChameleonImagePixelInputs(TypedDict):
     """Shape: `(batch_size * num_images, num_channels, height, width)`"""
 
 
-def get_max_chameleon_image_tokens(ctx: InputContext):
-    return CHAMELEON_IMAGE_SEQ_LENGTH
+class ChameleonMultiModalProcessor(BaseMultiModalProcessor):
 
+    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+        return {"image": 1}
 
-class ChameleonMultiModalProcessor(BaseMultiModalProcessor):
+    def _get_num_image_tokens(self) -> int:
+        processor = self._get_hf_processor()
+        return processor.image_seq_length
 
-    def _get_data_parser(self) -> MultiModalDataParser:
-        return MultiModalDataParser(max_mm_counts={"image": 1})
+    def get_mm_max_tokens_per_item(self) -> Mapping[str, int]:
+        return {"image": self._get_num_image_tokens()}
 
     def _get_hf_processor(self) -> ChameleonProcessor:
         return self.ctx.get_hf_processor(ChameleonProcessor)
@@ -88,7 +84,7 @@ def _get_prompt_replacements(
                 target="<image>",
                 replacement="".join([
                     processor.image_start_token,
-                    processor.image_token * CHAMELEON_IMAGE_SEQ_LENGTH,
+                    processor.image_token * self._get_num_image_tokens(),
                     processor.image_end_token,
                 ]),
             )
@@ -98,12 +94,15 @@ def _get_dummy_mm_inputs(
         self,
         mm_counts: Mapping[str, int],
     ) -> ProcessorInputs:
+        config = self.ctx.get_hf_config(ChameleonConfig)
+
+        width = height = config.vq_config.resolution
         num_images = mm_counts.get("image", 0)
 
         mm_data = {
             "image":
-            self._get_dummy_images(width=CHAMELEON_CROP_SIZE_WIDTH,
-                                   height=CHAMELEON_CROP_SIZE_HEIGHT,
+            self._get_dummy_images(width=width,
+                                   height=height,
                                    num_images=num_images)
         }
 
@@ -902,7 +901,6 @@ def forward(
         return hidden_states
 
 
-@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_chameleon_image_tokens)
 @MULTIMODAL_REGISTRY.register_processor(ChameleonMultiModalProcessor)
 class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal,
                                         SupportsPP):
@@ -931,9 +929,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
             self.model.make_empty_intermediate_tensors)
 
     def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor:
-
-        expected_dims = (3, CHAMELEON_CROP_SIZE_HEIGHT,
-                         CHAMELEON_CROP_SIZE_WIDTH)
+        vq_config: ChameleonVQVAEConfig = self.config.vq_config
+        expected_dims = (3, vq_config.resolution, vq_config.resolution)
         actual_dims = tuple(data.shape[1:])
 
         if actual_dims != expected_dims:
diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py
index 0a48fa3fe11c0..7fb8c5d1ab09c 100644
--- a/vllm/model_executor/models/fuyu.py
+++ b/vllm/model_executor/models/fuyu.py
@@ -25,7 +25,6 @@
 
 from vllm.attention import AttentionMetadata
 from vllm.config import VllmConfig
-from vllm.inputs import InputContext
 from vllm.model_executor.layers.linear import ColumnParallelLinear
 from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.model_executor.models.persimmon import PersimmonForCausalLM
@@ -34,7 +33,7 @@
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
                                     MultiModalInputsV2, MultiModalKwargs,
                                     NestedTensors, PlaceholderRange)
-from vllm.multimodal.parse import ImageProcessorItems, MultiModalDataParser
+from vllm.multimodal.parse import ImageProcessorItems, ImageSize
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                         MultiModalDataItems, ProcessorInputs,
                                         PromptReplacement)
@@ -48,9 +47,6 @@
 _IMAGE_TOKEN_ID = 71011
 _NEWLINE_TOKEN_ID = 71019
 
-MAX_IMAGE_FEATURE_SIZE_HEIGHT = 1080
-MAX_IMAGE_FEATURE_SIZE_WIDTH = 1920
-
 
 class FuyuImagePatchInputs(TypedDict):
     type: Literal["image_patches"]
@@ -67,43 +63,49 @@ class FuyuImagePatchInputs(TypedDict):
     """
 
 
-def _get_fuyu_num_image_tokens(
-    image_height: int,
-    image_width: int,
-) -> Tuple[int, int]:
-    """
-    Calculate the number of image tokens needed for a given image size.
-
-    The expected Fuyu image prompts can be expressed as:
-
-    .. code-block::
-        (image_token * ncols + newline_token) * nrows
-
-    Args:
-        image_size: Tuple[int, int] - `(width, height)` of the image
-
-    Returns:
-        ncols: int - number of image tokens in `x` direction
-        nrows: int - number of image tokens in `y` direction
-    """
-    ncols = math.ceil(image_width / 30)
-    nrows = math.ceil(image_height / 30)
-    return ncols, nrows
-
+class FuyuMultiModalProcessor(BaseMultiModalProcessor):
 
-def get_max_fuyu_image_tokens(ctx: InputContext):
-    ncols, nrows = _get_fuyu_num_image_tokens(
-        image_height=MAX_IMAGE_FEATURE_SIZE_HEIGHT,
-        image_width=MAX_IMAGE_FEATURE_SIZE_WIDTH,
-    )
+    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+        return {"image": 1}
 
-    return (ncols + 1) * nrows
+    def _get_image_target_size(self) -> ImageSize:
+        processor = self._get_hf_processor()
+        image_processor: FuyuImageProcessor = processor.image_processor
 
+        target_size = image_processor.size
+        return ImageSize(width=target_size["width"],
+                         height=target_size["height"])
 
-class FuyuMultiModalProcessor(BaseMultiModalProcessor):
+    def _get_image_grid_size(
+        self,
+        *,
+        image_width: int,
+        image_height: int,
+    ) -> tuple[int, int]:
+        target_width, target_height = self._get_image_target_size()
+
+        if not (image_width <= target_width and image_height <= target_height):
+            height_scale_factor = target_height / image_height
+            width_scale_factor = target_width / image_width
+            optimal_scale_factor = min(height_scale_factor, width_scale_factor)
+
+            image_height = int(image_height * optimal_scale_factor)
+            image_width = int(image_width * optimal_scale_factor)
+
+        ncols = math.ceil(image_width / 30)
+        nrows = math.ceil(image_height / 30)
+        return ncols, nrows
+
+    def get_mm_max_tokens_per_item(self) -> Mapping[str, int]:
+        target_width, target_height = self._get_image_target_size()
+
+        max_ncols, max_nrows = self._get_image_grid_size(
+            image_width=target_width,
+            image_height=target_height,
+        )
+        max_image_tokens = (max_ncols + 1) * max_nrows
 
-    def _get_data_parser(self) -> MultiModalDataParser:
-        return MultiModalDataParser(max_mm_counts={"image": 1})
+        return {"image": max_image_tokens}
 
     def _get_hf_processor(self) -> FuyuProcessor:
         return self.ctx.get_hf_processor(FuyuProcessor)
@@ -166,28 +168,13 @@ def _get_prompt_replacements(
         eot_token_id = tokenizer.bos_token_id
         assert isinstance(eot_token_id, int)
 
-        hf_processor = self._get_hf_processor()
-        image_processor: FuyuImageProcessor = hf_processor.image_processor
-        target_size = image_processor.size
-        target_height, target_width = (target_size["height"],
-                                       target_size["width"])
-
         def get_replacement_fuyu(item_idx: int):
             images = mm_items.get_items("image", ImageProcessorItems)
             image_size = images.get_image_size(item_idx)
-            width, height = image_size.width, image_size.height
-            if not (width <= target_width and height <= target_height):
-                height_scale_factor = target_height / height
-                width_scale_factor = target_width / width
-                optimal_scale_factor = min(height_scale_factor,
-                                           width_scale_factor)
-
-                height = int(height * optimal_scale_factor)
-                width = int(width * optimal_scale_factor)
-
-            ncols, nrows = _get_fuyu_num_image_tokens(
-                image_width=width,
-                image_height=height,
+
+            ncols, nrows = self._get_image_grid_size(
+                image_width=image_size.width,
+                image_height=image_size.height,
             )
 
             return (([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows +
@@ -225,12 +212,13 @@ def _get_dummy_mm_inputs(
         self,
         mm_counts: Mapping[str, int],
     ) -> ProcessorInputs:
+        target_width, target_height = self._get_image_target_size()
         num_images = mm_counts.get("image", 0)
 
         mm_data = {
             "image":
-            self._get_dummy_images(width=MAX_IMAGE_FEATURE_SIZE_WIDTH,
-                                   height=MAX_IMAGE_FEATURE_SIZE_HEIGHT,
+            self._get_dummy_images(width=target_width,
+                                   height=target_height,
                                    num_images=num_images)
         }
 
@@ -240,7 +228,6 @@ def _get_dummy_mm_inputs(
         )
 
 
-@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_fuyu_image_tokens)
 @MULTIMODAL_REGISTRY.register_processor(FuyuMultiModalProcessor)
 class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
 
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index 34dc7fa31ce6f..808e61edb6fb4 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -119,6 +119,12 @@ def get_max_llava_image_tokens(ctx: InputContext):
 
 class LlavaMultiModalProcessor(BaseMultiModalProcessor):
 
+    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+        return {"image": None}
+
+    def get_mm_max_tokens_per_item(self) -> Mapping[str, int]:
+        return {"image": get_max_llava_image_tokens(self.ctx)}
+
     def _get_hf_processor(self) -> Union[LlavaProcessor, PixtralProcessor]:
         return self.ctx.get_hf_processor((LlavaProcessor, PixtralProcessor))
 
@@ -324,7 +330,6 @@ def init_vision_tower_for_llava(
     raise NotImplementedError(msg)
 
 
-@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_llava_image_tokens)
 @MULTIMODAL_REGISTRY.register_processor(LlavaMultiModalProcessor)
 class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
     # BitandBytes specific attributes
@@ -649,7 +654,6 @@ def get_replacement_mantis(item_idx: int):
 
 # To use this model, please use
 # `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'`
-@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_llava_image_tokens)
 @MULTIMODAL_REGISTRY.register_processor(MantisMultiModalProcessor)
 class MantisForConditionalGeneration(LlavaForConditionalGeneration):
     pass
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index 15362db6cdfbf..d855e7d2d36f8 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -23,7 +23,6 @@
 
 from vllm.attention import AttentionMetadata
 from vllm.config import VllmConfig
-from vllm.inputs import InputContext
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
@@ -306,24 +305,31 @@ def add_image_newline(self, image_features_hd):
         return image_features_hd_newline
 
 
-def get_max_phi3v_image_tokens(
-    ctx: InputContext,
-    *,
-    num_crops: Optional[int] = None,
-) -> int:
-    hf_processor_mm_kwargs = {}
-    if num_crops:
-        hf_processor_mm_kwargs["num_crops"] = num_crops
+class Phi3VMultiModalProcessor(BaseMultiModalProcessor):
 
-    processor = ctx.get_hf_processor(**hf_processor_mm_kwargs)
+    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+        return {"image": None}
 
-    return processor.calc_num_image_tokens_from_image_size(
-        width=MAX_IMAGE_FEATURE_SIZE_WIDTH,
-        height=MAX_IMAGE_FEATURE_SIZE_HEIGHT,
-    )
+    def _get_num_image_tokens(
+        self,
+        *,
+        image_width: int,
+        image_height: int,
+    ) -> int:
+        processor = self._get_hf_processor()
+
+        return processor.calc_num_image_tokens_from_image_size(  # type: ignore
+            width=image_width,
+            height=image_height,
+        )
 
+    def get_mm_max_tokens_per_item(self) -> Mapping[str, int]:
+        max_image_tokens = self._get_num_image_tokens(
+            image_width=MAX_IMAGE_FEATURE_SIZE_WIDTH,
+            image_height=MAX_IMAGE_FEATURE_SIZE_HEIGHT,
+        )
 
-class Phi3VMultiModalProcessor(BaseMultiModalProcessor):
+        return {"image": max_image_tokens}
 
     def _get_hf_processor(
         self,
@@ -332,6 +338,7 @@ def _get_hf_processor(
     ) -> ProcessorMixin:
         if num_crops is not None:
             return self.ctx.get_hf_processor(num_crops=num_crops)
+
         return self.ctx.get_hf_processor()
 
     def _call_hf_processor(
@@ -375,7 +382,6 @@ def _get_prompt_replacements(
     ) -> list[PromptReplacement]:
         hf_processor = self._get_hf_processor()
         image_tokens: list[str] = hf_processor.img_tokens  # type: ignore
-        image_processor = hf_processor.image_processor  # type: ignore
 
         tokenizer = self._get_tokenizer()
         bos_token_id = tokenizer.bos_token_id
@@ -385,9 +391,9 @@ def get_replacement_phi3v(item_idx: int):
             images = mm_items.get_items("image", ImageProcessorItems)
             image_size = images.get_image_size(item_idx)
 
-            num_tokens = image_processor.calc_num_image_tokens_from_image_size(
-                width=image_size.width,
-                height=image_size.height,
+            num_tokens = self._get_num_image_tokens(
+                image_width=image_size.width,
+                image_height=image_size.height,
             )
 
             return [_IMAGE_TOKEN_ID] * num_tokens + [bos_token_id]
@@ -467,7 +473,6 @@ def apply(
         return result
 
 
-@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_phi3v_image_tokens)
 @MULTIMODAL_REGISTRY.register_processor(Phi3VMultiModalProcessor)
 class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
     hf_to_vllm_mapper = WeightsMapper(
diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py
index de55bc6bcc123..d050fd060353a 100644
--- a/vllm/model_executor/models/qwen2_audio.py
+++ b/vllm/model_executor/models/qwen2_audio.py
@@ -33,13 +33,12 @@
 
 from vllm.attention import AttentionMetadata
 from vllm.config import VllmConfig
-from vllm.inputs import InputContext
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
                                     NestedTensors)
-from vllm.multimodal.parse import MultiModalDataParser
+from vllm.multimodal.parse import AudioProcessorItems, MultiModalDataParser
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                         MultiModalDataItems, ProcessorInputs,
                                         PromptReplacement)
@@ -80,14 +79,17 @@ def _get_feat_extract_output_lengths(input_lengths: torch.Tensor):
     return feat_lengths, output_lengths
 
 
-def get_max_qwen2_audio_audio_tokens(ctx: InputContext) -> int:
-    hf_config = ctx.get_hf_config(Qwen2AudioConfig)
-    max_source_position = hf_config.audio_config.max_source_positions
-    output_lengths = (max_source_position - 2) // 2 + 1
-    return output_lengths
+class Qwen2AudioMultiModalProcessor(BaseMultiModalProcessor):
 
+    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+        return {"audio": None}
 
-class Qwen2AudioMultiModalProcessor(BaseMultiModalProcessor):
+    def get_mm_max_tokens_per_item(self) -> Mapping[str, int]:
+        hf_config = self.ctx.get_hf_config(Qwen2AudioConfig)
+        max_source_positions = hf_config.audio_config.max_source_positions
+        max_output_lengths = (max_source_positions - 2) // 2 + 1
+
+        return {"audio": max_output_lengths}
 
     def _get_hf_processor(
         self,
@@ -157,11 +159,21 @@ def _get_prompt_replacements(
             audio_output_lengths = []
         else:
             assert isinstance(feature_attention_mask, torch.Tensor)
-            _, audio_output_lengths = _get_feat_extract_output_lengths(
+            _, audio_output_lens = _get_feat_extract_output_lengths(
                 feature_attention_mask.sum(-1))
 
+            audio_output_lengths = audio_output_lens.tolist()
+
         def get_replacement_qwen2_audio(item_idx: int):
-            return [placeholder] * audio_output_lengths[item_idx]
+            num_placeholders = audio_output_lengths[item_idx]
+            if num_placeholders == 0:
+                audios = mm_items.get_items("audio", AudioProcessorItems)
+                audio = audios.get(item_idx)
+                raise ValueError(
+                    f"The audio {audio} (len={len(audio)}) is too short "
+                    "to be represented inside the model")
+
+            return [placeholder] * num_placeholders
 
         return [
             PromptReplacement(
@@ -171,6 +183,14 @@ def get_replacement_qwen2_audio(item_idx: int):
             )
         ]
 
+    def _always_apply_prompt_replacements(self) -> bool:
+        # HF never applies prompt replacements, so we have to do it ourselves
+        # _find_placeholders may incorrectly think that HF has already performed
+        # processing for multi-audio input when the input audios are short
+        # (the corresponding placeholders may take up fewer tokens than
+        # the number of audio items)
+        return True
+
     def _get_dummy_mm_inputs(
         self,
         mm_counts: Mapping[str, int],
@@ -192,8 +212,6 @@ def _get_dummy_mm_inputs(
         )
 
 
-@MULTIMODAL_REGISTRY.register_max_multimodal_tokens(
-    "audio", get_max_qwen2_audio_audio_tokens)
 @MULTIMODAL_REGISTRY.register_processor(Qwen2AudioMultiModalProcessor)
 class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal,
                                          SupportsPP):
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index 0df101b3dcce4..26b6d768ad4f6 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -40,7 +40,6 @@
 from vllm.config import VllmConfig
 from vllm.distributed import parallel_state
 from vllm.distributed import utils as dist_utils
-from vllm.inputs import InputContext
 from vllm.logger import init_logger
 from vllm.model_executor import SamplingMetadata
 from vllm.model_executor.layers.activation import QuickGELU
@@ -650,8 +649,9 @@ def _get_vision_info(
     width: int,
     min_pixels: int,
     max_pixels: int,
+    *,
     do_resize: bool = True,
-    data_type_key: str = "image",
+    modality: str = "image",
     mm_count: int = 1,
 ):
     """Get information (resized height / width and number of vision tokens)
@@ -671,11 +671,12 @@ def _get_vision_info(
     else:
         resized_height, resized_width = height, width
 
-    if data_type_key == "image":
+    if modality == "image":
         grid_t = mm_count
-    else:
-        assert data_type_key == "video"
+    elif modality == "video":
         grid_t = max(mm_count // temporal_patch_size, 1)
+    else:
+        raise ValueError(f"Modality {modality} is not supported")
 
     grid_h = resized_height // patch_size
     grid_w = resized_width // patch_size
@@ -691,41 +692,11 @@ def _get_image_processor(hf_processor: Qwen2VLProcessor):
     return image_processor
 
 
-def get_max_qwen2_vl_mm_tokens(ctx: InputContext,
-                               data_type_key: str,
-                               *,
-                               min_pixels: Optional[int] = None,
-                               max_pixels: Optional[int] = None) -> int:
-    hf_config = ctx.get_hf_config(Qwen2VLConfig)
-    vision_config = hf_config.vision_config
-
-    hf_processor = ctx.get_hf_processor(Qwen2VLProcessor)
-    image_processor = _get_image_processor(hf_processor)
-
-    _, _, max_llm_image_tokens = _get_vision_info(
-        vision_config,
-        height=9999999,
-        width=9999999,
-        min_pixels=min_pixels or image_processor.min_pixels,
-        max_pixels=max_pixels or image_processor.max_pixels,
-        data_type_key=data_type_key,
-    )
-    return max_llm_image_tokens
-
-
-get_max_qwen2_vl_image_tokens = partial(get_max_qwen2_vl_mm_tokens,
-                                        data_type_key="image")
-get_max_qwen2_vl_video_tokens = partial(get_max_qwen2_vl_mm_tokens,
-                                        data_type_key="video")
-
-
 class Qwen2EmbeddingItems(ModalityDataItems[dict[str, torch.Tensor],
                                             dict[str, torch.Tensor]]):
 
     def __init__(self, data: dict, modality: str) -> None:
-        super().__init__(data)
-
-        self.modality = modality
+        super().__init__(data, modality)
 
         grid_thw = data[f"{modality}_grid_thw"]
         slice_idxs = [0] + grid_thw.prod(-1).cumsum_(0).tolist()
@@ -734,9 +705,6 @@ def __init__(self, data: dict, modality: str) -> None:
             for i in range(len(grid_thw))
         ]
 
-    def __repr__(self) -> str:
-        return (f"{type(self).__name__}(modality={self.modality!r})")
-
     def get_count(self) -> int:
         return len(self.data[f"{self.modality}_grid_thw"])
 
@@ -792,6 +760,32 @@ def _parse_video_data(
 
 class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor):
 
+    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+        return {"image": None, "video": None}
+
+    def _get_max_mm_tokens(self, modality: str) -> int:
+        hf_config = self.ctx.get_hf_config(Qwen2VLConfig)
+        vision_config = hf_config.vision_config
+
+        hf_processor = self._get_hf_processor()
+        image_processor = _get_image_processor(hf_processor)
+
+        _, _, max_llm_image_tokens = _get_vision_info(
+            vision_config,
+            height=9999999,
+            width=9999999,
+            min_pixels=image_processor.min_pixels,
+            max_pixels=image_processor.max_pixels,
+            modality=modality,
+        )
+        return max_llm_image_tokens
+
+    def get_mm_max_tokens_per_item(self) -> Mapping[str, int]:
+        return {
+            "image": self._get_max_mm_tokens("image"),
+            "video": self._get_max_mm_tokens("video"),
+        }
+
     def _get_data_parser(self) -> MultiModalDataParser:
         return Qwen2MultiModalDataParser()
 
@@ -908,9 +902,6 @@ def _get_dummy_mm_inputs(
         )
 
 
-@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_qwen2_vl_image_tokens)
-@MULTIMODAL_REGISTRY.register_max_multimodal_tokens(
-    "video", get_max_qwen2_vl_video_tokens)
 @MULTIMODAL_REGISTRY.register_processor(Qwen2VLMultiModalProcessor)
 class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
                                       SupportsLoRA, SupportsPP):
diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py
index 54be7fed3f2be..0b83684c9bac5 100644
--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -2,7 +2,7 @@
 """PyTorch Ultravox model."""
 
 import math
-from functools import cached_property, lru_cache
+from functools import cached_property
 from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple,
                     TypedDict, Union)
 
@@ -17,7 +17,6 @@
 
 from vllm.attention import AttentionMetadata
 from vllm.config import VllmConfig
-from vllm.inputs import InputContext
 from vllm.model_executor.layers.activation import SiluAndMul, get_act_fn
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
@@ -58,22 +57,17 @@ class UltravoxAudioEmbeddingInputs(TypedDict):
                             UltravoxAudioEmbeddingInputs]
 
 
-@lru_cache
-def cached_feature_extractor(model_id: str) -> WhisperFeatureExtractor:
-    return WhisperFeatureExtractor.from_pretrained(model_id)
-
-
-def whisper_feature_extractor(ctx: InputContext) -> WhisperFeatureExtractor:
-    hf_config = ctx.get_hf_config(UltravoxConfig)
-    return cached_feature_extractor(hf_config.audio_model_id)
-
+class UltravoxMultiModalProcessor(BaseMultiModalProcessor):
 
-def get_ultravox_max_audio_tokens(ctx: InputContext):
-    feature_extractor = whisper_feature_extractor(ctx)
-    return math.ceil(feature_extractor.chunk_length * _AUDIO_TOKENS_PER_SECOND)
+    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+        return {"audio": None}
 
+    def get_mm_max_tokens_per_item(self) -> Mapping[str, int]:
+        feature_extractor = self._get_feature_extractor()
+        max_audio_tokens = math.ceil(feature_extractor.chunk_length *
+                                     _AUDIO_TOKENS_PER_SECOND)
 
-class UltravoxMultiModalProcessor(BaseMultiModalProcessor):
+        return {"audio": max_audio_tokens}
 
     def _get_hf_processor(
         self,
@@ -322,8 +316,6 @@ def forward(
         return hidden_states
 
 
-@MULTIMODAL_REGISTRY.register_max_multimodal_tokens(
-    "audio", get_ultravox_max_audio_tokens)
 @MULTIMODAL_REGISTRY.register_processor(UltravoxMultiModalProcessor)
 class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP):
 
diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py
index da111e999ebb8..4e1b78ab2c59d 100644
--- a/vllm/multimodal/parse.py
+++ b/vllm/multimodal/parse.py
@@ -21,10 +21,15 @@
 
 class ModalityDataItems(ABC, Generic[_T, _I]):
 
-    def __init__(self, data: _T) -> None:
+    def __init__(self, data: _T, modality: str) -> None:
         super().__init__()
 
         self.data = data
+        self.modality = modality
+
+    def __repr__(self) -> str:
+        return (f"{type(self).__name__}(modality={self.modality!r}, "
+                f"len={len(self)})")
 
     def __len__(self) -> int:
         return self.get_count()
@@ -64,14 +69,6 @@ def get_passthrough_data(self) -> Mapping[str, object]:
 
 class ProcessorBatchItems(ModalityDataItems[Sequence[_T], _T]):
 
-    def __init__(self, data: Sequence[_T], modality: str) -> None:
-        super().__init__(data)
-
-        self.modality = modality
-
-    def __repr__(self) -> str:
-        return (f"{type(self).__name__}(modality={self.modality!r})")
-
     def get_count(self) -> int:
         return len(self.data)
 
@@ -87,14 +84,6 @@ def get_passthrough_data(self) -> Mapping[str, object]:
 
 class EmbeddingItems(ModalityDataItems[NestedTensors, torch.Tensor]):
 
-    def __init__(self, data: NestedTensors, modality: str) -> None:
-        super().__init__(data)
-
-        self.modality = modality
-
-    def __repr__(self) -> str:
-        return (f"{type(self).__name__}(modality={self.modality!r})")
-
     def get_count(self) -> int:
         return len(self.data)
 
@@ -222,22 +211,13 @@ class MultiModalDataParser:
     Parses :class:`MultiModalDataDict` into :class:`MultiModalDataItems`.
 
     Args:
-        max_mm_counts (Mapping[str, int]): The maximum allowed number of items
-            belonging to each modality. This effectively sets a hard limit over
-            `--limit-mm-per-prompt`.
         target_sr (float, optional): Enables automatic resampling of audio
             items to the model's expected sampling rate.
     """
 
-    def __init__(
-        self,
-        *,
-        max_mm_counts: Mapping[str, int] = {},
-        target_sr: Optional[float] = None,
-    ) -> None:
+    def __init__(self, *, target_sr: Optional[float] = None) -> None:
         super().__init__()
 
-        self.max_mm_counts = max_mm_counts
         self.target_sr = target_sr
 
     def _is_embeddings(self, data: object) -> TypeGuard[NestedTensors]:
@@ -345,7 +325,6 @@ def _get_subparsers(self) -> Mapping[str, ModalityDataParser]:
 
     def parse_mm_data(self,
                       mm_data: MultiModalDataDict) -> MultiModalDataItems:
-        max_mm_counts = self.max_mm_counts
         subparsers = self._get_subparsers()
 
         mm_items = MultiModalDataItems()
@@ -353,16 +332,6 @@ def parse_mm_data(self,
             if k not in subparsers:
                 raise ValueError(f"Unsupported modality: {k}")
 
-            modality_items = subparsers[k](v)
-
-            if k in max_mm_counts:
-                max_count = max_mm_counts[k]
-                if len(modality_items) > max_count:
-                    raise ValueError(
-                        f"This model supports at most {max_count} {k} items "
-                        f"per prompt, but {len(modality_items)} {k} items "
-                        "were given or set as its limit_mm_per_prompt.")
-
-            mm_items[k] = modality_items
+            mm_items[k] = subparsers[k](v)
 
         return mm_items
diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index 7712c3bcebe20..76475ddda81f4 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -624,6 +624,29 @@ def __call__(
     ) -> MultiModalInputsV2:
         return self.apply(prompt, mm_data, hf_processor_mm_kwargs)
 
+    @abstractmethod
+    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+        """
+        Return the maximum supported number of items for each modality.
+
+        A value of `None` means unlimited number of items.
+
+        Omitting a modality from the returned dictionary means that
+        it is not supported at all.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def get_mm_max_tokens_per_item(self) -> Mapping[str, int]:
+        """
+        Get the maximum possible number of tokens per data item
+        for each modality.
+
+        The dictionary returned by this method should have the same
+        keys as that returned by :meth:`get_supported_mm_limits`.
+        """
+        raise NotImplementedError
+
     def _get_data_parser(self) -> MultiModalDataParser:
         """
         Construct a data parser to preprocess multi-modal data items
@@ -653,7 +676,18 @@ def _to_mm_items(
         before passing them to :meth:`_get_hf_mm_data`.
         """
         parser = self._get_data_parser()
-        return parser.parse_mm_data(mm_data)
+        mm_items = parser.parse_mm_data(mm_data)
+
+        mm_limits = self.ctx.get_mm_config().limit_per_prompt
+        for modality, items in mm_items.items():
+            limit = mm_limits.get(modality, 1)
+            if len(items) > limit:
+                raise ValueError(
+                    f"You set {modality}={limit} (or defaulted to 1) in "
+                    f"`--limit-mm-per-prompt`, but passed {len(items)} "
+                    f"{modality} items in the same prompt.")
+
+        return mm_items
 
     @abstractmethod
     def _get_mm_fields_config(
@@ -901,6 +935,17 @@ def _bind_prompt_replacements(
 
         return [prompt_repl.bind(tokenizer) for prompt_repl in prompt_repls]
 
+    def _always_apply_prompt_replacements(self) -> bool:
+        """
+        A flag which can be overridden so that
+        :meth:`_apply_prompt_replacements` is always called even if we
+        detect that HF has performed processing via :meth:`_find_placeholders`.
+
+        This is useful in cases where :meth:`_find_placeholders` cannot be
+        reliably used to detect whether HF has performed processing or not.
+        """
+        return False
+
     def _apply_prompt_replacements(
         self,
         token_ids: list[int],
@@ -995,7 +1040,7 @@ def apply(
         all_placeholders = self._find_placeholders(prompt_repls, prompt_ids,
                                                    mm_item_counts)
 
-        if all_placeholders:
+        if all_placeholders and not self._always_apply_prompt_replacements():
             tokenizer = self._get_tokenizer()
             prompt_text = _decode(tokenizer, prompt_ids)
         else:
@@ -1009,10 +1054,27 @@ def apply(
                 mm_item_counts,
             )
 
-        mm_placeholders = {
-            modality: [item.to_range() for item in items]
-            for modality, items in full_groupby_modality(all_placeholders)
-        }
+        mm_placeholders = dict[str, list[PlaceholderRange]]()
+        err_suffix = ("This suggests a problem with your implementation of "
+                      "the merged multi-modal processor for this model, "
+                      "particularly in the `_get_prompt_replacements` method.")
+
+        for modality, placeholders in full_groupby_modality(all_placeholders):
+            if modality not in mm_items:
+                raise AssertionError(
+                    f"Expected no placeholders for {modality=}, "
+                    f"but found {placeholders=}. Input items: {mm_items}"
+                    f"\n{err_suffix}")
+
+            if len(placeholders) != len(mm_items[modality]):
+                raise AssertionError(
+                    f"Expected length of {placeholders=} for {modality=} "
+                    f"to equal that of input items: {mm_items[modality]}"
+                    f"\n{err_suffix}")
+
+            mm_placeholders[modality] = [
+                item.to_range() for item in placeholders
+            ]
 
         return MultiModalInputsV2(
             type="multimodal",
@@ -1063,15 +1125,38 @@ def _get_dummy_mm_inputs(
         """
         raise NotImplementedError
 
-    def get_dummy_data(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-        mm_max_tokens: Mapping[str, int],
-    ) -> DummyData:
+    def _get_and_validate_dummy_mm_counts(self) -> Mapping[str, int]:
+        mm_limit_per_prompt = self.ctx.get_mm_config().limit_per_prompt
+        supported_mm_limits = self.get_supported_mm_limits()
+
+        mm_limits = {
+            modality: mm_limit_per_prompt.get(modality, 1)
+            for modality in supported_mm_limits
+        }
+
+        for modality, supported_limit in supported_mm_limits.items():
+            limit = mm_limits[modality]
+            if supported_limit is not None and supported_limit < limit:
+                raise ValueError(
+                    f"You set {modality}={limit} (or defaulted to 1) in "
+                    f"`--limit-mm-per-prompt`, but this model only supports "
+                    f"at most {supported_limit} {modality} items.")
+
+        return mm_limits
+
+    def get_dummy_data(self, seq_len: int) -> DummyData:
         # Avoid circular import
         from vllm.sequence import SequenceData
 
+        mm_counts = self._get_and_validate_dummy_mm_counts()
+        mm_max_tokens_per_item = self.get_mm_max_tokens_per_item()
+        if mm_counts.keys() != mm_max_tokens_per_item.keys():
+            raise AssertionError(
+                "The keys returned by `get_supported_mm_limits`"
+                f"({set(mm_counts.keys())}) should be the same as those "
+                "returned by `get_mm_max_tokens_per_item` "
+                f"({set(mm_max_tokens_per_item.keys())})")
+
         processor_inputs = self._get_dummy_mm_inputs(mm_counts)
         mm_inputs = self.apply(
             prompt_text=processor_inputs.prompt_text,
@@ -1087,7 +1172,7 @@ def get_dummy_data(
             for modality, placeholders in placeholders_by_modality.items()
         }
         expected_placeholders_by_modality = {
-            modality: mm_max_tokens[modality]
+            modality: mm_max_tokens_per_item[modality] * mm_counts[modality]
             for modality in placeholders_by_modality
         }
         if total_placeholders_by_modality != expected_placeholders_by_modality:
diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index 3a5e11867ad9e..073d49d7d2009 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -15,6 +15,7 @@
 from .image import ImagePlugin
 from .inputs import MultiModalDataDict, MultiModalKwargs, NestedTensors
 from .processing import BaseMultiModalProcessor, ProcessingCache
+from .utils import cached_get_tokenizer
 from .video import VideoPlugin
 
 if TYPE_CHECKING:
@@ -219,6 +220,10 @@ def get_max_tokens_per_item_by_modality(
         Note:
             This is currently directly used only in V1.
         """
+        if self.has_processor(model_config):
+            tokenizer = cached_get_tokenizer(model_config.tokenizer)
+            processor = self.create_processor(model_config, tokenizer)
+            return processor.get_mm_max_tokens_per_item()
 
         return {
             key: plugin.get_max_multimodal_tokens(model_config)

From 23c1b10a4c8cd77c5b13afa9242d67ffd055296b Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Thu, 2 Jan 2025 17:00:00 +0800
Subject: [PATCH 048/462] [VLM][Bugfix] Multi-modal processor compatible with
 V1 multi-input (#11674)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 vllm/multimodal/inputs.py     | 252 ++++++++++++++++------------------
 vllm/multimodal/processing.py |  45 +++---
 vllm/v1/engine/processor.py   |  22 ++-
 3 files changed, 151 insertions(+), 168 deletions(-)

diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py
index db489af7ac475..b0a1104546186 100644
--- a/vllm/multimodal/inputs.py
+++ b/vllm/multimodal/inputs.py
@@ -2,7 +2,8 @@
 from collections import UserDict, defaultdict
 from collections.abc import Mapping, Sequence
 from dataclasses import dataclass
-from typing import Any, Literal, TypedDict, TypeVar, Union, cast, final
+from typing import (Any, Literal, Optional, TypedDict, TypeVar, Union, cast,
+                    final)
 
 import numpy as np
 import torch
@@ -11,7 +12,7 @@
 from transformers import BatchFeature
 from typing_extensions import NotRequired, TypeAlias
 
-from vllm.utils import JSONTree, is_list_of, json_map_leaves
+from vllm.utils import JSONTree, full_groupby, is_list_of, json_map_leaves
 
 _T = TypeVar("_T")
 
@@ -160,11 +161,8 @@ def nested_tensors_equal(a: NestedTensors, b: NestedTensors) -> bool:
 
 
 @dataclass(frozen=True)
-class MultiModalFieldItem:
-    """
-    Contains metadata and data in :class:`MultiModalKwargs`
-    corresponding to a data item in :class:`MultiModalDataItems`.
-    """
+class MultiModalFieldElem:
+    """Contains metadata and data of an item in :class:`MultiModalKwargs`."""
     field: "BaseMultiModalField"
     data: NestedTensors
 
@@ -186,34 +184,34 @@ class BaseMultiModalField(ABC):
     def _reduce_data(self, batch: list[NestedTensors]) -> NestedTensors:
         raise NotImplementedError
 
-    def _build_item(self, data: NestedTensors) -> MultiModalFieldItem:
-        return MultiModalFieldItem(self, data)
+    def _build_elem(self, data: NestedTensors) -> MultiModalFieldElem:
+        return MultiModalFieldElem(self, data)
 
-    def reduce(self, batch: list[MultiModalFieldItem]) -> MultiModalFieldItem:
-        """Merge multiple instances of :class:`MultiModalFieldItem` together."""
+    def reduce(self, batch: list[MultiModalFieldElem]) -> MultiModalFieldElem:
+        """Merge multiple instances of :class:`MultiModalFieldElem` together."""
         fields = [item.field for item in batch]
         if len(set(fields)) > 1:
             raise ValueError(f"Cannot merge different {fields=}")
 
         data = self._reduce_data([item.data for item in batch])
 
-        return self._build_item(data)
+        return self._build_elem(data)
 
 
 @dataclass(frozen=True)
 class MultiModalBatchedField(BaseMultiModalField):
     """
-    A :class:`BaseMultiModalField` implementation where an item is obtained by
-    directly indexing into the first dimension of the underlying data.
+    A :class:`BaseMultiModalField` implementation where an element in the batch
+    is obtained by indexing into the first dimension of the underlying data.
     """
 
-    def build_items(self, batch: NestedTensors) -> list[MultiModalFieldItem]:
-        return [self._build_item(item) for item in batch]
+    def build_elems(self, batch: NestedTensors) -> list[MultiModalFieldElem]:
+        return [self._build_elem(item) for item in batch]
 
     def _reduce_data(self, batch: list[NestedTensors]) -> NestedTensors:
         if len(batch) > 0 and is_list_of(batch, torch.Tensor, check="all"):
             first_shape = batch[0].shape
-            if all(item.shape == first_shape for item in batch):
+            if all(elem.shape == first_shape for elem in batch):
                 return torch.stack(batch)
 
         return batch
@@ -222,24 +220,24 @@ def _reduce_data(self, batch: list[NestedTensors]) -> NestedTensors:
 @dataclass(frozen=True)
 class MultiModalFlatField(BaseMultiModalField):
     """
-    A :class:`BaseMultiModalField` implementation where an item is obtained by
-    slicing along the first dimension of the underlying data.
+    A :class:`BaseMultiModalField` implementation where an element in the batch
+    is obtained by slicing along the first dimension of the underlying data.
     """
 
-    def build_items(
+    def build_elems(
         self,
         batch: NestedTensors,
         slices: Sequence[slice],
-    ) -> list[MultiModalFieldItem]:
-        return [self._build_item(batch[slice_]) for slice_ in slices]
+    ) -> list[MultiModalFieldElem]:
+        return [self._build_elem(batch[slice_]) for slice_ in slices]
 
     def _reduce_data(self, batch: list[NestedTensors]) -> NestedTensors:
         if len(batch) > 0 and is_list_of(batch, torch.Tensor, check="all"):
             first_shape = batch[0].shape
-            if all(item.shape[1:] == first_shape[1:] for item in batch):
+            if all(elem.shape[1:] == first_shape[1:] for elem in batch):
                 return torch.concat(batch)
 
-        return [elem for item in batch for elem in item]
+        return [e for elem in batch for e in elem]
 
 
 class MultiModalFieldConfig:
@@ -267,115 +265,111 @@ def __init__(
     ) -> None:
         super().__init__()
 
-        self._field_cls = field_cls
-        self._modality = modality
-        self._field_config = field_config
+        self.field_cls = field_cls
+        self.modality = modality
+        self.field_config = field_config
 
-    def build_items(
+    def build_elems(
         self,
         key: str,
         batch: NestedTensors,
-    ) -> list[MultiModalFieldItem]:
-        field = self._field_cls(key=key, modality=self._modality)
-        return field.build_items(batch, **self._field_config)  # type: ignore
+    ) -> Sequence[MultiModalFieldElem]:
+        field = self.field_cls(key=key, modality=self.modality)
+        return field.build_elems(batch, **self.field_config)  # type: ignore
 
 
-class MultiModalKwargs(UserDict[str, NestedTensors]):
+class MultiModalKwargsItem(UserDict[str, MultiModalFieldElem]):
+    """
+    A collection of :class:`MultiModalFieldElem`
+    corresponding to a data item in :class:`MultiModalDataItems`.
     """
-    A dictionary that represents the keyword arguments to
-    :meth:`~torch.nn.Module.forward`.
 
-    The metadata :code:`items_by_key` defines how to split batched keyword
-    arguments corresponding to each data item in :class:`MultiModalDataItems`:
+    @staticmethod
+    def from_elems(elems: Sequence[MultiModalFieldElem]):
+        return MultiModalKwargsItem({elem.field.key: elem for elem in elems})
 
-    - For a keyword argument, we can access the :code:`i` th item in the batch
-      via :code:`items_by_key[key][i]`.
-    - We can gather the keyword arguments belonging to a modality by finding
-      the keys with items that belong to that modality, then accessing
-      the :code:`i` th item in the batch for each such key.
+    @property
+    def modality(self) -> str:
+        modalities = {elem.field.modality for elem in self.data.values()}
+        assert len(modalities) == 1, f"Found different modalities={modalities}"
+        return next(iter(modalities))
 
-    Example:
 
-        .. code-block:: python
-
-            # All items belong to the "image" modality
-            items_by_key={
-                "pixel_values": [a, b, c, d],  # "image" modality
-                "image_grid_thw": [e, f, g, h],  # "image" modality
-                "pixel_values_video": [h, i, j],  # "video" modality
-                "video_grid_thw": [k, l, m],  # "video" modality
-            }
+# NOTE: UserDict is for V0 compatibility.
+# V1 should access individual items via `get_item`.
+class MultiModalKwargs(UserDict[str, NestedTensors]):
+    """
+    A dictionary that represents the keyword arguments to
+    :meth:`~torch.nn.Module.forward`.
 
-        - The keyword arguments belonging to the first image are
-          :code:`{"pixel_values": a, "image_grid_thw": e}`.
-        - The keyword arguments belonging to the second video are
-          :code:`{"pixel_values_video": i, "video_grid_thw": l}`.
+    The metadata :code:`items` enables us to obtain the keyword arguments
+    corresponding to each data item in :class:`MultiModalDataItems`, via
+    :meth:`get_item` and :meth:`get_items`.
     """
 
     @staticmethod
     def from_hf_inputs(
         hf_inputs: BatchFeature,
         config_by_key: Mapping[str, MultiModalFieldConfig],
-        *,
-        enable_sanity_checks: bool = False,
     ):
         # NOTE: This skips fields in `hf_inputs` that are not in `config_by_key`
         # We assume that those fields are not used in vLLM
-        items_by_key = {
-            key: config.build_items(key, batch)
-            for key, config in config_by_key.items()
-            if (batch := hf_inputs.get(key)) is not None
-        }
-
-        return MultiModalKwargs.from_items_by_key(
-            items_by_key,
-            enable_sanity_checks=enable_sanity_checks,
-        )
+        elems_by_key = dict[str, Sequence[MultiModalFieldElem]]()
+        keys_by_modality = defaultdict[str, set[str]](set)
+        for key, config in config_by_key.items():
+            batch = hf_inputs.get(key)
+            if batch is not None:
+                elems = config.build_elems(key, batch)
+                if len(elems) > 0:
+                    elems_by_key[key] = elems
+                    keys_by_modality[config.modality].add(key)
+
+        items = list[MultiModalKwargsItem]()
+        for modality, keys in keys_by_modality.items():
+            elems_in_modality = {k: elems_by_key[k] for k in keys}
+            batch_sizes = {k: len(v) for k, v in elems_in_modality.items()}
+
+            if len(set(batch_sizes.values())) > 1:
+                raise ValueError(
+                    f"Cannot merge different batch sizes for {modality=}! "
+                    f"Found: {batch_sizes=}")
+
+            batch_size = next(iter(batch_sizes.values()))
+            for item_idx in range(batch_size):
+                elems = [v[item_idx] for v in elems_in_modality.values()]
+                items.append(MultiModalKwargsItem.from_elems(elems))
+
+        return MultiModalKwargs.from_items(items)
 
     @staticmethod
-    def from_items_by_key(
-        items_by_key: Mapping[str, list[MultiModalFieldItem]],
-        *,
-        enable_sanity_checks: bool = False,
-    ) -> "MultiModalKwargs":
+    def from_items(items: Sequence[MultiModalKwargsItem]):
+        """Construct a new :class:`MultiModalKwargs` from multiple items."""
+        elems_by_key = defaultdict[str, list[MultiModalFieldElem]](list)
+        for item in items:
+            for key, elem in item.items():
+                elems_by_key[key].append(elem)
+
         data = {
-            key: items[0].field.reduce(items).data
-            for key, items in items_by_key.items() if len(items) > 0
+            key: elems[0].field.reduce(elems).data
+            for key, elems in elems_by_key.items() if len(elems) > 0
         }
 
-        return MultiModalKwargs(data,
-                                items_by_key=items_by_key,
-                                enable_sanity_checks=enable_sanity_checks)
+        return MultiModalKwargs(data, items=items)
 
     def __init__(
         self,
         data: Mapping[str, NestedTensors],
         *,
-        items_by_key: Mapping[str, list[MultiModalFieldItem]] = {},
-        enable_sanity_checks: bool = False,
+        items: Optional[Sequence[MultiModalKwargsItem]] = None,
     ) -> None:
         super().__init__(data)
 
-        # Shallow copy to avoid footgun in case a defaultdict is passed in
-        self._items_by_key = dict(items_by_key)
+        items_by_modality = full_groupby(items or [], key=lambda x: x.modality)
+        self._items_by_modality = dict(items_by_modality)
 
-        keys_by_modality = defaultdict[str, set[str]](set)
-        for key, items in items_by_key.items():
-            for item in items:
-                keys_by_modality[item.field.modality].add(key)
-
-        self._keys_by_modality = dict(keys_by_modality)
-
-        if enable_sanity_checks:
-            for modality, keys in keys_by_modality.items():
-                items_in_modality = {k: items_by_key[k] for k in keys}
-                batch_sizes = {k: len(v) for k, v in items_in_modality.items()}
-                batch_size = next(iter(batch_sizes.values()), 0)
-                assert all(bs == batch_size
-                           for bs in batch_sizes.values()), dict(
-                               modality=modality,
-                               batch_sizes=batch_sizes,
-                               items_by_key=items_by_key)
+    @property
+    def modalities(self):
+        return self._items_by_modality.keys()
 
     @staticmethod
     def _try_stack(nested_tensors: NestedTensors) -> NestedTensors:
@@ -452,58 +446,44 @@ def as_kwargs(
     def __eq__(self, other: object) -> bool:
         if not isinstance(other, self.__class__):
             return False
-        if self._items_by_key != other._items_by_key:
+        if self._items_by_modality != other._items_by_modality:
             return False
 
         ks = self.keys()
         return (ks == other.keys()
                 and all(nested_tensors_equal(self[k], other[k]) for k in ks))
 
-    def get_item(self, key: str, item_index: int) -> MultiModalFieldItem:
-        return self._items_by_key[key][item_index]
+    def _validate_modality(self, method_name: str, modality: str) -> None:
+        if not self._items_by_modality:
+            raise RuntimeError(
+                f"`{method_name}` is not supported when "
+                "MultiModalKwargs is not initialized with `items`")
 
-    def get_items_by_modality(
-        self,
-        modality: str,
-        item_index: int,
-    ) -> Mapping[str, MultiModalFieldItem]:
-        """
-        Get the keyword arguments corresponding to an item identified by
-        its modality and index.
-        """
-        if modality not in self._keys_by_modality:
-            available_modalities = set(self._keys_by_modality.keys())
+        if modality not in self._items_by_modality:
+            available_modalities = set(self._items_by_modality.keys())
             raise KeyError(f"Modality {modality!r} not found. "
                            f"Available modalities: {available_modalities}")
 
-        keys_to_gather = self._keys_by_modality[modality]
+    def get_item_count(self, modality: str) -> int:
+        """Get the number of items belonging to a modality."""
+        self._validate_modality("get_item_count", modality)
+        return len(self._items_by_modality[modality])
 
-        return {
-            key: self.get_item(key, item_index)
-            for key in keys_to_gather if key in self
-        }
+    def get_item(self, modality: str, item_index: int) -> MultiModalKwargsItem:
+        """
+        Get the keyword arguments corresponding to an item identified by
+        its modality and index.
+        """
+        self._validate_modality("get_item", modality)
+        return self._items_by_modality[modality][item_index]
 
-    @staticmethod
-    def from_items_by_modality(
-        items_by_modality: Mapping[str, list[Mapping[str,
-                                                     MultiModalFieldItem]]],
-        *,
-        enable_sanity_checks: bool = False,
-    ) -> "MultiModalKwargs":
+    def get_items(self, modality: str) -> Sequence[MultiModalKwargsItem]:
         """
-        Construct a new :class:`MultiModalKwargs` from multiple items returned
-        by :meth:`get_fields_by_modality`.
+        Get the keyword arguments corresponding to each item belonging to
+        a modality.
         """
-        items_by_key = defaultdict[str, list[MultiModalFieldItem]](list)
-        for fields in items_by_modality.values():
-            for field in fields:
-                for k, v in field.items():
-                    items_by_key[k].append(v)
-
-        return MultiModalKwargs.from_items_by_key(
-            items_by_key,
-            enable_sanity_checks=enable_sanity_checks,
-        )
+        self._validate_modality("get_items", modality)
+        return self._items_by_modality[modality]
 
 
 MultiModalPlaceholderDict = Mapping[str, Sequence[PlaceholderRange]]
diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index 76475ddda81f4..64cdacfb4c574 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -20,8 +20,8 @@
 from vllm.utils import LRUCache, flatten_2d_lists, full_groupby
 
 from .inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                     MultiModalFieldItem, MultiModalInputsV2, MultiModalKwargs,
-                     PlaceholderRange)
+                     MultiModalInputsV2, MultiModalKwargs,
+                     MultiModalKwargsItem, PlaceholderRange)
 from .parse import MultiModalDataItems, MultiModalDataParser
 
 logger = init_logger(__name__)
@@ -496,8 +496,7 @@ def __init__(self, capacity: int) -> None:
         # DEBUG: Set to None to disable
         self.debug_cache_hit_ratio_steps: Optional[int] = None
 
-        self._cache = LRUCache[str, Mapping[str,
-                                            MultiModalFieldItem]](capacity)
+        self._cache = LRUCache[str, MultiModalKwargsItem](capacity)
 
     def _maybe_log_cache_stats(self) -> None:
         steps = self.debug_cache_hit_ratio_steps
@@ -565,7 +564,7 @@ def get(
         modality: str,
         input_item: object,
         input_kwargs: Mapping[str, object],
-    ) -> Optional[Mapping[str, MultiModalFieldItem]]:
+    ) -> Optional[MultiModalKwargsItem]:
         """
         Get a processed multi-modal item from the cache
         according to its dependencies, including:
@@ -588,7 +587,7 @@ def put(
         modality: str,
         input_item: object,
         input_kwargs: Mapping[str, object],
-        output_kwargs: Mapping[str, MultiModalFieldItem],
+        output_kwargs: MultiModalKwargsItem,
     ) -> None:
         """
         Put a processed multi-modal item into the cache
@@ -784,7 +783,6 @@ def _apply_hf_processor(
         mm_kwargs = MultiModalKwargs.from_hf_inputs(
             processed_data,
             self._get_mm_fields_config(processed_data, hf_processor_mm_kwargs),
-            enable_sanity_checks=self.enable_sanity_checks,
         )
 
         return prompt_ids, mm_kwargs
@@ -846,7 +844,7 @@ def _cached_apply_hf_processor(
                 hf_processor_mm_kwargs=hf_processor_mm_kwargs,
             )
 
-        mm_maybe_cached_field_items = {
+        mm_maybe_cached_kw_items = {
             modality: [
                 cache.get(model_id, modality, item, hf_processor_mm_kwargs)
                 for item in items
@@ -855,8 +853,9 @@ def _cached_apply_hf_processor(
         }
 
         mm_missing_idxs = {
-            modality: [idx for idx, out in enumerate(fields) if out is None]
-            for modality, fields in mm_maybe_cached_field_items.items()
+            modality:
+            [idx for idx, item in enumerate(kw_items) if item is None]
+            for modality, kw_items in mm_maybe_cached_kw_items.items()
         }
         mm_missing_data = {
             modality: [mm_data_items[modality][idx] for idx in idxs]
@@ -875,14 +874,11 @@ def _cached_apply_hf_processor(
             for modality in mm_missing_data_items
         }
 
-        mm_merged_field_items = dict[str, list[Mapping[str,
-                                                       MultiModalFieldItem]]]()
-        for modality, modal_items_lst in mm_maybe_cached_field_items.items():
-            merged_modal_items_lst = list[Mapping[str, MultiModalFieldItem]]()
-
-            for idx, modal_items in enumerate(modal_items_lst):
-                if modal_items is None:
-                    modal_items = mm_missing_kwargs.get_items_by_modality(
+        merged_kw_items = list[MultiModalKwargsItem]()
+        for modality, kw_items in mm_maybe_cached_kw_items.items():
+            for idx, kw_item in enumerate(kw_items):
+                if kw_item is None:
+                    kw_item = mm_missing_kwargs.get_item(
                         modality,
                         mm_missing_next_idx[modality],
                     )
@@ -892,14 +888,12 @@ def _cached_apply_hf_processor(
                         modality,
                         mm_data_items[modality][idx],
                         hf_processor_mm_kwargs,
-                        modal_items,
+                        kw_item,
                     )
 
                     mm_missing_next_idx[modality] += 1
 
-                merged_modal_items_lst.append(modal_items)
-
-            mm_merged_field_items[modality] = merged_modal_items_lst
+                merged_kw_items.append(kw_item)
 
         if self.enable_sanity_checks:
             mm_missing_counts = mm_missing_data_items.get_all_counts()
@@ -909,10 +903,7 @@ def _cached_apply_hf_processor(
                     mm_missing_next_idx=mm_missing_next_idx,
                     mm_missing_counts=mm_missing_counts)
 
-        mm_kwargs = MultiModalKwargs.from_items_by_modality(
-            mm_merged_field_items,
-            enable_sanity_checks=self.enable_sanity_checks,
-        )
+        mm_kwargs = MultiModalKwargs.from_items(merged_kw_items)
 
         if self.enable_sanity_checks:
             mm_item_counts = mm_data_items.get_all_counts()
@@ -920,7 +911,7 @@ def _cached_apply_hf_processor(
             for modality, item_count in mm_item_counts.items():
                 for item_idx in range(item_count):
                     try:
-                        mm_kwargs.get_items_by_modality(modality, item_idx)
+                        mm_kwargs.get_item(modality, item_idx)
                     except Exception as e:
                         # Make it easy to set a breakpoint in the debugger
                         raise e
diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py
index 5b5a5a61cea7d..905d3d1fc3e1c 100644
--- a/vllm/v1/engine/processor.py
+++ b/vllm/v1/engine/processor.py
@@ -113,15 +113,27 @@ def process_inputs(
 
         # For merged preprocessor, mm_data is already mm_inputs
         precomputed_mm_inputs = None
-        if isinstance(decoder_inputs.multi_modal_data, MultiModalKwargs):
-            precomputed_mm_inputs = [decoder_inputs.multi_modal_data]
+        decoder_mm_data = decoder_inputs.multi_modal_data
+        if isinstance(decoder_mm_data, MultiModalKwargs):
+            # The output of merged multi-modal processor (`decoder_mm_data`)
+            # contains the kwargs for all items from all modalities.
+            # This code separates them so that there is one set of kwargs
+            # per item per modality.
+            precomputed_mm_inputs = [
+                MultiModalKwargs.from_items([item])
+                for modality in decoder_mm_data.modalities
+                for item in decoder_mm_data.get_items(modality)
+            ]
 
         # Apply MM mapper
         mm_inputs = None
-        if len(decoder_inputs.multi_modal_data) > 0:
+        if len(decoder_mm_data) > 0:
             mm_inputs = self.mm_input_mapper_client.process_inputs(
-                decoder_inputs.multi_modal_data, mm_hashes,
-                decoder_inputs.mm_processor_kwargs, precomputed_mm_inputs)
+                decoder_mm_data,
+                mm_hashes,
+                decoder_inputs.mm_processor_kwargs,
+                precomputed_mm_inputs,
+            )
 
         return EngineCoreRequest(
             request_id,

From b6087a6beead9165f4c77ceba592b3651bb37de9 Mon Sep 17 00:00:00 2001
From: Tobias Pitters <31857876+CloseChoice@users.noreply.github.com>
Date: Thu, 2 Jan 2025 17:18:15 +0100
Subject: [PATCH 049/462] [mypy] Pass type checking in vllm/inputs (#11680)

Signed-off-by: Tobias Pitters <tobias.pitters@gmail.com>
---
 tools/mypy.sh             |  1 +
 vllm/inputs/data.py       | 21 +++++++++++----------
 vllm/inputs/preprocess.py |  6 +++---
 vllm/inputs/registry.py   |  2 +-
 4 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/tools/mypy.sh b/tools/mypy.sh
index 2454ff9fde466..bf95e4c526fd1 100755
--- a/tools/mypy.sh
+++ b/tools/mypy.sh
@@ -23,6 +23,7 @@ run_mypy vllm/compilation
 run_mypy vllm/distributed
 run_mypy vllm/engine
 run_mypy vllm/executor
+run_mypy vllm/inputs
 run_mypy vllm/lora
 run_mypy vllm/model_executor
 run_mypy vllm/plugins
diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py
index d54cbb5c37819..cdaf6dd76eaa1 100644
--- a/vllm/inputs/data.py
+++ b/vllm/inputs/data.py
@@ -250,7 +250,7 @@ def prompt(self) -> Optional[str]:
         if inputs["type"] == "token" or inputs["type"] == "multimodal":
             return inputs.get("prompt")
 
-        assert_never(inputs)
+        assert_never(inputs)  # type: ignore[arg-type]
 
     @cached_property
     def prompt_token_ids(self) -> List[int]:
@@ -259,7 +259,7 @@ def prompt_token_ids(self) -> List[int]:
         if inputs["type"] == "token" or inputs["type"] == "multimodal":
             return inputs.get("prompt_token_ids", [])
 
-        assert_never(inputs)
+        assert_never(inputs)  # type: ignore[arg-type]
 
     @cached_property
     def token_type_ids(self) -> List[int]:
@@ -268,7 +268,7 @@ def token_type_ids(self) -> List[int]:
         if inputs["type"] == "token" or inputs["type"] == "multimodal":
             return inputs.get("token_type_ids", [])
 
-        assert_never(inputs)
+        assert_never(inputs)  # type: ignore[arg-type]
 
     @cached_property
     def prompt_embeds(self) -> Optional[torch.Tensor]:
@@ -277,7 +277,7 @@ def prompt_embeds(self) -> Optional[torch.Tensor]:
         if inputs["type"] == "token" or inputs["type"] == "multimodal":
             return None
 
-        assert_never(inputs)
+        assert_never(inputs)  # type: ignore[arg-type]
 
     @cached_property
     def multi_modal_data(self) -> "MultiModalDataDict":
@@ -289,7 +289,7 @@ def multi_modal_data(self) -> "MultiModalDataDict":
         if inputs["type"] == "multimodal":
             return inputs.get("mm_kwargs", {})
 
-        assert_never(inputs)
+        assert_never(inputs)  # type: ignore[arg-type]
 
     @cached_property
     def multi_modal_inputs(self) -> Union[Dict, "MultiModalKwargs"]:
@@ -301,7 +301,7 @@ def multi_modal_inputs(self) -> Union[Dict, "MultiModalKwargs"]:
         if inputs["type"] == "multimodal":
             return inputs.get("mm_kwargs", {})
 
-        assert_never(inputs)
+        assert_never(inputs)  # type: ignore[arg-type]
 
     @cached_property
     def multi_modal_hashes(self) -> List[str]:
@@ -311,9 +311,10 @@ def multi_modal_hashes(self) -> List[str]:
             return inputs.get("multi_modal_hashes", [])
 
         if inputs["type"] == "multimodal":
-            return inputs.get("mm_hashes", [])
+            # only the case when we use MultiModalInputsV2
+            return inputs.get("mm_hashes", [])  # type: ignore[return-value]
 
-        assert_never(inputs)
+        assert_never(inputs)  # type: ignore[arg-type]
 
     @cached_property
     def multi_modal_placeholders(self) -> "MultiModalPlaceholderDict":
@@ -325,7 +326,7 @@ def multi_modal_placeholders(self) -> "MultiModalPlaceholderDict":
         if inputs["type"] == "multimodal":
             return inputs.get("mm_placeholders", {})
 
-        assert_never(inputs)
+        assert_never(inputs)  # type: ignore[arg-type]
 
     @cached_property
     def mm_processor_kwargs(self) -> Dict[str, Any]:
@@ -337,7 +338,7 @@ def mm_processor_kwargs(self) -> Dict[str, Any]:
         if inputs["type"] == "multimodal":
             return {}
 
-        assert_never(inputs)
+        assert_never(inputs)  # type: ignore[arg-type]
 
 
 ProcessorInputs = Union[DecoderOnlyInputs, EncoderDecoderInputs]
diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py
index 3d606817e90aa..aaa10d278ddb0 100644
--- a/vllm/inputs/preprocess.py
+++ b/vllm/inputs/preprocess.py
@@ -436,7 +436,7 @@ def _build_enc_dec_llm_inputs(
                 or encoder_inputs["type"] == "multimodal"):
             pass
         else:
-            assert_never(encoder_inputs)
+            assert_never(encoder_inputs)  # type: ignore[arg-type]
 
         if decoder_inputs is None:
             dec_token_ids = self._prepare_decoder_input_ids_for_generation(
@@ -452,7 +452,7 @@ def _build_enc_dec_llm_inputs(
                 raise ValueError("Multi-modal decoder inputs of encoder-"
                                  "decoder models are not supported yet")
         else:
-            assert_never(encoder_inputs)
+            assert_never(encoder_inputs)  # type: ignore[arg-type]
 
         return EncoderDecoderInputs(
             encoder=encoder_inputs,
@@ -569,7 +569,7 @@ def _build_decoder_only_llm_inputs(
                 prompt_adapter_request=prompt_adapter_request,
             )
         else:
-            assert_never(prompt_inputs)
+            assert_never(prompt_inputs)  # type: ignore[arg-type]
 
         return prompt_inputs
 
diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py
index 090347706ca93..2d9d024e03e80 100644
--- a/vllm/inputs/registry.py
+++ b/vllm/inputs/registry.py
@@ -419,7 +419,7 @@ def _ensure_mm_kwargs(
             # Be more strict in V2
             assert "mm_kwargs" in inputs
         else:
-            assert_never(inputs["type"])
+            assert_never(inputs["type"])  # type: ignore[arg-type]
 
     def process_input(self, model_config: "ModelConfig",
                       inputs: ProcessorInputs) -> ProcessorInputs:

From 8c38ee7007c50ac5aef9ed43ae91c6f031799c40 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Fri, 3 Jan 2025 00:39:27 +0800
Subject: [PATCH 050/462] [VLM] Merged multi-modal processor for LLaVA-NeXT
 (#11682)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 .../mm_processor_kwargs/test_llava_next.py    |  70 ----
 tests/multimodal/test_mapper.py               | 118 -------
 tests/multimodal/test_processing.py           |  97 +++++
 .../vllm_add_dummy_model/my_llava.py          |   4 +-
 vllm/model_executor/models/clip.py            |  25 ++
 vllm/model_executor/models/fuyu.py            |   6 +-
 vllm/model_executor/models/llava.py           | 334 +++++++++++-------
 vllm/model_executor/models/llava_next.py      | 321 ++++++-----------
 vllm/model_executor/models/phi3v.py           |  24 +-
 vllm/model_executor/models/pixtral.py         |  66 +++-
 vllm/model_executor/models/siglip.py          |  25 ++
 vllm/model_executor/models/utils.py           |   2 +-
 vllm/model_executor/models/vision.py          |  52 +++
 vllm/multimodal/parse.py                      |  12 +-
 14 files changed, 605 insertions(+), 551 deletions(-)
 delete mode 100644 tests/models/decoder_only/vision_language/mm_processor_kwargs/test_llava_next.py
 delete mode 100644 tests/multimodal/test_mapper.py
 create mode 100644 vllm/model_executor/models/vision.py

diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_llava_next.py b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_llava_next.py
deleted file mode 100644
index 51c0085101dd0..0000000000000
--- a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_llava_next.py
+++ /dev/null
@@ -1,70 +0,0 @@
-import pytest
-
-from vllm.inputs import InputContext
-
-from ....utils import build_model_context
-
-
-@pytest.fixture()
-def get_max_llava_next_image_tokens():
-    from vllm.model_executor.models.llava_next import (
-        get_max_llava_next_image_tokens)
-    return get_max_llava_next_image_tokens
-
-
-@pytest.fixture()
-def dummy_data_for_llava_next():
-    from vllm.model_executor.models.llava_next import dummy_data_for_llava_next
-    return dummy_data_for_llava_next
-
-
-@pytest.mark.parametrize("gridpoints,expected_max_tokens", [
-    ([[336, 336]], 1176),
-    ([[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]], 2928),
-])
-def test_get_max_llava_next_image_tokens(gridpoints, expected_max_tokens,
-                                         get_max_llava_next_image_tokens):
-    ctx = build_model_context(model_name="llava-hf/llava-v1.6-mistral-7b-hf")
-
-    # Update the config image_grid_pinpoints
-    # and calculate the resulting max tokens
-    ctx.model_config.hf_config.image_grid_pinpoints = gridpoints
-
-    actual_max_tokens = get_max_llava_next_image_tokens(
-        InputContext(ctx.model_config))
-
-    assert expected_max_tokens == actual_max_tokens
-
-
-@pytest.mark.parametrize(
-    "gridpoints,expected_size",
-    [
-        # One point; it has to be the largest
-        ([[336, 336]], (336, 336)),
-        # Default for most llava next models; the 2x2 tile is the largest
-        ([[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]],
-         (672, 672)),
-        # If two rectangular gridpoints are the same, the more vertical
-        # one has the higher feature count due to newline features
-        ([[336, 672], [672, 336]], (672, 336))
-    ])
-def test_dummy_data_for_llava_next_feature_size(dummy_data_for_llava_next,
-                                                gridpoints, expected_size):
-    ctx = build_model_context(model_name="llava-hf/llava-v1.6-mistral-7b-hf")
-
-    # Update the config image_grid_pinpoints
-    ctx.model_config.hf_config.image_grid_pinpoints = gridpoints
-    seq_len = 5000  # bigger than the max feature size for any image
-
-    dummy_data = dummy_data_for_llava_next(
-        ctx,
-        seq_len=seq_len,
-        mm_counts={"image": 1},
-    )
-    seq_data = dummy_data.seq_data
-    mm_data = dummy_data.multi_modal_data
-
-    # The dummy data dims should match the gridpoint with the biggest feat size
-    assert mm_data["image"].height == expected_size[0]
-    assert mm_data["image"].width == expected_size[1]
-    assert len(seq_data.get_token_ids()) >= seq_len
diff --git a/tests/multimodal/test_mapper.py b/tests/multimodal/test_mapper.py
deleted file mode 100644
index 81f2a06182bcc..0000000000000
--- a/tests/multimodal/test_mapper.py
+++ /dev/null
@@ -1,118 +0,0 @@
-from contextlib import nullcontext
-
-import numpy as np
-import pytest
-from transformers import LlavaNextImageProcessor
-
-from vllm.config import ModelConfig
-from vllm.multimodal import MultiModalRegistry
-from vllm.multimodal.image import rescale_image_size
-
-
-@pytest.fixture
-def mm_registry():
-    return MultiModalRegistry()
-
-
-@pytest.mark.parametrize("dtype", ["half", "float"])
-@pytest.mark.parametrize("size_factor", [0.25, 0.5, 1.0])
-def test_llava_next_image_processor(image_assets, mm_registry, dtype,
-                                    size_factor):
-    MODEL_NAME = "llava-hf/llava-v1.6-vicuna-7b-hf"
-
-    hf_processor = LlavaNextImageProcessor.from_pretrained(MODEL_NAME)
-    assert isinstance(hf_processor, LlavaNextImageProcessor)
-
-    model_config = ModelConfig(
-        model=MODEL_NAME,
-        task="auto",
-        tokenizer=MODEL_NAME,
-        tokenizer_mode="auto",
-        trust_remote_code=False,
-        seed=0,
-        dtype=dtype,
-        revision=None,
-        limit_mm_per_prompt={"image": 1},
-    )
-
-    mm_registry.init_mm_limits_per_prompt(model_config)
-
-    for asset in image_assets:
-        image = rescale_image_size(asset.pil_image, size_factor)
-
-        hf_result = hf_processor.preprocess(
-            image,
-            return_tensors="pt",
-        )
-        vllm_result = mm_registry.map_input(
-            model_config,
-            {"image": image},
-        )
-
-        assert hf_result.keys() == vllm_result.keys()
-        for key, hf_tensor in hf_result.items():
-            hf_arr: np.ndarray = hf_tensor.numpy()
-            vllm_arr: np.ndarray = vllm_result[key].numpy()
-
-            assert hf_arr.shape == vllm_arr.shape, f"Failed for key={key}"
-            assert np.allclose(hf_arr, vllm_arr), f"Failed for key={key}"
-
-
-@pytest.mark.parametrize(
-    ("num_images", "limit", "is_valid"),
-    [(0, 0, True), (0, 1, True), (1, 0, False), (1, 1, True), (1, 2, True),
-     (2, 1, False), (2, 2, True)],
-)
-def test_mm_limits(image_assets, mm_registry, num_images, limit, is_valid):
-    MODEL_NAME = "llava-hf/llava-v1.6-mistral-7b-hf"
-
-    model_config = ModelConfig(
-        model=MODEL_NAME,
-        task="auto",
-        tokenizer=MODEL_NAME,
-        tokenizer_mode="auto",
-        trust_remote_code=False,
-        seed=0,
-        dtype="half",
-        revision=None,
-        limit_mm_per_prompt={"image": limit},
-    )
-
-    mm_registry.init_mm_limits_per_prompt(model_config)
-
-    image = image_assets[0].pil_image
-    if num_images == 0:
-        mm_inputs = {}
-    elif num_images == 1:
-        mm_inputs = {"image": image}
-    else:
-        mm_inputs = {"image": [image] * num_images}
-
-    with nullcontext() if is_valid else pytest.raises(ValueError):
-        mm_registry.map_input(model_config, mm_inputs)
-
-
-# NOTE: We don't test zero images since the HF processor doesn't support it
-@pytest.mark.parametrize("num_images", [1, 2])
-def test_image_mapper_multi(image_assets, mm_registry, num_images):
-    MODEL_NAME = "llava-hf/llava-v1.6-mistral-7b-hf"
-
-    model_config = ModelConfig(
-        model=MODEL_NAME,
-        task="auto",
-        tokenizer=MODEL_NAME,
-        tokenizer_mode="auto",
-        trust_remote_code=False,
-        seed=0,
-        dtype="half",
-        revision=None,
-        limit_mm_per_prompt={"image": num_images},
-    )
-
-    mm_registry.init_mm_limits_per_prompt(model_config)
-
-    image = image_assets[0].pil_image
-    mm_inputs = {"image": [image] * num_images}
-
-    mapped_inputs = mm_registry.map_input(model_config, mm_inputs)
-    assert len(mapped_inputs["pixel_values"]) == num_images
diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py
index 9573351b4dff1..f99d7556b27f9 100644
--- a/tests/multimodal/test_processing.py
+++ b/tests/multimodal/test_processing.py
@@ -1,5 +1,7 @@
+from contextlib import nullcontext
 from functools import partial
 from typing import cast
+from unittest.mock import MagicMock
 
 import numpy as np
 import pytest
@@ -526,6 +528,100 @@ def _rand_audio(
     return rng.rand(audio_len), sr
 
 
+@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
+@pytest.mark.parametrize(
+    ("limit", "num_supported", "is_valid"),
+    [(0, 0, True), (0, 1, True), (1, 0, False), (1, 1, True), (1, 2, True),
+     (2, 1, False), (2, 2, True)],
+)
+def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid):
+    limit_mm_per_prompt = {"image": limit}
+
+    model_config = ModelConfig(
+        model=model_id,
+        task="auto",
+        tokenizer=model_id,
+        tokenizer_mode="auto",
+        trust_remote_code=False,
+        seed=0,
+        dtype="half",
+        revision=None,
+        limit_mm_per_prompt=limit_mm_per_prompt,
+    )
+    model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
+
+    processor_factory = MULTIMODAL_REGISTRY._processor_factories[model_cls]
+    ctx = InputProcessingContext(
+        model_config,
+        tokenizer=cached_get_tokenizer(model_config.tokenizer),
+    )
+
+    processor = processor_factory(ctx, cache=None)
+
+    mock_supported_mm_limits = MagicMock(return_value={"image": num_supported})
+    processor.get_supported_mm_limits = mock_supported_mm_limits
+
+    if is_valid:
+        exc_ctx = nullcontext()
+    else:
+        exc_ctx = pytest.raises(ValueError, match="this model only supports")
+
+    with exc_ctx:
+        processor._get_and_validate_dummy_mm_counts()
+
+
+@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
+@pytest.mark.parametrize(
+    ("num_images", "limit", "is_valid"),
+    [(0, 0, True), (0, 1, True), (1, 0, False), (1, 1, True), (1, 2, True),
+     (2, 1, False), (2, 2, True)],
+)
+def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid):
+    limit_mm_per_prompt = {"image": limit}
+
+    model_config = ModelConfig(
+        model=model_id,
+        task="auto",
+        tokenizer=model_id,
+        tokenizer_mode="auto",
+        trust_remote_code=False,
+        seed=0,
+        dtype="half",
+        revision=None,
+        limit_mm_per_prompt=limit_mm_per_prompt,
+    )
+    model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
+
+    processor_factory = MULTIMODAL_REGISTRY._processor_factories[model_cls]
+    ctx = InputProcessingContext(
+        model_config,
+        tokenizer=cached_get_tokenizer(model_config.tokenizer),
+    )
+
+    processor = processor_factory(ctx, cache=None)
+
+    rng = np.random.RandomState(0)
+    image = _rand_img(rng, min_wh=128, max_wh=256)
+    if num_images == 0:
+        mm_data = {}
+    elif num_images == 1:
+        mm_data = {"image": image}
+    else:
+        mm_data = {"image": [image] * num_images}
+
+    if is_valid:
+        exc_ctx = nullcontext()
+    else:
+        exc_ctx = pytest.raises(ValueError, match=f"passed {num_images} image")
+
+    with exc_ctx:
+        processor.apply(
+            "<image>" * num_images,
+            mm_data=mm_data,
+            hf_processor_mm_kwargs={},
+        )
+
+
 def _test_processing_cache_correctness(
     model_id: str,
     modalities: dict[str, bool],
@@ -631,6 +727,7 @@ def _test_processing_cache_correctness(
     ("facebook/chameleon-7b", {"image": False}),
     ("adept/fuyu-8b", {"image": False}),
     ("llava-hf/llava-1.5-7b-hf", {"image": True}),
+    ("llava-hf/llava-v1.6-mistral-7b-hf", {"image": True}),
     ("TIGER-Lab/Mantis-8B-siglip-llama3", {"image": True}),
     ("mistral-community/pixtral-12b", {"image": True}),
     ("Qwen/Qwen2-VL-2B-Instruct", {"image": True, "video": True}),
diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py
index 0d90635093ac7..06dfebbb95527 100644
--- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py
+++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py
@@ -3,13 +3,11 @@
 import torch
 
 from vllm.model_executor.models.llava import (LlavaForConditionalGeneration,
-                                              LlavaMultiModalProcessor,
-                                              get_max_llava_image_tokens)
+                                              LlavaMultiModalProcessor)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 
 
-@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_llava_image_tokens)
 @MULTIMODAL_REGISTRY.register_processor(LlavaMultiModalProcessor)
 class MyLlava(LlavaForConditionalGeneration):
 
diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py
index a5300dfd986f3..0188452054b8c 100644
--- a/vllm/model_executor/models/clip.py
+++ b/vllm/model_executor/models/clip.py
@@ -24,6 +24,8 @@
                                    resolve_visual_encoder_outputs)
 from vllm.sequence import SequenceData
 
+from .vision import VisionEncoderInfo
+
 
 def get_clip_patch_grid_length(*, image_size: int, patch_size: int) -> int:
     assert image_size % patch_size == 0
@@ -149,6 +151,29 @@ def input_processor_for_clip(
                         multi_modal_placeholders={"image": ranges})
 
 
+class CLIPEncoderInfo(VisionEncoderInfo[CLIPVisionConfig]):
+
+    def get_num_image_tokens(
+        self,
+        *,
+        image_width: int,
+        image_height: int,
+    ) -> int:
+        return get_clip_image_feature_size(self.vision_config)
+
+    def get_max_image_tokens(self) -> int:
+        return get_max_clip_image_tokens(self.vision_config)
+
+    def get_num_patches(self) -> int:
+        return get_clip_patch_grid_length(
+            image_size=self.vision_config.image_size,
+            patch_size=self.vision_config.patch_size,
+        )
+
+    def get_image_size(self) -> int:
+        return self.vision_config.image_size
+
+
 # Adapted from https://github.com/huggingface/transformers/blob/v4.39.0/src/transformers/models/clip/modeling_clip.py#L164 # noqa
 class CLIPVisionEmbeddings(nn.Module):
 
diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py
index 7fb8c5d1ab09c..3680d01725238 100644
--- a/vllm/model_executor/models/fuyu.py
+++ b/vllm/model_executor/models/fuyu.py
@@ -76,7 +76,7 @@ def _get_image_target_size(self) -> ImageSize:
         return ImageSize(width=target_size["width"],
                          height=target_size["height"])
 
-    def _get_image_grid_size(
+    def _get_image_feature_grid_size(
         self,
         *,
         image_width: int,
@@ -99,7 +99,7 @@ def _get_image_grid_size(
     def get_mm_max_tokens_per_item(self) -> Mapping[str, int]:
         target_width, target_height = self._get_image_target_size()
 
-        max_ncols, max_nrows = self._get_image_grid_size(
+        max_ncols, max_nrows = self._get_image_feature_grid_size(
             image_width=target_width,
             image_height=target_height,
         )
@@ -172,7 +172,7 @@ def get_replacement_fuyu(item_idx: int):
             images = mm_items.get_items("image", ImageProcessorItems)
             image_size = images.get_image_size(item_idx)
 
-            ncols, nrows = self._get_image_grid_size(
+            ncols, nrows = self._get_image_feature_grid_size(
                 image_width=image_size.width,
                 image_height=image_size.height,
             )
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index 808e61edb6fb4..78de27cd821c6 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -1,6 +1,7 @@
+from abc import abstractmethod
 from functools import cached_property
-from typing import (Iterable, List, Literal, Mapping, Optional, Protocol, Set,
-                    Tuple, TypedDict, Union)
+from typing import (Final, Iterable, List, Literal, Mapping, Optional,
+                    Protocol, Set, Tuple, TypedDict, Union)
 
 import torch
 import torch.nn as nn
@@ -12,7 +13,6 @@
 
 from vllm.attention import AttentionMetadata
 from vllm.config import VllmConfig
-from vllm.inputs import InputContext
 from vllm.model_executor.layers.activation import get_act_fn
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                                RowParallelLinear)
@@ -23,23 +23,23 @@
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
                                     MultiModalInputsV2, MultiModalKwargs,
                                     NestedTensors)
-from vllm.multimodal.parse import ImageProcessorItems
+from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
+                                   ImageSize)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        MultiModalDataItems, ProcessorInputs,
-                                        PromptReplacement,
+                                        InputProcessingContext,
+                                        MultiModalDataItems, ProcessingCache,
+                                        ProcessorInputs, PromptReplacement,
                                         full_groupby_modality)
 from vllm.sequence import IntermediateTensors
 
-from .clip import (CLIPVisionModel, dummy_image_for_clip,
-                   get_max_clip_image_tokens)
+from .clip import CLIPVisionModel
 from .interfaces import SupportsMultiModal, SupportsPP
-from .pixtral import (PixtralHFVisionModel, dummy_image_for_pixtral_hf,
-                      get_max_pixtral_hf_image_tokens,
-                      get_pixtral_hf_image_feature_size)
-from .siglip import (SiglipVisionModel, dummy_image_for_siglip,
-                     get_max_siglip_image_tokens)
+from .pixtral import (PixtralHFVisionModel,
+                      get_pixtral_hf_image_feature_grid_size)
+from .siglip import SiglipVisionModel
 from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model,
                     maybe_prefix, merge_multimodal_embeddings)
+from .vision import vision_encoder_info
 
 
 class LlavaImagePixelInputs(TypedDict):
@@ -94,39 +94,167 @@ def forward(self, image_features: torch.Tensor) -> torch.Tensor:
         return hidden_states
 
 
-def get_max_llava_image_tokens(ctx: InputContext):
-    hf_config = ctx.get_hf_config(LlavaConfig)
-    vision_config = hf_config.vision_config
+class LlavaLikeConfig(Protocol):
+    vision_config: Final[PretrainedConfig]
+    vision_feature_select_strategy: Final[str]
+    vision_feature_layer: Final[Union[int, List[int]]]
 
-    if isinstance(vision_config, CLIPVisionConfig):
-        num_image_tokens = get_max_clip_image_tokens(vision_config)
-    elif isinstance(vision_config, SiglipVisionConfig):
-        num_image_tokens = get_max_siglip_image_tokens(vision_config)
-    elif isinstance(vision_config, PixtralVisionConfig):
-        num_image_tokens = get_max_pixtral_hf_image_tokens(vision_config)
-    else:
-        msg = f"Unsupported vision config: {type(vision_config)}"
-        raise NotImplementedError(msg)
 
-    strategy = hf_config.vision_feature_select_strategy
-    if strategy == "default":
-        return num_image_tokens - 1
-    elif strategy == "full":
-        return num_image_tokens
-    else:
-        raise ValueError(f"Unexpected select feature strategy: {strategy}")
+class BaseLlavaMultiModalProcessor(BaseMultiModalProcessor):
 
+    def __init__(self,
+                 ctx: InputProcessingContext,
+                 *,
+                 cache: Optional[ProcessingCache] = None,
+                 enable_sanity_checks: bool = True) -> None:
+        super().__init__(ctx,
+                         cache=cache,
+                         enable_sanity_checks=enable_sanity_checks)
+
+        vision_config = self._get_hf_config().vision_config
+        self._vision_encoder_info = vision_encoder_info(vision_config)
 
-class LlavaMultiModalProcessor(BaseMultiModalProcessor):
+    @abstractmethod
+    def _get_hf_config(self) -> LlavaLikeConfig:
+        raise NotImplementedError
 
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": None}
 
+    def _apply_feature_select_strategy(
+        self,
+        strategy: str,
+        encoder_num_image_tokens: int,
+    ) -> int:
+        if strategy == "default":
+            return encoder_num_image_tokens - 1
+        if strategy == "full":
+            return encoder_num_image_tokens
+
+        msg = f"Unexpected feature select strategy: {strategy!r}"
+        raise NotImplementedError(msg)
+
+    def _get_max_image_tokens(self) -> int:
+        hf_config = self._get_hf_config()
+
+        return self._apply_feature_select_strategy(
+            hf_config.vision_feature_select_strategy,
+            self._vision_encoder_info.get_max_image_tokens(),
+        )
+
     def get_mm_max_tokens_per_item(self) -> Mapping[str, int]:
-        return {"image": get_max_llava_image_tokens(self.ctx)}
+        return {"image": self._get_max_image_tokens()}
+
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        return dict(
+            pixel_values=MultiModalFieldConfig.batched("image"),
+            image_embeds=MultiModalFieldConfig.batched("image"),
+        )
+
+    def _get_dummy_image_size(self) -> ImageSize:
+        image_size = self._vision_encoder_info.get_image_size()
+        return ImageSize(image_size, image_size)
+
+    @abstractmethod
+    def _get_image_token(self) -> str:
+        raise NotImplementedError
+
+    def _get_dummy_mm_inputs(
+        self,
+        mm_counts: Mapping[str, int],
+    ) -> ProcessorInputs:
+        num_images = mm_counts.get("image", 0)
+
+        image_token = self._get_image_token()
+        target_width, target_height = self._get_dummy_image_size()
+
+        mm_data = {
+            "image":
+            self._get_dummy_images(width=target_width,
+                                   height=target_height,
+                                   num_images=num_images)
+        }
+
+        return ProcessorInputs(
+            prompt_text=image_token * num_images,
+            mm_data=mm_data,
+        )
+
+
+class LlavaMultiModalProcessor(BaseLlavaMultiModalProcessor):
+
+    def _get_hf_config(self) -> LlavaConfig:
+        return self.ctx.get_hf_config(LlavaConfig)
+
+    def _get_hf_processor(self) -> LlavaProcessor:
+        return self.ctx.get_hf_processor(LlavaProcessor)
+
+    def _get_image_token(self) -> str:
+        return self._get_hf_processor().image_token
+
+    def _get_num_image_tokens(
+        self,
+        *,
+        image_width: int,
+        image_height: int,
+    ) -> int:
+        hf_config = self._get_hf_config()
+
+        return self._apply_feature_select_strategy(
+            hf_config.vision_feature_select_strategy,
+            self._vision_encoder_info.get_num_image_tokens(
+                image_width=image_width,
+                image_height=image_height,
+            ),
+        )
+
+    def _get_prompt_replacements(
+        self,
+        mm_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, object],
+        out_mm_kwargs: MultiModalKwargs,
+    ) -> list[PromptReplacement]:
+        hf_config = self._get_hf_config()
+        image_token_id = hf_config.image_token_index
 
-    def _get_hf_processor(self) -> Union[LlavaProcessor, PixtralProcessor]:
-        return self.ctx.get_hf_processor((LlavaProcessor, PixtralProcessor))
+        def get_replacement(item_idx: int):
+            images = mm_items.get_items(
+                "image", (ImageEmbeddingItems, ImageProcessorItems))
+
+            if isinstance(images, ImageEmbeddingItems):
+                num_image_tokens = images.get_feature_size(item_idx)
+            else:
+                image_size = images.get_image_size(item_idx)
+                num_image_tokens = self._get_num_image_tokens(
+                    image_width=image_size.width,
+                    image_height=image_size.height,
+                )
+
+            return [image_token_id] * num_image_tokens
+
+        return [
+            PromptReplacement(
+                modality="image",
+                target=[image_token_id],
+                replacement=get_replacement,
+            ),
+        ]
+
+
+class PixtralHFMultiModalProcessor(BaseLlavaMultiModalProcessor):
+
+    def _get_hf_config(self) -> LlavaConfig:
+        return self.ctx.get_hf_config(LlavaConfig)
+
+    def _get_hf_processor(self) -> PixtralProcessor:
+        return self.ctx.get_hf_processor(PixtralProcessor)
+
+    def _get_image_token(self) -> str:
+        return self._get_hf_processor().image_token
 
     def _call_hf_processor(
         self,
@@ -140,119 +268,82 @@ def _call_hf_processor(
             mm_kwargs=mm_kwargs,
         )
 
-        # NOTE: pixel_values=None for MLlavaProcessor
         pixel_values = processed_outputs.get("pixel_values")
         if pixel_values is not None:
             images = mm_data["images"]
             assert isinstance(images, list)
 
-            if isinstance(self._get_hf_processor(), PixtralProcessor):
-                # Original output: (1, num_images, C, H, W)
-                # New output: (num_images, C, H, W)
-                assert (isinstance(pixel_values, list)
-                        and len(pixel_values) == 1)
-                assert (isinstance(pixel_values[0], list)
-                        and len(pixel_values[0]) == len(images))
+            # Original output: (1, num_images, C, H, W)
+            # New output: (num_images, C, H, W)
+            assert (isinstance(pixel_values, list) and len(pixel_values) == 1)
+            assert (isinstance(pixel_values[0], list)
+                    and len(pixel_values[0]) == len(images))
 
-                processed_outputs["pixel_values"] = pixel_values[0]
+            processed_outputs["pixel_values"] = pixel_values[0]
 
         return processed_outputs
 
-    def _get_mm_fields_config(
-        self,
-        hf_inputs: BatchFeature,
-        hf_processor_mm_kwargs: Mapping[str, object],
-    ) -> Mapping[str, MultiModalFieldConfig]:
-        return dict(
-            pixel_values=MultiModalFieldConfig.batched("image"),
-            image_embeds=MultiModalFieldConfig.batched("image"),
-        )
-
     def _get_prompt_replacements(
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
         out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
-        hf_config = self.ctx.get_hf_config(LlavaConfig)
+        hf_config = self._get_hf_config()
         image_token_id = hf_config.image_token_index
 
         processor = self._get_hf_processor()
-        if isinstance(processor, PixtralProcessor):
-            image_token = processor.image_token
-            image_break_token = processor.image_break_token
-            image_end_token = processor.image_end_token
-
-            vision_config = hf_config.vision_config
-            assert isinstance(vision_config, PixtralVisionConfig)
+        image_token = processor.image_token
+        image_break_token = processor.image_break_token
+        image_end_token = processor.image_end_token
 
-            def get_replacement_pixtral(item_idx: int):
-                images = mm_items.get_items("image", ImageProcessorItems)
-                image_size = images.get_image_size(item_idx)
-
-                (
-                    num_width_tokens,
-                    num_height_tokens,
-                ) = get_pixtral_hf_image_feature_size(
-                    vision_config,
-                    image_width=image_size.width,
-                    image_height=image_size.height,
-                )
+        vision_config = hf_config.vision_config
+        assert isinstance(vision_config, PixtralVisionConfig)
 
-                tokens = ([image_token] * num_width_tokens +
-                          [image_break_token]) * num_height_tokens
-                tokens[-1] = image_end_token
+        def get_replacement(item_idx: int):
+            images = mm_items.get_items("image", ImageProcessorItems)
+            image_size = images.get_image_size(item_idx)
 
-                return "".join(tokens)
+            ncols, nrows = get_pixtral_hf_image_feature_grid_size(
+                vision_config,
+                image_width=image_size.width,
+                image_height=image_size.height,
+            )
 
-            return [
-                PromptReplacement(
-                    modality="image",
-                    target=[image_token_id],
-                    replacement=get_replacement_pixtral,
-                ),
-            ]
+            tokens = ([image_token] * ncols + [image_break_token]) * nrows
+            tokens[-1] = image_end_token
 
-        max_image_tokens = get_max_llava_image_tokens(self.ctx)
+            return "".join(tokens)
 
         return [
             PromptReplacement(
                 modality="image",
                 target=[image_token_id],
-                replacement=[image_token_id] * max_image_tokens,
-            )
+                replacement=get_replacement,
+            ),
         ]
 
-    def _get_dummy_mm_inputs(
-        self,
-        mm_counts: Mapping[str, int],
-    ) -> ProcessorInputs:
-        hf_config = self.ctx.get_hf_config(LlavaConfig)
-        vision_config = hf_config.vision_config
-        num_images = mm_counts.get("image", 0)
-
-        if isinstance(vision_config, CLIPVisionConfig):
-            data = dummy_image_for_clip(vision_config, num_images)
-        elif isinstance(vision_config, SiglipVisionConfig):
-            data = dummy_image_for_siglip(vision_config, num_images)
-        elif isinstance(vision_config, PixtralVisionConfig):
-            data = dummy_image_for_pixtral_hf(vision_config, num_images)
-        else:
-            msg = f"Unsupported vision config: {type(vision_config)}"
-            raise NotImplementedError(msg)
 
-        hf_processor = self._get_hf_processor()
-        image_token = hf_processor.image_token
+def _build_llava_or_pixtral_hf_processor(
+    ctx: InputProcessingContext,
+    *,
+    cache: Optional[ProcessingCache] = None,
+    enable_sanity_checks: bool = True,
+) -> BaseLlavaMultiModalProcessor:
+    hf_config = ctx.get_hf_config(LlavaConfig)
 
-        return ProcessorInputs(
-            prompt_text=image_token * num_images,
-            mm_data=data,
+    if isinstance(hf_config.vision_config, PixtralVisionConfig):
+        return PixtralHFMultiModalProcessor(
+            ctx,
+            cache=cache,
+            enable_sanity_checks=enable_sanity_checks,
         )
 
-
-class LlavaLikeConfig(Protocol):
-    vision_config: PretrainedConfig
-    vision_feature_layer: Union[int, List[int]]
+    return LlavaMultiModalProcessor(
+        ctx,
+        cache=cache,
+        enable_sanity_checks=enable_sanity_checks,
+    )
 
 
 def _get_num_hidden_layers(hf_config: LlavaLikeConfig) -> int:
@@ -330,7 +421,7 @@ def init_vision_tower_for_llava(
     raise NotImplementedError(msg)
 
 
-@MULTIMODAL_REGISTRY.register_processor(LlavaMultiModalProcessor)
+@MULTIMODAL_REGISTRY.register_processor(_build_llava_or_pixtral_hf_processor)
 class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
     # BitandBytes specific attributes
     bitsandbytes_stacked_params_mapping = {
@@ -596,7 +687,12 @@ def apply(
     ) -> MultiModalInputsV2:
         hf_config = self.ctx.get_hf_config(LlavaConfig)
         image_token_id = hf_config.image_token_index
-        max_image_tokens = get_max_llava_image_tokens(self.ctx)
+
+        # Assume that it doesn't depend on the image size
+        num_image_tokens = self._get_num_image_tokens(
+            image_width=-1,
+            image_height=-1,
+        )
 
         result = super().apply(prompt_text, mm_data, hf_processor_mm_kwargs)
 
@@ -609,14 +705,14 @@ def apply(
         def get_replacement_mantis(item_idx: int):
             return "".join([
                 f"(image {item_idx+1}: <Image>",  # 7 tokens
-                "<image>" * max_image_tokens,
+                "<image>" * num_image_tokens,
                 "</Image>)",  # 3 tokens
             ])
 
         mantis_repls = self._bind_prompt_replacements([
             PromptReplacement(
                 modality="image",
-                target=[image_token_id] * max_image_tokens,
+                target=[image_token_id] * num_image_tokens,
                 replacement=get_replacement_mantis,
             )
         ])
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index 5e70c11363c83..24debd1cbf3fe 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -4,31 +4,25 @@
 
 import torch
 import torch.nn as nn
-from PIL import Image
-from transformers import CLIPVisionConfig, LlavaNextConfig, SiglipVisionConfig
+from transformers import BatchFeature, LlavaNextConfig, LlavaNextProcessor
 from transformers.models.llava_next.modeling_llava_next import (
     get_anyres_image_grid_shape, unpad_image)
 from typing_extensions import NotRequired
 
 from vllm.attention import AttentionMetadata
 from vllm.config import VllmConfig
-from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
-                         InputContext)
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import NestedTensors
+from vllm.multimodal.inputs import MultiModalFieldConfig, NestedTensors
+from vllm.multimodal.parse import ImageSize
 from vllm.sequence import IntermediateTensors
-from vllm.utils import is_list_of
 
-from .clip import (CLIPVisionModel, dummy_image_for_clip,
-                   dummy_seq_data_for_clip, get_clip_image_feature_size,
-                   get_clip_patch_grid_length, input_processor_for_clip)
+from .clip import CLIPVisionModel
 from .interfaces import SupportsMultiModal, SupportsPP
-from .llava import LlavaMultiModalProjector, init_vision_tower_for_llava
-from .siglip import (SiglipVisionModel, dummy_image_for_siglip,
-                     dummy_seq_data_for_siglip, get_siglip_image_feature_size,
-                     get_siglip_patch_grid_length, input_processor_for_siglip)
+from .llava import (LlavaMultiModalProcessor, LlavaMultiModalProjector,
+                    init_vision_tower_for_llava)
+from .siglip import SiglipVisionModel
 from .utils import (AutoWeightsLoader, embed_multimodal, flatten_bn,
                     init_vllm_registered_model, maybe_prefix)
 
@@ -65,218 +59,127 @@ class LlavaNextImageEmbeddingInputs(TypedDict):
                              LlavaNextImageEmbeddingInputs]
 
 
-# Based on: https://github.com/huggingface/text-generation-inference/blob/v2.2.0/server/text_generation_server/models/vlm_causal_lm.py#L79
-def _get_llava_next_num_unpadded_features(
-    original_height: int,
-    original_width: int,
-    npatches: int,
-    num_patch_height: int,
-    num_patch_width: int,
-) -> Tuple[int, int]:
-    current_height = npatches * num_patch_height
-    current_width = npatches * num_patch_width
-
-    original_aspect_ratio = original_width / original_height
-    current_aspect_ratio = current_width / current_height
-
-    if original_aspect_ratio > current_aspect_ratio:
-        scale_factor = current_width / original_width
-        new_height = int(original_height * scale_factor)
-        padding = (current_height - new_height) // 2
-        current_height -= 2 * padding
-    else:
-        scale_factor = current_height / original_height
-        new_width = int(original_width * scale_factor)
-        padding = (current_width - new_width) // 2
-        current_width -= 2 * padding
-
-    unpadded_features = current_height * current_width
-    newline_features = current_height
-    return (unpadded_features, newline_features)
-
-
-# Based on: https://github.com/huggingface/text-generation-inference/blob/v2.2.0/server/text_generation_server/models/vlm_causal_lm.py#L106
-def get_llava_next_image_feature_size(
-    hf_config: LlavaNextConfig,
-    *,
-    input_height: int,
-    input_width: int,
-) -> int:
-    vision_config = hf_config.vision_config
-
-    if isinstance(vision_config, CLIPVisionConfig):
-        num_patches = get_clip_patch_grid_length(
-            image_size=vision_config.image_size,
-            patch_size=vision_config.patch_size,
-        )
-        base_feature_size = get_clip_image_feature_size(vision_config)
-    elif isinstance(vision_config, SiglipVisionConfig):
-        num_patches = get_siglip_patch_grid_length(
-            image_size=vision_config.image_size,
-            patch_size=vision_config.patch_size,
-        )
-        base_feature_size = get_siglip_image_feature_size(vision_config)
-    else:
-        msg = f"Unsupported vision config: {type(vision_config)}"
-        raise NotImplementedError(msg)
-
-    strategy = hf_config.vision_feature_select_strategy
-    if strategy == "default":
-        base_feature_size -= 1
-    elif strategy == "full":
-        pass
-    else:
-        raise ValueError(f"Unexpected select feature strategy: {strategy}")
+class LlavaNextMultiModalProcessor(LlavaMultiModalProcessor):
 
-    num_patch_height, num_patch_width = get_anyres_image_grid_shape(
-        image_size=(input_height, input_width),
-        grid_pinpoints=hf_config.image_grid_pinpoints,
-        patch_size=vision_config.image_size,
-    )
-
-    (
-        unpadded_feature_size,
-        newline_feature_size,
-    ) = _get_llava_next_num_unpadded_features(input_height, input_width,
-                                              num_patches, num_patch_height,
-                                              num_patch_width)
-
-    return unpadded_feature_size + newline_feature_size + base_feature_size
-
-
-def get_max_llava_next_image_tokens(ctx: InputContext):
-    """Compute the max feature size for all possible image grid pinpoints."""
-    return _get_pinpoint_with_largest_features(ctx)[0]
-
-
-def _get_pinpoint_with_largest_features(
-        ctx: InputContext) -> Tuple[int, Tuple[int, int]]:
-    """Get the grid pinpoint with the largest features & its feature size."""
-    hf_config = ctx.get_hf_config(LlavaNextConfig)
-    largest_feature_size = 0
-    largest_feature_pinpoint = None
-    for (height, width) in hf_config.image_grid_pinpoints:
-        feat_size = get_llava_next_image_feature_size(
-            hf_config,
-            input_height=height,
-            input_width=width,
-        )
-        if feat_size > largest_feature_size:
-            largest_feature_size = feat_size
-            largest_feature_pinpoint = (height, width)
-    if not largest_feature_size or largest_feature_pinpoint is None:
-        raise ValueError("Cannot have a largest feature size of 0!")
-    return largest_feature_size, largest_feature_pinpoint
-
-
-def dummy_data_for_llava_next(ctx: InputContext, seq_len: int,
-                              mm_counts: Mapping[str, int]):
-    hf_config = ctx.get_hf_config(LlavaNextConfig)
-    vision_config = hf_config.vision_config
-    num_images = mm_counts["image"]
-
-    image_feature_size, pinpoint = _get_pinpoint_with_largest_features(ctx)
-    max_feat_height, max_feat_width = pinpoint
-
-    if isinstance(vision_config, CLIPVisionConfig):
-        seq_data, ranges = dummy_seq_data_for_clip(
-            vision_config,
-            seq_len,
-            num_images,
-            image_token_id=hf_config.image_token_index,
-            image_feature_size_override=image_feature_size,
-        )
+    def _get_hf_config(self) -> LlavaNextConfig:
+        return self.ctx.get_hf_config(LlavaNextConfig)
+
+    def _get_hf_processor(self) -> LlavaNextProcessor:
+        return self.ctx.get_hf_processor(LlavaNextProcessor)
 
-        mm_data = dummy_image_for_clip(
-            vision_config,
-            num_images,
-            image_width_override=max_feat_width,
-            image_height_override=max_feat_height,
+    def _get_image_token(self) -> str:
+        return self._get_hf_processor().image_token
+
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        return dict(
+            pixel_values=MultiModalFieldConfig.batched("image"),
+            image_sizes=MultiModalFieldConfig.batched("image"),
+            image_embeds=MultiModalFieldConfig.batched("image"),
         )
 
-        return DummyData(seq_data, mm_data, ranges)
-    elif isinstance(vision_config, SiglipVisionConfig):
-        seq_data, ranges = dummy_seq_data_for_siglip(
-            vision_config,
-            seq_len,
-            num_images,
-            image_token_id=hf_config.image_token_index,
-            image_feature_size_override=image_feature_size,
+    def _get_max_image_tokens(self) -> int:
+        largest_feature_size, _ = self._get_pinpoint_with_most_features()
+        return largest_feature_size
+
+    def _get_dummy_image_size(self) -> ImageSize:
+        _, pinpoint = self._get_pinpoint_with_most_features()
+        return pinpoint
+
+    # Based on: https://github.com/huggingface/text-generation-inference/blob/v2.2.0/server/text_generation_server/models/vlm_causal_lm.py#L106
+    def _get_num_image_tokens(
+        self,
+        *,
+        image_width: int,
+        image_height: int,
+    ) -> int:
+        hf_config = self._get_hf_config()
+
+        base_feature_size = self._apply_feature_select_strategy(
+            hf_config.vision_feature_select_strategy,
+            self._vision_encoder_info.get_num_image_tokens(
+                image_width=image_width,
+                image_height=image_height,
+            ),
         )
+        num_patches = self._vision_encoder_info.get_num_patches()
 
-        mm_data = dummy_image_for_siglip(
-            vision_config,
-            num_images,
-            image_width_override=max_feat_width,
-            image_height_override=max_feat_height,
+        num_patch_height, num_patch_width = get_anyres_image_grid_shape(
+            image_size=(image_height, image_width),
+            grid_pinpoints=hf_config.image_grid_pinpoints,
+            patch_size=self._vision_encoder_info.get_image_size(),
         )
 
-        return DummyData(seq_data, mm_data, ranges)
+        (
+            unpadded_feature_size,
+            newline_feature_size,
+        ) = self._get_num_unpadded_features(
+            original_height=image_height,
+            original_width=image_width,
+            npatches=num_patches,
+            num_patch_height=num_patch_height,
+            num_patch_width=num_patch_width,
+        )
 
-    msg = f"Unsupported vision config: {type(vision_config)}"
-    raise NotImplementedError(msg)
+        return unpadded_feature_size + newline_feature_size + base_feature_size
 
+    # Based on: https://github.com/huggingface/text-generation-inference/blob/v2.2.0/server/text_generation_server/models/vlm_causal_lm.py#L79
+    def _get_num_unpadded_features(
+        self,
+        *,
+        original_height: int,
+        original_width: int,
+        npatches: int,
+        num_patch_height: int,
+        num_patch_width: int,
+    ) -> tuple[int, int]:
+        current_height = npatches * num_patch_height
+        current_width = npatches * num_patch_width
+
+        original_aspect_ratio = original_width / original_height
+        current_aspect_ratio = current_width / current_height
+
+        if original_aspect_ratio > current_aspect_ratio:
+            scale_factor = current_width / original_width
+            new_height = int(original_height * scale_factor)
+            padding = (current_height - new_height) // 2
+            current_height -= 2 * padding
+        else:
+            scale_factor = current_height / original_height
+            new_width = int(original_width * scale_factor)
+            padding = (current_width - new_width) // 2
+            current_width -= 2 * padding
 
-def input_processor_for_llava_next(ctx: InputContext,
-                                   inputs: DecoderOnlyInputs):
-    multi_modal_data = inputs.get("multi_modal_data")
-    if multi_modal_data is None or "image" not in multi_modal_data:
-        return inputs
+        unpadded_features = current_height * current_width
+        newline_features = current_height
+        return (unpadded_features, newline_features)
 
-    model_config = ctx.model_config
-    hf_config = ctx.get_hf_config(LlavaNextConfig)
-    vision_config = hf_config.vision_config
+    def _get_pinpoint_with_most_features(self) -> tuple[int, ImageSize]:
+        """
+        Get the grid pinpoint with the most features and
+        the corresponding feature size.
+        """
+        hf_config = self._get_hf_config()
 
-    image_data = multi_modal_data["image"]
-    if isinstance(image_data, Image.Image):
-        width, height = image_data.size
+        largest_feature_size, largest_feature_pinpoint = 0, None
+        for (height, width) in hf_config.image_grid_pinpoints:
+            feat_size = self._get_num_image_tokens(image_width=width,
+                                                   image_height=height)
+            if feat_size > largest_feature_size:
+                largest_feature_size = feat_size
+                largest_feature_pinpoint = ImageSize(width=width,
+                                                     height=height)
 
-        image_feature_size = get_llava_next_image_feature_size(
-            hf_config,
-            input_height=height,
-            input_width=width,
-        )
-    elif is_list_of(image_data, Image.Image):
-        image_feature_size = [
-            get_llava_next_image_feature_size(hf_config,
-                                              input_height=img.height,
-                                              input_width=img.width)
-            for img in image_data
-        ]
-    elif isinstance(image_data, torch.Tensor):
-        num_images, image_feature_size, hidden_size = image_data.shape
-    elif is_list_of(image_data, torch.Tensor):
-        image_feature_size = [item.shape[1] for item in image_data]
-    else:
-        raise TypeError(f"Invalid image type: {type(image_data)}")
-
-    vision_config = hf_config.vision_config
-
-    if isinstance(vision_config, CLIPVisionConfig):
-        return input_processor_for_clip(
-            model_config,
-            vision_config,
-            inputs,
-            image_token_id=hf_config.image_token_index,
-            image_feature_size_override=image_feature_size,
-        )
-    elif isinstance(vision_config, SiglipVisionConfig):
-        return input_processor_for_siglip(
-            model_config,
-            vision_config,
-            inputs,
-            image_token_id=hf_config.image_token_index,
-            image_feature_size_override=image_feature_size,
-        )
+        if largest_feature_size == 0 or largest_feature_pinpoint is None:
+            raise ValueError("Cannot have a largest feature size of 0!")
 
-    msg = f"Unsupported vision config: {type(vision_config)}"
-    raise NotImplementedError(msg)
+        return largest_feature_size, largest_feature_pinpoint
 
 
-@MULTIMODAL_REGISTRY.register_image_input_mapper()
-@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_llava_next_image_tokens)
-@INPUT_REGISTRY.register_dummy_data(dummy_data_for_llava_next)
-@INPUT_REGISTRY.register_input_processor(input_processor_for_llava_next)
+@MULTIMODAL_REGISTRY.register_processor(LlavaNextMultiModalProcessor)
 class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal,
                                         SupportsPP):
 
@@ -507,7 +410,7 @@ def _merge_image_patch_embeddings(self, image_size: torch.Tensor,
     def _process_image_pixels(
         self,
         inputs: LlavaNextImagePixelInputs,
-    ) -> Union[torch.Tensor, List[torch.Tensor]]:
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, ...]]:
         assert self.vision_tower is not None
 
         pixel_values = inputs["data"]
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index d855e7d2d36f8..f2e49d8e4848d 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -34,7 +34,7 @@
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
                                     MultiModalInputsV2, MultiModalKwargs,
                                     NestedTensors, PlaceholderRange)
-from vllm.multimodal.parse import ImageProcessorItems
+from vllm.multimodal.parse import ImageEmbeddingItems, ImageProcessorItems
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                         MultiModalDataItems, ProcessorInputs,
                                         PromptReplacement,
@@ -388,15 +388,19 @@ def _get_prompt_replacements(
         assert isinstance(bos_token_id, int)
 
         def get_replacement_phi3v(item_idx: int):
-            images = mm_items.get_items("image", ImageProcessorItems)
-            image_size = images.get_image_size(item_idx)
-
-            num_tokens = self._get_num_image_tokens(
-                image_width=image_size.width,
-                image_height=image_size.height,
-            )
-
-            return [_IMAGE_TOKEN_ID] * num_tokens + [bos_token_id]
+            images = mm_items.get_items(
+                "image", (ImageEmbeddingItems, ImageProcessorItems))
+
+            if isinstance(images, ImageEmbeddingItems):
+                num_image_tokens = images.get_feature_size(item_idx)
+            else:
+                image_size = images.get_image_size(item_idx)
+                num_image_tokens = self._get_num_image_tokens(
+                    image_width=image_size.width,
+                    image_height=image_size.height,
+                )
+
+            return [_IMAGE_TOKEN_ID] * num_image_tokens + [bos_token_id]
 
         num_images = mm_items.get_count("image", strict=False)
 
diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py
index 2bce13792a88d..d7233bd6028ed 100644
--- a/vllm/model_executor/models/pixtral.py
+++ b/vllm/model_executor/models/pixtral.py
@@ -38,6 +38,7 @@
 from .interfaces import SupportsMultiModal, SupportsPP
 from .utils import (init_vllm_registered_model, maybe_prefix,
                     merge_multimodal_embeddings)
+from .vision import VisionEncoderInfo
 
 try:
     from xformers import ops as xops
@@ -697,10 +698,18 @@ def get_pixtral_hf_patch_grid_length(*, image_size: int,
     return image_size // patch_size
 
 
-def get_pixtral_hf_num_patches(*, image_size: int, patch_size: int) -> int:
-    grid_length = get_pixtral_hf_patch_grid_length(image_size=image_size,
-                                                   patch_size=patch_size)
-    return grid_length * grid_length
+def get_pixtral_hf_image_feature_size(
+    *,
+    image_size: int,
+    patch_size: int,
+) -> int:
+    grid_length = get_pixtral_hf_patch_grid_length(
+        image_size=image_size,
+        patch_size=patch_size,
+    )
+
+    # Consider the image_break_token
+    return (grid_length + 1) * grid_length
 
 
 def get_max_pixtral_hf_image_tokens(hf_config: PixtralVisionConfig) -> int:
@@ -730,13 +739,16 @@ def dummy_image_for_pixtral_hf(
     return {"image": image if num_images == 1 else [image] * num_images}
 
 
-def get_pixtral_hf_image_feature_size(hf_config: PixtralVisionConfig,
-                                      image_width: int,
-                                      image_height: int) -> Tuple[int, int]:
-    # Adapted from transformers.models.pixtral.image_processing_pixtral.get_resize_output_image_size # noqa: E501
-    # https://github.com/huggingface/transformers/blob/2bd4d5897dc73e8b172832070a6f9e567a0df017/src/transformers/models/pixtral/image_processing_pixtral.py#L180 # noqa: E501
-    max_width, max_height = hf_config.image_size, hf_config.image_size
-    patch_width, patch_height = hf_config.patch_size, hf_config.patch_size
+# Adapted from transformers.models.pixtral.image_processing_pixtral.get_resize_output_image_size # noqa: E501
+# https://github.com/huggingface/transformers/blob/2bd4d5897dc73e8b172832070a6f9e567a0df017/src/transformers/models/pixtral/image_processing_pixtral.py#L180
+def get_pixtral_hf_image_feature_grid_size(
+    hf_config: PixtralVisionConfig,
+    *,
+    image_width: int,
+    image_height: int,
+) -> tuple[int, int]:
+    max_width = max_height = hf_config.image_size
+    patch_width = patch_height = hf_config.patch_size
 
     ratio = max(image_width / max_width, image_height / max_height)
 
@@ -744,12 +756,38 @@ def get_pixtral_hf_image_feature_size(hf_config: PixtralVisionConfig,
         image_width = int(math.ceil(image_width / ratio))
         image_height = int(math.ceil(image_height / ratio))
 
-    num_height_tokens, num_width_tokens = _get_pixtral_hf_num_image_tokens(
+    nrows, ncols = _get_pixtral_hf_num_image_tokens(
         (image_height, image_width),
         (patch_height, patch_width),
-    )
+    )  # type: ignore
+
+    return ncols, nrows
+
+
+class PixtralHFEncoderInfo(VisionEncoderInfo[PixtralVisionConfig]):
+
+    def get_num_image_tokens(
+        self,
+        *,
+        image_width: int,
+        image_height: int,
+    ) -> int:
+        return get_pixtral_hf_image_feature_size(
+            image_size=self.vision_config.image_size,
+            patch_size=self.get_image_size(),
+        )
+
+    def get_max_image_tokens(self) -> int:
+        return get_max_pixtral_hf_image_tokens(self.vision_config)
+
+    def get_num_patches(self) -> int:
+        return get_pixtral_hf_patch_grid_length(
+            image_size=self.vision_config.image_size,
+            patch_size=self.vision_config.patch_size,
+        )
 
-    return num_width_tokens, num_height_tokens
+    def get_image_size(self) -> int:
+        return self.vision_config.image_size
 
 
 class PixtralHFMLP(nn.Module):
diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py
index 6fb9e2cc4584f..115eaaac900e0 100644
--- a/vllm/model_executor/models/siglip.py
+++ b/vllm/model_executor/models/siglip.py
@@ -28,6 +28,8 @@
                                    resolve_visual_encoder_outputs)
 from vllm.sequence import SequenceData
 
+from .vision import VisionEncoderInfo
+
 
 def get_siglip_patch_grid_length(*, image_size: int, patch_size: int) -> int:
     # Since interpolation is applied, the image size need not be divisible
@@ -156,6 +158,29 @@ def input_processor_for_siglip(
                         multi_modal_placeholders={"image": ranges})
 
 
+class SiglipEncoderInfo(VisionEncoderInfo[SiglipVisionConfig]):
+
+    def get_num_image_tokens(
+        self,
+        *,
+        image_width: int,
+        image_height: int,
+    ) -> int:
+        return get_siglip_image_feature_size(self.vision_config)
+
+    def get_max_image_tokens(self) -> int:
+        return get_max_siglip_image_tokens(self.vision_config)
+
+    def get_num_patches(self) -> int:
+        return get_siglip_patch_grid_length(
+            image_size=self.vision_config.image_size,
+            patch_size=self.vision_config.patch_size,
+        )
+
+    def get_image_size(self) -> int:
+        return self.vision_config.image_size
+
+
 # Adapted from https://github.com/huggingface/transformers/blob/v4.43.3/src/transformers/models/siglip/modeling_siglip.py#L249 # noqa
 class SiglipVisionEmbeddings(nn.Module):
 
diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py
index 269b66806adf4..31017f16d3c97 100644
--- a/vllm/model_executor/models/utils.py
+++ b/vllm/model_executor/models/utils.py
@@ -373,7 +373,7 @@ def embed_multimodal(
     input_ids: torch.Tensor,
     multimodal_token_id: int,
     get_text_embeds: Callable[[torch.Tensor], torch.Tensor],
-    multimodal_embeds: Union[torch.Tensor, List[torch.Tensor]],
+    multimodal_embeds: NestedTensors,
 ) -> torch.Tensor:
     """
     Embed token IDs and multimodal inputs and combine their embeddings.
diff --git a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py
new file mode 100644
index 0000000000000..65a773480d2a1
--- /dev/null
+++ b/vllm/model_executor/models/vision.py
@@ -0,0 +1,52 @@
+from abc import ABC, abstractmethod
+from typing import Generic, TypeVar
+
+from transformers import PretrainedConfig
+
+_C = TypeVar("_C", bound=PretrainedConfig)
+
+
+class VisionEncoderInfo(ABC, Generic[_C]):
+
+    def __init__(self, vision_config: _C) -> None:
+        super().__init__()
+
+        self.vision_config = vision_config
+
+    @abstractmethod
+    def get_num_image_tokens(
+        self,
+        *,
+        image_width: int,
+        image_height: int,
+    ) -> int:
+        raise NotImplementedError
+
+    @abstractmethod
+    def get_max_image_tokens(self) -> int:
+        raise NotImplementedError
+
+    @abstractmethod
+    def get_num_patches(self) -> int:
+        raise NotImplementedError
+
+    @abstractmethod
+    def get_image_size(self) -> int:
+        raise NotImplementedError
+
+
+def vision_encoder_info(vision_config: PretrainedConfig) -> VisionEncoderInfo:
+    # Avoid circular imports
+    from .clip import CLIPEncoderInfo, CLIPVisionConfig
+    from .pixtral import PixtralHFEncoderInfo, PixtralVisionConfig
+    from .siglip import SiglipEncoderInfo, SiglipVisionConfig
+
+    if isinstance(vision_config, CLIPVisionConfig):
+        return CLIPEncoderInfo(vision_config)
+    if isinstance(vision_config, PixtralVisionConfig):
+        return PixtralHFEncoderInfo(vision_config)
+    if isinstance(vision_config, SiglipVisionConfig):
+        return SiglipEncoderInfo(vision_config)
+
+    msg = f"Unsupported vision config: {type(vision_config)}"
+    raise NotImplementedError(msg)
diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py
index 4e1b78ab2c59d..00acb77435163 100644
--- a/vllm/multimodal/parse.py
+++ b/vllm/multimodal/parse.py
@@ -1,7 +1,8 @@
 from abc import ABC, abstractmethod
 from collections import UserDict
 from collections.abc import Callable, Iterator, Mapping, Sequence
-from typing import TYPE_CHECKING, Any, Generic, NamedTuple, Optional, TypeVar
+from typing import (TYPE_CHECKING, Any, Generic, NamedTuple, Optional, TypeVar,
+                    Union)
 
 import numpy as np
 import torch
@@ -87,7 +88,7 @@ class EmbeddingItems(ModalityDataItems[NestedTensors, torch.Tensor]):
     def get_count(self) -> int:
         return len(self.data)
 
-    def get(self, index: int) -> object:
+    def get(self, index: int) -> torch.Tensor:
         return self.data[index]
 
     def get_processor_data(self) -> Mapping[str, object]:
@@ -96,6 +97,9 @@ def get_processor_data(self) -> Mapping[str, object]:
     def get_passthrough_data(self) -> Mapping[str, object]:
         return {f"{self.modality}_embeds": self.data}
 
+    def get_feature_size(self, item_idx: int) -> int:
+        return len(self.get(item_idx))
+
 
 class AudioProcessorItems(ProcessorBatchItems[HfAudioItem]):
 
@@ -182,7 +186,7 @@ def get_all_counts(self) -> Mapping[str, int]:
     def get_items(
         self,
         modality: str,
-        typ: type[_D],
+        typ: Union[type[_D], tuple[type[_D], ...]],
     ) -> _D:
         """
         Get the data items belonging to a modality,
@@ -199,7 +203,7 @@ def get_items(
                             f"Expected type: {typ}, but "
                             f"found type: {type(items)}")
 
-        return items
+        return items  # type: ignore[return-value]
 
 
 ModalityDataParser: TypeAlias = Callable[[ModalityData[Any]],

From 84c35c374a8fd3d10559ef220793fea6c5497cf2 Mon Sep 17 00:00:00 2001
From: Chunyang Wen <chunyang.wen@gmail.com>
Date: Fri, 3 Jan 2025 02:14:16 +0800
Subject: [PATCH 051/462] According to vllm.EngineArgs, the name should be
 distributed_executor_backend (#11689)

---
 docs/source/serving/distributed_serving.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/serving/distributed_serving.md b/docs/source/serving/distributed_serving.md
index 7446b7c84cf46..a1dd0e89e8c79 100644
--- a/docs/source/serving/distributed_serving.md
+++ b/docs/source/serving/distributed_serving.md
@@ -22,7 +22,7 @@ There is one edge case: if the model fits in a single node with multiple GPUs, b
 
 vLLM supports distributed tensor-parallel and pipeline-parallel inference and serving. Currently, we support [Megatron-LM's tensor parallel algorithm](https://arxiv.org/pdf/1909.08053.pdf). We manage the distributed runtime with either [Ray](https://github.com/ray-project/ray) or python native multiprocessing. Multiprocessing can be used when deploying on a single node, multi-node inferencing currently requires Ray.
 
-Multiprocessing will be used by default when not running in a Ray placement group and if there are sufficient GPUs available on the same node for the configured {code}`tensor_parallel_size`, otherwise Ray will be used. This default can be overridden via the {code}`LLM` class {code}`distributed-executor-backend` argument or {code}`--distributed-executor-backend` API server argument. Set it to {code}`mp` for multiprocessing or {code}`ray` for Ray. It's not required for Ray to be installed for the multiprocessing case.
+Multiprocessing will be used by default when not running in a Ray placement group and if there are sufficient GPUs available on the same node for the configured {code}`tensor_parallel_size`, otherwise Ray will be used. This default can be overridden via the {code}`LLM` class {code}`distributed_executor_backend` argument or {code}`--distributed-executor-backend` API server argument. Set it to {code}`mp` for multiprocessing or {code}`ray` for Ray. It's not required for Ray to be installed for the multiprocessing case.
 
 To run multi-GPU inference with the {code}`LLM` class, set the {code}`tensor_parallel_size` argument to the number of GPUs you want to use. For example, to run inference on 4 GPUs:
 

From 2f385183f35497e030ef22c9820d83b83bc4f6db Mon Sep 17 00:00:00 2001
From: Kathy Yu <143133934+kathyyu-google@users.noreply.github.com>
Date: Thu, 2 Jan 2025 10:28:09 -0800
Subject: [PATCH 052/462] [Bugfix] Free cross attention block table for
 preempted-for-recompute sequence group. (#10013)

Signed-off-by: Kathy Yu <feiyangyu@google.com>
---
 vllm/core/scheduler.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py
index c3bc6becf0995..b3d396f9cedda 100644
--- a/vllm/core/scheduler.py
+++ b/vllm/core/scheduler.py
@@ -1579,6 +1579,7 @@ def _preempt_by_recompute(
             seq.status = SequenceStatus.WAITING
             self.free_seq(seq)
             seq.reset_state_for_recompute()
+        self._free_seq_group_cross_attn_blocks(seq_group)
 
     def _preempt_by_swap(
         self,

From b55ed6ef8ab0dce7fb0f79ff292dafdb4d22610c Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Fri, 3 Jan 2025 04:04:58 +0900
Subject: [PATCH 053/462] [V1][Minor] Optimize token_ids_cpu copy (#11692)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
---
 vllm/v1/worker/gpu_input_batch.py  | 13 ++++++++-----
 vllm/v1/worker/gpu_model_runner.py |  1 +
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py
index e79145300fe06..f8a1427c6c26c 100644
--- a/vllm/v1/worker/gpu_input_batch.py
+++ b/vllm/v1/worker/gpu_input_batch.py
@@ -66,8 +66,9 @@ def __init__(
             pin_memory=False,
         )
         self.token_ids_cpu = self.token_ids_cpu_tensor.numpy()
-        self.num_computed_tokens_cpu = np.empty(max_num_reqs, dtype=np.int32)
+        self.num_tokens = np.zeros(max_num_reqs, dtype=np.int32)
         self.num_prompt_tokens = np.zeros(max_num_reqs, dtype=np.int32)
+        self.num_computed_tokens_cpu = np.empty(max_num_reqs, dtype=np.int32)
 
         # Attention-related.
         self.block_table = torch.zeros(
@@ -189,6 +190,7 @@ def add_request(
         end_idx = start_idx + len(request.output_token_ids)
         self.token_ids_cpu[req_index,
                            start_idx:end_idx] = request.output_token_ids
+        self.num_tokens[req_index] = request.num_tokens
 
         self.num_computed_tokens_cpu[req_index] = request.num_computed_tokens
         num_blocks = len(request.block_ids)
@@ -290,14 +292,15 @@ def condense(self, empty_req_indices: List[int]) -> None:
             self.req_ids[last_req_index] = None
             self.req_id_to_index[req_id] = empty_index
 
-            # TODO(woosuk): Optimize the copy of token_ids_cpu and
-            # block_table_cpu.
-            self.token_ids_cpu[empty_index] = self.token_ids_cpu[
-                last_req_index]
+            num_tokens = self.num_tokens[last_req_index]
+            self.token_ids_cpu[empty_index, :num_tokens] = self.token_ids_cpu[
+                last_req_index, :num_tokens]
+            self.num_tokens[empty_index] = num_tokens
             self.num_prompt_tokens[empty_index] = \
                 self.num_prompt_tokens[last_req_index]
             self.num_computed_tokens_cpu[
                 empty_index] = self.num_computed_tokens_cpu[last_req_index]
+            # TODO(woosuk): Optimize the copy of block_table_cpu.
             self.block_table_cpu[empty_index] = self.block_table_cpu[
                 last_req_index]
             self.temperature_cpu[empty_index] = self.temperature_cpu[
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 995de54e8e0a0..75098b0330ac9 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -644,6 +644,7 @@ def execute_model(
                 # Append the sampled token to the output token ids.
                 token_id = sampled_token_ids[i]
                 self.input_batch.token_ids_cpu[i, seq_len] = token_id
+                self.input_batch.num_tokens[i] += 1
                 req_state.output_token_ids.append(token_id)
             else:
                 # Ignore the sampled token from the partial request.

From 187e32997cdc20bbed5c21d3cef2609ab8ed9080 Mon Sep 17 00:00:00 2001
From: bjmsong <wq.songbob@gmail.com>
Date: Fri, 3 Jan 2025 05:11:39 +0800
Subject: [PATCH 054/462] [Bugfix] Change kv scaling factor by param json on
 nvidia gpu (#11688)

Signed-off-by: bjmsong <bjmsong@126.com>
Co-authored-by: bjmsong <bjmsong@126.com>
---
 vllm/model_executor/models/exaone.py  | 5 +++--
 vllm/model_executor/models/granite.py | 5 +++--
 vllm/model_executor/models/llama.py   | 5 +++--
 vllm/model_executor/models/solar.py   | 5 +++--
 vllm/worker/model_runner.py           | 3 ++-
 5 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py
index 0398f0943a70a..8324a563edd64 100644
--- a/vllm/model_executor/models/exaone.py
+++ b/vllm/model_executor/models/exaone.py
@@ -606,8 +606,9 @@ def load_kv_cache_scales(self, quantization_param_path: str) -> None:
                 # which is consistent with the practice of setting
                 # scaling_factor = tensor_amax / FPtype_max
                 scaling_factor *= 2
-            if hasattr(layer_self_attn, "kv_scale"):
-                layer_self_attn.attn._kv_scale = scaling_factor
+            if hasattr(layer_self_attn.attn, "_k_scale"):
+                layer_self_attn.attn._k_scale = scaling_factor
+                layer_self_attn.attn._v_scale = scaling_factor
             else:
                 raise RuntimeError("Self attention has no KV cache scaling "
                                    "factor attribute!")
diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py
index f9e0443b9a508..a91ed4158a73f 100644
--- a/vllm/model_executor/models/granite.py
+++ b/vllm/model_executor/models/granite.py
@@ -545,8 +545,9 @@ def load_kv_cache_scales(self, quantization_param_path: str) -> None:
                 # which is consistent with the practice of setting
                 # scaling_factor = tensor_amax / FPtype_max
                 scaling_factor *= 2
-            if hasattr(layer_self_attn, "kv_scale"):
-                layer_self_attn.attn._kv_scale = scaling_factor
+            if hasattr(layer_self_attn.attn, "_k_scale"):
+                layer_self_attn.attn._k_scale = scaling_factor
+                layer_self_attn.attn._v_scale = scaling_factor
             else:
                 raise RuntimeError("Self attention has no KV cache scaling "
                                    "factor attribute!")
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index 2902e6999c2fd..8623da99574bb 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -452,8 +452,9 @@ def load_kv_cache_scales(self, quantization_param_path: str) -> None:
                 # which is consistent with the practice of setting
                 # scaling_factor = tensor_amax / FPtype_max
                 scaling_factor *= 2
-            if hasattr(layer_self_attn, "kv_scale"):
-                layer_self_attn.attn._kv_scale = scaling_factor
+            if hasattr(layer_self_attn.attn, "_k_scale"):
+                layer_self_attn.attn._k_scale = scaling_factor
+                layer_self_attn.attn._v_scale = scaling_factor
             else:
                 raise RuntimeError("Self attention has no KV cache scaling "
                                    "factor attribute!")
diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py
index caae0b65d7d10..a7cf65a0e36e4 100644
--- a/vllm/model_executor/models/solar.py
+++ b/vllm/model_executor/models/solar.py
@@ -565,8 +565,9 @@ def load_kv_cache_scales(self, quantization_param_path: str) -> None:
                 # which is consistent with the practice of setting
                 # scaling_factor = tensor_amax / FPtype_max
                 scaling_factor *= 2
-            if hasattr(layer_self_attn, "kv_scale"):
-                layer_self_attn.attn._kv_scale = scaling_factor
+            if hasattr(layer_self_attn.attn, "_k_scale"):
+                layer_self_attn.attn._k_scale = scaling_factor
+                layer_self_attn.attn._v_scale = scaling_factor
             else:
                 raise RuntimeError("Self attention has no KV cache scaling "
                                    "factor attribute!")
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 2b545d1b28bd2..637fba23611f4 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -1136,7 +1136,8 @@ def load_model(self) -> None:
                 self.prompt_adapter_manager.create_prompt_adapter_manager(
                     self.model))
 
-        if self.kv_cache_dtype == "fp8" and current_platform.is_rocm():
+        if self.kv_cache_dtype == "fp8" and (current_platform.is_rocm()
+                                             or current_platform.is_cuda()):
             # Currently only ROCm accepts kv-cache scaling factors
             # via quantization_param_path and this will be deprecated
             # in the future.

From 5dba2575065f5e27d468f2776e3d460a21d916e6 Mon Sep 17 00:00:00 2001
From: wchen61 <wchen61@foxmail.com>
Date: Fri, 3 Jan 2025 06:58:56 +0800
Subject: [PATCH 055/462] Resolve race conditions in Marlin kernel (#11493)

Signed-off-by: wchen61 <wchen61@foxmail.com>
---
 csrc/quantization/gptq_marlin/gptq_marlin.cu | 40 ++++++++++----------
 1 file changed, 21 insertions(+), 19 deletions(-)

diff --git a/csrc/quantization/gptq_marlin/gptq_marlin.cu b/csrc/quantization/gptq_marlin/gptq_marlin.cu
index 0c698ced7713d..04ef842fbdf95 100644
--- a/csrc/quantization/gptq_marlin/gptq_marlin.cu
+++ b/csrc/quantization/gptq_marlin/gptq_marlin.cu
@@ -834,6 +834,7 @@ __global__ void Marlin(
   int4* sh_g_idx = sh_b + (stages * b_sh_stage);
   int4* sh_zp = sh_g_idx + (stages * g_idx_stage);
   int4* sh_s = sh_zp + (stages * zp_sh_stage);
+  int4* sh_red = sh_s + (stages * s_sh_stage);
 
   // Register storage for double buffer of shared memory reads.
   FragA frag_a[2][thread_m_blocks];
@@ -932,11 +933,11 @@ __global__ void Marlin(
           int4* sh_s_stage = sh_s + s_sh_stage * pipe;
 
           if constexpr (group_blocks >= thread_k_blocks) {
+            if (s_sh_wr_pred) {
+              cp_async4(&sh_s_stage[s_sh_wr], &scales_ptr[s_gl_rd]);
+            }
             // Only fetch scales if this tile starts a new group
-            if (pipe % (group_blocks / thread_k_blocks) == 0) {
-              if (s_sh_wr_pred) {
-                cp_async4(&sh_s_stage[s_sh_wr], &scales_ptr[s_gl_rd]);
-              }
+            if ((pipe + 1) % (group_blocks / thread_k_blocks) == 0) {
               s_gl_rd += s_gl_rd_delta;
             }
           } else {
@@ -1038,9 +1039,7 @@ __global__ void Marlin(
       // No act-order case
       if constexpr (group_blocks != -1) {
         if constexpr (group_blocks >= thread_k_blocks) {
-          int4* sh_s_stage =
-              sh_s + s_sh_stage * ((group_blocks / thread_k_blocks) *
-                                   (pipe / (group_blocks / thread_k_blocks)));
+          int4* sh_s_stage = sh_s + s_sh_stage * pipe;
           reinterpret_cast<int4*>(&frag_s[k % 2])[0] = sh_s_stage[s_sh_rd];
         } else {
           int warp_id = threadIdx.x / 32;
@@ -1339,15 +1338,15 @@ __global__ void Marlin(
               int red_sh_wr =
                   red_sh_delta * j + (red_sh_rd - red_sh_stride * i);
               if (i < red_off) {
-                float* c_rd =
-                    reinterpret_cast<float*>(&sh[red_sh_delta * j + red_sh_rd]);
-                float* c_wr = reinterpret_cast<float*>(&sh[red_sh_wr]);
+                float* c_rd = reinterpret_cast<float*>(
+                    &sh_red[red_sh_delta * j + red_sh_rd]);
+                float* c_wr = reinterpret_cast<float*>(&sh_red[red_sh_wr]);
   #pragma unroll
                 for (int k = 0; k < 4; k++)
                   reinterpret_cast<FragC*>(frag_c)[4 * 2 * m_block + j][k] +=
                       c_rd[k] + c_wr[k];
               }
-              sh[red_sh_wr] =
+              sh_red[red_sh_wr] =
                   reinterpret_cast<int4*>(&frag_c)[4 * 2 * m_block + j];
             }
           }
@@ -1357,7 +1356,7 @@ __global__ void Marlin(
   #pragma unroll
           for (int i = 0; i < 4 * 2; i++) {
             float* c_rd =
-                reinterpret_cast<float*>(&sh[red_sh_delta * i + red_sh_rd]);
+                reinterpret_cast<float*>(&sh_red[red_sh_delta * i + red_sh_rd]);
   #pragma unroll
             for (int j = 0; j < 4; j++)
               reinterpret_cast<FragC*>(frag_c)[4 * 2 * m_block + i][j] +=
@@ -1397,7 +1396,7 @@ __global__ void Marlin(
   #pragma unroll
         for (int i = 0; i < thread_m_blocks * 4; i++) {
           cp_async4_pred(
-              &sh[c_sh_wr + c_sh_wr_delta * i],
+              &sh_red[c_sh_wr + c_sh_wr_delta * i],
               &C[c_gl_wr + c_gl_wr_delta_o * (i / 2) +
                  c_gl_wr_delta_i * (i % 2)],
               i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m);
@@ -1410,7 +1409,7 @@ __global__ void Marlin(
       for (int i = 0; i < thread_m_blocks * 4; i++) {
         if (i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m) {
           if (!first) {
-            int4 c_red = sh[c_sh_wr + i * c_sh_wr_delta];
+            int4 c_red = sh_red[c_sh_wr + i * c_sh_wr_delta];
   #pragma unroll
             for (int j = 0; j < 2 * 4; j++) {
               reinterpret_cast<float*>(
@@ -1461,10 +1460,10 @@ __global__ void Marlin(
       float* frag_c_ptr = reinterpret_cast<float*>(&frag_c);
   #pragma unroll
       for (int k = 0; k < th_size; k++) {
-        sh[threadIdx.x] =
+        sh_red[threadIdx.x] =
             C_tmp[c_cur_offset + active_threads * k + threadIdx.x];
 
-        float* sh_c_ptr = reinterpret_cast<float*>(&sh[threadIdx.x]);
+        float* sh_c_ptr = reinterpret_cast<float*>(&sh_red[threadIdx.x]);
   #pragma unroll
         for (int f = 0; f < 4; f++) {
           frag_c_ptr[k * 4 + f] += sh_c_ptr[f];
@@ -1515,7 +1514,7 @@ __global__ void Marlin(
         res = __hmul2(res, s[0]);
       }
 
-      ((scalar_t2*)sh)[idx] = res;
+      ((scalar_t2*)sh_red)[idx] = res;
     };
 
     if (threadIdx.x / 32 < thread_n_blocks / 4) {
@@ -1543,7 +1542,7 @@ __global__ void Marlin(
          i < div_ceil(16 * thread_m_blocks, threads / (2 * thread_n_blocks));
          i++) {
       if (c_gl_wr < c_gl_wr_end) {
-        C[c_gl_wr] = sh[c_sh_rd];
+        C[c_gl_wr] = sh_red[c_sh_rd];
         c_gl_wr += c_gl_wr_delta;
         c_sh_rd += c_sh_rd_delta;
       }
@@ -1865,9 +1864,12 @@ bool is_valid_cache_size(thread_config_t const& th_config, int max_m_blocks,
 
   float pipe_size = (a_size + b_size) * pipe_stages;
 
+  float reduce_size = max(th_config.num_threads * 32 * 4,
+                          (tb_n / 64) * 32 * (tb_max_m / 16) * 4 * 2 * 4 * 2);
+
   TORCH_CHECK(max_shared_mem / 2 > scales_cache_size);  // Sanity
 
-  return pipe_size < 0.95f * (max_shared_mem - scales_cache_size);
+  return pipe_size + reduce_size < 0.95f * (max_shared_mem - scales_cache_size);
 }
 
 bool is_valid_config(thread_config_t const& th_config, int max_m_blocks,

From 68d37809b9b52f4d012fa0dfbb187f0fe978bdbc Mon Sep 17 00:00:00 2001
From: Nathan Azrak <42650258+nathan-az@users.noreply.github.com>
Date: Fri, 3 Jan 2025 10:59:25 +1100
Subject: [PATCH 056/462] [Misc] Minimum requirements for SageMaker
 compatibility (#11576)

---
 Dockerfile                            | 13 +++++-
 examples/sagemaker-entrypoint.sh      | 24 +++++++++++
 vllm/entrypoints/openai/api_server.py | 61 ++++++++++++++++++++++++++-
 3 files changed, 95 insertions(+), 3 deletions(-)
 create mode 100644 examples/sagemaker-entrypoint.sh

diff --git a/Dockerfile b/Dockerfile
index 153bff9cf565f..088314eb38dbe 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -234,8 +234,8 @@ RUN mv vllm test_docs/
 #################### TEST IMAGE ####################
 
 #################### OPENAI API SERVER ####################
-# openai api server alternative
-FROM vllm-base AS vllm-openai
+# base openai image with additional requirements, for any subsequent openai-style images
+FROM vllm-base AS vllm-openai-base
 
 # install additional dependencies for openai api server
 RUN --mount=type=cache,target=/root/.cache/pip \
@@ -247,5 +247,14 @@ RUN --mount=type=cache,target=/root/.cache/pip \
 
 ENV VLLM_USAGE_SOURCE production-docker-image
 
+# define sagemaker first, so it is not default from `docker build`
+FROM vllm-openai-base AS vllm-sagemaker
+
+COPY examples/sagemaker-entrypoint.sh .
+RUN chmod +x sagemaker-entrypoint.sh
+ENTRYPOINT ["./sagemaker-entrypoint.sh"]
+
+FROM vllm-openai-base AS vllm-openai
+
 ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
 #################### OPENAI API SERVER ####################
diff --git a/examples/sagemaker-entrypoint.sh b/examples/sagemaker-entrypoint.sh
new file mode 100644
index 0000000000000..75a99ffc1f155
--- /dev/null
+++ b/examples/sagemaker-entrypoint.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+
+# Define the prefix for environment variables to look for
+PREFIX="SM_VLLM_"
+ARG_PREFIX="--"
+
+# Initialize an array for storing the arguments
+# port 8080 required by sagemaker, https://docs.aws.amazon.com/sagemaker/latest/dg/your-algorithms-inference-code.html#your-algorithms-inference-code-container-response
+ARGS=(--port 8080)
+
+# Loop through all environment variables
+while IFS='=' read -r key value; do
+    # Remove the prefix from the key, convert to lowercase, and replace underscores with dashes
+    arg_name=$(echo "${key#"${PREFIX}"}" | tr '[:upper:]' '[:lower:]' | tr '_' '-')
+
+    # Add the argument name and value to the ARGS array
+    ARGS+=("${ARG_PREFIX}${arg_name}")
+    if [ -n "$value" ]; then
+        ARGS+=("$value")
+    fi
+done < <(env | grep "^${PREFIX}")
+
+# Pass the collected arguments to the main entrypoint
+exec python3 -m vllm.entrypoints.openai.api_server "${ARGS[@]}"
\ No newline at end of file
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 74fe378fdae42..e942b475535ad 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -16,7 +16,7 @@
 from typing import AsyncIterator, Optional, Set, Tuple
 
 import uvloop
-from fastapi import APIRouter, FastAPI, Request
+from fastapi import APIRouter, FastAPI, HTTPException, Request
 from fastapi.exceptions import RequestValidationError
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse, Response, StreamingResponse
@@ -44,11 +44,15 @@
                                               CompletionResponse,
                                               DetokenizeRequest,
                                               DetokenizeResponse,
+                                              EmbeddingChatRequest,
+                                              EmbeddingCompletionRequest,
                                               EmbeddingRequest,
                                               EmbeddingResponse,
                                               EmbeddingResponseData,
                                               ErrorResponse,
                                               LoadLoraAdapterRequest,
+                                              PoolingChatRequest,
+                                              PoolingCompletionRequest,
                                               PoolingRequest, PoolingResponse,
                                               ScoreRequest, ScoreResponse,
                                               TokenizeRequest,
@@ -310,6 +314,12 @@ async def health(raw_request: Request) -> Response:
     return Response(status_code=200)
 
 
+@router.api_route("/ping", methods=["GET", "POST"])
+async def ping(raw_request: Request) -> Response:
+    """Ping check. Endpoint required for SageMaker"""
+    return await health(raw_request)
+
+
 @router.post("/tokenize")
 @with_cancellation
 async def tokenize(request: TokenizeRequest, raw_request: Request):
@@ -483,6 +493,54 @@ async def create_score_v1(request: ScoreRequest, raw_request: Request):
     return await create_score(request, raw_request)
 
 
+TASK_HANDLERS = {
+    "generate": {
+        "messages": (ChatCompletionRequest, create_chat_completion),
+        "default": (CompletionRequest, create_completion),
+    },
+    "embed": {
+        "messages": (EmbeddingChatRequest, create_embedding),
+        "default": (EmbeddingCompletionRequest, create_embedding),
+    },
+    "score": {
+        "default": (ScoreRequest, create_score),
+    },
+    "reward": {
+        "messages": (PoolingChatRequest, create_pooling),
+        "default": (PoolingCompletionRequest, create_pooling),
+    },
+    "classify": {
+        "messages": (PoolingChatRequest, create_pooling),
+        "default": (PoolingCompletionRequest, create_pooling),
+    },
+}
+
+
+@router.post("/invocations")
+async def invocations(raw_request: Request):
+    """
+    For SageMaker, routes requests to other handlers based on model `task`.
+    """
+    body = await raw_request.json()
+    task = raw_request.app.state.task
+
+    if task not in TASK_HANDLERS:
+        raise HTTPException(
+            status_code=400,
+            detail=f"Unsupported task: '{task}' for '/invocations'. "
+            f"Expected one of {set(TASK_HANDLERS.keys())}")
+
+    handler_config = TASK_HANDLERS[task]
+    if "messages" in body:
+        request_model, handler = handler_config["messages"]
+    else:
+        request_model, handler = handler_config["default"]
+
+    # this is required since we lose the FastAPI automatic casting
+    request = request_model.model_validate(body)
+    return await handler(request, raw_request)
+
+
 if envs.VLLM_TORCH_PROFILER_DIR:
     logger.warning(
         "Torch Profiler is enabled in the API server. This should ONLY be "
@@ -687,6 +745,7 @@ def init_app_state(
         chat_template=resolved_chat_template,
         chat_template_content_format=args.chat_template_content_format,
     )
+    state.task = model_config.task
 
 
 def create_server_socket(addr: Tuple[str, int]) -> socket.socket:

From 2f1e8e8f54032e38998e90427aedf649c0beee39 Mon Sep 17 00:00:00 2001
From: Sachin Varghese <sachin.mathew31@gmail.com>
Date: Thu, 2 Jan 2025 19:25:53 -0500
Subject: [PATCH 057/462] Update default max_num_batch_tokens for chunked
 prefill (#11694)

---
 docs/source/usage/performance.md | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/docs/source/usage/performance.md b/docs/source/usage/performance.md
index f028e28627a9f..2cd3801bfc82d 100644
--- a/docs/source/usage/performance.md
+++ b/docs/source/usage/performance.md
@@ -32,8 +32,8 @@ You can enable the feature by specifying `--enable-chunked-prefill` in the comma
 ```python
 llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_chunked_prefill=True)
 # Set max_num_batched_tokens to tune performance.
-# NOTE: 512 is the default max_num_batched_tokens for chunked prefill.
-# llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_chunked_prefill=True, max_num_batched_tokens=512)
+# NOTE: 2048 is the default max_num_batched_tokens for chunked prefill.
+# llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_chunked_prefill=True, max_num_batched_tokens=2048)
 ```
 
 By default, vLLM scheduler prioritizes prefills and doesn't batch prefill and decode to the same batch.
@@ -49,13 +49,12 @@ This policy has two benefits:
 - It improves ITL and generation decode because decode requests are prioritized.
 - It helps achieve better GPU utilization by locating compute-bound (prefill) and memory-bound (decode) requests to the same batch.
 
-You can tune the performance by changing `max_num_batched_tokens`.
-By default, it is set to 512, which has the best ITL on A100 in the initial benchmark (llama 70B and mixtral 8x22B).
+You can tune the performance by changing `max_num_batched_tokens`. By default, it is set to 2048.
 Smaller `max_num_batched_tokens` achieves better ITL because there are fewer prefills interrupting decodes.
 Higher `max_num_batched_tokens` achieves better TTFT as you can put more prefill to the batch.
 
 - If `max_num_batched_tokens` is the same as `max_model_len`, that's almost the equivalent to the default scheduling policy (except that it still prioritizes decodes).
-- Note that the default value (512) of `max_num_batched_tokens` is optimized for ITL, and it may have lower throughput than the default scheduler.
+- Note that the default value (2048) of `max_num_batched_tokens` is optimized for ITL, and it may have lower throughput than the default scheduler.
 
 We recommend you set `max_num_batched_tokens > 2048` for throughput.
 

From 07064cb1d49d2b04ec58d8876bee2cd8281eedf5 Mon Sep 17 00:00:00 2001
From: Lu Fang <30275821+houseroad@users.noreply.github.com>
Date: Thu, 2 Jan 2025 16:58:56 -0800
Subject: [PATCH 058/462] [Bugfix] Check chain_speculative_sampling before
 calling it (#11673)

Signed-off-by: Lu Fang <lufang@fb.com>
---
 vllm/model_executor/layers/rejection_sampler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/layers/rejection_sampler.py b/vllm/model_executor/layers/rejection_sampler.py
index 165e8309fee64..f173cbde03f44 100644
--- a/vllm/model_executor/layers/rejection_sampler.py
+++ b/vllm/model_executor/layers/rejection_sampler.py
@@ -118,7 +118,7 @@ def forward(
 
         # If use Flashinfer chain_speculative_sampling kernel
         # for rejection sampling
-        if self.use_flashinfer:
+        if self.use_flashinfer and chain_speculative_sampling is not None:
             batch_size, k, _ = draft_probs.shape
             uniform_samples = self._create_uniform_samples(
                 seeded_seqs, batch_size, k, draft_probs.device)

From fd3a62a122fcbc9331d000b325e72687629ef1bd Mon Sep 17 00:00:00 2001
From: "Kevin H. Luu" <kevin@anyscale.com>
Date: Fri, 3 Jan 2025 13:38:37 +0700
Subject: [PATCH 059/462] [perf-benchmark] Fix dependency for steps in
 benchmark pipeline (#11710)

---
 .buildkite/nightly-benchmarks/benchmark-pipeline.yaml | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
index 868b8e95db01d..679abf1814aa5 100644
--- a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
@@ -1,5 +1,6 @@
 steps:
   - label: "Wait for container to be ready"
+    key: wait-for-container-image
     agents:
       queue: A100
     plugins:
@@ -10,12 +11,11 @@ steps:
             command:
             - sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
 
-  - wait
-
   - label: "A100"
     # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
     agents:
       queue: A100
+    depends_on: wait-for-container-image
     plugins:
     - kubernetes:
         podSpec:
@@ -49,6 +49,7 @@ steps:
     # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
     agents:
       queue: H200
+    depends_on: wait-for-container-image
     plugins:
     - docker#v5.12.0:
         image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
@@ -73,7 +74,7 @@ steps:
     # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
     agents:
       queue: H100
-    depends_on: ~
+    depends_on: wait-for-container-image
     plugins:
     - docker#v5.12.0:
         image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT

From e1a5c2f0a123835558b1b1c9895181161527c55e Mon Sep 17 00:00:00 2001
From: Aurick Qiao <aurickq@users.noreply.github.com>
Date: Fri, 3 Jan 2025 03:39:19 -0500
Subject: [PATCH 060/462] [Model] Whisper model implementation (#11280)

Co-authored-by: Aurick Qiao <aurick.qiao@snowflake.com>
---
 .buildkite/test-pipeline.yaml                 |   2 +
 examples/offline_inference_whisper.py         |  59 ++
 .../audio_language/__init__.py                |   0
 .../audio_language/test_whisper.py            | 136 ++++
 tests/models/registry.py                      |   1 +
 vllm/config.py                                |   2 +
 vllm/inputs/preprocess.py                     |  36 +-
 vllm/model_executor/models/registry.py        |   1 +
 vllm/model_executor/models/whisper.py         | 737 ++++++++++++++++++
 vllm/multimodal/processing.py                 |  28 +-
 vllm/sequence.py                              |  18 +-
 vllm/transformers_utils/tokenizer.py          |  19 +
 .../tokenizer_group/base_tokenizer_group.py   |   6 +-
 .../tokenizer_group/ray_tokenizer_group.py    |  28 +-
 .../tokenizer_group/tokenizer_group.py        |  16 +-
 vllm/worker/enc_dec_model_runner.py           |  11 +-
 16 files changed, 1045 insertions(+), 55 deletions(-)
 create mode 100644 examples/offline_inference_whisper.py
 create mode 100644 tests/models/encoder_decoder/audio_language/__init__.py
 create mode 100644 tests/models/encoder_decoder/audio_language/test_whisper.py
 create mode 100644 vllm/model_executor/models/whisper.py

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index c6f8316412e2f..529daf54faecf 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -363,12 +363,14 @@ steps:
   - tests/models/decoder_only/audio_language
   - tests/models/decoder_only/vision_language
   - tests/models/embedding/vision_language
+  - tests/models/encoder_decoder/audio_language
   - tests/models/encoder_decoder/vision_language
   commands:
     - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
     - pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
     - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model'
     - pytest -v -s models/embedding/vision_language -m core_model
+    - pytest -v -s models/encoder_decoder/audio_language -m core_model
     - pytest -v -s models/encoder_decoder/language -m core_model
     - pytest -v -s models/encoder_decoder/vision_language -m core_model
 
diff --git a/examples/offline_inference_whisper.py b/examples/offline_inference_whisper.py
new file mode 100644
index 0000000000000..087ad4376fb2e
--- /dev/null
+++ b/examples/offline_inference_whisper.py
@@ -0,0 +1,59 @@
+import time
+
+from vllm import LLM, SamplingParams
+from vllm.assets.audio import AudioAsset
+
+# Create a Whisper encoder/decoder model instance
+llm = LLM(
+    model="openai/whisper-large-v3",
+    max_model_len=448,
+    max_num_seqs=400,
+    limit_mm_per_prompt={"audio": 1},
+    kv_cache_dtype="fp8",
+)
+
+prompts = [
+    {
+        "prompt": "<|startoftranscript|>",
+        "multi_modal_data": {
+            "audio": AudioAsset("mary_had_lamb").audio_and_sample_rate,
+        },
+    },
+    {  # Test explicit encoder/decoder prompt
+        "encoder_prompt": {
+            "prompt": "",
+            "multi_modal_data": {
+                "audio": AudioAsset("winning_call").audio_and_sample_rate,
+            },
+        },
+        "decoder_prompt": "<|startoftranscript|>",
+    }
+] * 1024
+
+# Create a sampling params object.
+sampling_params = SamplingParams(
+    temperature=0,
+    top_p=1.0,
+    max_tokens=200,
+)
+
+start = time.time()
+
+# Generate output tokens from the prompts. The output is a list of
+# RequestOutput objects that contain the prompt, generated
+# text, and other information.
+outputs = llm.generate(prompts, sampling_params)
+
+# Print the outputs.
+for output in outputs:
+    prompt = output.prompt
+    encoder_prompt = output.encoder_prompt
+    generated_text = output.outputs[0].text
+    print(f"Encoder prompt: {encoder_prompt!r}, "
+          f"Decoder prompt: {prompt!r}, "
+          f"Generated text: {generated_text!r}")
+
+duration = time.time() - start
+
+print("Duration:", duration)
+print("RPS:", len(prompts) / duration)
diff --git a/tests/models/encoder_decoder/audio_language/__init__.py b/tests/models/encoder_decoder/audio_language/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/tests/models/encoder_decoder/audio_language/test_whisper.py b/tests/models/encoder_decoder/audio_language/test_whisper.py
new file mode 100644
index 0000000000000..eb238c5332139
--- /dev/null
+++ b/tests/models/encoder_decoder/audio_language/test_whisper.py
@@ -0,0 +1,136 @@
+"""Compare the outputs of HF and vLLM for Whisper models using greedy sampling.
+
+Run `pytest tests/models/encoder_decoder/audio/test_whisper.py`.
+"""
+from typing import Optional
+
+import pytest
+
+from vllm import LLM, SamplingParams
+from vllm.assets.audio import AudioAsset
+
+from ....utils import fork_new_process_for_each_test, multi_gpu_test
+
+PROMPTS = [
+    {
+        "prompt":
+        "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>",
+        "multi_modal_data": {
+            "audio": AudioAsset("mary_had_lamb").audio_and_sample_rate,
+        },
+    },
+    {  # Test explicit encoder/decoder prompt
+        "encoder_prompt": {
+            "prompt": "",
+            "multi_modal_data": {
+                "audio": AudioAsset("winning_call").audio_and_sample_rate,
+            },
+        },
+        "decoder_prompt":
+        "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>",
+    }
+]
+
+EXPECTED = {
+    "openai/whisper-tiny": [
+        " He has birth words I spoke in the original corner of that. And a"
+        " little piece of black coat poetry. Mary had a little sandwich,"
+        " sweet, with white and snow. And everyone had it very went the last"
+        " would sure to go.",
+        " >> And the old one, fit John the way to Edgar Martinez. >> One more"
+        " to line down the field line for our base camp. Here comes joy. Here"
+        " is June and the third base. They're going to wave him in. The throw"
+        " to the plate will be late. The Mariners are going to play for the"
+        " American League Championship. I don't believe it. It just continues"
+        " by all five."
+    ],
+    "openai/whisper-small": [
+        " The first words I spoke in the original pornograph. A little piece"
+        " of practical poetry. Mary had a little lamb, its fleece was quite a"
+        " slow, and everywhere that Mary went the lamb was sure to go.",
+        " And the old one pitch on the way to Edgar Martinez one month. Here"
+        " comes joy. Here is Junior to third base. They're gonna wave him"
+        " in. The throw to the plate will be late. The Mariners are going to"
+        " play for the American League Championship. I don't believe it. It"
+        " just continues. My, oh my."
+    ],
+    "openai/whisper-medium": [
+        " The first words I spoke in the original phonograph, a little piece"
+        " of practical poetry. Mary had a little lamb, its fleece was quite as"
+        " slow, and everywhere that Mary went the lamb was sure to go.",
+        " And the 0-1 pitch on the way to Edgar Martinez swung on the line"
+        " down the left field line for Obeyshev. Here comes Joy. Here is"
+        " Jorgen at third base. They're going to wave him in. The throw to the"
+        " plate will be late. The Mariners are going to play for the American"
+        " League Championship. I don't believe it. It just continues. My, oh"
+        " my."
+    ],
+    "openai/whisper-large-v3": [
+        " The first words I spoke in the original phonograph, a little piece"
+        " of practical poetry. Mary had a little lamb, its feet were quite as"
+        " slow, and everywhere that Mary went, the lamb was sure to go.",
+        " And the 0-1 pitch on the way to Edgar Martinez. Swung on the line."
+        " Now the left field line for a base hit. Here comes Joy. Here is"
+        " Junior to third base. They're going to wave him in. The throw to the"
+        " plate will be late. The Mariners are going to play for the American"
+        " League Championship. I don't believe it. It just continues. My, oh,"
+        " my."
+    ],
+    "openai/whisper-large-v3-turbo": [
+        " The first words I spoke in the original phonograph, a little piece"
+        " of practical poetry. Mary had a little lamb, its streets were quite"
+        " as slow, and everywhere that Mary went the lamb was sure to go.",
+        " And the 0-1 pitch on the way to Edgar Martinez. Swung on the line"
+        " down the left field line for a base hit. Here comes Joy. Here is"
+        " Junior to third base. They're going to wave him in. The throw to the"
+        " plate will be late. The Mariners are going to play for the American"
+        " League Championship. I don't believe it. It just continues. My, oh,"
+        " my."
+    ]
+}
+
+
+def run_test(
+    model: str,
+    *,
+    tensor_parallel_size: int,
+    distributed_executor_backend: Optional[str] = None,
+) -> None:
+    prompt_list = PROMPTS * 10
+    expected_list = EXPECTED[model] * 10
+
+    llm = LLM(
+        model=model,
+        tensor_parallel_size=tensor_parallel_size,
+        distributed_executor_backend=distributed_executor_backend,
+    )
+
+    sampling_params = SamplingParams(
+        temperature=0,
+        top_p=1.0,
+        max_tokens=200,
+    )
+
+    outputs = llm.generate(prompt_list, sampling_params)
+
+    for output, expected in zip(outputs, expected_list):
+        print(output.outputs[0].text)
+        assert output.outputs[0].text == expected
+
+
+@fork_new_process_for_each_test
+@pytest.mark.core_model
+@pytest.mark.parametrize(
+    "model", ["openai/whisper-small", "openai/whisper-large-v3-turbo"])
+def test_models(model) -> None:
+    run_test(model, tensor_parallel_size=1)
+
+
+@multi_gpu_test(num_gpus=2)
+@pytest.mark.core_model
+@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
+@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
+def test_models_distributed(model, distributed_executor_backend) -> None:
+    run_test(model,
+             tensor_parallel_size=2,
+             distributed_executor_backend=distributed_executor_backend)
diff --git a/tests/models/registry.py b/tests/models/registry.py
index e5dfb2822745d..dcb8bfa0f9510 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -204,6 +204,7 @@ class _HfExamplesInfo:
     "UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_3"),
     # [Encoder-decoder]
     "MllamaForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-3.2-11B-Vision-Instruct"),  # noqa: E501
+    "WhisperForConditionalGeneration": _HfExamplesInfo("openai/whisper-large-v3"),  # noqa: E501
 }
 
 _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
diff --git a/vllm/config.py b/vllm/config.py
index e72c53b6130d0..b51f9783008b2 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -2312,6 +2312,8 @@ def _get_and_verify_max_len(
         "seq_length",
         # Command-R
         "model_max_length",
+        # Whisper
+        "max_target_positions",
         # Others
         "max_sequence_length",
         "max_seq_length",
diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py
index aaa10d278ddb0..b362ee0cac328 100644
--- a/vllm/inputs/preprocess.py
+++ b/vllm/inputs/preprocess.py
@@ -184,10 +184,16 @@ def _tokenize_prompt(
         corresponding token IDs.
         """
         tokenizer = self.get_tokenizer_group()
-
+        add_special_tokens = None
+        if self.model_config.hf_config.model_type == "whisper":
+            # For Whisper, special tokens should be provided by the user based
+            # on the task and language of their request. Also needed to avoid
+            # appending an EOS token to the prompt which disrupts generation.
+            add_special_tokens = False
         return tokenizer.encode(request_id=request_id,
                                 prompt=prompt,
-                                lora_request=lora_request)
+                                lora_request=lora_request,
+                                add_special_tokens=add_special_tokens)
 
     async def _tokenize_prompt_async(
         self,
@@ -197,10 +203,17 @@ async def _tokenize_prompt_async(
     ) -> List[int]:
         """Async version of :meth:`_tokenize_prompt`."""
         tokenizer = self.get_tokenizer_group()
-
-        return await tokenizer.encode_async(request_id=request_id,
-                                            prompt=prompt,
-                                            lora_request=lora_request)
+        add_special_tokens = None
+        if self.model_config.hf_config.model_type == "whisper":
+            # For Whisper, special tokens should be provided by the user based
+            # on the task and language of their request. Also needed to avoid
+            # appending an EOS token to the prompt which disrupts generation.
+            add_special_tokens = False
+        return await tokenizer.encode_async(
+            request_id=request_id,
+            prompt=prompt,
+            lora_request=lora_request,
+            add_special_tokens=add_special_tokens)
 
     def _can_process_multimodal(self) -> bool:
         model_config = self.model_config
@@ -439,8 +452,15 @@ def _build_enc_dec_llm_inputs(
             assert_never(encoder_inputs)  # type: ignore[arg-type]
 
         if decoder_inputs is None:
-            dec_token_ids = self._prepare_decoder_input_ids_for_generation(
-                None)
+            if self.model_config.hf_config.model_type == "whisper":
+                # For Whisper models, the text prompt should go to the decoder.
+                # If no explicit encoder/decoder inputs, then copy the prompt
+                # from the encoder to the decoder. The encoder tokens are later
+                # overridden by the audio features.
+                dec_token_ids = encoder_inputs["prompt_token_ids"].copy()
+            else:
+                dec_token_ids = self._prepare_decoder_input_ids_for_generation(
+                    None)
             decoder_inputs = token_inputs(dec_token_ids)
         elif (decoder_inputs["type"] == "token"
               or decoder_inputs["type"] == "multimodal"):
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index 07f4b5a3b3bc8..62840b8c1bcda 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -170,6 +170,7 @@
     "UltravoxModel": ("ultravox", "UltravoxModel"),
     # [Encoder-decoder]
     "MllamaForConditionalGeneration": ("mllama", "MllamaForConditionalGeneration"),  # noqa: E501
+    "WhisperForConditionalGeneration": ("whisper", "WhisperForConditionalGeneration"),  # noqa: E501
 }
 
 _SPECULATIVE_DECODING_MODELS = {
diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py
new file mode 100644
index 0000000000000..cb54b4c3ba663
--- /dev/null
+++ b/vllm/model_executor/models/whisper.py
@@ -0,0 +1,737 @@
+import math
+from typing import (Iterable, List, Mapping, Optional, Set, Tuple, TypedDict,
+                    Union)
+
+import numpy as np
+import torch
+from torch import nn
+from transformers.models.whisper.modeling_whisper import sinusoids
+
+from vllm.attention import Attention, AttentionMetadata, AttentionType
+from vllm.config import CacheConfig, VllmConfig
+from vllm.distributed import get_tensor_model_parallel_world_size
+from vllm.inputs import INPUT_REGISTRY, DummyData, InputContext
+from vllm.logger import init_logger
+from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.linear import (ColumnParallelLinear,
+                                               QKVParallelLinear,
+                                               RowParallelLinear)
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.quantization.base_config import (
+    QuantizationConfig)
+from vllm.model_executor.layers.sampler import Sampler, SamplerOutput
+from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalKwargs,
+                             NestedTensors)
+from vllm.multimodal.audio import resample_audio
+from vllm.sequence import SequenceData
+from vllm.transformers_utils.processor import cached_get_processor
+
+from .interfaces import SupportsMultiModal
+from .utils import AutoWeightsLoader, WeightsMapper, make_layers
+
+logger = init_logger(__name__)
+
+
+class WhisperAudioInputs(TypedDict):
+    input_features: NestedTensors
+    """Shape: `(batch_size, 128, M)`"""
+
+
+class WhisperPositionalEmbedding(nn.Embedding):
+
+    def __init__(self,
+                 num_positions: int,
+                 embedding_dim: int,
+                 padding_idx: Optional[int] = None):
+        super().__init__(num_positions, embedding_dim)
+
+    def forward(self, position_ids):
+        return self.weight[position_ids]
+
+
+class WhisperAttention(nn.Module):
+
+    def __init__(
+        self,
+        embed_dim: int,
+        num_heads: int,
+        bias: bool = True,
+        attn_type: AttentionType = AttentionType.DECODER,
+        cache_config: Optional[CacheConfig] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.embed_dim = embed_dim
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        if self.total_num_heads >= tp_size:
+            # Number of heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_heads % tp_size == 0
+        else:
+            # Number of heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_heads == 0
+        self.num_kv_heads = max(1, self.total_num_heads // tp_size)
+        self.head_dim = self.embed_dim // self.total_num_heads
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.attn_type = attn_type
+
+        if (self.head_dim * num_heads) != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: "
+                f"{self.embed_dim} and `num_heads`: {num_heads}).")
+        self.scaling = self.head_dim**-0.5
+
+        self._init_qkv(embed_dim, bias, quant_config, prefix=prefix)
+        self.out_proj = RowParallelLinear(
+            input_size=embed_dim,
+            output_size=embed_dim,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.out_proj",
+        )
+        self.attn = Attention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+        )
+
+    def _init_qkv(
+        self,
+        embed_dim: int,
+        bias: bool = True,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size=embed_dim,
+            head_size=self.head_dim,
+            total_num_heads=self.total_num_heads,
+            total_num_kv_heads=self.total_num_heads,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        kv_cache: torch.Tensor,
+        attn_metadata: AttentionMetadata,
+    ):
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+
+        attn_output = self.attn(q,
+                                k,
+                                v,
+                                kv_cache,
+                                attn_metadata,
+                                attn_type=self.attn_type)
+
+        output, _ = self.out_proj(attn_output)
+
+        return output
+
+
+class WhisperCrossAttention(WhisperAttention):
+
+    def __init__(
+        self,
+        embed_dim: int,
+        num_heads: int,
+        bias: bool = True,
+        cache_config: Optional[CacheConfig] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__(
+            embed_dim=embed_dim,
+            num_heads=num_heads,
+            bias=bias,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=prefix,
+        )
+
+    def _init_qkv(
+        self,
+        embed_dim: int,
+        bias: bool = True,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        self.q_proj = ColumnParallelLinear(
+            input_size=embed_dim,
+            output_size=embed_dim,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.q_proj",
+        )
+        self.kv_proj = QKVParallelLinear(
+            hidden_size=embed_dim,
+            head_size=self.head_dim,
+            total_num_heads=0,
+            total_num_kv_heads=self.total_num_heads,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.kv_proj",
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor],
+        kv_cache: torch.Tensor,
+        attn_metadata: AttentionMetadata,
+    ):
+        q, _ = self.q_proj(hidden_states)
+
+        # Encoder hidden states are only computed once during prefill phase.
+        # Afterwards, the keys and values should be available in the kv-cache.
+        if encoder_hidden_states is not None:
+            kv, _ = self.kv_proj(encoder_hidden_states)
+            k, v = kv.split([self.kv_size, self.kv_size], dim=-1)
+        else:
+            k = v = None
+
+        attn_output = self.attn(q,
+                                k,
+                                v,
+                                kv_cache,
+                                attn_metadata,
+                                attn_type=AttentionType.ENCODER_DECODER)
+
+        output, _ = self.out_proj(attn_output)
+
+        return output
+
+
+class WhisperMLP(nn.Module):
+
+    def __init__(
+        self,
+        embed_dim: int,
+        ffn_dim: int,
+        act_fn: str,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+
+        self.activation_fn = get_act_fn(act_fn)
+        self.fc1 = ColumnParallelLinear(
+            input_size=embed_dim,
+            output_size=ffn_dim,
+            quant_config=quant_config,
+            prefix=f"{prefix}.fc1",
+        )
+        self.fc2 = RowParallelLinear(
+            input_size=ffn_dim,
+            output_size=embed_dim,
+            quant_config=quant_config,
+            prefix=f"{prefix}.fc2",
+        )
+
+    def forward(self, hidden_states: torch.Tensor):
+        hidden_states, _ = self.fc1(hidden_states)
+        hidden_states = self.activation_fn(hidden_states)
+        hidden_states, _ = self.fc2(hidden_states)
+        return hidden_states
+
+
+class WhisperEncoderLayer(nn.Module):
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+        config = vllm_config.model_config.hf_config
+        cache_config = vllm_config.cache_config
+        quant_config = vllm_config.quant_config
+
+        self.embed_dim = config.d_model
+        self.self_attn = WhisperAttention(
+            embed_dim=self.embed_dim,
+            num_heads=config.encoder_attention_heads,
+            attn_type=AttentionType.ENCODER,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.self_attn",
+        )
+        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+        self.mlp = WhisperMLP(
+            embed_dim=config.d_model,
+            ffn_dim=config.encoder_ffn_dim,
+            act_fn=config.activation_function,
+            quant_config=quant_config,
+            prefix=f"{prefix}.mlp",
+        )
+        self.final_layer_norm = nn.LayerNorm(self.embed_dim)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        kv_cache: torch.Tensor,
+        attn_metadata: AttentionMetadata,
+    ):
+        residual = hidden_states
+        hidden_states = self.self_attn_layer_norm(hidden_states)
+        hidden_states = self.self_attn(
+            hidden_states=hidden_states,
+            kv_cache=kv_cache,
+            attn_metadata=attn_metadata,
+        )
+        hidden_states = residual + hidden_states
+        residual = hidden_states
+        hidden_states = self.final_layer_norm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+
+        if hidden_states.isinf().any() or hidden_states.isnan().any():
+            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
+            hidden_states = torch.clamp(hidden_states,
+                                        min=-clamp_value,
+                                        max=clamp_value)
+
+        return hidden_states
+
+
+class WhisperDecoderLayer(nn.Module):
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+        config = vllm_config.model_config.hf_config
+        cache_config = vllm_config.cache_config
+        quant_config = vllm_config.quant_config
+
+        self.self_attn = WhisperAttention(
+            embed_dim=config.d_model,
+            num_heads=config.decoder_attention_heads,
+            attn_type=AttentionType.DECODER,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.self_attn",
+        )
+        self.self_attn_layer_norm = nn.LayerNorm(config.d_model)
+        self.encoder_attn = WhisperCrossAttention(
+            embed_dim=config.d_model,
+            num_heads=config.decoder_attention_heads,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.encoder_attn",
+        )
+        self.encoder_attn_layer_norm = nn.LayerNorm(config.d_model)
+        self.mlp = WhisperMLP(
+            embed_dim=config.d_model,
+            ffn_dim=config.decoder_ffn_dim,
+            act_fn=config.activation_function,
+            quant_config=quant_config,
+            prefix=f"{prefix}.mlp",
+        )
+        self.final_layer_norm = nn.LayerNorm(config.d_model)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor],
+        kv_cache: torch.Tensor,
+        attn_metadata: AttentionMetadata,
+    ):
+        residual = hidden_states
+        hidden_states = self.self_attn_layer_norm(hidden_states)
+        hidden_states = self.self_attn(hidden_states=hidden_states,
+                                       kv_cache=kv_cache,
+                                       attn_metadata=attn_metadata)
+        hidden_states = residual + hidden_states
+
+        residual = hidden_states
+        hidden_states = self.encoder_attn_layer_norm(hidden_states)
+        hidden_states = self.encoder_attn(
+            hidden_states=hidden_states,
+            encoder_hidden_states=encoder_hidden_states,
+            kv_cache=kv_cache,
+            attn_metadata=attn_metadata,
+        )
+        hidden_states = residual + hidden_states
+
+        residual = hidden_states
+        hidden_states = self.final_layer_norm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+
+        return hidden_states
+
+
+class WhisperEncoder(nn.Module):
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+        config = vllm_config.model_config.hf_config
+        embed_dim = config.d_model
+        self.num_mel_bins = config.num_mel_bins
+        self.padding_idx = config.pad_token_id
+        self.max_source_positions = config.max_source_positions
+        self.embed_scale = (math.sqrt(embed_dim)
+                            if config.scale_embedding else 1.0)
+
+        self.conv1 = nn.Conv1d(self.num_mel_bins,
+                               embed_dim,
+                               kernel_size=3,
+                               padding=1)
+        self.conv2 = nn.Conv1d(embed_dim,
+                               embed_dim,
+                               kernel_size=3,
+                               stride=2,
+                               padding=1)
+        self.embed_positions = nn.Embedding(self.max_source_positions,
+                                            embed_dim)
+        self.start_layer, self.end_layer, self.layers = make_layers(
+            config.encoder_layers,
+            lambda prefix: WhisperEncoderLayer(vllm_config=vllm_config,
+                                               prefix=f"{prefix}.layers"),
+            prefix=f"{prefix}.layers",
+        )
+        self.layer_norm = nn.LayerNorm(config.d_model)
+
+        with torch.no_grad():
+            self.embed_positions.weight.copy_(
+                sinusoids(*self.embed_positions.weight.shape))
+
+    def forward(
+        self,
+        input_features: Union[torch.Tensor, List[torch.Tensor]],
+        kv_caches: List[torch.Tensor],
+        attn_metadata: AttentionMetadata,
+    ):
+        hidden_states = []
+        for features in input_features:
+            embeds = nn.functional.gelu(self.conv1(features))
+            embeds = nn.functional.gelu(self.conv2(embeds))
+            embeds = embeds.permute(1, 0)
+            embeds = embeds + self.embed_positions.weight[:embeds.size(0), :]
+            hidden_states.append(embeds)
+        hidden_states = torch.cat(hidden_states)
+
+        for idx, encoder_layer in enumerate(self.layers):
+            hidden_states = encoder_layer(
+                hidden_states,
+                kv_cache=kv_caches[idx],
+                attn_metadata=attn_metadata,
+            )
+
+        hidden_states = self.layer_norm(hidden_states)
+        return hidden_states
+
+
+class WhisperDecoder(nn.Module):
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+        config = vllm_config.model_config.hf_config
+        self.layerdrop = config.decoder_layerdrop
+        self.padding_idx = config.pad_token_id
+        self.max_target_positions = config.max_target_positions
+        self.max_source_positions = config.max_source_positions
+        self.embed_scale = (math.sqrt(config.d_model)
+                            if config.scale_embedding else 1.0)
+
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model,
+                                         self.padding_idx)
+        self.embed_positions = WhisperPositionalEmbedding(
+            self.max_target_positions, config.d_model)
+        self.start_layer, self.end_layer, self.layers = make_layers(
+            config.decoder_layers,
+            lambda prefix: WhisperDecoderLayer(vllm_config=vllm_config,
+                                               prefix=f"{prefix}.layers"),
+            prefix=f"{prefix}.layers",
+        )
+        self.layer_norm = nn.LayerNorm(config.d_model)
+
+    def forward(
+        self,
+        input_ids,
+        positions: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor],
+        kv_caches: List[torch.Tensor],
+        attn_metadata: AttentionMetadata,
+    ):
+        inputs_embeds = self.get_input_embeddings(input_ids)
+        positions = self.embed_positions(positions)
+        hidden_states = inputs_embeds + positions
+
+        for idx, decoder_layer in enumerate(self.layers):
+            hidden_states = decoder_layer(
+                hidden_states,
+                encoder_hidden_states=encoder_hidden_states,
+                kv_cache=kv_caches[idx],
+                attn_metadata=attn_metadata,
+            )
+
+        hidden_states = self.layer_norm(hidden_states)
+        return hidden_states
+
+    def get_input_embeddings(
+        self,
+        input_ids: torch.Tensor,
+    ) -> torch.Tensor:
+        return self.embed_tokens(input_ids)
+
+
+class WhisperModel(nn.Module):
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+        self.encoder = WhisperEncoder(vllm_config=vllm_config,
+                                      prefix=f"{prefix}.encoder")
+        self.decoder = WhisperDecoder(vllm_config=vllm_config,
+                                      prefix=f"{prefix}.decoder")
+
+    def forward(
+        self,
+        input_features: Optional[Union[torch.Tensor, List[torch.Tensor]]],
+        input_ids: Optional[torch.Tensor],
+        positions: torch.Tensor,
+        kv_caches: List[torch.Tensor],
+        attn_metadata: AttentionMetadata,
+    ) -> torch.Tensor:
+        encoder_outputs = self.get_encoder_outputs(
+            input_features,
+            kv_caches=kv_caches,
+            attn_metadata=attn_metadata,
+        )
+        decoder_outputs = self.decoder(
+            input_ids=input_ids,
+            positions=positions,
+            encoder_hidden_states=encoder_outputs,
+            kv_caches=kv_caches,
+            attn_metadata=attn_metadata,
+        )
+        return decoder_outputs
+
+    def get_encoder_outputs(
+        self,
+        input_features: Optional[Union[torch.Tensor, List[torch.Tensor]]],
+        kv_caches: List[torch.Tensor],
+        attn_metadata: AttentionMetadata,
+    ) -> Optional[torch.Tensor]:
+        if input_features is None:
+            return None
+        return self.encoder(
+            input_features,
+            kv_caches=kv_caches,
+            attn_metadata=attn_metadata,
+        )
+
+    def load_weights(self, weights: Iterable[Tuple[str,
+                                                   torch.Tensor]]) -> Set[str]:
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            (".self_attn.qkv_proj", ".self_attn.q_proj", "q"),
+            (".self_attn.qkv_proj", ".self_attn.k_proj", "k"),
+            (".self_attn.qkv_proj", ".self_attn.v_proj", "v"),
+            (".encoder_attn.kv_proj", ".encoder_attn.k_proj", "k"),
+            (".encoder_attn.kv_proj", ".encoder_attn.v_proj", "v"),
+        ]
+        params_dict = dict(self.named_parameters())
+        loaded_params: Set[str] = set()
+        for name, loaded_weight in weights:
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader",
+                                        default_weight_loader)
+                weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+        return loaded_params
+
+
+def get_max_whisper_audio_tokens(ctx: InputContext) -> int:
+    return ctx.model_config.hf_config.max_source_positions
+
+
+def dummy_encoder_data_for_whisper(ctx: InputContext, seq_len: int,
+                                   mm_counts: Mapping[str, int]):
+    assert mm_counts["audio"] == 1
+    num_tokens = get_max_whisper_audio_tokens(ctx)
+    processor = cached_get_processor(ctx.model_config.model)
+    chunk_length = processor.feature_extractor.chunk_length
+    sampling_rate = processor.feature_extractor.sampling_rate
+    num_samples = chunk_length * sampling_rate
+    return DummyData(
+        SequenceData.from_prompt_token_counts((0, num_tokens)),
+        {"audio": [(np.zeros(num_samples), sampling_rate)]},
+    )
+
+
+def input_processor_for_whisper(ctx: InputContext, inputs):
+    multi_modal_data = inputs["encoder"]["multi_modal_data"]
+    if isinstance(multi_modal_data["audio"], list):
+        assert len(multi_modal_data["audio"]) == 1
+        multi_modal_data["audio"] = multi_modal_data["audio"][0]
+    # Resample and process audio
+    audio, orig_sr = multi_modal_data["audio"]
+    processor = cached_get_processor(ctx.model_config.model)
+    target_sr = processor.feature_extractor.sampling_rate
+    audio = resample_audio(audio, orig_sr=orig_sr, target_sr=target_sr)
+    multi_modal_data["audio"] = (audio, target_sr)
+    # Pre-allocate placeholder tokens in encoder sequence
+    num_tokens = get_max_whisper_audio_tokens(ctx)
+    inputs["encoder"]["prompt_token_ids"] = [0] * num_tokens
+    return inputs
+
+
+def input_mapper_for_whisper(
+    ctx: InputContext,
+    multi_modal_data: Union[np.ndarray, List[np.ndarray]],
+) -> MultiModalKwargs:
+    if not isinstance(multi_modal_data, list):
+        multi_modal_data = [multi_modal_data]
+
+    assert len(multi_modal_data) == 1
+
+    if len(multi_modal_data) == 0:
+        return MultiModalKwargs()
+
+    processor = cached_get_processor(ctx.model_config.model)
+    sampling_rate = processor.feature_extractor.sampling_rate
+
+    audios = [audio for audio, _ in multi_modal_data]
+
+    kwargs = processor(audios,
+                       sampling_rate=sampling_rate,
+                       return_tensors="pt")
+    kwargs["input_features"] = kwargs["input_features"].squeeze(0).to(
+        ctx.model_config.dtype)
+
+    return MultiModalKwargs(kwargs)
+
+
+@INPUT_REGISTRY.register_dummy_encoder_data(dummy_encoder_data_for_whisper)
+@INPUT_REGISTRY.register_input_processor(input_processor_for_whisper)
+@MULTIMODAL_REGISTRY.register_input_mapper("audio", input_mapper_for_whisper)
+@MULTIMODAL_REGISTRY.register_max_multimodal_tokens(
+    "audio", get_max_whisper_audio_tokens)
+class WhisperForConditionalGeneration(nn.Module, SupportsMultiModal):
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+        config = vllm_config.model_config.hf_config
+        quant_config = vllm_config.quant_config
+        self.config = config
+        self.dtype = vllm_config.model_config.dtype
+
+        self.model = WhisperModel(vllm_config=vllm_config, prefix=prefix)
+        self.unpadded_vocab_size = config.vocab_size
+        self.proj_out = ParallelLMHead(config.vocab_size,
+                                       config.d_model,
+                                       quant_config=quant_config)
+        self.proj_out = self.proj_out.tie_weights(
+            self.model.decoder.embed_tokens)
+        logit_scale = getattr(config, "logit_scale", 1.0)
+        self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
+                                                config.vocab_size, logit_scale)
+        self.sampler = Sampler()
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[torch.Tensor],
+        attn_metadata: AttentionMetadata,
+        **kwargs,
+    ) -> torch.Tensor:
+        audio_input = self._parse_and_validate_audio_input(**kwargs)
+        decoder_outputs = self.model(
+            input_features=audio_input["input_features"],
+            input_ids=input_ids,
+            positions=positions,
+            kv_caches=kv_caches,
+            attn_metadata=attn_metadata,
+        )
+        return decoder_outputs
+
+    def get_multimodal_embeddings(
+        self,
+        kv_caches: List[torch.Tensor],
+        attn_metadata: AttentionMetadata,
+        **kwargs,
+    ) -> Optional[NestedTensors]:
+        # TODO: This method does not obey the interface for SupportsMultiModal.
+        # Refactor this once encoder/decoder support is implemented in V1.
+        audio_input = self._parse_and_validate_audio_input(**kwargs)
+        return self.model.get_encoder_outputs(
+            audio_input["input_features"],
+            kv_caches=kv_caches,
+            attn_metadata=attn_metadata,
+        )
+
+    def get_input_embeddings(
+        self,
+        input_ids: torch.Tensor,
+        multimodal_embeddings: Optional[NestedTensors] = None,
+        attn_metadata: Optional[AttentionMetadata] = None,
+    ) -> torch.Tensor:
+        # TODO: This method just returns the decoder sequence embeddings since
+        # Whisper does not have encoder text tokens. Refactor this once
+        # encoder/decoder support is implemented in V1.
+        return self.model.decoder.get_input_embeddings(input_ids)
+
+    def _parse_and_validate_audio_input(
+            self, **kwargs: object) -> WhisperAudioInputs:
+        input_features = kwargs.pop("input_features", None)
+
+        if input_features is not None:
+            if not isinstance(input_features, (torch.Tensor, list)):
+                raise ValueError("Incorrect type of audio features. "
+                                 f"Got type: {type(input_features)}")
+            input_features = [feat.to(self.dtype) for feat in input_features]
+
+        return WhisperAudioInputs(input_features=input_features)
+
+    def compute_logits(self, hidden_states: torch.Tensor,
+                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
+        logits = self.logits_processor(self.proj_out, hidden_states,
+                                       sampling_metadata)
+        return logits
+
+    def sample(
+        self,
+        logits: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[SamplerOutput]:
+        next_tokens = self.sampler(logits, sampling_metadata)
+        return next_tokens
+
+    def load_weights(self, weights: Iterable[Tuple[str,
+                                                   torch.Tensor]]) -> Set[str]:
+        loader = AutoWeightsLoader(self, skip_prefixes=["proj_out."])
+        loaded_weights = [(name, loaded_weight)
+                          for name, loaded_weight in weights]
+        mapper = WeightsMapper({".fc1.": ".mlp.fc1.", ".fc2.": ".mlp.fc2."})
+        return loader.load_weights(loaded_weights, mapper=mapper)
\ No newline at end of file
diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index 64cdacfb4c574..eb7552176e974 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -16,7 +16,7 @@
 
 from vllm.inputs import DummyData, InputProcessingContext
 from vllm.logger import init_logger
-from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
+from vllm.transformers_utils.tokenizer import AnyTokenizer, encode_tokens
 from vllm.utils import LRUCache, flatten_2d_lists, full_groupby
 
 from .inputs import (MultiModalDataDict, MultiModalFieldConfig,
@@ -57,24 +57,6 @@ def bind(self, tokenizer: AnyTokenizer) -> "_BoundPromptReplacement":
         )
 
 
-def _encode(
-    tokenizer: AnyTokenizer,
-    text: str,
-    *,
-    add_special_tokens: bool = False,
-) -> list[int]:
-    """
-    Backend-agnostic equivalent of HF's
-    :code:`tokenizer.encode(text, add_special_tokens=...)`.
-    """
-    if isinstance(tokenizer, MistralTokenizer):
-        return tokenizer.tokenizer.encode(text,
-                                          bos=add_special_tokens,
-                                          eos=add_special_tokens)
-
-    return tokenizer.encode(text, add_special_tokens=add_special_tokens)
-
-
 @lru_cache(maxsize=2048)
 def _cached_encode(
     tokenizer: AnyTokenizer,
@@ -82,7 +64,9 @@ def _cached_encode(
     *,
     add_special_tokens: bool = False,
 ) -> list[int]:
-    return _encode(tokenizer, text, add_special_tokens=add_special_tokens)
+    return encode_tokens(tokenizer,
+                         text,
+                         add_special_tokens=add_special_tokens)
 
 
 def _decode(
@@ -983,7 +967,9 @@ def _apply_prompt_replacements(
                 mm_item_counts,
             )
 
-            token_ids = _encode(tokenizer, text)
+            token_ids = encode_tokens(tokenizer,
+                                      text,
+                                      add_special_tokens=False)
             matched_repls = [match.prompt_repl for match in text_matches]
 
         placeholders = self._find_placeholders(matched_repls, token_ids,
diff --git a/vllm/sequence.py b/vllm/sequence.py
index 034f89c0ddbe9..0157abbd2eed5 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -710,15 +710,27 @@ def token_type_ids(self) -> Optional[List[int]]:
 
     @property
     def multi_modal_data(self) -> MultiModalDataDict:
-        return self.first_seq.multi_modal_data
+        if self.first_seq.multi_modal_data:
+            return self.first_seq.multi_modal_data
+        elif self.encoder_seq is not None:
+            return self.encoder_seq.multi_modal_data
+        return {}
 
     @property
     def multi_modal_placeholders(self) -> MultiModalPlaceholderDict:
-        return self.first_seq.multi_modal_placeholders
+        if self.first_seq.multi_modal_data:
+            return self.first_seq.multi_modal_placeholders
+        elif self.encoder_seq is not None:
+            return self.encoder_seq.multi_modal_placeholders
+        return {}
 
     @property
     def mm_processor_kwargs(self) -> Dict[str, Any]:
-        return self.first_seq.mm_processor_kwargs
+        if self.first_seq.multi_modal_data:
+            return self.first_seq.mm_processor_kwargs
+        elif self.encoder_seq is not None:
+            return self.encoder_seq.mm_processor_kwargs
+        return {}
 
     @property
     def lora_int_id(self) -> int:
diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py
index e6701f4c4b835..42b2f095bc543 100644
--- a/vllm/transformers_utils/tokenizer.py
+++ b/vllm/transformers_utils/tokenizer.py
@@ -21,6 +21,25 @@
                      MistralTokenizer]
 
 
+def encode_tokens(
+    tokenizer: AnyTokenizer,
+    text: str,
+    *,
+    add_special_tokens: Optional[bool] = None,
+) -> list[int]:
+    """
+    Backend-agnostic equivalent of HF's
+    :code:`tokenizer.encode(text, add_special_tokens=...)`.
+    """
+    if isinstance(tokenizer, MistralTokenizer):
+        return tokenizer.tokenizer.encode(text,
+                                          bos=add_special_tokens,
+                                          eos=add_special_tokens)
+    elif add_special_tokens is not None:
+        return tokenizer.encode(text, add_special_tokens=add_special_tokens)
+    return tokenizer.encode(text)
+
+
 def get_cached_tokenizer(tokenizer: AnyTokenizer) -> AnyTokenizer:
     """Get tokenizer with cached properties.
 
diff --git a/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py b/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py
index 8f78ef65bbf1a..e6cc7cd4e2e3a 100644
--- a/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py
+++ b/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py
@@ -32,7 +32,8 @@ def get_max_input_len(
     def encode(self,
                prompt: str,
                request_id: Optional[str] = None,
-               lora_request: Optional[LoRARequest] = None) -> List[int]:
+               lora_request: Optional[LoRARequest] = None,
+               add_special_tokens: Optional[bool] = None) -> List[int]:
         """Encode a prompt using the tokenizer group."""
         pass
 
@@ -41,7 +42,8 @@ async def encode_async(
             self,
             prompt: str,
             request_id: Optional[str] = None,
-            lora_request: Optional[LoRARequest] = None) -> List[int]:
+            lora_request: Optional[LoRARequest] = None,
+            add_special_tokens: Optional[bool] = None) -> List[int]:
         """Encode a prompt using the tokenizer group."""
         pass
 
diff --git a/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py b/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py
index 9a999a0d6067d..3f7627e11ae5e 100644
--- a/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py
+++ b/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py
@@ -112,7 +112,8 @@ def _finalize_encode(self, actor: ray.ObjectRef,
     def encode(self,
                prompt: str,
                request_id: Optional[str] = None,
-               lora_request: Optional[LoRARequest] = None) -> List[int]:
+               lora_request: Optional[LoRARequest] = None,
+               add_special_tokens: Optional[bool] = None) -> List[int]:
         """Encode a prompt using the tokenizer group.
 
         We pick an idle actor and use it to encode the prompt.
@@ -132,7 +133,8 @@ def encode(self,
             ret = ray.get(
                 actor.encode.remote(request_id=request_id,
                                     prompt=prompt,
-                                    lora_request=lora_request))
+                                    lora_request=lora_request,
+                                    add_special_tokens=add_special_tokens))
         except ActorDiedError as e:
             # If the actor is dead, we first try to reinitialize it.
             logger.warning("%s died with ActorDiedError, reinitializing.",
@@ -143,7 +145,8 @@ def encode(self,
                 ret = ray.get(
                     actor.encode.remote(request_id=request_id,
                                         prompt=prompt,
-                                        lora_request=lora_request))
+                                        lora_request=lora_request,
+                                        add_special_tokens=add_special_tokens))
             except ActorDiedError as e:
                 logger.error(
                     "%s died for second time in a row, marking "
@@ -160,7 +163,8 @@ async def encode_async(
             self,
             prompt: str,
             request_id: Optional[str] = None,
-            lora_request: Optional[LoRARequest] = None) -> List[int]:
+            lora_request: Optional[LoRARequest] = None,
+            add_special_tokens: Optional[bool] = None) -> List[int]:
         """Encode a prompt using the tokenizer group.
 
         We pick an idle actor and use it to encode the prompt.
@@ -177,9 +181,11 @@ async def encode_async(
         actor_is_alive = True
         original_actor = actor
         try:
-            ret = await actor.encode.remote(request_id=request_id,
-                                            prompt=prompt,
-                                            lora_request=lora_request)
+            ret = await actor.encode.remote(
+                request_id=request_id,
+                prompt=prompt,
+                lora_request=lora_request,
+                add_special_tokens=add_special_tokens)
         except ActorDiedError as e:
             # If the actor is dead, we first try to reinitialize it.
             logger.warning("%s died with ActorDiedError, reinitializing.",
@@ -187,9 +193,11 @@ async def encode_async(
                            exc_info=e)
             actor = self._init_actor()
             try:
-                ret = await actor.encode.remote(request_id=request_id,
-                                                prompt=prompt,
-                                                lora_request=lora_request)
+                ret = await actor.encode.remote(
+                    request_id=request_id,
+                    prompt=prompt,
+                    lora_request=lora_request,
+                    add_special_tokens=add_special_tokens)
             except ActorDiedError as e:
                 logger.error(
                     "%s died for second time in a row, marking "
diff --git a/vllm/transformers_utils/tokenizer_group/tokenizer_group.py b/vllm/transformers_utils/tokenizer_group/tokenizer_group.py
index 95a8f7098bbac..6dc2f90561873 100644
--- a/vllm/transformers_utils/tokenizer_group/tokenizer_group.py
+++ b/vllm/transformers_utils/tokenizer_group/tokenizer_group.py
@@ -2,7 +2,7 @@
 
 from vllm.config import TokenizerPoolConfig
 from vllm.lora.request import LoRARequest
-from vllm.transformers_utils.tokenizer import (AnyTokenizer,
+from vllm.transformers_utils.tokenizer import (AnyTokenizer, encode_tokens,
                                                get_lora_tokenizer,
                                                get_lora_tokenizer_async,
                                                get_tokenizer)
@@ -55,9 +55,12 @@ def _raise_if_input_too_long(self,
     def encode(self,
                prompt: str,
                request_id: Optional[str] = None,
-               lora_request: Optional[LoRARequest] = None) -> List[int]:
+               lora_request: Optional[LoRARequest] = None,
+               add_special_tokens: Optional[bool] = None) -> List[int]:
         tokenizer = self.get_lora_tokenizer(lora_request)
-        ret = tokenizer.encode(prompt)
+        ret = encode_tokens(tokenizer,
+                            prompt,
+                            add_special_tokens=add_special_tokens)
         self._raise_if_input_too_long(ret, lora_request)
         return ret
 
@@ -65,9 +68,12 @@ async def encode_async(
             self,
             prompt: str,
             request_id: Optional[str] = None,
-            lora_request: Optional[LoRARequest] = None) -> List[int]:
+            lora_request: Optional[LoRARequest] = None,
+            add_special_tokens: Optional[bool] = None) -> List[int]:
         tokenizer = await self.get_lora_tokenizer_async(lora_request)
-        ret = tokenizer.encode(prompt)
+        ret = encode_tokens(tokenizer,
+                            prompt,
+                            add_special_tokens=add_special_tokens)
         self._raise_if_input_too_long(ret, lora_request)
         return ret
 
diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py
index bff01320d7927..4d5d918087be8 100644
--- a/vllm/worker/enc_dec_model_runner.py
+++ b/vllm/worker/enc_dec_model_runner.py
@@ -287,12 +287,11 @@ def profile_run(self) -> None:
                                           seq_len,
                                           self.mm_registry,
                                           is_encoder_data=False)
-            encoder_dummy_data \
-                = self.input_registry.dummy_data_for_profiling(
-                    self.model_config,
-                                         seq_len,
-                                         self.mm_registry,
-                                         is_encoder_data=True)
+            encoder_dummy_data = self.input_registry \
+                .dummy_data_for_profiling(self.model_config,
+                                          seq_len,
+                                          self.mm_registry,
+                                          is_encoder_data=True)
 
             # Having more tokens is over-conservative but otherwise fine
             assert len(

From 80c751e7f68ade3d4c6391a0f3fce9ce970ddad0 Mon Sep 17 00:00:00 2001
From: Robert Shaw
 <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
Date: Fri, 3 Jan 2025 12:25:38 -0500
Subject: [PATCH 061/462] [V1] Simplify Shutdown (#11659)

---
 tests/v1/engine/test_engine_core_client.py |  6 ---
 vllm/entrypoints/llm.py                    |  5 ---
 vllm/v1/engine/async_llm.py                |  3 --
 vllm/v1/engine/core.py                     |  1 -
 vllm/v1/engine/core_client.py              | 34 ++++++++--------
 vllm/v1/engine/llm_engine.py               |  7 ----
 vllm/v1/utils.py                           | 46 +++++++++++-----------
 7 files changed, 42 insertions(+), 60 deletions(-)

diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py
index 729975e4ea8c4..20d4e6f63b339 100644
--- a/tests/v1/engine/test_engine_core_client.py
+++ b/tests/v1/engine/test_engine_core_client.py
@@ -142,9 +142,6 @@ def test_engine_core_client(monkeypatch, multiprocessing_mode: bool):
 
         client.abort_requests([request.request_id])
 
-        # Shutdown the client.
-        client.shutdown()
-
 
 @pytest.mark.asyncio
 async def test_engine_core_client_asyncio(monkeypatch):
@@ -200,6 +197,3 @@ async def test_engine_core_client_asyncio(monkeypatch):
             else:
                 assert len(outputs[req_id]) == MAX_TOKENS, (
                     f"{len(outputs[req_id])=}, {MAX_TOKENS=}")
-
-        # Shutdown the client.
-        client.shutdown()
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index fadf297e9f6aa..7c0de3b3e5481 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -232,11 +232,6 @@ def __init__(
 
         self.request_counter = Counter()
 
-    def __del__(self):
-        if hasattr(self, 'llm_engine') and self.llm_engine and hasattr(
-                self.llm_engine, "shutdown"):
-            self.llm_engine.shutdown()
-
     @staticmethod
     def get_engine_class() -> Type[LLMEngine]:
         if envs.VLLM_USE_V1:
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index 3f097ca7f439c..ff7a0c28dd91a 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -103,9 +103,6 @@ def sigquit_handler(signum, frame):
 
         self.output_handler: Optional[asyncio.Task] = None
 
-    def __del__(self):
-        self.shutdown()
-
     @classmethod
     def from_engine_args(
         cls,
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index 5840541d774ba..13a50a4f855e2 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -203,7 +203,6 @@ def signal_handler(signum, frame):
         finally:
             if engine_core is not None:
                 engine_core.shutdown()
-                engine_core = None
 
     def run_busy_loop(self):
         """Core busy loop of the EngineCore."""
diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py
index 3293205e110af..e009f3448bf69 100644
--- a/vllm/v1/engine/core_client.py
+++ b/vllm/v1/engine/core_client.py
@@ -1,4 +1,6 @@
-from typing import List, Optional, Type
+import weakref
+from abc import ABC, abstractmethod
+from typing import List, Type
 
 import msgspec
 import zmq
@@ -18,7 +20,7 @@
 logger = init_logger(__name__)
 
 
-class EngineCoreClient:
+class EngineCoreClient(ABC):
     """
     EngineCoreClient: subclasses handle different methods for pushing 
         and pulling from the EngineCore for asyncio / multiprocessing.
@@ -52,8 +54,9 @@ def make_client(
 
         return InprocClient(vllm_config, executor_class, log_stats)
 
+    @abstractmethod
     def shutdown(self):
-        pass
+        ...
 
     def get_output(self) -> List[EngineCoreOutput]:
         raise NotImplementedError
@@ -107,9 +110,6 @@ def abort_requests(self, request_ids: List[str]) -> None:
     def shutdown(self):
         self.engine_core.shutdown()
 
-    def __del__(self):
-        self.shutdown()
-
     def profile(self, is_start: bool = True) -> None:
         self.engine_core.profile(is_start)
 
@@ -139,10 +139,14 @@ def __init__(
         self.decoder = msgspec.msgpack.Decoder(EngineCoreOutputs)
 
         # ZMQ setup.
-        if asyncio_mode:
-            self.ctx = zmq.asyncio.Context()
-        else:
-            self.ctx = zmq.Context()  # type: ignore[attr-defined]
+        self.ctx = (
+            zmq.asyncio.Context()  # type: ignore[attr-defined]
+            if asyncio_mode else zmq.Context())  # type: ignore[attr-defined]
+
+        # Note(rob): shutdown function cannot be a bound method,
+        # else the gc cannot collect the object.
+        self._finalizer = weakref.finalize(self, lambda x: x.destroy(linger=0),
+                                           self.ctx)
 
         # Paths and sockets for IPC.
         output_path = get_open_zmq_ipc_path()
@@ -153,7 +157,6 @@ def __init__(
                                             zmq.constants.PUSH)
 
         # Start EngineCore in background process.
-        self.proc_handle: Optional[BackgroundProcHandle]
         self.proc_handle = BackgroundProcHandle(
             input_path=input_path,
             output_path=output_path,
@@ -166,12 +169,11 @@ def __init__(
             })
 
     def shutdown(self):
-        # Shut down the zmq context.
-        self.ctx.destroy(linger=0)
-
-        if hasattr(self, "proc_handle") and self.proc_handle:
+        """Clean up background resources."""
+        if hasattr(self, "proc_handle"):
             self.proc_handle.shutdown()
-            self.proc_handle = None
+
+        self._finalizer()
 
 
 class SyncMPClient(MPClient):
diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py
index a19109559eabf..1f49de67d7493 100644
--- a/vllm/v1/engine/llm_engine.py
+++ b/vllm/v1/engine/llm_engine.py
@@ -205,10 +205,3 @@ def get_tokenizer_group(
                             f"found type: {type(tokenizer_group)}")
 
         return tokenizer_group
-
-    def __del__(self):
-        self.shutdown()
-
-    def shutdown(self):
-        if engine_core := getattr(self, "engine_core", None):
-            engine_core.shutdown()
diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py
index 19e0dd17237c9..b0a7affbebb7e 100644
--- a/vllm/v1/utils.py
+++ b/vllm/v1/utils.py
@@ -1,3 +1,4 @@
+import multiprocessing
 import os
 import weakref
 from collections.abc import Sequence
@@ -91,8 +92,6 @@ def __init__(
         target_fn: Callable,
         process_kwargs: Dict[Any, Any],
     ):
-        self._finalizer = weakref.finalize(self, self.shutdown)
-
         context = get_mp_context()
         reader, writer = context.Pipe(duplex=False)
 
@@ -102,11 +101,11 @@ def __init__(
         process_kwargs["ready_pipe"] = writer
         process_kwargs["input_path"] = input_path
         process_kwargs["output_path"] = output_path
-        self.input_path = input_path
-        self.output_path = output_path
 
-        # Run Detokenizer busy loop in background process.
+        # Run busy loop in background process.
         self.proc = context.Process(target=target_fn, kwargs=process_kwargs)
+        self._finalizer = weakref.finalize(self, shutdown, self.proc,
+                                           input_path, output_path)
         self.proc.start()
 
         # Wait for startup.
@@ -114,21 +113,24 @@ def __init__(
             raise RuntimeError(f"{process_name} initialization failed. "
                                "See root cause above.")
 
-    def __del__(self):
-        self.shutdown()
-
     def shutdown(self):
-        # Shutdown the process if needed.
-        if hasattr(self, "proc") and self.proc.is_alive():
-            self.proc.terminate()
-            self.proc.join(5)
-
-            if self.proc.is_alive():
-                kill_process_tree(self.proc.pid)
-
-        # Remove zmq ipc socket files
-        ipc_sockets = [self.output_path, self.input_path]
-        for ipc_socket in ipc_sockets:
-            socket_file = ipc_socket.replace("ipc://", "")
-            if os and os.path.exists(socket_file):
-                os.remove(socket_file)
+        self._finalizer()
+
+
+# Note(rob): shutdown function cannot be a bound method,
+# else the gc cannot collect the object.
+def shutdown(proc: multiprocessing.Process, input_path: str, output_path: str):
+    # Shutdown the process.
+    if proc.is_alive():
+        proc.terminate()
+        proc.join(5)
+
+        if proc.is_alive():
+            kill_process_tree(proc.pid)
+
+    # Remove zmq ipc socket files.
+    ipc_sockets = [output_path, input_path]
+    for ipc_socket in ipc_sockets:
+        socket_file = ipc_socket.replace("ipc://", "")
+        if os and os.path.exists(socket_file):
+            os.remove(socket_file)

From 61fed92c7e646d6f2ec5d9de54568a860870e6a4 Mon Sep 17 00:00:00 2001
From: ZincCat <52513999+zinccat@users.noreply.github.com>
Date: Fri, 3 Jan 2025 13:02:34 -0800
Subject: [PATCH 062/462] [Bugfix] Fix ColumnParallelLinearWithLoRA slice
 (#11708)

Signed-off-by: ZincCat <zincchloride@outlook.com>
---
 vllm/lora/layers.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py
index 85164c2165a3c..102e40d3f448d 100644
--- a/vllm/lora/layers.py
+++ b/vllm/lora/layers.py
@@ -479,7 +479,7 @@ def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor:
         # ColumnParallelLinear.
         else:
             tensor_model_parallel_rank = get_tensor_model_parallel_rank()
-            shard_size = self.output_dim
+            shard_size = self.output_size
             start_idx = tensor_model_parallel_rank * shard_size
             end_idx = (tensor_model_parallel_rank + 1) * shard_size
             lora_b = lora_b[:, start_idx:end_idx]
@@ -490,7 +490,7 @@ def slice_bias(self, bias: torch.Tensor) -> torch.Tensor:
         if bias is None:
             return bias
         tensor_model_parallel_rank = get_tensor_model_parallel_rank()
-        shard_size = self.output_dim
+        shard_size = self.output_size
         start_idx = tensor_model_parallel_rank * shard_size
         end_idx = (tensor_model_parallel_rank + 1) * shard_size
         bias = bias[start_idx:end_idx]

From 1543914c04697fb252e4468b7c9d14be512b050a Mon Sep 17 00:00:00 2001
From: Robert Shaw
 <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
Date: Fri, 3 Jan 2025 16:29:11 -0500
Subject: [PATCH 063/462] [V1] Improve TP>1 Error Handling + Stack Trace
 (#11721)

Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
---
 vllm/v1/engine/async_llm.py            | 16 ----------------
 vllm/v1/engine/core.py                 |  2 +-
 vllm/v1/engine/core_client.py          | 19 ++++++++++++++++++-
 vllm/v1/executor/multiproc_executor.py | 24 +++++++++++++++++++++---
 4 files changed, 40 insertions(+), 21 deletions(-)

diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index ff7a0c28dd91a..564d8a8343bef 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -1,6 +1,5 @@
 import asyncio
 import os
-import signal
 from typing import AsyncGenerator, Dict, List, Mapping, Optional, Type, Union
 
 from vllm.config import ModelConfig, VllmConfig
@@ -42,21 +41,6 @@ def __init__(
         start_engine_loop: bool = True,
     ) -> None:
 
-        # The child processes will send SIGQUIT when unrecoverable
-        # errors happen. We kill the process tree here so that the
-        # stack trace is very evident.
-        # TODO: rather than killing the main process, we should
-        # figure out how to raise an AsyncEngineDeadError and
-        # handle at the API server level so we can return a better
-        # error code to the clients calling VLLM.
-        def sigquit_handler(signum, frame):
-            logger.fatal(
-                "AsyncLLM got SIGQUIT from worker processes, shutting "
-                "down. See stack trace above for root cause issue.")
-            kill_process_tree(os.getpid())
-
-        signal.signal(signal.SIGQUIT, sigquit_handler)
-
         assert start_engine_loop
 
         self.log_requests = log_requests
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index 13a50a4f855e2..975ce11fe8aff 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -198,7 +198,7 @@ def signal_handler(signum, frame):
         except Exception:
             traceback = get_exception_traceback()
             logger.error("EngineCore hit an exception: %s", traceback)
-            parent_process.send_signal(signal.SIGQUIT)
+            parent_process.send_signal(signal.SIGUSR1)
 
         finally:
             if engine_core is not None:
diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py
index e009f3448bf69..6a40c961fc1d7 100644
--- a/vllm/v1/engine/core_client.py
+++ b/vllm/v1/engine/core_client.py
@@ -1,3 +1,5 @@
+import os
+import signal
 import weakref
 from abc import ABC, abstractmethod
 from typing import List, Type
@@ -8,7 +10,8 @@
 
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
-from vllm.utils import get_open_zmq_ipc_path, make_zmq_socket
+from vllm.utils import (get_open_zmq_ipc_path, kill_process_tree,
+                        make_zmq_socket)
 from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs,
                             EngineCoreProfile, EngineCoreRequest,
                             EngineCoreRequestType, EngineCoreRequestUnion)
@@ -134,6 +137,20 @@ def __init__(
         executor_class: Type[Executor],
         log_stats: bool = False,
     ):
+        # The child processes will send SIGUSR1 when unrecoverable
+        # errors happen. We kill the process tree here so that the
+        # stack trace is very evident.
+        # TODO(rob): rather than killing the main process, we should
+        # figure out how to raise an AsyncEngineDeadError and
+        # handle at the API server level so we can return a better
+        # error code to the clients calling VLLM.
+        def sigusr1_handler(signum, frame):
+            logger.fatal("Got fatal signal from worker processes, shutting "
+                         "down. See stack trace above for root cause issue.")
+            kill_process_tree(os.getpid())
+
+        signal.signal(signal.SIGUSR1, sigusr1_handler)
+
         # Serialization setup.
         self.encoder = PickleEncoder()
         self.decoder = msgspec.msgpack.Decoder(EngineCoreOutputs)
diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py
index ed64e7741390d..114deae980d01 100644
--- a/vllm/v1/executor/multiproc_executor.py
+++ b/vllm/v1/executor/multiproc_executor.py
@@ -9,6 +9,7 @@
 from multiprocessing.process import BaseProcess
 from typing import Any, Dict, List, Optional, Tuple
 
+import psutil
 import zmq
 
 from vllm.config import VllmConfig
@@ -38,6 +39,19 @@ def __init__(self, vllm_config: VllmConfig) -> None:
         # and ensure workers will be terminated.
         self._finalizer = weakref.finalize(self, self.shutdown)
 
+        # The child processes will send SIGUSR1 when unrecoverable
+        # errors happen.
+        def sigusr1_handler(signum, frame):
+            logger.fatal(
+                "MulitprocExecutor got fatal signal from worker processes, "
+                "shutting down. See stack trace above for root cause issue.")
+            # Propagate error up to parent process.
+            parent_process = psutil.Process().parent()
+            parent_process.send_signal(signal.SIGUSR1)
+            self.shutdown()
+
+        signal.signal(signal.SIGUSR1, sigusr1_handler)
+
         self.vllm_config = vllm_config
         self.parallel_config = vllm_config.parallel_config
 
@@ -335,8 +349,11 @@ def signal_handler(signum, frame):
         except SystemExit:
             logger.debug("Worker interrupted.")
 
-        except BaseException as e:
-            logger.exception(e)
+        except Exception:
+            # worker_busy_loop sends exceptions exceptons to Executor
+            # for shutdown, but if there is an error in startup or an
+            # error with IPC itself, we need to alert the parent.
+            psutil.Process().parent().send_signal(signal.SIGUSR1)
             raise
 
         finally:
@@ -377,9 +394,10 @@ def worker_busy_loop(self):
 
             try:
                 output = getattr(self.worker, method)(*args, **kwargs)
-            except BaseException as e:
+            except Exception as e:
                 self.worker_response_mq.enqueue(
                     (WorkerProc.ResponseStatus.FAILURE, e))
+                logger.exception("WorkerProc hit an exception: %s", exc_info=e)
                 continue
 
             self.worker_response_mq.enqueue(

From a655eb30252fe266ce16fde2aa9f8f9554ccd46e Mon Sep 17 00:00:00 2001
From: Jee Jee Li <pandaleefree@gmail.com>
Date: Sat, 4 Jan 2025 06:19:02 +0800
Subject: [PATCH 064/462] [Misc]Add BNB quantization for Qwen2VL (#11719)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
---
 vllm/model_executor/models/qwen2_vl.py | 69 +++++++++++++++-----------
 1 file changed, 40 insertions(+), 29 deletions(-)

diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index 26b6d768ad4f6..5a8c6e4deb7ac 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -38,7 +38,7 @@
 
 from vllm.attention import AttentionMetadata
 from vllm.config import VllmConfig
-from vllm.distributed import parallel_state
+from vllm.distributed import parallel_state, tensor_model_parallel_all_gather
 from vllm.distributed import utils as dist_utils
 from vllm.logger import init_logger
 from vllm.model_executor import SamplingMetadata
@@ -239,6 +239,8 @@ def __init__(
         super().__init__()
         # Per attention head and per partition values.
         world_size = parallel_state.get_tensor_model_parallel_world_size()
+        self.tp_size = world_size
+        self.tp_rank = parallel_state.get_tensor_model_parallel_rank()
         self.hidden_size_per_attention_head = dist_utils.divide(
             projection_size, num_heads)
         self.num_attention_heads_per_partition = dist_utils.divide(
@@ -261,24 +263,41 @@ def __init__(
             raise RuntimeError(
                 f"Qwen2-VL does not support {self.attn_backend} backend now.")
 
+    def split_qkv(self, qkv: torch.Tensor) -> tuple[torch.Tensor, ...]:
+        # [s, b, 3 * head * head_dim]
+        seq_len, bs, _ = qkv.shape
+        if self.tp_size > 1:
+            qkv = tensor_model_parallel_all_gather(qkv)
+
+        # [s, b, 3 * head * head_dim] -> 3 * [s, b, head * head_dim]
+        q, k, v = qkv.chunk(3, dim=2)
+
+        # 3 * [s, b, head * head_dim]
+        if self.tp_size > 1:
+            splitter = partial(dist_utils.split_tensor_along_last_dim,
+                               num_partitions=self.tp_size)
+            q = splitter(q)[self.tp_rank]
+            k = splitter(k)[self.tp_rank]
+            v = splitter(v)[self.tp_rank]
+
+        # 3 * [s, b, head * head_dim] -> 3 * [s, b, head, head_dim]
+        new_shape = (seq_len, bs, self.num_attention_heads_per_partition,
+                     self.hidden_size_per_attention_head)
+        q, k, v = (x.view(*new_shape) for x in (q, k, v))
+        return q, k, v
+
     def forward(
         self,
         x: torch.Tensor,
         cu_seqlens: torch.Tensor,
         rotary_pos_emb: torch.Tensor,
     ) -> torch.Tensor:
-        # [s, b, c] --> [s, b, head * 3 * head_dim]
-        x, _ = self.qkv(x)
 
-        # [s, b, head * 3 * head_dim] --> [s, b, head, 3 * head_dim]
-        new_x_shape = x.size()[:-1] + (
-            self.num_attention_heads_per_partition,
-            3 * self.hidden_size_per_attention_head,
-        )
-        x = x.view(*new_x_shape)
+        # [s, b, c] --> [s, b, 3 * head * head_dim]
+        x, _ = self.qkv(x)
 
-        # [s, b, head, 3 * head_dim] --> 3 [s, b, head, head_dim]
-        q, k, v = dist_utils.split_tensor_along_last_dim(x, 3)
+        # [s, b, 3 * head * head_dim] -> 3 * [s, b, head, head_dim]
+        q, k, v = self.split_qkv(x)
         batch_size = q.shape[1]
 
         q, k, v = (rearrange(x, "s b ... -> b s ...").contiguous()
@@ -614,24 +633,6 @@ def load_weights(self, weights: Iterable[Tuple[str,
                 weight_loader(param, loaded_weight, shard_id)
                 break
             else:
-                if name.endswith("qkv.weight"):
-                    visual_num_heads = self.num_heads
-                    visual_embed_dim = self.embed_dim
-                    head_size = visual_embed_dim // visual_num_heads
-                    loaded_weight = loaded_weight.view(3, visual_num_heads,
-                                                       head_size,
-                                                       visual_embed_dim)
-                    loaded_weight = loaded_weight.transpose(0, 1)
-                    loaded_weight = loaded_weight.reshape(-1, visual_embed_dim)
-                elif name.endswith("qkv.bias"):
-                    visual_num_heads = self.num_heads
-                    visual_embed_dim = self.embed_dim
-                    head_size = visual_embed_dim // visual_num_heads
-                    loaded_weight = loaded_weight.view(3, visual_num_heads,
-                                                       head_size)
-                    loaded_weight = loaded_weight.transpose(0, 1)
-                    loaded_weight = loaded_weight.reshape(-1)
-
                 param = params_dict[name]
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
@@ -935,6 +936,16 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
     embedding_modules = {}
     embedding_padding_modules = []
 
+    # BitandBytes specific attributes
+    bitsandbytes_stacked_params_mapping = {
+        # shard_name, weight_name, index
+        "q_proj": ("qkv_proj", 0),
+        "k_proj": ("qkv_proj", 1),
+        "v_proj": ("qkv_proj", 2),
+        "gate_proj": ("gate_up_proj", 0),
+        "up_proj": ("gate_up_proj", 1),
+    }
+
     # To ensure correct weight loading and mapping.
     hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={
         "lm_head.": "language_model.lm_head.",

From bf0d97d78619b290ed273199ad3800b57b638603 Mon Sep 17 00:00:00 2001
From: Michael Goin <michael@neuralmagic.com>
Date: Fri, 3 Jan 2025 17:36:46 -0500
Subject: [PATCH 065/462] Update requirements-tpu.txt to support python 3.9 and
 3.11 (#11695)

Signed-off-by: mgoin <michael@neuralmagic.com>
---
 requirements-tpu.txt            | 4 +++-
 vllm/worker/tpu_model_runner.py | 6 ++++--
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/requirements-tpu.txt b/requirements-tpu.txt
index b8f0b15469e77..8ab18b3770ae8 100644
--- a/requirements-tpu.txt
+++ b/requirements-tpu.txt
@@ -18,6 +18,8 @@ ray[default]
 --find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
 torch==2.6.0.dev20241126+cpu
 torchvision==0.20.0.dev20241126+cpu
-torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev20241126-cp310-cp310-linux_x86_64.whl
+torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev20241126-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
+torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev20241126-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
+torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev20241126-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
 jaxlib==0.4.36.dev20241122
 jax==0.4.36.dev20241122
diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py
index 9a054eb8a4cf7..7bdb7f0e2d6a9 100644
--- a/vllm/worker/tpu_model_runner.py
+++ b/vllm/worker/tpu_model_runner.py
@@ -126,8 +126,10 @@ def __init__(
             logger.warning(
                 "The max_model_len (%d) is too large. This may degrade the "
                 "performance due to the insufficient smem size. Consider "
-                "setting --max-model-len to a smaller value.",
-                self.model_config.max_model_len)
+                "setting --max-model-len to a smaller value, like %d.",
+                self.model_config.max_model_len,
+                self.model_config.max_model_len /
+                (block_table_size / smem_size))
 
     def load_model(self) -> None:
         self.device = self.device_config.device

From ad0d567e1cdc77aff435b20bac918bfd0f55db0a Mon Sep 17 00:00:00 2001
From: Robert Shaw
 <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
Date: Fri, 3 Jan 2025 18:25:02 -0500
Subject: [PATCH 066/462] [V1] Chore: cruft removal (#11724)

---
 vllm/entrypoints/llm.py       | 2 --
 vllm/v1/engine/core_client.py | 2 --
 vllm/v1/engine/llm_engine.py  | 4 ----
 vllm/v1/engine/processor.py   | 3 ---
 4 files changed, 11 deletions(-)

diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 7c0de3b3e5481..e48fd1a4fa5e9 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -225,8 +225,6 @@ def __init__(
         # Logic to switch between engines is done at runtime instead of import
         # to avoid import order issues
         self.engine_class = self.get_engine_class()
-
-        # TODO(rob): enable mp by default (issue with fork vs spawn)
         self.llm_engine = self.engine_class.from_engine_args(
             engine_args, usage_context=UsageContext.LLM_CLASS)
 
diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py
index 6a40c961fc1d7..a4a45ae05ff9e 100644
--- a/vllm/v1/engine/core_client.py
+++ b/vllm/v1/engine/core_client.py
@@ -94,8 +94,6 @@ class InprocClient(EngineCoreClient):
 
         * pushes EngineCoreRequest directly into the EngineCore
         * pulls EngineCoreOutputs by stepping the EngineCore
-
-        TODO: support asyncio-mode for debugging.
     """
 
     def __init__(self, *args, **kwargs):
diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py
index 1f49de67d7493..0bd9b52c9be82 100644
--- a/vllm/v1/engine/llm_engine.py
+++ b/vllm/v1/engine/llm_engine.py
@@ -42,8 +42,6 @@ def __init__(
         use_cached_outputs: bool = False,
         multiprocess_mode: bool = False,
     ) -> None:
-
-        # TODO: Can we avoid this?
         self.model_config = vllm_config.model_config
 
         # Tokenizer (+ ensure liveness if running in another process).
@@ -179,8 +177,6 @@ def step(self) -> List[RequestOutput]:
 
         return request_outputs
 
-    # TODO(rob): Can we get rid of these?
-
     def get_model_config(self):
         return self.model_config
 
diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py
index 905d3d1fc3e1c..c0f6cfab4865c 100644
--- a/vllm/v1/engine/processor.py
+++ b/vllm/v1/engine/processor.py
@@ -49,9 +49,6 @@ def __init__(
             cache_config.enable_prefix_caching
         self.mm_hasher = MMHasher()
 
-    # TODO: run in an ThreadpoolExecutor or BackgroundProcess.
-    # This ideally should releases the GIL, so we should not block the
-    # asyncio loop while this is running.
     def process_inputs(
         self,
         request_id: str,

From e5d7ed0c5374d38e75a8ef0243cc348f0f6f9185 Mon Sep 17 00:00:00 2001
From: WangErXiao <863579016@qq.com>
Date: Sat, 4 Jan 2025 08:13:12 +0800
Subject: [PATCH 067/462] [V1] log GPU blocks num for MultiprocExecutor
 (#11656)

---
 vllm/v1/executor/multiproc_executor.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py
index 114deae980d01..41e6abbd67956 100644
--- a/vllm/v1/executor/multiproc_executor.py
+++ b/vllm/v1/executor/multiproc_executor.py
@@ -95,6 +95,7 @@ def initialize(self, num_gpu_blocks: int) -> None:
         Initialize the KV caches and begin the model execution loop of the
         underlying workers.
         """
+        logger.info("# GPU blocks: %d", num_gpu_blocks)
         self.collective_rpc("initialize_cache", args=(num_gpu_blocks, ))
         self.collective_rpc("compile_or_warm_up_model")
 

From 9c93636d84414591ae4d7b9c1174af7e91052fd8 Mon Sep 17 00:00:00 2001
From: Hust_YangXian <bryceyx@gmail.com>
Date: Sat, 4 Jan 2025 14:16:30 +0800
Subject: [PATCH 068/462] Update tool_calling.md (#11701)

---
 docs/source/usage/tool_calling.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/usage/tool_calling.md b/docs/source/usage/tool_calling.md
index 34b26647a959f..062f2021eb62a 100644
--- a/docs/source/usage/tool_calling.md
+++ b/docs/source/usage/tool_calling.md
@@ -10,7 +10,7 @@ Start the server with tool calling enabled. This example uses Meta's Llama 3.1 8
 vllm serve meta-llama/Llama-3.1-8B-Instruct \
     --enable-auto-tool-choice \
     --tool-call-parser llama3_json \
-    --chat-template examples/tool_chat_template_llama3_json.jinja
+    --chat-template examples/tool_chat_template_llama3.1_json.jinja
 ```
 
 Next, make a request to the model that should result in it using the available tools:

From d1d49397e7f8d1ac472d763dae395b67fdda1ef8 Mon Sep 17 00:00:00 2001
From: Alberto Ferrer <albertof@barrahome.org>
Date: Sat, 4 Jan 2025 00:29:02 -0600
Subject: [PATCH 069/462] Update bnb.md with example for OpenAI (#11718)

---
 docs/source/quantization/bnb.md | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/docs/source/quantization/bnb.md b/docs/source/quantization/bnb.md
index 8240eca1c7e03..f7f41726f3725 100644
--- a/docs/source/quantization/bnb.md
+++ b/docs/source/quantization/bnb.md
@@ -37,3 +37,10 @@ model_id = "huggyllama/llama-7b"
 llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \
 quantization="bitsandbytes", load_format="bitsandbytes")
 ```
+## OpenAI Compatible Server
+
+Append the following to your 4bit model arguments:
+
+```
+--quantization bitsandbytes --load-format bitsandbytes
+```

From fbf25645542fdcfb3f1a27ba05486492e368925c Mon Sep 17 00:00:00 2001
From: Kunshang Ji <kunshang.ji@intel.com>
Date: Sat, 4 Jan 2025 14:41:31 +0800
Subject: [PATCH 070/462] [V1] Add `RayExecutor` support for `AsyncLLM` (api
 server) (#11712)

---
 vllm/v1/engine/async_llm.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index 564d8a8343bef..0696caf88385d 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -22,6 +22,7 @@
 from vllm.v1.engine.detokenizer import Detokenizer
 from vllm.v1.engine.processor import Processor
 from vllm.v1.executor.abstract import Executor
+from vllm.v1.executor.ray_utils import initialize_ray_cluster
 
 logger = init_logger(__name__)
 
@@ -131,7 +132,11 @@ def _get_executor_cls(cls, vllm_config: VllmConfig) -> Type[Executor]:
         executor_class: Type[Executor]
         distributed_executor_backend = (
             vllm_config.parallel_config.distributed_executor_backend)
-        if distributed_executor_backend == "mp":
+        if distributed_executor_backend == "ray":
+            initialize_ray_cluster(vllm_config.parallel_config)
+            from vllm.v1.executor.ray_executor import RayExecutor
+            executor_class = RayExecutor
+        elif distributed_executor_backend == "mp":
             from vllm.v1.executor.multiproc_executor import MultiprocExecutor
             executor_class = MultiprocExecutor
         else:

From d91457d529c2df5d66bdfd939b90b7c75a9729b8 Mon Sep 17 00:00:00 2001
From: xcnick <xcnick0412@gmail.com>
Date: Sat, 4 Jan 2025 14:49:46 +0800
Subject: [PATCH 071/462] [V1] Add kv cache utils tests. (#11513)

Signed-off-by: xcnick <xcnick0412@gmail.com>
---
 tests/v1/core/test_kv_cache_utils.py | 241 +++++++++++++++++++++++++++
 1 file changed, 241 insertions(+)
 create mode 100644 tests/v1/core/test_kv_cache_utils.py

diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py
new file mode 100644
index 0000000000000..faa3a91de151f
--- /dev/null
+++ b/tests/v1/core/test_kv_cache_utils.py
@@ -0,0 +1,241 @@
+import pytest
+
+from vllm.inputs import token_inputs
+from vllm.sampling_params import SamplingParams
+from vllm.v1.core.kv_cache_utils import (BlockHashType, FreeKVCacheBlockQueue,
+                                         KVCacheBlock,
+                                         generate_block_hash_extra_keys,
+                                         hash_block_tokens,
+                                         hash_request_tokens)
+from vllm.v1.request import Request
+
+
+def make_request(request_id,
+                 prompt_token_ids,
+                 mm_positions=None,
+                 mm_hashes=None):
+    return Request(
+        request_id=request_id,
+        inputs=token_inputs(
+            prompt_token_ids=prompt_token_ids,
+            multi_modal_placeholders={"image": mm_positions}
+            if mm_positions else None,
+            multi_modal_hashes=mm_hashes,
+        ),
+        sampling_params=SamplingParams(max_tokens=17),
+        eos_token_id=100,
+        arrival_time=0,
+        lora_request=None,
+    )
+
+
+def test_kv_cache_block():
+    # Test KVCacheBlock initialization
+    block = KVCacheBlock(block_id=0)
+    assert block.block_id == 0
+    assert block.ref_cnt == 0
+    assert block.block_hash is None
+
+    # Test reference count manipulation
+    block.incr_ref()
+    assert block.ref_cnt == 1
+    block.decr_ref()
+    assert block.ref_cnt == 0
+
+    # Test block hash setting and resetting
+    block_hash = BlockHashType(hash_value=123, token_ids=(1, 2, 3))
+    block.block_hash = block_hash
+    assert block.block_hash == block_hash
+
+    block.reset_hash()
+    assert block.block_hash is None
+
+
+def test_free_kv_cache_block_queue_initialization():
+    # Test with a single block
+    block = KVCacheBlock(block_id=0)
+    queue = FreeKVCacheBlockQueue([block])
+    assert queue.num_free_blocks == 1
+    assert queue.free_list_head == block
+    assert queue.free_list_tail == block
+
+
+def test_free_kv_cache_block_queue_operations():
+    # Create a list of KVCacheBlock objects
+    blocks = [KVCacheBlock(block_id=i) for i in range(5)]
+
+    # Create a FreeKVCacheBlockQueue with these blocks
+    queue = FreeKVCacheBlockQueue(blocks)
+
+    # Check initial state
+    assert queue.num_free_blocks == 5
+    assert queue.free_list_head == blocks[0]
+    assert queue.free_list_tail == blocks[4]
+
+    # Pop the first block
+    block1 = queue.popleft()
+    assert block1 == blocks[0]
+    assert queue.num_free_blocks == 4
+    assert queue.free_list_head == blocks[1]
+    assert queue.free_list_tail == blocks[4]
+
+    # Remove a block from the middle
+    block_to_remove = blocks[2]
+    queue.remove(block_to_remove)
+    assert queue.num_free_blocks == 3
+    assert blocks[1].next_free_block == blocks[3]
+    assert blocks[3].prev_free_block == blocks[1]
+
+    # Append a block back
+    queue.append(block_to_remove)
+    assert queue.num_free_blocks == 4
+    assert queue.free_list_tail == block_to_remove
+    assert block_to_remove.prev_free_block == blocks[4]
+    assert block_to_remove.next_free_block is None
+
+    # Pop blocks until empty
+    for _ in range(4):
+        queue.popleft()
+    assert queue.num_free_blocks == 0
+    assert queue.free_list_head is None
+    assert queue.free_list_tail is None
+
+    # Attempt to pop from an empty queue
+    with pytest.raises(ValueError) as e:
+        queue.popleft()
+    assert str(e.value) == "No free blocks available"
+
+
+def test_free_kv_cache_block_queue_get_all_free_blocks():
+    # Create a list of KVCacheBlock objects
+    blocks = [KVCacheBlock(block_id=i) for i in range(5)]
+
+    # Create a FreeKVCacheBlockQueue with these blocks
+    queue = FreeKVCacheBlockQueue(blocks)
+
+    # Check all blocks are correctly retrieved
+    assert queue.get_all_free_blocks() == blocks
+
+    # Pop a block and check again
+    queue.popleft()
+    assert queue.get_all_free_blocks() == blocks[1:]
+
+    # Remove a block and check again
+    block_to_remove = blocks[2]
+    queue.remove(block_to_remove)
+    assert queue.get_all_free_blocks() == blocks[1:2] + blocks[3:]
+
+    # Append a block back and check again
+    queue.append(block_to_remove)
+    assert queue.get_all_free_blocks() == \
+        blocks[1:2] + blocks[3:] + [block_to_remove]
+
+
+def test_generate_block_hash_extra_keys():
+    request = make_request(
+        request_id=0,
+        prompt_token_ids=[_ for _ in range(20)],
+        mm_positions=[{
+            "offset": 0,
+            "length": 5
+        }, {
+            "offset": 10,
+            "length": 5
+        }],
+        mm_hashes=["hash1", "hash2"],
+    )
+
+    # Test with no extra keys
+    extra_keys, next_mm_idx = generate_block_hash_extra_keys(request, 0, 5, 0)
+    assert extra_keys == (("hash1", 0), )
+    assert next_mm_idx == 1
+
+    # Test with partial overlap
+    extra_keys, next_mm_idx = generate_block_hash_extra_keys(request, 3, 8, 0)
+    assert extra_keys == (("hash1", 3), )
+    assert next_mm_idx == 1
+
+    # Test with no overlap
+    extra_keys, next_mm_idx = generate_block_hash_extra_keys(request, 6, 10, 0)
+    assert extra_keys == ()
+    assert next_mm_idx == 1
+
+    # Test with multiple extra keys
+    extra_keys, next_mm_idx = generate_block_hash_extra_keys(request, 0, 15, 0)
+    assert extra_keys == (("hash1", 0), ("hash2", 0))
+    assert next_mm_idx == 2
+
+
+def test_generate_block_hash_extra_keys_no_mm_inputs():
+    request = make_request(
+        request_id=0,
+        prompt_token_ids=[_ for _ in range(6)],
+        mm_positions=None,
+        mm_hashes=None,
+    )
+
+    extra_keys, next_mm_idx = generate_block_hash_extra_keys(request, 0, 5, 0)
+    assert extra_keys is None
+    assert next_mm_idx == 0
+
+
+def test_hash_block_tokens():
+    parent_block_hash = 123
+    curr_block_token_ids = (1, 2, 3)
+    extra_keys = ("key1", "key2")
+
+    block_hash = hash_block_tokens(parent_block_hash, curr_block_token_ids,
+                                   extra_keys)
+    assert isinstance(block_hash, BlockHashType)
+    assert block_hash.hash_value == hash(
+        (parent_block_hash, *curr_block_token_ids))
+    assert block_hash.token_ids == curr_block_token_ids
+    assert block_hash.extra_keys == extra_keys
+
+
+def test_hash_request_tokens():
+    request = make_request(
+        request_id=0,
+        prompt_token_ids=[_ for _ in range(6)],
+        mm_positions=[{
+            "offset": 0,
+            "length": 3
+        }, {
+            "offset": 3,
+            "length": 3
+        }],
+        mm_hashes=["hash1", "hash2"],
+    )
+
+    block_size = 3
+    block_hashes = hash_request_tokens(block_size, request)
+
+    assert len(block_hashes) == 2
+    assert isinstance(block_hashes[0], BlockHashType)
+    assert isinstance(block_hashes[1], BlockHashType)
+
+    # Check the first block
+    assert block_hashes[0].token_ids == (0, 1, 2)
+    assert block_hashes[0].extra_keys == (("hash1", 0), )
+
+    # Check the second block
+    assert block_hashes[1].token_ids == (3, 4, 5)
+    assert block_hashes[1].extra_keys == (("hash2", 0), )
+
+
+def test_hash_request_tokens_no_mm_inputs():
+    request = make_request(
+        request_id=0,
+        prompt_token_ids=[_ for _ in range(6)],
+        mm_positions=None,
+        mm_hashes=None,
+    )
+
+    block_size = 3
+    block_hashes = hash_request_tokens(block_size, request)
+
+    assert len(block_hashes) == 2
+    assert block_hashes[0].token_ids == (0, 1, 2)
+    assert block_hashes[0].extra_keys is None
+    assert block_hashes[1].token_ids == (3, 4, 5)
+    assert block_hashes[1].extra_keys is None

From 300acb83472512b14ec7ba8cdf45efe07e8c8f68 Mon Sep 17 00:00:00 2001
From: Yan Burman <yanburman@users.noreply.github.com>
Date: Sat, 4 Jan 2025 08:50:16 +0200
Subject: [PATCH 072/462] [Core][Bugfix] Use correct device to initialize GPU
 data during CUDA-graph-capture (#11233)

Signed-off-by: Yan Burman <yanburman@users.noreply.github.com>
Signed-off-by: Ido Asraff <idoa@atero.ai>
---
 tests/distributed/test_custom_all_reduce.py |  2 +-
 tests/distributed/test_pynccl.py            |  2 +-
 vllm/distributed/parallel_state.py          |  7 +++---
 vllm/v1/worker/gpu_model_runner.py          |  2 +-
 vllm/worker/model_runner.py                 | 25 +++++++++++++--------
 5 files changed, 23 insertions(+), 15 deletions(-)

diff --git a/tests/distributed/test_custom_all_reduce.py b/tests/distributed/test_custom_all_reduce.py
index 86ca1948ef94a..4072616fd30e2 100644
--- a/tests/distributed/test_custom_all_reduce.py
+++ b/tests/distributed/test_custom_all_reduce.py
@@ -50,7 +50,7 @@ def graph_allreduce(tp_size, pp_size, rank, distributed_init_port):
 
     for sz in test_sizes:
         for dtype in [torch.float32, torch.float16, torch.bfloat16]:
-            with graph_capture() as graph_capture_context:
+            with graph_capture(device=device) as graph_capture_context:
                 # use integers so result matches NCCL exactly
                 inp1 = torch.randint(1,
                                      16, (sz, ),
diff --git a/tests/distributed/test_pynccl.py b/tests/distributed/test_pynccl.py
index 3e9b0e10a11d8..36cfe42251384 100644
--- a/tests/distributed/test_pynccl.py
+++ b/tests/distributed/test_pynccl.py
@@ -107,7 +107,7 @@ def multiple_allreduce_with_vllm_worker_fn():
     device = torch.device(f"cuda:{torch.distributed.get_rank()}")
     ensure_model_parallel_initialized(2, 2)
     tensor = torch.ones(16, 1024, 1024, dtype=torch.float32, device=device)
-    with graph_capture():
+    with graph_capture(device=device):
         # two tp groups can communicate independently
         if torch.distributed.get_rank() in [0, 1]:
             tensor = tensor_model_parallel_all_reduce(tensor)
diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index e6768467f4c27..a0d4235460f3b 100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -920,7 +920,7 @@ def get_kv_transfer_group() -> kv_transfer.KVTransferAgent:
 
 
 @contextmanager
-def graph_capture():
+def graph_capture(device: torch.device):
     """
     `graph_capture` is a context manager which should surround the code that
     is capturing the CUDA graph. Its main purpose is to ensure that the
@@ -934,8 +934,9 @@ def graph_capture():
     in order to explicitly distinguish the kernels to capture
     from other kernels possibly launched on background in the default stream.
     """
-    with get_tp_group().graph_capture() as context, get_pp_group(
-    ).graph_capture(context):
+    context = GraphCaptureContext(torch.cuda.Stream(device=device))
+    with get_tp_group().graph_capture(context), get_pp_group().graph_capture(
+            context):
         yield context
 
 
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 75098b0330ac9..294c76cfb680e 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -836,7 +836,7 @@ def capture_model(self) -> None:
         # Trigger CUDA graph capture for specific shapes.
         # Capture the large shapes first so that the smaller shapes
         # can reuse the memory pool allocated for the large shapes.
-        with graph_capture():
+        with graph_capture(device=self.device):
             for num_tokens in reversed(self.cudagraph_batch_sizes):
                 for _ in range(self.vllm_config.compilation_config.
                                cudagraph_num_of_warmups):
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 637fba23611f4..1c6d1bbee78ee 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -1426,10 +1426,15 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None:
 
         # Prepare dummy inputs. These will be reused for all batch sizes.
         max_batch_size = self.max_batchsize_to_capture
-        input_tokens = torch.zeros(max_batch_size, dtype=torch.long).cuda()
-        input_positions = torch.zeros(max_batch_size, dtype=torch.long).cuda()
+        input_tokens = torch.zeros(max_batch_size,
+                                   dtype=torch.long,
+                                   device=self.device)
+        input_positions = torch.zeros(max_batch_size,
+                                      dtype=torch.long,
+                                      device=self.device)
         if self.model_config.uses_mrope:
-            input_positions = torch.tile(input_positions, (3, 1))
+            input_positions = torch.tile(input_positions,
+                                         (3, 1)).cuda(device=self.device)
         # Prepare dummy previous_hidden_states only if needed by the model.
         # This is used by draft models such as EAGLE.
         previous_hidden_states = None
@@ -1448,8 +1453,8 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None:
                 dtype=self.model_config.dtype,
                 device=self.device)
 
-        with self.attn_state.graph_capture(
-                max_batch_size), graph_capture() as graph_capture_context:
+        with self.attn_state.graph_capture(max_batch_size), graph_capture(
+                self.device) as graph_capture_context:
             # NOTE: Capturing the largest batch size first may help reduce the
             # memory usage of CUDA graph.
             for virtual_engine in range(
@@ -1549,10 +1554,12 @@ def _update_inputs_to_capture_for_enc_dec_model(self,
         """
         # During the decode phase encoder_input_ids and encoder_positions are
         # unset. Do the same thing for graph capture.
-        capture_inputs["encoder_input_ids"] = torch.tensor(
-            [], dtype=torch.long).cuda()
-        capture_inputs["encoder_positions"] = torch.tensor(
-            [], dtype=torch.long).cuda()
+        capture_inputs["encoder_input_ids"] = torch.tensor([],
+                                                           dtype=torch.long,
+                                                           device=self.device)
+        capture_inputs["encoder_positions"] = torch.tensor([],
+                                                           dtype=torch.long,
+                                                           device=self.device)
 
     @property
     def vocab_size(self) -> int:

From eed11ebee93e9d137ac74d8e6e97427354bd3797 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Sat, 4 Jan 2025 19:40:53 +0800
Subject: [PATCH 073/462] [VLM] Merged multi-modal processors for
 LLaVA-NeXT-Video and LLaVA-OneVision (#11717)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 .../__init__.py                               |   0
 .../test_idefics3.py                          |   0
 .../test_internvl.py                          |   0
 .../processing/test_llava_next.py             |  58 ++
 .../processing/test_llava_onevision.py        |  59 ++
 .../test_phi3v.py                             |  44 +-
 .../test_qwen.py                              |   0
 .../test_qwen2_vl.py                          |  39 +-
 .../vision_language/test_models.py            |   9 +-
 .../vision_language/test_qwen2_vl.py          | 127 -----
 tests/multimodal/test_processing.py           | 170 +++---
 vllm/model_executor/models/aria.py            |   5 +-
 vllm/model_executor/models/blip2.py           |   5 +-
 vllm/model_executor/models/chameleon.py       |   5 +-
 vllm/model_executor/models/clip.py            |  11 +-
 vllm/model_executor/models/fuyu.py            |   5 +-
 vllm/model_executor/models/llava.py           |  75 ++-
 vllm/model_executor/models/llava_next.py      |  15 +-
 .../model_executor/models/llava_next_video.py | 273 +++++----
 vllm/model_executor/models/llava_onevision.py | 531 ++++++++----------
 vllm/model_executor/models/phi3v.py           |  26 +-
 vllm/model_executor/models/pixtral.py         |  11 +-
 vllm/model_executor/models/qwen2_audio.py     |  15 +-
 vllm/model_executor/models/qwen2_vl.py        | 199 ++++---
 vllm/model_executor/models/siglip.py          |  11 +-
 vllm/model_executor/models/ultravox.py        |  11 +-
 vllm/model_executor/models/vision.py          |  37 +-
 vllm/multimodal/parse.py                      |  14 +
 vllm/multimodal/processing.py                 | 326 +++++++----
 vllm/multimodal/registry.py                   |   3 +-
 vllm/transformers_utils/tokenizer.py          |  13 +
 31 files changed, 1114 insertions(+), 983 deletions(-)
 rename tests/models/decoder_only/vision_language/{mm_processor_kwargs => processing}/__init__.py (100%)
 rename tests/models/decoder_only/vision_language/{mm_processor_kwargs => processing}/test_idefics3.py (100%)
 rename tests/models/decoder_only/vision_language/{mm_processor_kwargs => processing}/test_internvl.py (100%)
 create mode 100644 tests/models/decoder_only/vision_language/processing/test_llava_next.py
 create mode 100644 tests/models/decoder_only/vision_language/processing/test_llava_onevision.py
 rename tests/models/decoder_only/vision_language/{mm_processor_kwargs => processing}/test_phi3v.py (60%)
 rename tests/models/decoder_only/vision_language/{mm_processor_kwargs => processing}/test_qwen.py (100%)
 rename tests/models/decoder_only/vision_language/{mm_processor_kwargs => processing}/test_qwen2_vl.py (64%)

diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/__init__.py b/tests/models/decoder_only/vision_language/processing/__init__.py
similarity index 100%
rename from tests/models/decoder_only/vision_language/mm_processor_kwargs/__init__.py
rename to tests/models/decoder_only/vision_language/processing/__init__.py
diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_idefics3.py b/tests/models/decoder_only/vision_language/processing/test_idefics3.py
similarity index 100%
rename from tests/models/decoder_only/vision_language/mm_processor_kwargs/test_idefics3.py
rename to tests/models/decoder_only/vision_language/processing/test_idefics3.py
diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_internvl.py b/tests/models/decoder_only/vision_language/processing/test_internvl.py
similarity index 100%
rename from tests/models/decoder_only/vision_language/mm_processor_kwargs/test_internvl.py
rename to tests/models/decoder_only/vision_language/processing/test_internvl.py
diff --git a/tests/models/decoder_only/vision_language/processing/test_llava_next.py b/tests/models/decoder_only/vision_language/processing/test_llava_next.py
new file mode 100644
index 0000000000000..6772130c9b884
--- /dev/null
+++ b/tests/models/decoder_only/vision_language/processing/test_llava_next.py
@@ -0,0 +1,58 @@
+import pytest
+from PIL import Image
+from transformers import AutoTokenizer
+
+from vllm.inputs import InputProcessingContext
+
+from ....utils import build_model_context
+
+
+# Fixtures lazy import to avoid initializing CUDA during test collection
+@pytest.fixture()
+def processor_for_llava_next():
+    from vllm.model_executor.models.llava_next import (
+        LlavaNextMultiModalProcessor)
+    return LlavaNextMultiModalProcessor
+
+
+# FIXME: image_size [(198, 176), (176, 198)]
+@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
+@pytest.mark.parametrize("image_size", [(1669, 2560), (2560, 1669), (183, 488),
+                                        (488, 183)])
+@pytest.mark.parametrize("num_imgs", [1, 2])
+def test_processor_prompt_replacements(
+    processor_for_llava_next,
+    model_id: str,
+    image_size: tuple[int, int],
+    num_imgs: int,
+):
+    """
+    Ensure LlavaNextMultiModalProcessor handles prompt replacement properly.
+    """
+    ctx = build_model_context(
+        model_name=model_id,
+        tokenizer_name=model_id,
+        mm_processor_kwargs=None,
+        limit_mm_per_prompt={"image": num_imgs},
+    )
+    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
+    ctx = InputProcessingContext(ctx.model_config, tokenizer)
+
+    # Build the image str / prompt based on the number of images we pass
+    prompt = "<image>" * num_imgs
+    mm_data = {"image": [Image.new("RGB", size=image_size)] * num_imgs}
+
+    # The processor will throw an error if there is a mismatch
+    # in the prompt replacements
+    processor = processor_for_llava_next(ctx)
+    processed_inputs = processor.apply(prompt, mm_data, {})
+
+    image_placeholders = processed_inputs["mm_placeholders"]["image"]
+    assert len(image_placeholders) == num_imgs
+
+    first_placeholder = image_placeholders[0]
+
+    # NOTE: There is a BOS token
+    assert first_placeholder["offset"] == 1
+    assert first_placeholder["length"] == (
+        len(processed_inputs["prompt_token_ids"]) - 1) // num_imgs
diff --git a/tests/models/decoder_only/vision_language/processing/test_llava_onevision.py b/tests/models/decoder_only/vision_language/processing/test_llava_onevision.py
new file mode 100644
index 0000000000000..71adde6568a17
--- /dev/null
+++ b/tests/models/decoder_only/vision_language/processing/test_llava_onevision.py
@@ -0,0 +1,59 @@
+import pytest
+from PIL import Image
+from transformers import AutoTokenizer
+
+from vllm.inputs import InputProcessingContext
+
+from ....utils import build_model_context
+
+
+# Fixtures lazy import to avoid initializing CUDA during test collection
+@pytest.fixture()
+def processor_for_llava_onevision():
+    from vllm.model_executor.models.llava_onevision import (
+        LlavaOnevisionMultiModalProcessor)
+    return LlavaOnevisionMultiModalProcessor
+
+
+@pytest.mark.parametrize("model_id",
+                         ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
+@pytest.mark.parametrize("image_size", [(1669, 2560), (2560, 1669), (183, 488),
+                                        (488, 183), (198, 176), (176, 198)])
+@pytest.mark.parametrize("num_imgs", [1, 2])
+def test_processor_prompt_replacements(
+    processor_for_llava_onevision,
+    model_id: str,
+    image_size: tuple[int, int],
+    num_imgs: int,
+):
+    """
+    Ensure LlavaOnevisionMultiModalProcessor handles prompt replacement
+    properly.
+    """
+    ctx = build_model_context(
+        model_name=model_id,
+        tokenizer_name=model_id,
+        mm_processor_kwargs=None,
+        limit_mm_per_prompt={"image": num_imgs},
+    )
+    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
+    ctx = InputProcessingContext(ctx.model_config, tokenizer)
+
+    # Build the image str / prompt based on the number of images we pass
+    prompt = "<image>" * num_imgs
+    mm_data = {"image": [Image.new("RGB", size=image_size)] * num_imgs}
+
+    # The processor will throw an error if there is a mismatch
+    # in the prompt replacements
+    processor = processor_for_llava_onevision(ctx)
+    processed_inputs = processor.apply(prompt, mm_data, {})
+
+    image_placeholders = processed_inputs["mm_placeholders"]["image"]
+    assert len(image_placeholders) == num_imgs
+
+    first_placeholder = image_placeholders[0]
+
+    # NOTE: There is a BOS token
+    assert first_placeholder["offset"] == 0
+    assert first_placeholder["length"] == len(
+        processed_inputs["prompt_token_ids"]) // num_imgs
diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py b/tests/models/decoder_only/vision_language/processing/test_phi3v.py
similarity index 60%
rename from tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py
rename to tests/models/decoder_only/vision_language/processing/test_phi3v.py
index 3edf96d11106d..249045b3c04ce 100644
--- a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py
+++ b/tests/models/decoder_only/vision_language/processing/test_phi3v.py
@@ -1,6 +1,4 @@
 """Tests for phi3v's multimodal preprocessing kwargs."""
-from typing import Optional
-
 import pytest
 from transformers import AutoTokenizer
 
@@ -10,8 +8,6 @@
 from .....conftest import _ImageAssets
 from ....utils import build_model_context
 
-models = ["microsoft/Phi-3.5-vision-instruct"]
-
 
 # Wrap lazy imports to avoid initializing CUDA during test collection
 @pytest.fixture()
@@ -20,40 +16,40 @@ def processor_for_phi3v():
     return Phi3VMultiModalProcessor
 
 
-@pytest.mark.parametrize("model", models)
+@pytest.mark.parametrize("model_id", ["microsoft/Phi-3.5-vision-instruct"])
+# yapf: disable
 @pytest.mark.parametrize(
-    "num_crops,expected_toks_per_img",
+    ("mm_processor_kwargs", "expected_toks_per_img"),
     [
-        (4, 757),
-        (16, 1921),
+        ({"num_crops": 4}, 757),
+        ({"num_crops": 16}, 1921),
         # the default num_crops of phi-3.5-vision is 4
-        (None, 757),
+        ({}, 757),
     ])
+# yapf: enable
 @pytest.mark.parametrize("num_imgs", [1, 2])
-def test_processor_override(processor_for_phi3v, image_assets: _ImageAssets,
-                            model: str, num_crops: Optional[int],
-                            expected_toks_per_img: int, num_imgs: int):
+def test_processor_override(
+    processor_for_phi3v,
+    image_assets: _ImageAssets,
+    model_id: str,
+    mm_processor_kwargs: dict[str, int],
+    expected_toks_per_img: int,
+    num_imgs: int,
+):
     """Ensure input_processor_for_phi3v handles num_crops properly."""
-    # Same as the previous test - don't initialize mm_processor_kwargs
-    # in this test and assume that the kwargs will be correctly expanded by
-    # the partial when calling the custom input processor.
     ctx = build_model_context(
-        model_name=model,
-        tokenizer_name=model,
+        model_name=model_id,
+        tokenizer_name=model_id,
         trust_remote_code=True,
         limit_mm_per_prompt={"image": num_imgs},
     )
-    tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True)
+    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
     ctx = InputProcessingContext(ctx.model_config, tokenizer)
+
     # Build the image str / prompt based on the number of images we pass
     img_str = "".join([f"<|image_{idx}|>\n" for idx in range(1, num_imgs + 1)])
     prompt = f"<|user|>\n{img_str}<|end|>\n<|assistant|>\n"
-    images = [image_assets[0].pil_image] * num_imgs
-
-    mm_data = {"image": images}
-    mm_processor_kwargs = {}
-    if num_crops is not None:
-        mm_processor_kwargs = {"num_crops": num_crops}
+    mm_data = {"image": [image_assets[0].pil_image] * num_imgs}
 
     processor = processor_for_phi3v(ctx)
     processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py b/tests/models/decoder_only/vision_language/processing/test_qwen.py
similarity index 100%
rename from tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py
rename to tests/models/decoder_only/vision_language/processing/test_qwen.py
diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py b/tests/models/decoder_only/vision_language/processing/test_qwen2_vl.py
similarity index 64%
rename from tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py
rename to tests/models/decoder_only/vision_language/processing/test_qwen2_vl.py
index 1f0b482666723..b9ac887edf90f 100644
--- a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py
+++ b/tests/models/decoder_only/vision_language/processing/test_qwen2_vl.py
@@ -1,5 +1,3 @@
-from typing import Any, Dict, Tuple
-
 import pytest
 from transformers import AutoTokenizer
 
@@ -8,56 +6,45 @@
 from .....conftest import _ImageAssets
 from ....utils import build_model_context
 
-MODEL = "Qwen/Qwen2-VL-2B-Instruct"
-MIN_PIXELS = "min_pixels"
-MAX_PIXELS = "max_pixels"
-
 
 # Fixtures lazy import to avoid initializing CUDA during test collection
-# NOTE: Qwen2VL supports multiple input modalities, so it registers multiple
-# input mappers.
 @pytest.fixture()
 def processor_for_qwen2_vl():
     from vllm.model_executor.models.qwen2_vl import Qwen2VLMultiModalProcessor
     return Qwen2VLMultiModalProcessor
 
 
+@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"])
+# yapf: disable
 @pytest.mark.parametrize(
-    "mm_processor_kwargs, expected_toks_per_img, expected_pixels_shape", [
+    ("mm_processor_kwargs", "expected_toks_per_img", "expected_pixels_shape"), [
         ({}, 1426, (5704, 1176)),
-        ({
-            MIN_PIXELS: 64**2,
-            MAX_PIXELS: 512**2
-        }, 330, (1320, 1176)),
+        ({"min_pixels": 64**2, "max_pixels": 512**2}, 330, (1320, 1176)),
     ])
-@pytest.mark.parametrize("model", [MODEL])
+# yapf: enable
 @pytest.mark.parametrize("num_imgs", [1, 2])
 def test_processor_override(
     processor_for_qwen2_vl,
     image_assets: _ImageAssets,
-    model: str,
-    mm_processor_kwargs: Dict[str, Any],
+    model_id: str,
+    mm_processor_kwargs: dict[str, object],
     expected_toks_per_img: int,
-    expected_pixels_shape: Tuple[int, int],
+    expected_pixels_shape: tuple[int, int],
     num_imgs: int,
 ):
     """Ensure Qwen2VLMultiModalProcessor handles min/max pixels properly."""
-    # Same as the previous test - don't initialize mm_processor_kwargs
-    # in this test and assume that the kwargs will be correctly expanded by
-    # the partial when calling the custom input processor.
     ctx = build_model_context(
-        model_name=model,
-        tokenizer_name=model,
+        model_name=model_id,
+        tokenizer_name=model_id,
         mm_processor_kwargs=None,
         limit_mm_per_prompt={"image": num_imgs},
     )
-    tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True)
+    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
     ctx = InputProcessingContext(ctx.model_config, tokenizer)
+
     # Build the image str / prompt based on the number of images we pass
     prompt = "<|vision_start|><|image_pad|><|vision_end|>" * num_imgs
-    images = [image_assets[0].pil_image] * num_imgs
-
-    mm_data = {"image": images}
+    mm_data = {"image": [image_assets[0].pil_image] * num_imgs}
 
     processor = processor_for_qwen2_vl(ctx)
     processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py
index 7db08166826eb..dc0b683c1f1cb 100644
--- a/tests/models/decoder_only/vision_language/test_models.py
+++ b/tests/models/decoder_only/vision_language/test_models.py
@@ -274,10 +274,8 @@
             ),
             limit_mm_per_prompt={"image": 4},
         )],
-        # Llava-next tests fixed sizes & the default size factors
-        image_sizes=[((1669, 2560), (2560, 1669), (183, 488), (488, 183))],
     ),
-    "llava_one_vision": VLMTestInfo(
+    "llava_onevision": VLMTestInfo(
         models=["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"],
         test_type=VLMTestType.CUSTOM_INPUTS,
         prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n",   # noqa: E501
@@ -288,8 +286,6 @@
         ),
         auto_cls=AutoModelForVision2Seq,
         vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
-        # Llava-one-vision tests fixed sizes & the default size factors
-        image_sizes=[((1669, 2560), (2560, 1669), (183, 488), (488, 183))],
         custom_test_opts=[CustomTestOptions(
             inputs=custom_inputs.multi_video_multi_aspect_ratio_inputs(
                 formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n",   # noqa: E501
@@ -306,7 +302,6 @@
         max_model_len=4096,
         auto_cls=AutoModelForVision2Seq,
         vllm_output_post_proc=model_utils.llava_video_vllm_to_hf_output,
-        image_sizes=[((1669, 2560), (2560, 1669), (183, 488), (488, 183))],
     ),
     "mantis": VLMTestInfo(
         models=["TIGER-Lab/Mantis-8B-siglip-llama3"],
@@ -431,7 +426,7 @@
             ) for inp in custom_inputs.different_patch_input_cases_internvl()
         ],
     ),
-    "llava_one_vision-multiple-images": VLMTestInfo(
+    "llava_onevision-multiple-images": VLMTestInfo(
         models=["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"],
         test_type=VLMTestType.CUSTOM_INPUTS,
         max_model_len=16384,
diff --git a/tests/models/decoder_only/vision_language/test_qwen2_vl.py b/tests/models/decoder_only/vision_language/test_qwen2_vl.py
index 51fe7d2ad32a8..16e256e040a74 100644
--- a/tests/models/decoder_only/vision_language/test_qwen2_vl.py
+++ b/tests/models/decoder_only/vision_language/test_qwen2_vl.py
@@ -427,130 +427,3 @@ def test_qwen2_vl_video_embeddings_input(vllm_runner, video_assets, model,
         mm_limit=1,
         tensor_parallel_size=1,
     )
-
-
-def run_chunked_prefill_test(
-    vllm_runner: Type[VllmRunner],
-    inputs: List[Tuple[List[str], PromptImageInput, PromptVideoInput]],
-    model: str,
-    *,
-    dtype: str,
-    max_tokens: int,
-    num_logprobs: int,
-    mm_limit: int,
-    tensor_parallel_size: int,
-    distributed_executor_backend: Optional[str] = None,
-):
-    """Compare inference result between
-    chunked prefill disabled and chunked prefill enabled
-    """
-
-    # NOTE:
-    # max_model_len should be greater than image_feature_size
-    with vllm_runner(model,
-                     task="generate",
-                     max_model_len=4000,
-                     max_num_seqs=4,
-                     dtype=dtype,
-                     limit_mm_per_prompt={
-                         "image": mm_limit,
-                         "video": mm_limit
-                     },
-                     tensor_parallel_size=tensor_parallel_size,
-                     distributed_executor_backend=distributed_executor_backend
-                     ) as vllm_model:
-
-        outputs_per_case = [
-            vllm_model.generate_greedy_logprobs(prompts,
-                                                max_tokens,
-                                                num_logprobs=num_logprobs,
-                                                images=images or None,
-                                                videos=videos or None)
-            for prompts, images, videos in inputs
-        ]
-
-    with vllm_runner(
-            model,
-            task="generate",
-            max_model_len=4000,
-            max_num_seqs=4,
-            dtype=dtype,
-            limit_mm_per_prompt={
-                "image": mm_limit,
-                "video": mm_limit
-            },
-            tensor_parallel_size=tensor_parallel_size,
-            distributed_executor_backend=distributed_executor_backend,
-            enable_chunked_prefill=True,
-            # should be small enough to ensure prefilling is chunked
-            max_num_batched_tokens=32,
-            mm_processor_kwargs={
-                "max_pixels": 16 * 28 * 28,
-            }) as vllm_model_chunked:
-        outputs_per_case_chunked = [
-            vllm_model_chunked.generate_greedy_logprobs(
-                prompts,
-                max_tokens,
-                num_logprobs=num_logprobs,
-                images=images or None,
-                videos=videos or None) for prompts, images, videos in inputs
-        ]
-
-    for outputs, \
-        outputs_chunked \
-        in zip(outputs_per_case,
-            outputs_per_case_chunked):
-        check_logprobs_close(
-            outputs_0_lst=outputs,
-            outputs_1_lst=outputs_chunked,
-            name_0="non_chunked",
-            name_1="chunked",
-        )
-
-
-@pytest.mark.core_model
-@pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize("dtype", [target_dtype])
-@pytest.mark.parametrize("max_tokens", [1])
-@pytest.mark.parametrize("num_logprobs", [10])
-def test_qwen2_vl_mrope_chunked_prefill(vllm_runner, example_prompts,
-                                        model: str, dtype: str,
-                                        max_tokens: int,
-                                        num_logprobs: int) -> None:
-    """
-    Test Qwen2-VL's chunked prefill with M-RoPE
-    """
-    prompts = [
-        qwen2_vl_chat_template(IMAGE_PLACEHOLDER, prompt)
-        for prompt in example_prompts[:1]
-    ]
-
-    # 1. Qwen2-VL's M-RoPE works only when there are some multi-modal inputs,
-    #    so an image is included in the inputs
-    # 2. however, Qwen2-VL currently won't work properly
-    #    when chunked prefill is enabled and there are some multi-modal inputs,
-    #    here use a hacky way: provide a **zero-length** image to make it happy
-    #
-    # and finally we achieved:
-    # (1) chunked_prefill enabled; (2) M-RoPE works; to continue our tests
-    zero_len_image = {
-        "image_embeds": torch.empty((0, MODEL_HIDDEN_SIZE)),
-        "image_grid_thw": torch.tensor([[0, 0, 0]])
-    }
-    images = [zero_len_image] * len(prompts)
-
-    inputs_per_case: List[Tuple[List[str], PromptImageInput,
-                                PromptVideoInput]] = [
-                                    (prompts, images, []),
-                                ]
-
-    run_chunked_prefill_test(
-        vllm_runner,
-        inputs_per_case,
-        model,
-        dtype=dtype,
-        max_tokens=max_tokens,
-        num_logprobs=num_logprobs,
-        mm_limit=1,
-        tensor_parallel_size=1,
-    )
diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py
index f99d7556b27f9..b32faa699ebf2 100644
--- a/tests/multimodal/test_processing.py
+++ b/tests/multimodal/test_processing.py
@@ -11,8 +11,8 @@
 from vllm.inputs import InputProcessingContext
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.processing import (ProcessingCache, PromptReplacement,
-                                        _PlaceholderInfo, find_text_matches,
-                                        find_token_matches, iter_placeholders,
+                                        _PlaceholderInfo, find_mm_placeholders,
+                                        find_text_matches, find_token_matches,
                                         iter_token_matches,
                                         replace_text_matches,
                                         replace_token_matches)
@@ -314,21 +314,27 @@ def test_find_replace_text(
     # Should not be used since there is nothing to convert to text
     mock_tokenizer = cast(AnyTokenizer, object())
 
-    prompt_repls = [
-        PromptReplacement(key, target, repl_by_key[key]).bind(mock_tokenizer)
+    mm_prompt_repls = {
+        key: [
+            PromptReplacement(key, target,
+                              repl_by_key[key]).bind(mock_tokenizer)
+        ]
         for key, target in target_by_key.items()
-    ]
-    matches = find_text_matches(prompt, prompt_repls)
+    }
+    mm_matches = {
+        key: find_text_matches(prompt, prompt_repls)
+        for key, prompt_repls in mm_prompt_repls.items()
+    }
 
     result = replace_text_matches(
         prompt,
-        matches,
+        mm_matches,
         {key: mm_count
          for key in repl_by_key},
     )
 
     # Only displayed on error
-    print("matches:", matches)
+    print("mm_matches:", mm_matches)
     print("result:", result)
 
     # Manually constructed results
@@ -380,21 +386,27 @@ def test_find_replace_tokens(
     # Should not be used since there is nothing to convert to tokens
     mock_tokenizer = cast(AnyTokenizer, object())
 
-    prompt_repls = [
-        PromptReplacement(key, target, repl_by_key[key]).bind(mock_tokenizer)
+    mm_prompt_repls = {
+        key: [
+            PromptReplacement(key, target,
+                              repl_by_key[key]).bind(mock_tokenizer)
+        ]
         for key, target in target_by_key.items()
-    ]
-    matches = find_token_matches(prompt, prompt_repls)
+    }
+    mm_matches = {
+        key: find_token_matches(prompt, prompt_repls)
+        for key, prompt_repls in mm_prompt_repls.items()
+    }
 
     result = replace_token_matches(
         prompt,
-        matches,
+        mm_matches,
         {key: mm_count
          for key in repl_by_key},
     )
 
     # Only displayed on error
-    print("matches:", matches)
+    print("mm_matches:", mm_matches)
     print("result:", result)
 
     # Manually constructed results
@@ -417,58 +429,76 @@ def test_find_replace_tokens(
     [
         (
             [1, 9833, 28747, 32000, 9833, 28747, 32000, 32000, 918],
-            [
-                _PlaceholderInfo(
-                    modality="pattern_1",
-                    start_idx=6,
-                    replacement=[32000, 32000],
-                ),
-            ],
+            {
+                "pattern_1": [
+                    _PlaceholderInfo(
+                        modality="pattern_1",
+                        item_idx=0,
+                        start_idx=6,
+                        replacement=[32000, 32000],
+                    ),
+                ],
+            }
+
         ),
         (
             [1, 32000, 32000, 9833, 28747, 32000, 32000, 1550, 918, 1550],
-            [
-                _PlaceholderInfo(
-                    modality="pattern_1",
-                    start_idx=1,
-                    replacement=[32000, 32000],
-                ),
-                _PlaceholderInfo(
-                    modality="pattern_1",
-                    start_idx=5,
-                    replacement=[32000, 32000],
-                ),
-                _PlaceholderInfo(
-                    modality="pattern_3",
-                    start_idx=7,
-                    replacement=[1550, 918, 1550],
-                ),
-            ],
+            {
+                "pattern_1": [
+                    _PlaceholderInfo(
+                        modality="pattern_1",
+                        item_idx=0,
+                        start_idx=1,
+                        replacement=[32000, 32000],
+                    ),
+                    _PlaceholderInfo(
+                        modality="pattern_1",
+                        item_idx=1,
+                        start_idx=5,
+                        replacement=[32000, 32000],
+                    ),
+                ],
+                "pattern_3": [
+                    _PlaceholderInfo(
+                        modality="pattern_3",
+                        item_idx=0,
+                        start_idx=7,
+                        replacement=[1550, 918, 1550],
+                    ),
+                ],
+            }
         ),
         (
             [1, 32000, 32000, 32000, 32000, 32000, 1550, 918, 1550],
-            [
-                _PlaceholderInfo(
-                    modality="pattern_1",
-                    start_idx=1,
-                    replacement=[32000, 32000],
-                ),
-                _PlaceholderInfo(
-                    modality="pattern_1",
-                    start_idx=3,
-                    replacement=[32000, 32000],
-                ),
-                _PlaceholderInfo(
-                    modality="pattern_3",
-                    start_idx=6,
-                    replacement=[1550, 918, 1550],
-                ),
-            ],
+            {
+                "pattern_1": [
+                    _PlaceholderInfo(
+                        modality="pattern_1",
+                        item_idx=0,
+                        start_idx=1,
+                        replacement=[32000, 32000],
+                    ),
+                    _PlaceholderInfo(
+                        modality="pattern_1",
+                        item_idx=1,
+                        start_idx=3,
+                        replacement=[32000, 32000],
+                    ),
+                ],
+                "pattern_3": [
+                    _PlaceholderInfo(
+                        modality="pattern_3",
+                        item_idx=0,
+                        start_idx=6,
+                        replacement=[1550, 918, 1550],
+                    ),
+                ],
+            }
         ),
     ]
 )
 # yapf: enable
-def test_iter_placeholders(
+def test_find_mm_placeholders(
     repl_by_key,
     prompt,
     expected,
@@ -476,19 +506,18 @@ def test_iter_placeholders(
     # Should not be used since there is nothing to convert to tokens
     mock_tokenizer = cast(AnyTokenizer, object())
 
-    prompt_repls = [
-        PromptReplacement(key, [], repl).bind(mock_tokenizer)
+    mm_prompt_repls = {
+        key: [PromptReplacement(key, [], repl).bind(mock_tokenizer)]
         for key, repl in repl_by_key.items()
-    ]
+    }
 
-    result = list(
-        iter_placeholders(
-            prompt_repls,
-            prompt,
-            # Effectively match all occurrences in the prompt
-            {key: 3
-             for key in repl_by_key},
-        ))
+    result = find_mm_placeholders(
+        mm_prompt_repls,
+        prompt,
+        # Effectively match all occurrences in the prompt
+        {key: 3
+         for key in repl_by_key},
+    )
 
     # Only displayed on error
     print("result:", result)
@@ -694,7 +723,10 @@ def _test_processing_cache_correctness(
         }
 
         mm_counts = {k: len(vs) for k, vs in mm_data.items()}
-        prompt = baseline_processor._get_dummy_mm_inputs(mm_counts).prompt_text
+        prompt = baseline_processor._get_dummy_processor_inputs(
+            model_config.max_model_len,
+            mm_counts,
+        ).prompt_text
 
         # Drop unnecessary keys and test single -> multi conversion
         if rng.rand() < simplify_rate:
@@ -728,6 +760,8 @@ def _test_processing_cache_correctness(
     ("adept/fuyu-8b", {"image": False}),
     ("llava-hf/llava-1.5-7b-hf", {"image": True}),
     ("llava-hf/llava-v1.6-mistral-7b-hf", {"image": True}),
+    ("llava-hf/LLaVA-NeXT-Video-7B-hf", {"video": False}),
+    ("llava-hf/llava-onevision-qwen2-0.5b-ov-hf", {"image": True, "video": True}),  # noqa: E501
     ("TIGER-Lab/Mantis-8B-siglip-llama3", {"image": True}),
     ("mistral-community/pixtral-12b", {"image": True}),
     ("Qwen/Qwen2-VL-2B-Instruct", {"image": True, "video": True}),
diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py
index 4f0d679bd6c28..2fd4262a9d3b9 100644
--- a/vllm/model_executor/models/aria.py
+++ b/vllm/model_executor/models/aria.py
@@ -456,7 +456,7 @@ def _get_num_image_tokens(self) -> int:
         hf_config = self.ctx.get_hf_config()
         return max(hf_config.projector_patch_to_query_dict.values())
 
-    def get_mm_max_tokens_per_item(self) -> Mapping[str, int]:
+    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
         return {"image": self._get_num_image_tokens()}
 
     def _get_mm_fields_config(
@@ -488,8 +488,9 @@ def _get_prompt_replacements(
             )
         ]
 
-    def _get_dummy_mm_inputs(
+    def _get_dummy_processor_inputs(
         self,
+        seq_len: int,
         mm_counts: Mapping[str, int],
     ) -> ProcessorInputs:
         hf_config = self.ctx.get_hf_config()
diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py
index 0fe10d8585215..b3ecb2f22dc19 100644
--- a/vllm/model_executor/models/blip2.py
+++ b/vllm/model_executor/models/blip2.py
@@ -405,7 +405,7 @@ def _get_num_image_tokens(self) -> int:
         hf_config = self.ctx.get_hf_config(Blip2Config)
         return hf_config.num_query_tokens
 
-    def get_mm_max_tokens_per_item(self) -> Mapping[str, int]:
+    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
         return {"image": self._get_num_image_tokens()}
 
     def _get_hf_processor(self) -> Blip2Processor:
@@ -457,8 +457,9 @@ def apply(
 
         return result
 
-    def _get_dummy_mm_inputs(
+    def _get_dummy_processor_inputs(
         self,
+        seq_len: int,
         mm_counts: Mapping[str, int],
     ) -> ProcessorInputs:
         hf_config = self.ctx.get_hf_config(Blip2Config)
diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py
index 0bd0194243ceb..1ad44678a591d 100644
--- a/vllm/model_executor/models/chameleon.py
+++ b/vllm/model_executor/models/chameleon.py
@@ -57,7 +57,7 @@ def _get_num_image_tokens(self) -> int:
         processor = self._get_hf_processor()
         return processor.image_seq_length
 
-    def get_mm_max_tokens_per_item(self) -> Mapping[str, int]:
+    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
         return {"image": self._get_num_image_tokens()}
 
     def _get_hf_processor(self) -> ChameleonProcessor:
@@ -90,8 +90,9 @@ def _get_prompt_replacements(
             )
         ]
 
-    def _get_dummy_mm_inputs(
+    def _get_dummy_processor_inputs(
         self,
+        seq_len: int,
         mm_counts: Mapping[str, int],
     ) -> ProcessorInputs:
         config = self.ctx.get_hf_config(ChameleonConfig)
diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py
index 0188452054b8c..1bde45cb140cb 100644
--- a/vllm/model_executor/models/clip.py
+++ b/vllm/model_executor/models/clip.py
@@ -164,15 +164,18 @@ def get_num_image_tokens(
     def get_max_image_tokens(self) -> int:
         return get_max_clip_image_tokens(self.vision_config)
 
-    def get_num_patches(self) -> int:
+    def get_image_size(self) -> int:
+        return self.vision_config.image_size
+
+    def get_patch_size(self) -> int:
+        return self.vision_config.patch_size
+
+    def get_patch_grid_length(self) -> int:
         return get_clip_patch_grid_length(
             image_size=self.vision_config.image_size,
             patch_size=self.vision_config.patch_size,
         )
 
-    def get_image_size(self) -> int:
-        return self.vision_config.image_size
-
 
 # Adapted from https://github.com/huggingface/transformers/blob/v4.39.0/src/transformers/models/clip/modeling_clip.py#L164 # noqa
 class CLIPVisionEmbeddings(nn.Module):
diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py
index 3680d01725238..7cd58fbc7cf21 100644
--- a/vllm/model_executor/models/fuyu.py
+++ b/vllm/model_executor/models/fuyu.py
@@ -96,7 +96,7 @@ def _get_image_feature_grid_size(
         nrows = math.ceil(image_height / 30)
         return ncols, nrows
 
-    def get_mm_max_tokens_per_item(self) -> Mapping[str, int]:
+    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
         target_width, target_height = self._get_image_target_size()
 
         max_ncols, max_nrows = self._get_image_feature_grid_size(
@@ -208,8 +208,9 @@ def apply(
 
         return result
 
-    def _get_dummy_mm_inputs(
+    def _get_dummy_processor_inputs(
         self,
+        seq_len: int,
         mm_counts: Mapping[str, int],
     ) -> ProcessorInputs:
         target_width, target_height = self._get_image_target_size()
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index 78de27cd821c6..d522378e0bebb 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -25,11 +25,9 @@
                                     NestedTensors)
 from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
                                    ImageSize)
-from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        InputProcessingContext,
+from vllm.multimodal.processing import (InputProcessingContext,
                                         MultiModalDataItems, ProcessingCache,
-                                        ProcessorInputs, PromptReplacement,
-                                        full_groupby_modality)
+                                        ProcessorInputs, PromptReplacement)
 from vllm.sequence import IntermediateTensors
 
 from .clip import CLIPVisionModel
@@ -39,7 +37,7 @@
 from .siglip import SiglipVisionModel
 from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model,
                     maybe_prefix, merge_multimodal_embeddings)
-from .vision import vision_encoder_info
+from .vision import BaseVisionLanguageMultiModalProcessor
 
 
 class LlavaImagePixelInputs(TypedDict):
@@ -100,19 +98,7 @@ class LlavaLikeConfig(Protocol):
     vision_feature_layer: Final[Union[int, List[int]]]
 
 
-class BaseLlavaMultiModalProcessor(BaseMultiModalProcessor):
-
-    def __init__(self,
-                 ctx: InputProcessingContext,
-                 *,
-                 cache: Optional[ProcessingCache] = None,
-                 enable_sanity_checks: bool = True) -> None:
-        super().__init__(ctx,
-                         cache=cache,
-                         enable_sanity_checks=enable_sanity_checks)
-
-        vision_config = self._get_hf_config().vision_config
-        self._vision_encoder_info = vision_encoder_info(vision_config)
+class BaseLlavaMultiModalProcessor(BaseVisionLanguageMultiModalProcessor):
 
     @abstractmethod
     def _get_hf_config(self) -> LlavaLikeConfig:
@@ -121,6 +107,19 @@ def _get_hf_config(self) -> LlavaLikeConfig:
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": None}
 
+    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
+        return {"image": self._get_max_image_tokens()}
+
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        return dict(
+            pixel_values=MultiModalFieldConfig.batched("image"),
+            image_embeds=MultiModalFieldConfig.batched("image"),
+        )
+
     def _apply_feature_select_strategy(
         self,
         strategy: str,
@@ -142,19 +141,6 @@ def _get_max_image_tokens(self) -> int:
             self._vision_encoder_info.get_max_image_tokens(),
         )
 
-    def get_mm_max_tokens_per_item(self) -> Mapping[str, int]:
-        return {"image": self._get_max_image_tokens()}
-
-    def _get_mm_fields_config(
-        self,
-        hf_inputs: BatchFeature,
-        hf_processor_mm_kwargs: Mapping[str, object],
-    ) -> Mapping[str, MultiModalFieldConfig]:
-        return dict(
-            pixel_values=MultiModalFieldConfig.batched("image"),
-            image_embeds=MultiModalFieldConfig.batched("image"),
-        )
-
     def _get_dummy_image_size(self) -> ImageSize:
         image_size = self._vision_encoder_info.get_image_size()
         return ImageSize(image_size, image_size)
@@ -163,8 +149,9 @@ def _get_dummy_image_size(self) -> ImageSize:
     def _get_image_token(self) -> str:
         raise NotImplementedError
 
-    def _get_dummy_mm_inputs(
+    def _get_dummy_processor_inputs(
         self,
+        seq_len: int,
         mm_counts: Mapping[str, int],
     ) -> ProcessorInputs:
         num_images = mm_counts.get("image", 0)
@@ -709,7 +696,7 @@ def get_replacement_mantis(item_idx: int):
                 "</Image>)",  # 3 tokens
             ])
 
-        mantis_repls = self._bind_prompt_replacements([
+        mantis_mm_repls = self._bind_and_group_repls([
             PromptReplacement(
                 modality="image",
                 target=[image_token_id] * num_image_tokens,
@@ -719,7 +706,7 @@ def get_replacement_mantis(item_idx: int):
 
         prompt_ids, prompt_text, _ = self._apply_prompt_replacements(
             result["prompt_token_ids"],
-            mantis_repls,
+            mantis_mm_repls,
             mm_item_counts,
         )
 
@@ -728,15 +715,19 @@ def get_replacement_mantis(item_idx: int):
             hf_processor_mm_kwargs,
             mm_kwargs,
         )
-        orig_repls = self._bind_prompt_replacements(unbound_orig_repls)
+        orig_repls = self._bind_and_group_repls(unbound_orig_repls)
+
+        mm_placeholders = self._find_mm_placeholders(
+            orig_repls,
+            prompt_ids,
+            mm_item_counts,
+        )
 
-        all_placeholders = self._find_placeholders(orig_repls, prompt_ids,
-                                                   mm_item_counts)
-        assert len(all_placeholders) == mm_item_counts.get("image", 0)
+        self._validate_mm_placeholders(mm_placeholders, mm_item_counts)
 
-        mm_placeholders = {
-            modality: [item.to_range() for item in items]
-            for modality, items in full_groupby_modality(all_placeholders)
+        mm_placeholder_ranges = {
+            modality: [item.to_range() for item in placeholders]
+            for modality, placeholders in mm_placeholders.items()
         }
 
         return MultiModalInputsV2(
@@ -744,7 +735,7 @@ def get_replacement_mantis(item_idx: int):
             prompt=prompt_text,
             prompt_token_ids=prompt_ids,
             mm_kwargs=mm_kwargs,
-            mm_placeholders=mm_placeholders,
+            mm_placeholders=mm_placeholder_ranges,
         )
 
 
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index 24debd1cbf3fe..3769f04f94a92 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -67,9 +67,6 @@ def _get_hf_config(self) -> LlavaNextConfig:
     def _get_hf_processor(self) -> LlavaNextProcessor:
         return self.ctx.get_hf_processor(LlavaNextProcessor)
 
-    def _get_image_token(self) -> str:
-        return self._get_hf_processor().image_token
-
     def _get_mm_fields_config(
         self,
         hf_inputs: BatchFeature,
@@ -81,6 +78,9 @@ def _get_mm_fields_config(
             image_embeds=MultiModalFieldConfig.batched("image"),
         )
 
+    def _get_image_token(self) -> str:
+        return self._get_hf_processor().image_token
+
     def _get_max_image_tokens(self) -> int:
         largest_feature_size, _ = self._get_pinpoint_with_most_features()
         return largest_feature_size
@@ -97,20 +97,20 @@ def _get_num_image_tokens(
         image_height: int,
     ) -> int:
         hf_config = self._get_hf_config()
+        vision_encoder_info = self._vision_encoder_info
 
         base_feature_size = self._apply_feature_select_strategy(
             hf_config.vision_feature_select_strategy,
-            self._vision_encoder_info.get_num_image_tokens(
+            vision_encoder_info.get_num_image_tokens(
                 image_width=image_width,
                 image_height=image_height,
             ),
         )
-        num_patches = self._vision_encoder_info.get_num_patches()
 
         num_patch_height, num_patch_width = get_anyres_image_grid_shape(
             image_size=(image_height, image_width),
             grid_pinpoints=hf_config.image_grid_pinpoints,
-            patch_size=self._vision_encoder_info.get_image_size(),
+            patch_size=vision_encoder_info.get_image_size(),
         )
 
         (
@@ -119,7 +119,7 @@ def _get_num_image_tokens(
         ) = self._get_num_unpadded_features(
             original_height=image_height,
             original_width=image_width,
-            npatches=num_patches,
+            npatches=vision_encoder_info.get_patch_grid_length(),
             num_patch_height=num_patch_height,
             num_patch_width=num_patch_width,
         )
@@ -155,6 +155,7 @@ def _get_num_unpadded_features(
 
         unpadded_features = current_height * current_width
         newline_features = current_height
+
         return (unpadded_features, newline_features)
 
     def _get_pinpoint_with_most_features(self) -> tuple[int, ImageSize]:
diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py
index 0de9d8c5ea572..ee6b89f0d4498 100644
--- a/vllm/model_executor/models/llava_next_video.py
+++ b/vllm/model_executor/models/llava_next_video.py
@@ -3,38 +3,32 @@
 from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple,
                     TypedDict, Union)
 
-import numpy as np
 import torch
 import torch.nn as nn
-from transformers import (CLIPVisionConfig, LlavaNextVideoConfig,
-                          SiglipVisionConfig)
+from transformers import (BatchFeature, LlavaNextVideoConfig,
+                          LlavaNextVideoProcessor)
 
 from vllm.attention import AttentionMetadata
 from vllm.config import VllmConfig
-from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
-                         InputContext, token_inputs)
 from vllm.model_executor.layers.activation import get_act_fn
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.models.clip import CLIPVisionModel
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import NestedTensors
-from vllm.multimodal.utils import (cached_get_tokenizer,
-                                   repeat_and_pad_placeholder_tokens)
+from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors
+from vllm.multimodal.parse import (ImageSize, MultiModalDataItems,
+                                   VideoEmbeddingItems, VideoProcessorItems)
+from vllm.multimodal.processing import (MultiModalFieldConfig, ProcessorInputs,
+                                        PromptReplacement)
 from vllm.sequence import IntermediateTensors
 from vllm.utils import is_list_of
 
-from .clip import dummy_image_for_clip, dummy_seq_data_for_clip
 from .interfaces import SupportsMultiModal, SupportsPP
 from .llava import init_vision_tower_for_llava
-from .siglip import (SiglipVisionModel, dummy_image_for_siglip,
-                     dummy_seq_data_for_siglip)
+from .siglip import SiglipVisionModel
 from .utils import (AutoWeightsLoader, init_vllm_registered_model,
                     maybe_prefix, merge_multimodal_embeddings)
-
-# For profile run
-_MAX_FRAMES_PER_VIDEO = 32
-_MAX_NUM_VIDEOS = 1
+from .vision import BaseVisionLanguageMultiModalProcessor
 
 
 class LlavaNextVideoPixelInputs(TypedDict):
@@ -50,143 +44,148 @@ class LlavaNextVideoPixelInputs(TypedDict):
     """
 
 
-def get_llava_next_video_frame_feature_size(
-        hf_config: LlavaNextVideoConfig) -> int:
-    # Support both CLIPVisionConfig and SiglipVisionConfig
-    image_size = hf_config.vision_config.image_size
-    patch_size = hf_config.vision_config.patch_size
-    spatial_pool_stride = hf_config.spatial_pool_stride
+class LlavaNextVideoMultiModalProcessor(BaseVisionLanguageMultiModalProcessor):
 
-    return int((image_size / patch_size / spatial_pool_stride)**2)
+    def _get_hf_config(self) -> LlavaNextVideoConfig:
+        return self.ctx.get_hf_config(LlavaNextVideoConfig)
 
+    def _get_hf_processor(self) -> LlavaNextVideoProcessor:
+        return self.ctx.get_hf_processor(LlavaNextVideoProcessor)
 
-def _get_max_llm_tokens(ctx: InputContext) -> int:
-    """
-    Calculated from the maximum video frames under the context length
-    constraints of the language model.
-    """
-    hf_text_config = ctx.model_config.hf_text_config
-    model_config = ctx.model_config
-    max_tokens = model_config.max_model_len
-    rope_scaling = model_config.rope_scaling
-
-    if rope_scaling:
-        rope_scaling_factor = hf_text_config.rope_scaling["factor"]
-    else:
-        rope_scaling_factor = 1
-
-    max_tokens *= rope_scaling_factor
-
-    return max_tokens
-
-
-def get_max_llava_next_video_tokens(ctx: InputContext) -> int:
-    # Currently set to 32 frames
-    # TODO: max_tokens = _get_max_llm_tokens(ctx)
-    hf_config = ctx.get_hf_config(LlavaNextVideoConfig)
-    tokens_per_frame = get_llava_next_video_frame_feature_size(hf_config)
-    return _MAX_FRAMES_PER_VIDEO * tokens_per_frame
-
-
-def dummy_data_for_llava_next_video(ctx: InputContext, seq_len: int,
-                                    mm_counts: Mapping[str, int]):
-    hf_config = ctx.get_hf_config(LlavaNextVideoConfig)
-    vision_config = hf_config.vision_config
-
-    # TODO: support multiple videos
-    num_videos = mm_counts["video"]
-    if num_videos != _MAX_NUM_VIDEOS:
-        raise NotImplementedError(
-            f"Only {_MAX_NUM_VIDEOS} videos are supported")
-
-    # TODO: support configuring the number of frames
-    frames_per_video = _MAX_FRAMES_PER_VIDEO
-    # num_images = num_videos * frames_per_video
-
-    # fills the sequence with as longer video data as possible
-    tokens_per_frame = get_llava_next_video_frame_feature_size(hf_config)
-    video_feature_size = frames_per_video * tokens_per_frame
-
-    if isinstance(vision_config, CLIPVisionConfig):
-        seq_data, ranges = dummy_seq_data_for_clip(
-            vision_config,
-            seq_len,
-            num_videos,
-            image_token_id=hf_config.video_token_index,
-            image_feature_size_override=video_feature_size,
-            mm_key="video",
-        )
+    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+        return {"video": 1}
+
+    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
+        num_frames = self._get_dummy_num_frames(seq_len)
+        max_video_tokens = self._get_max_video_tokens(num_frames)
+
+        return {"video": max_video_tokens}
+
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        return dict(pixel_values_videos=MultiModalFieldConfig.batched("video"))
+
+    def _get_num_frame_tokens(
+        self,
+        *,
+        image_width: int,
+        image_height: int,
+    ) -> int:
+        hf_config = self._get_hf_config()
+        spatial_pool_stride = hf_config.spatial_pool_stride
 
-        pil_frame = dummy_image_for_clip(vision_config, num_images=1)
-        np_frame = np.array(pil_frame["image"])
-        mm_data_per_video = np.repeat([np_frame], frames_per_video, axis=0)
-        mm_data = {"video": mm_data_per_video}
-        return DummyData(seq_data, mm_data, ranges)
-    elif isinstance(vision_config, SiglipVisionConfig):
-        seq_data, ranges = dummy_seq_data_for_siglip(
-            vision_config,
-            seq_len,
-            num_videos,
-            image_token_id=hf_config.video_token_index,
-            image_feature_size_override=video_feature_size,
-            mm_key="video",
+        patch_grid_length = self._vision_encoder_info.get_patch_grid_length()
+        pooled_grid_length = math.ceil(patch_grid_length / spatial_pool_stride)
+
+        return pooled_grid_length * pooled_grid_length
+
+    def _get_num_video_tokens(
+        self,
+        *,
+        image_width: int,
+        image_height: int,
+        num_frames: int,
+    ) -> int:
+        num_frame_tokens = self._get_num_frame_tokens(
+            image_width=image_width,
+            image_height=image_height,
         )
 
-        pil_frame = dummy_image_for_siglip(vision_config, num_images=1)
-        np_frame = np.array(pil_frame["image"])
-        mm_data_per_video = np.repeat([np_frame], frames_per_video, axis=0)
-        mm_data = {"video": mm_data_per_video}
-        return DummyData(seq_data, mm_data, ranges)
+        return num_frame_tokens * num_frames
 
-    msg = f"Unsupported vision config: {type(vision_config)}"
-    raise NotImplementedError(msg)
+    def _get_max_video_tokens(self, num_frames: int) -> int:
+        return self._get_num_video_tokens(image_width=999999,
+                                          image_height=999999,
+                                          num_frames=num_frames)
 
+    def _get_max_video_frames(self, max_tokens: int) -> int:
+        num_frames = 0
 
-def input_processor_for_llava_next_video(ctx: InputContext,
-                                         inputs: DecoderOnlyInputs):
-    multi_modal_data = inputs.get("multi_modal_data")
-    if multi_modal_data is None or "video" not in multi_modal_data:
-        return inputs
+        while True:
+            next_num_frames = num_frames + 1
 
-    if "multi_modal_placeholders" in inputs and "video" in inputs[
-            "multi_modal_placeholders"]:
-        # The inputs already have placeholders.
-        return inputs
+            if self._get_max_video_tokens(next_num_frames) > max_tokens:
+                break
 
-    video_data = multi_modal_data["video"]
+            num_frames = next_num_frames
 
-    model_config = ctx.model_config
-    hf_config = ctx.get_hf_config(LlavaNextVideoConfig)
-    vision_config = hf_config.vision_config
+        return num_frames
 
-    if isinstance(video_data, np.ndarray):
-        # Supports both CLIP and Siglip
-        num_frames = video_data.shape[0]
-        frame_feature_size = \
-            get_llava_next_video_frame_feature_size(hf_config)
-        video_feature_size = num_frames * frame_feature_size
+    def _get_dummy_num_frames(self, seq_len: int) -> int:
+        mm_config = self.ctx.get_mm_config()
+        max_videos = mm_config.limit_per_prompt.get("video", 1)
 
-        tokenizer = cached_get_tokenizer(model_config.tokenizer)
+        max_total_frames = self._get_max_video_frames(seq_len)
 
-        new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens(
-            tokenizer,
-            inputs.get("prompt"),
-            inputs["prompt_token_ids"],
-            placeholder_token_id=hf_config.video_token_index,
-            repeat_count=video_feature_size,
-        )
+        return max(max_total_frames // max(max_videos, 1), 1)
 
-        return token_inputs(prompt_token_ids=new_token_ids,
-                            prompt=new_prompt,
-                            multi_modal_data=multi_modal_data,
-                            multi_modal_placeholders={"video": ranges})
+    def _get_dummy_image_size(self) -> ImageSize:
+        image_size = self._vision_encoder_info.get_image_size()
+        return ImageSize(image_size, image_size)
 
-    elif is_list_of(video_data, np.ndarray):
-        raise NotImplementedError(
-            "Processing multiple videos is not supported")
+    def _get_video_token(self) -> str:
+        return self._get_hf_processor().video_token
 
-    msg = f"Unsupported vision config: {type(vision_config)}"
-    raise NotImplementedError(msg)
+    def _get_prompt_replacements(
+        self,
+        mm_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, object],
+        out_mm_kwargs: MultiModalKwargs,
+    ) -> list[PromptReplacement]:
+        hf_config = self._get_hf_config()
+        video_token_id = hf_config.video_token_index
+
+        def get_replacement(item_idx: int):
+            videos = mm_items.get_items(
+                "video", (VideoEmbeddingItems, VideoProcessorItems))
+
+            if isinstance(videos, VideoEmbeddingItems):
+                num_video_tokens = videos.get_feature_size(item_idx)
+            else:
+                image_size = videos.get_frame_size(item_idx)
+                num_video_tokens = self._get_num_video_tokens(
+                    image_width=image_size.width,
+                    image_height=image_size.height,
+                    num_frames=videos.get_num_frames(item_idx),
+                )
+
+            return [video_token_id] * num_video_tokens
+
+        return [
+            PromptReplacement(
+                modality="video",
+                target=[video_token_id],
+                replacement=get_replacement,
+            ),
+        ]
+
+    def _get_dummy_processor_inputs(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> ProcessorInputs:
+        num_videos = mm_counts.get("video", 0)
+
+        video_token = self._get_video_token()
+        target_width, target_height = self._get_dummy_image_size()
+
+        mm_data = {
+            "video":
+            self._get_dummy_videos(
+                width=target_width,
+                height=target_height,
+                num_frames=self._get_dummy_num_frames(seq_len),
+                num_videos=num_videos,
+            )
+        }
+
+        return ProcessorInputs(
+            prompt_text=video_token * num_videos,
+            mm_data=mm_data,
+        )
 
 
 # adopted from transformers modeling_llava_next_video.py
@@ -246,11 +245,7 @@ def forward(self, image_features: torch.Tensor) -> torch.Tensor:
         return hidden_states
 
 
-@MULTIMODAL_REGISTRY.register_input_mapper("video")
-@MULTIMODAL_REGISTRY.register_max_multimodal_tokens(
-    "video", get_max_llava_next_video_tokens)
-@INPUT_REGISTRY.register_dummy_data(dummy_data_for_llava_next_video)
-@INPUT_REGISTRY.register_input_processor(input_processor_for_llava_next_video)
+@MULTIMODAL_REGISTRY.register_processor(LlavaNextVideoMultiModalProcessor)
 class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal,
                                              SupportsPP):
 
diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py
index 0bebc1c745e2b..1e51e09a24c18 100644
--- a/vllm/model_executor/models/llava_onevision.py
+++ b/vllm/model_executor/models/llava_onevision.py
@@ -3,47 +3,36 @@
 from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple,
                     TypedDict, Union)
 
-import numpy as np
 import torch
 import torch.nn as nn
-from PIL import Image
-from transformers import (CLIPVisionConfig, LlavaOnevisionConfig,
-                          SiglipVisionConfig)
+from transformers import (BatchFeature, LlavaOnevisionConfig,
+                          LlavaOnevisionProcessor)
 from transformers.models.llava_onevision.modeling_llava_onevision import (
     get_anyres_image_grid_shape, unpad_image)
 from typing_extensions import NotRequired
 
 from vllm.attention import AttentionMetadata
 from vllm.config import VllmConfig
-from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
-                         InputContext, token_inputs)
 from vllm.model_executor.layers.activation import get_act_fn
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import NestedTensors
-from vllm.multimodal.utils import (cached_get_tokenizer,
-                                   repeat_and_pad_placeholder_tokens)
+from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors
+from vllm.multimodal.parse import (MultiModalDataItems, VideoEmbeddingItems,
+                                   VideoProcessorItems)
+from vllm.multimodal.processing import (MultiModalFieldConfig, ProcessorInputs,
+                                        PromptReplacement)
 from vllm.sequence import IntermediateTensors
 from vllm.utils import is_list_of
 
-from .clip import (CLIPVisionModel, dummy_seq_data_for_clip,
-                   dummy_video_for_clip, get_clip_image_feature_size,
-                   get_clip_patch_grid_length, input_processor_for_clip)
+from .clip import CLIPVisionModel
 from .interfaces import SupportsMultiModal, SupportsPP
 from .llava import init_vision_tower_for_llava
-from .siglip import (SiglipVisionModel, dummy_seq_data_for_siglip,
-                     dummy_video_for_siglip, get_siglip_image_feature_size,
-                     get_siglip_patch_grid_length, input_processor_for_siglip)
+from .llava_next import LlavaNextMultiModalProcessor
+from .siglip import SiglipVisionModel
 from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model,
                     maybe_prefix, merge_multimodal_embeddings)
 
-# Result in the max possible feature size (2x2 grid of 336x336px tiles)
-MAX_IMAGE_FEATURE_SIZE_HEIGHT = MAX_IMAGE_FEATURE_SIZE_WIDTH = 448
-
-# For profile run
-_MAX_FRAMES_PER_VIDEO = 16
-
 
 class LlavaOnevisionVideoPixelInputs(TypedDict):
     type: Literal["pixel_values_videos"]
@@ -92,286 +81,251 @@ class LlavaOnevisionImageEmbeddingInputs(TypedDict):
                                   LlavaOnevisionVideoPixelInputs]
 
 
-def _get_llava_onevision_image_unppaded_feature_size(height, width, patches,
-                                                     scale_height,
-                                                     scale_width):
-    current_height = patches * scale_height
-    current_width = patches * scale_width
-
-    original_aspect_ratio = width / height
-    current_aspect_ratio = current_width / current_height
-    if original_aspect_ratio > current_aspect_ratio:
-        new_height = int(height * (current_width / width))
-        padding = (current_height - new_height) // 2
-        current_height -= padding * 2
-    else:
-        new_width = int(width * (current_height / height))
-        padding = (current_width - new_width) // 2
-        current_width -= padding * 2
-
-    unpadded_features = current_height * current_width
-    newline_features = current_height
-
-    ratio = math.sqrt(current_height * current_width / (9 * patches**2))
-    if ratio > 1.1:
-        unpadded_features = int(current_height // ratio) * int(
-            current_width // ratio)
-        newline_features = int(current_height // ratio)
-
-    return (unpadded_features, newline_features)
-
-
-def get_llava_onevision_image_feature_size(
-    hf_config: LlavaOnevisionConfig,
-    *,
-    input_height: int,
-    input_width: int,
-) -> int:
-    vision_config = hf_config.vision_config
-
-    if isinstance(vision_config, CLIPVisionConfig):
-        num_patches = get_clip_patch_grid_length(
-            image_size=vision_config.image_size,
-            patch_size=vision_config.patch_size,
-        )
-        base_feature_size = get_clip_image_feature_size(vision_config)
-    elif isinstance(vision_config, SiglipVisionConfig):
-        num_patches = get_siglip_patch_grid_length(
-            image_size=vision_config.image_size,
-            patch_size=vision_config.patch_size,
-        )
-        base_feature_size = get_siglip_image_feature_size(vision_config)
-    else:
-        msg = f"Unsupported vision config: {type(vision_config)}"
-        raise NotImplementedError(msg)
-
-    strategy = hf_config.vision_feature_select_strategy
-    if strategy == "default":
-        base_feature_size -= 1
-    elif strategy == "full":
-        pass
-    else:
-        raise ValueError(f"Unexpected select feature strategy: {strategy}")
+class LlavaOnevisionMultiModalProcessor(LlavaNextMultiModalProcessor):
 
-    num_patch_height, num_patch_width = get_anyres_image_grid_shape(
-        image_size=(input_height, input_width),
-        grid_pinpoints=hf_config.image_grid_pinpoints,
-        patch_size=vision_config.image_size,
-    )
+    def _get_hf_config(self) -> LlavaOnevisionConfig:
+        return self.ctx.get_hf_config(LlavaOnevisionConfig)
 
-    (
-        unpadded_feature_size,
-        newline_feature_size,
-    ) = _get_llava_onevision_image_unppaded_feature_size(
-        input_height, input_width, num_patches, num_patch_height,
-        num_patch_width)
-
-    return unpadded_feature_size + newline_feature_size + base_feature_size
-
-
-def get_max_llava_onevision_image_tokens(ctx: InputContext):
-    return get_llava_onevision_image_feature_size(
-        ctx.get_hf_config(LlavaOnevisionConfig),
-        input_height=MAX_IMAGE_FEATURE_SIZE_HEIGHT,
-        input_width=MAX_IMAGE_FEATURE_SIZE_WIDTH,
-    )
-
-
-def get_llava_onevision_video_frame_feature_size(
-        hf_config: LlavaOnevisionConfig) -> int:
-    # Support both CLIPVisionConfig and SiglipVisionConfig
-    image_size = hf_config.vision_config.image_size
-    patch_size = hf_config.vision_config.patch_size
-    spatial_pool_stride = hf_config.spatial_pool_stride if hasattr(
-        hf_config, "spatial_pool_stride") else 2
-
-    height = width = image_size // patch_size
-    return math.ceil(height / spatial_pool_stride) * math.ceil(
-        width / spatial_pool_stride)
-
-
-def get_llava_onevision_video_tokens(ctx: InputContext,
-                                     num_frames: int) -> int:
-    hf_config = ctx.get_hf_config(LlavaOnevisionConfig)
-
-    # TODO: support configuring (not supported by HF right now)
-    num_token_image_newline = 1
-    tokens_per_frame = get_llava_onevision_video_frame_feature_size(hf_config)
-    video_feature_size = num_frames * tokens_per_frame + num_token_image_newline
-
-    return video_feature_size
-
-
-def get_max_llava_onevision_video_tokens(ctx: InputContext) -> int:
-    return get_llava_onevision_video_tokens(ctx, _MAX_FRAMES_PER_VIDEO)
-
-
-def dummy_data_for_llava_onevision(ctx: InputContext, seq_len: int,
-                                   mm_counts: Mapping[str, int]):
-    hf_config = ctx.get_hf_config(LlavaOnevisionConfig)
-    vision_config = hf_config.vision_config
-
-    num_videos = mm_counts["video"]
-
-    # TODO: support configuring the number of frames
-    num_frames = _MAX_FRAMES_PER_VIDEO
-    video_feature_size = get_llava_onevision_video_tokens(ctx, num_frames)
-
-    if isinstance(vision_config, CLIPVisionConfig):
-        seq_data, ranges = dummy_seq_data_for_clip(
-            vision_config,
-            seq_len,
-            num_videos,
-            image_token_id=hf_config.video_token_index,
-            image_feature_size_override=video_feature_size,
-            mm_key="video")
-
-        mm_data = dummy_video_for_clip(vision_config,
-                                       num_frames=num_frames,
-                                       num_videos=num_videos)
-        return DummyData(seq_data, mm_data, ranges)
-    elif isinstance(vision_config, SiglipVisionConfig):
-        seq_data, ranges = dummy_seq_data_for_siglip(
-            vision_config,
-            seq_len,
-            num_videos,
-            image_token_id=hf_config.video_token_index,
-            image_feature_size_override=video_feature_size,
-            mm_key="video")
-
-        mm_data = dummy_video_for_siglip(vision_config,
-                                         num_frames=num_frames,
-                                         num_videos=num_videos)
-        return DummyData(seq_data, mm_data, ranges)
-
-    msg = f"Unsupported vision config: {type(vision_config)}"
-    raise NotImplementedError(msg)
-
-
-def input_processor_when_multimodal_input_image(ctx: InputContext,
-                                                inputs: DecoderOnlyInputs):
-    multi_modal_data = inputs.get("multi_modal_data")
-    if multi_modal_data is None or "image" not in multi_modal_data:
-        return inputs
-
-    model_config = ctx.model_config
-    hf_config = ctx.get_hf_config(LlavaOnevisionConfig)
-    vision_config = hf_config.vision_config
-
-    image_data = multi_modal_data["image"]
-    if isinstance(image_data, Image.Image):
-        width, height = image_data.size
-
-        image_feature_size = get_llava_onevision_image_feature_size(
-            hf_config,
-            input_height=height,
-            input_width=width,
-        )
-    elif is_list_of(image_data, Image.Image):
-        image_feature_size = [
-            get_llava_onevision_image_feature_size(hf_config,
-                                                   input_height=img.height,
-                                                   input_width=img.width)
-            for img in image_data
-        ]
-    elif isinstance(image_data, torch.Tensor):
-        num_images, image_feature_size, hidden_size = image_data.shape
-    elif is_list_of(image_data, torch.Tensor):
-        image_feature_size = [item.shape[1] for item in image_data]
-    else:
-        raise TypeError(f"Invalid image type: {type(image_data)}")
-
-    vision_config = hf_config.vision_config
-
-    if isinstance(vision_config, CLIPVisionConfig):
-        return input_processor_for_clip(
-            model_config,
-            vision_config,
-            inputs,
-            image_token_id=hf_config.image_token_index,
-            image_feature_size_override=image_feature_size,
+    def _get_hf_processor(self) -> LlavaOnevisionProcessor:
+        return self.ctx.get_hf_processor(LlavaOnevisionProcessor)
+
+    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+        return {"image": None, "video": None}
+
+    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
+        max_image_tokens = self._get_max_image_tokens()
+
+        num_frames = self._get_dummy_num_frames(seq_len)
+        max_video_tokens = self._get_max_video_tokens(num_frames)
+
+        return {
+            "image": max_image_tokens,
+            "video": max_video_tokens,
+        }
+
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        return dict(
+            pixel_values=MultiModalFieldConfig.batched("image"),
+            image_sizes=MultiModalFieldConfig.batched("image"),
+            image_embeds=MultiModalFieldConfig.batched("image"),
+            pixel_values_videos=MultiModalFieldConfig.batched("video"),
         )
-    elif isinstance(vision_config, SiglipVisionConfig):
-        return input_processor_for_siglip(
-            model_config,
-            vision_config,
-            inputs,
-            image_token_id=hf_config.image_token_index,
-            image_feature_size_override=image_feature_size,
+
+    def _get_num_unpadded_features(
+        self,
+        *,
+        original_height: int,
+        original_width: int,
+        npatches: int,
+        num_patch_height: int,
+        num_patch_width: int,
+    ) -> tuple[int, int]:
+        current_height = npatches * num_patch_height
+        current_width = npatches * num_patch_width
+
+        original_aspect_ratio = original_width / original_height
+        current_aspect_ratio = current_width / current_height
+        if original_aspect_ratio > current_aspect_ratio:
+            new_height = int(original_height *
+                             (current_width / original_width))
+            padding = (current_height - new_height) // 2
+            current_height -= padding * 2
+        else:
+            new_width = int(original_width *
+                            (current_height / original_height))
+            padding = (current_width - new_width) // 2
+            current_width -= padding * 2
+
+        unpadded_features = current_height * current_width
+        newline_features = current_height
+
+        ratio = math.sqrt(current_height * current_width / (9 * npatches**2))
+        if ratio > 1.1:
+            unpadded_features = int(current_height // ratio) * int(
+                current_width // ratio)
+            newline_features = int(current_height // ratio)
+
+        return (unpadded_features, newline_features)
+
+    def _get_num_frame_tokens(
+        self,
+        *,
+        image_width: int,
+        image_height: int,
+    ) -> int:
+        hf_config = self._get_hf_config()
+        spatial_pool_stride = getattr(hf_config, "spatial_pool_stride", 2)
+
+        patch_grid_length = self._vision_encoder_info.get_patch_grid_length()
+        pooled_grid_length = math.ceil(patch_grid_length / spatial_pool_stride)
+
+        return pooled_grid_length * pooled_grid_length
+
+    def _get_num_video_tokens(
+        self,
+        *,
+        image_width: int,
+        image_height: int,
+        num_frames: int,
+    ) -> int:
+        num_frame_tokens = self._get_num_frame_tokens(
+            image_width=image_width,
+            image_height=image_height,
         )
 
-    msg = f"Unsupported vision config: {type(vision_config)}"
-    raise NotImplementedError(msg)
+        return num_frame_tokens * num_frames + 1  # Newline token
+
+    def _get_max_video_tokens(self, num_frames: int) -> int:
+        return self._get_num_video_tokens(image_width=999999,
+                                          image_height=999999,
+                                          num_frames=num_frames)
 
+    def _get_max_video_frames(self, max_tokens: int) -> int:
+        num_frames = 0
 
-def input_processor_when_multimodal_input_video(ctx: InputContext,
-                                                inputs: DecoderOnlyInputs):
-    multi_modal_data = inputs.get("multi_modal_data")
-    if multi_modal_data is None or "video" not in multi_modal_data:
-        return inputs
-    video_data = multi_modal_data["video"]
+        while True:
+            next_num_frames = num_frames + 1
 
-    model_config = ctx.model_config
-    hf_config = ctx.get_hf_config(LlavaOnevisionConfig)
+            if self._get_max_video_tokens(next_num_frames) > max_tokens:
+                break
 
-    if isinstance(video_data, np.ndarray):
-        # Supports both CLIP and Siglip
-        num_frames = video_data.shape[0]
-        video_feature_size = get_llava_onevision_video_tokens(ctx, num_frames)
-        tokenizer = cached_get_tokenizer(model_config.tokenizer)
+            num_frames = next_num_frames
 
-        new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens(
-            tokenizer,
-            inputs.get("prompt"),
-            inputs["prompt_token_ids"],
-            placeholder_token_id=hf_config.video_token_index,
-            repeat_count=video_feature_size,
+        return num_frames
+
+    def _get_dummy_num_frames(self, seq_len: int) -> int:
+        mm_config = self.ctx.get_mm_config()
+        max_images = mm_config.limit_per_prompt.get("image", 1)
+        max_videos = mm_config.limit_per_prompt.get("video", 1)
+
+        max_image_tokens = self._get_max_image_tokens() * max_images
+        max_total_frames = self._get_max_video_frames(seq_len -
+                                                      max_image_tokens)
+
+        return max(max_total_frames // max(max_videos, 1), 1)
+
+    def _get_video_token(self) -> str:
+        return self._get_hf_processor().video_token
+
+    def _call_hf_processor(
+        self,
+        prompt: str,
+        mm_data: Mapping[str, object],
+        mm_kwargs: Mapping[str, object],
+    ) -> BatchFeature:
+        mm_data = dict(mm_data)
+        videos = mm_data.pop("videos", [])
+        assert isinstance(videos, list)
+
+        if not videos:
+            return super()._call_hf_processor(
+                prompt=prompt,
+                mm_data=mm_data,
+                mm_kwargs=mm_kwargs,
+            )
+
+        video_token = self._get_video_token()
+
+        # LLaVA-OneVision processor doesn't support multiple videos
+        # with different sizes when converting back to tensors
+        text_image_outputs = super()._call_hf_processor(
+            prompt=prompt,
+            mm_data=mm_data,
+            mm_kwargs=mm_kwargs,
+        )
+
+        pixel_values_videos = []
+        for video in videos:
+            item_processor_data = dict(prompt=video_token, videos=video)
+
+            item_outputs = super()._call_hf_processor(
+                prompt=prompt,
+                mm_data=item_processor_data,
+                mm_kwargs=mm_kwargs,
+            )
+
+            pixel_values_videos.append(
+                item_outputs.pop("pixel_values_videos")[0])
+
+        combined_outputs = dict(
+            **text_image_outputs,
+            pixel_values_videos=pixel_values_videos,
         )
+        return BatchFeature(combined_outputs)
 
-        return token_inputs(prompt_token_ids=new_token_ids,
-                            prompt=new_prompt,
-                            multi_modal_data=multi_modal_data,
-                            multi_modal_placeholders={"video": ranges})
-
-    elif is_list_of(video_data, np.ndarray):
-        video_feature_size = []
-        for video in video_data:
-            num_frames = video.shape[0]
-            video_feature_size.append(
-                get_llava_onevision_video_tokens(ctx, num_frames))
-
-        tokenizer = cached_get_tokenizer(model_config.tokenizer)
-        new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens(
-            tokenizer,
-            inputs.get("prompt"),
-            inputs["prompt_token_ids"],
-            placeholder_token_id=hf_config.video_token_index,
-            repeat_count=video_feature_size,
+    def _get_prompt_replacements(
+        self,
+        mm_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, object],
+        out_mm_kwargs: MultiModalKwargs,
+    ) -> list[PromptReplacement]:
+        image_repls = super()._get_prompt_replacements(
+            mm_items=mm_items,
+            hf_processor_mm_kwargs=hf_processor_mm_kwargs,
+            out_mm_kwargs=out_mm_kwargs,
         )
-        return token_inputs(prompt_token_ids=new_token_ids,
-                            prompt=new_prompt,
-                            multi_modal_data=multi_modal_data,
-                            multi_modal_placeholders={"video": ranges})
-    else:
-        raise TypeError(f"Invalid video type: {type(video_data)}")
 
-    msg = f"Unsupported video type: {type(video_data)}"
-    raise NotImplementedError(msg)
+        hf_config = self._get_hf_config()
+        video_token_id = hf_config.video_token_index
 
+        def get_video_replacement(item_idx: int):
+            videos = mm_items.get_items(
+                "video", (VideoEmbeddingItems, VideoProcessorItems))
 
-def input_processor_for_llava_onevision(ctx: InputContext,
-                                        inputs: DecoderOnlyInputs):
-    multi_modal_data = inputs.get("multi_modal_data")
-    if multi_modal_data is None or ("video" not in multi_modal_data
-                                    and "image" not in multi_modal_data):
-        return inputs
-    if "image" in multi_modal_data:
-        return input_processor_when_multimodal_input_image(ctx, inputs)
-    if "video" in multi_modal_data:
-        return input_processor_when_multimodal_input_video(ctx, inputs)
+            if isinstance(videos, VideoEmbeddingItems):
+                num_video_tokens = videos.get_feature_size(item_idx)
+            else:
+                image_size = videos.get_frame_size(item_idx)
+                num_video_tokens = self._get_num_video_tokens(
+                    image_width=image_size.width,
+                    image_height=image_size.height,
+                    num_frames=videos.get_num_frames(item_idx),
+                )
+
+            return [video_token_id] * num_video_tokens
 
-    msg = "Unsupported multi data type"
-    raise NotImplementedError(msg)
+        return image_repls + [
+            PromptReplacement(
+                modality="video",
+                target=[video_token_id],
+                replacement=get_video_replacement,
+            ),
+        ]
+
+    def _get_dummy_processor_inputs(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> ProcessorInputs:
+        num_images = mm_counts.get("image", 0)
+        num_videos = mm_counts.get("video", 0)
+
+        image_token = self._get_image_token()
+        video_token = self._get_video_token()
+        target_width, target_height = self._get_dummy_image_size()
+
+        mm_data = {
+            "image":
+            self._get_dummy_images(width=target_width,
+                                   height=target_height,
+                                   num_images=num_images),
+            "video":
+            self._get_dummy_videos(
+                width=target_width,
+                height=target_height,
+                num_frames=self._get_dummy_num_frames(seq_len),
+                num_videos=num_videos,
+            )
+        }
+
+        return ProcessorInputs(
+            prompt_text=image_token * num_images + video_token * num_videos,
+            mm_data=mm_data,
+        )
 
 
 class LlavaOnevisionMultiModalProjector(nn.Module):
@@ -394,14 +348,7 @@ def forward(self, image_features: torch.Tensor) -> torch.Tensor:
         return hidden_states
 
 
-@MULTIMODAL_REGISTRY.register_image_input_mapper()
-@MULTIMODAL_REGISTRY.register_input_mapper("video")
-@MULTIMODAL_REGISTRY.register_max_multimodal_tokens(
-    "image", get_max_llava_onevision_image_tokens)
-@MULTIMODAL_REGISTRY.register_max_multimodal_tokens(
-    "video", get_max_llava_onevision_video_tokens)
-@INPUT_REGISTRY.register_dummy_data(dummy_data_for_llava_onevision)
-@INPUT_REGISTRY.register_input_processor(input_processor_for_llava_onevision)
+@MULTIMODAL_REGISTRY.register_processor(LlavaOnevisionMultiModalProcessor)
 class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal,
                                              SupportsPP):
 
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index f2e49d8e4848d..7aa9d58d1d348 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -323,7 +323,7 @@ def _get_num_image_tokens(
             height=image_height,
         )
 
-    def get_mm_max_tokens_per_item(self) -> Mapping[str, int]:
+    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
         max_image_tokens = self._get_num_image_tokens(
             image_width=MAX_IMAGE_FEATURE_SIZE_WIDTH,
             image_height=MAX_IMAGE_FEATURE_SIZE_HEIGHT,
@@ -415,12 +415,12 @@ def get_replacement_phi3v(item_idx: int):
     def _apply_prompt_replacements(
         self,
         token_ids: list[int],
-        prompt_repls: Sequence[_BoundPromptReplacement],
+        mm_prompt_repls: Mapping[str, Sequence[_BoundPromptReplacement]],
         mm_item_counts: Mapping[str, int],
-    ) -> tuple[list[int], str, list[_PlaceholderInfo]]:
+    ) -> tuple[list[int], str, Mapping[str, list[_PlaceholderInfo]]]:
         token_ids, text, placeholders = super()._apply_prompt_replacements(
             token_ids=token_ids,
-            prompt_repls=prompt_repls,
+            mm_prompt_repls=mm_prompt_repls,
             mm_item_counts=mm_item_counts,
         )
 
@@ -428,15 +428,23 @@ def _apply_prompt_replacements(
         if text.startswith("<s> <|image|>"):
             text = text.replace("<s> <|image|>", "<s><|image|>", 1)
             token_ids = [token_ids[0], *token_ids[2:]]
-            placeholders = [
-                _PlaceholderInfo(p.modality, p.start_idx - 1, p.replacement)
-                for p in placeholders
-            ]
+            placeholders = {
+                modality: [
+                    _PlaceholderInfo(
+                        modality=p.modality,
+                        item_idx=p.item_idx,
+                        start_idx=p.start_idx - 1,
+                        replacement=p.replacement,
+                    ) for p in ps
+                ]
+                for modality, ps in placeholders.items()
+            }
 
         return token_ids, text, placeholders
 
-    def _get_dummy_mm_inputs(
+    def _get_dummy_processor_inputs(
         self,
+        seq_len: int,
         mm_counts: Mapping[str, int],
     ) -> ProcessorInputs:
         num_images = mm_counts.get("image", 0)
diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py
index d7233bd6028ed..9e1d38512c0b4 100644
--- a/vllm/model_executor/models/pixtral.py
+++ b/vllm/model_executor/models/pixtral.py
@@ -780,15 +780,18 @@ def get_num_image_tokens(
     def get_max_image_tokens(self) -> int:
         return get_max_pixtral_hf_image_tokens(self.vision_config)
 
-    def get_num_patches(self) -> int:
+    def get_image_size(self) -> int:
+        return self.vision_config.image_size
+
+    def get_patch_size(self) -> int:
+        return self.vision_config.patch_size
+
+    def get_patch_grid_length(self) -> int:
         return get_pixtral_hf_patch_grid_length(
             image_size=self.vision_config.image_size,
             patch_size=self.vision_config.patch_size,
         )
 
-    def get_image_size(self) -> int:
-        return self.vision_config.image_size
-
 
 class PixtralHFMLP(nn.Module):
 
diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py
index d050fd060353a..bc3bb1f79b407 100644
--- a/vllm/model_executor/models/qwen2_audio.py
+++ b/vllm/model_executor/models/qwen2_audio.py
@@ -84,7 +84,7 @@ class Qwen2AudioMultiModalProcessor(BaseMultiModalProcessor):
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"audio": None}
 
-    def get_mm_max_tokens_per_item(self) -> Mapping[str, int]:
+    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
         hf_config = self.ctx.get_hf_config(Qwen2AudioConfig)
         max_source_positions = hf_config.audio_config.max_source_positions
         max_output_lengths = (max_source_positions - 2) // 2 + 1
@@ -184,15 +184,16 @@ def get_replacement_qwen2_audio(item_idx: int):
         ]
 
     def _always_apply_prompt_replacements(self) -> bool:
-        # HF never applies prompt replacements, so we have to do it ourselves
-        # _find_placeholders may incorrectly think that HF has already performed
-        # processing for multi-audio input when the input audios are short
-        # (the corresponding placeholders may take up fewer tokens than
-        # the number of audio items)
+        # HF never applies prompt replacements, so we have to do it ourselves.
+        # NOTE: `_find_placeholders_by_modality` may incorrectly think that HF
+        # has already performed processing for multi-audio input when the input
+        # audios are short (the corresponding placeholders may take up fewer
+        # tokens than the number of audio items)
         return True
 
-    def _get_dummy_mm_inputs(
+    def _get_dummy_processor_inputs(
         self,
+        seq_len: int,
         mm_counts: Mapping[str, int],
     ) -> ProcessorInputs:
         feature_extractor = self._get_feature_extractor()
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index 5a8c6e4deb7ac..abca85e0e2024 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -56,7 +56,8 @@
 from vllm.multimodal.inputs import (ImageItem, ModalityData,
                                     MultiModalFieldConfig, MultiModalKwargs,
                                     NestedTensors, VideoItem)
-from vllm.multimodal.parse import ModalityDataItems, MultiModalDataParser
+from vllm.multimodal.parse import (ImageSize, ModalityDataItems,
+                                   MultiModalDataParser)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                         MultiModalDataItems, ProcessorInputs,
                                         PromptReplacement)
@@ -641,58 +642,6 @@ def load_weights(self, weights: Iterable[Tuple[str,
         return loaded_params
 
 
-# === Vision input helpers === #
-
-
-def _get_vision_info(
-    vision_config: Qwen2VLVisionConfig,
-    height: int,
-    width: int,
-    min_pixels: int,
-    max_pixels: int,
-    *,
-    do_resize: bool = True,
-    modality: str = "image",
-    mm_count: int = 1,
-):
-    """Get information (resized height / width and number of vision tokens)
-    of input image / video frame."""
-    patch_size = vision_config.patch_size
-    merge_size = vision_config.spatial_merge_size
-    temporal_patch_size = vision_config.temporal_patch_size
-
-    if do_resize:
-        resized_height, resized_width = smart_resize(
-            height=height,
-            width=width,
-            factor=patch_size * merge_size,
-            min_pixels=min_pixels,
-            max_pixels=max_pixels,
-        )
-    else:
-        resized_height, resized_width = height, width
-
-    if modality == "image":
-        grid_t = mm_count
-    elif modality == "video":
-        grid_t = max(mm_count // temporal_patch_size, 1)
-    else:
-        raise ValueError(f"Modality {modality} is not supported")
-
-    grid_h = resized_height // patch_size
-    grid_w = resized_width // patch_size
-    vision_tokens = grid_t * grid_h * grid_w
-    llm_num_vision_tokens = vision_tokens // (merge_size**2)
-
-    return resized_height, resized_width, llm_num_vision_tokens
-
-
-def _get_image_processor(hf_processor: Qwen2VLProcessor):
-    image_processor = hf_processor.image_processor  # type: ignore
-    assert isinstance(image_processor, Qwen2VLImageProcessor)
-    return image_processor
-
-
 class Qwen2EmbeddingItems(ModalityDataItems[dict[str, torch.Tensor],
                                             dict[str, torch.Tensor]]):
 
@@ -764,32 +713,111 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor):
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": None, "video": None}
 
-    def _get_max_mm_tokens(self, modality: str) -> int:
+    def _get_vision_info(
+        self,
+        *,
+        image_width: int,
+        image_height: int,
+        num_frames: int = 1,
+        do_resize: bool = True,
+    ) -> tuple[ImageSize, int]:
         hf_config = self.ctx.get_hf_config(Qwen2VLConfig)
         vision_config = hf_config.vision_config
+        patch_size = vision_config.patch_size
+        merge_size = vision_config.spatial_merge_size
+        temporal_patch_size = vision_config.temporal_patch_size
 
         hf_processor = self._get_hf_processor()
-        image_processor = _get_image_processor(hf_processor)
-
-        _, _, max_llm_image_tokens = _get_vision_info(
-            vision_config,
-            height=9999999,
-            width=9999999,
-            min_pixels=image_processor.min_pixels,
-            max_pixels=image_processor.max_pixels,
-            modality=modality,
+        image_processor = self._get_image_processor(hf_processor)
+
+        if do_resize:
+            resized_height, resized_width = smart_resize(
+                height=image_height,
+                width=image_width,
+                factor=patch_size * merge_size,
+                min_pixels=image_processor.min_pixels,
+                max_pixels=image_processor.max_pixels,
+            )
+            preprocessed_size = ImageSize(width=resized_width,
+                                          height=resized_height)
+        else:
+            preprocessed_size = ImageSize(width=image_width,
+                                          height=image_height)
+
+        grid_t = max(num_frames // temporal_patch_size, 1)
+        grid_h = preprocessed_size.height // patch_size
+        grid_w = preprocessed_size.width // patch_size
+
+        num_patches = grid_t * grid_h * grid_w
+        num_vision_tokens = num_patches // (merge_size**2)
+
+        return preprocessed_size, num_vision_tokens
+
+    def _get_dummy_image_size(self) -> ImageSize:
+        max_image_size, _ = self._get_vision_info(
+            image_width=9999999,
+            image_height=9999999,
+        )
+        return max_image_size
+
+    def _get_max_image_tokens(self) -> int:
+        _, max_image_tokens = self._get_vision_info(
+            image_width=9999999,
+            image_height=9999999,
+        )
+        return max_image_tokens
+
+    def _get_max_video_tokens(self, num_frames: int) -> int:
+        _, max_video_tokens = self._get_vision_info(
+            image_width=9999999,
+            image_height=9999999,
+            num_frames=num_frames,
         )
-        return max_llm_image_tokens
+        return max_video_tokens
+
+    def _get_max_video_frames(self, max_tokens: int) -> int:
+        num_frames = 0
+
+        while True:
+            next_num_frames = num_frames + 1
+
+            if self._get_max_video_tokens(next_num_frames) > max_tokens:
+                break
+
+            num_frames = next_num_frames
+
+        return num_frames
+
+    def _get_dummy_num_frames(self, seq_len: int) -> int:
+        mm_config = self.ctx.get_mm_config()
+        max_images = mm_config.limit_per_prompt.get("image", 1)
+        max_videos = mm_config.limit_per_prompt.get("video", 1)
+
+        max_image_tokens = self._get_max_image_tokens() * max_images
+        max_total_frames = self._get_max_video_frames(seq_len -
+                                                      max_image_tokens)
+
+        return max(max_total_frames // max(max_videos, 1), 1)
+
+    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
+        max_image_tokens = self._get_max_image_tokens()
+
+        num_frames = self._get_dummy_num_frames(seq_len)
+        max_video_tokens = self._get_max_video_tokens(num_frames)
 
-    def get_mm_max_tokens_per_item(self) -> Mapping[str, int]:
         return {
-            "image": self._get_max_mm_tokens("image"),
-            "video": self._get_max_mm_tokens("video"),
+            "image": max_image_tokens,
+            "video": max_video_tokens,
         }
 
     def _get_data_parser(self) -> MultiModalDataParser:
         return Qwen2MultiModalDataParser()
 
+    def _get_image_processor(self, hf_processor: Qwen2VLProcessor):
+        image_processor = hf_processor.image_processor  # type: ignore
+        assert isinstance(image_processor, Qwen2VLImageProcessor)
+        return image_processor
+
     def _get_hf_processor(
         self,
         *,
@@ -797,7 +825,7 @@ def _get_hf_processor(
         max_pixels: Optional[int] = None,
     ) -> Qwen2VLProcessor:
         hf_processor = self.ctx.get_hf_processor(Qwen2VLProcessor)
-        image_processor = _get_image_processor(hf_processor)
+        image_processor = self._get_image_processor(hf_processor)
 
         if min_pixels:
             image_processor.min_pixels = min_pixels
@@ -818,7 +846,7 @@ def _get_prompt_replacements(
         out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
         hf_processor = self._get_hf_processor()
-        image_processor = _get_image_processor(hf_processor)
+        image_processor = self._get_image_processor(hf_processor)
 
         # NOTE: Only Qwen2VLProcessor in transformers 4.47.0 has
         # image_token and video_token registered
@@ -873,32 +901,35 @@ def _get_mm_fields_config(
             video_grid_thw=MultiModalFieldConfig.batched("video"),
         )
 
-    def _get_dummy_mm_inputs(
+    def _get_dummy_processor_inputs(
         self,
+        seq_len: int,
         mm_counts: Mapping[str, int],
     ) -> ProcessorInputs:
-        hf_processor = self._get_hf_processor()
-        image_processor = _get_image_processor(hf_processor)
+        num_images = mm_counts.get("image", 0)
+        num_videos = mm_counts.get("video", 0)
 
+        hf_processor = self._get_hf_processor()
         image_token: str = hf_processor.image_token
-        resized_height, resized_width = smart_resize(
-            height=9999999,
-            width=9999999,
-            factor=image_processor.patch_size * image_processor.merge_size,
-            min_pixels=image_processor.min_pixels,
-            max_pixels=image_processor.max_pixels,
-        )
-        num_images = mm_counts.get("image", 0)
+        video_token: str = hf_processor.video_token
+        target_width, target_height = self._get_dummy_image_size()
 
         mm_data = {
             "image":
-            self._get_dummy_images(width=resized_width,
-                                   height=resized_height,
-                                   num_images=num_images)
+            self._get_dummy_images(width=target_width,
+                                   height=target_height,
+                                   num_images=num_images),
+            "video":
+            self._get_dummy_videos(
+                width=target_width,
+                height=target_height,
+                num_frames=self._get_dummy_num_frames(seq_len),
+                num_videos=num_videos,
+            )
         }
 
         return ProcessorInputs(
-            prompt_text=image_token * num_images,
+            prompt_text=image_token * num_images + video_token * num_videos,
             mm_data=mm_data,
         )
 
diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py
index 115eaaac900e0..7ea177e94afc0 100644
--- a/vllm/model_executor/models/siglip.py
+++ b/vllm/model_executor/models/siglip.py
@@ -171,15 +171,18 @@ def get_num_image_tokens(
     def get_max_image_tokens(self) -> int:
         return get_max_siglip_image_tokens(self.vision_config)
 
-    def get_num_patches(self) -> int:
+    def get_image_size(self) -> int:
+        return self.vision_config.image_size
+
+    def get_patch_size(self) -> int:
+        return self.vision_config.patch_size
+
+    def get_patch_grid_length(self) -> int:
         return get_siglip_patch_grid_length(
             image_size=self.vision_config.image_size,
             patch_size=self.vision_config.patch_size,
         )
 
-    def get_image_size(self) -> int:
-        return self.vision_config.image_size
-
 
 # Adapted from https://github.com/huggingface/transformers/blob/v4.43.3/src/transformers/models/siglip/modeling_siglip.py#L249 # noqa
 class SiglipVisionEmbeddings(nn.Module):
diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py
index 0b83684c9bac5..6ad4661e3bb8d 100644
--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -6,7 +6,6 @@
 from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple,
                     TypedDict, Union)
 
-import numpy as np
 import torch
 import torch.utils.checkpoint
 from torch import nn
@@ -31,7 +30,6 @@
                                         PromptReplacement)
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.configs.ultravox import UltravoxConfig
-from vllm.utils import is_list_of
 
 from .interfaces import SupportsMultiModal, SupportsPP
 from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,
@@ -62,7 +60,7 @@ class UltravoxMultiModalProcessor(BaseMultiModalProcessor):
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"audio": None}
 
-    def get_mm_max_tokens_per_item(self) -> Mapping[str, int]:
+    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
         feature_extractor = self._get_feature_extractor()
         max_audio_tokens = math.ceil(feature_extractor.chunk_length *
                                      _AUDIO_TOKENS_PER_SECOND)
@@ -103,6 +101,7 @@ def _call_hf_processor(
 
         mm_data = dict(mm_data)
         audios = mm_data.pop("audios", [])
+        assert isinstance(audios, list)
 
         if not audios:
             return super()._call_hf_processor(
@@ -117,9 +116,6 @@ def _call_hf_processor(
             sampling_rate=feature_extractor.sampling_rate,
         )
 
-        # Already resampled by _get_hf_mm_data
-        assert is_list_of(audios, np.ndarray)
-
         # Ultravox processor doesn't support multiple inputs,
         # therefore we need to input text and audio one by one
         audio_features, audio_token_len = [], []
@@ -177,8 +173,9 @@ def get_replacement_ultravox(item_idx: int):
             )
         ]
 
-    def _get_dummy_mm_inputs(
+    def _get_dummy_processor_inputs(
         self,
+        seq_len: int,
         mm_counts: Mapping[str, int],
     ) -> ProcessorInputs:
         feature_extractor = self._get_feature_extractor()
diff --git a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py
index 65a773480d2a1..014f02ee10a1b 100644
--- a/vllm/model_executor/models/vision.py
+++ b/vllm/model_executor/models/vision.py
@@ -1,8 +1,12 @@
 from abc import ABC, abstractmethod
-from typing import Generic, TypeVar
+from typing import Final, Generic, Optional, Protocol, TypeVar
 
 from transformers import PretrainedConfig
 
+from vllm.multimodal.processing import (BaseMultiModalProcessor,
+                                        InputProcessingContext,
+                                        ProcessingCache)
+
 _C = TypeVar("_C", bound=PretrainedConfig)
 
 
@@ -27,11 +31,15 @@ def get_max_image_tokens(self) -> int:
         raise NotImplementedError
 
     @abstractmethod
-    def get_num_patches(self) -> int:
+    def get_image_size(self) -> int:
         raise NotImplementedError
 
     @abstractmethod
-    def get_image_size(self) -> int:
+    def get_patch_size(self) -> int:
+        raise NotImplementedError
+
+    @abstractmethod
+    def get_patch_grid_length(self) -> int:
         raise NotImplementedError
 
 
@@ -50,3 +58,26 @@ def vision_encoder_info(vision_config: PretrainedConfig) -> VisionEncoderInfo:
 
     msg = f"Unsupported vision config: {type(vision_config)}"
     raise NotImplementedError(msg)
+
+
+class VisionLanguageConfig(Protocol):
+    vision_config: Final[PretrainedConfig]
+
+
+class BaseVisionLanguageMultiModalProcessor(BaseMultiModalProcessor):
+
+    def __init__(self,
+                 ctx: InputProcessingContext,
+                 *,
+                 cache: Optional[ProcessingCache] = None,
+                 enable_sanity_checks: bool = True) -> None:
+        super().__init__(ctx,
+                         cache=cache,
+                         enable_sanity_checks=enable_sanity_checks)
+
+        vision_config = self._get_hf_config().vision_config
+        self._vision_encoder_info = vision_encoder_info(vision_config)
+
+    @abstractmethod
+    def _get_hf_config(self) -> VisionLanguageConfig:
+        raise NotImplementedError
diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py
index 00acb77435163..6be046ba77ca7 100644
--- a/vllm/multimodal/parse.py
+++ b/vllm/multimodal/parse.py
@@ -146,6 +146,20 @@ class VideoProcessorItems(ProcessorBatchItems[HfVideoItem]):
     def __init__(self, data: Sequence[HfVideoItem]) -> None:
         super().__init__(data, "video")
 
+    def get_num_frames(self, item_idx: int) -> int:
+        return len(self.get(item_idx))
+
+    def get_frame_size(self, item_idx: int) -> ImageSize:
+        image = self.get(item_idx)[0]  # Assume that the video isn't empty
+
+        if isinstance(image, Image):
+            return ImageSize(*image.size)
+        if isinstance(image, (np.ndarray, torch.Tensor)):
+            _, h, w = image.shape
+            return ImageSize(w, h)
+
+        assert_never(image)
+
 
 class VideoEmbeddingItems(EmbeddingItems):
 
diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index eb7552176e974..ebc16b817684a 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -16,7 +16,8 @@
 
 from vllm.inputs import DummyData, InputProcessingContext
 from vllm.logger import init_logger
-from vllm.transformers_utils.tokenizer import AnyTokenizer, encode_tokens
+from vllm.transformers_utils.tokenizer import (AnyTokenizer, decode_tokens,
+                                               encode_tokens)
 from vllm.utils import LRUCache, flatten_2d_lists, full_groupby
 
 from .inputs import (MultiModalDataDict, MultiModalFieldConfig,
@@ -69,19 +70,6 @@ def _cached_encode(
                          add_special_tokens=add_special_tokens)
 
 
-def _decode(
-    tokenizer: AnyTokenizer,
-    token_ids: list[int],
-    *,
-    skip_special_tokens: bool = False,
-) -> str:
-    """
-    Backend-agnostic equivalent of HF's
-    :code:`tokenizer.decode(token_ids, skip_special_tokens=...)`.
-    """
-    return tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)
-
-
 @lru_cache(maxsize=2048)
 def _cached_decode(
     tokenizer: AnyTokenizer,
@@ -89,9 +77,9 @@ def _cached_decode(
     *,
     skip_special_tokens: bool = False,
 ) -> str:
-    return _decode(tokenizer,
-                   list(token_ids),
-                   skip_special_tokens=skip_special_tokens)
+    return decode_tokens(tokenizer,
+                         list(token_ids),
+                         skip_special_tokens=skip_special_tokens)
 
 
 class _HasModalityAttr(Protocol):
@@ -269,8 +257,10 @@ def end_idx(self) -> int:
         return self.match.end()
 
 
-class _PlaceholderInfo(NamedTuple):
+@dataclass
+class _PlaceholderInfo:
     modality: str
+    item_idx: int
     start_idx: int
     replacement: list[int]
 
@@ -311,12 +301,14 @@ def find_text_matches(
 
 def _resolve_matches(
     prompt: _PromptSeq,
-    matches: Sequence[_PromptReplacementMatch],
+    mm_matches: Mapping[str, Sequence[_PromptReplacementMatch]],
 ) -> list[_PromptReplacementMatch]:
     """
-    Resolve :code:`matches` to ensure that there are no overlapping matches,
+    Resolve :code:`mm_matches` to ensure that there are no overlapping matches,
     and sort them such that earlier matches take priority over later ones.
     """
+    matches = [m for matches in mm_matches.values() for m in matches]
+
     seen_matches: list[Optional[_PromptReplacementMatch]] = [None
                                                              ] * len(prompt)
 
@@ -334,14 +326,15 @@ def _resolve_matches(
 
 def _replace_matches(
     prompt: _S,
-    matches: Sequence[_PromptReplacementMatch],
+    mm_matches: Mapping[str, Sequence[_PromptReplacementMatch]],
     mm_item_counts: Mapping[str, int],
 ) -> list[_S]:
+    """Apply the replacements in :code:`mm_matches` to :code:`prompt`."""
     out_seqs = list[_S]()
     prev_end_idx = 0
     next_idx_by_modality = defaultdict[str, int](lambda: 0)
 
-    for match in _resolve_matches(prompt, matches):
+    for match in _resolve_matches(prompt, mm_matches):
         modality = match.modality
 
         item_idx = next_idx_by_modality[modality]
@@ -371,28 +364,28 @@ def _replace_matches(
 
 def replace_token_matches(
     prompt: list[int],
-    matches: Sequence[_PromptReplacementTokenMatch],
+    mm_matches: Mapping[str, Sequence[_PromptReplacementTokenMatch]],
     mm_item_counts: Mapping[str, int],
 ) -> list[int]:
-    """Apply :code:`prompt_repls` to :code:`prompt`."""
-    if not matches:
+    """Apply the replacements in :code:`mm_matches` to :code:`prompt`."""
+    if not mm_matches:
         return prompt
 
-    token_id_seqs = _replace_matches(prompt, matches, mm_item_counts)
+    token_id_seqs = _replace_matches(prompt, mm_matches, mm_item_counts)
 
     return flatten_2d_lists(token_id_seqs)
 
 
 def replace_text_matches(
     prompt: str,
-    matches: Sequence[_PromptReplacementTextMatch],
+    mm_matches: Mapping[str, Sequence[_PromptReplacementTextMatch]],
     mm_item_counts: Mapping[str, int],
 ) -> str:
-    """Apply :code:`prompt_repls` to :code:`prompt`."""
-    if not matches:
+    """Apply the replacements in :code:`mm_matches` to :code:`prompt`."""
+    if not mm_matches:
         return prompt
 
-    texts = _replace_matches(prompt, matches, mm_item_counts)
+    texts = _replace_matches(prompt, mm_matches, mm_item_counts)
 
     return "".join(texts)
 
@@ -407,14 +400,14 @@ def _iter_modality_placeholders(
         return
 
     prompt_len = len(prompt)
-    item_index = 0
+    item_idx = 0
 
     start_idx = 0
     while start_idx < prompt_len:
         found = False
 
         for repl_info in modality_repls:
-            replacement = repl_info.get_replacement(item_index)
+            replacement = repl_info.get_replacement(item_idx)
             repl_tokens = replacement.token_ids
             repl_len = len(repl_tokens)
             end_idx = start_idx + repl_len
@@ -425,12 +418,13 @@ def _iter_modality_placeholders(
             if prompt[start_idx:end_idx] == repl_tokens:
                 yield _PlaceholderInfo(
                     modality=modality,
+                    item_idx=item_idx,
                     start_idx=start_idx,
                     replacement=repl_tokens,
                 )
 
-                item_index += 1
-                if item_index >= modal_item_count:
+                item_idx += 1
+                if item_idx >= modal_item_count:
                     return
 
                 # Exclude overlapping matches
@@ -442,28 +436,36 @@ def _iter_modality_placeholders(
             start_idx += 1
 
 
-def iter_placeholders(
-    prompt_repls: Sequence[_BoundPromptReplacement],
+def _iter_placeholders(
+    mm_prompt_repls: Mapping[str, Sequence[_BoundPromptReplacement]],
     prompt: list[int],
     mm_item_counts: Mapping[str, int],
 ) -> Iterable[_PlaceholderInfo]:
     """
-    Yield each set of placeholder tokens found in :code:`prompt`.
+    For each modality, yield each set of placeholder tokens found in
+    :code:`prompt`.
 
     Note that empty matches are ignored.
     """
-    repls_by_modality = dict(full_groupby_modality(prompt_repls))
-
     for modality, modal_item_count in mm_item_counts.items():
-        if modality in repls_by_modality:
+        if modality in mm_prompt_repls:
             yield from _iter_modality_placeholders(
                 prompt,
                 modality,
-                repls_by_modality[modality],
+                mm_prompt_repls[modality],
                 modal_item_count,
             )
 
 
+def find_mm_placeholders(
+    mm_prompt_repls: Mapping[str, Sequence[_BoundPromptReplacement]],
+    prompt: list[int],
+    mm_item_counts: Mapping[str, int],
+) -> Mapping[str, list[_PlaceholderInfo]]:
+    it = _iter_placeholders(mm_prompt_repls, prompt, mm_item_counts)
+    return dict(full_groupby_modality(it))
+
+
 @dataclass
 class ProcessorInputs:
     """Keyword arguments to :meth:`BaseMultiModalProcessor`."""
@@ -620,7 +622,7 @@ def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         raise NotImplementedError
 
     @abstractmethod
-    def get_mm_max_tokens_per_item(self) -> Mapping[str, int]:
+    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
         """
         Get the maximum possible number of tokens per data item
         for each modality.
@@ -703,14 +705,14 @@ def _get_prompt_replacements(
         """
         raise NotImplementedError
 
-    def _find_placeholders(
+    def _find_mm_placeholders(
         self,
-        all_prompt_repls: Sequence[_BoundPromptReplacement],
+        mm_prompt_repls: Mapping[str, Sequence[_BoundPromptReplacement]],
         new_token_ids: list[int],
         mm_item_counts: Mapping[str, int],
-    ) -> list[_PlaceholderInfo]:
-        return list(
-            iter_placeholders(all_prompt_repls, new_token_ids, mm_item_counts))
+    ) -> Mapping[str, list[_PlaceholderInfo]]:
+        return find_mm_placeholders(mm_prompt_repls, new_token_ids,
+                                    mm_item_counts)
 
     def _get_hf_mm_data(
         self,
@@ -797,7 +799,10 @@ def _apply_hf_processor_missing(
 
         # Some HF processors (e.g. Qwen2-VL) expect corresponding
         # multi-modal tokens to be in the prompt text
-        dummy_inputs = self._get_dummy_mm_inputs(mm_missing_counts)
+        dummy_inputs = self._get_dummy_processor_inputs(
+            self.ctx.model_config.max_model_len,
+            mm_missing_counts,
+        )
 
         _, mm_missing_kwargs = self._apply_hf_processor(
             prompt_text=dummy_inputs.prompt_text,
@@ -889,50 +894,44 @@ def _cached_apply_hf_processor(
 
         mm_kwargs = MultiModalKwargs.from_items(merged_kw_items)
 
-        if self.enable_sanity_checks:
-            mm_item_counts = mm_data_items.get_all_counts()
-
-            for modality, item_count in mm_item_counts.items():
-                for item_idx in range(item_count):
-                    try:
-                        mm_kwargs.get_item(modality, item_idx)
-                    except Exception as e:
-                        # Make it easy to set a breakpoint in the debugger
-                        raise e
-
         return prompt_ids, mm_kwargs
 
-    def _bind_prompt_replacements(
+    def _bind_and_group_repls(
         self,
         prompt_repls: list[PromptReplacement],
-    ) -> list[_BoundPromptReplacement]:
+    ) -> dict[str, list[_BoundPromptReplacement]]:
         tokenizer = self._get_tokenizer()
 
-        return [prompt_repl.bind(tokenizer) for prompt_repl in prompt_repls]
+        it = (prompt_repl.bind(tokenizer) for prompt_repl in prompt_repls)
+        return dict(full_groupby_modality(it))
 
     def _always_apply_prompt_replacements(self) -> bool:
         """
         A flag which can be overridden so that
         :meth:`_apply_prompt_replacements` is always called even if we
-        detect that HF has performed processing via :meth:`_find_placeholders`.
+        detect that HF has performed processing via
+        :meth:`_find_placeholders_by_modality`.
 
-        This is useful in cases where :meth:`_find_placeholders` cannot be
-        reliably used to detect whether HF has performed processing or not.
+        This is useful in cases where :meth:`_find_placeholders_by_modality`
+        cannot be reliably used to detect whether HF has performed processing.
         """
         return False
 
     def _apply_prompt_replacements(
         self,
         token_ids: list[int],
-        prompt_repls: Sequence[_BoundPromptReplacement],
+        mm_prompt_repls: Mapping[str, Sequence[_BoundPromptReplacement]],
         mm_item_counts: Mapping[str, int],
-    ) -> tuple[list[int], str, list[_PlaceholderInfo]]:
+    ) -> tuple[list[int], str, Mapping[str, list[_PlaceholderInfo]]]:
         tokenizer = self._get_tokenizer()
 
-        token_matches = find_token_matches(token_ids, prompt_repls)
+        mm_token_matches = {
+            modality: find_token_matches(token_ids, prompt_repls)
+            for modality, prompt_repls in mm_prompt_repls.items()
+        }
         mm_match_counts = {
             modality: len(matches)
-            for modality, matches in full_groupby_modality(token_matches)
+            for modality, matches in mm_token_matches.items()
         }
 
         # If the search text does not represent a special token,
@@ -951,32 +950,92 @@ def _apply_prompt_replacements(
         ):  # yapf: disable
             token_ids = replace_token_matches(
                 token_ids,
-                token_matches,
+                mm_token_matches,
                 mm_item_counts,
             )
 
-            text = _decode(tokenizer, token_ids)
-            matched_repls = [match.prompt_repl for match in token_matches]
+            text = decode_tokens(tokenizer, token_ids)
+            matched_repls = {
+                modality: [match.prompt_repl for match in token_matches]
+                for modality, token_matches in mm_token_matches.items()
+            }
         else:
-            text = _decode(tokenizer, token_ids)
+            text = decode_tokens(tokenizer, token_ids)
 
-            text_matches = find_text_matches(text, prompt_repls)
+            mm_text_matches = {
+                modality: find_text_matches(text, prompt_repls)
+                for modality, prompt_repls in mm_prompt_repls.items()
+            }
             text = replace_text_matches(
                 text,
-                text_matches,
+                mm_text_matches,
                 mm_item_counts,
             )
 
             token_ids = encode_tokens(tokenizer,
                                       text,
                                       add_special_tokens=False)
-            matched_repls = [match.prompt_repl for match in text_matches]
-
-        placeholders = self._find_placeholders(matched_repls, token_ids,
-                                               mm_item_counts)
+            matched_repls = {
+                modality: [match.prompt_repl for match in token_matches]
+                for modality, token_matches in mm_text_matches.items()
+            }
+
+        placeholders = self._find_mm_placeholders(
+            matched_repls,
+            token_ids,
+            mm_item_counts,
+        )
 
         return token_ids, text, placeholders
 
+    def _validate_mm_kwargs(
+        self,
+        mm_kwargs: MultiModalKwargs,
+        mm_item_counts: Mapping[str, int],
+    ) -> None:
+        for modality, item_count in mm_item_counts.items():
+            if modality in mm_kwargs.modalities:
+                items = mm_kwargs.get_items(modality)
+            else:
+                items = []
+
+            if len(items) != item_count:
+                raise RuntimeError(
+                    f"Expected there to be {item_count} {modality} items in "
+                    f"keyword arguments corresponding to {item_count} "
+                    f"{modality} data items, but only found {len(items)}! "
+                    "There is likely a problem with your "
+                    "implementation of merged multi-modal processor for this "
+                    "model (usually arising from an inconsistency between "
+                    "`_call_hf_processor` and `_get_mm_fields_config`).")
+
+    def _validate_mm_placeholders(
+        self,
+        mm_placeholders: Mapping[str, list[_PlaceholderInfo]],
+        mm_item_counts: Mapping[str, int],
+        *,
+        allow_missing: bool = False,
+    ) -> Mapping[str, int]:
+        missing_repl_counts = dict[str, int]()
+
+        for modality, item_count in mm_item_counts.items():
+            placeholders = mm_placeholders.get(modality, [])
+
+            if len(placeholders) != item_count and not allow_missing:
+                raise RuntimeError(
+                    f"Expected there to be {item_count} prompt replacements "
+                    f"corresponding to {item_count} {modality} items, but only "
+                    f"found {len(placeholders)} prompt replacements! Either "
+                    "the prompt text has missing/incorrect tokens for "
+                    "multi-modal inputs, or there is a problem with your "
+                    "implementation of merged multi-modal processor for this "
+                    "model (usually arising from an inconsistency between "
+                    "`_call_hf_processor` and `_get_prompt_replacements`).")
+
+            missing_repl_counts[modality] = item_count - len(placeholders)
+
+        return missing_repl_counts
+
     def apply(
         self,
         prompt_text: str,
@@ -1009,56 +1068,69 @@ def apply(
             hf_processor_mm_kwargs,
             mm_kwargs,
         )
-        prompt_repls = self._bind_prompt_replacements(unbound_prompt_repls)
+        mm_prompt_repls = self._bind_and_group_repls(unbound_prompt_repls)
 
-        # If HF processor already inserts placeholder tokens,
-        # there is no need for us to insert them
         mm_item_counts = mm_items.get_all_counts()
-        all_placeholders = self._find_placeholders(prompt_repls, prompt_ids,
-                                                   mm_item_counts)
+        self._validate_mm_kwargs(mm_kwargs, mm_item_counts)
+
+        hf_mm_placeholders = self._find_mm_placeholders(
+            mm_prompt_repls,
+            prompt_ids,
+            mm_item_counts,
+        )
+
+        if self._always_apply_prompt_replacements():
+            mm_missing_repl_counts = mm_item_counts
+            mm_missing_repls = dict(mm_prompt_repls)
+        else:
+            mm_missing_repl_counts = self._validate_mm_placeholders(
+                hf_mm_placeholders,
+                mm_item_counts,
+                allow_missing=True,
+            )
+
+            mm_missing_repls = dict[str, list[_BoundPromptReplacement]]()
+            for modality, missing_repl_count in mm_missing_repl_counts.items():
+                if missing_repl_count == 0:
+                    mm_missing_repls[modality] = []
+                elif missing_repl_count == mm_item_counts.get(modality, 0):
+                    mm_missing_repls[modality] = mm_prompt_repls[modality]
+                else:
+                    raise ValueError("Partial prompt replacement within "
+                                     f"{modality=} is not supported")
 
-        if all_placeholders and not self._always_apply_prompt_replacements():
+        # If HF processor already inserts placeholder tokens,
+        # there is no need for us to insert them
+        if all(len(repls) == 0 for repls in mm_missing_repls.items()):
             tokenizer = self._get_tokenizer()
-            prompt_text = _decode(tokenizer, prompt_ids)
+            prompt_text = decode_tokens(tokenizer, prompt_ids)
+            mm_placeholders = hf_mm_placeholders
         else:
             (
                 prompt_ids,
                 prompt_text,
-                all_placeholders,
+                missing_mm_placeholders,
             ) = self._apply_prompt_replacements(
                 prompt_ids,
-                prompt_repls,
-                mm_item_counts,
+                mm_missing_repls,
+                mm_missing_repl_counts,
             )
 
-        mm_placeholders = dict[str, list[PlaceholderRange]]()
-        err_suffix = ("This suggests a problem with your implementation of "
-                      "the merged multi-modal processor for this model, "
-                      "particularly in the `_get_prompt_replacements` method.")
-
-        for modality, placeholders in full_groupby_modality(all_placeholders):
-            if modality not in mm_items:
-                raise AssertionError(
-                    f"Expected no placeholders for {modality=}, "
-                    f"but found {placeholders=}. Input items: {mm_items}"
-                    f"\n{err_suffix}")
-
-            if len(placeholders) != len(mm_items[modality]):
-                raise AssertionError(
-                    f"Expected length of {placeholders=} for {modality=} "
-                    f"to equal that of input items: {mm_items[modality]}"
-                    f"\n{err_suffix}")
-
-            mm_placeholders[modality] = [
-                item.to_range() for item in placeholders
-            ]
+            mm_placeholders = {**hf_mm_placeholders, **missing_mm_placeholders}
+
+        self._validate_mm_placeholders(mm_placeholders, mm_item_counts)
+
+        mm_placeholder_ranges = {
+            modality: [item.to_range() for item in placeholders]
+            for modality, placeholders in mm_placeholders.items()
+        }
 
         return MultiModalInputsV2(
             type="multimodal",
             prompt=prompt_text,
             prompt_token_ids=prompt_ids,
             mm_kwargs=mm_kwargs,
-            mm_placeholders=mm_placeholders,
+            mm_placeholders=mm_placeholder_ranges,
         )
 
     def _get_dummy_audios(
@@ -1092,8 +1164,9 @@ def _get_dummy_videos(
         return [video] * num_videos
 
     @abstractmethod
-    def _get_dummy_mm_inputs(
+    def _get_dummy_processor_inputs(
         self,
+        seq_len: int,
         mm_counts: Mapping[str, int],
     ) -> ProcessorInputs:
         """
@@ -1121,12 +1194,25 @@ def _get_and_validate_dummy_mm_counts(self) -> Mapping[str, int]:
 
         return mm_limits
 
+    def _get_dummy_mm_inputs(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> MultiModalInputsV2:
+        processor_inputs = self._get_dummy_processor_inputs(seq_len, mm_counts)
+
+        return self.apply(
+            prompt_text=processor_inputs.prompt_text,
+            mm_data=processor_inputs.mm_data,
+            hf_processor_mm_kwargs=processor_inputs.hf_processor_mm_kwargs,
+        )
+
     def get_dummy_data(self, seq_len: int) -> DummyData:
         # Avoid circular import
         from vllm.sequence import SequenceData
 
         mm_counts = self._get_and_validate_dummy_mm_counts()
-        mm_max_tokens_per_item = self.get_mm_max_tokens_per_item()
+        mm_max_tokens_per_item = self.get_mm_max_tokens_per_item(seq_len)
         if mm_counts.keys() != mm_max_tokens_per_item.keys():
             raise AssertionError(
                 "The keys returned by `get_supported_mm_limits`"
@@ -1134,13 +1220,7 @@ def get_dummy_data(self, seq_len: int) -> DummyData:
                 "returned by `get_mm_max_tokens_per_item` "
                 f"({set(mm_max_tokens_per_item.keys())})")
 
-        processor_inputs = self._get_dummy_mm_inputs(mm_counts)
-        mm_inputs = self.apply(
-            prompt_text=processor_inputs.prompt_text,
-            mm_data=processor_inputs.mm_data,
-            hf_processor_mm_kwargs=processor_inputs.hf_processor_mm_kwargs,
-        )
-
+        mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts)
         prompt_token_ids = mm_inputs["prompt_token_ids"]
         placeholders_by_modality = mm_inputs["mm_placeholders"]
 
@@ -1171,6 +1251,12 @@ def get_dummy_data(self, seq_len: int) -> DummyData:
                 "reduce `max_num_seqs`, and/or reduce `mm_counts`.", seq_len,
                 total_len, total_placeholders_by_modality)
 
+            return DummyData(
+                seq_data=SequenceData.from_prompt_token_counts((0, seq_len)),
+                multi_modal_data=None,
+                multi_modal_placeholders=None,
+            )
+
         prompt_token_ids.extend([0] * (seq_len - len(prompt_token_ids)))
 
         return DummyData(
diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index 073d49d7d2009..fb4389dc4df42 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -223,7 +223,8 @@ def get_max_tokens_per_item_by_modality(
         if self.has_processor(model_config):
             tokenizer = cached_get_tokenizer(model_config.tokenizer)
             processor = self.create_processor(model_config, tokenizer)
-            return processor.get_mm_max_tokens_per_item()
+            seq_len = model_config.max_model_len
+            return processor.get_mm_max_tokens_per_item(seq_len)
 
         return {
             key: plugin.get_max_multimodal_tokens(model_config)
diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py
index 42b2f095bc543..97920f42ec52f 100644
--- a/vllm/transformers_utils/tokenizer.py
+++ b/vllm/transformers_utils/tokenizer.py
@@ -21,6 +21,19 @@
                      MistralTokenizer]
 
 
+def decode_tokens(
+    tokenizer: AnyTokenizer,
+    token_ids: list[int],
+    *,
+    skip_special_tokens: bool = False,
+) -> str:
+    """
+    Backend-agnostic equivalent of HF's
+    :code:`tokenizer.decode(token_ids, skip_special_tokens=...)`.
+    """
+    return tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)
+
+
 def encode_tokens(
     tokenizer: AnyTokenizer,
     text: str,

From ba214dffbeec070051b61c1985ce6342c947f598 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Sat, 4 Jan 2025 23:45:57 +0800
Subject: [PATCH 074/462] [Bugfix] Fix precision error in LLaVA-NeXT (#11735)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 .../processing/test_llava_next.py             |  3 +--
 vllm/model_executor/models/llava_next.py      | 14 +++++++----
 vllm/model_executor/models/llava_onevision.py | 23 ++++++++++++-------
 3 files changed, 26 insertions(+), 14 deletions(-)

diff --git a/tests/models/decoder_only/vision_language/processing/test_llava_next.py b/tests/models/decoder_only/vision_language/processing/test_llava_next.py
index 6772130c9b884..6c8d300717de4 100644
--- a/tests/models/decoder_only/vision_language/processing/test_llava_next.py
+++ b/tests/models/decoder_only/vision_language/processing/test_llava_next.py
@@ -15,10 +15,9 @@ def processor_for_llava_next():
     return LlavaNextMultiModalProcessor
 
 
-# FIXME: image_size [(198, 176), (176, 198)]
 @pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
 @pytest.mark.parametrize("image_size", [(1669, 2560), (2560, 1669), (183, 488),
-                                        (488, 183)])
+                                        (488, 183), (198, 176), (176, 198)])
 @pytest.mark.parametrize("num_imgs", [1, 2])
 def test_processor_prompt_replacements(
     processor_for_llava_next,
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index 3769f04f94a92..f79021596f915 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -2,6 +2,7 @@
 from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple,
                     TypedDict, Union)
 
+import numpy as np
 import torch
 import torch.nn as nn
 from transformers import BatchFeature, LlavaNextConfig, LlavaNextProcessor
@@ -139,16 +140,21 @@ def _get_num_unpadded_features(
         current_height = npatches * num_patch_height
         current_width = npatches * num_patch_width
 
-        original_aspect_ratio = original_width / original_height
-        current_aspect_ratio = current_width / current_height
+        # NOTE: HF resizes based on float32
+        original_aspect_ratio = np.array(original_width / original_height,
+                                         dtype=np.float32)
+        current_aspect_ratio = np.array(current_width / current_height,
+                                        dtype=np.float32)
 
         if original_aspect_ratio > current_aspect_ratio:
-            scale_factor = current_width / original_width
+            scale_factor = np.array(current_width / original_width,
+                                    dtype=np.float32)
             new_height = int(original_height * scale_factor)
             padding = (current_height - new_height) // 2
             current_height -= 2 * padding
         else:
-            scale_factor = current_height / original_height
+            scale_factor = np.array(current_height / original_height,
+                                    dtype=np.float32)
             new_width = int(original_width * scale_factor)
             padding = (current_width - new_width) // 2
             current_width -= 2 * padding
diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py
index 1e51e09a24c18..5a3cdadc47cac 100644
--- a/vllm/model_executor/models/llava_onevision.py
+++ b/vllm/model_executor/models/llava_onevision.py
@@ -3,6 +3,7 @@
 from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple,
                     TypedDict, Union)
 
+import numpy as np
 import torch
 import torch.nn as nn
 from transformers import (BatchFeature, LlavaOnevisionConfig,
@@ -127,18 +128,24 @@ def _get_num_unpadded_features(
         current_height = npatches * num_patch_height
         current_width = npatches * num_patch_width
 
-        original_aspect_ratio = original_width / original_height
-        current_aspect_ratio = current_width / current_height
+        # NOTE: HF resizes based on float32
+        original_aspect_ratio = np.array(original_width / original_height,
+                                         dtype=np.float32)
+        current_aspect_ratio = np.array(current_width / current_height,
+                                        dtype=np.float32)
+
         if original_aspect_ratio > current_aspect_ratio:
-            new_height = int(original_height *
-                             (current_width / original_width))
+            scale_factor = np.array(current_width / original_width,
+                                    dtype=np.float32)
+            new_height = int(original_height * scale_factor)
             padding = (current_height - new_height) // 2
-            current_height -= padding * 2
+            current_height -= 2 * padding
         else:
-            new_width = int(original_width *
-                            (current_height / original_height))
+            scale_factor = np.array(current_height / original_height,
+                                    dtype=np.float32)
+            new_width = int(original_width * scale_factor)
             padding = (current_width - new_width) // 2
-            current_width -= padding * 2
+            current_width -= 2 * padding
 
         unpadded_features = current_height * current_width
         newline_features = current_height

From 65c08928c2db934b18f7c6f5eeb02617826fae8e Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Sat, 4 Jan 2025 23:46:21 +0800
Subject: [PATCH 075/462] [Model] Remove unnecessary weight initialization
 logic (#11736)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
---
 vllm/model_executor/layers/resampler.py | 20 ++++----------------
 vllm/model_executor/models/aria.py      |  5 +----
 vllm/model_executor/models/minicpmv.py  |  2 --
 3 files changed, 5 insertions(+), 22 deletions(-)

diff --git a/vllm/model_executor/layers/resampler.py b/vllm/model_executor/layers/resampler.py
index aae806f6af323..a67713c320b86 100644
--- a/vllm/model_executor/layers/resampler.py
+++ b/vllm/model_executor/layers/resampler.py
@@ -27,7 +27,7 @@
 Shared resampler perceiver network used in multimodal models and
 related helpers for sincos positional embeddings.
 
-Example models: Qwen (Qwen-VL), Minicpmv2.0
+Example models: Qwen (Qwen-VL), MiniCPM-V 2.0
 """
 import math
 from functools import partial
@@ -37,7 +37,6 @@
 import torch
 import torch.nn.functional as F
 from torch import nn
-from torch.nn.init import trunc_normal_
 
 from vllm.model_executor.layers.linear import ReplicatedLinear
 from vllm.model_executor.layers.quantization import QuantizationConfig
@@ -169,8 +168,8 @@ def __init__(self,
         self.embed_dim = embed_dim
         self.num_heads = num_heads
 
-        self.query = nn.Parameter(torch.zeros(self.num_queries, embed_dim))
-        trunc_normal_(self.query, std=0.02)
+        self.query = nn.Parameter(torch.empty(self.num_queries, embed_dim))
+
         if kv_dim is not None and kv_dim != embed_dim:
             self.kv_proj = ReplicatedLinear(kv_dim,
                                             embed_dim,
@@ -190,16 +189,7 @@ def __init__(self,
         self.ln_post = norm_layer(embed_dim) if do_post_projection else None
         self.proj = nn.Parameter(
             (embed_dim**-0.5) *
-            torch.randn(embed_dim, embed_dim)) if do_post_projection else None
-
-    def _init_weights(self, m: nn.Module) -> None:
-        if isinstance(m, nn.Linear):
-            trunc_normal_(m.weight, std=0.02)
-            if isinstance(m, nn.Linear) and m.bias is not None:
-                nn.init.constant_(m.bias, 0)
-        elif isinstance(m, nn.LayerNorm):
-            nn.init.constant_(m.bias, 0)
-            nn.init.constant_(m.weight, 1.0)
+            torch.empty(embed_dim, embed_dim)) if do_post_projection else None
 
     def _repeat(self, query, N: int):
         return query.unsqueeze(1).repeat(1, N, 1)
@@ -240,8 +230,6 @@ def __init__(self,
         self.pos_embed = nn.Parameter(
             torch.from_numpy(pos_embed_arr).requires_grad_(False))
 
-        self.apply(self._init_weights)
-
     def forward(
         self,
         x: torch.Tensor,
diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py
index 2fd4262a9d3b9..8f5fd64a90c87 100644
--- a/vllm/model_executor/models/aria.py
+++ b/vllm/model_executor/models/aria.py
@@ -3,7 +3,6 @@
 
 import torch
 import torch.nn as nn
-from torch.nn.init import trunc_normal_
 from transformers import BatchFeature, PretrainedConfig
 
 from vllm.attention import AttentionMetadata
@@ -216,9 +215,7 @@ def __init__(
         self.num_heads = num_heads
 
         self.query = nn.Parameter(
-            torch.zeros(max(patch_to_query_dict.values()), self.embed_dim))
-
-        trunc_normal_(self.query, std=0.02)
+            torch.empty(max(patch_to_query_dict.values()), self.embed_dim))
 
         self.cross_attn = CrossAttention(kv_dim, embed_dim, num_heads)
 
diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py
index 712022502539b..8f36437d47d9e 100644
--- a/vllm/model_executor/models/minicpmv.py
+++ b/vllm/model_executor/models/minicpmv.py
@@ -141,8 +141,6 @@ def __init__(self,
         self.max_size = max_size
         self._set_2d_pos_cache(self.max_size)
 
-        self.apply(self._init_weights)
-
     def _set_2d_pos_cache(self,
                           max_size: Tuple[int, int],
                           device: torch.types.Device = "cpu") -> None:

From 47831430cc943cd470d38d27f8c69a5782795ec3 Mon Sep 17 00:00:00 2001
From: Jee Jee Li <pandaleefree@gmail.com>
Date: Sun, 5 Jan 2025 00:07:59 +0800
Subject: [PATCH 076/462] [Bugfix][V1] Fix test_kv_cache_utils.py (#11738)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
---
 tests/v1/core/test_kv_cache_utils.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py
index faa3a91de151f..2ed70b42991b5 100644
--- a/tests/v1/core/test_kv_cache_utils.py
+++ b/tests/v1/core/test_kv_cache_utils.py
@@ -147,12 +147,12 @@ def test_generate_block_hash_extra_keys():
 
     # Test with no extra keys
     extra_keys, next_mm_idx = generate_block_hash_extra_keys(request, 0, 5, 0)
-    assert extra_keys == (("hash1", 0), )
+    assert extra_keys == ("hash1", )
     assert next_mm_idx == 1
 
     # Test with partial overlap
     extra_keys, next_mm_idx = generate_block_hash_extra_keys(request, 3, 8, 0)
-    assert extra_keys == (("hash1", 3), )
+    assert extra_keys == ("hash1", )
     assert next_mm_idx == 1
 
     # Test with no overlap
@@ -162,7 +162,7 @@ def test_generate_block_hash_extra_keys():
 
     # Test with multiple extra keys
     extra_keys, next_mm_idx = generate_block_hash_extra_keys(request, 0, 15, 0)
-    assert extra_keys == (("hash1", 0), ("hash2", 0))
+    assert extra_keys == ('hash1', 'hash2')
     assert next_mm_idx == 2
 
 
@@ -216,11 +216,11 @@ def test_hash_request_tokens():
 
     # Check the first block
     assert block_hashes[0].token_ids == (0, 1, 2)
-    assert block_hashes[0].extra_keys == (("hash1", 0), )
+    assert block_hashes[0].extra_keys == ("hash1", )
 
     # Check the second block
     assert block_hashes[1].token_ids == (3, 4, 5)
-    assert block_hashes[1].extra_keys == (("hash2", 0), )
+    assert block_hashes[1].extra_keys == ("hash2", )
 
 
 def test_hash_request_tokens_no_mm_inputs():

From 4068f4b5b5dc5e2d1114be0cbb126bc44fb4e906 Mon Sep 17 00:00:00 2001
From: Lu Fang <30275821+houseroad@users.noreply.github.com>
Date: Sat, 4 Jan 2025 17:20:34 -0800
Subject: [PATCH 077/462] [MISC] Replace c10::optional with std::optional
 (#11730)

Signed-off-by: Lu Fang <lufang@fb.com>
---
 csrc/attention/paged_attention_v1.cu          |  4 +-
 csrc/attention/paged_attention_v2.cu          |  4 +-
 csrc/cpu/attention.cpp                        |  8 ++--
 csrc/cpu/quant.cpp                            | 10 ++--
 csrc/cpu/torch_bindings.cpp                   |  6 +--
 .../epilogue/scaled_mm_epilogues_c2x.hpp      |  6 +--
 .../epilogue/scaled_mm_epilogues_c3x.hpp      |  6 +--
 csrc/cutlass_extensions/torch_utils.hpp       |  2 +-
 csrc/mamba/causal_conv1d/causal_conv1d.cu     | 24 +++++-----
 csrc/mamba/mamba_ssm/selective_scan_fwd.cu    | 22 ++++-----
 csrc/ops.h                                    | 46 +++++++++----------
 .../compressed_tensors/int8_quant_kernels.cu  |  4 +-
 .../cutlass_w8a8/scaled_mm_c2x.cu             | 18 ++++----
 .../cutlass_w8a8/scaled_mm_c3x.cu             |  6 +--
 .../cutlass_w8a8/scaled_mm_entry.cu           | 30 ++++++------
 csrc/quantization/machete/generate.py         |  2 +-
 .../machete/machete_mm_kernel.cuh             | 10 ++--
 .../machete/machete_mm_launcher.cuh           | 24 +++++-----
 .../machete/machete_prepack_launcher.cuh      |  2 +-
 csrc/quantization/machete/machete_pytorch.cu  | 26 +++++------
 csrc/rocm/attention.cu                        |  4 +-
 csrc/rocm/ops.h                               |  2 +-
 csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu   |  2 +-
 csrc/sparse/cutlass/sparse_scaled_mm_entry.cu |  4 +-
 24 files changed, 136 insertions(+), 136 deletions(-)

diff --git a/csrc/attention/paged_attention_v1.cu b/csrc/attention/paged_attention_v1.cu
index cb1a069942069..27321148f6dda 100644
--- a/csrc/attention/paged_attention_v1.cu
+++ b/csrc/attention/paged_attention_v1.cu
@@ -53,7 +53,7 @@ void paged_attention_v1_launcher(
     torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
     torch::Tensor& value_cache, int num_kv_heads, float scale,
     torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len,
-    const c10::optional<torch::Tensor>& alibi_slopes, float k_scale,
+    const std::optional<torch::Tensor>& alibi_slopes, float k_scale,
     float v_scale, const int tp_rank, const int blocksparse_local_blocks,
     const int blocksparse_vert_stride, const int blocksparse_block_size,
     const int blocksparse_head_sliding_step) {
@@ -176,7 +176,7 @@ void paged_attention_v1(
     torch::Tensor& block_tables,  // [num_seqs, max_num_blocks_per_seq]
     torch::Tensor& seq_lens,      // [num_seqs]
     int64_t block_size, int64_t max_seq_len,
-    const c10::optional<torch::Tensor>& alibi_slopes,
+    const std::optional<torch::Tensor>& alibi_slopes,
     const std::string& kv_cache_dtype, double k_scale, double v_scale,
     const int64_t tp_rank, const int64_t blocksparse_local_blocks,
     const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
diff --git a/csrc/attention/paged_attention_v2.cu b/csrc/attention/paged_attention_v2.cu
index c457bdb89008e..a453b2243e48c 100644
--- a/csrc/attention/paged_attention_v2.cu
+++ b/csrc/attention/paged_attention_v2.cu
@@ -54,7 +54,7 @@ void paged_attention_v2_launcher(
     torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
     torch::Tensor& value_cache, int num_kv_heads, float scale,
     torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len,
-    const c10::optional<torch::Tensor>& alibi_slopes, float k_scale,
+    const std::optional<torch::Tensor>& alibi_slopes, float k_scale,
     float v_scale, const int tp_rank, const int blocksparse_local_blocks,
     const int blocksparse_vert_stride, const int blocksparse_block_size,
     const int blocksparse_head_sliding_step) {
@@ -187,7 +187,7 @@ void paged_attention_v2(
     torch::Tensor& block_tables,  // [num_seqs, max_num_blocks_per_seq]
     torch::Tensor& seq_lens,      // [num_seqs]
     int64_t block_size, int64_t max_seq_len,
-    const c10::optional<torch::Tensor>& alibi_slopes,
+    const std::optional<torch::Tensor>& alibi_slopes,
     const std::string& kv_cache_dtype, double k_scale, double v_scale,
     const int64_t tp_rank, const int64_t blocksparse_local_blocks,
     const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
diff --git a/csrc/cpu/attention.cpp b/csrc/cpu/attention.cpp
index e21832ba7582f..ef5b14088c63b 100644
--- a/csrc/cpu/attention.cpp
+++ b/csrc/cpu/attention.cpp
@@ -386,7 +386,7 @@ void paged_attention_v1_impl_launcher(
     torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
     torch::Tensor& value_cache, int num_kv_heads, float scale,
     torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len,
-    const c10::optional<torch::Tensor>& alibi_slopes) {
+    const std::optional<torch::Tensor>& alibi_slopes) {
   int num_seqs = query.size(0);
   int num_heads = query.size(1);
   int head_size = query.size(2);
@@ -459,7 +459,7 @@ void paged_attention_v1(
     torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
     torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
     torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
-    int64_t max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
+    int64_t max_seq_len, const std::optional<torch::Tensor>& alibi_slopes,
     const std::string& kv_cache_dtype, double k_scale, double v_scale,
     const int64_t tp_rank, const int64_t blocksparse_local_blocks,
     const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
@@ -702,7 +702,7 @@ void paged_attention_v2_impl_launcher(
     torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
     torch::Tensor& value_cache, int num_kv_heads, float scale,
     torch::Tensor& block_tables, torch::Tensor& seq_lens, int block_size,
-    int max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes) {
+    int max_seq_len, const std::optional<torch::Tensor>& alibi_slopes) {
   int num_seqs = query.size(0);
   int num_heads = query.size(1);
   int head_size = query.size(2);
@@ -781,7 +781,7 @@ void paged_attention_v2(
     torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
     torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
     torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
-    int64_t max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
+    int64_t max_seq_len, const std::optional<torch::Tensor>& alibi_slopes,
     const std::string& kv_cache_dtype, double k_scale, double v_scale,
     const int64_t tp_rank, const int64_t blocksparse_local_blocks,
     const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
diff --git a/csrc/cpu/quant.cpp b/csrc/cpu/quant.cpp
index d9aed657a3113..33b1637832888 100644
--- a/csrc/cpu/quant.cpp
+++ b/csrc/cpu/quant.cpp
@@ -359,7 +359,7 @@ void int8_scaled_mm(torch::Tensor& c,               // [M, OC], row-major
                     const torch::Tensor& b,         // [IC, OC], column-major
                     const torch::Tensor& a_scales,  // [1] or [M]
                     const torch::Tensor& b_scales,  // [1] or [OC]
-                    const c10::optional<torch::Tensor>& bias  // [OC]
+                    const std::optional<torch::Tensor>& bias  // [OC]
 ) {
   CPU_KERNEL_GUARD_IN(cutlass_scaled_mm)
   // Checks for conformality
@@ -442,8 +442,8 @@ void int8_scaled_mm_azp(torch::Tensor& c,        // [M, OC], row-major
                         const torch::Tensor& a_scales,            // [1] or [M]
                         const torch::Tensor& b_scales,            // [1] or [OC]
                         const torch::Tensor& azp_adj,             // [OC]
-                        const c10::optional<torch::Tensor>& azp,  // [1] or [M]
-                        const c10::optional<torch::Tensor>& bias  // [OC]
+                        const std::optional<torch::Tensor>& azp,  // [1] or [M]
+                        const std::optional<torch::Tensor>& bias  // [OC]
 ) {
   CPU_KERNEL_GUARD_IN(cutlass_scaled_mm_azp)
   // Checks for conformality
@@ -561,7 +561,7 @@ void int8_scaled_mm_azp(torch::Tensor& c,        // [M, OC], row-major
 void static_scaled_int8_quant(torch::Tensor& out,          // [..., hidden_size]
                               const torch::Tensor& input,  // [..., hidden_size]
                               const torch::Tensor& scale,
-                              c10::optional<torch::Tensor> const& azp) {
+                              std::optional<torch::Tensor> const& azp) {
   CPU_KERNEL_GUARD_IN(static_scaled_int8_quant)
   TORCH_CHECK(input.is_contiguous());
   TORCH_CHECK(out.is_contiguous());
@@ -590,7 +590,7 @@ void dynamic_scaled_int8_quant(
     torch::Tensor& out,          // [..., hidden_size]
     const torch::Tensor& input,  // [..., hidden_size]
     torch::Tensor& scale,        // [..., 1]
-    c10::optional<torch::Tensor> const& azp) {
+    std::optional<torch::Tensor> const& azp) {
   CPU_KERNEL_GUARD_IN(dynamic_scaled_int8_quant)
   TORCH_CHECK(input.is_contiguous());
   TORCH_CHECK(out.is_contiguous());
diff --git a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp
index 03beefbc6de7d..74e4d8189d403 100644
--- a/csrc/cpu/torch_bindings.cpp
+++ b/csrc/cpu/torch_bindings.cpp
@@ -9,14 +9,14 @@ std::string init_cpu_threads_env(const std::string& cpu_ids);
 void int8_scaled_mm(torch::Tensor& c, const torch::Tensor& a,
                     const torch::Tensor& b, const torch::Tensor& a_scales,
                     const torch::Tensor& b_scales,
-                    const c10::optional<torch::Tensor>& bias);
+                    const std::optional<torch::Tensor>& bias);
 
 void int8_scaled_mm_azp(torch::Tensor& c, const torch::Tensor& a,
                         const torch::Tensor& b, const torch::Tensor& a_scales,
                         const torch::Tensor& b_scales,
                         const torch::Tensor& azp_adj,
-                        const c10::optional<torch::Tensor>& azp,
-                        const c10::optional<torch::Tensor>& bias);
+                        const std::optional<torch::Tensor>& azp,
+                        const std::optional<torch::Tensor>& bias);
 
 TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
   // vLLM custom ops
diff --git a/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp b/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp
index 26f7423fd7455..ef413e6dd75c5 100644
--- a/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp
+++ b/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp
@@ -68,7 +68,7 @@ struct ScaledEpilogueBase {
   // This overload handles the case where there might not be a tensor, in which
   // case a nullptr is passed and a constant (0) is used.
   template <typename Descriptor, typename T>
-  static auto args_from_tensor(c10::optional<torch::Tensor> const& tensor) {
+  static auto args_from_tensor(std::optional<torch::Tensor> const& tensor) {
     static_assert(std::is_same_v<Descriptor, RowOrZeroLoad<T>>);
     using Arguments = typename Descriptor::Arguments;
     auto* data_ptr = tensor ? static_cast<T*>(tensor->data_ptr()) : nullptr;
@@ -223,7 +223,7 @@ struct ScaledEpilogueBiasAzp
   static ArgumentType prepare_args(torch::Tensor const& a_scales,
                                    torch::Tensor const& b_scales,
                                    torch::Tensor const& azp_adj,
-                                   c10::optional<torch::Tensor> const& bias) {
+                                   std::optional<torch::Tensor> const& bias) {
     auto a_args = SUPER::template args_from_tensor<ScaleA, float>(a_scales);
     auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales);
     auto bias_args = SUPER::template args_from_tensor<Bias, ElementD>(bias);
@@ -301,7 +301,7 @@ struct ScaledEpilogueBiasAzpToken
                                    torch::Tensor const& b_scales,
                                    torch::Tensor const& azp_adj,
                                    torch::Tensor const& azp,
-                                   c10::optional<torch::Tensor> const& bias) {
+                                   std::optional<torch::Tensor> const& bias) {
     auto a_args = SUPER::template args_from_tensor<ScaleA, float>(a_scales);
     auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales);
     auto bias_args = SUPER::template args_from_tensor<Bias, ElementD>(bias);
diff --git a/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp b/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp
index c723adf126422..c590c66a66652 100644
--- a/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp
+++ b/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp
@@ -67,7 +67,7 @@ struct ScaledEpilogueBase {
   // This overload handles the case where there might not be a tensor, in which
   // case a nullptr is passed and a constant (0) is used.
   template <typename Descriptor, typename T>
-  static auto args_from_tensor(c10::optional<torch::Tensor> const& tensor) {
+  static auto args_from_tensor(std::optional<torch::Tensor> const& tensor) {
     using Arguments = typename Descriptor::Arguments;
     auto* data_ptr = tensor ? static_cast<T*>(tensor->data_ptr()) : nullptr;
     static_assert(std::is_same_v<Descriptor, ColLoad<T, true>> ||
@@ -223,7 +223,7 @@ struct ScaledEpilogueBiasAzp
   static ArgumentType prepare_args(torch::Tensor const& a_scales,
                                    torch::Tensor const& b_scales,
                                    torch::Tensor const& azp_adj,
-                                   c10::optional<torch::Tensor> const& bias) {
+                                   std::optional<torch::Tensor> const& bias) {
     auto a_args = SUPER::template args_from_tensor<ScaleA, float>(a_scales);
     auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales);
     auto bias_args = SUPER::template args_from_tensor<Bias, ElementD>(bias);
@@ -299,7 +299,7 @@ struct ScaledEpilogueBiasAzpToken
                                    torch::Tensor const& b_scales,
                                    torch::Tensor const& azp_adj,
                                    torch::Tensor const& azp,
-                                   c10::optional<torch::Tensor> const& bias) {
+                                   std::optional<torch::Tensor> const& bias) {
     auto a_args = SUPER::template args_from_tensor<ScaleA, float>(a_scales);
     auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales);
     auto bias_args = SUPER::template args_from_tensor<Bias, ElementD>(bias);
diff --git a/csrc/cutlass_extensions/torch_utils.hpp b/csrc/cutlass_extensions/torch_utils.hpp
index 2c78572521eec..a1ff933cce63f 100644
--- a/csrc/cutlass_extensions/torch_utils.hpp
+++ b/csrc/cutlass_extensions/torch_utils.hpp
@@ -97,7 +97,7 @@ static inline auto make_cute_layout(torch::Tensor const& tensor,
 
 template <typename Stride>
 static inline auto maybe_make_cute_layout(
-    c10::optional<torch::Tensor> const& tensor,
+    std::optional<torch::Tensor> const& tensor,
     std::string_view name = "tensor") {
   using Layout = decltype(make_cute_layout<Stride>(*tensor));
 
diff --git a/csrc/mamba/causal_conv1d/causal_conv1d.cu b/csrc/mamba/causal_conv1d/causal_conv1d.cu
index dd1e6de2e0180..f0e5533bcae60 100644
--- a/csrc/mamba/causal_conv1d/causal_conv1d.cu
+++ b/csrc/mamba/causal_conv1d/causal_conv1d.cu
@@ -53,12 +53,12 @@ void set_conv_params_fwd(ConvParamsBase &params,
                          const at::Tensor x,
                          const at::Tensor weight,
                          const at::Tensor out,
-                         const c10::optional<at::Tensor>& bias,
+                         const std::optional<at::Tensor>& bias,
                          bool silu_activation,
                          int64_t pad_slot_id,
-                         const c10::optional<at::Tensor>& query_start_loc = std::nullopt,
-                         const c10::optional<at::Tensor>& cache_indices = std::nullopt,
-                         const c10::optional<at::Tensor>& has_initial_state = std::nullopt) {
+                         const std::optional<at::Tensor>& query_start_loc = std::nullopt,
+                         const std::optional<at::Tensor>& cache_indices = std::nullopt,
+                         const std::optional<at::Tensor>& has_initial_state = std::nullopt) {
 
     // Reset the parameters
     memset(&params, 0, sizeof(params));
@@ -93,11 +93,11 @@ void set_conv_params_fwd(ConvParamsBase &params,
 
 
 void causal_conv1d_fwd(const at::Tensor &x, const at::Tensor &weight,
-                  const c10::optional<at::Tensor> &bias_,
-                  const c10::optional<at::Tensor> &conv_states,
-                  const c10::optional<at::Tensor> &query_start_loc,
-                  const c10::optional<at::Tensor> &cache_indices,
-                  const c10::optional<at::Tensor> &has_initial_state,
+                  const std::optional<at::Tensor> &bias_,
+                  const std::optional<at::Tensor> &conv_states,
+                  const std::optional<at::Tensor> &query_start_loc,
+                  const std::optional<at::Tensor> &cache_indices,
+                  const std::optional<at::Tensor> &has_initial_state,
                   bool silu_activation,
                  // used to identify padding entries if cache_indices provided
                  // in case of padding, the kernel will return early
@@ -194,10 +194,10 @@ void causal_conv1d_fwd(const at::Tensor &x, const at::Tensor &weight,
 void causal_conv1d_update(const at::Tensor &x,
                      const at::Tensor &conv_state,
                      const at::Tensor &weight,
-                     const c10::optional<at::Tensor> &bias_,
+                     const std::optional<at::Tensor> &bias_,
                      bool silu_activation,
-                     const c10::optional<at::Tensor> &cache_seqlens_,
-                     const c10::optional<at::Tensor> &conv_state_indices_,
+                     const std::optional<at::Tensor> &cache_seqlens_,
+                     const std::optional<at::Tensor> &conv_state_indices_,
                      // used to identify padding entries if cache_indices provided
                      // in case of padding, the kernel will return early
                      int64_t pad_slot_id) {
diff --git a/csrc/mamba/mamba_ssm/selective_scan_fwd.cu b/csrc/mamba/mamba_ssm/selective_scan_fwd.cu
index 71624696338d0..bd0a34119c82b 100644
--- a/csrc/mamba/mamba_ssm/selective_scan_fwd.cu
+++ b/csrc/mamba/mamba_ssm/selective_scan_fwd.cu
@@ -402,14 +402,14 @@ void set_ssm_params_fwd(SSMParamsBase &params,
                         const torch::Tensor out,
                         const torch::Tensor z,
                         const torch::Tensor out_z,
-                        const c10::optional<at::Tensor>& D,
-                        const c10::optional<at::Tensor>& delta_bias,
+                        const std::optional<at::Tensor>& D,
+                        const std::optional<at::Tensor>& delta_bias,
                         const torch::Tensor ssm_states,
                         bool has_z, 
                         bool delta_softplus,
-                        const c10::optional<at::Tensor>& query_start_loc,
-                        const c10::optional<at::Tensor>& cache_indices,
-                        const c10::optional<at::Tensor>& has_initial_state,
+                        const std::optional<at::Tensor>& query_start_loc,
+                        const std::optional<at::Tensor>& cache_indices,
+                        const std::optional<at::Tensor>& has_initial_state,
                         bool varlen,
                         int64_t pad_slot_id) {
 
@@ -504,13 +504,13 @@ void set_ssm_params_fwd(SSMParamsBase &params,
 
 void selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta,
                   const torch::Tensor &A, const torch::Tensor &B, const torch::Tensor &C,
-                  const c10::optional<torch::Tensor> &D_,
-                  const c10::optional<torch::Tensor> &z_,
-                  const c10::optional<torch::Tensor> &delta_bias_,
+                  const std::optional<torch::Tensor> &D_,
+                  const std::optional<torch::Tensor> &z_,
+                  const std::optional<torch::Tensor> &delta_bias_,
                   bool delta_softplus,
-                  const c10::optional<torch::Tensor> &query_start_loc,
-                  const c10::optional<torch::Tensor> &cache_indices,
-                  const c10::optional<torch::Tensor> &has_initial_state,
+                  const std::optional<torch::Tensor> &query_start_loc,
+                  const std::optional<torch::Tensor> &cache_indices,
+                  const std::optional<torch::Tensor> &has_initial_state,
                   const torch::Tensor &ssm_states,
                   // used to identify padding entries if cache_indices provided
                   // in case of padding, the kernel will return early
diff --git a/csrc/ops.h b/csrc/ops.h
index 347c502845d8f..9efd9b0c24700 100644
--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -33,7 +33,7 @@ void paged_attention_v1(
     torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
     torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
     torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
-    int64_t max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
+    int64_t max_seq_len, const std::optional<torch::Tensor>& alibi_slopes,
     const std::string& kv_cache_dtype, double k_scale, double v_scale,
     const int64_t tp_rank, const int64_t blocksparse_local_blocks,
     const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
@@ -44,7 +44,7 @@ void paged_attention_v2(
     torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
     torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
     torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
-    int64_t max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
+    int64_t max_seq_len, const std::optional<torch::Tensor>& alibi_slopes,
     const std::string& kv_cache_dtype, double k_scale, double v_scale,
     const int64_t tp_rank, const int64_t blocksparse_local_blocks,
     const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
@@ -153,15 +153,15 @@ bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability);
 void cutlass_scaled_mm(torch::Tensor& out, torch::Tensor const& a,
                        torch::Tensor const& b, torch::Tensor const& a_scales,
                        torch::Tensor const& b_scales,
-                       c10::optional<torch::Tensor> const& bias);
+                       std::optional<torch::Tensor> const& bias);
 
 void cutlass_scaled_mm_azp(torch::Tensor& out, torch::Tensor const& a,
                            torch::Tensor const& b,
                            torch::Tensor const& a_scales,
                            torch::Tensor const& b_scales,
                            torch::Tensor const& azp_adj,
-                           c10::optional<torch::Tensor> const& azp,
-                           c10::optional<torch::Tensor> const& bias);
+                           std::optional<torch::Tensor> const& azp,
+                           std::optional<torch::Tensor> const& bias);
 
 bool cutlass_sparse_scaled_mm_supported(int64_t cuda_device_capability);
 
@@ -169,7 +169,7 @@ void cutlass_scaled_sparse_mm(torch::Tensor& out, torch::Tensor const& a,
                               torch::Tensor const& b, torch::Tensor const& e,
                               torch::Tensor const& a_scales,
                               torch::Tensor const& b_scales,
-                              c10::optional<torch::Tensor> const& bias);
+                              std::optional<torch::Tensor> const& bias);
 
 bool cutlass_sparse_compress_entry(torch::Tensor& a_compressed,
                                    torch::Tensor& e, torch::Tensor const& a);
@@ -177,11 +177,11 @@ bool cutlass_sparse_compress_entry(torch::Tensor& a_compressed,
 
 void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
                               torch::Tensor const& scale,
-                              c10::optional<torch::Tensor> const& azp);
+                              std::optional<torch::Tensor> const& azp);
 
 void dynamic_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
                                torch::Tensor& scales,
-                               c10::optional<torch::Tensor> const& azp);
+                               std::optional<torch::Tensor> const& azp);
 
 torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight,
                         torch::Tensor b_gptq_qzeros,
@@ -198,34 +198,34 @@ void dynamic_scaled_fp8_quant(torch::Tensor& out, torch::Tensor const& input,
 
 void dynamic_per_token_scaled_fp8_quant(
     torch::Tensor& out, torch::Tensor const& input, torch::Tensor& scale,
-    c10::optional<torch::Tensor> const& scale_ub);
+    std::optional<torch::Tensor> const& scale_ub);
 
 void selective_scan_fwd(const torch::Tensor& u, const torch::Tensor& delta,
                         const torch::Tensor& A, const torch::Tensor& B,
                         const torch::Tensor& C,
-                        const c10::optional<torch::Tensor>& D_,
-                        const c10::optional<torch::Tensor>& z_,
-                        const c10::optional<torch::Tensor>& delta_bias_,
+                        const std::optional<torch::Tensor>& D_,
+                        const std::optional<torch::Tensor>& z_,
+                        const std::optional<torch::Tensor>& delta_bias_,
                         bool delta_softplus,
-                        const c10::optional<torch::Tensor>& query_start_loc,
-                        const c10::optional<torch::Tensor>& cache_indices,
-                        const c10::optional<torch::Tensor>& has_initial_state,
+                        const std::optional<torch::Tensor>& query_start_loc,
+                        const std::optional<torch::Tensor>& cache_indices,
+                        const std::optional<torch::Tensor>& has_initial_state,
                         const torch::Tensor& ssm_states, int64_t pad_slot_id);
 
 void causal_conv1d_update(const at::Tensor& x, const at::Tensor& conv_state,
                           const at::Tensor& weight,
-                          const c10::optional<at::Tensor>& bias_,
+                          const std::optional<at::Tensor>& bias_,
                           bool silu_activation,
-                          const c10::optional<at::Tensor>& cache_seqlens_,
-                          const c10::optional<at::Tensor>& conv_state_indices_,
+                          const std::optional<at::Tensor>& cache_seqlens_,
+                          const std::optional<at::Tensor>& conv_state_indices_,
                           int64_t pad_slot_id);
 
 void causal_conv1d_fwd(const at::Tensor& x, const at::Tensor& weight,
-                       const c10::optional<at::Tensor>& bias_,
-                       const c10::optional<at::Tensor>& conv_states,
-                       const c10::optional<at::Tensor>& query_start_loc,
-                       const c10::optional<at::Tensor>& cache_indices,
-                       const c10::optional<at::Tensor>& has_initial_state,
+                       const std::optional<at::Tensor>& bias_,
+                       const std::optional<at::Tensor>& conv_states,
+                       const std::optional<at::Tensor>& query_start_loc,
+                       const std::optional<at::Tensor>& cache_indices,
+                       const std::optional<at::Tensor>& has_initial_state,
                        bool silu_activation, int64_t pad_slot_id);
 
 #ifndef USE_ROCM
diff --git a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu b/csrc/quantization/compressed_tensors/int8_quant_kernels.cu
index e9987535bd3ea..e79785827189d 100644
--- a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu
+++ b/csrc/quantization/compressed_tensors/int8_quant_kernels.cu
@@ -226,7 +226,7 @@ __global__ void dynamic_scaled_int8_azp_quant_kernel(
 void static_scaled_int8_quant(torch::Tensor& out,          // [..., hidden_size]
                               torch::Tensor const& input,  // [..., hidden_size]
                               torch::Tensor const& scale,
-                              c10::optional<torch::Tensor> const& azp) {
+                              std::optional<torch::Tensor> const& azp) {
   TORCH_CHECK(input.is_contiguous());
   TORCH_CHECK(out.is_contiguous());
   TORCH_CHECK(scale.numel() == 1);
@@ -257,7 +257,7 @@ void static_scaled_int8_quant(torch::Tensor& out,          // [..., hidden_size]
 void dynamic_scaled_int8_quant(
     torch::Tensor& out,          // [..., hidden_size]
     torch::Tensor const& input,  // [..., hidden_size]
-    torch::Tensor& scales, c10::optional<torch::Tensor> const& azp) {
+    torch::Tensor& scales, std::optional<torch::Tensor> const& azp) {
   TORCH_CHECK(input.is_contiguous());
   TORCH_CHECK(out.is_contiguous());
   TORCH_CHECK(scales.is_contiguous());
diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu
index dbb72e8bbd3f5..865fef5aeea11 100644
--- a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu
+++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu
@@ -39,7 +39,7 @@ void cutlass_scaled_mm_sm75(torch::Tensor& out, torch::Tensor const& a,
                             torch::Tensor const& b,
                             torch::Tensor const& a_scales,
                             torch::Tensor const& b_scales,
-                            c10::optional<torch::Tensor> const& bias) {
+                            std::optional<torch::Tensor> const& bias) {
   TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
   TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
   if (bias) {
@@ -58,8 +58,8 @@ void cutlass_scaled_mm_azp_sm75(torch::Tensor& out, torch::Tensor const& a,
                                 torch::Tensor const& a_scales,
                                 torch::Tensor const& b_scales,
                                 torch::Tensor const& azp_adj,
-                                c10::optional<torch::Tensor> const& azp,
-                                c10::optional<torch::Tensor> const& bias) {
+                                std::optional<torch::Tensor> const& azp,
+                                std::optional<torch::Tensor> const& bias) {
   TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
   TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
 
@@ -94,7 +94,7 @@ void cutlass_scaled_mm_sm80(torch::Tensor& out, torch::Tensor const& a,
                             torch::Tensor const& b,
                             torch::Tensor const& a_scales,
                             torch::Tensor const& b_scales,
-                            c10::optional<torch::Tensor> const& bias) {
+                            std::optional<torch::Tensor> const& bias) {
   TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
   TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
   if (bias) {
@@ -113,8 +113,8 @@ void cutlass_scaled_mm_azp_sm80(torch::Tensor& out, torch::Tensor const& a,
                                 torch::Tensor const& a_scales,
                                 torch::Tensor const& b_scales,
                                 torch::Tensor const& azp_adj,
-                                c10::optional<torch::Tensor> const& azp,
-                                c10::optional<torch::Tensor> const& bias) {
+                                std::optional<torch::Tensor> const& azp,
+                                std::optional<torch::Tensor> const& bias) {
   TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
   TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
 
@@ -165,7 +165,7 @@ void cutlass_scaled_mm_sm89(torch::Tensor& out, torch::Tensor const& a,
                             torch::Tensor const& b,
                             torch::Tensor const& a_scales,
                             torch::Tensor const& b_scales,
-                            c10::optional<torch::Tensor> const& bias) {
+                            std::optional<torch::Tensor> const& bias) {
   TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
   TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
   if (bias) {
@@ -184,8 +184,8 @@ void cutlass_scaled_mm_azp_sm89(torch::Tensor& out, torch::Tensor const& a,
                                 torch::Tensor const& a_scales,
                                 torch::Tensor const& b_scales,
                                 torch::Tensor const& azp_adj,
-                                c10::optional<torch::Tensor> const& azp,
-                                c10::optional<torch::Tensor> const& bias) {
+                                std::optional<torch::Tensor> const& azp,
+                                std::optional<torch::Tensor> const& bias) {
   TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
   TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
 
diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu
index 123f4359c0d1a..e18d7d79e5b77 100644
--- a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu
+++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu
@@ -51,7 +51,7 @@ void cutlass_scaled_mm_sm90(torch::Tensor& c, torch::Tensor const& a,
                             torch::Tensor const& b,
                             torch::Tensor const& a_scales,
                             torch::Tensor const& b_scales,
-                            c10::optional<torch::Tensor> const& bias) {
+                            std::optional<torch::Tensor> const& bias) {
   TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
   TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
   if (bias) {
@@ -70,8 +70,8 @@ void cutlass_scaled_mm_azp_sm90(torch::Tensor& out, torch::Tensor const& a,
                                 torch::Tensor const& a_scales,
                                 torch::Tensor const& b_scales,
                                 torch::Tensor const& azp_adj,
-                                c10::optional<torch::Tensor> const& azp,
-                                c10::optional<torch::Tensor> const& bias) {
+                                std::optional<torch::Tensor> const& azp,
+                                std::optional<torch::Tensor> const& bias) {
   TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
   TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
 
diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
index 4f7b6588ef3f7..3f2b52624f366 100644
--- a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
+++ b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
@@ -9,26 +9,26 @@ void cutlass_scaled_mm_sm75(torch::Tensor& c, torch::Tensor const& a,
                             torch::Tensor const& b,
                             torch::Tensor const& a_scales,
                             torch::Tensor const& b_scales,
-                            c10::optional<torch::Tensor> const& bias);
+                            std::optional<torch::Tensor> const& bias);
 
 void cutlass_scaled_mm_sm80(torch::Tensor& c, torch::Tensor const& a,
                             torch::Tensor const& b,
                             torch::Tensor const& a_scales,
                             torch::Tensor const& b_scales,
-                            c10::optional<torch::Tensor> const& bias);
+                            std::optional<torch::Tensor> const& bias);
 
 void cutlass_scaled_mm_sm89(torch::Tensor& c, torch::Tensor const& a,
                             torch::Tensor const& b,
                             torch::Tensor const& a_scales,
                             torch::Tensor const& b_scales,
-                            c10::optional<torch::Tensor> const& bias);
+                            std::optional<torch::Tensor> const& bias);
 
 #if defined ENABLE_SCALED_MM_C3X && ENABLE_SCALED_MM_C3X
 void cutlass_scaled_mm_sm90(torch::Tensor& c, torch::Tensor const& a,
                             torch::Tensor const& b,
                             torch::Tensor const& a_scales,
                             torch::Tensor const& b_scales,
-                            c10::optional<torch::Tensor> const& bias);
+                            std::optional<torch::Tensor> const& bias);
 #endif
 
 void cutlass_scaled_mm_azp_sm75(torch::Tensor& c, torch::Tensor const& a,
@@ -36,24 +36,24 @@ void cutlass_scaled_mm_azp_sm75(torch::Tensor& c, torch::Tensor const& a,
                                 torch::Tensor const& a_scales,
                                 torch::Tensor const& b_scales,
                                 torch::Tensor const& azp_adj,
-                                c10::optional<torch::Tensor> const& azp,
-                                c10::optional<torch::Tensor> const& bias);
+                                std::optional<torch::Tensor> const& azp,
+                                std::optional<torch::Tensor> const& bias);
 
 void cutlass_scaled_mm_azp_sm80(torch::Tensor& c, torch::Tensor const& a,
                                 torch::Tensor const& b,
                                 torch::Tensor const& a_scales,
                                 torch::Tensor const& b_scales,
                                 torch::Tensor const& azp_adj,
-                                c10::optional<torch::Tensor> const& azp,
-                                c10::optional<torch::Tensor> const& bias);
+                                std::optional<torch::Tensor> const& azp,
+                                std::optional<torch::Tensor> const& bias);
 
 void cutlass_scaled_mm_azp_sm89(torch::Tensor& c, torch::Tensor const& a,
                                 torch::Tensor const& b,
                                 torch::Tensor const& a_scales,
                                 torch::Tensor const& b_scales,
                                 torch::Tensor const& azp_adj,
-                                c10::optional<torch::Tensor> const& azp,
-                                c10::optional<torch::Tensor> const& bias);
+                                std::optional<torch::Tensor> const& azp,
+                                std::optional<torch::Tensor> const& bias);
 
 #if defined CUDA_VERSION && CUDA_VERSION >= 12000
 void cutlass_scaled_mm_azp_sm90(torch::Tensor& c, torch::Tensor const& a,
@@ -61,8 +61,8 @@ void cutlass_scaled_mm_azp_sm90(torch::Tensor& c, torch::Tensor const& a,
                                 torch::Tensor const& a_scales,
                                 torch::Tensor const& b_scales,
                                 torch::Tensor const& azp_adj,
-                                c10::optional<torch::Tensor> const& azp,
-                                c10::optional<torch::Tensor> const& bias);
+                                std::optional<torch::Tensor> const& azp,
+                                std::optional<torch::Tensor> const& bias);
 #endif
 
 bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability) {
@@ -84,7 +84,7 @@ bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability) {
 void cutlass_scaled_mm(torch::Tensor& c, torch::Tensor const& a,
                        torch::Tensor const& b, torch::Tensor const& a_scales,
                        torch::Tensor const& b_scales,
-                       c10::optional<torch::Tensor> const& bias) {
+                       std::optional<torch::Tensor> const& bias) {
   // Checks for conformality
   TORCH_CHECK(a.dim() == 2 && b.dim() == 2 && c.dim() == 2);
   TORCH_CHECK(c.size(0) == a.size(0) && a.size(1) == b.size(0) &&
@@ -148,8 +148,8 @@ void cutlass_scaled_mm_azp(torch::Tensor& c, torch::Tensor const& a,
                            torch::Tensor const& a_scales,
                            torch::Tensor const& b_scales,
                            torch::Tensor const& azp_adj,
-                           c10::optional<torch::Tensor> const& azp,
-                           c10::optional<torch::Tensor> const& bias) {
+                           std::optional<torch::Tensor> const& azp,
+                           std::optional<torch::Tensor> const& bias) {
   // Checks for conformality
   TORCH_CHECK(a.dim() == 2 && b.dim() == 2 && c.dim() == 2);
   TORCH_CHECK(c.size(0) == a.size(0) && a.size(1) == b.size(0) &&
diff --git a/csrc/quantization/machete/generate.py b/csrc/quantization/machete/generate.py
index 2df4d181902f8..a9b5ddf4cbdd2 100644
--- a/csrc/quantization/machete/generate.py
+++ b/csrc/quantization/machete/generate.py
@@ -63,7 +63,7 @@
 
 
 static inline std::optional<at::ScalarType> maybe_scalartype(
-    c10::optional<at::Tensor> const& t) {
+    std::optional<at::Tensor> const& t) {
     if (!t) {
       return std::nullopt;
     } else {
diff --git a/csrc/quantization/machete/machete_mm_kernel.cuh b/csrc/quantization/machete/machete_mm_kernel.cuh
index d4d19ae5deec7..e4af067915e0a 100644
--- a/csrc/quantization/machete/machete_mm_kernel.cuh
+++ b/csrc/quantization/machete/machete_mm_kernel.cuh
@@ -183,11 +183,11 @@ struct MacheteKernelTemplate {
       torch::Tensor const& A,  // MxK matrix
       torch::Tensor const& B,  // KxN prepacked matrix
       torch::Tensor& D,        // MxN matrix
-      c10::optional<torch::Tensor> const& maybe_g_scales,  // scale_KxN matrix
-      c10::optional<torch::Tensor> const& maybe_g_zeros,   // scale_KxN matrix
-      c10::optional<int64_t> maybe_group_size,
-      c10::optional<torch::Tensor> const& maybe_ch_scales,   // len N vector
-      c10::optional<torch::Tensor> const& maybe_tok_scales)  // len M vector
+      std::optional<torch::Tensor> const& maybe_g_scales,  // scale_KxN matrix
+      std::optional<torch::Tensor> const& maybe_g_zeros,   // scale_KxN matrix
+      std::optional<int64_t> maybe_group_size,
+      std::optional<torch::Tensor> const& maybe_ch_scales,   // len N vector
+      std::optional<torch::Tensor> const& maybe_tok_scales)  // len M vector
   {
     static_assert(!with_group_zeropoints || with_group_scales);
 
diff --git a/csrc/quantization/machete/machete_mm_launcher.cuh b/csrc/quantization/machete/machete_mm_launcher.cuh
index 4b0da5b303e0c..cabe0af46f069 100644
--- a/csrc/quantization/machete/machete_mm_launcher.cuh
+++ b/csrc/quantization/machete/machete_mm_launcher.cuh
@@ -13,23 +13,23 @@ struct MMArgs {
   torch::Tensor const& A;
   torch::Tensor const& B;
   vllm::ScalarType const& b_type;
-  c10::optional<at::ScalarType> const& maybe_out_type;
-  c10::optional<torch::Tensor> const& maybe_group_scales;
-  c10::optional<torch::Tensor> const& maybe_group_zeros;
-  c10::optional<int64_t> maybe_group_size;
-  c10::optional<torch::Tensor> const& maybe_channel_scales;
-  c10::optional<torch::Tensor> const& maybe_token_scales;
-  c10::optional<std::string> maybe_schedule;
+  std::optional<at::ScalarType> const& maybe_out_type;
+  std::optional<torch::Tensor> const& maybe_group_scales;
+  std::optional<torch::Tensor> const& maybe_group_zeros;
+  std::optional<int64_t> maybe_group_size;
+  std::optional<torch::Tensor> const& maybe_channel_scales;
+  std::optional<torch::Tensor> const& maybe_token_scales;
+  std::optional<std::string> maybe_schedule;
 };
 
 struct SupportedSchedulesArgs {
   at::ScalarType a_type;
   vllm::ScalarType b_type;
-  c10::optional<at::ScalarType> maybe_group_scales_type;
-  c10::optional<at::ScalarType> maybe_group_zeros_type;
-  c10::optional<at::ScalarType> maybe_channel_scales_type;
-  c10::optional<at::ScalarType> maybe_token_scales_type;
-  c10::optional<at::ScalarType> maybe_out_type;
+  std::optional<at::ScalarType> maybe_group_scales_type;
+  std::optional<at::ScalarType> maybe_group_zeros_type;
+  std::optional<at::ScalarType> maybe_channel_scales_type;
+  std::optional<at::ScalarType> maybe_token_scales_type;
+  std::optional<at::ScalarType> maybe_out_type;
 };
 
 torch::Tensor mm_dispatch(MMArgs args);
diff --git a/csrc/quantization/machete/machete_prepack_launcher.cuh b/csrc/quantization/machete/machete_prepack_launcher.cuh
index 3486d28be2126..634b651a4d107 100644
--- a/csrc/quantization/machete/machete_prepack_launcher.cuh
+++ b/csrc/quantization/machete/machete_prepack_launcher.cuh
@@ -10,7 +10,7 @@ struct PrepackBArgs {
   torch::Tensor const& B;
   at::ScalarType a_type;
   vllm::ScalarType b_type;
-  c10::optional<at::ScalarType> maybe_group_scales_type;
+  std::optional<at::ScalarType> maybe_group_scales_type;
 };
 
 template <typename PrepackedLayoutB>
diff --git a/csrc/quantization/machete/machete_pytorch.cu b/csrc/quantization/machete/machete_pytorch.cu
index da2c2fb0d3e77..05a51ee21ddb7 100644
--- a/csrc/quantization/machete/machete_pytorch.cu
+++ b/csrc/quantization/machete/machete_pytorch.cu
@@ -10,11 +10,11 @@ using namespace vllm;
 
 std::vector<std::string> supported_schedules(
     at::ScalarType a_type, int64_t b_type_id,
-    c10::optional<at::ScalarType> maybe_group_scales_type,
-    c10::optional<at::ScalarType> maybe_group_zeros_type,
-    c10::optional<at::ScalarType> maybe_channel_scales_type,
-    c10::optional<at::ScalarType> maybe_token_scales_type,
-    c10::optional<at::ScalarType> maybe_out_type) {
+    std::optional<at::ScalarType> maybe_group_scales_type,
+    std::optional<at::ScalarType> maybe_group_zeros_type,
+    std::optional<at::ScalarType> maybe_channel_scales_type,
+    std::optional<at::ScalarType> maybe_token_scales_type,
+    std::optional<at::ScalarType> maybe_out_type) {
   ScalarType const b_type = ScalarType::from_id(b_type_id);
   return supported_schedules_dispatch({
       .a_type = a_type,
@@ -29,13 +29,13 @@ std::vector<std::string> supported_schedules(
 
 torch::Tensor mm(torch::Tensor const& A, torch::Tensor const& B,
                  int64_t b_type_id,
-                 c10::optional<at::ScalarType> const& maybe_out_type,
-                 c10::optional<torch::Tensor> const& maybe_group_scales,
-                 c10::optional<torch::Tensor> const& maybe_group_zeros,
-                 c10::optional<int64_t> maybe_group_size,
-                 c10::optional<torch::Tensor> const& maybe_channel_scales,
-                 c10::optional<torch::Tensor> const& maybe_token_scales,
-                 c10::optional<std::string> maybe_schedule) {
+                 std::optional<at::ScalarType> const& maybe_out_type,
+                 std::optional<torch::Tensor> const& maybe_group_scales,
+                 std::optional<torch::Tensor> const& maybe_group_zeros,
+                 std::optional<int64_t> maybe_group_size,
+                 std::optional<torch::Tensor> const& maybe_channel_scales,
+                 std::optional<torch::Tensor> const& maybe_token_scales,
+                 std::optional<std::string> maybe_schedule) {
   ScalarType const b_type = ScalarType::from_id(b_type_id);
   return mm_dispatch({.A = A,
                       .B = B,
@@ -51,7 +51,7 @@ torch::Tensor mm(torch::Tensor const& A, torch::Tensor const& B,
 
 torch::Tensor prepack_B(
     torch::Tensor const& B, at::ScalarType const& a_type, int64_t b_type_id,
-    c10::optional<at::ScalarType> const& maybe_group_scales_type) {
+    std::optional<at::ScalarType> const& maybe_group_scales_type) {
   ScalarType const b_type = ScalarType::from_id(b_type_id);
   return prepack_B_dispatch(
       {.B = B,
diff --git a/csrc/rocm/attention.cu b/csrc/rocm/attention.cu
index b48348a515c8d..0fec9624c457e 100644
--- a/csrc/rocm/attention.cu
+++ b/csrc/rocm/attention.cu
@@ -928,7 +928,7 @@ void paged_attention_custom_launcher(
     torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
     torch::Tensor& value_cache, const int num_kv_heads, float scale,
     torch::Tensor& block_tables, torch::Tensor& context_lens,
-    int max_context_len, const c10::optional<torch::Tensor>& alibi_slopes,
+    int max_context_len, const std::optional<torch::Tensor>& alibi_slopes,
     float k_scale, float v_scale) {
   int num_seqs = query.size(0);
   int num_heads = query.size(1);
@@ -1086,7 +1086,7 @@ void paged_attention(
     torch::Tensor& block_tables,  // [num_seqs, max_num_blocks_per_seq]
     torch::Tensor& context_lens,  // [num_seqs]
     int64_t block_size, int64_t max_context_len,
-    const c10::optional<torch::Tensor>& alibi_slopes,
+    const std::optional<torch::Tensor>& alibi_slopes,
     const std::string& kv_cache_dtype, double k_scale, double v_scale) {
   const int head_size = query.size(2);
   if (kv_cache_dtype == "auto") {
diff --git a/csrc/rocm/ops.h b/csrc/rocm/ops.h
index 9f085115a3956..34b2f9ce8a4c4 100644
--- a/csrc/rocm/ops.h
+++ b/csrc/rocm/ops.h
@@ -9,6 +9,6 @@ void paged_attention(torch::Tensor& out, torch::Tensor& exp_sums,
                      double scale, torch::Tensor& block_tables,
                      torch::Tensor& context_lens, int64_t block_size,
                      int64_t max_context_len,
-                     const c10::optional<torch::Tensor>& alibi_slopes,
+                     const std::optional<torch::Tensor>& alibi_slopes,
                      const std::string& kv_cache_dtype, double k_scale,
                      double v_scale);
diff --git a/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu b/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu
index 6223dc8cca704..5a1879787c328 100644
--- a/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu
+++ b/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu
@@ -286,7 +286,7 @@ void cutlass_scaled_sparse_mm_sm90(torch::Tensor& out, torch::Tensor const& a,
                                    torch::Tensor const& bt_meta,
                                    torch::Tensor const& a_scales,
                                    torch::Tensor const& b_scales,
-                                   c10::optional<torch::Tensor> const& bias) {
+                                   std::optional<torch::Tensor> const& bias) {
   TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
   TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
   if (bias) {
diff --git a/csrc/sparse/cutlass/sparse_scaled_mm_entry.cu b/csrc/sparse/cutlass/sparse_scaled_mm_entry.cu
index d464b045b895f..371de0950bc99 100644
--- a/csrc/sparse/cutlass/sparse_scaled_mm_entry.cu
+++ b/csrc/sparse/cutlass/sparse_scaled_mm_entry.cu
@@ -22,7 +22,7 @@ void cutlass_scaled_sparse_mm_sm90(torch::Tensor& c, torch::Tensor const& a,
                                    torch::Tensor const& e,
                                    torch::Tensor const& a_scales,
                                    torch::Tensor const& b_scales,
-                                   c10::optional<torch::Tensor> const& bias);
+                                   std::optional<torch::Tensor> const& bias);
 #endif
 
 void cutlass_scaled_sparse_mm(torch::Tensor& c, torch::Tensor const& a,
@@ -30,7 +30,7 @@ void cutlass_scaled_sparse_mm(torch::Tensor& c, torch::Tensor const& a,
                               torch::Tensor const& bt_meta,
                               torch::Tensor const& a_scales,
                               torch::Tensor const& b_scales,
-                              c10::optional<torch::Tensor> const& bias) {
+                              std::optional<torch::Tensor> const& bias) {
   // Checks for conformality
   TORCH_CHECK(a.dim() == 2 && bt_nzs.dim() == 2 && c.dim() == 2);
   TORCH_CHECK(c.size(1) == bt_nzs.size(0) && bt_nzs.size(1) * 2 == a.size(1) &&

From 635b897246da121238454ed4b2bbc87cb4d4166b Mon Sep 17 00:00:00 2001
From: cennn <61925104+cennn@users.noreply.github.com>
Date: Sun, 5 Jan 2025 23:09:11 +0800
Subject: [PATCH 078/462] [distributed] remove pynccl's redundant stream
 (#11744)

---
 tests/distributed/test_pynccl.py              |  5 ++--
 .../device_communicators/pynccl.py            | 28 ++++++-------------
 vllm/distributed/parallel_state.py            |  3 +-
 3 files changed, 12 insertions(+), 24 deletions(-)

diff --git a/tests/distributed/test_pynccl.py b/tests/distributed/test_pynccl.py
index 36cfe42251384..a77b48d5e49f3 100644
--- a/tests/distributed/test_pynccl.py
+++ b/tests/distributed/test_pynccl.py
@@ -137,9 +137,8 @@ def worker_fn_with_cudagraph():
         # run something in the default stream to initialize torch engine
         a = torch.ones((4, 4), device=f'cuda:{pynccl_comm.rank}')
         torch.cuda.synchronize()
-        with torch.cuda.graph(
-                graph, stream=pynccl_comm.stream), pynccl_comm.change_state(
-                    enable=True):
+        with torch.cuda.graph(graph), \
+            pynccl_comm.change_state(enable=True):
             a_out = pynccl_comm.all_reduce(a)
         torch.cuda.synchronize()
         graph.replay()
diff --git a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py
index a6800f93f167b..93d96fd8f5686 100644
--- a/vllm/distributed/device_communicators/pynccl.py
+++ b/vllm/distributed/device_communicators/pynccl.py
@@ -51,7 +51,6 @@ def __init__(
         if self.world_size == 1:
             self.available = False
             self.disabled = True
-            self.stream = None
             return
         try:
             self.nccl = NCCLLibrary(library_path)
@@ -60,7 +59,6 @@ def __init__(
             # e.g. in a non-GPU environment
             self.available = False
             self.disabled = True
-            self.stream = None
             return
 
         self.available = True
@@ -98,12 +96,12 @@ def __init__(
         with torch.cuda.device(device):
             self.comm: ncclComm_t = self.nccl.ncclCommInitRank(
                 self.world_size, self.unique_id, self.rank)
-            self.stream = torch.cuda.Stream()
 
+            stream = torch.cuda.current_stream()
             # A small all_reduce for warmup.
             data = torch.zeros(1, device=device)
             self.all_reduce(data)
-            self.stream.synchronize()
+            stream.synchronize()
             del data
 
     def all_reduce(self,
@@ -122,7 +120,7 @@ def all_reduce(self,
         out_tensor = torch.empty_like(in_tensor)
 
         if stream is None:
-            stream = self.stream
+            stream = torch.cuda.current_stream()
         self.nccl.ncclAllReduce(buffer_type(in_tensor.data_ptr()),
                                 buffer_type(out_tensor.data_ptr()),
                                 in_tensor.numel(),
@@ -144,7 +142,7 @@ def all_gather(self,
             f"this nccl communicator is created to work on {self.device}, "
             f"but the input tensor is on {input_tensor.device}")
         if stream is None:
-            stream = self.stream
+            stream = torch.cuda.current_stream()
         self.nccl.ncclAllGather(
             buffer_type(input_tensor.data_ptr()),
             buffer_type(output_tensor.data_ptr()), input_tensor.numel(),
@@ -165,7 +163,7 @@ def reduce_scatter(self,
             f"this nccl communicator is created to work on {self.device}, "
             f"but the input tensor is on {input_tensor.device}")
         if stream is None:
-            stream = self.stream
+            stream = torch.cuda.current_stream()
         self.nccl.ncclReduceScatter(
             buffer_type(input_tensor.data_ptr()),
             buffer_type(output_tensor.data_ptr()), output_tensor.numel(),
@@ -180,7 +178,7 @@ def send(self, tensor: torch.Tensor, dst: int, stream=None):
             f"this nccl communicator is created to work on {self.device}, "
             f"but the input tensor is on {tensor.device}")
         if stream is None:
-            stream = self.stream
+            stream = torch.cuda.current_stream()
         self.nccl.ncclSend(buffer_type(tensor.data_ptr()), tensor.numel(),
                            ncclDataTypeEnum.from_torch(tensor.dtype), dst,
                            self.comm, cudaStream_t(stream.cuda_stream))
@@ -192,7 +190,7 @@ def recv(self, tensor: torch.Tensor, src: int, stream=None):
             f"this nccl communicator is created to work on {self.device}, "
             f"but the input tensor is on {tensor.device}")
         if stream is None:
-            stream = self.stream
+            stream = torch.cuda.current_stream()
         self.nccl.ncclRecv(buffer_type(tensor.data_ptr()), tensor.numel(),
                            ncclDataTypeEnum.from_torch(tensor.dtype), src,
                            self.comm, cudaStream_t(stream.cuda_stream))
@@ -204,7 +202,7 @@ def broadcast(self, tensor: torch.Tensor, src: int, stream=None):
             f"this nccl communicator is created to work on {self.device}, "
             f"but the input tensor is on {tensor.device}")
         if stream is None:
-            stream = self.stream
+            stream = torch.cuda.current_stream()
         if src == self.rank:
             sendbuff = buffer_type(tensor.data_ptr())
             # NCCL requires the sender also to have a receive buffer
@@ -217,9 +215,7 @@ def broadcast(self, tensor: torch.Tensor, src: int, stream=None):
                                 self.comm, cudaStream_t(stream.cuda_stream))
 
     @contextmanager
-    def change_state(self,
-                     enable: Optional[bool] = None,
-                     stream: Optional[torch.cuda.Stream] = None):
+    def change_state(self, enable: Optional[bool] = None):
         """
         A context manager to change the state of the communicator.
         """
@@ -227,15 +223,9 @@ def change_state(self,
             # guess a default value when not specified
             enable = self.available
 
-        if stream is None:
-            stream = self.stream
-
         old_disable = self.disabled
-        old_stream = self.stream
 
-        self.stream = stream
         self.disabled = not enable
         yield
 
         self.disabled = old_disable
-        self.stream = old_stream
diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index a0d4235460f3b..dccd3addbcb35 100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -310,8 +310,7 @@ def graph_capture(
             if not pynccl_comm:
                 maybe_pynccl_context = nullcontext()
             else:
-                maybe_pynccl_context = pynccl_comm.change_state(
-                    stream=torch.cuda.current_stream())
+                maybe_pynccl_context = pynccl_comm.change_state()
             with maybe_pynccl_context:
                 yield graph_capture_context
 

From eba17173d34548a39989eae2530dce53496a1f3d Mon Sep 17 00:00:00 2001
From: Lancer <402430575@qq.com>
Date: Mon, 6 Jan 2025 00:48:16 +0800
Subject: [PATCH 079/462] fix: [doc] fix typo (#11751)

Co-authored-by: Lancer <maruixiang6688@gmail.com>
---
 vllm/core/block/block_table.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py
index dca0b3fe8d304..90c1438efbd08 100644
--- a/vllm/core/block/block_table.py
+++ b/vllm/core/block/block_table.py
@@ -23,7 +23,7 @@ class BlockTable:
             blocks to initialize the BlockTable with. If not provided, an empty
             BlockTable is created.
         max_block_sliding_window (Optional[int], optional): The number of
-            blocks to keep around for each sequance. If None, all blocks
+            blocks to keep around for each sequence. If None, all blocks
             are kept (eg., when sliding window is not used).
             It should at least fit the sliding window size of the model.
 

From 33fc1e2e86ce5d60940463f8f71daaa61728d3b7 Mon Sep 17 00:00:00 2001
From: Robert Shaw
 <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
Date: Sun, 5 Jan 2025 16:35:01 -0500
Subject: [PATCH 080/462] [Frontend] Improve `StreamingResponse` Exception
 Handling (#11752)

---
 vllm/entrypoints/openai/serving_chat.py       | 4 ++--
 vllm/entrypoints/openai/serving_completion.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 9ba5eeb7709c9..89a119ac65695 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -301,7 +301,7 @@ async def chat_completion_stream_generator(
                 ] * num_choices
             else:
                 tool_parsers = [None] * num_choices
-        except RuntimeError as e:
+        except Exception as e:
             logger.exception("Error in tool parser creation.")
             data = self.create_streaming_error_response(str(e))
             yield f"data: {data}\n\n"
@@ -591,7 +591,7 @@ async def chat_completion_stream_generator(
                 completion_tokens=num_completion_tokens,
                 total_tokens=num_prompt_tokens + num_completion_tokens)
 
-        except ValueError as e:
+        except Exception as e:
             # TODO: Use a vllm-specific Validation Error
             logger.exception("Error in chat completion stream generator.")
             data = self.create_streaming_error_response(str(e))
diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py
index 17197dce8da23..2c9c20caf8119 100644
--- a/vllm/entrypoints/openai/serving_completion.py
+++ b/vllm/entrypoints/openai/serving_completion.py
@@ -371,7 +371,7 @@ async def completion_stream_generator(
             # report to FastAPI middleware aggregate usage across all choices
             request_metadata.final_usage_info = final_usage_info
 
-        except ValueError as e:
+        except Exception as e:
             # TODO: Use a vllm-specific Validation Error
             data = self.create_streaming_error_response(str(e))
             yield f"data: {data}\n\n"

From 9e764e7b105a483ebc702cad33922ba8d8c210e1 Mon Sep 17 00:00:00 2001
From: cennn <61925104+cennn@users.noreply.github.com>
Date: Mon, 6 Jan 2025 09:05:48 +0800
Subject: [PATCH 081/462] [distributed] remove pynccl's redundant change_state
 (#11749)

---
 tests/distributed/test_pynccl.py              | 64 ++++++++-----------
 .../device_communicators/pynccl.py            | 17 -----
 vllm/distributed/parallel_state.py            |  9 +--
 3 files changed, 28 insertions(+), 62 deletions(-)

diff --git a/tests/distributed/test_pynccl.py b/tests/distributed/test_pynccl.py
index a77b48d5e49f3..a8571a1157892 100644
--- a/tests/distributed/test_pynccl.py
+++ b/tests/distributed/test_pynccl.py
@@ -59,8 +59,7 @@ def worker_fn():
                                      device=get_world_group().device)
     tensor = torch.ones(16, 1024, 1024,
                         dtype=torch.float32).cuda(pynccl_comm.rank)
-    with pynccl_comm.change_state(enable=True):
-        tensor = pynccl_comm.all_reduce(tensor)
+    tensor = pynccl_comm.all_reduce(tensor)
     torch.cuda.synchronize()
     assert torch.all(tensor == pynccl_comm.world_size).cpu().item()
 
@@ -81,17 +80,16 @@ def multiple_allreduce_worker_fn():
     group = groups[0] if torch.distributed.get_rank() in [0, 1] else groups[1]
     pynccl_comm = PyNcclCommunicator(group=group, device=device)
     tensor = torch.ones(16, 1024, 1024, dtype=torch.float32, device=device)
-    with pynccl_comm.change_state(enable=True):
-        # two groups can communicate independently
-        if torch.distributed.get_rank() in [0, 1]:
-            tensor = pynccl_comm.all_reduce(tensor)
-            tensor = pynccl_comm.all_reduce(tensor)
-            torch.cuda.synchronize()
-            assert torch.all(tensor == 4).cpu().item()
-        else:
-            tensor = pynccl_comm.all_reduce(tensor)
-            torch.cuda.synchronize()
-            assert torch.all(tensor == 2).cpu().item()
+    # two groups can communicate independently
+    if torch.distributed.get_rank() in [0, 1]:
+        tensor = pynccl_comm.all_reduce(tensor)
+        tensor = pynccl_comm.all_reduce(tensor)
+        torch.cuda.synchronize()
+        assert torch.all(tensor == 4).cpu().item()
+    else:
+        tensor = pynccl_comm.all_reduce(tensor)
+        torch.cuda.synchronize()
+        assert torch.all(tensor == 2).cpu().item()
 
 
 @pytest.mark.skipif(torch.cuda.device_count() < 4,
@@ -137,8 +135,7 @@ def worker_fn_with_cudagraph():
         # run something in the default stream to initialize torch engine
         a = torch.ones((4, 4), device=f'cuda:{pynccl_comm.rank}')
         torch.cuda.synchronize()
-        with torch.cuda.graph(graph), \
-            pynccl_comm.change_state(enable=True):
+        with torch.cuda.graph(graph):
             a_out = pynccl_comm.all_reduce(a)
         torch.cuda.synchronize()
         graph.replay()
@@ -167,8 +164,7 @@ def all_gather_worker_fn():
         for r in range(world_size)
     ]).to(device)
 
-    with pynccl_comm.change_state(enable=True):
-        pynccl_comm.all_gather(result, tensor)
+    pynccl_comm.all_gather(result, tensor)
     torch.cuda.synchronize()
     torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-8)
 
@@ -205,8 +201,7 @@ def reduce_scatter_worker_fn():
     expected = sum(tensor[rank * scattered_size:(rank + 1) * scattered_size]
                    for tensor in all_tensors).to(device)
 
-    with pynccl_comm.change_state(enable=True):
-        pynccl_comm.reduce_scatter(result, tensor)
+    pynccl_comm.reduce_scatter(result, tensor)
     torch.cuda.synchronize()
     torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-8)
 
@@ -233,15 +228,13 @@ def send_recv_worker_fn():
     else:
         tensor = torch.empty(16, 1024, 1024,
                              dtype=torch.float32).cuda(pynccl_comm.rank)
-    with pynccl_comm.change_state(enable=True):
-        if pynccl_comm.rank == 0:
-            pynccl_comm.send(tensor,
-                             dst=(pynccl_comm.rank + 1) %
-                             pynccl_comm.world_size)
-        else:
-            pynccl_comm.recv(tensor,
-                             src=(pynccl_comm.rank - 1) %
-                             pynccl_comm.world_size)
+
+    if pynccl_comm.rank == 0:
+        pynccl_comm.send(tensor,
+                         dst=(pynccl_comm.rank + 1) % pynccl_comm.world_size)
+    else:
+        pynccl_comm.recv(tensor,
+                         src=(pynccl_comm.rank - 1) % pynccl_comm.world_size)
     torch.cuda.synchronize()
     assert torch.all(tensor == 1).cpu().item()
 
@@ -272,15 +265,12 @@ def multiple_send_recv_worker_fn():
                              1024,
                              dtype=torch.float32,
                              device=device)
-    with pynccl_comm.change_state(enable=True):
-        if torch.distributed.get_rank() in [0, 1]:
-            pynccl_comm.send(tensor,
-                             dst=(pynccl_comm.rank + 1) %
-                             pynccl_comm.world_size)
-        else:
-            pynccl_comm.recv(tensor,
-                             src=(pynccl_comm.rank - 1) %
-                             pynccl_comm.world_size)
+    if torch.distributed.get_rank() in [0, 1]:
+        pynccl_comm.send(tensor,
+                         dst=(pynccl_comm.rank + 1) % pynccl_comm.world_size)
+    else:
+        pynccl_comm.recv(tensor,
+                         src=(pynccl_comm.rank - 1) % pynccl_comm.world_size)
     torch.cuda.synchronize()
     if torch.distributed.get_rank() in [0, 2]:
         assert torch.all(tensor == 1).cpu().item()
diff --git a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py
index 93d96fd8f5686..fda4d007ceb5b 100644
--- a/vllm/distributed/device_communicators/pynccl.py
+++ b/vllm/distributed/device_communicators/pynccl.py
@@ -1,4 +1,3 @@
-from contextlib import contextmanager
 from typing import Optional, Union
 
 # ===================== import region =====================
@@ -213,19 +212,3 @@ def broadcast(self, tensor: torch.Tensor, src: int, stream=None):
         self.nccl.ncclBroadcast(sendbuff, recvbuff, tensor.numel(),
                                 ncclDataTypeEnum.from_torch(tensor.dtype), src,
                                 self.comm, cudaStream_t(stream.cuda_stream))
-
-    @contextmanager
-    def change_state(self, enable: Optional[bool] = None):
-        """
-        A context manager to change the state of the communicator.
-        """
-        if enable is None:
-            # guess a default value when not specified
-            enable = self.available
-
-        old_disable = self.disabled
-
-        self.disabled = not enable
-        yield
-
-        self.disabled = old_disable
diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index dccd3addbcb35..a837c1dc5953b 100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -305,14 +305,7 @@ def graph_capture(
             stream.wait_stream(curr_stream)
 
         with torch.cuda.stream(stream), maybe_ca_context:
-            pynccl_comm = self.pynccl_comm
-            maybe_pynccl_context: Any
-            if not pynccl_comm:
-                maybe_pynccl_context = nullcontext()
-            else:
-                maybe_pynccl_context = pynccl_comm.change_state()
-            with maybe_pynccl_context:
-                yield graph_capture_context
+            yield graph_capture_context
 
     def all_reduce(self, input_: torch.Tensor) -> torch.Tensor:
         """

From 402d37836059463c7ec8b1e25d40c29138f1dd40 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Mon, 6 Jan 2025 10:18:33 +0800
Subject: [PATCH 082/462] [Doc] [1/N] Reorganize Getting Started section
 (#11645)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 docs/source/design/arch_overview.md           |  3 +--
 docs/source/design/multiprocessing.md         |  2 +-
 docs/source/{usage => getting_started}/faq.md |  0
 .../cpu-arm.md}                               |  2 +-
 .../cpu-x86.md}                               |  6 +++---
 .../gpu-cuda.md}                              |  4 ++--
 .../gpu-rocm.md}                              |  2 +-
 .../hpu-gaudi.md}                             |  4 +++-
 .../getting_started/installation/index.md     | 19 +++++++++++++++++++
 .../neuron.md}                                |  2 +-
 .../openvino.md}                              |  4 ++--
 .../tpu.md}                                   |  2 +-
 .../xpu.md}                                   |  2 +-
 docs/source/getting_started/quickstart.md     |  2 +-
 .../{debugging.md => troubleshooting.md}      | 11 ++++++-----
 docs/source/index.md                          | 16 ++++------------
 docs/source/models/generative_models.md       |  2 +-
 docs/source/models/pooling_models.md          |  2 +-
 docs/source/serving/distributed_serving.md    |  2 +-
 docs/source/usage/spec_decode.md              |  4 ++--
 docs/source/usage/structured_outputs.md       |  2 +-
 vllm/utils.py                                 |  2 +-
 22 files changed, 54 insertions(+), 41 deletions(-)
 rename docs/source/{usage => getting_started}/faq.md (100%)
 rename docs/source/getting_started/{arm-installation.md => installation/cpu-arm.md} (92%)
 rename docs/source/getting_started/{cpu-installation.md => installation/cpu-x86.md} (95%)
 rename docs/source/getting_started/{installation.md => installation/gpu-cuda.md} (99%)
 rename docs/source/getting_started/{amd-installation.md => installation/gpu-rocm.md} (99%)
 rename docs/source/getting_started/{gaudi-installation.md => installation/hpu-gaudi.md} (99%)
 create mode 100644 docs/source/getting_started/installation/index.md
 rename docs/source/getting_started/{neuron-installation.md => installation/neuron.md} (99%)
 rename docs/source/getting_started/{openvino-installation.md => installation/openvino.md} (90%)
 rename docs/source/getting_started/{tpu-installation.md => installation/tpu.md} (99%)
 rename docs/source/getting_started/{xpu-installation.md => installation/xpu.md} (98%)
 rename docs/source/getting_started/{debugging.md => troubleshooting.md} (94%)

diff --git a/docs/source/design/arch_overview.md b/docs/source/design/arch_overview.md
index 475a3e5fa9ddc..2f1280c047672 100644
--- a/docs/source/design/arch_overview.md
+++ b/docs/source/design/arch_overview.md
@@ -77,8 +77,7 @@ python -m vllm.entrypoints.openai.api_server --model <model>
 
 That code can be found in <gh-file:vllm/entrypoints/openai/api_server.py>.
 
-More details on the API server can be found in the {doc}`OpenAI Compatible
-Server </serving/openai_compatible_server>` document.
+More details on the API server can be found in the [OpenAI-Compatible Server](#openai-compatible-server) document.
 
 ## LLM Engine
 
diff --git a/docs/source/design/multiprocessing.md b/docs/source/design/multiprocessing.md
index 34564413b34f6..da87638e5b743 100644
--- a/docs/source/design/multiprocessing.md
+++ b/docs/source/design/multiprocessing.md
@@ -2,7 +2,7 @@
 
 ## Debugging
 
-Please see the [Debugging Tips](#debugging-python-multiprocessing)
+Please see the [Troubleshooting](#troubleshooting-python-multiprocessing)
 page for information on known issues and how to solve them.
 
 ## Introduction
diff --git a/docs/source/usage/faq.md b/docs/source/getting_started/faq.md
similarity index 100%
rename from docs/source/usage/faq.md
rename to docs/source/getting_started/faq.md
diff --git a/docs/source/getting_started/arm-installation.md b/docs/source/getting_started/installation/cpu-arm.md
similarity index 92%
rename from docs/source/getting_started/arm-installation.md
rename to docs/source/getting_started/installation/cpu-arm.md
index 799b597b3ad5d..a46e2c010600d 100644
--- a/docs/source/getting_started/arm-installation.md
+++ b/docs/source/getting_started/installation/cpu-arm.md
@@ -2,7 +2,7 @@
 
 # Installation for ARM CPUs
 
-vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CPU backend initially developed for the x86 platform. This guide provides installation instructions specific to ARM. For additional details on supported features, refer to the x86 platform documentation covering:
+vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CPU backend initially developed for the x86 platform. This guide provides installation instructions specific to ARM. For additional details on supported features, refer to the [x86 CPU documentation](#installation-x86) covering:
 
 - CPU backend inference capabilities
 - Relevant runtime environment variables
diff --git a/docs/source/getting_started/cpu-installation.md b/docs/source/getting_started/installation/cpu-x86.md
similarity index 95%
rename from docs/source/getting_started/cpu-installation.md
rename to docs/source/getting_started/installation/cpu-x86.md
index c3d3f715ed804..bbb2d1872ef39 100644
--- a/docs/source/getting_started/cpu-installation.md
+++ b/docs/source/getting_started/installation/cpu-x86.md
@@ -1,6 +1,6 @@
-(installation-cpu)=
+(installation-x86)=
 
-# Installation with CPU
+# Installation for x86 CPUs
 
 vLLM initially supports basic model inferencing and serving on x86 CPU platform, with data types FP32, FP16 and BF16. vLLM CPU backend supports the following vLLM features:
 
@@ -151,4 +151,4 @@ $ python examples/offline_inference.py
     $ VLLM_CPU_KVCACHE_SPACE=40 VLLM_CPU_OMP_THREADS_BIND="0-31|32-63" vllm serve meta-llama/Llama-2-7b-chat-hf -tp=2 --distributed-executor-backend mp
     ```
 
-  - Using Data Parallel for maximum throughput: to launch an LLM serving endpoint on each NUMA node along with one additional load balancer to dispatch the requests to those endpoints. Common solutions like [Nginx](../serving/deploying_with_nginx.md) or HAProxy are recommended. Anyscale Ray project provides the feature on LLM [serving](https://docs.ray.io/en/latest/serve/index.html). Here is the example to setup a scalable LLM serving with [Ray Serve](https://github.com/intel/llm-on-ray/blob/main/docs/setup.md).
+  - Using Data Parallel for maximum throughput: to launch an LLM serving endpoint on each NUMA node along with one additional load balancer to dispatch the requests to those endpoints. Common solutions like [Nginx](#nginxloadbalancer) or HAProxy are recommended. Anyscale Ray project provides the feature on LLM [serving](https://docs.ray.io/en/latest/serve/index.html). Here is the example to setup a scalable LLM serving with [Ray Serve](https://github.com/intel/llm-on-ray/blob/main/docs/setup.md).
diff --git a/docs/source/getting_started/installation.md b/docs/source/getting_started/installation/gpu-cuda.md
similarity index 99%
rename from docs/source/getting_started/installation.md
rename to docs/source/getting_started/installation/gpu-cuda.md
index 996fb346f43d4..7ea10bb8b59ff 100644
--- a/docs/source/getting_started/installation.md
+++ b/docs/source/getting_started/installation/gpu-cuda.md
@@ -1,6 +1,6 @@
-(installation)=
+(installation-cuda)=
 
-# Installation
+# Installation for CUDA
 
 vLLM is a Python library that also contains pre-compiled C++ and CUDA (12.1) binaries.
 
diff --git a/docs/source/getting_started/amd-installation.md b/docs/source/getting_started/installation/gpu-rocm.md
similarity index 99%
rename from docs/source/getting_started/amd-installation.md
rename to docs/source/getting_started/installation/gpu-rocm.md
index 6d01efbbf8828..796911d7305a6 100644
--- a/docs/source/getting_started/amd-installation.md
+++ b/docs/source/getting_started/installation/gpu-rocm.md
@@ -1,6 +1,6 @@
 (installation-rocm)=
 
-# Installation with ROCm
+# Installation for ROCm
 
 vLLM supports AMD GPUs with ROCm 6.2.
 
diff --git a/docs/source/getting_started/gaudi-installation.md b/docs/source/getting_started/installation/hpu-gaudi.md
similarity index 99%
rename from docs/source/getting_started/gaudi-installation.md
rename to docs/source/getting_started/installation/hpu-gaudi.md
index 1f2ee62860dec..94de169f51a73 100644
--- a/docs/source/getting_started/gaudi-installation.md
+++ b/docs/source/getting_started/installation/hpu-gaudi.md
@@ -1,4 +1,6 @@
-# Installation with Intel® Gaudi® AI Accelerators
+(installation-gaudi)=
+
+# Installation for Intel® Gaudi®
 
 This README provides instructions on running vLLM with Intel Gaudi devices.
 
diff --git a/docs/source/getting_started/installation/index.md b/docs/source/getting_started/installation/index.md
new file mode 100644
index 0000000000000..83de1aff409b2
--- /dev/null
+++ b/docs/source/getting_started/installation/index.md
@@ -0,0 +1,19 @@
+(installation-index)=
+
+# Installation
+
+vLLM supports the following hardware platforms:
+
+```{toctree}
+:maxdepth: 1
+
+gpu-cuda
+gpu-rocm
+cpu-x86
+cpu-arm
+hpu-gaudi
+tpu
+xpu
+openvino
+neuron
+```
diff --git a/docs/source/getting_started/neuron-installation.md b/docs/source/getting_started/installation/neuron.md
similarity index 99%
rename from docs/source/getting_started/neuron-installation.md
rename to docs/source/getting_started/installation/neuron.md
index baaeeb9f53a10..431f90537f543 100644
--- a/docs/source/getting_started/neuron-installation.md
+++ b/docs/source/getting_started/installation/neuron.md
@@ -1,6 +1,6 @@
 (installation-neuron)=
 
-# Installation with Neuron
+# Installation for Neuron
 
 vLLM 0.3.3 onwards supports model inferencing and serving on AWS Trainium/Inferentia with Neuron SDK with continuous batching.
 Paged Attention and Chunked Prefill are currently in development and will be available soon.
diff --git a/docs/source/getting_started/openvino-installation.md b/docs/source/getting_started/installation/openvino.md
similarity index 90%
rename from docs/source/getting_started/openvino-installation.md
rename to docs/source/getting_started/installation/openvino.md
index 8b43c0a90447f..60f95fd1c4250 100644
--- a/docs/source/getting_started/openvino-installation.md
+++ b/docs/source/getting_started/installation/openvino.md
@@ -1,8 +1,8 @@
 (installation-openvino)=
 
-# Installation with OpenVINO
+# Installation for OpenVINO
 
-vLLM powered by OpenVINO supports all LLM models from {doc}`vLLM supported models list <../models/supported_models>` and can perform optimal model serving on all x86-64 CPUs with, at least, AVX2 support, as well as on both integrated and discrete Intel® GPUs ([the list of supported GPUs](https://docs.openvino.ai/2024/about-openvino/release-notes-openvino/system-requirements.html#gpu)). OpenVINO vLLM backend supports the following advanced vLLM features:
+vLLM powered by OpenVINO supports all LLM models from [vLLM supported models list](#supported-models) and can perform optimal model serving on all x86-64 CPUs with, at least, AVX2 support, as well as on both integrated and discrete Intel® GPUs ([the list of supported GPUs](https://docs.openvino.ai/2024/about-openvino/release-notes-openvino/system-requirements.html#gpu)). OpenVINO vLLM backend supports the following advanced vLLM features:
 
 - Prefix caching (`--enable-prefix-caching`)
 - Chunked prefill (`--enable-chunked-prefill`)
diff --git a/docs/source/getting_started/tpu-installation.md b/docs/source/getting_started/installation/tpu.md
similarity index 99%
rename from docs/source/getting_started/tpu-installation.md
rename to docs/source/getting_started/installation/tpu.md
index 4d3ac541c90ce..bc93c44fead30 100644
--- a/docs/source/getting_started/tpu-installation.md
+++ b/docs/source/getting_started/installation/tpu.md
@@ -1,6 +1,6 @@
 (installation-tpu)=
 
-# Installation with TPU
+# Installation for TPUs
 
 Tensor Processing Units (TPUs) are Google's custom-developed application-specific
 integrated circuits (ASICs) used to accelerate machine learning workloads. TPUs
diff --git a/docs/source/getting_started/xpu-installation.md b/docs/source/getting_started/installation/xpu.md
similarity index 98%
rename from docs/source/getting_started/xpu-installation.md
rename to docs/source/getting_started/installation/xpu.md
index 9554ae4b7fb44..be4e3b9bd1bc5 100644
--- a/docs/source/getting_started/xpu-installation.md
+++ b/docs/source/getting_started/installation/xpu.md
@@ -1,6 +1,6 @@
 (installation-xpu)=
 
-# Installation with XPU
+# Installation for XPUs
 
 vLLM initially supports basic model inferencing and serving on Intel GPU platform.
 
diff --git a/docs/source/getting_started/quickstart.md b/docs/source/getting_started/quickstart.md
index 9c8b7e4f592c9..ff216f8af30f9 100644
--- a/docs/source/getting_started/quickstart.md
+++ b/docs/source/getting_started/quickstart.md
@@ -23,7 +23,7 @@ $ conda activate myenv
 $ pip install vllm
 ```
 
-Please refer to the {ref}`installation documentation <installation>` for more details on installing vLLM.
+Please refer to the [installation documentation](#installation-index) for more details on installing vLLM.
 
 (offline-batched-inference)=
 
diff --git a/docs/source/getting_started/debugging.md b/docs/source/getting_started/troubleshooting.md
similarity index 94%
rename from docs/source/getting_started/debugging.md
rename to docs/source/getting_started/troubleshooting.md
index 19eb699572a08..5a0310da0f2cb 100644
--- a/docs/source/getting_started/debugging.md
+++ b/docs/source/getting_started/troubleshooting.md
@@ -1,8 +1,8 @@
-(debugging)=
+(troubleshooting)=
 
-# Debugging Tips
+# Troubleshooting
 
-This document outlines some debugging strategies you can consider. If you think you've discovered a bug, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible.
+This document outlines some troubleshooting strategies you can consider. If you think you've discovered a bug, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible.
 
 ```{note}
 Once you've debugged a problem, remember to turn off any debugging environment variables defined, or simply start a new shell to avoid being affected by lingering debugging settings. Otherwise, the system might be slow with debugging functionalities left activated.
@@ -47,6 +47,7 @@ You might also need to set `export NCCL_SOCKET_IFNAME=<your_network_interface>`
 If vLLM crashes and the error trace captures it somewhere around `self.graph.replay()` in `vllm/worker/model_runner.py`, it is a CUDA error inside CUDAGraph.
 To identify the particular CUDA operation that causes the error, you can add `--enforce-eager` to the command line, or `enforce_eager=True` to the {class}`~vllm.LLM` class to disable the CUDAGraph optimization and isolate the exact CUDA operation that causes the error.
 
+(troubleshooting-incorrect-hardware-driver)=
 ## Incorrect hardware/driver
 
 If GPU/CPU communication cannot be established, you can use the following Python script and follow the instructions below to confirm whether the GPU/CPU communication is working correctly.
@@ -139,7 +140,7 @@ A multi-node environment is more complicated than a single-node one. If you see
 Adjust `--nproc-per-node`, `--nnodes`, and `--node-rank` according to your setup, being sure to execute different commands (with different `--node-rank`) on different nodes.
 ```
 
-(debugging-python-multiprocessing)=
+(troubleshooting-python-multiprocessing)=
 ## Python multiprocessing
 
 ### `RuntimeError` Exception
@@ -150,7 +151,7 @@ If you have seen a warning in your logs like this:
 WARNING 12-11 14:50:37 multiproc_worker_utils.py:281] CUDA was previously
     initialized. We must use the `spawn` multiprocessing start method. Setting
     VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. See
-    https://docs.vllm.ai/en/latest/getting_started/debugging.html#python-multiprocessing
+    https://docs.vllm.ai/en/latest/getting_started/troubleshooting.html#python-multiprocessing
     for more information.
 ```
 
diff --git a/docs/source/index.md b/docs/source/index.md
index 34f9c4caebe6f..f390474978790 100644
--- a/docs/source/index.md
+++ b/docs/source/index.md
@@ -50,7 +50,7 @@ For more information, check out the following:
 - [vLLM announcing blog post](https://vllm.ai) (intro to PagedAttention)
 - [vLLM paper](https://arxiv.org/abs/2309.06180) (SOSP 2023)
 - [How continuous batching enables 23x throughput in LLM inference while reducing p50 latency](https://www.anyscale.com/blog/continuous-batching-llm-inference) by Cade Daniel et al.
-- {ref}`vLLM Meetups <meetups>`.
+- [vLLM Meetups](#meetups)
 
 ## Documentation
 
@@ -58,18 +58,11 @@ For more information, check out the following:
 :caption: Getting Started
 :maxdepth: 1
 
-getting_started/installation
-getting_started/amd-installation
-getting_started/openvino-installation
-getting_started/cpu-installation
-getting_started/gaudi-installation
-getting_started/arm-installation
-getting_started/neuron-installation
-getting_started/tpu-installation
-getting_started/xpu-installation
+getting_started/installation/index
 getting_started/quickstart
-getting_started/debugging
 getting_started/examples/examples_index
+getting_started/troubleshooting
+getting_started/faq
 ```
 
 ```{toctree}
@@ -110,7 +103,6 @@ usage/structured_outputs
 usage/spec_decode
 usage/compatibility_matrix
 usage/performance
-usage/faq
 usage/engine_args
 usage/env_vars
 usage/usage_stats
diff --git a/docs/source/models/generative_models.md b/docs/source/models/generative_models.md
index 35e0302b86619..383299d61b5dd 100644
--- a/docs/source/models/generative_models.md
+++ b/docs/source/models/generative_models.md
@@ -120,7 +120,7 @@ outputs = llm.chat(conversation, chat_template=custom_template)
 
 ## Online Inference
 
-Our [OpenAI Compatible Server](../serving/openai_compatible_server.md) provides endpoints that correspond to the offline APIs:
+Our [OpenAI-Compatible Server](#openai-compatible-server) provides endpoints that correspond to the offline APIs:
 
 - [Completions API](#completions-api) is similar to `LLM.generate` but only accepts text.
 - [Chat API](#chat-api)  is similar to `LLM.chat`, accepting both text and [multi-modal inputs](#multimodal-inputs) for models with a chat template.
diff --git a/docs/source/models/pooling_models.md b/docs/source/models/pooling_models.md
index 76c96c9edcc5d..12ded68eb30b5 100644
--- a/docs/source/models/pooling_models.md
+++ b/docs/source/models/pooling_models.md
@@ -106,7 +106,7 @@ A code example can be found here: <gh-file:examples/offline_inference_scoring.py
 
 ## Online Inference
 
-Our [OpenAI Compatible Server](../serving/openai_compatible_server.md) provides endpoints that correspond to the offline APIs:
+Our [OpenAI-Compatible Server](#openai-compatible-server) provides endpoints that correspond to the offline APIs:
 
 - [Pooling API](#pooling-api) is similar to `LLM.encode`, being applicable to all types of pooling models.
 - [Embeddings API](#embeddings-api) is similar to `LLM.embed`, accepting both text and [multi-modal inputs](#multimodal-inputs) for embedding models.
diff --git a/docs/source/serving/distributed_serving.md b/docs/source/serving/distributed_serving.md
index a1dd0e89e8c79..6fbc1ea104678 100644
--- a/docs/source/serving/distributed_serving.md
+++ b/docs/source/serving/distributed_serving.md
@@ -95,7 +95,7 @@ $     --tensor-parallel-size 16
 To make tensor parallel performant, you should make sure the communication between nodes is efficient, e.g. using high-speed network cards like Infiniband. To correctly set up the cluster to use Infiniband, append additional arguments like `--privileged -e NCCL_IB_HCA=mlx5` to the `run_cluster.sh` script. Please contact your system administrator for more information on how to set up the flags. One way to confirm if the Infiniband is working is to run vLLM with `NCCL_DEBUG=TRACE` environment variable set, e.g. `NCCL_DEBUG=TRACE vllm serve ...` and check the logs for the NCCL version and the network used. If you find `[send] via NET/Socket` in the logs, it means NCCL uses raw TCP Socket, which is not efficient for cross-node tensor parallel. If you find `[send] via NET/IB/GDRDMA` in the logs, it means NCCL uses Infiniband with GPU-Direct RDMA, which is efficient.
 
 ```{warning}
-After you start the Ray cluster, you'd better also check the GPU-GPU communication between nodes. It can be non-trivial to set up. Please refer to the [sanity check script](../getting_started/debugging.md) for more information. If you need to set some environment variables for the communication configuration, you can append them to the `run_cluster.sh` script, e.g. `-e NCCL_SOCKET_IFNAME=eth0`. Note that setting environment variables in the shell (e.g. `NCCL_SOCKET_IFNAME=eth0 vllm serve ...`) only works for the processes in the same node, not for the processes in the other nodes. Setting environment variables when you create the cluster is the recommended way. See <gh-issue:6803> for more information.
+After you start the Ray cluster, you'd better also check the GPU-GPU communication between nodes. It can be non-trivial to set up. Please refer to the [sanity check script](#troubleshooting-incorrect-hardware-driver) for more information. If you need to set some environment variables for the communication configuration, you can append them to the `run_cluster.sh` script, e.g. `-e NCCL_SOCKET_IFNAME=eth0`. Note that setting environment variables in the shell (e.g. `NCCL_SOCKET_IFNAME=eth0 vllm serve ...`) only works for the processes in the same node, not for the processes in the other nodes. Setting environment variables when you create the cluster is the recommended way. See <gh-issue:6803> for more information.
 ```
 
 ```{warning}
diff --git a/docs/source/usage/spec_decode.md b/docs/source/usage/spec_decode.md
index 8302da81b6173..8c52c97a41e48 100644
--- a/docs/source/usage/spec_decode.md
+++ b/docs/source/usage/spec_decode.md
@@ -182,7 +182,7 @@ speculative decoding, breaking down the guarantees into three key areas:
 3. **vLLM Logprob Stability**
    \- vLLM does not currently guarantee stable token log probabilities (logprobs). This can result in different outputs for the
    same request across runs. For more details, see the FAQ section
-   titled *Can the output of a prompt vary across runs in vLLM?* in the {ref}`FAQs <faq>`.
+   titled *Can the output of a prompt vary across runs in vLLM?* in the [FAQs](#faq).
 
 **Conclusion**
 
@@ -195,7 +195,7 @@ can occur due to following factors:
 
 **Mitigation Strategies**
 
-For mitigation strategies, please refer to the FAQ entry *Can the output of a prompt vary across runs in vLLM?* in the {ref}`FAQs <faq>`.
+For mitigation strategies, please refer to the FAQ entry *Can the output of a prompt vary across runs in vLLM?* in the [FAQs](#faq).
 
 ## Resources for vLLM contributors
 
diff --git a/docs/source/usage/structured_outputs.md b/docs/source/usage/structured_outputs.md
index 7292012e36a26..26c09bb0d8a0c 100644
--- a/docs/source/usage/structured_outputs.md
+++ b/docs/source/usage/structured_outputs.md
@@ -18,7 +18,7 @@ The following parameters are supported, which must be added as extra parameters:
 - `guided_whitespace_pattern`: used to override the default whitespace pattern for guided json decoding.
 - `guided_decoding_backend`: used to select the guided decoding backend to use.
 
-You can see the complete list of supported parameters on the [OpenAI Compatible Server](../serving/openai_compatible_server.md) page.
+You can see the complete list of supported parameters on the [OpenAI-Compatible Server](#openai-compatible-server)page.
 
 Now let´s see an example for each of the cases, starting with the `guided_choice`, as it´s the easiest one:
 
diff --git a/vllm/utils.py b/vllm/utils.py
index 8ef07d2c326a3..aadeddabf8b55 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -1938,7 +1938,7 @@ def _check_multiproc_method():
                        "the `spawn` multiprocessing start method. Setting "
                        "VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. "
                        "See https://docs.vllm.ai/en/latest/getting_started/"
-                       "debugging.html#python-multiprocessing "
+                       "troubleshooting.html#python-multiprocessing "
                        "for more information.")
         os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
 

From 408e5600158bfa34306cfbd034a3779e488752fa Mon Sep 17 00:00:00 2001
From: Cody Yu <hao.yu.cody@gmail.com>
Date: Sun, 5 Jan 2025 20:49:55 -0800
Subject: [PATCH 083/462] [Bugfix] Remove block size constraint (#11723)

---
 vllm/config.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/vllm/config.py b/vllm/config.py
index b51f9783008b2..b0ed88cb7f42b 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1015,11 +1015,6 @@ def _verify_args(self) -> None:
             raise ValueError(
                 "GPU memory utilization must be less than 1.0. Got "
                 f"{self.gpu_memory_utilization}.")
-        from vllm.platforms import current_platform
-        if (current_platform.is_cuda() and self.block_size is not None
-                and self.block_size > 32):
-            raise ValueError("CUDA Paged Attention kernel only supports "
-                             f"block sizes up to 32. Got {self.block_size}.")
 
     def _verify_cache_dtype(self) -> None:
         if self.cache_dtype == "auto":

From 06bfb51963953d6ae31b87965bfb91b6eca4fd24 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Mon, 6 Jan 2025 14:24:42 +0900
Subject: [PATCH 084/462] [V1] Add BlockTable class (#11693)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
---
 vllm/v1/worker/block_table.py      | 78 ++++++++++++++++++++++++++++++
 vllm/v1/worker/gpu_input_batch.py  | 25 ++++------
 vllm/v1/worker/gpu_model_runner.py | 16 +++---
 3 files changed, 94 insertions(+), 25 deletions(-)
 create mode 100644 vllm/v1/worker/block_table.py

diff --git a/vllm/v1/worker/block_table.py b/vllm/v1/worker/block_table.py
new file mode 100644
index 0000000000000..26a2084b131fa
--- /dev/null
+++ b/vllm/v1/worker/block_table.py
@@ -0,0 +1,78 @@
+from typing import List
+
+import numpy as np
+import torch
+
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+
+class BlockTable:
+
+    def __init__(
+        self,
+        max_num_reqs: int,
+        max_model_len: int,
+        max_num_blocks_per_req: int,
+        pin_memory: bool,
+        device: torch.device,
+    ):
+        self.max_num_reqs = max_num_reqs
+        self.max_model_len = max_model_len
+        self.max_num_blocks_per_req = max_num_blocks_per_req
+        self.pin_memory = pin_memory
+        self.device = device
+
+        self.block_table = torch.zeros(
+            (max_num_reqs, max_num_blocks_per_req),
+            device=self.device,
+            dtype=torch.int32,
+        )
+        self.block_table_cpu = torch.zeros(
+            (max_num_reqs, max_num_blocks_per_req),
+            device="cpu",
+            dtype=torch.int32,
+            pin_memory=pin_memory,
+        )
+        self.block_table_np = self.block_table_cpu.numpy()
+        self.num_blocks_per_row = np.zeros(max_num_reqs, dtype=np.int32)
+
+    def append_row(
+        self,
+        row_idx: int,
+        start: int,
+        block_ids: List[int],
+    ) -> None:
+        num_blocks = len(block_ids)
+        self.block_table_np[row_idx, start:start + num_blocks] = block_ids
+        self.num_blocks_per_row[row_idx] = start + num_blocks
+
+    def add_row(self, row_idx: int, block_ids: List[int]) -> None:
+        self.append_row(row_idx, 0, block_ids)
+
+    def move_row(self, src: int, tgt: int) -> None:
+        num_blocks = self.num_blocks_per_row[src]
+        self.block_table_np[tgt, :num_blocks] = self.block_table_np[
+            src, :num_blocks]
+        self.num_blocks_per_row[tgt] = num_blocks
+
+    def commit(self, num_reqs: int) -> None:
+        self.block_table[:num_reqs].copy_(self.block_table_cpu[:num_reqs],
+                                          non_blocking=True)
+
+    def clear(self) -> None:
+        self.block_table.fill_(0)
+        self.block_table_cpu.fill_(0)
+
+    def get_device_tensor(self) -> torch.Tensor:
+        """Ruturns the device tensor of the block table."""
+        return self.block_table
+
+    def get_cpu_tensor(self) -> torch.Tensor:
+        """Returns the CPU tensor of the block table."""
+        return self.block_table_cpu
+
+    def get_numpy_array(self) -> np.ndarray:
+        """Returns the numpy array of the block table."""
+        return self.block_table_np
diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py
index f8a1427c6c26c..40494e64b22f0 100644
--- a/vllm/v1/worker/gpu_input_batch.py
+++ b/vllm/v1/worker/gpu_input_batch.py
@@ -9,6 +9,7 @@
 from vllm.multimodal import MultiModalKwargs
 from vllm.sampling_params import SamplingParams, SamplingType
 from vllm.v1.sample.metadata import SamplingMetadata
+from vllm.v1.worker.block_table import BlockTable
 
 if TYPE_CHECKING:
     from vllm.multimodal.inputs import PlaceholderRange
@@ -70,19 +71,14 @@ def __init__(
         self.num_prompt_tokens = np.zeros(max_num_reqs, dtype=np.int32)
         self.num_computed_tokens_cpu = np.empty(max_num_reqs, dtype=np.int32)
 
-        # Attention-related.
-        self.block_table = torch.zeros(
-            (max_num_reqs, max_num_blocks_per_req),
-            device=self.device,
-            dtype=torch.int32,
-        )
-        self.block_table_cpu_tensor = torch.zeros(
-            (max_num_reqs, max_num_blocks_per_req),
-            device="cpu",
-            dtype=torch.int32,
+        # Block table.
+        self.block_table = BlockTable(
+            max_num_reqs=max_num_reqs,
+            max_model_len=max_model_len,
+            max_num_blocks_per_req=max_num_blocks_per_req,
             pin_memory=pin_memory,
+            device=device,
         )
-        self.block_table_cpu = self.block_table_cpu_tensor.numpy()
 
         # Sampling-related.
         self.temperature = torch.empty((max_num_reqs, ),
@@ -193,8 +189,7 @@ def add_request(
         self.num_tokens[req_index] = request.num_tokens
 
         self.num_computed_tokens_cpu[req_index] = request.num_computed_tokens
-        num_blocks = len(request.block_ids)
-        self.block_table_cpu[req_index, :num_blocks] = request.block_ids
+        self.block_table.add_row(req_index, request.block_ids)
 
         sampling_params = request.sampling_params
         self.temperature_cpu[req_index] = sampling_params.temperature
@@ -300,9 +295,7 @@ def condense(self, empty_req_indices: List[int]) -> None:
                 self.num_prompt_tokens[last_req_index]
             self.num_computed_tokens_cpu[
                 empty_index] = self.num_computed_tokens_cpu[last_req_index]
-            # TODO(woosuk): Optimize the copy of block_table_cpu.
-            self.block_table_cpu[empty_index] = self.block_table_cpu[
-                last_req_index]
+            self.block_table.move_row(last_req_index, empty_index)
             self.temperature_cpu[empty_index] = self.temperature_cpu[
                 last_req_index]
             self.top_p_cpu[empty_index] = self.top_p_cpu[last_req_index]
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 294c76cfb680e..31e693235f99f 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -211,10 +211,9 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
             if num_new_blocks == 0:
                 continue
             start_index = len(req_state.block_ids)
-            end_index = start_index + num_new_blocks
             req_state.block_ids.extend(req_data.new_block_ids)
-            self.input_batch.block_table_cpu[
-                req_index, start_index:end_index] = req_data.new_block_ids
+            self.input_batch.block_table.append_row(req_index, start_index,
+                                                    req_data.new_block_ids)
 
         req_ids_to_add: List[str] = []
         # Add new requests to the cached states.
@@ -275,9 +274,7 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"):
 
         # OPTIMIZATION: Start copying the block table first.
         # This way, we can overlap the copy with the following CPU operations.
-        self.input_batch.block_table[:num_reqs].copy_(
-            self.input_batch.block_table_cpu_tensor[:num_reqs],
-            non_blocking=True)
+        self.input_batch.block_table.commit(num_reqs)
 
         # Get the number of scheduled tokens for each request.
         # TODO: The Python loop can be slow. Optimize.
@@ -333,8 +330,8 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"):
         # NOTE(woosuk): We use torch.index_select instead of np.take here
         # because torch.index_select is much faster than np.take for large
         # tensors.
-        block_numbers = (self.input_batch.block_table_cpu_tensor.flatten()
-                         [block_table_indices].numpy())
+        block_table_cpu = self.input_batch.block_table.get_cpu_tensor()
+        block_numbers = block_table_cpu.flatten()[block_table_indices].numpy()
         block_offsets = positions_np % self.block_size
         np.add(block_numbers * self.block_size,
                block_offsets,
@@ -450,7 +447,8 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"):
             query_start_loc=query_start_loc,
             max_seq_len=max_seq_len,
             seq_start_loc=seq_start_loc,
-            block_table=self.input_batch.block_table[:num_reqs],
+            block_table=(
+                self.input_batch.block_table.get_device_tensor()[:num_reqs]),
             slot_mapping=slot_mapping,
             use_cascade=use_cascade,
             common_prefix_len=common_prefix_len,

From f8fcca100beada88136944976da88f47f363acab Mon Sep 17 00:00:00 2001
From: Rui Qiao <161574667+ruisearch42@users.noreply.github.com>
Date: Sun, 5 Jan 2025 23:12:38 -0800
Subject: [PATCH 085/462] [Misc] Fix typo for valid_tool_parses  (#11753)

Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
---
 vllm/entrypoints/openai/api_server.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index e942b475535ad..047f699e4f277 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -767,11 +767,11 @@ async def run_server(args, **uvicorn_kwargs) -> None:
     if args.tool_parser_plugin and len(args.tool_parser_plugin) > 3:
         ToolParserManager.import_tool_parser(args.tool_parser_plugin)
 
-    valide_tool_parses = ToolParserManager.tool_parsers.keys()
+    valid_tool_parses = ToolParserManager.tool_parsers.keys()
     if args.enable_auto_tool_choice \
-        and args.tool_call_parser not in valide_tool_parses:
+        and args.tool_call_parser not in valid_tool_parses:
         raise KeyError(f"invalid tool call parser: {args.tool_call_parser} "
-                       f"(chose from {{ {','.join(valide_tool_parses)} }})")
+                       f"(chose from {{ {','.join(valid_tool_parses)} }})")
 
     # workaround to make sure that we bind the port before the engine is set up.
     # This avoids race conditions with ray.

From 022c5c6944bcf28ac4d0d28ce14f2b559358be52 Mon Sep 17 00:00:00 2001
From: Rui Qiao <161574667+ruisearch42@users.noreply.github.com>
Date: Sun, 5 Jan 2025 23:59:16 -0800
Subject: [PATCH 086/462] [V1] Refactor get_executor_cls (#11754)

---
 tests/v1/engine/test_engine_core.py        |  6 +++---
 tests/v1/engine/test_engine_core_client.py |  6 +++---
 vllm/v1/engine/async_llm.py                | 21 +--------------------
 vllm/v1/engine/llm_engine.py               | 20 +-------------------
 vllm/v1/executor/abstract.py               | 19 ++++++++++++++++++-
 5 files changed, 26 insertions(+), 46 deletions(-)

diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py
index 954cec734b956..8dd9b23fbdd5f 100644
--- a/tests/v1/engine/test_engine_core.py
+++ b/tests/v1/engine/test_engine_core.py
@@ -8,8 +8,8 @@
 from vllm.engine.arg_utils import EngineArgs
 from vllm.platforms import current_platform
 from vllm.v1.engine import EngineCoreRequest
-from vllm.v1.engine.async_llm import AsyncLLM
 from vllm.v1.engine.core import EngineCore
+from vllm.v1.executor.abstract import Executor
 
 if not current_platform.is_cuda():
     pytest.skip(reason="V1 currently only supported on CUDA.",
@@ -43,7 +43,7 @@ def test_engine_core(monkeypatch):
         """Setup the EngineCore."""
         engine_args = EngineArgs(model=MODEL_NAME)
         vllm_config = engine_args.create_engine_config()
-        executor_class = AsyncLLM._get_executor_cls(vllm_config)
+        executor_class = Executor.get_class(vllm_config)
 
         engine_core = EngineCore(vllm_config=vllm_config,
                                  executor_class=executor_class)
@@ -149,7 +149,7 @@ def test_engine_core_advanced_sampling(monkeypatch):
         """Setup the EngineCore."""
         engine_args = EngineArgs(model=MODEL_NAME)
         vllm_config = engine_args.create_engine_config()
-        executor_class = AsyncLLM._get_executor_cls(vllm_config)
+        executor_class = Executor.get_class(vllm_config)
 
         engine_core = EngineCore(vllm_config=vllm_config,
                                  executor_class=executor_class)
diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py
index 20d4e6f63b339..5a21806e57a11 100644
--- a/tests/v1/engine/test_engine_core_client.py
+++ b/tests/v1/engine/test_engine_core_client.py
@@ -11,8 +11,8 @@
 from vllm.platforms import current_platform
 from vllm.usage.usage_lib import UsageContext
 from vllm.v1.engine import EngineCoreRequest
-from vllm.v1.engine.async_llm import AsyncLLM
 from vllm.v1.engine.core_client import EngineCoreClient
+from vllm.v1.executor.abstract import Executor
 
 if not current_platform.is_cuda():
     pytest.skip(reason="V1 currently only supported on CUDA.",
@@ -84,7 +84,7 @@ def test_engine_core_client(monkeypatch, multiprocessing_mode: bool):
         engine_args = EngineArgs(model=MODEL_NAME, compilation_config=3)
         vllm_config = engine_args.create_engine_config(
             UsageContext.UNKNOWN_CONTEXT)
-        executor_class = AsyncLLM._get_executor_cls(vllm_config)
+        executor_class = Executor.get_class(vllm_config)
         client = EngineCoreClient.make_client(
             multiprocess_mode=multiprocessing_mode,
             asyncio_mode=False,
@@ -152,7 +152,7 @@ async def test_engine_core_client_asyncio(monkeypatch):
         engine_args = EngineArgs(model=MODEL_NAME)
         vllm_config = engine_args.create_engine_config(
             usage_context=UsageContext.UNKNOWN_CONTEXT)
-        executor_class = AsyncLLM._get_executor_cls(vllm_config)
+        executor_class = Executor.get_class(vllm_config)
         client = EngineCoreClient.make_client(
             multiprocess_mode=True,
             asyncio_mode=True,
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index 0696caf88385d..b963ba74f13f0 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -22,7 +22,6 @@
 from vllm.v1.engine.detokenizer import Detokenizer
 from vllm.v1.engine.processor import Processor
 from vllm.v1.executor.abstract import Executor
-from vllm.v1.executor.ray_utils import initialize_ray_cluster
 
 logger = init_logger(__name__)
 
@@ -105,7 +104,7 @@ def from_engine_args(
         else:
             vllm_config = engine_config
 
-        executor_class = cls._get_executor_cls(vllm_config)
+        executor_class = Executor.get_class(vllm_config)
 
         # Create the AsyncLLM.
         return cls(
@@ -127,24 +126,6 @@ def shutdown(self):
         if handler := getattr(self, "output_handler", None):
             handler.cancel()
 
-    @classmethod
-    def _get_executor_cls(cls, vllm_config: VllmConfig) -> Type[Executor]:
-        executor_class: Type[Executor]
-        distributed_executor_backend = (
-            vllm_config.parallel_config.distributed_executor_backend)
-        if distributed_executor_backend == "ray":
-            initialize_ray_cluster(vllm_config.parallel_config)
-            from vllm.v1.executor.ray_executor import RayExecutor
-            executor_class = RayExecutor
-        elif distributed_executor_backend == "mp":
-            from vllm.v1.executor.multiproc_executor import MultiprocExecutor
-            executor_class = MultiprocExecutor
-        else:
-            assert (distributed_executor_backend is None)
-            from vllm.v1.executor.uniproc_executor import UniprocExecutor
-            executor_class = UniprocExecutor
-        return executor_class
-
     async def add_request(
         self,
         request_id: str,
diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py
index 0bd9b52c9be82..8ced3a34d2da3 100644
--- a/vllm/v1/engine/llm_engine.py
+++ b/vllm/v1/engine/llm_engine.py
@@ -89,7 +89,7 @@ def from_engine_args(
 
         # Create the engine configs.
         vllm_config = engine_args.create_engine_config(usage_context)
-        executor_class = cls._get_executor_cls(vllm_config)
+        executor_class = Executor.get_class(vllm_config)
 
         if VLLM_ENABLE_V1_MULTIPROCESSING:
             logger.debug("Enabling multiprocessing for LLMEngine.")
@@ -103,24 +103,6 @@ def from_engine_args(
                    stat_loggers=stat_loggers,
                    multiprocess_mode=enable_multiprocessing)
 
-    @classmethod
-    def _get_executor_cls(cls, vllm_config: VllmConfig) -> Type[Executor]:
-        executor_class: Type[Executor]
-        distributed_executor_backend = (
-            vllm_config.parallel_config.distributed_executor_backend)
-        if distributed_executor_backend == "ray":
-            from vllm.v1.executor.ray_executor import RayExecutor
-            executor_class = RayExecutor
-        elif distributed_executor_backend == "mp":
-            from vllm.v1.executor.multiproc_executor import MultiprocExecutor
-            executor_class = MultiprocExecutor
-        else:
-            assert (distributed_executor_backend is None)
-            from vllm.v1.executor.uniproc_executor import UniprocExecutor
-            executor_class = UniprocExecutor
-
-        return executor_class
-
     def get_num_unfinished_requests(self) -> int:
         return self.detokenizer.get_num_unfinished_requests()
 
diff --git a/vllm/v1/executor/abstract.py b/vllm/v1/executor/abstract.py
index 564d0447f15a6..5d74d4b01f500 100644
--- a/vllm/v1/executor/abstract.py
+++ b/vllm/v1/executor/abstract.py
@@ -1,5 +1,5 @@
 from abc import ABC, abstractmethod
-from typing import Tuple
+from typing import Tuple, Type
 
 from vllm.config import VllmConfig
 from vllm.v1.outputs import ModelRunnerOutput
@@ -8,6 +8,23 @@
 class Executor(ABC):
     """Abstract class for executors."""
 
+    @staticmethod
+    def get_class(vllm_config: VllmConfig) -> Type["Executor"]:
+        executor_class: Type[Executor]
+        distributed_executor_backend = (
+            vllm_config.parallel_config.distributed_executor_backend)
+        if distributed_executor_backend == "ray":
+            from vllm.v1.executor.ray_executor import RayExecutor
+            executor_class = RayExecutor
+        elif distributed_executor_backend == "mp":
+            from vllm.v1.executor.multiproc_executor import MultiprocExecutor
+            executor_class = MultiprocExecutor
+        else:
+            assert (distributed_executor_backend is None)
+            from vllm.v1.executor.uniproc_executor import UniprocExecutor
+            executor_class = UniprocExecutor
+        return executor_class
+
     @abstractmethod
     def __init__(self, vllm_config: VllmConfig) -> None:
         raise NotImplementedError

From 9c749713f6990a9f9d12e526d9bfc2669dfa8ee6 Mon Sep 17 00:00:00 2001
From: Lucas Tucker <47258766+lucas-tucker@users.noreply.github.com>
Date: Mon, 6 Jan 2025 01:59:36 -0600
Subject: [PATCH 087/462] [mypy] Forward pass function type hints in lora
 (#11740)

Signed-off-by: lucast2021 <lucast2021@headroyce.org>
Co-authored-by: lucast2021 <lucast2021@headroyce.org>
---
 vllm/lora/layers.py                  | 12 +++++++++---
 vllm/lora/models.py                  |  3 ++-
 vllm/model_executor/layers/linear.py |  4 +++-
 3 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py
index 102e40d3f448d..a933ccaecf15e 100644
--- a/vllm/lora/layers.py
+++ b/vllm/lora/layers.py
@@ -405,7 +405,9 @@ def __init__(self, base_layer: ReplicatedLinear) -> None:
         self.output_size = self.base_layer.output_size
         self.n_slices = 1
 
-    def forward(self, input_):
+    def forward(
+        self, input_: torch.Tensor
+    ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]:
         """Forward of ReplicatedLinearWithLoRA
 
         Args:
@@ -496,7 +498,9 @@ def slice_bias(self, bias: torch.Tensor) -> torch.Tensor:
         bias = bias[start_idx:end_idx]
         return bias
 
-    def forward(self, input_):
+    def forward(
+        self, input_: torch.Tensor
+    ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]:
         """Forward of ColumnParallelLinear
 
         Args:
@@ -833,7 +837,9 @@ def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor:
     def slice_bias(self, bias: torch.Tensor) -> torch.Tensor:
         return bias
 
-    def forward(self, input_):
+    def forward(
+        self, input_: torch.Tensor
+    ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]:
         """Forward of RowParallelLinear
 
         Args:
diff --git a/vllm/lora/models.py b/vllm/lora/models.py
index 9cfcc6bba727f..5b7225bdc8f37 100644
--- a/vllm/lora/models.py
+++ b/vllm/lora/models.py
@@ -4,7 +4,7 @@
 import os
 import re
 from dataclasses import dataclass, field
-from typing import Any, Callable, Dict, List, Optional, Sequence, Type
+from typing import Any, Callable, Dict, List, Optional, Sequence, Type, Union
 
 import safetensors.torch
 import torch
@@ -219,6 +219,7 @@ def from_local_checkpoint(
 
         config["vllm_max_position_embeddings"] = max_position_embeddings
         peft_helper = PEFTHelper.from_dict(config)
+        unexpected_modules: List[Union[list[str], str]]
         if os.path.isfile(lora_tensor_path):
             tensors: Dict[str, torch.Tensor] = {}
             # Find unexpected modules.
diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index 33b221b994b2b..48cfb1b221720 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -238,7 +238,9 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
         assert param.size() == loaded_weight.size()
         param.data.copy_(loaded_weight)
 
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
+    def forward(
+        self, x: torch.Tensor
+    ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]:
         bias = self.bias if not self.skip_bias_add else None
         assert self.quant_method is not None
         output = self.quant_method.apply(self, x, bias)

From 2a622d704a4270c8d6fab057e8a545ed86ac35b7 Mon Sep 17 00:00:00 2001
From: Suraj Deshmukh <surajd.service@gmail.com>
Date: Mon, 6 Jan 2025 00:01:22 -0800
Subject: [PATCH 088/462] k8s-config: Update the secret to use stringData
 (#11679)

Signed-off-by: Suraj Deshmukh <surajd.service@gmail.com>
---
 docs/source/serving/deploying_with_k8s.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/serving/deploying_with_k8s.md b/docs/source/serving/deploying_with_k8s.md
index 77f848088ea43..5f9b0e4f55ecc 100644
--- a/docs/source/serving/deploying_with_k8s.md
+++ b/docs/source/serving/deploying_with_k8s.md
@@ -43,7 +43,7 @@ metadata:
   name: hf-token-secret
   namespace: default
 type: Opaque
-data:
+stringData:
   token: "REPLACE_WITH_TOKEN"
 ```
 

From 996357e4808ca5eab97d4c97c7d25b3073f46aab Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Mon, 6 Jan 2025 16:02:21 +0800
Subject: [PATCH 089/462] [VLM] Separate out profiling-related logic (#11746)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 tests/multimodal/test_processing.py           |   7 +-
 vllm/model_executor/models/aria.py            |  79 +++---
 vllm/model_executor/models/blip2.py           |  78 +++---
 vllm/model_executor/models/chameleon.py       |  72 +++---
 vllm/model_executor/models/fuyu.py            |  85 ++++---
 vllm/model_executor/models/llava.py           | 181 +++++++++-----
 vllm/model_executor/models/llava_next.py      |  75 +++---
 .../model_executor/models/llava_next_video.py | 148 ++++++-----
 vllm/model_executor/models/llava_onevision.py | 174 +++++++------
 vllm/model_executor/models/phi3v.py           | 104 ++++----
 vllm/model_executor/models/qwen2_audio.py     |  96 +++++---
 vllm/model_executor/models/qwen2_vl.py        | 231 +++++++++++-------
 vllm/model_executor/models/ultravox.py        |  91 ++++---
 vllm/model_executor/models/vision.py          |  37 +--
 vllm/multimodal/processing.py                 | 152 ++++--------
 vllm/multimodal/profiling.py                  | 121 +++++++++
 vllm/multimodal/registry.py                   |   2 +-
 17 files changed, 1015 insertions(+), 718 deletions(-)
 create mode 100644 vllm/multimodal/profiling.py

diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py
index b32faa699ebf2..75d878217b657 100644
--- a/tests/multimodal/test_processing.py
+++ b/tests/multimodal/test_processing.py
@@ -586,9 +586,10 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid):
     )
 
     processor = processor_factory(ctx, cache=None)
+    profiler = processor.profiling_info
 
     mock_supported_mm_limits = MagicMock(return_value={"image": num_supported})
-    processor.get_supported_mm_limits = mock_supported_mm_limits
+    profiler.get_supported_mm_limits = mock_supported_mm_limits
 
     if is_valid:
         exc_ctx = nullcontext()
@@ -596,7 +597,7 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid):
         exc_ctx = pytest.raises(ValueError, match="this model only supports")
 
     with exc_ctx:
-        processor._get_and_validate_dummy_mm_counts()
+        profiler.get_mm_limits()
 
 
 @pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
@@ -723,7 +724,7 @@ def _test_processing_cache_correctness(
         }
 
         mm_counts = {k: len(vs) for k, vs in mm_data.items()}
-        prompt = baseline_processor._get_dummy_processor_inputs(
+        prompt = baseline_processor.profiling_info.get_dummy_processor_inputs(
             model_config.max_model_len,
             mm_counts,
         ).prompt_text
diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py
index 8f5fd64a90c87..2e649f10c0765 100644
--- a/vllm/model_executor/models/aria.py
+++ b/vllm/model_executor/models/aria.py
@@ -24,8 +24,9 @@
 from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
                                     NestedTensors)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        MultiModalDataItems, ProcessorInputs,
+                                        MultiModalDataItems, ProcessingMixin,
                                         PromptReplacement)
+from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.configs.aria import (AriaMoELMConfig,
                                                   AriaVisionConfig)
@@ -444,18 +445,58 @@ def build_mm_projector(config: PretrainedConfig):
     )
 
 
-class AriaMultiModalProcessor(BaseMultiModalProcessor):
+class AriaProcessingMixin(ProcessingMixin):
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
-        return {"image": None}
+    def _get_hf_config(self):
+        return self.ctx.get_hf_config()
+
+    def _get_vision_config(self) -> AriaVisionConfig:
+        return self._get_hf_config().vision_config
 
     def _get_num_image_tokens(self) -> int:
-        hf_config = self.ctx.get_hf_config()
+        hf_config = self._get_hf_config()
         return max(hf_config.projector_patch_to_query_dict.values())
 
+
+class AriaProfilingInfo(AriaProcessingMixin, BaseProfilingInfo):
+
+    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+        return {"image": None}
+
     def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
         return {"image": self._get_num_image_tokens()}
 
+    def get_dummy_processor_inputs(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> ProcessorInputs:
+        vision_config = self._get_vision_config()
+
+        max_image_size = vision_config.image_size
+        num_images = mm_counts.get("image", 0)
+
+        mm_data = {
+            "image":
+            self._get_dummy_images(width=max_image_size,
+                                   height=max_image_size,
+                                   num_images=num_images)
+        }
+
+        hf_processor = self._get_hf_processor()
+        image_token: str = hf_processor.image_token  # type: ignore
+
+        return ProcessorInputs(
+            prompt_text=image_token * num_images,
+            mm_data=mm_data,
+        )
+
+
+class AriaMultiModalProcessor(AriaProcessingMixin, BaseMultiModalProcessor):
+
+    def _get_profiling_info(self) -> BaseProfilingInfo:
+        return AriaProfilingInfo(self.ctx)
+
     def _get_mm_fields_config(
         self,
         hf_inputs: BatchFeature,
@@ -472,7 +513,7 @@ def _get_prompt_replacements(
         hf_processor_mm_kwargs: Mapping[str, object],
         out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
-        hf_config = self.ctx.get_hf_config()
+        hf_config = self._get_hf_config()
         image_token_id = hf_config.image_token_index
 
         num_image_tokens = self._get_num_image_tokens()
@@ -485,32 +526,6 @@ def _get_prompt_replacements(
             )
         ]
 
-    def _get_dummy_processor_inputs(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-    ) -> ProcessorInputs:
-        hf_config = self.ctx.get_hf_config()
-        vision_config: AriaVisionConfig = hf_config.vision_config
-
-        max_image_size = vision_config.image_size
-        num_images = mm_counts.get("image", 0)
-
-        mm_data = {
-            "image":
-            self._get_dummy_images(width=max_image_size,
-                                   height=max_image_size,
-                                   num_images=num_images)
-        }
-
-        hf_processor = self._get_hf_processor()
-        image_token: str = hf_processor.image_token  # type: ignore
-
-        return ProcessorInputs(
-            prompt_text=image_token * num_images,
-            mm_data=mm_data,
-        )
-
 
 @MULTIMODAL_REGISTRY.register_processor(AriaMultiModalProcessor)
 class AriaForConditionalGeneration(nn.Module, SupportsMultiModal):
diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py
index b3ecb2f22dc19..fd45783f167b4 100644
--- a/vllm/model_executor/models/blip2.py
+++ b/vllm/model_executor/models/blip2.py
@@ -4,8 +4,8 @@
 
 import torch
 import torch.nn as nn
-from transformers import (BatchFeature, Blip2Config, Blip2Processor,
-                          Blip2QFormerConfig, apply_chunking_to_forward)
+from transformers import (BatchFeature, Blip2Config, Blip2QFormerConfig,
+                          apply_chunking_to_forward)
 
 from vllm.attention import AttentionMetadata
 from vllm.config import CacheConfig, VllmConfig
@@ -18,8 +18,9 @@
                                     MultiModalInputsV2, MultiModalKwargs,
                                     NestedTensors, PlaceholderRange)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        MultiModalDataItems, ProcessorInputs,
+                                        MultiModalDataItems, ProcessingMixin,
                                         PromptReplacement)
+from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs
 from vllm.sequence import IntermediateTensors
 
 from .blip import BlipVisionModel
@@ -396,20 +397,52 @@ def forward(
         return sequence_output
 
 
-class Blip2MultiModalProcessor(BaseMultiModalProcessor):
+class Blip2ProcessingMixin(ProcessingMixin):
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
-        return {"image": 1}
+    def _get_hf_config(self):
+        return self.ctx.get_hf_config(Blip2Config)
 
     def _get_num_image_tokens(self) -> int:
-        hf_config = self.ctx.get_hf_config(Blip2Config)
+        hf_config = self._get_hf_config()
         return hf_config.num_query_tokens
 
+
+class Blip2ProfilingInfo(Blip2ProcessingMixin, BaseProfilingInfo):
+
+    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+        return {"image": 1}
+
     def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
         return {"image": self._get_num_image_tokens()}
 
-    def _get_hf_processor(self) -> Blip2Processor:
-        return self.ctx.get_hf_processor(Blip2Processor)
+    def get_dummy_processor_inputs(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> ProcessorInputs:
+        hf_config = self._get_hf_config()
+        vision_config = hf_config.vision_config
+
+        max_image_size = vision_config.image_size
+        num_images = mm_counts.get("image", 0)
+
+        mm_data = {
+            "image":
+            self._get_dummy_images(width=max_image_size,
+                                   height=max_image_size,
+                                   num_images=num_images)
+        }
+
+        return ProcessorInputs(
+            prompt_text="",
+            mm_data=mm_data,
+        )
+
+
+class Blip2MultiModalProcessor(Blip2ProcessingMixin, BaseMultiModalProcessor):
+
+    def _get_profiling_info(self) -> BaseProfilingInfo:
+        return Blip2ProfilingInfo(self.ctx)
 
     def _get_mm_fields_config(
         self,
@@ -427,13 +460,13 @@ def _get_prompt_replacements(
         hf_processor_mm_kwargs: Mapping[str, object],
         out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
-        max_image_tokens = self._get_num_image_tokens()
+        num_image_tokens = self._get_num_image_tokens()
 
         return [
             PromptReplacement(
                 modality="image",
                 target="</s>",
-                replacement="<image>" * max_image_tokens + "</s>",
+                replacement="<image>" * num_image_tokens + "</s>",
             )
         ]
 
@@ -457,29 +490,6 @@ def apply(
 
         return result
 
-    def _get_dummy_processor_inputs(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-    ) -> ProcessorInputs:
-        hf_config = self.ctx.get_hf_config(Blip2Config)
-        vision_config = hf_config.vision_config
-
-        max_image_size = vision_config.image_size
-        num_images = mm_counts.get("image", 0)
-
-        mm_data = {
-            "image":
-            self._get_dummy_images(width=max_image_size,
-                                   height=max_image_size,
-                                   num_images=num_images)
-        }
-
-        return ProcessorInputs(
-            prompt_text="",
-            mm_data=mm_data,
-        )
-
 
 @MULTIMODAL_REGISTRY.register_processor(Blip2MultiModalProcessor)
 class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py
index 1ad44678a591d..73ed73b61ebf9 100644
--- a/vllm/model_executor/models/chameleon.py
+++ b/vllm/model_executor/models/chameleon.py
@@ -31,8 +31,9 @@
                                     MultiModalInputsV2, MultiModalKwargs,
                                     NestedTensors, PlaceholderRange)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        MultiModalDataItems, ProcessorInputs,
+                                        MultiModalDataItems, ProcessingMixin,
                                         PromptReplacement)
+from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs
 from vllm.sequence import IntermediateTensors
 from vllm.utils import print_warning_once
 
@@ -48,20 +49,55 @@ class ChameleonImagePixelInputs(TypedDict):
     """Shape: `(batch_size * num_images, num_channels, height, width)`"""
 
 
-class ChameleonMultiModalProcessor(BaseMultiModalProcessor):
+class ChameleonProcessingMixin(ProcessingMixin):
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
-        return {"image": 1}
+    def _get_hf_config(self):
+        return self.ctx.get_hf_config(ChameleonConfig)
+
+    def _get_hf_processor(self):
+        return self.ctx.get_hf_processor(ChameleonProcessor)
 
     def _get_num_image_tokens(self) -> int:
         processor = self._get_hf_processor()
         return processor.image_seq_length
 
+
+class ChameleonProfilingInfo(ChameleonProcessingMixin, BaseProfilingInfo):
+
+    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+        return {"image": 1}
+
     def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
         return {"image": self._get_num_image_tokens()}
 
-    def _get_hf_processor(self) -> ChameleonProcessor:
-        return self.ctx.get_hf_processor(ChameleonProcessor)
+    def get_dummy_processor_inputs(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> ProcessorInputs:
+        config = self._get_hf_config()
+
+        width = height = config.vq_config.resolution
+        num_images = mm_counts.get("image", 0)
+
+        mm_data = {
+            "image":
+            self._get_dummy_images(width=width,
+                                   height=height,
+                                   num_images=num_images)
+        }
+
+        return ProcessorInputs(
+            prompt_text="<image>" * num_images,
+            mm_data=mm_data,
+        )
+
+
+class ChameleonMultiModalProcessor(ChameleonProcessingMixin,
+                                   BaseMultiModalProcessor):
+
+    def _get_profiling_info(self) -> BaseProfilingInfo:
+        return ChameleonProfilingInfo(self.ctx)
 
     def _get_mm_fields_config(
         self,
@@ -76,7 +112,7 @@ def _get_prompt_replacements(
         hf_processor_mm_kwargs: Mapping[str, object],
         out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
-        processor = self._get_hf_processor()
+        processor = self._get_hf_processor(**hf_processor_mm_kwargs)
 
         return [
             PromptReplacement(
@@ -90,28 +126,6 @@ def _get_prompt_replacements(
             )
         ]
 
-    def _get_dummy_processor_inputs(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-    ) -> ProcessorInputs:
-        config = self.ctx.get_hf_config(ChameleonConfig)
-
-        width = height = config.vq_config.resolution
-        num_images = mm_counts.get("image", 0)
-
-        mm_data = {
-            "image":
-            self._get_dummy_images(width=width,
-                                   height=height,
-                                   num_images=num_images)
-        }
-
-        return ProcessorInputs(
-            prompt_text="<image>" * num_images,
-            mm_data=mm_data,
-        )
-
     def apply(
         self,
         prompt_text: str,
diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py
index 7cd58fbc7cf21..c937fcb0978b9 100644
--- a/vllm/model_executor/models/fuyu.py
+++ b/vllm/model_executor/models/fuyu.py
@@ -35,8 +35,9 @@
                                     NestedTensors, PlaceholderRange)
 from vllm.multimodal.parse import ImageProcessorItems, ImageSize
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        MultiModalDataItems, ProcessorInputs,
+                                        MultiModalDataItems, ProcessingMixin,
                                         PromptReplacement)
+from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsMultiModal, SupportsPP
@@ -63,18 +64,16 @@ class FuyuImagePatchInputs(TypedDict):
     """
 
 
-class FuyuMultiModalProcessor(BaseMultiModalProcessor):
+class FuyuProcessingMixin(ProcessingMixin):
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
-        return {"image": 1}
+    def _get_hf_config(self):
+        return self.ctx.get_hf_config(FuyuConfig)
 
-    def _get_image_target_size(self) -> ImageSize:
-        processor = self._get_hf_processor()
-        image_processor: FuyuImageProcessor = processor.image_processor
+    def _get_hf_processor(self):
+        return self.ctx.get_hf_processor(FuyuProcessor)
 
-        target_size = image_processor.size
-        return ImageSize(width=target_size["width"],
-                         height=target_size["height"])
+    def _get_image_processor(self) -> FuyuImageProcessor:
+        return self._get_hf_processor().image_processor
 
     def _get_image_feature_grid_size(
         self,
@@ -82,7 +81,9 @@ def _get_image_feature_grid_size(
         image_width: int,
         image_height: int,
     ) -> tuple[int, int]:
-        target_width, target_height = self._get_image_target_size()
+        image_processor = self._get_image_processor()
+        target_width = image_processor.size["width"]
+        target_height = image_processor.size["height"]
 
         if not (image_width <= target_width and image_height <= target_height):
             height_scale_factor = target_height / image_height
@@ -96,8 +97,14 @@ def _get_image_feature_grid_size(
         nrows = math.ceil(image_height / 30)
         return ncols, nrows
 
+
+class FuyuProfilingInfo(FuyuProcessingMixin, BaseProfilingInfo):
+
+    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+        return {"image": 1}
+
     def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
-        target_width, target_height = self._get_image_target_size()
+        target_width, target_height = self._get_image_size_with_most_features()
 
         max_ncols, max_nrows = self._get_image_feature_grid_size(
             image_width=target_width,
@@ -107,8 +114,36 @@ def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
 
         return {"image": max_image_tokens}
 
-    def _get_hf_processor(self) -> FuyuProcessor:
-        return self.ctx.get_hf_processor(FuyuProcessor)
+    def _get_image_size_with_most_features(self) -> ImageSize:
+        image_processor = self._get_image_processor()
+        return ImageSize(width=image_processor.size["width"],
+                         height=image_processor.size["height"])
+
+    def get_dummy_processor_inputs(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> ProcessorInputs:
+        target_width, target_height = self._get_image_size_with_most_features()
+        num_images = mm_counts.get("image", 0)
+
+        mm_data = {
+            "image":
+            self._get_dummy_images(width=target_width,
+                                   height=target_height,
+                                   num_images=num_images)
+        }
+
+        return ProcessorInputs(
+            prompt_text="",
+            mm_data=mm_data,
+        )
+
+
+class FuyuMultiModalProcessor(FuyuProcessingMixin, BaseMultiModalProcessor):
+
+    def _get_profiling_info(self) -> BaseProfilingInfo:
+        return FuyuProfilingInfo(self.ctx)
 
     def _call_hf_processor(
         self,
@@ -161,7 +196,7 @@ def _get_prompt_replacements(
         hf_processor_mm_kwargs: Mapping[str, object],
         out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
-        hf_config = self.ctx.get_hf_config(FuyuConfig)
+        hf_config = self._get_hf_config()
         bos_token_id = hf_config.bos_token_id
 
         tokenizer = self._get_tokenizer()
@@ -208,26 +243,6 @@ def apply(
 
         return result
 
-    def _get_dummy_processor_inputs(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-    ) -> ProcessorInputs:
-        target_width, target_height = self._get_image_target_size()
-        num_images = mm_counts.get("image", 0)
-
-        mm_data = {
-            "image":
-            self._get_dummy_images(width=target_width,
-                                   height=target_height,
-                                   num_images=num_images)
-        }
-
-        return ProcessorInputs(
-            prompt_text="",
-            mm_data=mm_data,
-        )
-
 
 @MULTIMODAL_REGISTRY.register_processor(FuyuMultiModalProcessor)
 class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index d522378e0bebb..4299af8cd03a2 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -1,4 +1,4 @@
-from abc import abstractmethod
+from abc import ABC, abstractmethod
 from functools import cached_property
 from typing import (Final, Iterable, List, Literal, Mapping, Optional,
                     Protocol, Set, Tuple, TypedDict, Union)
@@ -13,6 +13,7 @@
 
 from vllm.attention import AttentionMetadata
 from vllm.config import VllmConfig
+from vllm.inputs import InputProcessingContext
 from vllm.model_executor.layers.activation import get_act_fn
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                                RowParallelLinear)
@@ -25,9 +26,10 @@
                                     NestedTensors)
 from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
                                    ImageSize)
-from vllm.multimodal.processing import (InputProcessingContext,
+from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                         MultiModalDataItems, ProcessingCache,
-                                        ProcessorInputs, PromptReplacement)
+                                        ProcessingMixin, PromptReplacement)
+from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs
 from vllm.sequence import IntermediateTensors
 
 from .clip import CLIPVisionModel
@@ -37,7 +39,7 @@
 from .siglip import SiglipVisionModel
 from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model,
                     maybe_prefix, merge_multimodal_embeddings)
-from .vision import BaseVisionLanguageMultiModalProcessor
+from .vision import get_vision_encoder_info
 
 
 class LlavaImagePixelInputs(TypedDict):
@@ -94,30 +96,42 @@ def forward(self, image_features: torch.Tensor) -> torch.Tensor:
 
 class LlavaLikeConfig(Protocol):
     vision_config: Final[PretrainedConfig]
+    image_token_index: Final[int]
     vision_feature_select_strategy: Final[str]
-    vision_feature_layer: Final[Union[int, List[int]]]
+    vision_feature_layer: Final[Union[int, list[int]]]
 
 
-class BaseLlavaMultiModalProcessor(BaseVisionLanguageMultiModalProcessor):
+class LlavaLikeProcessor(Protocol):
+    image_token: Final[str]
+
+
+class BaseLlavaProcessingMixin(ProcessingMixin, ABC):
 
-    @abstractmethod
     def _get_hf_config(self) -> LlavaLikeConfig:
-        raise NotImplementedError
+        return self.ctx.get_hf_config(LlavaConfig)
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
-        return {"image": None}
+    def _get_vision_encoder_info(self):
+        return get_vision_encoder_info(self._get_hf_config())
 
-    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
-        return {"image": self._get_max_image_tokens()}
+    @abstractmethod
+    def _get_hf_processor(self) -> LlavaLikeProcessor:
+        raise NotImplementedError
 
-    def _get_mm_fields_config(
+    def _get_num_image_tokens(
         self,
-        hf_inputs: BatchFeature,
-        hf_processor_mm_kwargs: Mapping[str, object],
-    ) -> Mapping[str, MultiModalFieldConfig]:
-        return dict(
-            pixel_values=MultiModalFieldConfig.batched("image"),
-            image_embeds=MultiModalFieldConfig.batched("image"),
+        *,
+        image_width: int,
+        image_height: int,
+    ) -> int:
+        hf_config = self._get_hf_config()
+        vision_encoder_info = self._get_vision_encoder_info()
+
+        return self._apply_feature_select_strategy(
+            hf_config.vision_feature_select_strategy,
+            vision_encoder_info.get_num_image_tokens(
+                image_width=image_width,
+                image_height=image_height,
+            ),
         )
 
     def _apply_feature_select_strategy(
@@ -133,31 +147,38 @@ def _apply_feature_select_strategy(
         msg = f"Unexpected feature select strategy: {strategy!r}"
         raise NotImplementedError(msg)
 
-    def _get_max_image_tokens(self) -> int:
-        hf_config = self._get_hf_config()
 
-        return self._apply_feature_select_strategy(
-            hf_config.vision_feature_select_strategy,
-            self._vision_encoder_info.get_max_image_tokens(),
-        )
+class BaseLlavaProfilingInfo(BaseLlavaProcessingMixin, BaseProfilingInfo):
 
-    def _get_dummy_image_size(self) -> ImageSize:
-        image_size = self._vision_encoder_info.get_image_size()
-        return ImageSize(image_size, image_size)
+    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+        return {"image": None}
 
-    @abstractmethod
-    def _get_image_token(self) -> str:
-        raise NotImplementedError
+    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
+        return {"image": self._get_max_image_tokens()}
+
+    def _get_image_size_with_most_features(self) -> ImageSize:
+        vision_encoder_info = self._get_vision_encoder_info()
+        width = height = vision_encoder_info.get_image_size()
+        return ImageSize(width=width, height=height)
 
-    def _get_dummy_processor_inputs(
+    def _get_max_image_tokens(self) -> int:
+        target_width, target_height = self._get_image_size_with_most_features()
+
+        return self._get_num_image_tokens(
+            image_width=target_width,
+            image_height=target_height,
+        )
+
+    def get_dummy_processor_inputs(
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
     ) -> ProcessorInputs:
         num_images = mm_counts.get("image", 0)
 
-        image_token = self._get_image_token()
-        target_width, target_height = self._get_dummy_image_size()
+        processor = self._get_hf_processor()
+        image_token = processor.image_token
+        target_width, target_height = self._get_image_size_with_most_features()
 
         mm_data = {
             "image":
@@ -172,32 +193,32 @@ def _get_dummy_processor_inputs(
         )
 
 
-class LlavaMultiModalProcessor(BaseLlavaMultiModalProcessor):
-
-    def _get_hf_config(self) -> LlavaConfig:
-        return self.ctx.get_hf_config(LlavaConfig)
+class LlavaProcessingMixin(BaseLlavaProcessingMixin):
 
-    def _get_hf_processor(self) -> LlavaProcessor:
+    def _get_hf_processor(self):
         return self.ctx.get_hf_processor(LlavaProcessor)
 
-    def _get_image_token(self) -> str:
-        return self._get_hf_processor().image_token
 
-    def _get_num_image_tokens(
-        self,
-        *,
-        image_width: int,
-        image_height: int,
-    ) -> int:
-        hf_config = self._get_hf_config()
+class LlavaProfilingInfo(LlavaProcessingMixin, BaseLlavaProfilingInfo):
+    pass
 
-        return self._apply_feature_select_strategy(
-            hf_config.vision_feature_select_strategy,
-            self._vision_encoder_info.get_num_image_tokens(
-                image_width=image_width,
-                image_height=image_height,
-            ),
-        )
+
+class BaseLlavaMultiModalProcessor(LlavaProcessingMixin,
+                                   BaseMultiModalProcessor):
+
+    # Copied from BaseMultiModalProcessor
+    @abstractmethod
+    def _get_profiling_info(self) -> BaseProfilingInfo:
+        raise NotImplementedError
+
+    # Copied from BaseMultiModalProcessor
+    @abstractmethod
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        raise NotImplementedError
 
     def _get_prompt_replacements(
         self,
@@ -232,16 +253,37 @@ def get_replacement(item_idx: int):
         ]
 
 
-class PixtralHFMultiModalProcessor(BaseLlavaMultiModalProcessor):
+class LlavaMultiModalProcessor(BaseLlavaMultiModalProcessor):
+
+    def _get_profiling_info(self) -> BaseProfilingInfo:
+        return LlavaProfilingInfo(self.ctx)
+
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        return dict(
+            pixel_values=MultiModalFieldConfig.batched("image"),
+            image_embeds=MultiModalFieldConfig.batched("image"),
+        )
 
-    def _get_hf_config(self) -> LlavaConfig:
-        return self.ctx.get_hf_config(LlavaConfig)
 
-    def _get_hf_processor(self) -> PixtralProcessor:
+class PixtralHFProcessingMixin(BaseLlavaProcessingMixin):
+
+    def _get_hf_processor(self):
         return self.ctx.get_hf_processor(PixtralProcessor)
 
-    def _get_image_token(self) -> str:
-        return self._get_hf_processor().image_token
+
+class PixtralHFProfilingInfo(PixtralHFProcessingMixin, BaseLlavaProfilingInfo):
+    pass
+
+
+class PixtralHFMultiModalProcessor(PixtralHFProcessingMixin,
+                                   BaseMultiModalProcessor):
+
+    def _get_profiling_info(self) -> BaseProfilingInfo:
+        return PixtralHFProfilingInfo(self.ctx)
 
     def _call_hf_processor(
         self,
@@ -270,6 +312,16 @@ def _call_hf_processor(
 
         return processed_outputs
 
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        return dict(
+            pixel_values=MultiModalFieldConfig.batched("image"),
+            image_embeds=MultiModalFieldConfig.batched("image"),
+        )
+
     def _get_prompt_replacements(
         self,
         mm_items: MultiModalDataItems,
@@ -316,7 +368,7 @@ def _build_llava_or_pixtral_hf_processor(
     *,
     cache: Optional[ProcessingCache] = None,
     enable_sanity_checks: bool = True,
-) -> BaseLlavaMultiModalProcessor:
+) -> BaseMultiModalProcessor:
     hf_config = ctx.get_hf_config(LlavaConfig)
 
     if isinstance(hf_config.vision_config, PixtralVisionConfig):
@@ -663,16 +715,13 @@ def load_weights(self, weights: Iterable[Tuple[str,
 
 class MantisMultiModalProcessor(LlavaMultiModalProcessor):
 
-    def _get_hf_processor(self):
-        return self.ctx.get_hf_processor(LlavaProcessor)
-
     def apply(
         self,
         prompt_text: str,
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
     ) -> MultiModalInputsV2:
-        hf_config = self.ctx.get_hf_config(LlavaConfig)
+        hf_config = self._get_hf_config()
         image_token_id = hf_config.image_token_index
 
         # Assume that it doesn't depend on the image size
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index f79021596f915..c76ec164a3087 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -1,6 +1,6 @@
 from functools import cached_property
-from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple,
-                    TypedDict, Union)
+from typing import (Final, Iterable, List, Literal, Mapping, Optional,
+                    Protocol, Set, Tuple, TypedDict, Union)
 
 import numpy as np
 import torch
@@ -17,12 +17,14 @@
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import MultiModalFieldConfig, NestedTensors
 from vllm.multimodal.parse import ImageSize
+from vllm.multimodal.profiling import BaseProfilingInfo
 from vllm.sequence import IntermediateTensors
 
 from .clip import CLIPVisionModel
 from .interfaces import SupportsMultiModal, SupportsPP
-from .llava import (LlavaMultiModalProcessor, LlavaMultiModalProjector,
-                    init_vision_tower_for_llava)
+from .llava import (BaseLlavaMultiModalProcessor, BaseLlavaProcessingMixin,
+                    BaseLlavaProfilingInfo, LlavaLikeConfig,
+                    LlavaMultiModalProjector, init_vision_tower_for_llava)
 from .siglip import SiglipVisionModel
 from .utils import (AutoWeightsLoader, embed_multimodal, flatten_bn,
                     init_vllm_registered_model, maybe_prefix)
@@ -60,35 +62,17 @@ class LlavaNextImageEmbeddingInputs(TypedDict):
                              LlavaNextImageEmbeddingInputs]
 
 
-class LlavaNextMultiModalProcessor(LlavaMultiModalProcessor):
+class LlavaNextLikeConfig(LlavaLikeConfig, Protocol):
+    image_grid_pinpoints: Final[list[list[int]]]
 
-    def _get_hf_config(self) -> LlavaNextConfig:
-        return self.ctx.get_hf_config(LlavaNextConfig)
 
-    def _get_hf_processor(self) -> LlavaNextProcessor:
-        return self.ctx.get_hf_processor(LlavaNextProcessor)
+class LlavaNextProcessingMixin(BaseLlavaProcessingMixin):
 
-    def _get_mm_fields_config(
-        self,
-        hf_inputs: BatchFeature,
-        hf_processor_mm_kwargs: Mapping[str, object],
-    ) -> Mapping[str, MultiModalFieldConfig]:
-        return dict(
-            pixel_values=MultiModalFieldConfig.batched("image"),
-            image_sizes=MultiModalFieldConfig.batched("image"),
-            image_embeds=MultiModalFieldConfig.batched("image"),
-        )
-
-    def _get_image_token(self) -> str:
-        return self._get_hf_processor().image_token
-
-    def _get_max_image_tokens(self) -> int:
-        largest_feature_size, _ = self._get_pinpoint_with_most_features()
-        return largest_feature_size
+    def _get_hf_config(self) -> LlavaNextLikeConfig:
+        return self.ctx.get_hf_config(LlavaNextConfig)
 
-    def _get_dummy_image_size(self) -> ImageSize:
-        _, pinpoint = self._get_pinpoint_with_most_features()
-        return pinpoint
+    def _get_hf_processor(self):
+        return self.ctx.get_hf_processor(LlavaNextProcessor)
 
     # Based on: https://github.com/huggingface/text-generation-inference/blob/v2.2.0/server/text_generation_server/models/vlm_causal_lm.py#L106
     def _get_num_image_tokens(
@@ -98,7 +82,7 @@ def _get_num_image_tokens(
         image_height: int,
     ) -> int:
         hf_config = self._get_hf_config()
-        vision_encoder_info = self._vision_encoder_info
+        vision_encoder_info = self._get_vision_encoder_info()
 
         base_feature_size = self._apply_feature_select_strategy(
             hf_config.vision_feature_select_strategy,
@@ -140,7 +124,7 @@ def _get_num_unpadded_features(
         current_height = npatches * num_patch_height
         current_width = npatches * num_patch_width
 
-        # NOTE: HF resizes based on float32
+        # NOTE: Use float32 to remain consistent with HF output
         original_aspect_ratio = np.array(original_width / original_height,
                                          dtype=np.float32)
         current_aspect_ratio = np.array(current_width / current_height,
@@ -164,11 +148,10 @@ def _get_num_unpadded_features(
 
         return (unpadded_features, newline_features)
 
-    def _get_pinpoint_with_most_features(self) -> tuple[int, ImageSize]:
-        """
-        Get the grid pinpoint with the most features and
-        the corresponding feature size.
-        """
+
+class LlavaNextProfilingInfo(LlavaNextProcessingMixin, BaseLlavaProfilingInfo):
+
+    def _get_image_size_with_most_features(self) -> ImageSize:
         hf_config = self._get_hf_config()
 
         largest_feature_size, largest_feature_pinpoint = 0, None
@@ -183,7 +166,25 @@ def _get_pinpoint_with_most_features(self) -> tuple[int, ImageSize]:
         if largest_feature_size == 0 or largest_feature_pinpoint is None:
             raise ValueError("Cannot have a largest feature size of 0!")
 
-        return largest_feature_size, largest_feature_pinpoint
+        return largest_feature_pinpoint
+
+
+class LlavaNextMultiModalProcessor(LlavaNextProcessingMixin,
+                                   BaseLlavaMultiModalProcessor):
+
+    def _get_profiling_info(self) -> BaseProfilingInfo:
+        return LlavaNextProfilingInfo(self.ctx)
+
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        return dict(
+            pixel_values=MultiModalFieldConfig.batched("image"),
+            image_sizes=MultiModalFieldConfig.batched("image"),
+            image_embeds=MultiModalFieldConfig.batched("image"),
+        )
 
 
 @MULTIMODAL_REGISTRY.register_processor(LlavaNextMultiModalProcessor)
diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py
index ee6b89f0d4498..6e82cee1c95a4 100644
--- a/vllm/model_executor/models/llava_next_video.py
+++ b/vllm/model_executor/models/llava_next_video.py
@@ -15,11 +15,14 @@
 from vllm.model_executor.models.clip import CLIPVisionModel
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors
-from vllm.multimodal.parse import (ImageSize, MultiModalDataItems,
-                                   VideoEmbeddingItems, VideoProcessorItems)
-from vllm.multimodal.processing import (MultiModalFieldConfig, ProcessorInputs,
+from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
+                                    NestedTensors)
+from vllm.multimodal.parse import (ImageSize, VideoEmbeddingItems,
+                                   VideoProcessorItems)
+from vllm.multimodal.processing import (BaseMultiModalProcessor,
+                                        MultiModalDataItems, ProcessingMixin,
                                         PromptReplacement)
+from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs
 from vllm.sequence import IntermediateTensors
 from vllm.utils import is_list_of
 
@@ -28,7 +31,7 @@
 from .siglip import SiglipVisionModel
 from .utils import (AutoWeightsLoader, init_vllm_registered_model,
                     maybe_prefix, merge_multimodal_embeddings)
-from .vision import BaseVisionLanguageMultiModalProcessor
+from .vision import get_vision_encoder_info
 
 
 class LlavaNextVideoPixelInputs(TypedDict):
@@ -44,29 +47,16 @@ class LlavaNextVideoPixelInputs(TypedDict):
     """
 
 
-class LlavaNextVideoMultiModalProcessor(BaseVisionLanguageMultiModalProcessor):
+class LlavaNextVideoProcessingMixin(ProcessingMixin):
 
-    def _get_hf_config(self) -> LlavaNextVideoConfig:
+    def _get_hf_config(self):
         return self.ctx.get_hf_config(LlavaNextVideoConfig)
 
-    def _get_hf_processor(self) -> LlavaNextVideoProcessor:
-        return self.ctx.get_hf_processor(LlavaNextVideoProcessor)
-
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
-        return {"video": 1}
-
-    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
-        num_frames = self._get_dummy_num_frames(seq_len)
-        max_video_tokens = self._get_max_video_tokens(num_frames)
-
-        return {"video": max_video_tokens}
+    def _get_vision_encoder_info(self):
+        return get_vision_encoder_info(self._get_hf_config())
 
-    def _get_mm_fields_config(
-        self,
-        hf_inputs: BatchFeature,
-        hf_processor_mm_kwargs: Mapping[str, object],
-    ) -> Mapping[str, MultiModalFieldConfig]:
-        return dict(pixel_values_videos=MultiModalFieldConfig.batched("video"))
+    def _get_hf_processor(self):
+        return self.ctx.get_hf_processor(LlavaNextVideoProcessor)
 
     def _get_num_frame_tokens(
         self,
@@ -77,7 +67,8 @@ def _get_num_frame_tokens(
         hf_config = self._get_hf_config()
         spatial_pool_stride = hf_config.spatial_pool_stride
 
-        patch_grid_length = self._vision_encoder_info.get_patch_grid_length()
+        vision_encoder_info = self._get_vision_encoder_info()
+        patch_grid_length = vision_encoder_info.get_patch_grid_length()
         pooled_grid_length = math.ceil(patch_grid_length / spatial_pool_stride)
 
         return pooled_grid_length * pooled_grid_length
@@ -96,18 +87,43 @@ def _get_num_video_tokens(
 
         return num_frame_tokens * num_frames
 
-    def _get_max_video_tokens(self, num_frames: int) -> int:
-        return self._get_num_video_tokens(image_width=999999,
-                                          image_height=999999,
-                                          num_frames=num_frames)
+
+class LlavaNextVideoProfilingInfo(LlavaNextVideoProcessingMixin,
+                                  BaseProfilingInfo):
+
+    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+        return {"video": 1}
+
+    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
+        target_width, target_height = self._get_image_size_with_most_features()
+
+        max_video_tokens = self._get_num_video_tokens(
+            image_width=target_width,
+            image_height=target_height,
+            num_frames=self._get_dummy_num_frames(seq_len),
+        )
+
+        return {"video": max_video_tokens}
+
+    def _get_image_size_with_most_features(self) -> ImageSize:
+        vision_encoder_info = self._get_vision_encoder_info()
+        width = height = vision_encoder_info.get_image_size()
+        return ImageSize(width=width, height=height)
 
     def _get_max_video_frames(self, max_tokens: int) -> int:
+        target_width, target_height = self._get_image_size_with_most_features()
+
         num_frames = 0
 
         while True:
             next_num_frames = num_frames + 1
+            next_max_tokens = self._get_num_video_tokens(
+                image_width=target_width,
+                image_height=target_height,
+                num_frames=next_num_frames,
+            )
 
-            if self._get_max_video_tokens(next_num_frames) > max_tokens:
+            if next_max_tokens > max_tokens:
                 break
 
             num_frames = next_num_frames
@@ -122,12 +138,45 @@ def _get_dummy_num_frames(self, seq_len: int) -> int:
 
         return max(max_total_frames // max(max_videos, 1), 1)
 
-    def _get_dummy_image_size(self) -> ImageSize:
-        image_size = self._vision_encoder_info.get_image_size()
-        return ImageSize(image_size, image_size)
+    def get_dummy_processor_inputs(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> ProcessorInputs:
+        num_videos = mm_counts.get("video", 0)
+
+        processor = self._get_hf_processor()
+        video_token = processor.video_token
+        target_width, target_height = self._get_image_size_with_most_features()
+
+        mm_data = {
+            "video":
+            self._get_dummy_videos(
+                width=target_width,
+                height=target_height,
+                num_frames=self._get_dummy_num_frames(seq_len),
+                num_videos=num_videos,
+            )
+        }
+
+        return ProcessorInputs(
+            prompt_text=video_token * num_videos,
+            mm_data=mm_data,
+        )
+
 
-    def _get_video_token(self) -> str:
-        return self._get_hf_processor().video_token
+class LlavaNextVideoMultiModalProcessor(LlavaNextVideoProcessingMixin,
+                                        BaseMultiModalProcessor):
+
+    def _get_profiling_info(self) -> BaseProfilingInfo:
+        return LlavaNextVideoProfilingInfo(self.ctx)
+
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        return dict(pixel_values_videos=MultiModalFieldConfig.batched("video"))
 
     def _get_prompt_replacements(
         self,
@@ -162,36 +211,11 @@ def get_replacement(item_idx: int):
             ),
         ]
 
-    def _get_dummy_processor_inputs(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-    ) -> ProcessorInputs:
-        num_videos = mm_counts.get("video", 0)
-
-        video_token = self._get_video_token()
-        target_width, target_height = self._get_dummy_image_size()
-
-        mm_data = {
-            "video":
-            self._get_dummy_videos(
-                width=target_width,
-                height=target_height,
-                num_frames=self._get_dummy_num_frames(seq_len),
-                num_videos=num_videos,
-            )
-        }
-
-        return ProcessorInputs(
-            prompt_text=video_token * num_videos,
-            mm_data=mm_data,
-        )
-
 
 # adopted from transformers modeling_llava_next_video.py
 class LlavaNextVideoPooler(nn.Module):
 
-    def __init__(self, config):
+    def __init__(self, config: LlavaNextVideoConfig):
         super().__init__()
 
         mode = config.spatial_pool_mode
@@ -209,7 +233,7 @@ def __init__(self, config):
             raise ValueError(
                 f"Unknown pooling mode: {mode}. Expected [`average`, `max`]")
 
-    def forward(self, image_features):
+    def forward(self, image_features: torch.Tensor):
         ori_width = int(
             math.sqrt(image_features.shape[1] * self.image_size //
                       self.image_size))
diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py
index 5a3cdadc47cac..6dccc1e0d3b8d 100644
--- a/vllm/model_executor/models/llava_onevision.py
+++ b/vllm/model_executor/models/llava_onevision.py
@@ -1,7 +1,7 @@
 import math
 from functools import cached_property
-from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple,
-                    TypedDict, Union)
+from typing import (Final, Iterable, List, Literal, Mapping, Optional,
+                    Protocol, Set, Tuple, TypedDict, Union)
 
 import numpy as np
 import torch
@@ -21,15 +21,16 @@
 from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors
 from vllm.multimodal.parse import (MultiModalDataItems, VideoEmbeddingItems,
                                    VideoProcessorItems)
-from vllm.multimodal.processing import (MultiModalFieldConfig, ProcessorInputs,
-                                        PromptReplacement)
+from vllm.multimodal.processing import MultiModalFieldConfig, PromptReplacement
+from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs
 from vllm.sequence import IntermediateTensors
 from vllm.utils import is_list_of
 
 from .clip import CLIPVisionModel
 from .interfaces import SupportsMultiModal, SupportsPP
-from .llava import init_vision_tower_for_llava
-from .llava_next import LlavaNextMultiModalProcessor
+from .llava import BaseLlavaProfilingInfo, init_vision_tower_for_llava
+from .llava_next import (LlavaNextLikeConfig, LlavaNextMultiModalProcessor,
+                         LlavaNextProcessingMixin)
 from .siglip import SiglipVisionModel
 from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model,
                     maybe_prefix, merge_multimodal_embeddings)
@@ -82,39 +83,17 @@ class LlavaOnevisionImageEmbeddingInputs(TypedDict):
                                   LlavaOnevisionVideoPixelInputs]
 
 
-class LlavaOnevisionMultiModalProcessor(LlavaNextMultiModalProcessor):
+class LlavaOnevisionLikeConfig(LlavaNextLikeConfig, Protocol):
+    video_token_index: Final[int]
 
-    def _get_hf_config(self) -> LlavaOnevisionConfig:
-        return self.ctx.get_hf_config(LlavaOnevisionConfig)
-
-    def _get_hf_processor(self) -> LlavaOnevisionProcessor:
-        return self.ctx.get_hf_processor(LlavaOnevisionProcessor)
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
-        return {"image": None, "video": None}
-
-    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
-        max_image_tokens = self._get_max_image_tokens()
+class LlavaOnevisionProcessingMixin(LlavaNextProcessingMixin):
 
-        num_frames = self._get_dummy_num_frames(seq_len)
-        max_video_tokens = self._get_max_video_tokens(num_frames)
-
-        return {
-            "image": max_image_tokens,
-            "video": max_video_tokens,
-        }
+    def _get_hf_config(self) -> LlavaOnevisionLikeConfig:
+        return self.ctx.get_hf_config(LlavaOnevisionConfig)
 
-    def _get_mm_fields_config(
-        self,
-        hf_inputs: BatchFeature,
-        hf_processor_mm_kwargs: Mapping[str, object],
-    ) -> Mapping[str, MultiModalFieldConfig]:
-        return dict(
-            pixel_values=MultiModalFieldConfig.batched("image"),
-            image_sizes=MultiModalFieldConfig.batched("image"),
-            image_embeds=MultiModalFieldConfig.batched("image"),
-            pixel_values_videos=MultiModalFieldConfig.batched("video"),
-        )
+    def _get_hf_processor(self):
+        return self.ctx.get_hf_processor(LlavaOnevisionProcessor)
 
     def _get_num_unpadded_features(
         self,
@@ -128,7 +107,7 @@ def _get_num_unpadded_features(
         current_height = npatches * num_patch_height
         current_width = npatches * num_patch_width
 
-        # NOTE: HF resizes based on float32
+        # NOTE: Use float32 to remain consistent with HF output
         original_aspect_ratio = np.array(original_width / original_height,
                                          dtype=np.float32)
         current_aspect_ratio = np.array(current_width / current_height,
@@ -167,7 +146,8 @@ def _get_num_frame_tokens(
         hf_config = self._get_hf_config()
         spatial_pool_stride = getattr(hf_config, "spatial_pool_stride", 2)
 
-        patch_grid_length = self._vision_encoder_info.get_patch_grid_length()
+        vision_encoder_info = self._get_vision_encoder_info()
+        patch_grid_length = vision_encoder_info.get_patch_grid_length()
         pooled_grid_length = math.ceil(patch_grid_length / spatial_pool_stride)
 
         return pooled_grid_length * pooled_grid_length
@@ -186,18 +166,33 @@ def _get_num_video_tokens(
 
         return num_frame_tokens * num_frames + 1  # Newline token
 
-    def _get_max_video_tokens(self, num_frames: int) -> int:
-        return self._get_num_video_tokens(image_width=999999,
-                                          image_height=999999,
-                                          num_frames=num_frames)
+
+class LlavaOnevisionProfilingInfo(LlavaOnevisionProcessingMixin,
+                                  BaseLlavaProfilingInfo):
+
+    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+        return {"image": None, "video": None}
+
+    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
+        return {
+            "image": self._get_max_image_tokens(),
+            "video": self._get_max_video_tokens(seq_len),
+        }
 
     def _get_max_video_frames(self, max_tokens: int) -> int:
+        target_width, target_height = self._get_image_size_with_most_features()
+
         num_frames = 0
 
         while True:
             next_num_frames = num_frames + 1
+            next_max_tokens = self._get_num_video_tokens(
+                image_width=target_width,
+                image_height=target_height,
+                num_frames=next_num_frames,
+            )
 
-            if self._get_max_video_tokens(next_num_frames) > max_tokens:
+            if next_max_tokens > max_tokens:
                 break
 
             num_frames = next_num_frames
@@ -215,8 +210,65 @@ def _get_dummy_num_frames(self, seq_len: int) -> int:
 
         return max(max_total_frames // max(max_videos, 1), 1)
 
-    def _get_video_token(self) -> str:
-        return self._get_hf_processor().video_token
+    def _get_max_video_tokens(self, seq_len: int) -> int:
+        target_width, target_height = self._get_image_size_with_most_features()
+
+        return self._get_num_video_tokens(
+            image_width=target_width,
+            image_height=target_height,
+            num_frames=self._get_dummy_num_frames(seq_len),
+        )
+
+    def get_dummy_processor_inputs(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> ProcessorInputs:
+        num_images = mm_counts.get("image", 0)
+        num_videos = mm_counts.get("video", 0)
+
+        processor = self._get_hf_processor()
+        image_token = processor.image_token
+        video_token = processor.video_token
+        target_width, target_height = self._get_image_size_with_most_features()
+
+        mm_data = {
+            "image":
+            self._get_dummy_images(width=target_width,
+                                   height=target_height,
+                                   num_images=num_images),
+            "video":
+            self._get_dummy_videos(
+                width=target_width,
+                height=target_height,
+                num_frames=self._get_dummy_num_frames(seq_len),
+                num_videos=num_videos,
+            )
+        }
+
+        return ProcessorInputs(
+            prompt_text=image_token * num_images + video_token * num_videos,
+            mm_data=mm_data,
+        )
+
+
+class LlavaOnevisionMultiModalProcessor(LlavaOnevisionProcessingMixin,
+                                        LlavaNextMultiModalProcessor):
+
+    def _get_profiling_info(self) -> BaseProfilingInfo:
+        return LlavaOnevisionProfilingInfo(self.ctx)
+
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        return dict(
+            pixel_values=MultiModalFieldConfig.batched("image"),
+            image_sizes=MultiModalFieldConfig.batched("image"),
+            image_embeds=MultiModalFieldConfig.batched("image"),
+            pixel_values_videos=MultiModalFieldConfig.batched("video"),
+        )
 
     def _call_hf_processor(
         self,
@@ -235,7 +287,8 @@ def _call_hf_processor(
                 mm_kwargs=mm_kwargs,
             )
 
-        video_token = self._get_video_token()
+        processor = self._get_hf_processor()
+        video_token = processor.video_token
 
         # LLaVA-OneVision processor doesn't support multiple videos
         # with different sizes when converting back to tensors
@@ -303,37 +356,6 @@ def get_video_replacement(item_idx: int):
             ),
         ]
 
-    def _get_dummy_processor_inputs(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-    ) -> ProcessorInputs:
-        num_images = mm_counts.get("image", 0)
-        num_videos = mm_counts.get("video", 0)
-
-        image_token = self._get_image_token()
-        video_token = self._get_video_token()
-        target_width, target_height = self._get_dummy_image_size()
-
-        mm_data = {
-            "image":
-            self._get_dummy_images(width=target_width,
-                                   height=target_height,
-                                   num_images=num_images),
-            "video":
-            self._get_dummy_videos(
-                width=target_width,
-                height=target_height,
-                num_frames=self._get_dummy_num_frames(seq_len),
-                num_videos=num_videos,
-            )
-        }
-
-        return ProcessorInputs(
-            prompt_text=image_token * num_images + video_token * num_videos,
-            mm_data=mm_data,
-        )
-
 
 class LlavaOnevisionMultiModalProjector(nn.Module):
 
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index 7aa9d58d1d348..c8418c14e5fdf 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -14,7 +14,7 @@
 # limitations under the License.
 from collections.abc import Iterable, Mapping, Sequence
 from functools import cached_property
-from typing import List, Literal, Optional, Set, Tuple, TypedDict, Union
+from typing import Any, List, Literal, Optional, Set, Tuple, TypedDict, Union
 
 import torch
 import torch.nn as nn
@@ -28,22 +28,23 @@
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     VocabParallelEmbedding)
-from vllm.model_executor.models.clip import CLIPVisionModel
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
                                     MultiModalInputsV2, MultiModalKwargs,
                                     NestedTensors, PlaceholderRange)
-from vllm.multimodal.parse import ImageEmbeddingItems, ImageProcessorItems
+from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
+                                   ImageSize)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        MultiModalDataItems, ProcessorInputs,
+                                        MultiModalDataItems, ProcessingMixin,
                                         PromptReplacement,
                                         _BoundPromptReplacement,
                                         _PlaceholderInfo)
+from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs
 from vllm.sequence import IntermediateTensors
 from vllm.utils import is_list_of
 
-from .clip import dummy_image_for_clip
+from .clip import CLIPVisionModel
 from .interfaces import SupportsMultiModal, SupportsPP
 from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,
                     init_vllm_registered_model, maybe_prefix,
@@ -54,10 +55,6 @@
 # Cannot find the following 2 numbers from hf config.
 _IMAGE_TOKEN_ID = 32044
 
-# Result in the max possible feature size (h:w = 16:1)
-MAX_IMAGE_FEATURE_SIZE_HEIGHT = 8000
-MAX_IMAGE_FEATURE_SIZE_WIDTH = 50
-
 CLIP_VIT_LARGE_PATCH14_336_CONFIG = CLIPVisionConfig(dropout=0.0,
                                                      hidden_act="quick_gelu",
                                                      hidden_size=1024,
@@ -305,10 +302,17 @@ def add_image_newline(self, image_features_hd):
         return image_features_hd_newline
 
 
-class Phi3VMultiModalProcessor(BaseMultiModalProcessor):
+class Phi3VProcessingMixin(ProcessingMixin):
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
-        return {"image": None}
+    def _get_hf_processor(
+        self,
+        *,
+        num_crops: Optional[int] = None,
+    ) -> ProcessorMixin:
+        if num_crops is not None:
+            return self.ctx.get_hf_processor(num_crops=num_crops)
+
+        return self.ctx.get_hf_processor()
 
     def _get_num_image_tokens(
         self,
@@ -323,23 +327,55 @@ def _get_num_image_tokens(
             height=image_height,
         )
 
+
+class Phi3VProfilingInfo(Phi3VProcessingMixin, BaseProfilingInfo):
+
+    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+        return {"image": None}
+
     def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
+        target_width, target_height = self._get_image_size_with_most_features()
+
         max_image_tokens = self._get_num_image_tokens(
-            image_width=MAX_IMAGE_FEATURE_SIZE_WIDTH,
-            image_height=MAX_IMAGE_FEATURE_SIZE_HEIGHT,
+            image_width=target_width,
+            image_height=target_height,
         )
 
         return {"image": max_image_tokens}
 
-    def _get_hf_processor(
+    def _get_image_size_with_most_features(self) -> ImageSize:
+        # Result in the max possible feature size (h:w = 16:1)
+        return ImageSize(height=8000, width=50)
+
+    def get_dummy_processor_inputs(
         self,
-        *,
-        num_crops: Optional[int] = None,
-    ) -> ProcessorMixin:
-        if num_crops is not None:
-            return self.ctx.get_hf_processor(num_crops=num_crops)
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> ProcessorInputs:
+        num_images = mm_counts.get("image", 0)
 
-        return self.ctx.get_hf_processor()
+        target_width, target_height = self._get_image_size_with_most_features()
+
+        mm_data = {
+            "image":
+            self._get_dummy_images(width=target_width,
+                                   height=target_height,
+                                   num_images=num_images)
+        }
+
+        hf_processor = self._get_hf_processor()
+        image_tokens: list[str] = hf_processor.img_tokens  # type: ignore
+
+        return ProcessorInputs(
+            prompt_text="".join(image_tokens[:num_images]),
+            mm_data=mm_data,
+        )
+
+
+class Phi3VMultiModalProcessor(Phi3VProcessingMixin, BaseMultiModalProcessor):
+
+    def _get_profiling_info(self) -> BaseProfilingInfo:
+        return Phi3VProfilingInfo(self.ctx)
 
     def _call_hf_processor(
         self,
@@ -377,10 +413,10 @@ def _get_mm_fields_config(
     def _get_prompt_replacements(
         self,
         mm_items: MultiModalDataItems,
-        hf_processor_mm_kwargs: Mapping[str, object],
+        hf_processor_mm_kwargs: Mapping[str, Any],
         out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
-        hf_processor = self._get_hf_processor()
+        hf_processor = self._get_hf_processor(**hf_processor_mm_kwargs)
         image_tokens: list[str] = hf_processor.img_tokens  # type: ignore
 
         tokenizer = self._get_tokenizer()
@@ -442,28 +478,6 @@ def _apply_prompt_replacements(
 
         return token_ids, text, placeholders
 
-    def _get_dummy_processor_inputs(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-    ) -> ProcessorInputs:
-        num_images = mm_counts.get("image", 0)
-
-        data = dummy_image_for_clip(
-            CLIP_VIT_LARGE_PATCH14_336_CONFIG,
-            num_images,
-            image_width_override=MAX_IMAGE_FEATURE_SIZE_WIDTH,
-            image_height_override=MAX_IMAGE_FEATURE_SIZE_HEIGHT,
-        )
-
-        hf_processor = self._get_hf_processor()
-        image_tokens: list[str] = hf_processor.img_tokens  # type: ignore
-
-        return ProcessorInputs(
-            prompt_text="".join(image_tokens[:num_images]),
-            mm_data=data,
-        )
-
     def apply(
         self,
         prompt_text: str,
diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py
index bc3bb1f79b407..a7bb3425ed17c 100644
--- a/vllm/model_executor/models/qwen2_audio.py
+++ b/vllm/model_executor/models/qwen2_audio.py
@@ -20,8 +20,8 @@
 # limitations under the License.
 """Inference-only Qwen2-Audio model compatible with HuggingFace weights."""
 from functools import cached_property
-from typing import (Iterable, List, Mapping, Optional, Set, Tuple, TypedDict,
-                    Union)
+from typing import (Any, Iterable, List, Mapping, Optional, Set, Tuple,
+                    TypedDict, Union)
 
 import torch
 import torch.nn as nn
@@ -40,8 +40,9 @@
                                     NestedTensors)
 from vllm.multimodal.parse import AudioProcessorItems, MultiModalDataParser
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        MultiModalDataItems, ProcessorInputs,
+                                        MultiModalDataItems, ProcessingMixin,
                                         PromptReplacement)
+from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsMultiModal, SupportsPP
@@ -79,28 +80,70 @@ def _get_feat_extract_output_lengths(input_lengths: torch.Tensor):
     return feat_lengths, output_lengths
 
 
-class Qwen2AudioMultiModalProcessor(BaseMultiModalProcessor):
+class Qwen2AudioProcessingMixin(ProcessingMixin):
+
+    def _get_hf_config(self):
+        return self.ctx.get_hf_config(Qwen2AudioConfig)
+
+    def _get_hf_processor(
+        self,
+        *,
+        # Ignored in initialization
+        sampling_rate: Optional[int] = None,
+    ) -> Qwen2AudioProcessor:
+        return self.ctx.get_hf_processor(Qwen2AudioProcessor)
+
+    def _get_feature_extractor(
+        self,
+        *,
+        # Ignored in initialization
+        sampling_rate: Optional[int] = None,
+    ) -> WhisperFeatureExtractor:
+        hf_processor = self._get_hf_processor(sampling_rate=sampling_rate)
+        feature_extractor = hf_processor.feature_extractor  # type: ignore
+        assert isinstance(feature_extractor, WhisperFeatureExtractor)
+        return feature_extractor
+
+
+class Qwen2AudioProfilingInfo(Qwen2AudioProcessingMixin, BaseProfilingInfo):
 
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"audio": None}
 
     def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
-        hf_config = self.ctx.get_hf_config(Qwen2AudioConfig)
+        hf_config = self._get_hf_config()
         max_source_positions = hf_config.audio_config.max_source_positions
         max_output_lengths = (max_source_positions - 2) // 2 + 1
 
         return {"audio": max_output_lengths}
 
-    def _get_hf_processor(
+    def get_dummy_processor_inputs(
         self,
-        *,
-        # Ignored in initialization
-        sampling_rate: Optional[int] = None,
-    ) -> Qwen2AudioProcessor:
-        return self.ctx.get_hf_processor(Qwen2AudioProcessor)
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> ProcessorInputs:
+        feature_extractor = self._get_feature_extractor()
+
+        sampling_rate = feature_extractor.sampling_rate
+        audio_len = feature_extractor.chunk_length * sampling_rate
+        num_audios = mm_counts.get("audio", 0)
+
+        mm_data = {
+            "audio":
+            self._get_dummy_audios(length=audio_len, num_audios=num_audios)
+        }
+
+        return ProcessorInputs(
+            prompt_text="<|AUDIO|>" * num_audios,
+            mm_data=mm_data,
+        )
+
 
-    def _get_feature_extractor(self) -> WhisperFeatureExtractor:
-        return self._get_hf_processor().feature_extractor  # type: ignore
+class Qwen2AudioMultiModalProcessor(Qwen2AudioProcessingMixin,
+                                    BaseMultiModalProcessor):
+
+    def _get_profiling_info(self) -> BaseProfilingInfo:
+        return Qwen2AudioProfilingInfo(self.ctx)
 
     def _get_data_parser(self) -> MultiModalDataParser:
         feature_extractor = self._get_feature_extractor()
@@ -110,7 +153,7 @@ def _call_hf_processor(
         self,
         prompt: str,
         mm_data: Mapping[str, object],
-        mm_kwargs: Mapping[str, object],
+        mm_kwargs: Mapping[str, Any],
     ) -> BatchFeature:
         mm_data = dict(mm_data)
         audios = mm_data.pop("audios", [])
@@ -118,7 +161,7 @@ def _call_hf_processor(
         if audios:
             mm_data["audios"] = audios
 
-            feature_extractor = self._get_feature_extractor()
+            feature_extractor = self._get_feature_extractor(**mm_kwargs)
             mm_kwargs = dict(
                 **mm_kwargs,
                 sampling_rate=feature_extractor.sampling_rate,
@@ -151,7 +194,7 @@ def _get_prompt_replacements(
         hf_processor_mm_kwargs: Mapping[str, object],
         out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
-        hf_config = self.ctx.get_hf_config(Qwen2AudioConfig)
+        hf_config = self._get_hf_config()
         placeholder = hf_config.audio_token_index
 
         feature_attention_mask = out_mm_kwargs.get("feature_attention_mask")
@@ -191,27 +234,6 @@ def _always_apply_prompt_replacements(self) -> bool:
         # tokens than the number of audio items)
         return True
 
-    def _get_dummy_processor_inputs(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-    ) -> ProcessorInputs:
-        feature_extractor = self._get_feature_extractor()
-
-        sampling_rate = feature_extractor.sampling_rate
-        audio_len = feature_extractor.chunk_length * sampling_rate
-        num_audios = mm_counts.get("audio", 0)
-
-        mm_data = {
-            "audio":
-            self._get_dummy_audios(length=audio_len, num_audios=num_audios)
-        }
-
-        return ProcessorInputs(
-            prompt_text="<|AUDIO|>" * num_audios,
-            mm_data=mm_data,
-        )
-
 
 @MULTIMODAL_REGISTRY.register_processor(Qwen2AudioMultiModalProcessor)
 class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal,
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index abca85e0e2024..a5c2fb9e84df3 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -59,8 +59,9 @@
 from vllm.multimodal.parse import (ImageSize, ModalityDataItems,
                                    MultiModalDataParser)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        MultiModalDataItems, ProcessorInputs,
+                                        MultiModalDataItems, ProcessingMixin,
                                         PromptReplacement)
+from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs
 from vllm.platforms import _Backend
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.config import uses_mrope
@@ -708,10 +709,44 @@ def _parse_video_data(
         return super()._parse_video_data(data)
 
 
-class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor):
+class Qwen2VLProcessingMixin(ProcessingMixin):
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
-        return {"image": None, "video": None}
+    def _get_hf_config(self):
+        return self.ctx.get_hf_config(Qwen2VLConfig)
+
+    def _get_hf_processor(
+        self,
+        *,
+        min_pixels: Optional[int] = None,
+        max_pixels: Optional[int] = None,
+    ) -> Qwen2VLProcessor:
+        hf_processor = self.ctx.get_hf_processor(Qwen2VLProcessor)
+        image_processor = hf_processor.image_processor  # type: ignore
+        assert isinstance(image_processor, Qwen2VLImageProcessor)
+
+        if min_pixels:
+            image_processor.min_pixels = min_pixels
+        if max_pixels:
+            image_processor.max_pixels = max_pixels
+        if max_pixels or min_pixels:
+            image_processor.size = {
+                "min_pixels": image_processor.min_pixels,
+                "max_pixels": image_processor.max_pixels,
+            }
+
+        return hf_processor
+
+    def _get_image_processor(
+        self,
+        *,
+        min_pixels: Optional[int] = None,
+        max_pixels: Optional[int] = None,
+    ):
+        hf_processor = self._get_hf_processor(min_pixels=min_pixels,
+                                              max_pixels=max_pixels)
+        image_processor = hf_processor.image_processor  # type: ignore
+        assert isinstance(image_processor, Qwen2VLImageProcessor)
+        return image_processor
 
     def _get_vision_info(
         self,
@@ -721,14 +756,13 @@ def _get_vision_info(
         num_frames: int = 1,
         do_resize: bool = True,
     ) -> tuple[ImageSize, int]:
-        hf_config = self.ctx.get_hf_config(Qwen2VLConfig)
+        hf_config = self._get_hf_config()
         vision_config = hf_config.vision_config
         patch_size = vision_config.patch_size
         merge_size = vision_config.spatial_merge_size
         temporal_patch_size = vision_config.temporal_patch_size
 
-        hf_processor = self._get_hf_processor()
-        image_processor = self._get_image_processor(hf_processor)
+        image_processor = self._get_image_processor()
 
         if do_resize:
             resized_height, resized_width = smart_resize(
@@ -753,7 +787,45 @@ def _get_vision_info(
 
         return preprocessed_size, num_vision_tokens
 
-    def _get_dummy_image_size(self) -> ImageSize:
+    def _get_num_image_tokens(
+        self,
+        *,
+        image_width: int,
+        image_height: int,
+    ) -> int:
+        _, num_image_tokens = self._get_vision_info(
+            image_width=image_width,
+            image_height=image_height,
+        )
+        return num_image_tokens
+
+    def _get_num_video_tokens(
+        self,
+        *,
+        image_width: int,
+        image_height: int,
+        num_frames: int,
+    ) -> int:
+        _, num_video_tokens = self._get_vision_info(
+            image_width=image_width,
+            image_height=image_height,
+            num_frames=num_frames,
+        )
+        return num_video_tokens
+
+
+class Qwen2VLProfilingInfo(Qwen2VLProcessingMixin, BaseProfilingInfo):
+
+    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+        return {"image": None, "video": None}
+
+    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
+        return {
+            "image": self._get_max_image_tokens(),
+            "video": self._get_max_video_tokens(seq_len),
+        }
+
+    def _get_image_size_with_most_features(self) -> ImageSize:
         max_image_size, _ = self._get_vision_info(
             image_width=9999999,
             image_height=9999999,
@@ -761,27 +833,27 @@ def _get_dummy_image_size(self) -> ImageSize:
         return max_image_size
 
     def _get_max_image_tokens(self) -> int:
-        _, max_image_tokens = self._get_vision_info(
-            image_width=9999999,
-            image_height=9999999,
-        )
-        return max_image_tokens
+        target_width, target_height = self._get_image_size_with_most_features()
 
-    def _get_max_video_tokens(self, num_frames: int) -> int:
-        _, max_video_tokens = self._get_vision_info(
-            image_width=9999999,
-            image_height=9999999,
-            num_frames=num_frames,
+        return self._get_num_image_tokens(
+            image_width=target_width,
+            image_height=target_height,
         )
-        return max_video_tokens
 
     def _get_max_video_frames(self, max_tokens: int) -> int:
+        target_width, target_height = self._get_image_size_with_most_features()
+
         num_frames = 0
 
         while True:
             next_num_frames = num_frames + 1
+            next_max_tokens = self._get_num_video_tokens(
+                image_width=target_width,
+                image_height=target_height,
+                num_frames=next_num_frames,
+            )
 
-            if self._get_max_video_tokens(next_num_frames) > max_tokens:
+            if next_max_tokens > max_tokens:
                 break
 
             num_frames = next_num_frames
@@ -797,56 +869,73 @@ def _get_dummy_num_frames(self, seq_len: int) -> int:
         max_total_frames = self._get_max_video_frames(seq_len -
                                                       max_image_tokens)
 
-        return max(max_total_frames // max(max_videos, 1), 1)
+        num_frames = max(max_total_frames // max(max_videos, 1), 1)
 
-    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
-        max_image_tokens = self._get_max_image_tokens()
+        # Temporary workaround for https://github.com/huggingface/transformers/issues/35412
+        if num_frames > 1 and num_frames % 2 == 1:
+            num_frames += 1
 
-        num_frames = self._get_dummy_num_frames(seq_len)
-        max_video_tokens = self._get_max_video_tokens(num_frames)
+        return num_frames
 
-        return {
-            "image": max_image_tokens,
-            "video": max_video_tokens,
+    def _get_max_video_tokens(self, seq_len: int) -> int:
+        target_width, target_height = self._get_image_size_with_most_features()
+
+        return self._get_num_video_tokens(
+            image_width=target_width,
+            image_height=target_height,
+            num_frames=self._get_dummy_num_frames(seq_len),
+        )
+
+    def get_dummy_processor_inputs(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> ProcessorInputs:
+        num_images = mm_counts.get("image", 0)
+        num_videos = mm_counts.get("video", 0)
+
+        hf_processor = self._get_hf_processor()
+        image_token: str = hf_processor.image_token
+        video_token: str = hf_processor.video_token
+        target_width, target_height = self._get_image_size_with_most_features()
+
+        mm_data = {
+            "image":
+            self._get_dummy_images(width=target_width,
+                                   height=target_height,
+                                   num_images=num_images),
+            "video":
+            self._get_dummy_videos(
+                width=target_width,
+                height=target_height,
+                num_frames=self._get_dummy_num_frames(seq_len),
+                num_videos=num_videos,
+            )
         }
 
-    def _get_data_parser(self) -> MultiModalDataParser:
-        return Qwen2MultiModalDataParser()
+        return ProcessorInputs(
+            prompt_text=image_token * num_images + video_token * num_videos,
+            mm_data=mm_data,
+        )
 
-    def _get_image_processor(self, hf_processor: Qwen2VLProcessor):
-        image_processor = hf_processor.image_processor  # type: ignore
-        assert isinstance(image_processor, Qwen2VLImageProcessor)
-        return image_processor
 
-    def _get_hf_processor(
-        self,
-        *,
-        min_pixels: Optional[int] = None,
-        max_pixels: Optional[int] = None,
-    ) -> Qwen2VLProcessor:
-        hf_processor = self.ctx.get_hf_processor(Qwen2VLProcessor)
-        image_processor = self._get_image_processor(hf_processor)
+class Qwen2VLMultiModalProcessor(Qwen2VLProcessingMixin,
+                                 BaseMultiModalProcessor):
 
-        if min_pixels:
-            image_processor.min_pixels = min_pixels
-        if max_pixels:
-            image_processor.max_pixels = max_pixels
-        if max_pixels or min_pixels:
-            image_processor.size = {
-                "min_pixels": image_processor.min_pixels,
-                "max_pixels": image_processor.max_pixels,
-            }
+    def _get_profiling_info(self) -> BaseProfilingInfo:
+        return Qwen2VLProfilingInfo(self.ctx)
 
-        return hf_processor
+    def _get_data_parser(self) -> MultiModalDataParser:
+        return Qwen2MultiModalDataParser()
 
     def _get_prompt_replacements(
         self,
         mm_items: MultiModalDataItems,
-        hf_processor_mm_kwargs: Mapping[str, object],
+        hf_processor_mm_kwargs: Mapping[str, Any],
         out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
-        hf_processor = self._get_hf_processor()
-        image_processor = self._get_image_processor(hf_processor)
+        hf_processor = self._get_hf_processor(**hf_processor_mm_kwargs)
+        image_processor = self._get_image_processor(**hf_processor_mm_kwargs)
 
         # NOTE: Only Qwen2VLProcessor in transformers 4.47.0 has
         # image_token and video_token registered
@@ -901,38 +990,6 @@ def _get_mm_fields_config(
             video_grid_thw=MultiModalFieldConfig.batched("video"),
         )
 
-    def _get_dummy_processor_inputs(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-    ) -> ProcessorInputs:
-        num_images = mm_counts.get("image", 0)
-        num_videos = mm_counts.get("video", 0)
-
-        hf_processor = self._get_hf_processor()
-        image_token: str = hf_processor.image_token
-        video_token: str = hf_processor.video_token
-        target_width, target_height = self._get_dummy_image_size()
-
-        mm_data = {
-            "image":
-            self._get_dummy_images(width=target_width,
-                                   height=target_height,
-                                   num_images=num_images),
-            "video":
-            self._get_dummy_videos(
-                width=target_width,
-                height=target_height,
-                num_frames=self._get_dummy_num_frames(seq_len),
-                num_videos=num_videos,
-            )
-        }
-
-        return ProcessorInputs(
-            prompt_text=image_token * num_images + video_token * num_videos,
-            mm_data=mm_data,
-        )
-
 
 @MULTIMODAL_REGISTRY.register_processor(Qwen2VLMultiModalProcessor)
 class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py
index 6ad4661e3bb8d..ba823acecbb56 100644
--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -3,8 +3,8 @@
 
 import math
 from functools import cached_property
-from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple,
-                    TypedDict, Union)
+from typing import (Any, Iterable, List, Literal, Mapping, Optional, Set,
+                    Tuple, TypedDict, Union)
 
 import torch
 import torch.utils.checkpoint
@@ -26,8 +26,9 @@
                                     NestedTensors)
 from vllm.multimodal.parse import MultiModalDataParser
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        MultiModalDataItems, ProcessorInputs,
+                                        MultiModalDataItems, ProcessingMixin,
                                         PromptReplacement)
+from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.configs.ultravox import UltravoxConfig
 
@@ -55,7 +56,30 @@ class UltravoxAudioEmbeddingInputs(TypedDict):
                             UltravoxAudioEmbeddingInputs]
 
 
-class UltravoxMultiModalProcessor(BaseMultiModalProcessor):
+class UltravoxProcessingMixin(ProcessingMixin):
+
+    def _get_hf_processor(
+        self,
+        *,
+        # Ignored in initialization
+        sampling_rate: Optional[int] = None,
+    ) -> ProcessorMixin:
+        return self.ctx.get_hf_processor()
+
+    def _get_feature_extractor(
+        self,
+        *,
+        # Ignored in initialization
+        sampling_rate: Optional[int] = None,
+    ) -> WhisperFeatureExtractor:
+        hf_processor = self._get_hf_processor(sampling_rate=sampling_rate)
+        audio_processor = hf_processor.audio_processor  # type: ignore
+        feature_extractor = audio_processor.feature_extractor  # type: ignore
+        assert isinstance(feature_extractor, WhisperFeatureExtractor)
+        return feature_extractor
+
+
+class UltravoxProfilingInfo(UltravoxProcessingMixin, BaseProfilingInfo):
 
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"audio": None}
@@ -67,17 +91,33 @@ def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
 
         return {"audio": max_audio_tokens}
 
-    def _get_hf_processor(
+    def get_dummy_processor_inputs(
         self,
-        *,
-        # Ignored in initialization
-        sampling_rate: Optional[int] = None,
-    ) -> ProcessorMixin:
-        return self.ctx.get_hf_processor()
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> ProcessorInputs:
+        feature_extractor = self._get_feature_extractor()
+
+        sampling_rate = feature_extractor.sampling_rate
+        audio_len = feature_extractor.chunk_length * sampling_rate
+        num_audios = mm_counts.get("audio", 0)
+
+        mm_data = {
+            "audio":
+            self._get_dummy_audios(length=audio_len, num_audios=num_audios)
+        }
+
+        return ProcessorInputs(
+            prompt_text="<|audio|>" * num_audios,
+            mm_data=mm_data,
+        )
 
-    def _get_feature_extractor(self) -> WhisperFeatureExtractor:
-        hf_processor = self._get_hf_processor()
-        return hf_processor.audio_processor.feature_extractor  # type: ignore
+
+class UltravoxMultiModalProcessor(UltravoxProcessingMixin,
+                                  BaseMultiModalProcessor):
+
+    def _get_profiling_info(self) -> BaseProfilingInfo:
+        return UltravoxProfilingInfo(self.ctx)
 
     def _get_data_parser(self) -> MultiModalDataParser:
         feature_extractor = self._get_feature_extractor()
@@ -155,10 +195,10 @@ def _get_mm_fields_config(
     def _get_prompt_replacements(
         self,
         mm_items: MultiModalDataItems,
-        hf_processor_mm_kwargs: Mapping[str, object],
+        hf_processor_mm_kwargs: Mapping[str, Any],
         out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
-        hf_processor = self._get_hf_processor()
+        hf_processor = self._get_hf_processor(**hf_processor_mm_kwargs)
         placeholder = hf_processor.audio_token_replacement  # type: ignore
 
         def get_replacement_ultravox(item_idx: int):
@@ -173,27 +213,6 @@ def get_replacement_ultravox(item_idx: int):
             )
         ]
 
-    def _get_dummy_processor_inputs(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-    ) -> ProcessorInputs:
-        feature_extractor = self._get_feature_extractor()
-
-        sampling_rate = feature_extractor.sampling_rate
-        audio_len = feature_extractor.chunk_length * sampling_rate
-        num_audios = mm_counts.get("audio", 0)
-
-        mm_data = {
-            "audio":
-            self._get_dummy_audios(length=audio_len, num_audios=num_audios)
-        }
-
-        return ProcessorInputs(
-            prompt_text="<|audio|>" * num_audios,
-            mm_data=mm_data,
-        )
-
 
 class StackAudioFrames(nn.Module):
     """
diff --git a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py
index 014f02ee10a1b..8516c9f7066f7 100644
--- a/vllm/model_executor/models/vision.py
+++ b/vllm/model_executor/models/vision.py
@@ -1,12 +1,8 @@
 from abc import ABC, abstractmethod
-from typing import Final, Generic, Optional, Protocol, TypeVar
+from typing import Final, Generic, Protocol, TypeVar
 
 from transformers import PretrainedConfig
 
-from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        InputProcessingContext,
-                                        ProcessingCache)
-
 _C = TypeVar("_C", bound=PretrainedConfig)
 
 
@@ -43,12 +39,18 @@ def get_patch_grid_length(self) -> int:
         raise NotImplementedError
 
 
-def vision_encoder_info(vision_config: PretrainedConfig) -> VisionEncoderInfo:
+class VisionLanguageConfig(Protocol):
+    vision_config: Final[PretrainedConfig]
+
+
+def get_vision_encoder_info(
+        hf_config: VisionLanguageConfig) -> VisionEncoderInfo:
     # Avoid circular imports
     from .clip import CLIPEncoderInfo, CLIPVisionConfig
     from .pixtral import PixtralHFEncoderInfo, PixtralVisionConfig
     from .siglip import SiglipEncoderInfo, SiglipVisionConfig
 
+    vision_config = hf_config.vision_config
     if isinstance(vision_config, CLIPVisionConfig):
         return CLIPEncoderInfo(vision_config)
     if isinstance(vision_config, PixtralVisionConfig):
@@ -58,26 +60,3 @@ def vision_encoder_info(vision_config: PretrainedConfig) -> VisionEncoderInfo:
 
     msg = f"Unsupported vision config: {type(vision_config)}"
     raise NotImplementedError(msg)
-
-
-class VisionLanguageConfig(Protocol):
-    vision_config: Final[PretrainedConfig]
-
-
-class BaseVisionLanguageMultiModalProcessor(BaseMultiModalProcessor):
-
-    def __init__(self,
-                 ctx: InputProcessingContext,
-                 *,
-                 cache: Optional[ProcessingCache] = None,
-                 enable_sanity_checks: bool = True) -> None:
-        super().__init__(ctx,
-                         cache=cache,
-                         enable_sanity_checks=enable_sanity_checks)
-
-        vision_config = self._get_hf_config().vision_config
-        self._vision_encoder_info = vision_encoder_info(vision_config)
-
-    @abstractmethod
-    def _get_hf_config(self) -> VisionLanguageConfig:
-        raise NotImplementedError
diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index ebc16b817684a..933c1d3aff0cb 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -8,11 +8,10 @@
 from typing import Any, NamedTuple, Optional, Protocol, TypeVar, Union
 
 import numpy as np
-import numpy.typing as npt
 import torch
 from blake3 import blake3
 from PIL import Image
-from transformers import BatchFeature, ProcessorMixin
+from transformers import BatchFeature, PretrainedConfig, ProcessorMixin
 
 from vllm.inputs import DummyData, InputProcessingContext
 from vllm.logger import init_logger
@@ -24,6 +23,7 @@
                      MultiModalInputsV2, MultiModalKwargs,
                      MultiModalKwargsItem, PlaceholderRange)
 from .parse import MultiModalDataItems, MultiModalDataParser
+from .profiling import BaseProfilingInfo
 
 logger = init_logger(__name__)
 
@@ -466,14 +466,6 @@ def find_mm_placeholders(
     return dict(full_groupby_modality(it))
 
 
-@dataclass
-class ProcessorInputs:
-    """Keyword arguments to :meth:`BaseMultiModalProcessor`."""
-    prompt_text: str
-    mm_data: MultiModalDataDict
-    hf_processor_mm_kwargs: Mapping[str, object] = field(default_factory=dict)
-
-
 class ProcessingCache:
 
     def __init__(self, capacity: int) -> None:
@@ -585,9 +577,33 @@ def put(
         self._cache.put(cache_key, output_kwargs)
 
 
-class BaseMultiModalProcessor(ABC):
+class ProcessingMixin:
+    """
+    Contains helper functions to perform processing.
+
+    Not to be confused with :class:`transformers.ProcessorMixin`.
+    """
+    ctx: InputProcessingContext
+
+    def _get_tokenizer(self) -> AnyTokenizer:
+        return self.ctx.tokenizer
+
+    def _get_hf_config(self) -> PretrainedConfig:
+        return self.ctx.get_hf_config()
+
+    def _get_hf_processor(self, **kwargs: object) -> ProcessorMixin:
+        """
+        Subclasses can override this method to handle
+        specific kwargs from model config or user inputs.
+        """
+        return self.ctx.get_hf_processor(**kwargs)
+
+
+class BaseMultiModalProcessor(ProcessingMixin, ABC):
     """
     Abstract base class to process multi-modal inputs to be used in vLLM.
+
+    Not to be confused with :class:`transformers.ProcessorMixin`.
     """
 
     def __init__(self,
@@ -601,6 +617,9 @@ def __init__(self,
         self.cache = cache
         self.enable_sanity_checks = enable_sanity_checks
 
+        self.data_parser = self._get_data_parser()
+        self.profiling_info = self._get_profiling_info()
+
     def __call__(
         self,
         prompt: str,
@@ -609,32 +628,9 @@ def __call__(
     ) -> MultiModalInputsV2:
         return self.apply(prompt, mm_data, hf_processor_mm_kwargs)
 
-    @abstractmethod
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
-        """
-        Return the maximum supported number of items for each modality.
-
-        A value of `None` means unlimited number of items.
-
-        Omitting a modality from the returned dictionary means that
-        it is not supported at all.
-        """
-        raise NotImplementedError
-
-    @abstractmethod
-    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
-        """
-        Get the maximum possible number of tokens per data item
-        for each modality.
-
-        The dictionary returned by this method should have the same
-        keys as that returned by :meth:`get_supported_mm_limits`.
-        """
-        raise NotImplementedError
-
     def _get_data_parser(self) -> MultiModalDataParser:
         """
-        Construct a data parser to preprocess multi-modal data items
+        Construct a parser to preprocess multi-modal data items
         before passing them to :meth:`_get_hf_mm_data`.
 
         You can support additional modalities by creating a subclass
@@ -642,15 +638,12 @@ def _get_data_parser(self) -> MultiModalDataParser:
         """
         return MultiModalDataParser()
 
-    def _get_hf_processor(self) -> ProcessorMixin:
+    def _get_profiling_info(self) -> BaseProfilingInfo:
         """
-        Subclasses can add keyword arguments to this method to accept
-        additional kwargs from model config or user inputs.
+        Get the profiling information to find the worst-case memory usage of
+        the model.
         """
-        return self.ctx.get_hf_processor()
-
-    def _get_tokenizer(self) -> AnyTokenizer:
-        return self.ctx.tokenizer
+        raise NotImplementedError
 
     def _to_mm_items(
         self,
@@ -660,8 +653,7 @@ def _to_mm_items(
         Normalize :class:`MultiModalDataDict` to :class:`MultiModalDataItems`
         before passing them to :meth:`_get_hf_mm_data`.
         """
-        parser = self._get_data_parser()
-        mm_items = parser.parse_mm_data(mm_data)
+        mm_items = self.data_parser.parse_mm_data(mm_data)
 
         mm_limits = self.ctx.get_mm_config().limit_per_prompt
         for modality, items in mm_items.items():
@@ -799,7 +791,7 @@ def _apply_hf_processor_missing(
 
         # Some HF processors (e.g. Qwen2-VL) expect corresponding
         # multi-modal tokens to be in the prompt text
-        dummy_inputs = self._get_dummy_processor_inputs(
+        dummy_inputs = self.profiling_info.get_dummy_processor_inputs(
             self.ctx.model_config.max_model_len,
             mm_missing_counts,
         )
@@ -1133,73 +1125,14 @@ def apply(
             mm_placeholders=mm_placeholder_ranges,
         )
 
-    def _get_dummy_audios(
-        self,
-        *,
-        length: int,
-        num_audios: int,
-    ) -> list[npt.NDArray]:
-        audio = np.zeros((length, ))
-        return [audio] * num_audios
-
-    def _get_dummy_images(
-        self,
-        *,
-        width: int,
-        height: int,
-        num_images: int,
-    ) -> list[Image.Image]:
-        image = Image.new("RGB", (width, height), color=0)
-        return [image] * num_images
-
-    def _get_dummy_videos(
-        self,
-        *,
-        width: int,
-        height: int,
-        num_frames: int,
-        num_videos: int,
-    ) -> list[npt.NDArray]:
-        video = np.zeros((num_frames, width, height, 3))
-        return [video] * num_videos
-
-    @abstractmethod
-    def _get_dummy_processor_inputs(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-    ) -> ProcessorInputs:
-        """
-        Build the multi-modal portion of the input which, after processing,
-        results in `mm_max_tokens` in :meth:`get_dummy_data`.
-        """
-        raise NotImplementedError
-
-    def _get_and_validate_dummy_mm_counts(self) -> Mapping[str, int]:
-        mm_limit_per_prompt = self.ctx.get_mm_config().limit_per_prompt
-        supported_mm_limits = self.get_supported_mm_limits()
-
-        mm_limits = {
-            modality: mm_limit_per_prompt.get(modality, 1)
-            for modality in supported_mm_limits
-        }
-
-        for modality, supported_limit in supported_mm_limits.items():
-            limit = mm_limits[modality]
-            if supported_limit is not None and supported_limit < limit:
-                raise ValueError(
-                    f"You set {modality}={limit} (or defaulted to 1) in "
-                    f"`--limit-mm-per-prompt`, but this model only supports "
-                    f"at most {supported_limit} {modality} items.")
-
-        return mm_limits
-
     def _get_dummy_mm_inputs(
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
     ) -> MultiModalInputsV2:
-        processor_inputs = self._get_dummy_processor_inputs(seq_len, mm_counts)
+        profiling = self.profiling_info
+        processor_inputs = profiling.get_dummy_processor_inputs(
+            seq_len, mm_counts)
 
         return self.apply(
             prompt_text=processor_inputs.prompt_text,
@@ -1211,8 +1144,9 @@ def get_dummy_data(self, seq_len: int) -> DummyData:
         # Avoid circular import
         from vllm.sequence import SequenceData
 
-        mm_counts = self._get_and_validate_dummy_mm_counts()
-        mm_max_tokens_per_item = self.get_mm_max_tokens_per_item(seq_len)
+        profiling = self.profiling_info
+        mm_counts = profiling.get_mm_limits()
+        mm_max_tokens_per_item = profiling.get_mm_max_tokens_per_item(seq_len)
         if mm_counts.keys() != mm_max_tokens_per_item.keys():
             raise AssertionError(
                 "The keys returned by `get_supported_mm_limits`"
diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py
new file mode 100644
index 0000000000000..2ecf0db1a485d
--- /dev/null
+++ b/vllm/multimodal/profiling.py
@@ -0,0 +1,121 @@
+from abc import ABC, abstractmethod
+from collections.abc import Mapping
+from dataclasses import dataclass, field
+from typing import Optional
+
+import numpy as np
+import numpy.typing as npt
+from PIL import Image
+
+from vllm.inputs import InputProcessingContext
+from vllm.logger import init_logger
+
+from .inputs import MultiModalDataDict
+
+logger = init_logger(__name__)
+
+
+@dataclass
+class ProcessorInputs:
+    """Keyword arguments to :meth:`BaseMultiModalProcessor`."""
+    prompt_text: str
+    mm_data: MultiModalDataDict
+    hf_processor_mm_kwargs: Mapping[str, object] = field(default_factory=dict)
+
+
+class BaseProfilingInfo(ABC):
+    """
+    Abstract base class that provides the information necessary to profile
+    multi-modal models.
+    """
+
+    def __init__(self, ctx: InputProcessingContext) -> None:
+        super().__init__()
+
+        self.ctx = ctx
+
+    @abstractmethod
+    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+        """
+        Return the maximum supported number of items for each modality.
+
+        A value of `None` means unlimited number of items.
+
+        Omitting a modality from the returned dictionary means that
+        it is not supported at all.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
+        """
+        Get the maximum possible number of tokens per data item
+        for each modality.
+
+        The dictionary returned by this method should have the same
+        keys as that returned by :meth:`get_supported_mm_limits`.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def get_dummy_processor_inputs(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> ProcessorInputs:
+        """
+        Build the multi-modal portion of the input which, after processing,
+        results in `mm_max_tokens` in :meth:`get_mm_max_tokens_per_item`.
+        """
+        raise NotImplementedError
+
+    def _get_dummy_audios(
+        self,
+        *,
+        length: int,
+        num_audios: int,
+    ) -> list[npt.NDArray]:
+        audio = np.zeros((length, ))
+        return [audio] * num_audios
+
+    def _get_dummy_images(
+        self,
+        *,
+        width: int,
+        height: int,
+        num_images: int,
+    ) -> list[Image.Image]:
+        image = Image.new("RGB", (width, height), color=0)
+        return [image] * num_images
+
+    def _get_dummy_videos(
+        self,
+        *,
+        width: int,
+        height: int,
+        num_frames: int,
+        num_videos: int,
+    ) -> list[npt.NDArray]:
+        video = np.zeros((num_frames, width, height, 3))
+        return [video] * num_videos
+
+    def get_mm_limits(self) -> Mapping[str, int]:
+        mm_config = self.ctx.get_mm_config()
+        mm_limit_per_prompt = mm_config.limit_per_prompt
+
+        supported_mm_limits = self.get_supported_mm_limits()
+
+        mm_limits = {
+            modality: mm_limit_per_prompt.get(modality, 1)
+            for modality in supported_mm_limits
+        }
+
+        for modality, supported_limit in supported_mm_limits.items():
+            limit = mm_limits[modality]
+            if supported_limit is not None and supported_limit < limit:
+                raise ValueError(
+                    f"You set {modality}={limit} (or defaulted to 1) in "
+                    f"`--limit-mm-per-prompt`, but this model only supports "
+                    f"at most {supported_limit} {modality} items.")
+
+        return mm_limits
diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index fb4389dc4df42..f75a594a4c4e0 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -224,7 +224,7 @@ def get_max_tokens_per_item_by_modality(
             tokenizer = cached_get_tokenizer(model_config.tokenizer)
             processor = self.create_processor(model_config, tokenizer)
             seq_len = model_config.max_model_len
-            return processor.get_mm_max_tokens_per_item(seq_len)
+            return processor.profiling_info.get_mm_max_tokens_per_item(seq_len)
 
         return {
             key: plugin.get_max_multimodal_tokens(model_config)

From ee77fdb5de42a6fead2b897d87d99d4b1e5650a9 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Mon, 6 Jan 2025 21:40:31 +0800
Subject: [PATCH 090/462] [Doc][2/N] Reorganize Models and Usage sections
 (#11755)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 .github/ISSUE_TEMPLATE/600-new-model.yml      |   2 +-
 .../disagg_prefill/abstraction.jpg            | Bin
 .../disagg_prefill/overview.jpg               | Bin
 docs/source/contributing/model/basic.md       | 102 ++++++++++++
 docs/source/contributing/model/index.md       |  26 +++
 .../model/multimodal.md}                      |   8 +-
 .../source/contributing/model/registration.md |  56 +++++++
 .../automatic_prefix_caching.md}              |   6 +-
 docs/source/design/kernel/paged_attention.md  |   2 +
 .../dev/offline_inference/offline_index.md    |   1 +
 .../automatic_prefix_caching.md}              |   8 +-
 .../compatibility_matrix.md                   |   6 +-
 .../{usage => features}/disagg_prefill.md     |   4 +-
 docs/source/{usage => features}/lora.md       |   0
 .../{usage => features}/multimodal_inputs.md  |   0
 .../{ => features}/quantization/auto_awq.md   |   0
 .../source/{ => features}/quantization/bnb.md |   0
 .../source/{ => features}/quantization/fp8.md |   0
 .../quantization/fp8_e4m3_kvcache.md          |   0
 .../quantization/fp8_e5m2_kvcache.md          |   0
 .../{ => features}/quantization/gguf.md       |   0
 docs/source/features/quantization/index.md    |  19 +++
 .../{ => features}/quantization/int8.md       |   0
 .../quantization/supported_hardware.md        |  10 +-
 .../source/{usage => features}/spec_decode.md |   0
 .../{usage => features}/structured_outputs.md |   0
 .../{usage => features}/tool_calling.md       |   0
 docs/source/index.md                          |  66 +++-----
 docs/source/models/adding_model.md            | 155 ------------------
 docs/source/models/supported_models.md        |   2 +-
 .../optimization.md}                          |   4 +-
 docs/source/{usage => serving}/engine_args.md |   0
 docs/source/{usage => serving}/env_vars.md    |   0
 .../serving/openai_compatible_server.md       |   2 +-
 docs/source/{usage => serving}/usage_stats.md |   0
 vllm/attention/backends/rocm_flash_attn.py    |   2 +-
 vllm/config.py                                |   6 +-
 vllm/engine/arg_utils.py                      |   2 +-
 vllm/engine/output_processor/multi_step.py    |   2 +-
 vllm/executor/cpu_executor.py                 |   2 +-
 vllm/platforms/cpu.py                         |   2 +-
 vllm/spec_decode/spec_decode_worker.py        |   2 +-
 vllm/utils.py                                 |   2 +-
 vllm/worker/multi_step_model_runner.py        |   2 +-
 vllm/worker/utils.py                          |   2 +-
 45 files changed, 265 insertions(+), 238 deletions(-)
 rename docs/source/assets/{usage => features}/disagg_prefill/abstraction.jpg (100%)
 rename docs/source/assets/{usage => features}/disagg_prefill/overview.jpg (100%)
 create mode 100644 docs/source/contributing/model/basic.md
 create mode 100644 docs/source/contributing/model/index.md
 rename docs/source/{models/enabling_multimodal_inputs.md => contributing/model/multimodal.md} (96%)
 create mode 100644 docs/source/contributing/model/registration.md
 rename docs/source/{automatic_prefix_caching/details.md => design/automatic_prefix_caching.md} (90%)
 rename docs/source/{automatic_prefix_caching/apc.md => features/automatic_prefix_caching.md} (97%)
 rename docs/source/{usage => features}/compatibility_matrix.md (98%)
 rename docs/source/{usage => features}/disagg_prefill.md (96%)
 rename docs/source/{usage => features}/lora.md (100%)
 rename docs/source/{usage => features}/multimodal_inputs.md (100%)
 rename docs/source/{ => features}/quantization/auto_awq.md (100%)
 rename docs/source/{ => features}/quantization/bnb.md (100%)
 rename docs/source/{ => features}/quantization/fp8.md (100%)
 rename docs/source/{ => features}/quantization/fp8_e4m3_kvcache.md (100%)
 rename docs/source/{ => features}/quantization/fp8_e5m2_kvcache.md (100%)
 rename docs/source/{ => features}/quantization/gguf.md (100%)
 create mode 100644 docs/source/features/quantization/index.md
 rename docs/source/{ => features}/quantization/int8.md (100%)
 rename docs/source/{ => features}/quantization/supported_hardware.md (86%)
 rename docs/source/{usage => features}/spec_decode.md (100%)
 rename docs/source/{usage => features}/structured_outputs.md (100%)
 rename docs/source/{usage => features}/tool_calling.md (100%)
 delete mode 100644 docs/source/models/adding_model.md
 rename docs/source/{usage/performance.md => performance/optimization.md} (98%)
 rename docs/source/{usage => serving}/engine_args.md (100%)
 rename docs/source/{usage => serving}/env_vars.md (100%)
 rename docs/source/{usage => serving}/usage_stats.md (100%)

diff --git a/.github/ISSUE_TEMPLATE/600-new-model.yml b/.github/ISSUE_TEMPLATE/600-new-model.yml
index 794617a0cfdf6..713e76c1a5cec 100644
--- a/.github/ISSUE_TEMPLATE/600-new-model.yml
+++ b/.github/ISSUE_TEMPLATE/600-new-model.yml
@@ -9,7 +9,7 @@ body:
     value: >
       #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
 
-      #### We also highly recommend you read https://docs.vllm.ai/en/latest/models/adding_model.html first to understand how to add a new model.
+      #### We also highly recommend you read https://docs.vllm.ai/en/latest/contributing/model/adding_model.html first to understand how to add a new model.
 - type: textarea
   attributes:
     label: The model to consider.
diff --git a/docs/source/assets/usage/disagg_prefill/abstraction.jpg b/docs/source/assets/features/disagg_prefill/abstraction.jpg
similarity index 100%
rename from docs/source/assets/usage/disagg_prefill/abstraction.jpg
rename to docs/source/assets/features/disagg_prefill/abstraction.jpg
diff --git a/docs/source/assets/usage/disagg_prefill/overview.jpg b/docs/source/assets/features/disagg_prefill/overview.jpg
similarity index 100%
rename from docs/source/assets/usage/disagg_prefill/overview.jpg
rename to docs/source/assets/features/disagg_prefill/overview.jpg
diff --git a/docs/source/contributing/model/basic.md b/docs/source/contributing/model/basic.md
new file mode 100644
index 0000000000000..14690ffe24a83
--- /dev/null
+++ b/docs/source/contributing/model/basic.md
@@ -0,0 +1,102 @@
+(new-model-basic)=
+
+# Basic Implementation
+
+This guide walks you through the steps to implement a basic vLLM model.
+
+## 1. Bring your model code
+
+First, clone the PyTorch model code from the source repository.
+For instance, vLLM's [OPT model](gh-file:vllm/model_executor/models/opt.py) was adapted from
+HuggingFace's [modeling_opt.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py) file.
+
+```{warning}
+Make sure to review and adhere to the original code's copyright and licensing terms!
+```
+
+## 2. Make your code compatible with vLLM
+
+To ensure compatibility with vLLM, your model must meet the following requirements:
+
+### Initialization Code
+
+All vLLM modules within the model must include a `prefix` argument in their constructor. This `prefix` is typically the full name of the module in the model's state dictionary and is crucial for:
+
+- Runtime support: vLLM's attention operators are registered in a model's state by their full names. Each attention operator must have a unique prefix as its layer name to avoid conflicts.
+- Non-uniform quantization support: A quantized checkpoint can selectively quantize certain layers while keeping others in full precision. By providing the `prefix` during initialization, vLLM can match the current layer's `prefix` with the quantization configuration to determine if the layer should be initialized in quantized mode.
+
+The initialization code should look like this:
+
+```python
+from torch import nn
+from vllm.config import VllmConfig
+from vllm.attention import Attention
+
+class MyAttention(nn.Module):
+    def __init__(self, vllm_config: VllmConfig, prefix: str):
+        super().__init__()
+        self.attn = Attention(prefix=f"{prefix}.attn")
+
+class MyDecoderLayer(nn.Module):
+    def __init__(self, vllm_config: VllmConfig, prefix: str):
+        super().__init__()
+        self.self_attn = MyAttention(prefix=f"{prefix}.self_attn")
+
+class MyModel(nn.Module):
+    def __init__(self, vllm_config: VllmConfig, prefix: str):
+        super().__init__()
+        self.layers = nn.ModuleList(
+            [MyDecoderLayer(vllm_config, prefix=f"{prefix}.layers.{i}") for i in range(vllm_config.model_config.hf_config.num_hidden_layers)]
+        )
+
+class MyModelForCausalLM(nn.Module):
+    def __init__(self, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+        self.model = MyModel(vllm_config, prefix=f"{prefix}.model")
+```
+
+### Computation Code
+
+Rewrite the {meth}`~torch.nn.Module.forward` method of your model to remove any unnecessary code, such as training-specific code. Modify the input parameters to treat `input_ids` and `positions` as flattened tensors with a single batch size dimension, without a max-sequence length dimension.
+
+```python
+def forward(
+    self,
+    input_ids: torch.Tensor,
+    positions: torch.Tensor,
+    kv_caches: List[torch.Tensor],
+    attn_metadata: AttentionMetadata,
+) -> torch.Tensor:
+    ...
+```
+
+```{note}
+Currently, vLLM supports the basic multi-head attention mechanism and its variant with rotary positional embeddings.
+If your model employs a different attention mechanism, you will need to implement a new attention layer in vLLM.
+```
+
+For reference, check out our [Llama implementation](gh-file:vllm/model_executor/models/llama.py). vLLM already supports a large number of models. It is recommended to find a model similar to yours and adapt it to your model's architecture. Check out <gh-dir:vllm/model_executor/models> for more examples.
+
+## 3. (Optional) Implement tensor parallelism and quantization support
+
+If your model is too large to fit into a single GPU, you can use tensor parallelism to manage it.
+To do this, substitute your model's linear and embedding layers with their tensor-parallel versions.
+For the embedding layer, you can simply replace {class}`torch.nn.Embedding` with `VocabParallelEmbedding`. For the output LM head, you can use `ParallelLMHead`.
+When it comes to the linear layers, we provide the following options to parallelize them:
+
+- `ReplicatedLinear`: Replicates the inputs and weights across multiple GPUs. No memory saving.
+- `RowParallelLinear`: The input tensor is partitioned along the hidden dimension. The weight matrix is partitioned along the rows (input dimension). An *all-reduce* operation is performed after the matrix multiplication to reduce the results. Typically used for the second FFN layer and the output linear transformation of the attention layer.
+- `ColumnParallelLinear`: The input tensor is replicated. The weight matrix is partitioned along the columns (output dimension). The result is partitioned along the column dimension. Typically used for the first FFN layer and the separated QKV transformation of the attention layer in the original Transformer.
+- `MergedColumnParallelLinear`: Column-parallel linear that merges multiple `ColumnParallelLinear` operators. Typically used for the first FFN layer with weighted activation functions (e.g., SiLU). This class handles the sharded weight loading logic of multiple weight matrices.
+- `QKVParallelLinear`: Parallel linear layer for the query, key, and value projections of the multi-head and grouped-query attention mechanisms. When number of key/value heads are less than the world size, this class replicates the key/value heads properly. This class handles the weight loading and replication of the weight matrices.
+
+Note that all the linear layers above take `linear_method` as an input. vLLM will set this parameter according to different quantization schemes to support weight quantization.
+
+## 4. Implement the weight loading logic
+
+You now need to implement the `load_weights` method in your `*ForCausalLM` class.
+This method should load the weights from the HuggingFace's checkpoint file and assign them to the corresponding layers in your model. Specifically, for `MergedColumnParallelLinear` and `QKVParallelLinear` layers, if the original model has separated weight matrices, you need to load the different parts separately.
+
+## 5. Register your model
+
+See [this page](#new-model-registration) for instructions on how to register your new model to be used by vLLM.
diff --git a/docs/source/contributing/model/index.md b/docs/source/contributing/model/index.md
new file mode 100644
index 0000000000000..a2d601c83cf47
--- /dev/null
+++ b/docs/source/contributing/model/index.md
@@ -0,0 +1,26 @@
+(new-model)=
+
+# Adding a New Model
+
+This section provides more information on how to integrate a [HuggingFace Transformers](https://github.com/huggingface/transformers) model into vLLM.
+
+```{toctree}
+:caption: Contents
+:maxdepth: 1
+
+basic
+registration
+multimodal
+```
+
+```{note}
+The complexity of adding a new model depends heavily on the model's architecture.
+The process is considerably straightforward if the model shares a similar architecture with an existing model in vLLM.
+However, for models that include new operators (e.g., a new attention mechanism), the process can be a bit more complex.
+```
+
+```{tip}
+If you are encountering issues while integrating your model into vLLM, feel free to open a [GitHub issue](https://github.com/vllm-project/vllm/issues)
+or ask on our [developer slack](https://slack.vllm.ai).
+We will be happy to help you out!
+```
diff --git a/docs/source/models/enabling_multimodal_inputs.md b/docs/source/contributing/model/multimodal.md
similarity index 96%
rename from docs/source/models/enabling_multimodal_inputs.md
rename to docs/source/contributing/model/multimodal.md
index fdd770887900e..e5dcd1223b361 100644
--- a/docs/source/models/enabling_multimodal_inputs.md
+++ b/docs/source/contributing/model/multimodal.md
@@ -2,15 +2,11 @@
 
 # Enabling Multimodal Inputs
 
-This document walks you through the steps to extend a vLLM model so that it accepts [multi-modal inputs](#multimodal-inputs).
-
-```{seealso}
-[Adding a New Model](adding-a-new-model)
-```
+This document walks you through the steps to extend a basic model so that it accepts [multi-modal inputs](#multimodal-inputs).
 
 ## 1. Update the base vLLM model
 
-It is assumed that you have already implemented the model in vLLM according to [these steps](#adding-a-new-model).
+It is assumed that you have already implemented the model in vLLM according to [these steps](#new-model-basic).
 Further update the model as follows:
 
 - Implement the {class}`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface.
diff --git a/docs/source/contributing/model/registration.md b/docs/source/contributing/model/registration.md
new file mode 100644
index 0000000000000..cf1cdb0c9de0f
--- /dev/null
+++ b/docs/source/contributing/model/registration.md
@@ -0,0 +1,56 @@
+(new-model-registration)=
+
+# Model Registration
+
+vLLM relies on a model registry to determine how to run each model.
+A list of pre-registered architectures can be found on the [Supported Models](#supported-models) page.
+
+If your model is not on this list, you must register it to vLLM.
+This page provides detailed instructions on how to do so.
+
+## Built-in models
+
+To add a model directly to the vLLM library, start by forking our [GitHub repository](https://github.com/vllm-project/vllm) and then [build it from source](#build-from-source).
+This gives you the ability to modify the codebase and test your model.
+
+After you have implemented your model (see [tutorial](#new-model-basic)), put it into the <gh-dir:vllm/model_executor/models> directory.
+Then, add your model class to `_VLLM_MODELS` in <gh-file:vllm/model_executor/models/registry.py> so that it is automatically registered upon importing vLLM.
+You should also include an example HuggingFace repository for this model in <gh-file:tests/models/registry.py> to run the unit tests.
+Finally, update the [Supported Models](#supported-models) documentation page to promote your model!
+
+```{important}
+The list of models in each section should be maintained in alphabetical order.
+```
+
+## Out-of-tree models
+
+You can load an external model using a plugin without modifying the vLLM codebase.
+
+```{seealso}
+[vLLM's Plugin System](#plugin-system)
+```
+
+To register the model, use the following code:
+
+```python
+from vllm import ModelRegistry
+from your_code import YourModelForCausalLM
+ModelRegistry.register_model("YourModelForCausalLM", YourModelForCausalLM)
+```
+
+If your model imports modules that initialize CUDA, consider lazy-importing it to avoid errors like `RuntimeError: Cannot re-initialize CUDA in forked subprocess`:
+
+```python
+from vllm import ModelRegistry
+
+ModelRegistry.register_model("YourModelForCausalLM", "your_code:YourModelForCausalLM")
+```
+
+```{important}
+If your model is a multimodal model, ensure the model class implements the {class}`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface.
+Read more about that [here](#enabling-multimodal-inputs).
+```
+
+```{note}
+Although you can directly put these code snippets in your script using `vllm.LLM`, the recommended way is to place these snippets in a vLLM plugin. This ensures compatibility with various vLLM features like distributed inference and the API server.
+```
diff --git a/docs/source/automatic_prefix_caching/details.md b/docs/source/design/automatic_prefix_caching.md
similarity index 90%
rename from docs/source/automatic_prefix_caching/details.md
rename to docs/source/design/automatic_prefix_caching.md
index 17f806217aa65..4398536b2b4ad 100644
--- a/docs/source/automatic_prefix_caching/details.md
+++ b/docs/source/design/automatic_prefix_caching.md
@@ -1,6 +1,8 @@
-# Implementation
+(design-automatic-prefix-caching)=
 
-The core idea of PagedAttention is to partition the KV cache of each request into KV Blocks. Each block contains the attention keys and values for a fixed number of tokens. The PagedAttention algorithm allows these blocks to be stored in non-contiguous physical memory so that we can eliminate memory fragmentation by allocating the memory on demand.
+# Automatic Prefix Caching
+
+The core idea of [PagedAttention](#design-paged-attention) is to partition the KV cache of each request into KV Blocks. Each block contains the attention keys and values for a fixed number of tokens. The PagedAttention algorithm allows these blocks to be stored in non-contiguous physical memory so that we can eliminate memory fragmentation by allocating the memory on demand.
 
 To automatically cache the KV cache, we utilize the following key observation: Each KV block can be uniquely identified by the tokens within the block and the tokens in the prefix before the block.
 
diff --git a/docs/source/design/kernel/paged_attention.md b/docs/source/design/kernel/paged_attention.md
index c21985b36eb3a..f896f903c78f5 100644
--- a/docs/source/design/kernel/paged_attention.md
+++ b/docs/source/design/kernel/paged_attention.md
@@ -1,3 +1,5 @@
+(design-paged-attention)=
+
 # vLLM Paged Attention
 
 - Currently, vLLM utilizes its own implementation of a multi-head query
diff --git a/docs/source/dev/offline_inference/offline_index.md b/docs/source/dev/offline_inference/offline_index.md
index 318a02d8c78df..c32f99d59e3db 100644
--- a/docs/source/dev/offline_inference/offline_index.md
+++ b/docs/source/dev/offline_inference/offline_index.md
@@ -1,6 +1,7 @@
 # Offline Inference
 
 ```{toctree}
+:caption: Contents
 :maxdepth: 1
 
 llm
diff --git a/docs/source/automatic_prefix_caching/apc.md b/docs/source/features/automatic_prefix_caching.md
similarity index 97%
rename from docs/source/automatic_prefix_caching/apc.md
rename to docs/source/features/automatic_prefix_caching.md
index c0c141c5fb7ef..3d70cbb29c385 100644
--- a/docs/source/automatic_prefix_caching/apc.md
+++ b/docs/source/features/automatic_prefix_caching.md
@@ -1,13 +1,13 @@
-(apc)=
+(automatic-prefix-caching)=
 
-# Introduction
+# Automatic Prefix Caching
 
-## What is Automatic Prefix Caching
+## Introduction
 
 Automatic Prefix Caching (APC in short) caches the KV cache of existing queries, so that a new query can directly reuse the KV cache if it shares the same prefix with one of the existing queries, allowing the new query to skip the computation of the shared part.
 
 ```{note}
-Technical details on how vLLM implements APC are in the next page.
+Technical details on how vLLM implements APC can be found [here](#design-automatic-prefix-caching).
 ```
 
 ## Enabling APC in vLLM
diff --git a/docs/source/usage/compatibility_matrix.md b/docs/source/features/compatibility_matrix.md
similarity index 98%
rename from docs/source/usage/compatibility_matrix.md
rename to docs/source/features/compatibility_matrix.md
index 3cefa12ea8a1d..8d8f7dca2e5b5 100644
--- a/docs/source/usage/compatibility_matrix.md
+++ b/docs/source/features/compatibility_matrix.md
@@ -32,7 +32,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar
 
    * - Feature
      - [CP](#chunked-prefill)
-     - [APC](#apc)
+     - [APC](#automatic-prefix-caching)
      - [LoRA](#lora-adapter)
      - <abbr title="Prompt Adapter">prmpt adptr</abbr>
      - [SD](#spec_decode)
@@ -64,7 +64,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar
      -
      -
      -
-   * - [APC](#apc)
+   * - [APC](#automatic-prefix-caching)
      - ✅
      -
      -
@@ -345,7 +345,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar
      - ✅
      - ✅
      - ✅
-   * - [APC](#apc)
+   * - [APC](#automatic-prefix-caching)
      - [✗](gh-issue:3687)
      - ✅
      - ✅
diff --git a/docs/source/usage/disagg_prefill.md b/docs/source/features/disagg_prefill.md
similarity index 96%
rename from docs/source/usage/disagg_prefill.md
rename to docs/source/features/disagg_prefill.md
index a61c00fad1e3c..05226f2dec87c 100644
--- a/docs/source/usage/disagg_prefill.md
+++ b/docs/source/features/disagg_prefill.md
@@ -41,13 +41,13 @@ Key abstractions for disaggregated prefilling:
 
 Here is a figure illustrating how the above 3 abstractions are organized:
 
-```{image} /assets/usage/disagg_prefill/abstraction.jpg
+```{image} /assets/features/disagg_prefill/abstraction.jpg
 :alt: Disaggregated prefilling abstractions
 ```
 
 The workflow of disaggregated prefilling is as follows:
 
-```{image} /assets/usage/disagg_prefill/overview.jpg
+```{image} /assets/features/disagg_prefill/overview.jpg
 :alt: Disaggregated prefilling workflow
 ```
 
diff --git a/docs/source/usage/lora.md b/docs/source/features/lora.md
similarity index 100%
rename from docs/source/usage/lora.md
rename to docs/source/features/lora.md
diff --git a/docs/source/usage/multimodal_inputs.md b/docs/source/features/multimodal_inputs.md
similarity index 100%
rename from docs/source/usage/multimodal_inputs.md
rename to docs/source/features/multimodal_inputs.md
diff --git a/docs/source/quantization/auto_awq.md b/docs/source/features/quantization/auto_awq.md
similarity index 100%
rename from docs/source/quantization/auto_awq.md
rename to docs/source/features/quantization/auto_awq.md
diff --git a/docs/source/quantization/bnb.md b/docs/source/features/quantization/bnb.md
similarity index 100%
rename from docs/source/quantization/bnb.md
rename to docs/source/features/quantization/bnb.md
diff --git a/docs/source/quantization/fp8.md b/docs/source/features/quantization/fp8.md
similarity index 100%
rename from docs/source/quantization/fp8.md
rename to docs/source/features/quantization/fp8.md
diff --git a/docs/source/quantization/fp8_e4m3_kvcache.md b/docs/source/features/quantization/fp8_e4m3_kvcache.md
similarity index 100%
rename from docs/source/quantization/fp8_e4m3_kvcache.md
rename to docs/source/features/quantization/fp8_e4m3_kvcache.md
diff --git a/docs/source/quantization/fp8_e5m2_kvcache.md b/docs/source/features/quantization/fp8_e5m2_kvcache.md
similarity index 100%
rename from docs/source/quantization/fp8_e5m2_kvcache.md
rename to docs/source/features/quantization/fp8_e5m2_kvcache.md
diff --git a/docs/source/quantization/gguf.md b/docs/source/features/quantization/gguf.md
similarity index 100%
rename from docs/source/quantization/gguf.md
rename to docs/source/features/quantization/gguf.md
diff --git a/docs/source/features/quantization/index.md b/docs/source/features/quantization/index.md
new file mode 100644
index 0000000000000..861cb165c11c2
--- /dev/null
+++ b/docs/source/features/quantization/index.md
@@ -0,0 +1,19 @@
+(quantization-index)=
+
+# Quantization
+
+Quantization trades off model precision for smaller memory footprint, allowing large models to be run on a wider range of devices.
+
+```{toctree}
+:caption: Contents
+:maxdepth: 1
+
+supported_hardware
+auto_awq
+bnb
+gguf
+int8
+fp8
+fp8_e5m2_kvcache
+fp8_e4m3_kvcache
+```
diff --git a/docs/source/quantization/int8.md b/docs/source/features/quantization/int8.md
similarity index 100%
rename from docs/source/quantization/int8.md
rename to docs/source/features/quantization/int8.md
diff --git a/docs/source/quantization/supported_hardware.md b/docs/source/features/quantization/supported_hardware.md
similarity index 86%
rename from docs/source/quantization/supported_hardware.md
rename to docs/source/features/quantization/supported_hardware.md
index 7330c2f8aa194..988288a82d9bc 100644
--- a/docs/source/quantization/supported_hardware.md
+++ b/docs/source/features/quantization/supported_hardware.md
@@ -1,6 +1,6 @@
-(supported-hardware-for-quantization)=
+(quantization-supported-hardware)=
 
-# Supported Hardware for Quantization Kernels
+# Supported Hardware
 
 The table below shows the compatibility of various quantization implementations with different hardware platforms in vLLM:
 
@@ -120,12 +120,12 @@ The table below shows the compatibility of various quantization implementations
   - ✗
 ```
 
-## Notes:
-
 - Volta refers to SM 7.0, Turing to SM 7.5, Ampere to SM 8.0/8.6, Ada to SM 8.9, and Hopper to SM 9.0.
 - "✅︎" indicates that the quantization method is supported on the specified hardware.
 - "✗" indicates that the quantization method is not supported on the specified hardware.
 
-Please note that this compatibility chart may be subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods.
+```{note}
+This compatibility chart is subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods.
 
 For the most up-to-date information on hardware support and quantization methods, please refer to <gh-dir:vllm/model_executor/layers/quantization> or consult with the vLLM development team.
+```
diff --git a/docs/source/usage/spec_decode.md b/docs/source/features/spec_decode.md
similarity index 100%
rename from docs/source/usage/spec_decode.md
rename to docs/source/features/spec_decode.md
diff --git a/docs/source/usage/structured_outputs.md b/docs/source/features/structured_outputs.md
similarity index 100%
rename from docs/source/usage/structured_outputs.md
rename to docs/source/features/structured_outputs.md
diff --git a/docs/source/usage/tool_calling.md b/docs/source/features/tool_calling.md
similarity index 100%
rename from docs/source/usage/tool_calling.md
rename to docs/source/features/tool_calling.md
diff --git a/docs/source/index.md b/docs/source/index.md
index f390474978790..4bc40bf0f5e41 100644
--- a/docs/source/index.md
+++ b/docs/source/index.md
@@ -79,6 +79,9 @@ serving/metrics
 serving/integrations
 serving/tensorizer
 serving/runai_model_streamer
+serving/engine_args
+serving/env_vars
+serving/usage_stats
 ```
 
 ```{toctree}
@@ -88,53 +91,28 @@ serving/runai_model_streamer
 models/supported_models
 models/generative_models
 models/pooling_models
-models/adding_model
-models/enabling_multimodal_inputs
 ```
 
 ```{toctree}
-:caption: Usage
+:caption: Features
 :maxdepth: 1
 
-usage/lora
-usage/multimodal_inputs
-usage/tool_calling
-usage/structured_outputs
-usage/spec_decode
-usage/compatibility_matrix
-usage/performance
-usage/engine_args
-usage/env_vars
-usage/usage_stats
-usage/disagg_prefill
-```
-
-```{toctree}
-:caption: Quantization
-:maxdepth: 1
-
-quantization/supported_hardware
-quantization/auto_awq
-quantization/bnb
-quantization/gguf
-quantization/int8
-quantization/fp8
-quantization/fp8_e5m2_kvcache
-quantization/fp8_e4m3_kvcache
-```
-
-```{toctree}
-:caption: Automatic Prefix Caching
-:maxdepth: 1
-
-automatic_prefix_caching/apc
-automatic_prefix_caching/details
+features/quantization/index
+features/lora
+features/multimodal_inputs
+features/tool_calling
+features/structured_outputs
+features/automatic_prefix_caching
+features/disagg_prefill
+features/spec_decode
+features/compatibility_matrix
 ```
 
 ```{toctree}
 :caption: Performance
 :maxdepth: 1
 
+performance/optimization
 performance/benchmarks
 ```
 
@@ -148,10 +126,8 @@ community/meetups
 community/sponsors
 ```
 
-% API Documentation: API reference aimed at vllm library usage
-
 ```{toctree}
-:caption: API Documentation
+:caption: API Reference
 :maxdepth: 2
 
 dev/sampling_params
@@ -160,30 +136,32 @@ dev/offline_inference/offline_index
 dev/engine/engine_index
 ```
 
-% Design: docs about vLLM internals
+% Design Documents: Details about vLLM internals
 
 ```{toctree}
-:caption: Design
+:caption: Design Documents
 :maxdepth: 2
 
 design/arch_overview
 design/huggingface_integration
 design/plugin_system
-design/input_processing/model_inputs_index
 design/kernel/paged_attention
+design/input_processing/model_inputs_index
 design/multimodal/multimodal_index
+design/automatic_prefix_caching
 design/multiprocessing
 ```
 
-% For Developers: contributing to the vLLM project
+% Developer Guide: How to contribute to the vLLM project
 
 ```{toctree}
-:caption: For Developers
+:caption: Developer Guide
 :maxdepth: 2
 
 contributing/overview
 contributing/profiling/profiling_index
 contributing/dockerfile/dockerfile
+contributing/model/index
 ```
 
 # Indices and tables
diff --git a/docs/source/models/adding_model.md b/docs/source/models/adding_model.md
deleted file mode 100644
index 02537fba020c4..0000000000000
--- a/docs/source/models/adding_model.md
+++ /dev/null
@@ -1,155 +0,0 @@
-(adding-a-new-model)=
-
-# Adding a New Model
-
-This document provides a high-level guide on integrating a [HuggingFace Transformers](https://github.com/huggingface/transformers) model into vLLM.
-
-```{note}
-The complexity of adding a new model depends heavily on the model's architecture.
-The process is considerably straightforward if the model shares a similar architecture with an existing model in vLLM.
-However, for models that include new operators (e.g., a new attention mechanism), the process can be a bit more complex.
-```
-
-```{note}
-By default, vLLM models do not support multi-modal inputs. To enable multi-modal support,
-please follow [this guide](#enabling-multimodal-inputs) after implementing the model here.
-```
-
-```{tip}
-If you are encountering issues while integrating your model into vLLM, feel free to open an issue on our [GitHub](https://github.com/vllm-project/vllm/issues) repository.
-We will be happy to help you out!
-```
-
-## 0. Fork the vLLM repository
-
-Start by forking our [GitHub] repository and then [build it from source](#build-from-source).
-This gives you the ability to modify the codebase and test your model.
-
-```{tip}
-If you don't want to fork the repository and modify vLLM's codebase, please refer to the "Out-of-Tree Model Integration" section below.
-```
-
-## 1. Bring your model code
-
-Clone the PyTorch model code from the HuggingFace Transformers repository and put it into the <gh-dir:vllm/model_executor/models> directory.
-For instance, vLLM's [OPT model](gh-file:vllm/model_executor/models/opt.py) was adapted from the HuggingFace's [modeling_opt.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py) file.
-
-```{warning}
-When copying the model code, make sure to review and adhere to the code's copyright and licensing terms.
-```
-
-## 2. Make your code compatible with vLLM
-
-To ensure compatibility with vLLM, your model must meet the following requirements:
-
-### Initialization Code
-
-All vLLM modules within the model must include a `prefix` argument in their constructor. This `prefix` is typically the full name of the module in the model's state dictionary and is crucial for:
-
-- Runtime support: vLLM's attention operators are registered in a model's state by their full names. Each attention operator must have a unique prefix as its layer name to avoid conflicts.
-- Non-uniform quantization support: A quantized checkpoint can selectively quantize certain layers while keeping others in full precision. By providing the `prefix` during initialization, vLLM can match the current layer's `prefix` with the quantization configuration to determine if the layer should be initialized in quantized mode.
-
-The initialization code should look like this:
-
-```python
-from torch import nn
-from vllm.config import VllmConfig
-from vllm.attention import Attention
-
-class MyAttention(nn.Module):
-    def __init__(self, vllm_config: VllmConfig, prefix: str):
-        super().__init__()
-        self.attn = Attention(prefix=f"{prefix}.attn")
-
-class MyDecoderLayer(nn.Module):
-    def __init__(self, vllm_config: VllmConfig, prefix: str):
-        super().__init__()
-        self.self_attn = MyAttention(prefix=f"{prefix}.self_attn")
-
-class MyModel(nn.Module):
-    def __init__(self, vllm_config: VllmConfig, prefix: str):
-        super().__init__()
-        self.layers = nn.ModuleList(
-            [MyDecoderLayer(vllm_config, prefix=f"{prefix}.layers.{i}") for i in range(vllm_config.model_config.hf_config.num_hidden_layers)]
-        )
-
-class MyModelForCausalLM(nn.Module):
-    def __init__(self, vllm_config: VllmConfig, prefix: str = ""):
-        super().__init__()
-        self.model = MyModel(vllm_config, prefix=f"{prefix}.model")
-```
-
-### Computation Code
-
-Rewrite the {meth}`~torch.nn.Module.forward` method of your model to remove any unnecessary code, such as training-specific code. Modify the input parameters to treat `input_ids` and `positions` as flattened tensors with a single batch size dimension, without a max-sequence length dimension.
-
-```python
-def forward(
-    self,
-    input_ids: torch.Tensor,
-    positions: torch.Tensor,
-    kv_caches: List[torch.Tensor],
-    attn_metadata: AttentionMetadata,
-) -> torch.Tensor:
-    ...
-```
-
-```{note}
-Currently, vLLM supports the basic multi-head attention mechanism and its variant with rotary positional embeddings.
-If your model employs a different attention mechanism, you will need to implement a new attention layer in vLLM.
-```
-
-For reference, check out our [Llama implementation](gh-file:vllm/model_executor/models/llama.py). vLLM already supports a large number of models. It is recommended to find a model similar to yours and adapt it to your model's architecture. Check out <gh-dir:vllm/model_executor/models> for more examples.
-
-## 3. (Optional) Implement tensor parallelism and quantization support
-
-If your model is too large to fit into a single GPU, you can use tensor parallelism to manage it.
-To do this, substitute your model's linear and embedding layers with their tensor-parallel versions.
-For the embedding layer, you can simply replace {class}`torch.nn.Embedding` with {code}`VocabParallelEmbedding`. For the output LM head, you can use {code}`ParallelLMHead`.
-When it comes to the linear layers, we provide the following options to parallelize them:
-
-- {code}`ReplicatedLinear`: Replicates the inputs and weights across multiple GPUs. No memory saving.
-- {code}`RowParallelLinear`: The input tensor is partitioned along the hidden dimension. The weight matrix is partitioned along the rows (input dimension). An *all-reduce* operation is performed after the matrix multiplication to reduce the results. Typically used for the second FFN layer and the output linear transformation of the attention layer.
-- {code}`ColumnParallelLinear`: The input tensor is replicated. The weight matrix is partitioned along the columns (output dimension). The result is partitioned along the column dimension. Typically used for the first FFN layer and the separated QKV transformation of the attention layer in the original Transformer.
-- {code}`MergedColumnParallelLinear`: Column-parallel linear that merges multiple {code}`ColumnParallelLinear` operators. Typically used for the first FFN layer with weighted activation functions (e.g., SiLU). This class handles the sharded weight loading logic of multiple weight matrices.
-- {code}`QKVParallelLinear`: Parallel linear layer for the query, key, and value projections of the multi-head and grouped-query attention mechanisms. When number of key/value heads are less than the world size, this class replicates the key/value heads properly. This class handles the weight loading and replication of the weight matrices.
-
-Note that all the linear layers above take {code}`linear_method` as an input. vLLM will set this parameter according to different quantization schemes to support weight quantization.
-
-## 4. Implement the weight loading logic
-
-You now need to implement the {code}`load_weights` method in your {code}`*ForCausalLM` class.
-This method should load the weights from the HuggingFace's checkpoint file and assign them to the corresponding layers in your model. Specifically, for {code}`MergedColumnParallelLinear` and {code}`QKVParallelLinear` layers, if the original model has separated weight matrices, you need to load the different parts separately.
-
-## 5. Register your model
-
-Finally, register your {code}`*ForCausalLM` class to the {code}`_VLLM_MODELS` in <gh-file:vllm/model_executor/models/registry.py>.
-
-## 6. Out-of-Tree Model Integration
-
-You can integrate a model without modifying the vLLM codebase. Steps 2, 3, and 4 are still required, but you can skip steps 1 and 5. Instead, write a plugin to register your model. For general introduction of the plugin system, see [plugin-system](#plugin-system).
-
-To register the model, use the following code:
-
-```python
-from vllm import ModelRegistry
-from your_code import YourModelForCausalLM
-ModelRegistry.register_model("YourModelForCausalLM", YourModelForCausalLM)
-```
-
-If your model imports modules that initialize CUDA, consider lazy-importing it to avoid errors like {code}`RuntimeError: Cannot re-initialize CUDA in forked subprocess`:
-
-```python
-from vllm import ModelRegistry
-
-ModelRegistry.register_model("YourModelForCausalLM", "your_code:YourModelForCausalLM")
-```
-
-```{important}
-If your model is a multimodal model, ensure the model class implements the {class}`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface.
-Read more about that [here](#enabling-multimodal-inputs).
-```
-
-```{note}
-Although you can directly put these code snippets in your script using `vllm.LLM`, the recommended way is to place these snippets in a vLLM plugin. This ensures compatibility with various vLLM features like distributed inference and the API server.
-```
diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md
index 7682ed104b8c5..5a2778026192a 100644
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -37,7 +37,7 @@ print(output)
 If vLLM successfully returns text (for generative models) or hidden states (for pooling models), it indicates that your model is supported.
 ````
 
-Otherwise, please refer to [Adding a New Model](#adding-a-new-model) and [Enabling Multimodal Inputs](#enabling-multimodal-inputs) for instructions on how to implement your model in vLLM.
+Otherwise, please refer to [Adding a New Model](#new-model) for instructions on how to implement your model in vLLM.
 Alternatively, you can [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) to request vLLM support.
 
 ### ModelScope
diff --git a/docs/source/usage/performance.md b/docs/source/performance/optimization.md
similarity index 98%
rename from docs/source/usage/performance.md
rename to docs/source/performance/optimization.md
index 2cd3801bfc82d..4fcde9b03b887 100644
--- a/docs/source/usage/performance.md
+++ b/docs/source/performance/optimization.md
@@ -1,6 +1,6 @@
-(performance)=
+(optimization-and-tuning)=
 
-# Performance and Tuning
+# Optimization and Tuning
 
 ## Preemption
 
diff --git a/docs/source/usage/engine_args.md b/docs/source/serving/engine_args.md
similarity index 100%
rename from docs/source/usage/engine_args.md
rename to docs/source/serving/engine_args.md
diff --git a/docs/source/usage/env_vars.md b/docs/source/serving/env_vars.md
similarity index 100%
rename from docs/source/usage/env_vars.md
rename to docs/source/serving/env_vars.md
diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md
index caf5e8cafd9aa..97e9879075570 100644
--- a/docs/source/serving/openai_compatible_server.md
+++ b/docs/source/serving/openai_compatible_server.md
@@ -217,7 +217,7 @@ you can use the [official OpenAI Python client](https://github.com/openai/openai
 
 We support both [Vision](https://platform.openai.com/docs/guides/vision)- and
 [Audio](https://platform.openai.com/docs/guides/audio?audio-generation-quickstart-example=audio-in)-related parameters;
-see our [Multimodal Inputs](../usage/multimodal_inputs.md) guide for more information.
+see our [Multimodal Inputs](#multimodal-inputs) guide for more information.
 - *Note: `image_url.detail` parameter is not supported.*
 
 Code example: <gh-file:examples/openai_chat_completion_client.py>
diff --git a/docs/source/usage/usage_stats.md b/docs/source/serving/usage_stats.md
similarity index 100%
rename from docs/source/usage/usage_stats.md
rename to docs/source/serving/usage_stats.md
diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py
index 480901f71047f..d43c15b661ef7 100644
--- a/vllm/attention/backends/rocm_flash_attn.py
+++ b/vllm/attention/backends/rocm_flash_attn.py
@@ -430,7 +430,7 @@ def forward(
         Returns:
             shape = [num_tokens, num_heads * head_size]
         """
-        # Reminder: Please update docs/source/usage/compatibility_matrix.md
+        # Reminder: Please update docs/source/features/compatibility_matrix.md
         # If the feature combo become valid
         if attn_type != AttentionType.DECODER:
             raise NotImplementedError("Encoder self-attention and "
diff --git a/vllm/config.py b/vllm/config.py
index b0ed88cb7f42b..8b824a1fca511 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -644,7 +644,7 @@ def verify_async_output_proc(self, parallel_config, speculative_config,
             self.use_async_output_proc = False
             return
 
-        # Reminder: Please update docs/source/usage/compatibility_matrix.md
+        # Reminder: Please update docs/source/features/compatibility_matrix.md
         # If the feature combo become valid
         from vllm.platforms import current_platform
         if not current_platform.is_async_output_supported(self.enforce_eager):
@@ -665,7 +665,7 @@ def verify_async_output_proc(self, parallel_config, speculative_config,
         if self.runner_type == "pooling":
             self.use_async_output_proc = False
 
-        # Reminder: Please update docs/source/usage/compatibility_matrix.md
+        # Reminder: Please update docs/source/features/compatibility_matrix.md
         # If the feature combo become valid
         if speculative_config:
             logger.warning("Async output processing is not supported with"
@@ -2064,7 +2064,7 @@ def verify_with_model_config(self, model_config: ModelConfig):
                            model_config.quantization)
 
     def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig):
-        # Reminder: Please update docs/source/usage/compatibility_matrix.md
+        # Reminder: Please update docs/source/features/compatibility_matrix.md
         # If the feature combo become valid
         if scheduler_config.chunked_prefill_enabled:
             logger.warning("LoRA with chunked prefill is still experimental "
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 69c7c5077fe32..e94664308cf8d 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1148,7 +1148,7 @@ def create_engine_config(self,
             disable_logprobs=self.disable_logprobs_during_spec_decoding,
         )
 
-        # Reminder: Please update docs/source/usage/compatibility_matrix.md
+        # Reminder: Please update docs/source/features/compatibility_matrix.md
         # If the feature combo become valid
         if self.num_scheduler_steps > 1:
             if speculative_config is not None:
diff --git a/vllm/engine/output_processor/multi_step.py b/vllm/engine/output_processor/multi_step.py
index 1c6f735f39e04..c8b282b1a7676 100644
--- a/vllm/engine/output_processor/multi_step.py
+++ b/vllm/engine/output_processor/multi_step.py
@@ -65,7 +65,7 @@ def process_prompt_logprob(self, seq_group: SequenceGroup,
     @staticmethod
     @functools.lru_cache
     def _log_prompt_logprob_unsupported_warning_once():
-        # Reminder: Please update docs/source/usage/compatibility_matrix.md
+        # Reminder: Please update docs/source/features/compatibility_matrix.md
         # If the feature combo become valid
         logger.warning(
             "Prompt logprob is not supported by multi step workers. "
diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py
index 5495bc50ede83..c7f018d9a203e 100644
--- a/vllm/executor/cpu_executor.py
+++ b/vllm/executor/cpu_executor.py
@@ -22,7 +22,7 @@ class CPUExecutor(ExecutorBase):
 
     def _init_executor(self) -> None:
         assert self.device_config.device_type == "cpu"
-        # Reminder: Please update docs/source/usage/compatibility_matrix.md
+        # Reminder: Please update docs/source/features/compatibility_matrix.md
         # If the feature combo become valid
         assert self.lora_config is None, "cpu backend doesn't support LoRA"
 
diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py
index 09bde9f065eaa..7ba7f5150150c 100644
--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
@@ -50,7 +50,7 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         import vllm.envs as envs
         from vllm.utils import GiB_bytes
         model_config = vllm_config.model_config
-        # Reminder: Please update docs/source/usage/compatibility_matrix.md
+        # Reminder: Please update docs/source/features/compatibility_matrix.md
         # If the feature combo become valid
         if not model_config.enforce_eager:
             logger.warning(
diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py
index de593113b938b..e369da1a70c23 100644
--- a/vllm/spec_decode/spec_decode_worker.py
+++ b/vllm/spec_decode/spec_decode_worker.py
@@ -108,7 +108,7 @@ def create_spec_worker(*args, **kwargs) -> "SpecDecodeWorker":
     return spec_decode_worker
 
 
-# Reminder: Please update docs/source/usage/compatibility_matrix.md
+# Reminder: Please update docs/source/features/compatibility_matrix.md
 # If the feature combo become valid
 class SpecDecodeWorker(LoraNotSupportedWorkerBase):
     """Worker which implements speculative decoding.
diff --git a/vllm/utils.py b/vllm/utils.py
index aadeddabf8b55..63057153f851d 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -58,7 +58,7 @@
 
 # Exception strings for non-implemented encoder/decoder scenarios
 
-# Reminder: Please update docs/source/usage/compatibility_matrix.md
+# Reminder: Please update docs/source/features/compatibility_matrix.md
 # If the feature combo become valid
 
 STR_NOT_IMPL_ENC_DEC_SWA = \
diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py
index dee63a75c0605..a2c2cebf8d1f6 100644
--- a/vllm/worker/multi_step_model_runner.py
+++ b/vllm/worker/multi_step_model_runner.py
@@ -822,7 +822,7 @@ def _pythonize_sampler_output(
 
     for sgdx, (seq_group,
                sample_result) in enumerate(zip(seq_groups, samples_list)):
-        # Reminder: Please update docs/source/usage/compatibility_matrix.md
+        # Reminder: Please update docs/source/features/compatibility_matrix.md
         # If the feature combo become valid
         # (Check for Guided Decoding)
         if seq_group.sampling_params.logits_processors:
diff --git a/vllm/worker/utils.py b/vllm/worker/utils.py
index 8f2d343440d3e..ffa8c4cb0ff46 100644
--- a/vllm/worker/utils.py
+++ b/vllm/worker/utils.py
@@ -13,7 +13,7 @@ def assert_enc_dec_mr_supported_scenario(
     a supported scenario.
     '''
 
-    # Reminder: Please update docs/source/usage/compatibility_matrix.md
+    # Reminder: Please update docs/source/features/compatibility_matrix.md
     # If the feature combo become valid
 
     if enc_dec_mr.cache_config.enable_prefix_caching:

From 9279b9f83dd3aa5bb3d3ce57bf92d9361755d164 Mon Sep 17 00:00:00 2001
From: Roger Wang <136131678+ywang96@users.noreply.github.com>
Date: Mon, 6 Jan 2025 05:48:53 -0800
Subject: [PATCH 091/462] [Bugfix] Fix max image size for LLaVA-Onevision
 (#11769)

Signed-off-by: Roger Wang <ywang@roblox.com>
---
 vllm/model_executor/models/llava_onevision.py | 20 +++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py
index 6dccc1e0d3b8d..5eac2f223d794 100644
--- a/vllm/model_executor/models/llava_onevision.py
+++ b/vllm/model_executor/models/llava_onevision.py
@@ -19,8 +19,8 @@
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors
-from vllm.multimodal.parse import (MultiModalDataItems, VideoEmbeddingItems,
-                                   VideoProcessorItems)
+from vllm.multimodal.parse import (ImageSize, MultiModalDataItems,
+                                   VideoEmbeddingItems, VideoProcessorItems)
 from vllm.multimodal.processing import MultiModalFieldConfig, PromptReplacement
 from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs
 from vllm.sequence import IntermediateTensors
@@ -170,6 +170,22 @@ def _get_num_video_tokens(
 class LlavaOnevisionProfilingInfo(LlavaOnevisionProcessingMixin,
                                   BaseLlavaProfilingInfo):
 
+    def _get_image_size_with_most_features(self) -> ImageSize:
+        hf_config = self._get_hf_config()
+        largest_feature_size, largest_feature_pinpoint = 0, None
+        for (height, width) in hf_config.image_grid_pinpoints:
+            feat_size = self._get_num_image_tokens(image_width=width,
+                                                   image_height=height)
+            if feat_size > largest_feature_size:
+                largest_feature_size = feat_size
+                largest_feature_pinpoint = ImageSize(width=width,
+                                                     height=height)
+
+        if largest_feature_size == 0 or largest_feature_pinpoint is None:
+            raise ValueError("Cannot have a largest feature size of 0!")
+
+        return largest_feature_pinpoint
+
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": None, "video": None}
 

From 4ca5d40adc53aca2a1fbaed81d9d622fde46ebf1 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Mon, 6 Jan 2025 21:57:44 +0800
Subject: [PATCH 092/462] [doc] explain how to add interleaving sliding window
 support (#11771)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 docs/source/contributing/model/basic.md | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/docs/source/contributing/model/basic.md b/docs/source/contributing/model/basic.md
index 14690ffe24a83..002808ac5fbbd 100644
--- a/docs/source/contributing/model/basic.md
+++ b/docs/source/contributing/model/basic.md
@@ -100,3 +100,16 @@ This method should load the weights from the HuggingFace's checkpoint file and a
 ## 5. Register your model
 
 See [this page](#new-model-registration) for instructions on how to register your new model to be used by vLLM.
+
+## Frequently Asked Questions
+
+### How to support models with interleaving sliding windows?
+
+For models with interleaving sliding windows (e.g. `google/gemma-2-2b-it` and `mistralai/Ministral-8B-Instruct-2410`), the scheduler will treat the model as a full-attention model, i.e., kv-cache of all tokens will not be dropped. This is to make sure prefix caching works with these models. Sliding window only appears as a parameter to the attention kernel computation.
+
+To support a model with interleaving sliding windows, we need to take care of the following details:
+
+- Make sure [this line](https://github.com/vllm-project/vllm/blob/996357e4808ca5eab97d4c97c7d25b3073f46aab/vllm/config.py#L308) evaluates `has_interleaved_attention` to `True` for this model, and set `self.hf_text_config.interleaved_sliding_window` to the format of interleaving sliding windows the model can understand. Then, `self.hf_text_config.sliding_window` will be deleted, and the model will be treated as a full-attention model.
+- In the modeling code, parse the correct sliding window value for every layer, and pass it to the attention layer's `per_layer_sliding_window` argument. For reference, check [this line](https://github.com/vllm-project/vllm/blob/996357e4808ca5eab97d4c97c7d25b3073f46aab/vllm/model_executor/models/llama.py#L171).
+
+With these two steps, interleave sliding windows should work with the model.

From 32c9eff2fff8ee91a60c9410c69042dc4c1cc5c8 Mon Sep 17 00:00:00 2001
From: Jee Jee Li <pandaleefree@gmail.com>
Date: Mon, 6 Jan 2025 23:22:25 +0800
Subject: [PATCH 093/462] [Bugfix][V1] Fix molmo text-only inputs (#11676)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
---
 .../vision_language/test_models.py            | 10 ++
 .../vision_language/vlm_utils/model_utils.py  | 99 ++++++++++++++++++-
 vllm/model_executor/models/molmo.py           | 56 ++++-------
 3 files changed, 123 insertions(+), 42 deletions(-)

diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py
index dc0b683c1f1cb..146685738a1d0 100644
--- a/tests/models/decoder_only/vision_language/test_models.py
+++ b/tests/models/decoder_only/vision_language/test_models.py
@@ -341,6 +341,16 @@
         ),
         hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
     ),
+    "molmo": VLMTestInfo(
+        models=["allenai/Molmo-7B-D-0924"],
+        test_type=(VLMTestType.IMAGE),
+        prompt_formatter=lambda img_prompt:"User: " + img_prompt + " Assistant:", # noqa: E501
+        max_model_len=4096,
+        max_num_seqs=2,
+        image_size_factors=[(),(1.0, 1.0, 1.0)],
+        patch_hf_runner=model_utils.mlomo_patch_hf_runner,
+        postprocess_inputs=model_utils.molmo_post_processor,
+    ),
     # Tests for phi3v currently live in another file because of a bug in
     # transformers. Once this issue is fixed, we can enable them here instead.
     # https://github.com/huggingface/transformers/issues/34307
diff --git a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
index 3eca8fb9dcb1a..6c7a753af787e 100644
--- a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
@@ -5,17 +5,20 @@
 import re
 import types
 from pathlib import PosixPath
-from typing import Callable, List, Optional, Tuple, Union
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
 import torch
 from PIL.Image import Image
-from transformers import AutoConfig, AutoTokenizer, BatchEncoding
+from transformers import (AutoConfig, AutoTokenizer, BatchEncoding,
+                          GenerationConfig)
 
 from vllm.sequence import SampleLogprobs
 from vllm.transformers_utils.tokenizer import patch_padding_side
 from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
 
-from .....conftest import HfRunner, ImageAsset, _ImageAssets
+from .....conftest import (HfRunner, ImageAsset, PromptAudioInput,
+                           PromptImageInput, PromptVideoInput, _ImageAssets)
+from ....utils import TokensTextLogprobs
 from .types import RunnerOutput
 
 
@@ -222,6 +225,11 @@ def wrap_inputs_post_processor(hf_inputs: BatchEncoding, dtype: str):
     return {"model_inputs": hf_inputs}
 
 
+def molmo_post_processor(hf_inputs: BatchEncoding, dtype: str):
+    hf_inputs = cast_dtype_post_processor("images")(hf_inputs, dtype)
+    return {k: v.unsqueeze(0) for k, v in hf_inputs.items()}
+
+
 ####### Prompt path encoders for models that need models on disk
 def qwen_prompt_path_encoder(
         tmp_path: PosixPath, prompt: str, assets: Union[List[ImageAsset],
@@ -451,3 +459,88 @@ def _generate(self, *args, **kwargs):
     hf_model.model.generate = types.MethodType(_generate, hf_model.model)
 
     return hf_model
+
+
+def _generate_greedy_logprobs_limit(
+    self,
+    prompts: List[str],
+    max_tokens: int,
+    num_logprobs: int,
+    images: Optional[PromptImageInput] = None,
+    audios: Optional[PromptAudioInput] = None,
+    videos: Optional[PromptVideoInput] = None,
+    **kwargs: Any,
+) -> List[TokensTextLogprobs]:
+    all_inputs = self.get_inputs(prompts,
+                                 images=images,
+                                 videos=videos,
+                                 audios=audios)
+
+    # Process in batches for inference.
+    if len(all_inputs):
+        input_ids_lst = []
+        images_lst = []
+        images_input_idx_lst = []
+        imges_masks_lst = []
+        for inputs in all_inputs:
+            input_ids_lst.append(inputs["input_ids"])
+            images_lst.append(inputs["images"])
+            images_input_idx_lst.append(inputs["image_input_idx"])
+            imges_masks_lst.append(inputs["image_masks"])
+        batch_inputs = {}
+        batch_inputs['input_ids'] = torch.cat(input_ids_lst, dim=0)
+        batch_inputs['images'] = torch.cat(images_lst, dim=0)
+        batch_inputs['image_input_idx'] = torch.cat(images_input_idx_lst,
+                                                    dim=0)
+        batch_inputs['image_masks'] = torch.cat(imges_masks_lst, dim=0)
+
+        outputs = self.model.generate_from_batch(
+            batch=self.wrap_device(batch_inputs,
+                                   device=self.model.device.type),
+            generation_config=GenerationConfig(
+                max_new_tokens=max_tokens,
+                stop_strings="<|endoftext|>",
+                do_sample=False,
+            ),
+            tokenizer=self.tokenizer,
+            output_hidden_states=True,
+            return_dict_in_generate=True,
+        )
+
+    all_logprobs: List[List[Dict[int, float]]] = []
+    all_output_ids: List[List[int]] = []
+    all_output_strs: List[str] = []
+
+    for index in range(len(all_inputs)):
+        (
+            seq_logprobs_lst,
+            output_len,
+        ) = self._hidden_states_to_logprobs(outputs.hidden_states,
+                                            num_logprobs)
+        all_logprobs.append(seq_logprobs_lst)
+        seq_ids = outputs.sequences[index]
+        output_ids = seq_ids[-output_len:]
+        all_output_ids.append(output_ids.tolist())
+        all_output_strs.append(self.tokenizer.decode(output_ids))
+    outputs = zip(all_output_ids, all_output_strs, all_logprobs)
+    return [(output_ids, output_str, output_logprobs)
+            for output_ids, output_str, output_logprobs in outputs]
+
+
+####### Molmo-specific HuggingFace runner patchers
+def mlomo_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
+    """Patches and returns an instance of the HfRunner to use for Molmo."""
+    hf_processor = hf_model.processor
+
+    def _processor(*args, **kwargs):
+        return hf_processor.process(*args, **kwargs)
+
+    hf_model.processor = _processor
+
+    setattr(  # noqa: B010
+        hf_model,
+        "generate_greedy_logprobs_limit",
+        types.MethodType(_generate_greedy_logprobs_limit, hf_model),
+    )
+
+    return hf_model
diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py
index cc25be9f5b6a9..0e8287bb56b6b 100644
--- a/vllm/model_executor/models/molmo.py
+++ b/vllm/model_executor/models/molmo.py
@@ -1081,45 +1081,25 @@ def input_processor_for_molmo(ctx: InputContext, inputs: DecoderOnlyInputs):
     else:
         out = processor.process(None, image, tokens=inputs["prompt_token_ids"])
 
-    image_processor = processor.image_processor
-    max_total_crops = 1 + image_processor.max_crops
-    if image is not None:
-        images, image_input_idx, image_masks = pad_images(
-            max_total_crops,
-            out["images"],
-            out["image_input_idx"],
-            out.get("image_masks"),
-        )
-    else:
-        base_image_input_size = image_processor.base_image_input_size
-        image_patch_size = image_processor.image_patch_size
-        image_num_patch = (
-            base_image_input_size[0] // image_patch_size,
-            base_image_input_size[1] // image_patch_size,
-        )
-        n_pixels = image_patch_size * image_patch_size * 3
-        n_patches = image_num_patch[0] * image_num_patch[1]
-
-        image_length_w = image_processor.image_token_length_w
-        image_length_h = image_processor.image_token_length_h
-        tokens_per_image = image_length_w * image_length_h
-        images = torch.full(
-            (max_total_crops, n_patches, n_pixels),
-            -1,
-            dtype=torch.float32,
-        )
-        image_input_idx = torch.full(
-            (max_total_crops, tokens_per_image),
-            -1,
-            dtype=torch.int32,
+    # If there is no image, return directly.
+    if image is None:
+        new_prompt_token_ids = out["input_ids"].tolist()
+        prompt = inputs.get("prompt")
+        if prompt is None:
+            prompt = tokenizer.decode(new_prompt_token_ids)
+        return token_inputs(
+            prompt_token_ids=new_prompt_token_ids,
+            prompt=prompt,
         )
-        if image_processor.image_padding_mask:
-            image_masks = torch.full(
-                (max_total_crops, n_patches),
-                -1,
-                dtype=torch.float32,
-            )
 
+    image_processor = processor.image_processor
+    max_total_crops = 1 + image_processor.max_crops
+    images, image_input_idx, image_masks = pad_images(
+        max_total_crops,
+        out["images"],
+        out["image_input_idx"],
+        out.get("image_masks"),
+    )
     image_data = dict(
         images=images,
         image_input_idx=image_input_idx,
@@ -1143,11 +1123,9 @@ def input_processor_for_molmo(ctx: InputContext, inputs: DecoderOnlyInputs):
                 offset = i
             size += 1
     image_data["image_start_end"] = (offset, offset + size)
-
     prompt = inputs.get("prompt")
     if prompt is None:
         prompt = tokenizer.decode(new_prompt_token_ids)
-
     return token_inputs(
         prompt_token_ids=new_prompt_token_ids,
         prompt=prompt,

From e20c92bb618384ce8d0013e0c9ad273d0c23d65b Mon Sep 17 00:00:00 2001
From: Chen Zhang <zhangch99@outlook.com>
Date: Tue, 7 Jan 2025 00:11:28 +0800
Subject: [PATCH 094/462] [Kernel] Move attn_type to Attention.__init__()
 (#11690)

Signed-off-by: Chen Zhang <zhangch99@outlook.com>
---
 tests/kernels/test_encoder_decoder_attn.py  | 100 ++++++++++----------
 tests/kernels/utils.py                      |  12 ++-
 vllm/attention/backends/abstract.py         |   2 +-
 vllm/attention/backends/blocksparse_attn.py |  14 +--
 vllm/attention/backends/flash_attn.py       |   4 +-
 vllm/attention/backends/flashinfer.py       |  15 ++-
 vllm/attention/backends/hpu_attn.py         |  13 +--
 vllm/attention/backends/ipex_attn.py        |  12 +--
 vllm/attention/backends/pallas.py           |  13 +--
 vllm/attention/backends/rocm_flash_attn.py  |  14 +--
 vllm/attention/backends/torch_sdpa.py       |   4 +-
 vllm/attention/backends/xformers.py         |   6 +-
 vllm/attention/layer.py                     |  37 ++------
 vllm/model_executor/models/bart.py          |  44 +++------
 vllm/model_executor/models/bert.py          |  10 +-
 vllm/model_executor/models/mllama.py        |  11 +--
 vllm/model_executor/models/qwen2.py         |  35 ++++---
 vllm/v1/attention/backends/flash_attn.py    |  14 +--
 18 files changed, 159 insertions(+), 201 deletions(-)

diff --git a/tests/kernels/test_encoder_decoder_attn.py b/tests/kernels/test_encoder_decoder_attn.py
index d943b048b7934..614674375786e 100644
--- a/tests/kernels/test_encoder_decoder_attn.py
+++ b/tests/kernels/test_encoder_decoder_attn.py
@@ -13,8 +13,7 @@
 import torch
 
 from tests.kernels.utils import *
-from vllm.attention import (Attention, AttentionBackend, AttentionMetadata,
-                            AttentionType)
+from vllm.attention import Attention, AttentionMetadata, AttentionType
 from vllm.attention.backends.utils import STR_NOT_IMPL_ENC_DEC_ROCM_HIP
 from vllm.attention.selector import (_Backend, _cached_get_attn_backend,
                                      global_force_attn_backend_context_manager)
@@ -64,6 +63,7 @@ class TestPoint(NamedTuple):
     max_dec_seq_len: int
     max_enc_seq_len: int
     num_blocks: int
+    attn_type: AttentionType
 
 
 class TestResources(NamedTuple):
@@ -96,7 +96,6 @@ class TestResources(NamedTuple):
     '''
 
     scale: float
-    attn_backend: AttentionBackend
     attn: Attention
     kv_cache: torch.Tensor
 
@@ -129,16 +128,17 @@ class that Attention will automatically select when it is constructed.
     '''
 
     scale = float(1.0 / (test_pt.head_size**0.5))
-    attn_backend = make_backend(test_pt.backend_name)
     attn = Attention(
         test_pt.num_heads,
         test_pt.head_size,
         scale=scale,
+        prefix=f"{test_pt.attn_type}",
+        attn_type=test_pt.attn_type,
     )
     if test_pt.num_blocks is None or test_pt.num_heads is None:
         # Caller does not require a KV cache
         return TestResources(
-            scale, attn_backend, attn,
+            scale, attn,
             torch.tensor([], dtype=torch.float32, device=CUDA_DEVICE))
 
     # Construct KV cache
@@ -148,7 +148,7 @@ class that Attention will automatically select when it is constructed.
                              test_pt.block_size,
                              device=CUDA_DEVICE,
                              backend=test_pt.backend_name)
-    return TestResources(scale, attn_backend, attn, kv_cache)
+    return TestResources(scale, attn, kv_cache)
 
 
 def _encoder_attn_setup(
@@ -193,6 +193,7 @@ def _encoder_attn_setup(
         _,
         max_q_seq_len,
         _,
+        _,
     ) = test_pt
 
     scale = test_rsrcs.scale
@@ -301,6 +302,7 @@ def _decoder_attn_setup(
         max_q_seq_len,
         _,
         _,
+        _,
     ) = test_pt
 
     scale = test_rsrcs.scale
@@ -488,6 +490,7 @@ def _enc_dec_cross_attn_setup_reuses_query(
         max_decoder_seq_len,
         max_encoder_seq_len,
         _,
+        _,
     ) = test_pt
 
     scale = test_rsrcs.scale
@@ -622,7 +625,6 @@ def _run_encoder_attention_test(
       & attn_metadata
     '''
     assert attn_metadata.num_decode_tokens == 0
-    attn_type = AttentionType.ENCODER
     packed_qkv = encoder_test_params.packed_qkvo.packed_qkv
     assert packed_qkv is not None
     with set_forward_context(attn_metadata, vllm_config):
@@ -635,14 +637,11 @@ def _run_encoder_attention_test(
         # is shaped as [num_tokens, hidden_size] and we can skip the reshape.
         reshaped_query = packed_qkv.query.view(
             -1, test_pt.num_heads * test_pt.head_size)
-        return attn.forward(reshaped_query,
-                            packed_qkv.key,
-                            packed_qkv.value,
-                            torch.tensor([],
-                                         dtype=torch.float32,
-                                         device=packed_qkv.query.device),
-                            attn_metadata,
-                            attn_type=attn_type)
+        return attn.forward(
+            reshaped_query, packed_qkv.key, packed_qkv.value,
+            torch.tensor([],
+                         dtype=torch.float32,
+                         device=packed_qkv.query.device), attn_metadata)
 
 
 def _run_decoder_self_attention_test(
@@ -675,7 +674,6 @@ def _run_decoder_self_attention_test(
     * Attention.forward() applied to packed_{query,key,value}, kv_cache
       & attn_metadata
     '''
-    attn_type = AttentionType.DECODER
     attn = test_rsrcs.attn
     kv_cache = test_rsrcs.kv_cache
     packed_qkv = decoder_test_params.packed_qkvo.packed_qkv
@@ -690,12 +688,8 @@ def _run_decoder_self_attention_test(
         # is shaped as [num_tokens, hidden_size] and we can skip the reshape.
         reshaped_query = packed_qkv.query.view(
             -1, test_pt.num_heads * test_pt.head_size)
-        return attn.forward(reshaped_query,
-                            packed_qkv.key,
-                            packed_qkv.value,
-                            kv_cache,
-                            attn_metadata,
-                            attn_type=attn_type)
+        return attn.forward(reshaped_query, packed_qkv.key, packed_qkv.value,
+                            kv_cache, attn_metadata)
 
 
 def _run_encoder_decoder_cross_attention_test(
@@ -742,7 +736,6 @@ def _run_encoder_decoder_cross_attention_test(
     '''
     assert decoder_test_params.packed_qkvo.packed_qkv is not None
 
-    attn_type = AttentionType.ENCODER_DECODER
     attn = test_rsrcs.attn
     kv_cache = test_rsrcs.kv_cache
     if cross_test_params is None:
@@ -762,12 +755,8 @@ def _run_encoder_decoder_cross_attention_test(
         # is shaped as [num_tokens, hidden_size] and we can skip the reshape.
         reshaped_query = decoder_test_params.packed_qkvo.packed_qkv.query.view(
             -1, test_pt.num_heads * test_pt.head_size)
-        return attn.forward(reshaped_query,
-                            key,
-                            value,
-                            kv_cache,
-                            attn_metadata,
-                            attn_type=attn_type)
+        return attn.forward(reshaped_query, key, value, kv_cache,
+                            attn_metadata)
 
 
 @pytest.fixture(autouse=True)
@@ -839,7 +828,7 @@ def test_encoder_only(
         # is not part of this test
         test_pt = TestPoint(num_heads, head_size, attn_backend.name,
                             batch_size, block_size, max_dec_seq_len,
-                            max_enc_seq_len, 4096)
+                            max_enc_seq_len, 4096, AttentionType.ENCODER)
 
         # Attention scale factor, attention backend instance, attention wrapper
         # instance, KV cache init
@@ -855,7 +844,7 @@ def test_encoder_only(
         # Shared prefill metadata structure
 
         prephase_attn_metadata: AttentionMetadata = make_test_metadata(
-            test_rsrcs.attn_backend,
+            attn_backend,
             True,
             None,
             decoder_test_params=None,
@@ -961,20 +950,29 @@ def test_e2e_enc_dec_attn(
         # Note: KV cache size of 4096 is arbitrary & chosen intentionally
         # to be more than necessary, since exceeding the kv cache size
         # is not part of this test
-        test_pt = TestPoint(num_heads, head_size, attn_backend.name,
-                            batch_size, block_size, max_dec_seq_len,
-                            max_enc_seq_len, 4096)
+        enc_test_pt = TestPoint(num_heads, head_size, attn_backend.name,
+                                batch_size, block_size, max_dec_seq_len,
+                                max_enc_seq_len, 4096, AttentionType.ENCODER)
+        enc_dec_test_pt = TestPoint(num_heads, head_size, attn_backend.name,
+                                    batch_size, block_size, max_dec_seq_len,
+                                    max_enc_seq_len, 4096,
+                                    AttentionType.ENCODER_DECODER)
+        dec_test_pt = TestPoint(num_heads, head_size, attn_backend.name,
+                                batch_size, block_size, max_dec_seq_len,
+                                max_enc_seq_len, 4096, AttentionType.DECODER)
 
         # Attention scale factor, attention backend instance, attention wrapper
         # instance, KV cache init
         vllm_config = VllmConfig()
         with set_current_vllm_config(vllm_config):
-            test_rsrcs = _make_test_resources(test_pt)
+            enc_test_rsrcs = _make_test_resources(enc_test_pt)
+            enc_dec_test_rsrcs = _make_test_resources(enc_dec_test_pt)
+            dec_test_rsrcs = _make_test_resources(dec_test_pt)
 
         # Construct encoder attention test params (only used
         # during prefill)
 
-        enc_test_params = _encoder_attn_setup(test_pt, test_rsrcs)
+        enc_test_params = _encoder_attn_setup(enc_test_pt, enc_test_rsrcs)
 
         # Construct Decoder self-attention prefill-phase & decode-phase
         # test params, including query/key/value tensors, decoder self-attention
@@ -987,7 +985,7 @@ def test_e2e_enc_dec_attn(
             prephase_dec_test_params,
             decphase_dec_test_params,
             cross_block_base_addr,
-        ) = _decoder_attn_setup(test_pt, test_rsrcs)
+        ) = _decoder_attn_setup(dec_test_pt, dec_test_rsrcs)
 
         # Construct encoder/decoder cross-attention prefill-phase
         # & decode-phase test params, including key/value tensors,
@@ -1000,14 +998,14 @@ def test_e2e_enc_dec_attn(
             dec_qkv,
             enc_test_params,
             prephase_dec_test_params,
-            test_pt,
-            test_rsrcs,
+            enc_dec_test_pt,
+            enc_dec_test_rsrcs,
             block_base_addr=cross_block_base_addr)
 
         # Shared prefill metadata structure
         assert prephase_dec_test_params.packed_qkvo.packed_qkv is not None
         prephase_attn_metadata: AttentionMetadata = make_test_metadata(
-            test_rsrcs.attn_backend,
+            attn_backend,
             True,
             prephase_dec_test_params.packed_qkvo.packed_qkv.q_seq_lens,
             decoder_test_params=prephase_dec_test_params,
@@ -1017,10 +1015,10 @@ def test_e2e_enc_dec_attn(
 
         # PREFILL: encoder attention
 
-        enc_pckd_act_out = _run_encoder_attention_test(test_rsrcs.attn,
+        enc_pckd_act_out = _run_encoder_attention_test(enc_test_rsrcs.attn,
                                                        enc_test_params,
                                                        prephase_attn_metadata,
-                                                       test_pt=test_pt,
+                                                       test_pt=enc_test_pt,
                                                        vllm_config=vllm_config)
 
         # - Is encoder attention result correct?
@@ -1030,10 +1028,10 @@ def test_e2e_enc_dec_attn(
         # PREFILL: decoder self-attention test
 
         prephase_dec_pckd_act_out = _run_decoder_self_attention_test(
-            test_rsrcs,
+            dec_test_rsrcs,
             prephase_dec_test_params,
             prephase_attn_metadata,
-            test_pt=test_pt,
+            test_pt=dec_test_pt,
             vllm_config=vllm_config)
 
         # - Is prefill decoder self-attention correct?
@@ -1044,11 +1042,11 @@ def test_e2e_enc_dec_attn(
         # PREFILL: encoder/decoder cross-attention test
 
         prephase_cross_pckd_act_out = _run_encoder_decoder_cross_attention_test(
-            test_rsrcs,
+            enc_dec_test_rsrcs,
             prephase_dec_test_params,
             prephase_cross_test_params,
             prephase_attn_metadata,
-            test_pt=test_pt,
+            test_pt=enc_dec_test_pt,
             vllm_config=vllm_config)
 
         # - Is prefill encoder/decoder cross-attention correct?
@@ -1059,7 +1057,7 @@ def test_e2e_enc_dec_attn(
         # DECODE: build decode-phase attention metadata
 
         decphase_attn_metadata: AttentionMetadata = make_test_metadata(
-            test_rsrcs.attn_backend,
+            attn_backend,
             False,
             dec_qkv.q_seq_lens,
             decoder_test_params=decphase_dec_test_params,
@@ -1070,10 +1068,10 @@ def test_e2e_enc_dec_attn(
         # DECODE: decoder self-attention test
 
         decphase_dec_pckd_act_out = _run_decoder_self_attention_test(
-            test_rsrcs,
+            dec_test_rsrcs,
             decphase_dec_test_params,
             decphase_attn_metadata,
-            test_pt=test_pt,
+            test_pt=dec_test_pt,
             vllm_config=vllm_config)
 
         # - Is decode-phase decoder self-attention correct?
@@ -1084,11 +1082,11 @@ def test_e2e_enc_dec_attn(
         # DECODE: encoder/decoder cross-attention test
 
         decphase_cross_pckd_act_out = _run_encoder_decoder_cross_attention_test(
-            test_rsrcs,
+            enc_dec_test_rsrcs,
             decphase_dec_test_params,
             None,
             decphase_attn_metadata,
-            test_pt=test_pt,
+            test_pt=enc_dec_test_pt,
             vllm_config=vllm_config)
 
         # - Is decode-phase encoder/decoder cross-attention correct?
diff --git a/tests/kernels/utils.py b/tests/kernels/utils.py
index e7865fb2500ef..848eea7f54cab 100644
--- a/tests/kernels/utils.py
+++ b/tests/kernels/utils.py
@@ -13,6 +13,7 @@
 
 from vllm.attention import AttentionBackend, AttentionMetadata, AttentionType
 from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.platforms.interface import _Backend
 from vllm.utils import (STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL,
                         STR_XFORMERS_ATTN_VAL, make_tensor_with_pad)
 
@@ -790,7 +791,7 @@ def make_block_tables_slot_mapping(
 
 
 def make_test_metadata(
-    attn_backend: AttentionBackend,
+    attn_backend: _Backend,
     is_prompt: bool,
     seq_lens: Optional[List[int]],
     decoder_test_params: Optional[PhaseTestParameters],
@@ -815,7 +816,7 @@ def make_test_metadata(
 
     Arguments:
 
-    * attn_backend: Backend for sourcing attention kernels
+    * attn_backend_name: Backend for sourcing attention kernels
     * is_prompt: prefill if True, o/w decode
     * seq_lens: list of token counts for each sequence
     * decoder_test_params: decoder self-attention test params; 
@@ -882,6 +883,8 @@ def make_test_metadata(
         #   (kv_mmap)
         cross_kv_mmap = cross_test_params.kv_mmap
 
+    attn_backend_obj = make_backend(attn_backend.name)
+
     if is_prompt:
         # Prefill-phase scenario
 
@@ -902,8 +905,7 @@ def make_test_metadata(
                                    context_lens,
                                    encoder_seq_lens,
                                    device=device)
-
-        return attn_backend.make_metadata(
+        return attn_backend_obj.make_metadata(
             num_prefills=num_prefills,
             slot_mapping=(None if kv_mmap is None else kv_mmap.slot_mapping),
             multi_modal_placeholder_index_maps=None,
@@ -952,7 +954,7 @@ def make_test_metadata(
                                    encoder_seq_lens,
                                    device=device)
 
-        return attn_backend.make_metadata(
+        return attn_backend_obj.make_metadata(
             num_prefills=num_prefills,
             slot_mapping=kv_mmap.slot_mapping,
             multi_modal_placeholder_index_maps=None,
diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py
index aed04361e5fb4..f5dcaea79af93 100644
--- a/vllm/attention/backends/abstract.py
+++ b/vllm/attention/backends/abstract.py
@@ -233,6 +233,7 @@ def __init__(
         kv_cache_dtype: str = "auto",
         blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
+        attn_type: str = AttentionType.DECODER,
     ) -> None:
         raise NotImplementedError
 
@@ -246,7 +247,6 @@ def forward(
         attn_metadata: T,
         k_scale: float = 1.0,
         v_scale: float = 1.0,
-        attn_type: str = AttentionType.DECODER,
         output: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         raise NotImplementedError
diff --git a/vllm/attention/backends/blocksparse_attn.py b/vllm/attention/backends/blocksparse_attn.py
index 99cb84346d84e..7089d59392c36 100644
--- a/vllm/attention/backends/blocksparse_attn.py
+++ b/vllm/attention/backends/blocksparse_attn.py
@@ -300,6 +300,7 @@ def __init__(
         kv_cache_dtype: str,
         blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
+        attn_type: str = AttentionType.DECODER,
     ) -> None:
         assert blocksparse_params is not None
         assert alibi_slopes is None, ValueError(
@@ -350,6 +351,12 @@ def __init__(
             active_head_range=self.blocksparse_params.active_head_range,
         )
 
+        if attn_type != AttentionType.DECODER:
+            raise NotImplementedError("Encoder self-attention and "
+                                      "encoder/decoder cross-attention "
+                                      "are not implemented for "
+                                      "BlocksparseFlashAttentionImpl")
+
     def forward(
         self,
         query: torch.Tensor,
@@ -359,7 +366,6 @@ def forward(
         attn_metadata: BlocksparseFlashAttentionMetadata,
         k_scale: float = 1.0,
         v_scale: float = 1.0,
-        attn_type: str = AttentionType.DECODER,
         output: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         """Forward pass with FlashAttention and PagedAttention.
@@ -375,12 +381,6 @@ def forward(
         Returns:
             shape = [num_tokens, num_heads * head_size]
         """
-        if attn_type != AttentionType.DECODER:
-            raise NotImplementedError("Encoder self-attention and "
-                                      "encoder/decoder cross-attention "
-                                      "are not implemented for "
-                                      "BlocksparseFlashAttentionImpl")
-
         num_tokens, hidden_size = query.shape
         # Reshape the query, key, and value tensors.
         query = query.view(-1, self.num_heads, self.head_size)
diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py
index c69e12ad78c44..23ea244f07dfe 100644
--- a/vllm/attention/backends/flash_attn.py
+++ b/vllm/attention/backends/flash_attn.py
@@ -600,6 +600,7 @@ def __init__(
         kv_cache_dtype: str,
         blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
+        attn_type: str = AttentionType.DECODER,
     ) -> None:
         if blocksparse_params is not None:
             raise ValueError(
@@ -627,6 +628,7 @@ def __init__(
             raise ValueError(
                 f"Head size {head_size} is not supported by FlashAttention. "
                 f"Supported head sizes are: {support_head_sizes}.")
+        self.attn_type = attn_type
 
     def forward(
         self,
@@ -637,7 +639,6 @@ def forward(
         attn_metadata: FlashAttentionMetadata,
         k_scale: float = 1.0,
         v_scale: float = 1.0,
-        attn_type: str = AttentionType.DECODER,
         output: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         """Forward pass with FlashAttention.
@@ -659,6 +660,7 @@ def forward(
 
         assert output is not None, "Output tensor must be provided."
 
+        attn_type = self.attn_type
         if (attn_type == AttentionType.ENCODER
                 and (not attn_metadata.is_all_encoder_attn_metadata_set)):
             raise AttributeError("Encoder attention requires setting "
diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py
index e367468d05d26..a11462b2068a5 100644
--- a/vllm/attention/backends/flashinfer.py
+++ b/vllm/attention/backends/flashinfer.py
@@ -748,6 +748,7 @@ def __init__(
         kv_cache_dtype: str,
         blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
+        attn_type: str = AttentionType.DECODER,
     ) -> None:
         self.num_heads = num_heads
         self.head_size = head_size
@@ -764,6 +765,12 @@ def __init__(
         assert self.num_heads % self.num_kv_heads == 0
         self.num_queries_per_kv = self.num_heads // self.num_kv_heads
 
+        if attn_type != AttentionType.DECODER:
+            raise NotImplementedError("Encoder self-attention and "
+                                      "encoder/decoder cross-attention "
+                                      "are not implemented for "
+                                      "FlashInferImpl")
+
     def forward(
         self,
         query: torch.Tensor,
@@ -773,18 +780,10 @@ def forward(
         attn_metadata: FlashInferMetadata,
         k_scale: float = 1.0,
         v_scale: float = 1.0,
-        attn_type: str = AttentionType.DECODER,
         output: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
 
         # TODO: directly write to output tensor
-
-        if attn_type != AttentionType.DECODER:
-            raise NotImplementedError("Encoder self-attention and "
-                                      "encoder/decoder cross-attention "
-                                      "are not implemented for "
-                                      "FlashInferImpl")
-
         num_heads: int = self.num_heads
         head_size: int = self.head_size
         num_kv_heads: int = self.num_kv_heads
diff --git a/vllm/attention/backends/hpu_attn.py b/vllm/attention/backends/hpu_attn.py
index f90d15d4207e7..94a461e0c8c29 100644
--- a/vllm/attention/backends/hpu_attn.py
+++ b/vllm/attention/backends/hpu_attn.py
@@ -102,6 +102,7 @@ def __init__(
         kv_cache_dtype: str,
         blocksparse_params: Optional[Dict[str, Any]] = None,
         max_seq_len: int = 4096,
+        attn_type: str = AttentionType.DECODER,
     ) -> None:
         super(AttentionImpl, self).__init__()
         self.kv_cache_dtype = kv_cache_dtype
@@ -143,6 +144,12 @@ def __init__(
                 f"Head size {head_size} is not supported by PagedAttention. "
                 f"Supported head sizes are: {suppored_head_sizes}.")
 
+        if attn_type != AttentionType.DECODER:
+            raise NotImplementedError("Encoder self-attention and "
+                                      "encoder/decoder cross-attention "
+                                      "are not implemented for "
+                                      "HPUAttentionImpl")
+
     def forward(
         self,
         query: torch.Tensor,
@@ -152,7 +159,6 @@ def forward(
         attn_metadata: HPUAttentionMetadata,
         k_scale: float = 1.0,
         v_scale: float = 1.0,
-        attn_type: str = AttentionType.DECODER,
         output: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         """Forward pass with xFormers and PagedAttention.
@@ -166,11 +172,6 @@ def forward(
         Returns:
             shape = [num_tokens, num_heads * head_size]
         """
-        if attn_type != AttentionType.DECODER:
-            raise NotImplementedError("Encoder self-attention and "
-                                      "encoder/decoder cross-attention "
-                                      "are not implemented for "
-                                      "HPUAttentionImpl")
         batch_size, seq_len, hidden_size = query.shape
         _, seq_len_kv, _ = key.shape
 
diff --git a/vllm/attention/backends/ipex_attn.py b/vllm/attention/backends/ipex_attn.py
index 21949874bea47..da1d307daa517 100644
--- a/vllm/attention/backends/ipex_attn.py
+++ b/vllm/attention/backends/ipex_attn.py
@@ -115,6 +115,7 @@ def __init__(
         kv_cache_dtype: str,
         blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
+        attn_type: str = AttentionType.DECODER,
     ) -> None:
         if blocksparse_params is not None:
             raise ValueError(
@@ -146,6 +147,11 @@ def __init__(
             raise NotImplementedError(
                 "IPEX backend does not support FP8 KV cache. "
                 "Please use xFormers backend instead.")
+        if attn_type != AttentionType.DECODER:
+            raise NotImplementedError("Encoder self-attention and "
+                                      "encoder/decoder cross-attention "
+                                      "are not implemented for "
+                                      "IpexAttnBackendImpl")
 
     def split_kv_cache(
         self,
@@ -172,7 +178,6 @@ def forward(
         attn_metadata: IpexAttnMetadata,  # type: ignore
         k_scale: float = 1.0,
         v_scale: float = 1.0,
-        attn_type: str = AttentionType.DECODER,
         output: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         """Forward pass with IPEX varlen_attention and PagedAttention.
@@ -189,11 +194,6 @@ def forward(
             shape = [num_tokens, num_heads * head_size]
         """
         assert k_scale == 1.0 and v_scale == 1.0
-        if attn_type != AttentionType.DECODER:
-            raise NotImplementedError("Encoder self-attention and "
-                                      "encoder/decoder cross-attention "
-                                      "are not implemented for "
-                                      "IpexAttnBackendImpl")
         num_tokens, hidden_size = query.shape
         # Reshape the query, key, and value tensors.
         query = query.view(-1, self.num_heads, self.head_size)
diff --git a/vllm/attention/backends/pallas.py b/vllm/attention/backends/pallas.py
index 9809aed0e66f9..2ac492dd8ae54 100644
--- a/vllm/attention/backends/pallas.py
+++ b/vllm/attention/backends/pallas.py
@@ -100,6 +100,7 @@ def __init__(
         kv_cache_dtype: str,
         blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
+        attn_type: str = AttentionType.DECODER,
     ) -> None:
         self.num_heads = num_heads
         self.head_size = head_size
@@ -141,6 +142,12 @@ def __init__(
                 # megacore mode will be None.
                 self.megacore_mode = "batch"
 
+        if attn_type != AttentionType.DECODER:
+            raise NotImplementedError("Encoder self-attention and "
+                                      "encoder/decoder cross-attention "
+                                      "are not implemented for "
+                                      "PallasAttentionBackendImpl")
+
     def forward(
         self,
         query: torch.Tensor,
@@ -150,7 +157,6 @@ def forward(
         attn_metadata: PallasMetadata,
         k_scale: float = 1.0,
         v_scale: float = 1.0,
-        attn_type: str = AttentionType.DECODER,
         output: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         """Forward pass with Pallas attention.
@@ -168,11 +174,6 @@ def forward(
             shape = [batch_size, seq_len, num_heads * head_size]
         """
         assert k_scale == 1.0 and v_scale == 1.0
-        if attn_type != AttentionType.DECODER:
-            raise NotImplementedError("Encoder self-attention and "
-                                      "encoder/decoder cross-attention "
-                                      "are not implemented for "
-                                      "PallasAttentionBackendImpl")
         batch_size, seq_len, hidden_size = query.shape
         query = query.view(batch_size, seq_len, self.num_heads, self.head_size)
         key = key.view(batch_size, seq_len, self.num_kv_heads, self.head_size)
diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py
index d43c15b661ef7..a91a5af5c3d58 100644
--- a/vllm/attention/backends/rocm_flash_attn.py
+++ b/vllm/attention/backends/rocm_flash_attn.py
@@ -338,6 +338,7 @@ def __init__(
         kv_cache_dtype: str,
         blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
+        attn_type: str = AttentionType.DECODER,
     ) -> None:
         if blocksparse_params is not None:
             raise ValueError(
@@ -397,6 +398,12 @@ def __init__(
                 self.attn_func = _sdpa_attention
                 logger.debug("Using naive attention in ROCmBackend")
 
+        if attn_type != AttentionType.DECODER:
+            raise NotImplementedError("Encoder self-attention and "
+                                      "encoder/decoder cross-attention "
+                                      "are not implemented for "
+                                      "ROCmFlashAttentionImpl")
+
     def repeat_kv(self, x: torch.Tensor, n_rep: int) -> torch.Tensor:
         """torch.repeat_interleave(x, dim=1, repeats=n_rep)"""
         tokens, n_kv_heads, head_dim = x.shape
@@ -414,7 +421,6 @@ def forward(
         attn_metadata: ROCmFlashAttentionMetadata,
         k_scale: float = 1.0,
         v_scale: float = 1.0,
-        attn_type: str = AttentionType.DECODER,
         output: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         """Forward pass with FlashAttention and PagedAttention.
@@ -432,12 +438,6 @@ def forward(
         """
         # Reminder: Please update docs/source/features/compatibility_matrix.md
         # If the feature combo become valid
-        if attn_type != AttentionType.DECODER:
-            raise NotImplementedError("Encoder self-attention and "
-                                      "encoder/decoder cross-attention "
-                                      "are not implemented for "
-                                      "ROCmFlashAttentionImpl")
-
         num_tokens, hidden_size = query.shape
         # Reshape the query, key, and value tensors.
         query = query.view(-1, self.num_heads, self.head_size)
diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py
index 0cff6f5952aba..c14f7754596dd 100644
--- a/vllm/attention/backends/torch_sdpa.py
+++ b/vllm/attention/backends/torch_sdpa.py
@@ -390,6 +390,7 @@ def __init__(
         kv_cache_dtype: str,
         blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
+        attn_type: str = AttentionType.DECODER,
     ) -> None:
         if blocksparse_params is not None:
             raise ValueError(
@@ -421,6 +422,7 @@ def __init__(
             raise NotImplementedError(
                 "Torch SDPA backend does not support FP8 KV cache. "
                 "Please use xFormers backend instead.")
+        self.attn_type = attn_type
 
     def forward(
         self,
@@ -431,7 +433,6 @@ def forward(
         attn_metadata: TorchSDPAMetadata,  # type: ignore
         k_scale: float = 1.0,
         v_scale: float = 1.0,
-        attn_type: str = AttentionType.DECODER,
         output: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         """Forward pass with torch SDPA and PagedAttention.
@@ -448,6 +449,7 @@ def forward(
             shape = [num_tokens, num_heads * head_size]
         """
         assert k_scale == 1.0 and v_scale == 1.0
+        attn_type = self.attn_type
         if (attn_type == AttentionType.ENCODER
                 and (not attn_metadata.is_all_encoder_attn_metadata_set)):
             raise AttributeError("Encoder attention requires setting "
diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py
index 3e59b3603d2c6..694c7cc1bc36a 100644
--- a/vllm/attention/backends/xformers.py
+++ b/vllm/attention/backends/xformers.py
@@ -379,6 +379,7 @@ def __init__(
         kv_cache_dtype: str,
         blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
+        attn_type: str = AttentionType.DECODER,
     ) -> None:
         if blocksparse_params is not None:
             raise ValueError(
@@ -405,6 +406,8 @@ def __init__(
                 f"Head size {head_size} is not supported by PagedAttention. "
                 f"Supported head sizes are: {suppored_head_sizes}.")
 
+        self.attn_type = attn_type
+
     def forward(
         self,
         query: torch.Tensor,
@@ -414,7 +417,6 @@ def forward(
         attn_metadata: "XFormersMetadata",
         k_scale: float = 1.0,
         v_scale: float = 1.0,
-        attn_type: str = AttentionType.DECODER,
         output: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         """Forward pass with xFormers and PagedAttention.
@@ -468,7 +470,7 @@ def forward(
         Returns:
             shape = [num_tokens, num_heads * head_size]
         """
-
+        attn_type = self.attn_type
         # Check that appropriate attention metadata attributes are
         # selected for the desired attention type
         if (attn_type == AttentionType.ENCODER
diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index 69b6d1e4648df..f1b3598e60b54 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -41,6 +41,7 @@ def __init__(
         logits_soft_cap: Optional[float] = None,
         per_layer_sliding_window: Optional[int] = None,
         prefix: str = "",
+        attn_type: str = AttentionType.DECODER,
     ) -> None:
         super().__init__()
         if per_layer_sliding_window is not None:
@@ -96,7 +97,7 @@ def __init__(
         impl_cls = attn_backend.get_impl_cls()
         self.impl = impl_cls(num_heads, head_size, scale, num_kv_heads,
                              alibi_slopes, sliding_window, kv_cache_dtype,
-                             blocksparse_params, logits_soft_cap)
+                             blocksparse_params, logits_soft_cap, attn_type)
         self.num_heads = num_heads
         self.head_size = head_size
         self.num_kv_heads = num_kv_heads
@@ -119,6 +120,7 @@ def __init__(
             raise ValueError(f"Duplicate layer name: {prefix}")
         compilation_config.static_forward_context[prefix] = self
         self.layer_name = prefix
+        self.attn_type = attn_type
 
     def forward(
         self,
@@ -127,18 +129,12 @@ def forward(
         value: torch.Tensor,
         kv_cache: torch.Tensor,
         attn_metadata: AttentionMetadata,
-        attn_type: str = AttentionType.DECODER,
     ) -> torch.Tensor:
 
         if self.use_direct_call:
-            return self.impl.forward(query,
-                                     key,
-                                     value,
-                                     kv_cache,
-                                     attn_metadata,
-                                     self._k_scale,
-                                     self._v_scale,
-                                     attn_type=attn_type)
+            return self.impl.forward(query, key, value, kv_cache,
+                                     attn_metadata, self._k_scale,
+                                     self._v_scale)
         elif self.use_output:
             output = torch.empty_like(query)
             hidden_size = query.size(-1)
@@ -152,13 +148,11 @@ def forward(
             if value is not None:
                 value = value.view(-1, self.num_kv_heads, self.head_size)
             torch.ops.vllm.unified_attention_with_output(
-                query, key, value, output, kv_cache, attn_type,
-                self.layer_name)
+                query, key, value, output, kv_cache, self.layer_name)
             return output.view(-1, hidden_size)
         else:
             return torch.ops.vllm.unified_attention(query, key, value,
-                                                    kv_cache, attn_type,
-                                                    self.layer_name)
+                                                    kv_cache, self.layer_name)
 
     def extra_repr(self) -> str:
         s = f"head_size={self.impl.head_size}"  # type: ignore
@@ -237,20 +231,13 @@ def unified_attention(
     key: torch.Tensor,
     value: torch.Tensor,
     kv_cache: torch.Tensor,
-    attn_type: str,
     layer_name: str,
 ) -> torch.Tensor:
     forward_context: ForwardContext = get_forward_context()
     attn_metadata = forward_context.dynamic_forward_context
     self = forward_context.static_forward_context[layer_name]
-    return self.impl.forward(query,
-                             key,
-                             value,
-                             kv_cache,
-                             attn_metadata,
-                             self._k_scale,
-                             self._v_scale,
-                             attn_type=attn_type)
+    return self.impl.forward(query, key, value, kv_cache, attn_metadata,
+                             self._k_scale, self._v_scale)
 
 
 def unified_attention_fake(
@@ -258,7 +245,6 @@ def unified_attention_fake(
     key: torch.Tensor,
     value: torch.Tensor,
     kv_cache: torch.Tensor,
-    attn_type: str,
     layer_name: str,
 ) -> torch.Tensor:
     return torch.empty_like(query).contiguous()
@@ -279,7 +265,6 @@ def unified_attention_with_output(
     value: torch.Tensor,
     output: torch.Tensor,
     kv_cache: torch.Tensor,
-    attn_type: str,
     layer_name: str,
 ) -> None:
     forward_context: ForwardContext = get_forward_context()
@@ -292,7 +277,6 @@ def unified_attention_with_output(
                       attn_metadata,
                       self._k_scale,
                       self._v_scale,
-                      attn_type=attn_type,
                       output=output)
 
 
@@ -302,7 +286,6 @@ def unified_attention_with_output_fake(
     value: torch.Tensor,
     output: torch.Tensor,
     kv_cache: torch.Tensor,
-    attn_type: str,
     layer_name: str,
 ) -> None:
     return
diff --git a/vllm/model_executor/models/bart.py b/vllm/model_executor/models/bart.py
index 3776490cb3465..57eb5adc82d5b 100644
--- a/vllm/model_executor/models/bart.py
+++ b/vllm/model_executor/models/bart.py
@@ -71,12 +71,8 @@ def __init__(self, num_embeddings: int, embedding_dim: int):
     def forward(
         self,
         positions: torch.Tensor,
-        attn_type: AttentionType,
     ) -> torch.Tensor:
         """`input_ids' shape is expected to be [bsz x seqlen]."""
-
-        assert attn_type != AttentionType.ENCODER_DECODER
-
         return super().forward(positions + self.offset)
 
 
@@ -180,7 +176,8 @@ def __init__(
                               num_kv_heads=self.num_kv_heads,
                               cache_config=cache_config,
                               quant_config=quant_config,
-                              prefix=f"{prefix}.attn")
+                              prefix=f"{prefix}.attn",
+                              attn_type=AttentionType.ENCODER)
 
     def forward(self, hidden_states: torch.Tensor, kv_cache: torch.Tensor,
                 attn_metadata: AttentionMetadata) -> torch.Tensor:
@@ -189,12 +186,7 @@ def forward(self, hidden_states: torch.Tensor, kv_cache: torch.Tensor,
         qkv, _ = self.qkv_proj(hidden_states)
         q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
 
-        attn_output = self.attn(q,
-                                k,
-                                v,
-                                kv_cache,
-                                attn_metadata,
-                                attn_type=AttentionType.ENCODER)
+        attn_output = self.attn(q, k, v, kv_cache, attn_metadata)
 
         output, _ = self.out_proj(attn_output)
         return output
@@ -264,7 +256,8 @@ def __init__(
                               num_kv_heads=self.num_kv_heads,
                               cache_config=cache_config,
                               quant_config=quant_config,
-                              prefix=f"{prefix}.attn")
+                              prefix=f"{prefix}.attn",
+                              attn_type=AttentionType.DECODER)
 
     def forward(self, hidden_states: torch.Tensor, kv_cache: torch.Tensor,
                 attn_metadata: AttentionMetadata) -> torch.Tensor:
@@ -273,12 +266,7 @@ def forward(self, hidden_states: torch.Tensor, kv_cache: torch.Tensor,
         qkv, _ = self.qkv_proj(hidden_states)
         q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
 
-        attn_output = self.attn(q,
-                                k,
-                                v,
-                                kv_cache,
-                                attn_metadata,
-                                attn_type=AttentionType.DECODER)
+        attn_output = self.attn(q, k, v, kv_cache, attn_metadata)
 
         output, _ = self.out_proj(attn_output)
         return output
@@ -348,7 +336,8 @@ def __init__(
                               num_kv_heads=self.num_kv_heads,
                               cache_config=cache_config,
                               quant_config=quant_config,
-                              prefix=f"{prefix}.attn")
+                              prefix=f"{prefix}.attn",
+                              attn_type=AttentionType.ENCODER_DECODER)
 
     def forward(
         self,
@@ -372,12 +361,7 @@ def forward(
             _, k, v = qkv_enc.split([self.q_size, self.kv_size, self.kv_size],
                                     dim=-1)
 
-        attn_output = self.attn(q,
-                                k,
-                                v,
-                                kv_cache,
-                                attn_metadata,
-                                attn_type=AttentionType.ENCODER_DECODER)
+        attn_output = self.attn(q, k, v, kv_cache, attn_metadata)
 
         output, _ = self.out_proj(attn_output)
         return output
@@ -644,10 +628,7 @@ def forward(self, input_ids: torch.Tensor, positions: torch.Tensor,
         # retrieve input_ids and inputs_embeds
         inputs_embeds = self.embed_tokens(input_ids)
 
-        embed_pos = self.embed_positions(
-            positions,
-            AttentionType.ENCODER,
-        )
+        embed_pos = self.embed_positions(positions)
         embed_pos = embed_pos.to(inputs_embeds.device)
 
         hidden_states = inputs_embeds + embed_pos
@@ -734,10 +715,7 @@ def forward(self, decoder_input_ids: torch.Tensor,
         inputs_embeds = self.embed_tokens(decoder_input_ids)
 
         # embed positions
-        embed_pos = self.embed_positions(
-            decoder_positions,
-            AttentionType.DECODER,
-        )
+        embed_pos = self.embed_positions(decoder_positions)
         embed_pos = embed_pos.to(inputs_embeds.device)
 
         hidden_states = inputs_embeds + embed_pos
diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py
index c1d47b1bc9bcd..4be136543de15 100644
--- a/vllm/model_executor/models/bert.py
+++ b/vllm/model_executor/models/bert.py
@@ -238,7 +238,8 @@ def __init__(
                               num_kv_heads=self.num_kv_heads,
                               cache_config=cache_config,
                               quant_config=quant_config,
-                              prefix=f"{prefix}.attn")
+                              prefix=f"{prefix}.attn",
+                              attn_type=AttentionType.ENCODER_ONLY)
 
     def forward(
         self,
@@ -248,12 +249,7 @@ def forward(
     ) -> torch.Tensor:
         qkv, _ = self.qkv_proj(hidden_states)
         q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
-        output = self.attn(q,
-                           k,
-                           v,
-                           kv_cache,
-                           attn_metadata,
-                           attn_type=AttentionType.ENCODER_ONLY)
+        output = self.attn(q, k, v, kv_cache, attn_metadata)
         return output
 
 
diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py
index 6536f9807730c..c5046e06edecb 100644
--- a/vllm/model_executor/models/mllama.py
+++ b/vllm/model_executor/models/mllama.py
@@ -770,6 +770,7 @@ def __init__(
             self.scaling,
             self.num_local_key_value_heads,
             prefix=f"{prefix}.attn",
+            attn_type=AttentionType.ENCODER_DECODER,
         )
 
     def forward(
@@ -805,13 +806,9 @@ def forward(
                                                kv_range_for_decode,
                                                attn_metadata)
         else:
-            output = self.attn(q.view(-1,
-                                      self.num_local_heads * self.head_dim),
-                               k,
-                               v,
-                               kv_cache,
-                               attn_metadata,
-                               attn_type=AttentionType.ENCODER_DECODER)
+            output = self.attn(
+                q.view(-1, self.num_local_heads * self.head_dim), k, v,
+                kv_cache, attn_metadata)
         out, _ = self.o_proj(output)
         return out
 
diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py
index 88f4ea4352726..01745b5fd53e1 100644
--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
@@ -107,7 +107,8 @@ def __init__(self,
                  cache_config: Optional[CacheConfig] = None,
                  quant_config: Optional[QuantizationConfig] = None,
                  rope_scaling: Optional[Tuple] = None,
-                 prefix: str = "") -> None:
+                 prefix: str = "",
+                 attn_type: str = AttentionType.DECODER) -> None:
         super().__init__()
         self.hidden_size = hidden_size
         tp_size = get_tensor_model_parallel_world_size()
@@ -160,7 +161,8 @@ def __init__(self,
                               num_kv_heads=self.num_kv_heads,
                               cache_config=cache_config,
                               quant_config=quant_config,
-                              prefix=f"{prefix}.attn")
+                              prefix=f"{prefix}.attn",
+                              attn_type=attn_type)
 
     def forward(
         self,
@@ -168,17 +170,11 @@ def forward(
         hidden_states: torch.Tensor,
         kv_cache: torch.Tensor,
         attn_metadata: AttentionMetadata,
-        attn_type: str = AttentionType.DECODER,
     ) -> torch.Tensor:
         qkv, _ = self.qkv_proj(hidden_states)
         q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
         q, k = self.rotary_emb(positions, q, k)
-        attn_output = self.attn(q,
-                                k,
-                                v,
-                                kv_cache,
-                                attn_metadata,
-                                attn_type=attn_type)
+        attn_output = self.attn(q, k, v, kv_cache, attn_metadata)
         output, _ = self.o_proj(attn_output)
         return output
 
@@ -197,6 +193,16 @@ def __init__(
         # Requires transformers > 4.32.0
         rope_theta = getattr(config, "rope_theta", 1000000)
         rope_scaling = getattr(config, "rope_scaling", None)
+
+        # By default, Qwen2 uses causal attention as it is a decoder-only model.
+        # You can override the HF config with `is_causal=False` to enable
+        # bidirectional attention, which is used in some embedding models
+        # (e.g. Alibaba-NLP/gte-Qwen2-7B-instruct)
+        if getattr(config, "is_causal", True):
+            attn_type = AttentionType.DECODER
+        else:
+            attn_type = AttentionType.ENCODER_ONLY
+
         self.self_attn = Qwen2Attention(
             hidden_size=self.hidden_size,
             num_heads=config.num_attention_heads,
@@ -207,6 +213,7 @@ def __init__(
             quant_config=quant_config,
             rope_scaling=rope_scaling,
             prefix=f"{prefix}.self_attn",
+            attn_type=attn_type,
         )
         self.mlp = Qwen2MLP(
             hidden_size=self.hidden_size,
@@ -220,15 +227,6 @@ def __init__(
         self.post_attention_layernorm = RMSNorm(config.hidden_size,
                                                 eps=config.rms_norm_eps)
 
-        # By default, Qwen2 uses causal attention as it is a decoder-only model.
-        # You can override the HF config with `is_causal=False` to enable
-        # bidirectional attention, which is used in some embedding models
-        # (e.g. Alibaba-NLP/gte-Qwen2-7B-instruct)
-        if getattr(config, "is_causal", True):
-            self._attn_type = AttentionType.DECODER
-        else:
-            self._attn_type = AttentionType.ENCODER_ONLY
-
     def forward(
         self,
         positions: torch.Tensor,
@@ -249,7 +247,6 @@ def forward(
             hidden_states=hidden_states,
             kv_cache=kv_cache,
             attn_metadata=attn_metadata,
-            attn_type=self._attn_type,
         )
 
         # Fully Connected
diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py
index 65002f1ad70c7..b02bc9ffde538 100644
--- a/vllm/v1/attention/backends/flash_attn.py
+++ b/vllm/v1/attention/backends/flash_attn.py
@@ -89,6 +89,7 @@ def __init__(
         kv_cache_dtype: str,
         blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
+        attn_type: AttentionType = AttentionType.DECODER,
     ) -> None:
         if blocksparse_params is not None:
             raise ValueError(
@@ -119,6 +120,12 @@ def __init__(
                 f"Head size {head_size} is not supported by FlashAttention. "
                 f"Supported head sizes are: {support_head_sizes}.")
 
+        if attn_type != AttentionType.DECODER:
+            raise NotImplementedError("Encoder self-attention and "
+                                      "encoder/decoder cross-attention "
+                                      "are not implemented for "
+                                      "FlashAttentionImpl")
+
     def forward(
         self,
         query: torch.Tensor,
@@ -128,7 +135,6 @@ def forward(
         attn_metadata: FlashAttentionMetadata,
         k_scale: float = 1.0,
         v_scale: float = 1.0,
-        attn_type: AttentionType = AttentionType.DECODER,
         output: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         """Forward pass with FlashAttention.
@@ -142,12 +148,6 @@ def forward(
         Returns:
             shape = [num_tokens, num_heads * head_size]
         """
-        if attn_type != AttentionType.DECODER:
-            raise NotImplementedError("Encoder self-attention and "
-                                      "encoder/decoder cross-attention "
-                                      "are not implemented for "
-                                      "FlashAttentionImpl")
-
         # NOTE(woosuk): FlashAttention does not support FP8 KV cache.
         assert k_scale == 1.0 and v_scale == 1.0, (
             "key/v_scale is not supported in FlashAttention.")

From 91b361ae898c944f823534121613f9d3dc19d7d1 Mon Sep 17 00:00:00 2001
From: Roger Wang <136131678+ywang96@users.noreply.github.com>
Date: Mon, 6 Jan 2025 11:58:16 -0800
Subject: [PATCH 095/462] [V1] Extend beyond image modality and support
 mixed-modality inference with Llava-OneVision (#11685)

Signed-off-by: Roger Wang <ywang@roblox.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 docs/source/models/supported_models.md        |   2 +-
 tests/multimodal/test_utils.py                | 209 +++++++++++++++++-
 tests/v1/core/test_kv_cache_utils.py          |  18 +-
 tests/v1/core/test_prefix_caching.py          |  17 +-
 vllm/model_executor/models/interfaces.py      |   6 +-
 vllm/model_executor/models/llava_onevision.py |  65 +++---
 vllm/model_executor/models/molmo.py           |   3 -
 vllm/multimodal/__init__.py                   |   3 +
 vllm/multimodal/hasher.py                     | 100 +++++++++
 vllm/multimodal/inputs.py                     |   9 +-
 vllm/multimodal/processing.py                 |  92 +++-----
 vllm/multimodal/utils.py                      |  86 ++++++-
 vllm/v1/engine/__init__.py                    |  18 +-
 vllm/v1/engine/mm_input_mapper.py             |  67 ------
 vllm/v1/engine/processor.py                   | 101 ++++++---
 vllm/v1/request.py                            |  48 ++--
 vllm/v1/worker/gpu_model_runner.py            |  74 ++++---
 17 files changed, 636 insertions(+), 282 deletions(-)
 create mode 100644 vllm/multimodal/hasher.py

diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md
index 5a2778026192a..94a8849f7edcd 100644
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -647,7 +647,7 @@ See [this page](#generative-models) for more information on how to use generativ
   - `llava-hf/llava-onevision-qwen2-7b-ov-hf`, `llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc.
   -
   - ✅︎
-  -
+  - ✅︎
 * - `MiniCPMV`
   - MiniCPM-V
   - T + I<sup>E+</sup>
diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py
index 6029f2e514772..198344e5bd88c 100644
--- a/tests/multimodal/test_utils.py
+++ b/tests/multimodal/test_utils.py
@@ -2,16 +2,22 @@
 import mimetypes
 import os
 from tempfile import NamedTemporaryFile, TemporaryDirectory
-from typing import Dict, Tuple
+from typing import TYPE_CHECKING, Dict, NamedTuple, Optional, Tuple
 
 import numpy as np
 import pytest
 from PIL import Image, ImageChops
 from transformers import AutoConfig, AutoTokenizer
 
+from vllm.multimodal.inputs import PlaceholderRange
 from vllm.multimodal.utils import (MediaConnector,
+                                   merge_and_sort_multimodal_metadata,
                                    repeat_and_pad_placeholder_tokens)
 
+if TYPE_CHECKING:
+    from vllm.multimodal.hasher import MultiModalHashDict
+    from vllm.multimodal.inputs import MultiModalPlaceholderDict
+
 # Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
 TEST_IMAGE_URLS = [
     "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
@@ -191,3 +197,204 @@ def test_repeat_and_pad_placeholder_tokens(model):
         assert new_prompt == expected_prompt
         assert new_token_ids == expected_token_ids
         assert ranges == expected_ranges
+
+
+# Used for the next two tests related to `merge_and_sort_multimodal_metadata`.
+class TestCase(NamedTuple):
+    mm_positions: "MultiModalPlaceholderDict"
+    mm_hashes: Optional["MultiModalHashDict"]
+    expected_modalities: list[str]
+    expected_ranges: list[PlaceholderRange]
+    expected_hashes: Optional[list[str]]
+
+
+def test_merge_and_sort_multimodal_metadata():
+
+    test_cases = [
+        # Single modality should return result as is but flattened
+        TestCase(
+            mm_positions={
+                "image": [
+                    PlaceholderRange(offset=0, length=2),
+                    PlaceholderRange(offset=3, length=2),
+                ]
+            },
+            mm_hashes={"image": ["hash1", "hash2"]},
+            expected_modalities=["image"],
+            expected_ranges=[
+                PlaceholderRange(offset=0, length=2),
+                PlaceholderRange(offset=3, length=2),
+            ],
+            expected_hashes=["hash1", "hash2"],
+        ),
+
+        # Single modality without hashes return None for mm hash.
+        TestCase(
+            mm_positions={
+                "image": [
+                    PlaceholderRange(offset=0, length=2),
+                    PlaceholderRange(offset=2, length=2),
+                ]
+            },
+            mm_hashes=None,
+            expected_modalities=["image"],
+            expected_ranges=[
+                PlaceholderRange(offset=0, length=2),
+                PlaceholderRange(offset=2, length=2),
+            ],
+            expected_hashes=None,
+        ),
+
+        # Multiple modalities with hashes should return sorted modalities
+        # and flattened ranges and hashes.
+        TestCase(
+            mm_positions={
+                "image": [
+                    PlaceholderRange(offset=7, length=4),
+                    PlaceholderRange(offset=11, length=5),
+                ],
+                "audio": [
+                    PlaceholderRange(offset=0, length=2),
+                    PlaceholderRange(offset=2, length=3),
+                ]
+            },
+            mm_hashes={
+                "image": ["image_hash1", "image_hash2"],
+                "audio": ["audio_hash1", "audio_hash2"],
+            },
+            expected_modalities=["audio", "image"],
+            expected_ranges=[
+                PlaceholderRange(offset=0, length=2),
+                PlaceholderRange(offset=2, length=3),
+                PlaceholderRange(offset=7, length=4),
+                PlaceholderRange(offset=11, length=5),
+            ],
+            expected_hashes=[
+                "audio_hash1", "audio_hash2", "image_hash1", "image_hash2"
+            ],
+        ),
+
+        # Multiple modalities without hashes should return sorted modalities
+        # and flattened ranges and None.
+        TestCase(
+            mm_positions={
+                "image": [
+                    PlaceholderRange(offset=7, length=4),
+                    PlaceholderRange(offset=11, length=5),
+                ],
+                "audio": [
+                    PlaceholderRange(offset=0, length=2),
+                    PlaceholderRange(offset=2, length=3),
+                ]
+            },
+            mm_hashes=None,
+            expected_modalities=["audio", "image"],
+            expected_ranges=[
+                PlaceholderRange(offset=0, length=2),
+                PlaceholderRange(offset=2, length=3),
+                PlaceholderRange(offset=7, length=4),
+                PlaceholderRange(offset=11, length=5),
+            ],
+            expected_hashes=None,
+        ),
+
+        # Three modalities
+        TestCase(
+            mm_positions={
+                "image": [
+                    PlaceholderRange(offset=15, length=7),
+                    PlaceholderRange(offset=22, length=8),
+                ],
+                "audio": [
+                    PlaceholderRange(offset=0, length=2),
+                ],
+                "video": [
+                    PlaceholderRange(offset=3, length=4),
+                    PlaceholderRange(offset=7, length=5),
+                    PlaceholderRange(offset=12, length=6),
+                ]
+            },
+            mm_hashes={
+                "image": ["image_hash1", "image_hash2"],
+                "audio": ["audio_hash1"],
+                "video": ["video_hash1", "video_hash2", "video_hash3"]
+            },
+            expected_modalities=["audio", "video", "image"],
+            expected_ranges=[
+                PlaceholderRange(offset=0, length=2),
+                PlaceholderRange(offset=3, length=4),
+                PlaceholderRange(offset=7, length=5),
+                PlaceholderRange(offset=12, length=6),
+                PlaceholderRange(offset=15, length=7),
+                PlaceholderRange(offset=22, length=8),
+            ],
+            expected_hashes=[
+                "audio_hash1", "video_hash1", "video_hash2", "video_hash3",
+                "image_hash1", "image_hash2"
+            ],
+        ),
+    ]
+
+    for (mm_positions, mm_hashes, expected_modalities, expected_ranges,
+         expected_hashes) in test_cases:
+        modalities, ranges, hashes = merge_and_sort_multimodal_metadata(
+            mm_positions, mm_hashes)
+
+        assert modalities == expected_modalities
+        assert ranges == expected_ranges
+        assert hashes == expected_hashes
+
+
+def test_merge_and_sort_multimodal_metadata_with_interleaving():
+
+    test_cases = [
+
+        # <image> <audio> <image> <audio>
+        TestCase(
+            mm_positions={
+                "image": [
+                    PlaceholderRange(offset=0, length=4),
+                    PlaceholderRange(offset=8, length=2),
+                ],
+                "audio": [
+                    PlaceholderRange(offset=5, length=2),
+                    PlaceholderRange(offset=11, length=4),
+                ]
+            },
+            mm_hashes={
+                "image": ["image_hash1", "image_hash2"],
+                "audio": ["audio_hash1", "audio_hash2"],
+            },
+            expected_modalities=[],
+            expected_ranges=[],
+            expected_hashes=None,
+        ),
+
+        # <image> <image> <video> <audio> <image>
+        TestCase(
+            mm_positions={
+                "image": [
+                    PlaceholderRange(offset=0, length=2),
+                    PlaceholderRange(offset=2, length=3),
+                    PlaceholderRange(offset=20, length=4),
+                ],
+                "audio": [
+                    PlaceholderRange(offset=5, length=2),
+                ],
+                "video": [
+                    PlaceholderRange(offset=8, length=5),
+                ]
+            },
+            mm_hashes=None,
+            expected_modalities=[],
+            expected_ranges=[],
+            expected_hashes=None,
+        ),
+    ]
+
+    for case in test_cases:
+        with pytest.raises(ValueError) as ex_info:
+            merge_and_sort_multimodal_metadata(case.mm_positions,
+                                               case.mm_hashes)
+
+        assert "Interleaved mixed-modality" in str(ex_info.value)
diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py
index 2ed70b42991b5..f4081766e39a2 100644
--- a/tests/v1/core/test_kv_cache_utils.py
+++ b/tests/v1/core/test_kv_cache_utils.py
@@ -1,6 +1,6 @@
 import pytest
 
-from vllm.inputs import token_inputs
+from vllm.multimodal.inputs import MultiModalKwargs
 from vllm.sampling_params import SamplingParams
 from vllm.v1.core.kv_cache_utils import (BlockHashType, FreeKVCacheBlockQueue,
                                          KVCacheBlock,
@@ -14,14 +14,18 @@ def make_request(request_id,
                  prompt_token_ids,
                  mm_positions=None,
                  mm_hashes=None):
+    if mm_positions is None:
+        multi_modal_inputs = None
+    else:
+        multi_modal_inputs = [MultiModalKwargs({})] * len(mm_positions)
+
     return Request(
         request_id=request_id,
-        inputs=token_inputs(
-            prompt_token_ids=prompt_token_ids,
-            multi_modal_placeholders={"image": mm_positions}
-            if mm_positions else None,
-            multi_modal_hashes=mm_hashes,
-        ),
+        prompt=None,
+        prompt_token_ids=prompt_token_ids,
+        multi_modal_inputs=multi_modal_inputs,
+        multi_modal_hashes=mm_hashes,
+        multi_modal_placeholders=mm_positions,
         sampling_params=SamplingParams(max_tokens=17),
         eos_token_id=100,
         arrival_time=0,
diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py
index 35e3a2f972720..b97f55b8c6535 100644
--- a/tests/v1/core/test_prefix_caching.py
+++ b/tests/v1/core/test_prefix_caching.py
@@ -1,8 +1,7 @@
 """Compare the with and without prefix caching."""
 import pytest
 
-from vllm.inputs import token_inputs
-from vllm.multimodal.inputs import PlaceholderRange
+from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
 from vllm.sampling_params import SamplingParams
 from vllm.utils import cdiv
 from vllm.v1.core.kv_cache_manager import KVCacheManager, Request
@@ -13,12 +12,18 @@ def make_request(request_id,
                  prompt_token_ids,
                  mm_positions=None,
                  mm_hashes=None):
+    if mm_positions is None:
+        multi_modal_inputs = None
+    else:
+        multi_modal_inputs = [MultiModalKwargs({})] * len(mm_positions)
+
     return Request(
         request_id=request_id,
-        inputs=token_inputs(prompt_token_ids=prompt_token_ids,
-                            multi_modal_placeholders={"image": mm_positions}
-                            if mm_positions else None,
-                            multi_modal_hashes=mm_hashes),
+        prompt=None,
+        prompt_token_ids=prompt_token_ids,
+        multi_modal_inputs=multi_modal_inputs,
+        multi_modal_hashes=mm_hashes,
+        multi_modal_placeholders=mm_positions,
         sampling_params=SamplingParams(max_tokens=17),
         eos_token_id=100,
         arrival_time=0,
diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py
index 70b78fe64f2d8..6f26603046483 100644
--- a/vllm/model_executor/models/interfaces.py
+++ b/vllm/model_executor/models/interfaces.py
@@ -39,8 +39,12 @@ def get_multimodal_embeddings(self, **kwargs) -> Optional[T]:
 
         The output embeddings must be one of the following formats:
         - A list or tuple of 2D tensors, where each tensor corresponds to 
-          each input image.
+          each input multimodal data item (e.g, image).
         - A single 3D tensor, with the batch dimension grouping the 2D tensors.
+
+        NOTE: The returned multimodal embeddings must be in the same order as 
+        the appearances of their corresponding multimodal data item in the 
+        input prompt.
         """
         ...
 
diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py
index 5eac2f223d794..9117824995021 100644
--- a/vllm/model_executor/models/llava_onevision.py
+++ b/vllm/model_executor/models/llava_onevision.py
@@ -35,6 +35,9 @@
 from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model,
                     maybe_prefix, merge_multimodal_embeddings)
 
+# For profile run
+_MAX_FRAMES_PER_VIDEO = 16
+
 
 class LlavaOnevisionVideoPixelInputs(TypedDict):
     type: Literal["pixel_values_videos"]
@@ -223,8 +226,10 @@ def _get_dummy_num_frames(self, seq_len: int) -> int:
         max_image_tokens = self._get_max_image_tokens() * max_images
         max_total_frames = self._get_max_video_frames(seq_len -
                                                       max_image_tokens)
+        max_frames_per_video = min(max_total_frames // max(max_videos, 1),
+                                   _MAX_FRAMES_PER_VIDEO)
 
-        return max(max_total_frames // max(max_videos, 1), 1)
+        return max(max_frames_per_video, 1)
 
     def _get_max_video_tokens(self, seq_len: int) -> int:
         target_width, target_height = self._get_image_size_with_most_features()
@@ -558,13 +563,15 @@ def _parse_and_validate_video_input(
     def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict:
         modalities = {}
 
-        if "pixel_values" in kwargs:
-            modalities["images"] = self._parse_and_validate_image_input(
-                **kwargs)
-
-        if "pixel_values_videos" in kwargs:
-            modalities["videos"] = self._parse_and_validate_video_input(
-                **kwargs)
+        # Preserve the order of modalities if there are multiple of them
+        # from the order of kwargs.
+        for input_key in kwargs:
+            if input_key == "pixel_values" and "images" not in modalities:
+                modalities["images"] = self._parse_and_validate_image_input(
+                    **kwargs)
+            if input_key == "pixel_values_videos" and "videos" not in modalities:  # noqa E501
+                modalities["videos"] = self._parse_and_validate_video_input(
+                    **kwargs)
 
         return modalities
 
@@ -824,21 +831,21 @@ def get_multimodal_embeddings(
         if not modalities:
             return None
 
-        # We make a tuple of each embedding with its modality string. This is a
-        # temporary workaround for models to handle mixed modalities when
-        # get_multimodal_embeddings and get_input_embeddings are called
-        # separately.
-        # TODO(ywang96): Add support for mixed-modality inference for v1.
-        multimodal_embeddings: List[Tuple[NestedTensors, str]] = []
-
-        if "images" in modalities:
-            image_input = modalities["images"]
-            vision_embeddings = self._process_image_input(image_input)
-            multimodal_embeddings.append((vision_embeddings, "image"))
-        if "videos" in modalities:
-            video_input = modalities["videos"]
-            video_embeddings = self._process_video_pixels(video_input)
-            multimodal_embeddings.append((video_embeddings, "video"))
+        # The result multimodal_embeddings is tuple of tensors, with each
+        # tensor correspoending to a multimodal data item (image or video).
+        multimodal_embeddings: tuple[torch.Tensor, ...] = ()
+
+        # NOTE: It is important to iterate over the keys in this dictionary
+        # to preserve the order of the modalities.
+        for modality in modalities:
+            if modality == "images":
+                image_input = modalities["images"]
+                vision_embeddings = self._process_image_input(image_input)
+                multimodal_embeddings += tuple(vision_embeddings)
+            if modality == "videos":
+                video_input = modalities["videos"]
+                video_embeddings = self._process_video_pixels(video_input)
+                multimodal_embeddings += tuple(video_embeddings)
 
         return multimodal_embeddings
 
@@ -850,15 +857,9 @@ def get_input_embeddings(
     ) -> torch.Tensor:
         inputs_embeds = self.language_model.get_input_embeddings(input_ids)
         if multimodal_embeddings is not None:
-            for embeddings, modality in multimodal_embeddings:
-                if modality == "image":
-                    inputs_embeds = merge_multimodal_embeddings(
-                        input_ids, inputs_embeds, embeddings,
-                        self.config.image_token_index)
-                if modality == "video":
-                    inputs_embeds = merge_multimodal_embeddings(
-                        input_ids, inputs_embeds, embeddings,
-                        self.config.video_token_index)
+            inputs_embeds = merge_multimodal_embeddings(
+                input_ids, inputs_embeds, multimodal_embeddings,
+                [self.config.image_token_index, self.config.video_token_index])
         return inputs_embeds
 
     def forward(
diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py
index 0e8287bb56b6b..2e60bc719f096 100644
--- a/vllm/model_executor/models/molmo.py
+++ b/vllm/model_executor/models/molmo.py
@@ -972,8 +972,6 @@ def image_input_mapper_for_molmo(
         assert len(data) == 1, "Molmo supports only one image per prompt."
         data = data[0]
 
-    # Remove unused dummy PIL image
-    data.pop('raw_mm_data', None)
     return MultiModalKwargs(data)
 
 
@@ -1019,7 +1017,6 @@ def dummy_data_for_molmo(ctx: InputContext, seq_len: int,
     dummy_imgdata = {
         "images": out["images"],
         "image_input_idx": out["image_input_idx"],
-        "raw_mm_data": dummy_image,
     }
     if "image_masks" in out:
         dummy_imgdata["image_masks"] = out["image_masks"]
diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py
index e58bbe81717a0..343b9322ecc5e 100644
--- a/vllm/multimodal/__init__.py
+++ b/vllm/multimodal/__init__.py
@@ -1,4 +1,5 @@
 from .base import MultiModalPlaceholderMap, MultiModalPlugin
+from .hasher import MultiModalHashDict, MultiModalHasher
 from .inputs import (BatchedTensorInputs, ModalityData, MultiModalDataBuiltins,
                      MultiModalDataDict, MultiModalKwargs,
                      MultiModalPlaceholderDict, NestedTensors)
@@ -18,6 +19,8 @@
     "ModalityData",
     "MultiModalDataBuiltins",
     "MultiModalDataDict",
+    "MultiModalHashDict",
+    "MultiModalHasher",
     "MultiModalKwargs",
     "MultiModalPlaceholderDict",
     "MultiModalPlaceholderMap",
diff --git a/vllm/multimodal/hasher.py b/vllm/multimodal/hasher.py
new file mode 100644
index 0000000000000..24aa1ca658048
--- /dev/null
+++ b/vllm/multimodal/hasher.py
@@ -0,0 +1,100 @@
+import pickle
+from typing import TYPE_CHECKING, Iterable, Mapping, Optional
+
+import numpy as np
+import torch
+from blake3 import blake3
+from PIL import Image
+
+from vllm.logger import init_logger
+
+if TYPE_CHECKING:
+    from vllm.inputs import TokensPrompt
+
+logger = init_logger(__name__)
+
+MultiModalHashDict = Mapping[str, list[str]]
+"""
+A dictionary containing hashes for items in each modality.
+"""
+
+
+class MultiModalHasher:
+
+    @classmethod
+    def serialize_item(cls, obj: object) -> bytes:
+        # Simple cases
+        if isinstance(obj, str):
+            return obj.encode("utf-8")
+        if isinstance(obj, bytes):
+            return obj
+        if isinstance(obj, Image.Image):
+            return obj.tobytes()
+
+        # Convertible to NumPy arrays
+        if isinstance(obj, torch.Tensor):
+            obj = obj.numpy()
+        if isinstance(obj, (int, float)):
+            obj = np.array(obj)
+        if isinstance(obj, np.ndarray):
+            return obj.tobytes()
+
+        logger.warning(
+            "No serialization method found for %s. "
+            "Falling back to pickle.", type(obj))
+
+        return pickle.dumps(obj)
+
+    @classmethod
+    def item_to_bytes(
+        cls,
+        key: str,
+        obj: object,
+    ) -> Iterable[tuple[bytes, bytes]]:
+        # Recursive cases
+        if isinstance(obj, (list, tuple)):
+            for i, elem in enumerate(obj):
+                yield from cls.item_to_bytes(f"{key}.{i}", elem)
+        elif isinstance(obj, dict):
+            for k, v in obj.items():
+                yield from cls.item_to_bytes(f"{key}.{k}", v)
+        else:
+            key_bytes = cls.serialize_item(key)
+            value_bytes = cls.serialize_item(obj)
+            yield key_bytes, value_bytes
+
+    @classmethod
+    def hash_kwargs(cls, **kwargs: object) -> str:
+        hasher = blake3()
+
+        for k, v in kwargs.items():
+            for k_bytes, v_bytes in cls.item_to_bytes(k, v):
+                hasher.update(k_bytes)
+                hasher.update(v_bytes)
+
+        return hasher.hexdigest()
+
+    @classmethod
+    def hash_prompt_mm_data(
+            cls, prompt: "TokensPrompt") -> Optional["MultiModalHashDict"]:
+        """Hash multimodal data in the user input prompt if they exist."""
+
+        if "multi_modal_data" not in prompt:
+            return None
+
+        mm_data = prompt["multi_modal_data"]
+        if not mm_data:
+            # mm_data can be None or an empty dict.
+            return None
+
+        mm_items = {
+            modality: items if isinstance(items, list) else [items]
+            for modality, items in mm_data.items()
+        }
+
+        mm_hashes = {
+            modality: [cls.hash_kwargs(**{modality: item}) for item in items]
+            for modality, items in mm_items.items()
+        }
+
+        return mm_hashes
diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py
index b0a1104546186..8fdcc4b524035 100644
--- a/vllm/multimodal/inputs.py
+++ b/vllm/multimodal/inputs.py
@@ -2,8 +2,8 @@
 from collections import UserDict, defaultdict
 from collections.abc import Mapping, Sequence
 from dataclasses import dataclass
-from typing import (Any, Literal, Optional, TypedDict, TypeVar, Union, cast,
-                    final)
+from typing import (TYPE_CHECKING, Any, Literal, Optional, TypedDict, TypeVar,
+                    Union, cast, final)
 
 import numpy as np
 import torch
@@ -14,6 +14,9 @@
 
 from vllm.utils import JSONTree, full_groupby, is_list_of, json_map_leaves
 
+if TYPE_CHECKING:
+    from .hasher import MultiModalHashDict
+
 _T = TypeVar("_T")
 
 HfImageItem: TypeAlias = Union[Image, np.ndarray, torch.Tensor]
@@ -513,7 +516,7 @@ class MultiModalInputsV2(TypedDict):
     mm_kwargs: MultiModalKwargs
     """Keyword arguments to be directly passed to the model after batching."""
 
-    mm_hashes: NotRequired[list[str]]
+    mm_hashes: NotRequired[Optional["MultiModalHashDict"]]
     """The hashes of the multi-modal data."""
 
     mm_placeholders: MultiModalPlaceholderDict
diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index 933c1d3aff0cb..41113cd85bd16 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -1,4 +1,3 @@
-import pickle
 import re
 from abc import ABC, abstractmethod
 from collections import defaultdict
@@ -7,18 +6,16 @@
 from functools import lru_cache
 from typing import Any, NamedTuple, Optional, Protocol, TypeVar, Union
 
-import numpy as np
-import torch
-from blake3 import blake3
-from PIL import Image
 from transformers import BatchFeature, PretrainedConfig, ProcessorMixin
 
+from vllm import envs
 from vllm.inputs import DummyData, InputProcessingContext
 from vllm.logger import init_logger
 from vllm.transformers_utils.tokenizer import (AnyTokenizer, decode_tokens,
                                                encode_tokens)
 from vllm.utils import LRUCache, flatten_2d_lists, full_groupby
 
+from .hasher import MultiModalHasher
 from .inputs import (MultiModalDataDict, MultiModalFieldConfig,
                      MultiModalInputsV2, MultiModalKwargs,
                      MultiModalKwargsItem, PlaceholderRange)
@@ -486,56 +483,6 @@ def _maybe_log_cache_stats(self) -> None:
             logger.debug("ProcessingCache: hit_ratio = %.2f",
                          cache_stats.hit_ratio)
 
-    def _serialize_item(self, obj: object) -> bytes:
-        # Simple cases
-        if isinstance(obj, str):
-            return obj.encode("utf-8")
-        if isinstance(obj, bytes):
-            return obj
-        if isinstance(obj, Image.Image):
-            return obj.tobytes()
-
-        # Convertible to NumPy arrays
-        if isinstance(obj, torch.Tensor):
-            obj = obj.numpy()
-        if isinstance(obj, (int, float)):
-            obj = np.array(obj)
-        if isinstance(obj, np.ndarray):
-            return obj.tobytes()
-
-        logger.warning(
-            "No serialization method found for %s. "
-            "Falling back to pickle.", type(obj))
-
-        return pickle.dumps(obj)
-
-    def _item_to_bytes(
-        self,
-        key: str,
-        obj: object,
-    ) -> Iterable[tuple[bytes, bytes]]:
-        # Recursive cases
-        if isinstance(obj, (list, tuple)):
-            for i, elem in enumerate(obj):
-                yield from self._item_to_bytes(f"{key}.{i}", elem)
-        elif isinstance(obj, dict):
-            for k, v in obj.items():
-                yield from self._item_to_bytes(f"{key}.{k}", v)
-        else:
-            key_bytes = self._serialize_item(key)
-            value_bytes = self._serialize_item(obj)
-            yield key_bytes, value_bytes
-
-    def _hash_kwargs(self, **kwargs: object) -> str:
-        hasher = blake3()
-
-        for k, v in kwargs.items():
-            for k_bytes, v_bytes in self._item_to_bytes(k, v):
-                hasher.update(k_bytes)
-                hasher.update(v_bytes)
-
-        return hasher.hexdigest()
-
     def get(
         self,
         model_id: str,
@@ -554,9 +501,9 @@ def get(
         """
         self._maybe_log_cache_stats()
 
-        cache_key = self._hash_kwargs(model_id=model_id,
-                                      **{modality: input_item},
-                                      **input_kwargs)
+        cache_key = MultiModalHasher.hash_kwargs(model_id=model_id,
+                                                 **{modality: input_item},
+                                                 **input_kwargs)
         return self._cache.get(cache_key)
 
     def put(
@@ -571,9 +518,9 @@ def put(
         Put a processed multi-modal item into the cache
         according to its dependencies (see :meth:`get`).
         """
-        cache_key = self._hash_kwargs(model_id=model_id,
-                                      **{modality: input_item},
-                                      **input_kwargs)
+        cache_key = MultiModalHasher.hash_kwargs(model_id=model_id,
+                                                 **{modality: input_item},
+                                                 **input_kwargs)
         self._cache.put(cache_key, output_kwargs)
 
 
@@ -1049,6 +996,24 @@ def apply(
         """
         mm_items = self._to_mm_items(mm_data)
 
+        # Create MM hashes (only used in V1)
+        # TODO: Use these hash keys for caching operations in apply_hf_processor
+        # instead of rehashing.
+
+        if envs.VLLM_USE_V1:
+            model_id = self.ctx.model_config.model
+            mm_hashes = {
+                modality: [
+                    MultiModalHasher.hash_kwargs(model_id=model_id,
+                                                 **{modality: item},
+                                                 **hf_processor_mm_kwargs)
+                    for item in items
+                ]
+                for modality, items in mm_items.items()
+            }
+        else:
+            mm_hashes = None
+
         prompt_ids, mm_kwargs = self._cached_apply_hf_processor(
             prompt_text,
             mm_items,
@@ -1122,6 +1087,7 @@ def apply(
             prompt=prompt_text,
             prompt_token_ids=prompt_ids,
             mm_kwargs=mm_kwargs,
+            mm_hashes=mm_hashes,
             mm_placeholders=mm_placeholder_ranges,
         )
 
@@ -1174,7 +1140,9 @@ def get_dummy_data(self, seq_len: int) -> DummyData:
                 "tokens.")
 
         total_len = len(prompt_token_ids)
-        if total_len > seq_len:
+
+        # V0 does not support chunked prefill.
+        if total_len > seq_len and not envs.VLLM_USE_V1:
             logger.warning(
                 "The context length (%d) of the model is too short "
                 "to hold the multi-modal embeddings in the worst case "
diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py
index 7b6ded6a27084..f4a514ba55d0c 100644
--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -1,6 +1,6 @@
 from functools import lru_cache
 from pathlib import Path
-from typing import Optional, TypeVar, Union
+from typing import TYPE_CHECKING, Optional, TypeVar, Union
 from urllib.parse import ParseResult, urlparse
 
 import numpy as np
@@ -25,6 +25,10 @@
 
 _M = TypeVar("_M")
 
+if TYPE_CHECKING:
+    from .hasher import MultiModalHashDict
+    from .inputs import MultiModalPlaceholderDict
+
 
 class MediaConnector:
 
@@ -437,3 +441,83 @@ def consecutive_placeholder_ranges(
         PlaceholderRange(offset=initial_offset + i * item_size,
                          length=item_size) for i in range(num_items)
     ]
+
+
+def merge_and_sort_multimodal_metadata(
+    mm_positions: "MultiModalPlaceholderDict",
+    mm_hashes: Optional["MultiModalHashDict"],
+) -> tuple[list[str], list[PlaceholderRange], Optional[list[str]]]:
+    """Given a MultiModalPlaceholderDict, merge all PlaceholderRange
+    objects from all available modalities into a single list of 
+    PlaceholderRange, sorted by their offset (starting index in the input 
+    sequence) in the ascending order.
+
+    Optionally if a MultiModalHashDict is given, same operation will be 
+    applied to the object and the sorted list of hashes will be returned.
+
+    Raises:
+        ValueError: If the input prompt has interleaved placeholders from
+            different modalities (e.g, "<image><audio><image> Describe the 
+            content.")
+    
+    Returns:
+        list[str]: Sorted list of involved modalities.
+        list[PlaceholderRange]: Sorted list of all PlaceholdeRanges from 
+            mm_positions.
+        Optional[list[str]]: Sorted list of all hashes from mm_hashes if 
+            given, None otherwise.
+    """
+
+    modalities = list(mm_positions.keys())
+
+    assert len(modalities) > 0, "No modalities found in the mm_positions."
+
+    # For single modality, placeholder ranges and hashes are already sorted
+    # so we can return the list directly.
+    if len(modalities) == 1:
+        if mm_hashes is None:
+            return modalities, list(mm_positions[modalities[0]]), None
+        else:
+            return modalities, list(mm_positions[modalities[0]]), list(
+                mm_hashes[modalities[0]])
+
+    placeholder_lists_with_modality = [(modality, mm_positions[modality])
+                                       for modality in modalities]
+
+    if mm_hashes is None:
+        sorted_placeholder_lists = sorted(placeholder_lists_with_modality,
+                                          key=lambda x: x[1][0]['offset'])
+        sorted_hash_lists = None
+    else:
+        hashes_lists = [
+            mm_hashes[modality] for modality in modalities
+            if modality in mm_hashes
+        ]
+        sorted_pairs = sorted(zip(placeholder_lists_with_modality,
+                                  hashes_lists),
+                              key=lambda x: x[0][1][0]['offset'])
+        sorted_placeholder_tuple, sorted_hash_tuple = zip(*sorted_pairs)
+        sorted_placeholder_lists = list(sorted_placeholder_tuple)
+        sorted_hash_lists = list(sorted_hash_tuple)
+
+    sorted_modalities = [modality for modality, _ in sorted_placeholder_lists]
+
+    # Flatten sorted list of lists to a single list and verify there is no
+    # interleaving of placeholders from different modalities.
+    merged_placeholders: list[PlaceholderRange] = []
+    for modality, placeholder_list in sorted_placeholder_lists:
+        if merged_placeholders and placeholder_list[0][
+                'offset'] < merged_placeholders[-1]['offset']:
+            raise ValueError(
+                "Interleaved mixed-modality inference is currently not "
+                "supported.")
+        merged_placeholders.extend(placeholder_list)
+
+    if sorted_hash_lists is not None:
+        merged_hashes = []
+        for hash_list in sorted_hash_lists:
+            merged_hashes.extend(hash_list)
+    else:
+        merged_hashes = None
+
+    return sorted_modalities, merged_placeholders, merged_hashes
diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py
index f70464fc88298..5e3c5e327ef63 100644
--- a/vllm/v1/engine/__init__.py
+++ b/vllm/v1/engine/__init__.py
@@ -1,12 +1,14 @@
 import enum
 from dataclasses import dataclass
-from typing import List, Optional, Union
+from typing import TYPE_CHECKING, List, Optional, Union
 
 import msgspec
 
-from vllm.lora.request import LoRARequest
-from vllm.multimodal import MultiModalKwargs, MultiModalPlaceholderDict
-from vllm.sampling_params import SamplingParams
+if TYPE_CHECKING:
+    from vllm.lora.request import LoRARequest
+    from vllm.multimodal import MultiModalKwargs
+    from vllm.multimodal.inputs import PlaceholderRange
+    from vllm.sampling_params import SamplingParams
 
 
 @dataclass
@@ -21,13 +23,13 @@ class EngineCoreRequest:
     # always be tokenized?
     prompt: Optional[str]
     prompt_token_ids: List[int]
-    mm_inputs: Optional[List[Optional[MultiModalKwargs]]]
+    mm_inputs: Optional[List[Optional["MultiModalKwargs"]]]
     mm_hashes: Optional[List[str]]
-    mm_placeholders: Optional[MultiModalPlaceholderDict]
-    sampling_params: SamplingParams
+    mm_placeholders: Optional[List["PlaceholderRange"]]
+    sampling_params: "SamplingParams"
     eos_token_id: Optional[int]
     arrival_time: float
-    lora_request: Optional[LoRARequest]
+    lora_request: Optional["LoRARequest"]
 
 
 class EngineCoreOutput(
diff --git a/vllm/v1/engine/mm_input_mapper.py b/vllm/v1/engine/mm_input_mapper.py
index 8bfc739b3dbbc..d83460a40ad26 100644
--- a/vllm/v1/engine/mm_input_mapper.py
+++ b/vllm/v1/engine/mm_input_mapper.py
@@ -1,10 +1,6 @@
 from typing import Any, Dict, List, Optional
 
-import PIL
-from blake3 import blake3
-
 from vllm.config import ModelConfig
-from vllm.inputs import PromptType
 from vllm.logger import init_logger
 from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalDataDict,
                              MultiModalKwargs, MultiModalRegistry)
@@ -144,66 +140,3 @@ def process_inputs(
             full_mm_inputs.append(mm_input)
 
         return full_mm_inputs
-
-
-class MMHasher:
-
-    def __init__(self):
-        pass
-
-    def hash_dummy_mm_data(
-            self,
-            mm_data: Optional[MultiModalDataDict]) -> Optional[List[str]]:
-        """Hash user-defined dummy multimodal data used for profiling."""
-
-        if mm_data is None:
-            return None
-
-        image_inputs = mm_data['image']
-
-        # This is a temporary workaround for models (e.g, Molmo) that
-        # process multimodal data in the input processor (therefore
-        # image_inputs is MultiModalKwargs instead of raw input format).
-        # `raw_mm_data` with the original input format is expected
-        # in this case.
-        if isinstance(image_inputs, dict):
-            assert "raw_mm_data" in image_inputs and isinstance(
-                image_inputs["raw_mm_data"], PIL.Image.Image)
-            image_inputs = image_inputs.pop("raw_mm_data")
-
-        return self.hash_images(image_inputs)
-
-    def hash_prompt_mm_data(self, prompt: PromptType) -> Optional[List[str]]:
-        """Hash multimodal data in the user input prompt if they exist."""
-
-        if "multi_modal_data" not in prompt:
-            return None
-
-        mm_data = prompt["multi_modal_data"]
-        if not mm_data:
-            # mm_data can be None or an empty dict.
-            return None
-
-        image_inputs = mm_data["image"]
-
-        return self.hash_images(image_inputs)
-
-    def hash_images(self, image_inputs) -> Optional[List[str]]:
-        """Hash PIL image objects to strings."""
-        if not isinstance(image_inputs, list):
-            image_inputs = [image_inputs]
-        assert len(image_inputs) > 0
-
-        ret = []
-        for image in image_inputs:
-            assert isinstance(image, PIL.Image.Image)
-
-            # Convert image to bytes
-            bytes = image.tobytes()
-
-            # Hash image bytes
-            hasher = blake3()
-            hasher.update(bytes)
-            ret.append(hasher.hexdigest())
-
-        return ret
diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py
index c0f6cfab4865c..43419d2ff5381 100644
--- a/vllm/v1/engine/processor.py
+++ b/vllm/v1/engine/processor.py
@@ -7,14 +7,15 @@
 from vllm.inputs.parse import is_encoder_decoder_inputs
 from vllm.inputs.preprocess import InputPreprocessor
 from vllm.lora.request import LoRARequest
-from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalKwargs,
-                             MultiModalRegistry)
+from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalHasher,
+                             MultiModalKwargs, MultiModalRegistry)
+from vllm.multimodal.utils import merge_and_sort_multimodal_metadata
 from vllm.pooling_params import PoolingParams
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sampling_params import SamplingParams
 from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup
 from vllm.v1.engine import EngineCoreRequest
-from vllm.v1.engine.mm_input_mapper import MMHasher, MMInputMapperClient
+from vllm.v1.engine.mm_input_mapper import MMInputMapperClient
 
 
 class Processor:
@@ -47,7 +48,6 @@ def __init__(
         # Multi-modal hasher (for images)
         self.use_hash = (not model_config.disable_mm_preprocessor_cache) or \
             cache_config.enable_prefix_caching
-        self.mm_hasher = MMHasher()
 
     def process_inputs(
         self,
@@ -73,11 +73,6 @@ def process_inputs(
         assert priority == 0, "vLLM V1 does not support priority at the moment."
         assert trace_headers is None, "vLLM V1 does not support tracing yet."
 
-        # Compute MM hashes (if enabled)
-        mm_hashes = None
-        if self.use_hash:
-            mm_hashes = self.mm_hasher.hash_prompt_mm_data(prompt)
-
         # Process inputs.
         preprocessed_inputs = self.input_preprocessor.preprocess(
             prompt,
@@ -108,8 +103,20 @@ def process_inputs(
         sampling_params.update_from_generation_config(
             self.generation_config_fields, eos_token_id)
 
+        # Multimodal related.
+        # Compute MM hashes (if enabled)
+        mm_hashes = None
+        if self.use_hash:
+            # Use mm_hashes from processed inputs if the model has merged
+            # input processor.
+            if decoder_inputs.multi_modal_hashes:
+                mm_hashes = decoder_inputs.multi_modal_hashes
+            # Fallback to using MultiModalHasher directly.
+            else:
+                mm_hashes = MultiModalHasher.hash_prompt_mm_data(prompt)
+
         # For merged preprocessor, mm_data is already mm_inputs
-        precomputed_mm_inputs = None
+        precomputed_mm_inputs: Optional[list[MultiModalKwargs]] = None
         decoder_mm_data = decoder_inputs.multi_modal_data
         if isinstance(decoder_mm_data, MultiModalKwargs):
             # The output of merged multi-modal processor (`decoder_mm_data`)
@@ -122,27 +129,67 @@ def process_inputs(
                 for item in decoder_mm_data.get_items(modality)
             ]
 
-        # Apply MM mapper
-        mm_inputs = None
-        if len(decoder_mm_data) > 0:
-            mm_inputs = self.mm_input_mapper_client.process_inputs(
-                decoder_mm_data,
+        mm_positions = decoder_inputs.multi_modal_placeholders
+
+        # Last-mile processing of multimodal metadata and inputs.
+        if mm_positions:
+
+            # Merge and flatten multimodal placeholders, hashes and inputs
+            # from dictionaries to lists, and sort them by each item's position
+            # in the input sequence.
+            # NOTE: interleaved modalities are not supported.
+            (
+                sorted_modalities,
+                sorted_mm_positions,
+                sorted_mm_hashes,
+            ) = merge_and_sort_multimodal_metadata(
+                mm_positions,
                 mm_hashes,
-                decoder_inputs.mm_processor_kwargs,
-                precomputed_mm_inputs,
             )
 
+            # NOTE: Sort multimodal inputs/kwargs ONLY IF there are multiple
+            # modalities involved AND the model supports merged input processor.
+            if len(sorted_modalities) > 1 and precomputed_mm_inputs:
+
+                modality_order_dict = {
+                    modality: order
+                    for order, modality in enumerate(sorted_modalities)
+                }
+
+                # Sanity check to make sure each multimodal input has only one
+                # modality key.
+                for mm_input in precomputed_mm_inputs:
+                    assert len(mm_input.modalities) == 1
+
+                # Sort MultiModalKwags to match sorted_mm_positions
+                precomputed_mm_inputs = sorted(
+                    precomputed_mm_inputs,
+                    key=lambda mm_input: modality_order_dict[list(
+                        mm_input.modalities)[0]])
+
+            # Apply mm input cache update (and input mapper if necessary).
+            sorted_mm_inputs = self.mm_input_mapper_client.process_inputs(
+                mm_data=decoder_mm_data,
+                mm_hashes=sorted_mm_hashes,
+                mm_processor_kwargs=decoder_inputs.mm_processor_kwargs,
+                precomputed_mm_inputs=precomputed_mm_inputs,
+            )
+        else:
+            sorted_mm_inputs = None
+            sorted_mm_hashes = None
+            sorted_mm_positions = None
+
         return EngineCoreRequest(
-            request_id,
-            decoder_inputs.prompt,
-            decoder_inputs.prompt_token_ids,
-            mm_inputs,
-            mm_hashes,
-            decoder_inputs.multi_modal_placeholders,
-            sampling_params,
-            eos_token_id,
-            arrival_time,
-            lora_request,
+            request_id=request_id,
+            prompt=decoder_inputs.prompt,
+            prompt_token_ids=decoder_inputs.prompt_token_ids,
+            mm_inputs=sorted_mm_inputs,
+            mm_hashes=sorted_mm_hashes,
+            mm_placeholders=sorted_mm_positions,
+            sampling_params=sampling_params,
+            eos_token_id=eos_token_id,
+            arrival_time=arrival_time,
+            lora_request=lora_request,
         )
 
     def _validate_model_inputs(self, inputs: ProcessorInputs):
diff --git a/vllm/v1/request.py b/vllm/v1/request.py
index f4783ae366ef0..45450165eaefe 100644
--- a/vllm/v1/request.py
+++ b/vllm/v1/request.py
@@ -1,15 +1,15 @@
 import enum
 from typing import TYPE_CHECKING, List, Optional, Union
 
-from vllm.inputs import DecoderOnlyInputs, SingletonInputsAdapter, token_inputs
 from vllm.lora.request import LoRARequest
-from vllm.multimodal import MultiModalKwargs
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import RequestMetrics
 from vllm.v1.engine import EngineCoreRequest
 from vllm.v1.utils import ConstantList
 
 if TYPE_CHECKING:
+    from vllm.multimodal import MultiModalKwargs
+    from vllm.multimodal.inputs import PlaceholderRange
     from vllm.v1.core.kv_cache_utils import BlockHashType
 
 
@@ -18,14 +18,17 @@ class Request:
     def __init__(
         self,
         request_id: str,
-        inputs: DecoderOnlyInputs,
+        prompt: Optional[str],
+        prompt_token_ids: List[int],
+        multi_modal_inputs: Optional[List["MultiModalKwargs"]],
+        multi_modal_hashes: Optional[List[str]],
+        multi_modal_placeholders: Optional[List["PlaceholderRange"]],
         sampling_params: SamplingParams,
         eos_token_id: Optional[int],
         arrival_time: float,
         lora_request: Optional[LoRARequest] = None,
     ) -> None:
         self.request_id = request_id
-        self.inputs = SingletonInputsAdapter(inputs)
         self.sampling_params = sampling_params
         # Because of LoRA, the eos token id can be different for each request.
         self.eos_token_id = eos_token_id
@@ -41,26 +44,21 @@ def __init__(
         assert sampling_params.max_tokens is not None
         self.max_tokens = sampling_params.max_tokens
 
-        self.prompt = self.inputs.prompt
-        self.prompt_token_ids = self.inputs.prompt_token_ids
+        self.prompt = prompt
+        self.prompt_token_ids = prompt_token_ids
         self.num_prompt_tokens = len(self.prompt_token_ids)
         self._output_token_ids: List[int] = []
         self._all_token_ids: List[int] = self.prompt_token_ids.copy()
         self.num_computed_tokens = 0
 
-        # Multi-modal input metadata.
-        mm_positions = self.inputs.multi_modal_placeholders
-        if mm_positions:
-            # FIXME(woosuk): Support other modalities.
-            self.mm_positions = mm_positions.get("image", [])
-        else:
-            self.mm_positions = []
-        # Output of the mm input mapper (e.g., image tensors).
-        self.mm_inputs: List[MultiModalKwargs] = []
-        if self.inputs.multi_modal_inputs:
-            self.mm_inputs = self.inputs.multi_modal_inputs
+        # Multi-modal related
+        self.mm_positions = multi_modal_placeholders or []
+        self.mm_inputs = multi_modal_inputs or []
+        self.mm_hashes: List[str] = multi_modal_hashes or []
 
-        self.mm_hashes: List[str] = self.inputs.multi_modal_hashes
+        # Sanity check
+        assert len(self.mm_inputs) == len(self.mm_positions)
+        assert len(self.mm_inputs) == len(self.mm_hashes)
 
         # Cache the computed kv block hashes of the request to avoid
         # recomputing.
@@ -70,15 +68,11 @@ def __init__(
     def from_engine_core_request(cls, request: EngineCoreRequest) -> "Request":
         return cls(
             request_id=request.request_id,
-            inputs=token_inputs(
-                prompt_token_ids=request.prompt_token_ids,
-                prompt=request.prompt,
-                multi_modal_data=None,
-                multi_modal_inputs=request.mm_inputs,
-                multi_modal_hashes=request.mm_hashes,
-                multi_modal_placeholders=request.mm_placeholders,
-                mm_processor_kwargs=None,
-            ),
+            prompt=request.prompt,
+            prompt_token_ids=request.prompt_token_ids,
+            multi_modal_inputs=request.mm_inputs,
+            multi_modal_hashes=request.mm_hashes,
+            multi_modal_placeholders=request.mm_placeholders,
             sampling_params=request.sampling_params,
             eos_token_id=request.eos_token_id,
             arrival_time=request.arrival_time,
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 31e693235f99f..a1d4f9b135789 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -19,7 +19,7 @@
                         LayerBlockType, cdiv, is_pin_memory_available)
 from vllm.v1.attention.backends.flash_attn import (FlashAttentionBackend,
                                                    FlashAttentionMetadata)
-from vllm.v1.engine.mm_input_mapper import MMHasher, MMInputMapperClient
+from vllm.v1.engine.mm_input_mapper import MMInputMapperClient
 from vllm.v1.outputs import ModelRunnerOutput
 from vllm.v1.sample.metadata import SamplingMetadata
 from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch
@@ -82,12 +82,10 @@ def __init__(
         self.input_registry = INPUT_REGISTRY
         self.mm_registry = MULTIMODAL_REGISTRY
 
-        # NOTE: mm_input_mapper_client and mm_hasher are only used for memory
-        # profiling.
-        self.mm_input_mapper_client = MMInputMapperClient(self.model_config)
-        self.mm_hasher = MMHasher()
-        self.use_hash = (not model_config.disable_mm_preprocessor_cache) or \
-            cache_config.enable_prefix_caching
+        # NOTE: Initialized input mapper is only used for processing dummy
+        # multimodal data into multimodal kwargs for GPU memory profiling.
+        self.mm_input_mapper_profiling = MMInputMapperClient(self.model_config)
+        self.mm_input_mapper_profiling.use_cache = False
 
         self.max_num_encoder_input_tokens = self.scheduler_config.max_num_encoder_input_tokens  # noqa: E501
         self.encoder_cache_size = self.scheduler_config.encoder_cache_size
@@ -722,8 +720,6 @@ def profile_run(self) -> None:
         ]
 
         # Profile with multimodal encoder & encoder cache.
-        # TODO (ywang96): generalize this beyond image modality since
-        # mm_input_mapper only supports image inputs.
         if self.is_multimodal_model:
 
             # Create dummy batch of multimodal inputs.
@@ -735,15 +731,30 @@ def profile_run(self) -> None:
             dummy_mm_data = dummy_request_data.multi_modal_data
 
             # NOTE: Currently model is profiled with a single non-text
-            # modality even when it supports multiple.
-            max_tokens_per_mm_item = max(
-                self.mm_registry.get_max_tokens_per_item_by_modality(
-                    self.model_config).values())
-
-            max_num_mm_items_encoder_budget = min(
-                self.max_num_encoder_input_tokens,
-                self.encoder_cache_size) // max_tokens_per_mm_item
-
+            # modality with the max possible input tokens even when
+            # it supports multiple.
+            max_tokens_by_modality_dict = self.mm_registry.get_max_tokens_per_item_by_modality(  # noqa: E501
+                self.model_config)
+
+            dummy_data_modality, max_tokens_per_mm_item = max(
+                max_tokens_by_modality_dict.items(), key=lambda item: item[1])
+
+            # Check how many items of this modality can be supported by
+            # the encoder cache budget.
+            encoder_cache_budget = min(self.max_num_encoder_input_tokens,
+                                       self.encoder_cache_size)
+            max_num_mm_items_encoder_budget = encoder_cache_budget // \
+                max_tokens_per_mm_item
+
+            # TODO: Allow users to set encoder_cache_budget in case this
+            # happens.
+            assert max_num_mm_items_encoder_budget > 0, (
+                f"Encoder cache budget={encoder_cache_budget} is too small to "
+                f"support the maximum possible size of multimodal embeddings"
+                f"={max_tokens_per_mm_item}.")
+
+            # Check how many items of this modality can be supported by
+            # the decoder budget.
             max_mm_items_per_req = max(
                 self.mm_registry.get_mm_limits_per_prompt(
                     self.model_config).values())
@@ -763,33 +774,24 @@ def profile_run(self) -> None:
             # they are scheduled to be processed separately.
 
             # Case when models have a merged processor, their dummy data is
-            # already batched `MultiModalKwargs`, therefore we need to "unbatch"
-            # and take the first item in each batched tensor.
-            # TODO (ywang96): This is somewhat hacky. Refactor this to be
-            # consistent with the other case.
+            # already batched `MultiModalKwargs`, therefore we take the first
+            # `MultiModalKwargsItem` from the desired modality to profile on.
             if isinstance(dummy_mm_data, MultiModalKwargs):
-                dummy_mm_kwargs = {
-                    k: v[0].unsqueeze(0)
-                    for k, v in dummy_mm_data.items()
-                }
+                dummy_mm_item = dummy_mm_data.get_item(
+                    modality=dummy_data_modality, item_index=0)
+                dummy_mm_kwargs = MultiModalKwargs.from_items([dummy_mm_item])
 
             # Case when models have dummy data explicitly defined as
             # `MultiModalDataDict`, so they need to be processed through input
             # mapper.
+            # TODO (ywang96): deprecate this path once merged processor is
+            # supported on all models.
             else:
-                # Compute MM hashes (if enabled)
-                mm_hashes = None
-                if self.use_hash:
-                    mm_hashes = self.mm_hasher.hash_dummy_mm_data(
-                        dummy_mm_data)
-
-                mm_kwargs_list = self.mm_input_mapper_client.process_inputs(
+                mm_kwargs_list = self.mm_input_mapper_profiling.process_inputs(
                     mm_data=dummy_mm_data,
-                    mm_hashes=mm_hashes,
+                    mm_hashes=None,
                     mm_processor_kwargs=None,
                     precomputed_mm_inputs=None)
-
-                # Take the first `MultiModalKwargs`
                 dummy_mm_kwargs = mm_kwargs_list[0]
 
             batched_dummy_mm_inputs = MultiModalKwargs.batch(

From 08fb75c72e39dcb4f0751dc59583b95bda4d3656 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Tue, 7 Jan 2025 09:10:54 +0800
Subject: [PATCH 096/462] [Bugfix] Fix LLaVA-NeXT feature size precision error
 (for real) (#11772)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 .../processing/test_llava_next.py             |  3 +-
 .../processing/test_llava_onevision.py        |  3 +-
 vllm/model_executor/models/llava_next.py      | 39 ++++++++-------
 vllm/model_executor/models/llava_onevision.py | 47 ++++++++++---------
 4 files changed, 47 insertions(+), 45 deletions(-)

diff --git a/tests/models/decoder_only/vision_language/processing/test_llava_next.py b/tests/models/decoder_only/vision_language/processing/test_llava_next.py
index 6c8d300717de4..37a6d334ee60c 100644
--- a/tests/models/decoder_only/vision_language/processing/test_llava_next.py
+++ b/tests/models/decoder_only/vision_language/processing/test_llava_next.py
@@ -17,7 +17,8 @@ def processor_for_llava_next():
 
 @pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
 @pytest.mark.parametrize("image_size", [(1669, 2560), (2560, 1669), (183, 488),
-                                        (488, 183), (198, 176), (176, 198)])
+                                        (488, 183), (198, 176), (176, 198),
+                                        (161, 184), (184, 161)])
 @pytest.mark.parametrize("num_imgs", [1, 2])
 def test_processor_prompt_replacements(
     processor_for_llava_next,
diff --git a/tests/models/decoder_only/vision_language/processing/test_llava_onevision.py b/tests/models/decoder_only/vision_language/processing/test_llava_onevision.py
index 71adde6568a17..ed3e2db799be7 100644
--- a/tests/models/decoder_only/vision_language/processing/test_llava_onevision.py
+++ b/tests/models/decoder_only/vision_language/processing/test_llava_onevision.py
@@ -18,7 +18,8 @@ def processor_for_llava_onevision():
 @pytest.mark.parametrize("model_id",
                          ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
 @pytest.mark.parametrize("image_size", [(1669, 2560), (2560, 1669), (183, 488),
-                                        (488, 183), (198, 176), (176, 198)])
+                                        (488, 183), (198, 176), (176, 198),
+                                        (161, 184), (184, 161)])
 @pytest.mark.parametrize("num_imgs", [1, 2])
 def test_processor_prompt_replacements(
     processor_for_llava_onevision,
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index c76ec164a3087..258352416d4a7 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -121,30 +121,29 @@ def _get_num_unpadded_features(
         num_patch_height: int,
         num_patch_width: int,
     ) -> tuple[int, int]:
-        current_height = npatches * num_patch_height
-        current_width = npatches * num_patch_width
-
         # NOTE: Use float32 to remain consistent with HF output
-        original_aspect_ratio = np.array(original_width / original_height,
-                                         dtype=np.float32)
-        current_aspect_ratio = np.array(current_width / current_height,
-                                        dtype=np.float32)
+        current_height_f = np.float32(npatches * num_patch_height)
+        current_width_f = np.float32(npatches * num_patch_width)
+
+        original_width_f = np.float32(original_width)
+        original_height_f = np.float32(original_height)
+
+        original_aspect_ratio = original_width_f / original_height_f
+        current_aspect_ratio = current_width_f / current_height_f
 
         if original_aspect_ratio > current_aspect_ratio:
-            scale_factor = np.array(current_width / original_width,
-                                    dtype=np.float32)
-            new_height = int(original_height * scale_factor)
-            padding = (current_height - new_height) // 2
-            current_height -= 2 * padding
+            scale_factor = current_width_f / original_width_f
+            new_height = int(original_height_f * scale_factor)
+            padding = (current_height_f - new_height) // 2
+            current_height_f -= 2 * padding
         else:
-            scale_factor = np.array(current_height / original_height,
-                                    dtype=np.float32)
-            new_width = int(original_width * scale_factor)
-            padding = (current_width - new_width) // 2
-            current_width -= 2 * padding
-
-        unpadded_features = current_height * current_width
-        newline_features = current_height
+            scale_factor = current_height_f / original_height_f
+            new_width = int(original_width_f * scale_factor)
+            padding = (current_width_f - new_width) // 2
+            current_width_f -= 2 * padding
+
+        unpadded_features = int(current_height_f * current_width_f)
+        newline_features = int(current_height_f)
 
         return (unpadded_features, newline_features)
 
diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py
index 9117824995021..62dae74e377be 100644
--- a/vllm/model_executor/models/llava_onevision.py
+++ b/vllm/model_executor/models/llava_onevision.py
@@ -107,36 +107,37 @@ def _get_num_unpadded_features(
         num_patch_height: int,
         num_patch_width: int,
     ) -> tuple[int, int]:
-        current_height = npatches * num_patch_height
-        current_width = npatches * num_patch_width
-
         # NOTE: Use float32 to remain consistent with HF output
-        original_aspect_ratio = np.array(original_width / original_height,
-                                         dtype=np.float32)
-        current_aspect_ratio = np.array(current_width / current_height,
-                                        dtype=np.float32)
+        current_height_f = np.float32(npatches * num_patch_height)
+        current_width_f = np.float32(npatches * num_patch_width)
+
+        original_width_f = np.float32(original_width)
+        original_height_f = np.float32(original_height)
+
+        original_aspect_ratio = original_width_f / original_height_f
+        current_aspect_ratio = current_width_f / current_height_f
 
         if original_aspect_ratio > current_aspect_ratio:
-            scale_factor = np.array(current_width / original_width,
-                                    dtype=np.float32)
-            new_height = int(original_height * scale_factor)
-            padding = (current_height - new_height) // 2
-            current_height -= 2 * padding
+            scale_factor = current_width_f / original_width_f
+            new_height = int(original_height_f * scale_factor)
+            padding = (current_height_f - new_height) // 2
+            current_height_f -= 2 * padding
         else:
-            scale_factor = np.array(current_height / original_height,
-                                    dtype=np.float32)
-            new_width = int(original_width * scale_factor)
-            padding = (current_width - new_width) // 2
-            current_width -= 2 * padding
+            scale_factor = current_height_f / original_height_f
+            new_width = int(original_width_f * scale_factor)
+            padding = (current_width_f - new_width) // 2
+            current_width_f -= 2 * padding
 
-        unpadded_features = current_height * current_width
-        newline_features = current_height
+        unpadded_features = int(current_height_f * current_width_f)
+        newline_features = int(current_height_f)
 
-        ratio = math.sqrt(current_height * current_width / (9 * npatches**2))
+        ratio = math.sqrt(current_height_f * current_width_f /
+                          (9 * npatches**2))
         if ratio > 1.1:
-            unpadded_features = int(current_height // ratio) * int(
-                current_width // ratio)
-            newline_features = int(current_height // ratio)
+            height_factor = int(current_height_f // ratio)
+            width_factor = int(current_width_f // ratio)
+            unpadded_features = height_factor * width_factor
+            newline_features = height_factor
 
         return (unpadded_features, newline_features)
 

From d0169e1b0fa44a80ba40baf92dd2cedd3611076b Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Tue, 7 Jan 2025 11:05:17 +0800
Subject: [PATCH 097/462] [Model] Future-proof Qwen2-Audio multi-modal
 processor (#11776)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 vllm/model_executor/models/qwen2_audio.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py
index a7bb3425ed17c..576b01776e5de 100644
--- a/vllm/model_executor/models/qwen2_audio.py
+++ b/vllm/model_executor/models/qwen2_audio.py
@@ -227,12 +227,14 @@ def get_replacement_qwen2_audio(item_idx: int):
         ]
 
     def _always_apply_prompt_replacements(self) -> bool:
-        # HF never applies prompt replacements, so we have to do it ourselves.
+        # Qwen2-Audio processor will start inserting placeholder tokens
+        # in an upcoming release:
+        # https://github.com/huggingface/transformers/pull/35534
         # NOTE: `_find_placeholders_by_modality` may incorrectly think that HF
         # has already performed processing for multi-audio input when the input
         # audios are short (the corresponding placeholders may take up fewer
         # tokens than the number of audio items)
-        return True
+        return not hasattr(self._get_hf_processor(), "audio_token")
 
 
 @MULTIMODAL_REGISTRY.register_processor(Qwen2AudioMultiModalProcessor)

From d93d2d74fd807a091add17c2065ee8869339f76a Mon Sep 17 00:00:00 2001
From: YiSheng5 <yi.sheng@intel.com>
Date: Tue, 7 Jan 2025 11:09:58 +0800
Subject: [PATCH 098/462] [XPU] Make pp group initilized for
 pipeline-parallelism (#11648)

Signed-off-by: yisheng <yi.sheng@intel.com>
---
 vllm/worker/xpu_worker.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/vllm/worker/xpu_worker.py b/vllm/worker/xpu_worker.py
index 1295666055b04..e9cb623c8eb45 100644
--- a/vllm/worker/xpu_worker.py
+++ b/vllm/worker/xpu_worker.py
@@ -11,6 +11,7 @@
 from vllm.config import VllmConfig
 from vllm.distributed import (ensure_model_parallel_initialized,
                               init_distributed_environment)
+from vllm.distributed.parallel_state import get_pp_group
 from vllm.logger import init_logger
 from vllm.model_executor import set_random_seed
 from vllm.platforms import current_platform
@@ -176,3 +177,8 @@ def init_worker_distributed_environment(self) -> None:
             parallel_config.pipeline_parallel_size)
         # global all_reduce needed for overall oneccl warm up
         torch.distributed.all_reduce(torch.zeros(1).xpu())
+
+        if parallel_config.pipeline_parallel_size > 1:
+            # Add pp group init to avoid
+            # p2p communication as the first call
+            get_pp_group().all_reduce(torch.zeros(1).xpu())

From 8ceffbf3152d3b26d293ba1e157d0c187884572b Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Tue, 7 Jan 2025 11:20:01 +0800
Subject: [PATCH 099/462] [Doc][3/N] Reorganize Serving section (#11766)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 README.md                                     |   2 +-
 .../architecture_helm_deployment.png          | Bin
 .../contributing/dockerfile/dockerfile.md     |   2 +-
 .../source/contributing/model/registration.md |   4 +-
 .../docker.md}                                |   4 +-
 .../frameworks/bentoml.md}                    |   4 +-
 .../frameworks/cerebrium.md}                  |   4 +-
 .../frameworks/dstack.md}                     |   4 +-
 .../frameworks/helm.md}                       |   6 +-
 docs/source/deployment/frameworks/index.md    |  13 +++
 .../frameworks/lws.md}                        |   4 +-
 .../frameworks/skypilot.md}                   |   8 +-
 .../frameworks/triton.md}                     |   4 +-
 docs/source/deployment/integrations/index.md  |   9 ++
 .../integrations/kserve.md}                   |   4 +-
 .../integrations/kubeai.md}                   |   4 +-
 .../integrations/llamastack.md}               |   4 +-
 .../k8s.md}                                   |   4 +-
 .../nginx.md}                                 |   2 +-
 docs/source/design/arch_overview.md           |   2 +-
 docs/source/features/disagg_prefill.md        |   8 +-
 docs/source/features/spec_decode.md           |   2 +-
 .../getting_started/installation/gpu-rocm.md  |   2 +-
 .../getting_started/installation/hpu-gaudi.md |   2 +-
 docs/source/getting_started/quickstart.md     |  18 ++--
 docs/source/index.md                          |  49 ++++++-----
 docs/source/models/extensions/index.md        |   8 ++
 .../extensions}/runai_model_streamer.md       |   2 +-
 .../extensions}/tensorizer.md                 |   2 +-
 docs/source/models/supported_models.md        |  44 +++++-----
 docs/source/serving/distributed_serving.md    |  12 +--
 docs/source/serving/integrations.md           |  17 ----
 docs/source/serving/integrations/index.md     |   8 ++
 .../langchain.md}                             |   8 +-
 .../llamaindex.md}                            |   8 +-
 docs/source/serving/metrics.md                |   2 +-
 .../multimodal_inputs.md                      |  12 +--
 docs/source/serving/offline_inference.md      |  79 ++++++++++++++++++
 .../serving/openai_compatible_server.md       |   8 +-
 docs/source/serving/usage_stats.md            |   2 +-
 40 files changed, 248 insertions(+), 133 deletions(-)
 rename docs/source/{serving => assets/deployment}/architecture_helm_deployment.png (100%)
 rename docs/source/{serving/deploying_with_docker.md => deployment/docker.md} (98%)
 rename docs/source/{serving/deploying_with_bentoml.md => deployment/frameworks/bentoml.md} (89%)
 rename docs/source/{serving/deploying_with_cerebrium.md => deployment/frameworks/cerebrium.md} (98%)
 rename docs/source/{serving/deploying_with_dstack.md => deployment/frameworks/dstack.md} (98%)
 rename docs/source/{serving/deploying_with_helm.md => deployment/frameworks/helm.md} (98%)
 create mode 100644 docs/source/deployment/frameworks/index.md
 rename docs/source/{serving/deploying_with_lws.md => deployment/frameworks/lws.md} (91%)
 rename docs/source/{serving/run_on_sky.md => deployment/frameworks/skypilot.md} (98%)
 rename docs/source/{serving/deploying_with_triton.md => deployment/frameworks/triton.md} (87%)
 create mode 100644 docs/source/deployment/integrations/index.md
 rename docs/source/{serving/deploying_with_kserve.md => deployment/integrations/kserve.md} (85%)
 rename docs/source/{serving/deploying_with_kubeai.md => deployment/integrations/kubeai.md} (93%)
 rename docs/source/{serving/serving_with_llamastack.md => deployment/integrations/llamastack.md} (95%)
 rename docs/source/{serving/deploying_with_k8s.md => deployment/k8s.md} (99%)
 rename docs/source/{serving/deploying_with_nginx.md => deployment/nginx.md} (99%)
 create mode 100644 docs/source/models/extensions/index.md
 rename docs/source/{serving => models/extensions}/runai_model_streamer.md (98%)
 rename docs/source/{serving => models/extensions}/tensorizer.md (95%)
 delete mode 100644 docs/source/serving/integrations.md
 create mode 100644 docs/source/serving/integrations/index.md
 rename docs/source/serving/{serving_with_langchain.md => integrations/langchain.md} (82%)
 rename docs/source/serving/{serving_with_llamaindex.md => integrations/llamaindex.md} (74%)
 rename docs/source/{features => serving}/multimodal_inputs.md (95%)
 create mode 100644 docs/source/serving/offline_inference.md

diff --git a/README.md b/README.md
index f83c9d759b359..652268ec29cac 100644
--- a/README.md
+++ b/README.md
@@ -77,7 +77,7 @@ pip install vllm
 Visit our [documentation](https://vllm.readthedocs.io/en/latest/) to learn more.
 - [Installation](https://vllm.readthedocs.io/en/latest/getting_started/installation.html)
 - [Quickstart](https://vllm.readthedocs.io/en/latest/getting_started/quickstart.html)
-- [Supported Models](https://vllm.readthedocs.io/en/latest/models/supported_models.html)
+- [List of Supported Models](https://vllm.readthedocs.io/en/latest/models/supported_models.html)
 
 ## Contributing
 
diff --git a/docs/source/serving/architecture_helm_deployment.png b/docs/source/assets/deployment/architecture_helm_deployment.png
similarity index 100%
rename from docs/source/serving/architecture_helm_deployment.png
rename to docs/source/assets/deployment/architecture_helm_deployment.png
diff --git a/docs/source/contributing/dockerfile/dockerfile.md b/docs/source/contributing/dockerfile/dockerfile.md
index 7ffec83333d7d..38ea956ba8dfb 100644
--- a/docs/source/contributing/dockerfile/dockerfile.md
+++ b/docs/source/contributing/dockerfile/dockerfile.md
@@ -1,7 +1,7 @@
 # Dockerfile
 
 We provide a <gh-file:Dockerfile> to construct the image for running an OpenAI compatible server with vLLM.
-More information about deploying with Docker can be found [here](../../serving/deploying_with_docker.md).
+More information about deploying with Docker can be found [here](#deployment-docker).
 
 Below is a visual representation of the multi-stage Dockerfile. The build graph contains the following nodes:
 
diff --git a/docs/source/contributing/model/registration.md b/docs/source/contributing/model/registration.md
index cf1cdb0c9de0f..fe5aa94c52896 100644
--- a/docs/source/contributing/model/registration.md
+++ b/docs/source/contributing/model/registration.md
@@ -3,7 +3,7 @@
 # Model Registration
 
 vLLM relies on a model registry to determine how to run each model.
-A list of pre-registered architectures can be found on the [Supported Models](#supported-models) page.
+A list of pre-registered architectures can be found [here](#supported-models).
 
 If your model is not on this list, you must register it to vLLM.
 This page provides detailed instructions on how to do so.
@@ -16,7 +16,7 @@ This gives you the ability to modify the codebase and test your model.
 After you have implemented your model (see [tutorial](#new-model-basic)), put it into the <gh-dir:vllm/model_executor/models> directory.
 Then, add your model class to `_VLLM_MODELS` in <gh-file:vllm/model_executor/models/registry.py> so that it is automatically registered upon importing vLLM.
 You should also include an example HuggingFace repository for this model in <gh-file:tests/models/registry.py> to run the unit tests.
-Finally, update the [Supported Models](#supported-models) documentation page to promote your model!
+Finally, update our [list of supported models](#supported-models) to promote your model!
 
 ```{important}
 The list of models in each section should be maintained in alphabetical order.
diff --git a/docs/source/serving/deploying_with_docker.md b/docs/source/deployment/docker.md
similarity index 98%
rename from docs/source/serving/deploying_with_docker.md
rename to docs/source/deployment/docker.md
index 844bd27800c7a..2df1aca27f1e6 100644
--- a/docs/source/serving/deploying_with_docker.md
+++ b/docs/source/deployment/docker.md
@@ -1,6 +1,6 @@
-(deploying-with-docker)=
+(deployment-docker)=
 
-# Deploying with Docker
+# Using Docker
 
 ## Use vLLM's Official Docker Image
 
diff --git a/docs/source/serving/deploying_with_bentoml.md b/docs/source/deployment/frameworks/bentoml.md
similarity index 89%
rename from docs/source/serving/deploying_with_bentoml.md
rename to docs/source/deployment/frameworks/bentoml.md
index dfa0de4f0f6d7..ea0b5d1d4c93b 100644
--- a/docs/source/serving/deploying_with_bentoml.md
+++ b/docs/source/deployment/frameworks/bentoml.md
@@ -1,6 +1,6 @@
-(deploying-with-bentoml)=
+(deployment-bentoml)=
 
-# Deploying with BentoML
+# BentoML
 
 [BentoML](https://github.com/bentoml/BentoML) allows you to deploy a large language model (LLM) server with vLLM as the backend, which exposes OpenAI-compatible endpoints. You can serve the model locally or containerize it as an OCI-complicant image and deploy it on Kubernetes.
 
diff --git a/docs/source/serving/deploying_with_cerebrium.md b/docs/source/deployment/frameworks/cerebrium.md
similarity index 98%
rename from docs/source/serving/deploying_with_cerebrium.md
rename to docs/source/deployment/frameworks/cerebrium.md
index 950064c8c1b10..be018dfb75d7a 100644
--- a/docs/source/serving/deploying_with_cerebrium.md
+++ b/docs/source/deployment/frameworks/cerebrium.md
@@ -1,6 +1,6 @@
-(deploying-with-cerebrium)=
+(deployment-cerebrium)=
 
-# Deploying with Cerebrium
+# Cerebrium
 
 ```{raw} html
 <p align="center">
diff --git a/docs/source/serving/deploying_with_dstack.md b/docs/source/deployment/frameworks/dstack.md
similarity index 98%
rename from docs/source/serving/deploying_with_dstack.md
rename to docs/source/deployment/frameworks/dstack.md
index 381f5f786ca2c..4142c1d9f1f60 100644
--- a/docs/source/serving/deploying_with_dstack.md
+++ b/docs/source/deployment/frameworks/dstack.md
@@ -1,6 +1,6 @@
-(deploying-with-dstack)=
+(deployment-dstack)=
 
-# Deploying with dstack
+# dstack
 
 ```{raw} html
 <p align="center">
diff --git a/docs/source/serving/deploying_with_helm.md b/docs/source/deployment/frameworks/helm.md
similarity index 98%
rename from docs/source/serving/deploying_with_helm.md
rename to docs/source/deployment/frameworks/helm.md
index 7286a0a88968f..18ed293191468 100644
--- a/docs/source/serving/deploying_with_helm.md
+++ b/docs/source/deployment/frameworks/helm.md
@@ -1,6 +1,6 @@
-(deploying-with-helm)=
+(deployment-helm)=
 
-# Deploying with Helm
+# Helm
 
 A Helm chart to deploy vLLM for Kubernetes
 
@@ -38,7 +38,7 @@ chart **including persistent volumes** and deletes the release.
 
 ## Architecture
 
-```{image} architecture_helm_deployment.png
+```{image} /assets/deployment/architecture_helm_deployment.png
 ```
 
 ## Values
diff --git a/docs/source/deployment/frameworks/index.md b/docs/source/deployment/frameworks/index.md
new file mode 100644
index 0000000000000..6a59131d36618
--- /dev/null
+++ b/docs/source/deployment/frameworks/index.md
@@ -0,0 +1,13 @@
+# Using other frameworks
+
+```{toctree}
+:maxdepth: 1
+
+bentoml
+cerebrium
+dstack
+helm
+lws
+skypilot
+triton
+```
diff --git a/docs/source/serving/deploying_with_lws.md b/docs/source/deployment/frameworks/lws.md
similarity index 91%
rename from docs/source/serving/deploying_with_lws.md
rename to docs/source/deployment/frameworks/lws.md
index 22bab419eaca3..349fa83fbcb9d 100644
--- a/docs/source/serving/deploying_with_lws.md
+++ b/docs/source/deployment/frameworks/lws.md
@@ -1,6 +1,6 @@
-(deploying-with-lws)=
+(deployment-lws)=
 
-# Deploying with LWS
+# LWS
 
 LeaderWorkerSet (LWS) is a Kubernetes API that aims to address common deployment patterns of AI/ML inference workloads.
 A major use case is for multi-host/multi-node distributed inference.
diff --git a/docs/source/serving/run_on_sky.md b/docs/source/deployment/frameworks/skypilot.md
similarity index 98%
rename from docs/source/serving/run_on_sky.md
rename to docs/source/deployment/frameworks/skypilot.md
index 115873ae49292..f02a943026922 100644
--- a/docs/source/serving/run_on_sky.md
+++ b/docs/source/deployment/frameworks/skypilot.md
@@ -1,6 +1,6 @@
-(on-cloud)=
+(deployment-skypilot)=
 
-# Deploying and scaling up with SkyPilot
+# SkyPilot
 
 ```{raw} html
 <p align="center">
@@ -12,9 +12,9 @@ vLLM can be **run and scaled to multiple service replicas on clouds and Kubernet
 
 ## Prerequisites
 
-- Go to the [HuggingFace model page](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) and request access to the model {code}`meta-llama/Meta-Llama-3-8B-Instruct`.
+- Go to the [HuggingFace model page](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) and request access to the model `meta-llama/Meta-Llama-3-8B-Instruct`.
 - Check that you have installed SkyPilot ([docs](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html)).
-- Check that {code}`sky check` shows clouds or Kubernetes are enabled.
+- Check that `sky check` shows clouds or Kubernetes are enabled.
 
 ```console
 pip install skypilot-nightly
diff --git a/docs/source/serving/deploying_with_triton.md b/docs/source/deployment/frameworks/triton.md
similarity index 87%
rename from docs/source/serving/deploying_with_triton.md
rename to docs/source/deployment/frameworks/triton.md
index 9b0a6f1d54ae8..94d87120159c6 100644
--- a/docs/source/serving/deploying_with_triton.md
+++ b/docs/source/deployment/frameworks/triton.md
@@ -1,5 +1,5 @@
-(deploying-with-triton)=
+(deployment-triton)=
 
-# Deploying with NVIDIA Triton
+# NVIDIA Triton
 
 The [Triton Inference Server](https://github.com/triton-inference-server) hosts a tutorial demonstrating how to quickly deploy a simple [facebook/opt-125m](https://huggingface.co/facebook/opt-125m) model using vLLM. Please see [Deploying a vLLM model in Triton](https://github.com/triton-inference-server/tutorials/blob/main/Quick_Deploy/vLLM/README.md#deploying-a-vllm-model-in-triton) for more details.
diff --git a/docs/source/deployment/integrations/index.md b/docs/source/deployment/integrations/index.md
new file mode 100644
index 0000000000000..d47ede8967547
--- /dev/null
+++ b/docs/source/deployment/integrations/index.md
@@ -0,0 +1,9 @@
+# External Integrations
+
+```{toctree}
+:maxdepth: 1
+
+kserve
+kubeai
+llamastack
+```
diff --git a/docs/source/serving/deploying_with_kserve.md b/docs/source/deployment/integrations/kserve.md
similarity index 85%
rename from docs/source/serving/deploying_with_kserve.md
rename to docs/source/deployment/integrations/kserve.md
index feaeb5d0ec8a2..c780fd74e8f55 100644
--- a/docs/source/serving/deploying_with_kserve.md
+++ b/docs/source/deployment/integrations/kserve.md
@@ -1,6 +1,6 @@
-(deploying-with-kserve)=
+(deployment-kserve)=
 
-# Deploying with KServe
+# KServe
 
 vLLM can be deployed with [KServe](https://github.com/kserve/kserve) on Kubernetes for highly scalable distributed model serving.
 
diff --git a/docs/source/serving/deploying_with_kubeai.md b/docs/source/deployment/integrations/kubeai.md
similarity index 93%
rename from docs/source/serving/deploying_with_kubeai.md
rename to docs/source/deployment/integrations/kubeai.md
index 3609d7e05acd3..2f5772e075d87 100644
--- a/docs/source/serving/deploying_with_kubeai.md
+++ b/docs/source/deployment/integrations/kubeai.md
@@ -1,6 +1,6 @@
-(deploying-with-kubeai)=
+(deployment-kubeai)=
 
-# Deploying with KubeAI
+# KubeAI
 
 [KubeAI](https://github.com/substratusai/kubeai) is a Kubernetes operator that enables you to deploy and manage AI models on Kubernetes. It provides a simple and scalable way to deploy vLLM in production. Functionality such as scale-from-zero, load based autoscaling, model caching, and much more is provided out of the box with zero external dependencies.
 
diff --git a/docs/source/serving/serving_with_llamastack.md b/docs/source/deployment/integrations/llamastack.md
similarity index 95%
rename from docs/source/serving/serving_with_llamastack.md
rename to docs/source/deployment/integrations/llamastack.md
index 71dadca7ad47c..474d2bdfa9580 100644
--- a/docs/source/serving/serving_with_llamastack.md
+++ b/docs/source/deployment/integrations/llamastack.md
@@ -1,6 +1,6 @@
-(run-on-llamastack)=
+(deployment-llamastack)=
 
-# Serving with Llama Stack
+# Llama Stack
 
 vLLM is also available via [Llama Stack](https://github.com/meta-llama/llama-stack) .
 
diff --git a/docs/source/serving/deploying_with_k8s.md b/docs/source/deployment/k8s.md
similarity index 99%
rename from docs/source/serving/deploying_with_k8s.md
rename to docs/source/deployment/k8s.md
index 5f9b0e4f55ecc..760214e112fba 100644
--- a/docs/source/serving/deploying_with_k8s.md
+++ b/docs/source/deployment/k8s.md
@@ -1,6 +1,6 @@
-(deploying-with-k8s)=
+(deployment-k8s)=
 
-# Deploying with Kubernetes
+# Using Kubernetes
 
 Using Kubernetes to deploy vLLM is a scalable and efficient way to serve machine learning models. This guide will walk you through the process of deploying vLLM with Kubernetes, including the necessary prerequisites, steps for deployment, and testing.
 
diff --git a/docs/source/serving/deploying_with_nginx.md b/docs/source/deployment/nginx.md
similarity index 99%
rename from docs/source/serving/deploying_with_nginx.md
rename to docs/source/deployment/nginx.md
index a1f00d8536465..a58f791c2997b 100644
--- a/docs/source/serving/deploying_with_nginx.md
+++ b/docs/source/deployment/nginx.md
@@ -1,6 +1,6 @@
 (nginxloadbalancer)=
 
-# Deploying with Nginx Loadbalancer
+# Using Nginx
 
 This document shows how to launch multiple vLLM serving containers and use Nginx to act as a load balancer between the servers.
 
diff --git a/docs/source/design/arch_overview.md b/docs/source/design/arch_overview.md
index 2f1280c047672..5e0dd021ad02e 100644
--- a/docs/source/design/arch_overview.md
+++ b/docs/source/design/arch_overview.md
@@ -57,7 +57,7 @@ More API details can be found in the {doc}`Offline Inference
 
 The code for the `LLM` class can be found in <gh-file:vllm/entrypoints/llm.py>.
 
-### OpenAI-compatible API server
+### OpenAI-Compatible API Server
 
 The second primary interface to vLLM is via its OpenAI-compatible API server.
 This server can be started using the `vllm serve` command.
diff --git a/docs/source/features/disagg_prefill.md b/docs/source/features/disagg_prefill.md
index 05226f2dec87c..645dc60807dd3 100644
--- a/docs/source/features/disagg_prefill.md
+++ b/docs/source/features/disagg_prefill.md
@@ -1,8 +1,12 @@
 (disagg-prefill)=
 
-# Disaggregated prefilling (experimental)
+# Disaggregated Prefilling (experimental)
 
-This page introduces you the disaggregated prefilling feature in vLLM. This feature is experimental and subject to change.
+This page introduces you the disaggregated prefilling feature in vLLM.
+
+```{note}
+This feature is experimental and subject to change.
+```
 
 ## Why disaggregated prefilling?
 
diff --git a/docs/source/features/spec_decode.md b/docs/source/features/spec_decode.md
index 8c52c97a41e48..bc8a0aa14dc5a 100644
--- a/docs/source/features/spec_decode.md
+++ b/docs/source/features/spec_decode.md
@@ -1,6 +1,6 @@
 (spec-decode)=
 
-# Speculative decoding
+# Speculative Decoding
 
 ```{warning}
 Please note that speculative decoding in vLLM is not yet optimized and does
diff --git a/docs/source/getting_started/installation/gpu-rocm.md b/docs/source/getting_started/installation/gpu-rocm.md
index 796911d7305a6..e36b92513e31d 100644
--- a/docs/source/getting_started/installation/gpu-rocm.md
+++ b/docs/source/getting_started/installation/gpu-rocm.md
@@ -148,7 +148,7 @@ $ export PYTORCH_ROCM_ARCH="gfx90a;gfx942"
 $ python3 setup.py develop
 ```
 
-This may take 5-10 minutes. Currently, {code}`pip install .` does not work for ROCm installation.
+This may take 5-10 minutes. Currently, `pip install .` does not work for ROCm installation.
 
 ```{tip}
 - Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers.
diff --git a/docs/source/getting_started/installation/hpu-gaudi.md b/docs/source/getting_started/installation/hpu-gaudi.md
index 94de169f51a73..1d50cef3bdc83 100644
--- a/docs/source/getting_started/installation/hpu-gaudi.md
+++ b/docs/source/getting_started/installation/hpu-gaudi.md
@@ -82,7 +82,7 @@ $ python setup.py develop
 
 ## Supported Features
 
-- [Offline batched inference](#offline-batched-inference)
+- [Offline inference](#offline-inference)
 - Online inference via [OpenAI-Compatible Server](#openai-compatible-server)
 - HPU autodetection - no need to manually select device within vLLM
 - Paged KV cache with algorithms enabled for Intel Gaudi accelerators
diff --git a/docs/source/getting_started/quickstart.md b/docs/source/getting_started/quickstart.md
index ff216f8af30f9..3f9556165ece4 100644
--- a/docs/source/getting_started/quickstart.md
+++ b/docs/source/getting_started/quickstart.md
@@ -2,20 +2,20 @@
 
 # Quickstart
 
-This guide will help you quickly get started with vLLM to:
+This guide will help you quickly get started with vLLM to perform:
 
-- [Run offline batched inference](#offline-batched-inference)
-- [Run OpenAI-compatible inference](#openai-compatible-server)
+- [Offline batched inference](#quickstart-offline)
+- [Online inference using OpenAI-compatible server](#quickstart-online)
 
 ## Prerequisites
 
 - OS: Linux
 - Python: 3.9 -- 3.12
-- GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.)
 
 ## Installation
 
-You can install vLLM using pip. It's recommended to use [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/getting-started.html) to create and manage Python environments.
+If you are using NVIDIA GPUs, you can install vLLM using [pip](https://pypi.org/project/vllm/) directly.
+It's recommended to use [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/getting-started.html) to create and manage Python environments.
 
 ```console
 $ conda create -n myenv python=3.10 -y
@@ -23,9 +23,11 @@ $ conda activate myenv
 $ pip install vllm
 ```
 
-Please refer to the [installation documentation](#installation-index) for more details on installing vLLM.
+```{note}
+For non-CUDA platforms, please refer [here](#installation-index) for specific instructions on how to install vLLM.
+```
 
-(offline-batched-inference)=
+(quickstart-offline)=
 
 ## Offline Batched Inference
 
@@ -73,7 +75,7 @@ for output in outputs:
     print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
 ```
 
-(openai-compatible-server)=
+(quickstart-online)=
 
 ## OpenAI-Compatible Server
 
diff --git a/docs/source/index.md b/docs/source/index.md
index 4bc40bf0f5e41..c335155bd6e14 100644
--- a/docs/source/index.md
+++ b/docs/source/index.md
@@ -65,32 +65,14 @@ getting_started/troubleshooting
 getting_started/faq
 ```
 
-```{toctree}
-:caption: Serving
-:maxdepth: 1
-
-serving/openai_compatible_server
-serving/deploying_with_docker
-serving/deploying_with_k8s
-serving/deploying_with_helm
-serving/deploying_with_nginx
-serving/distributed_serving
-serving/metrics
-serving/integrations
-serving/tensorizer
-serving/runai_model_streamer
-serving/engine_args
-serving/env_vars
-serving/usage_stats
-```
-
 ```{toctree}
 :caption: Models
 :maxdepth: 1
 
-models/supported_models
 models/generative_models
 models/pooling_models
+models/supported_models
+models/extensions/index
 ```
 
 ```{toctree}
@@ -99,7 +81,6 @@ models/pooling_models
 
 features/quantization/index
 features/lora
-features/multimodal_inputs
 features/tool_calling
 features/structured_outputs
 features/automatic_prefix_caching
@@ -108,6 +89,32 @@ features/spec_decode
 features/compatibility_matrix
 ```
 
+```{toctree}
+:caption: Inference and Serving
+:maxdepth: 1
+
+serving/offline_inference
+serving/openai_compatible_server
+serving/multimodal_inputs
+serving/distributed_serving
+serving/metrics
+serving/engine_args
+serving/env_vars
+serving/usage_stats
+serving/integrations/index
+```
+
+```{toctree}
+:caption: Deployment
+:maxdepth: 1
+
+deployment/docker
+deployment/k8s
+deployment/nginx
+deployment/frameworks/index
+deployment/integrations/index
+```
+
 ```{toctree}
 :caption: Performance
 :maxdepth: 1
diff --git a/docs/source/models/extensions/index.md b/docs/source/models/extensions/index.md
new file mode 100644
index 0000000000000..cff09d12eba47
--- /dev/null
+++ b/docs/source/models/extensions/index.md
@@ -0,0 +1,8 @@
+# Built-in Extensions
+
+```{toctree}
+:maxdepth: 1
+
+runai_model_streamer
+tensorizer
+```
diff --git a/docs/source/serving/runai_model_streamer.md b/docs/source/models/extensions/runai_model_streamer.md
similarity index 98%
rename from docs/source/serving/runai_model_streamer.md
rename to docs/source/models/extensions/runai_model_streamer.md
index d4269050ff574..fe2701194a604 100644
--- a/docs/source/serving/runai_model_streamer.md
+++ b/docs/source/models/extensions/runai_model_streamer.md
@@ -1,6 +1,6 @@
 (runai-model-streamer)=
 
-# Loading Models with Run:ai Model Streamer
+# Loading models with Run:ai Model Streamer
 
 Run:ai Model Streamer is a library to read tensors in concurrency, while streaming it to GPU memory.
 Further reading can be found in [Run:ai Model Streamer Documentation](https://github.com/run-ai/runai-model-streamer/blob/master/docs/README.md).
diff --git a/docs/source/serving/tensorizer.md b/docs/source/models/extensions/tensorizer.md
similarity index 95%
rename from docs/source/serving/tensorizer.md
rename to docs/source/models/extensions/tensorizer.md
index d3dd29d48f730..42ed5c795dd27 100644
--- a/docs/source/serving/tensorizer.md
+++ b/docs/source/models/extensions/tensorizer.md
@@ -1,6 +1,6 @@
 (tensorizer)=
 
-# Loading Models with CoreWeave's Tensorizer
+# Loading models with CoreWeave's Tensorizer
 
 vLLM supports loading models with [CoreWeave's Tensorizer](https://docs.coreweave.com/coreweave-machine-learning-and-ai/inference/tensorizer).
 vLLM model tensors that have been serialized to disk, an HTTP/HTTPS endpoint, or S3 endpoint can be deserialized
diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md
index 94a8849f7edcd..590bea992d1fc 100644
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -1,9 +1,9 @@
 (supported-models)=
 
-# Supported Models
+# List of Supported Models
 
 vLLM supports generative and pooling models across various tasks.
-If a model supports more than one task, you can set the task via the {code}`--task` argument.
+If a model supports more than one task, you can set the task via the `--task` argument.
 
 For each task, we list the model architectures that have been implemented in vLLM.
 Alongside each architecture, we include some popular models that use it.
@@ -14,8 +14,8 @@ Alongside each architecture, we include some popular models that use it.
 
 By default, vLLM loads models from [HuggingFace (HF) Hub](https://huggingface.co/models).
 
-To determine whether a given model is supported, you can check the {code}`config.json` file inside the HF repository.
-If the {code}`"architectures"` field contains a model architecture listed below, then it should be supported in theory.
+To determine whether a given model is supported, you can check the `config.json` file inside the HF repository.
+If the `"architectures"` field contains a model architecture listed below, then it should be supported in theory.
 
 ````{tip}
 The easiest way to check if your model is really supported at runtime is to run the program below:
@@ -48,7 +48,7 @@ To use models from [ModelScope](https://www.modelscope.cn) instead of HuggingFac
 $ export VLLM_USE_MODELSCOPE=True
 ```
 
-And use with {code}`trust_remote_code=True`.
+And use with `trust_remote_code=True`.
 
 ```python
 from vllm import LLM
@@ -420,15 +420,15 @@ you should explicitly specify the task type to ensure that the model is used in
 ```
 
 ```{note}
-{code}`ssmits/Qwen2-7B-Instruct-embed-base` has an improperly defined Sentence Transformers config.
-You should manually set mean pooling by passing {code}`--override-pooler-config '{"pooling_type": "MEAN"}'`.
+`ssmits/Qwen2-7B-Instruct-embed-base` has an improperly defined Sentence Transformers config.
+You should manually set mean pooling by passing `--override-pooler-config '{"pooling_type": "MEAN"}'`.
 ```
 
 ```{note}
-Unlike base Qwen2, {code}`Alibaba-NLP/gte-Qwen2-7B-instruct` uses bi-directional attention.
-You can set {code}`--hf-overrides '{"is_causal": false}'` to change the attention mask accordingly.
+Unlike base Qwen2, `Alibaba-NLP/gte-Qwen2-7B-instruct` uses bi-directional attention.
+You can set `--hf-overrides '{"is_causal": false}'` to change the attention mask accordingly.
 
-On the other hand, its 1.5B variant ({code}`Alibaba-NLP/gte-Qwen2-1.5B-instruct`) uses causal attention
+On the other hand, its 1.5B variant (`Alibaba-NLP/gte-Qwen2-1.5B-instruct`) uses causal attention
 despite being described otherwise on its model card.
 ```
 
@@ -468,8 +468,8 @@ If your model is not in the above list, we will try to automatically convert the
 {func}`vllm.model_executor.models.adapters.as_reward_model`. By default, we return the hidden states of each token directly.
 
 ```{important}
-For process-supervised reward models such as {code}`peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly,
-e.g.: {code}`--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`.
+For process-supervised reward models such as `peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly,
+e.g.: `--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`.
 ```
 
 #### Classification (`--task classify`)
@@ -537,13 +537,13 @@ The following modalities are supported depending on the model:
 - **V**ideo
 - **A**udio
 
-Any combination of modalities joined by {code}`+` are supported.
+Any combination of modalities joined by `+` are supported.
 
-- e.g.: {code}`T + I` means that the model supports text-only, image-only, and text-with-image inputs.
+- e.g.: `T + I` means that the model supports text-only, image-only, and text-with-image inputs.
 
-On the other hand, modalities separated by {code}`/` are mutually exclusive.
+On the other hand, modalities separated by `/` are mutually exclusive.
 
-- e.g.: {code}`T / I` means that the model supports text-only and image-only inputs, but not text-with-image inputs.
+- e.g.: `T / I` means that the model supports text-only and image-only inputs, but not text-with-image inputs.
 
 See [this page](#multimodal-inputs) on how to pass multi-modal inputs to the model.
 
@@ -731,8 +731,8 @@ See [this page](#generative-models) for more information on how to use generativ
 <sup>+</sup> Multiple items can be inputted per text prompt for this modality.
 
 ````{important}
-To enable multiple multi-modal items per text prompt, you have to set {code}`limit_mm_per_prompt` (offline inference)
-or {code}`--limit-mm-per-prompt` (online inference). For example, to enable passing up to 4 images per text prompt:
+To enable multiple multi-modal items per text prompt, you have to set `limit_mm_per_prompt` (offline inference)
+or `--limit-mm-per-prompt` (online inference). For example, to enable passing up to 4 images per text prompt:
 
 ```python
 llm = LLM(
@@ -751,11 +751,11 @@ vLLM currently only supports adding LoRA to the language backbone of multimodal
 ```
 
 ```{note}
-To use {code}`TIGER-Lab/Mantis-8B-siglip-llama3`, you have pass {code}`--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM.
+To use `TIGER-Lab/Mantis-8B-siglip-llama3`, you have pass `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM.
 ```
 
 ```{note}
-The official {code}`openbmb/MiniCPM-V-2` doesn't work yet, so we need to use a fork ({code}`HwwwH/MiniCPM-V-2`) for now.
+The official `openbmb/MiniCPM-V-2` doesn't work yet, so we need to use a fork (`HwwwH/MiniCPM-V-2`) for now.
 For more details, please see: <gh-pr:4087#issuecomment-2250397630>
 ```
 
@@ -770,7 +770,7 @@ you should explicitly specify the task type to ensure that the model is used in
 
 #### Text Embedding (`--task embed`)
 
-Any text generation model can be converted into an embedding model by passing {code}`--task embed`.
+Any text generation model can be converted into an embedding model by passing `--task embed`.
 
 ```{note}
 To get the best results, you should use pooling models that are specifically trained as such.
@@ -818,7 +818,7 @@ At vLLM, we are committed to facilitating the integration and support of third-p
 2. **Best-Effort Consistency**: While we aim to maintain a level of consistency between the models implemented in vLLM and other frameworks like transformers, complete alignment is not always feasible. Factors like acceleration techniques and the use of low-precision computations can introduce discrepancies. Our commitment is to ensure that the implemented models are functional and produce sensible results.
 
 ```{tip}
-When comparing the output of {code}`model.generate` from HuggingFace Transformers with the output of {code}`llm.generate` from vLLM, note that the former reads the model's generation config file (i.e., [generation_config.json](https://github.com/huggingface/transformers/blob/19dabe96362803fb0a9ae7073d03533966598b17/src/transformers/generation/utils.py#L1945)) and applies the default parameters for generation, while the latter only uses the parameters passed to the function. Ensure all sampling parameters are identical when comparing outputs.
+When comparing the output of `model.generate` from HuggingFace Transformers with the output of `llm.generate` from vLLM, note that the former reads the model's generation config file (i.e., [generation_config.json](https://github.com/huggingface/transformers/blob/19dabe96362803fb0a9ae7073d03533966598b17/src/transformers/generation/utils.py#L1945)) and applies the default parameters for generation, while the latter only uses the parameters passed to the function. Ensure all sampling parameters are identical when comparing outputs.
 ```
 
 3. **Issue Resolution and Model Updates**: Users are encouraged to report any bugs or issues they encounter with third-party models. Proposed fixes should be submitted via PRs, with a clear explanation of the problem and the rationale behind the proposed solution. If a fix for one model impacts another, we rely on the community to highlight and address these cross-model dependencies. Note: for bugfix PRs, it is good etiquette to inform the original author to seek their feedback.
diff --git a/docs/source/serving/distributed_serving.md b/docs/source/serving/distributed_serving.md
index 6fbc1ea104678..b1703249d7224 100644
--- a/docs/source/serving/distributed_serving.md
+++ b/docs/source/serving/distributed_serving.md
@@ -18,13 +18,13 @@ After adding enough GPUs and nodes to hold the model, you can run vLLM first, wh
 There is one edge case: if the model fits in a single node with multiple GPUs, but the number of GPUs cannot divide the model size evenly, you can use pipeline parallelism, which splits the model along layers and supports uneven splits. In this case, the tensor parallel size should be 1 and the pipeline parallel size should be the number of GPUs.
 ```
 
-## Details for Distributed Inference and Serving
+## Running vLLM on a single node
 
 vLLM supports distributed tensor-parallel and pipeline-parallel inference and serving. Currently, we support [Megatron-LM's tensor parallel algorithm](https://arxiv.org/pdf/1909.08053.pdf). We manage the distributed runtime with either [Ray](https://github.com/ray-project/ray) or python native multiprocessing. Multiprocessing can be used when deploying on a single node, multi-node inferencing currently requires Ray.
 
-Multiprocessing will be used by default when not running in a Ray placement group and if there are sufficient GPUs available on the same node for the configured {code}`tensor_parallel_size`, otherwise Ray will be used. This default can be overridden via the {code}`LLM` class {code}`distributed_executor_backend` argument or {code}`--distributed-executor-backend` API server argument. Set it to {code}`mp` for multiprocessing or {code}`ray` for Ray. It's not required for Ray to be installed for the multiprocessing case.
+Multiprocessing will be used by default when not running in a Ray placement group and if there are sufficient GPUs available on the same node for the configured `tensor_parallel_size`, otherwise Ray will be used. This default can be overridden via the `LLM` class `distributed_executor_backend` argument or `--distributed-executor-backend` API server argument. Set it to `mp` for multiprocessing or `ray` for Ray. It's not required for Ray to be installed for the multiprocessing case.
 
-To run multi-GPU inference with the {code}`LLM` class, set the {code}`tensor_parallel_size` argument to the number of GPUs you want to use. For example, to run inference on 4 GPUs:
+To run multi-GPU inference with the `LLM` class, set the `tensor_parallel_size` argument to the number of GPUs you want to use. For example, to run inference on 4 GPUs:
 
 ```python
 from vllm import LLM
@@ -32,14 +32,14 @@ llm = LLM("facebook/opt-13b", tensor_parallel_size=4)
 output = llm.generate("San Franciso is a")
 ```
 
-To run multi-GPU serving, pass in the {code}`--tensor-parallel-size` argument when starting the server. For example, to run API server on 4 GPUs:
+To run multi-GPU serving, pass in the `--tensor-parallel-size` argument when starting the server. For example, to run API server on 4 GPUs:
 
 ```console
 $ vllm serve facebook/opt-13b \
 $     --tensor-parallel-size 4
 ```
 
-You can also additionally specify {code}`--pipeline-parallel-size` to enable pipeline parallelism. For example, to run API server on 8 GPUs with pipeline parallelism and tensor parallelism:
+You can also additionally specify `--pipeline-parallel-size` to enable pipeline parallelism. For example, to run API server on 8 GPUs with pipeline parallelism and tensor parallelism:
 
 ```console
 $ vllm serve gpt2 \
@@ -47,7 +47,7 @@ $     --tensor-parallel-size 4 \
 $     --pipeline-parallel-size 2
 ```
 
-## Multi-Node Inference and Serving
+## Running vLLM on multiple nodes
 
 If a single node does not have enough GPUs to hold the model, you can run the model using multiple nodes. It is important to make sure the execution environment is the same on all nodes, including the model path, the Python environment. The recommended way is to use docker images to ensure the same environment, and hide the heterogeneity of the host machines via mapping them into the same docker configuration.
 
diff --git a/docs/source/serving/integrations.md b/docs/source/serving/integrations.md
deleted file mode 100644
index d214c77254257..0000000000000
--- a/docs/source/serving/integrations.md
+++ /dev/null
@@ -1,17 +0,0 @@
-# Integrations
-
-```{toctree}
-:maxdepth: 1
-
-run_on_sky
-deploying_with_kserve
-deploying_with_kubeai
-deploying_with_triton
-deploying_with_bentoml
-deploying_with_cerebrium
-deploying_with_lws
-deploying_with_dstack
-serving_with_langchain
-serving_with_llamaindex
-serving_with_llamastack
-```
diff --git a/docs/source/serving/integrations/index.md b/docs/source/serving/integrations/index.md
new file mode 100644
index 0000000000000..371c284981ce9
--- /dev/null
+++ b/docs/source/serving/integrations/index.md
@@ -0,0 +1,8 @@
+# External Integrations
+
+```{toctree}
+:maxdepth: 1
+
+langchain
+llamaindex
+```
diff --git a/docs/source/serving/serving_with_langchain.md b/docs/source/serving/integrations/langchain.md
similarity index 82%
rename from docs/source/serving/serving_with_langchain.md
rename to docs/source/serving/integrations/langchain.md
index 96bd5943f3d64..49ff6e0c32a72 100644
--- a/docs/source/serving/serving_with_langchain.md
+++ b/docs/source/serving/integrations/langchain.md
@@ -1,10 +1,10 @@
-(run-on-langchain)=
+(serving-langchain)=
 
-# Serving with Langchain
+# LangChain
 
-vLLM is also available via [Langchain](https://github.com/langchain-ai/langchain) .
+vLLM is also available via [LangChain](https://github.com/langchain-ai/langchain) .
 
-To install langchain, run
+To install LangChain, run
 
 ```console
 $ pip install langchain langchain_community -q
diff --git a/docs/source/serving/serving_with_llamaindex.md b/docs/source/serving/integrations/llamaindex.md
similarity index 74%
rename from docs/source/serving/serving_with_llamaindex.md
rename to docs/source/serving/integrations/llamaindex.md
index 98859d8e3f828..9961c181d7e1c 100644
--- a/docs/source/serving/serving_with_llamaindex.md
+++ b/docs/source/serving/integrations/llamaindex.md
@@ -1,10 +1,10 @@
-(run-on-llamaindex)=
+(serving-llamaindex)=
 
-# Serving with llama_index
+# LlamaIndex
 
-vLLM is also available via [llama_index](https://github.com/run-llama/llama_index) .
+vLLM is also available via [LlamaIndex](https://github.com/run-llama/llama_index) .
 
-To install llamaindex, run
+To install LlamaIndex, run
 
 ```console
 $ pip install llama-index-llms-vllm -q
diff --git a/docs/source/serving/metrics.md b/docs/source/serving/metrics.md
index 2dc78643f6d8f..e6ded2e6dd465 100644
--- a/docs/source/serving/metrics.md
+++ b/docs/source/serving/metrics.md
@@ -4,7 +4,7 @@ vLLM exposes a number of metrics that can be used to monitor the health of the
 system. These metrics are exposed via the `/metrics` endpoint on the vLLM
 OpenAI compatible API server.
 
-You can start the server using Python, or using [Docker](deploying_with_docker.md):
+You can start the server using Python, or using [Docker](#deployment-docker):
 
 ```console
 $ vllm serve unsloth/Llama-3.2-1B-Instruct
diff --git a/docs/source/features/multimodal_inputs.md b/docs/source/serving/multimodal_inputs.md
similarity index 95%
rename from docs/source/features/multimodal_inputs.md
rename to docs/source/serving/multimodal_inputs.md
index 4f45a9f448cf0..0efa09f2869ca 100644
--- a/docs/source/features/multimodal_inputs.md
+++ b/docs/source/serving/multimodal_inputs.md
@@ -18,7 +18,7 @@ To input multi-modal data, follow this schema in {class}`vllm.inputs.PromptType`
 
 ### Image
 
-You can pass a single image to the {code}`'image'` field of the multi-modal dictionary, as shown in the following examples:
+You can pass a single image to the `'image'` field of the multi-modal dictionary, as shown in the following examples:
 
 ```python
 llm = LLM(model="llava-hf/llava-1.5-7b-hf")
@@ -122,21 +122,21 @@ for o in outputs:
 
 ### Video
 
-You can pass a list of NumPy arrays directly to the {code}`'video'` field of the multi-modal dictionary
+You can pass a list of NumPy arrays directly to the `'video'` field of the multi-modal dictionary
 instead of using multi-image input.
 
 Full example: <gh-file:examples/offline_inference_vision_language.py>
 
 ### Audio
 
-You can pass a tuple {code}`(array, sampling_rate)` to the {code}`'audio'` field of the multi-modal dictionary.
+You can pass a tuple `(array, sampling_rate)` to the `'audio'` field of the multi-modal dictionary.
 
 Full example: <gh-file:examples/offline_inference_audio_language.py>
 
 ### Embedding
 
 To input pre-computed embeddings belonging to a data type (i.e. image, video, or audio) directly to the language model,
-pass a tensor of shape {code}`(num_items, feature_size, hidden_size of LM)` to the corresponding field of the multi-modal dictionary.
+pass a tensor of shape `(num_items, feature_size, hidden_size of LM)` to the corresponding field of the multi-modal dictionary.
 
 ```python
 # Inference with image embeddings as input
@@ -294,7 +294,7 @@ $ export VLLM_IMAGE_FETCH_TIMEOUT=<timeout>
 
 ### Video
 
-Instead of {code}`image_url`, you can pass a video file via {code}`video_url`. Here is a simple example using [LLaVA-OneVision](https://huggingface.co/llava-hf/llava-onevision-qwen2-0.5b-ov-hf).
+Instead of `image_url`, you can pass a video file via `video_url`. Here is a simple example using [LLaVA-OneVision](https://huggingface.co/llava-hf/llava-onevision-qwen2-0.5b-ov-hf).
 
 First, launch the OpenAI-compatible server:
 
@@ -418,7 +418,7 @@ result = chat_completion_from_base64.choices[0].message.content
 print("Chat completion output from input audio:", result)
 ```
 
-Alternatively, you can pass {code}`audio_url`, which is the audio counterpart of {code}`image_url` for image input:
+Alternatively, you can pass `audio_url`, which is the audio counterpart of `image_url` for image input:
 
 ```python
 chat_completion_from_url = client.chat.completions.create(
diff --git a/docs/source/serving/offline_inference.md b/docs/source/serving/offline_inference.md
new file mode 100644
index 0000000000000..83178f7811825
--- /dev/null
+++ b/docs/source/serving/offline_inference.md
@@ -0,0 +1,79 @@
+(offline-inference)=
+
+# Offline Inference
+
+You can run vLLM in your own code on a list of prompts.
+
+The offline API is based on the {class}`~vllm.LLM` class.
+To initialize the vLLM engine, create a new instance of `LLM` and specify the model to run.
+
+For example, the following code downloads the [`facebook/opt-125m`](https://huggingface.co/facebook/opt-125m) model from HuggingFace
+and runs it in vLLM using the default configuration.
+
+```python
+llm = LLM(model="facebook/opt-125m")
+```
+
+After initializing the `LLM` instance, you can perform model inference using various APIs.
+The available APIs depend on the type of model that is being run:
+
+- [Generative models](#generative-models) output logprobs which are sampled from to obtain the final output text.
+- [Pooling models](#pooling-models) output their hidden states directly.
+
+Please refer to the above pages for more details about each API.
+
+```{seealso}
+[API Reference](/dev/offline_inference/offline_index)
+```
+
+## Configuration Options
+
+This section lists the most common options for running the vLLM engine.
+For a full list, refer to the [Engine Arguments](#engine-args) page.
+
+### Reducing memory usage
+
+Large models might cause your machine to run out of memory (OOM). Here are some options that help alleviate this problem.
+
+#### Tensor Parallelism (TP)
+
+Tensor parallelism (`tensor_parallel_size` option) can be used to split the model across multiple GPUs.
+
+The following code splits the model across 2 GPUs.
+
+```python
+llm = LLM(model="ibm-granite/granite-3.1-8b-instruct",
+          tensor_parallel_size=2)
+```
+
+```{important}
+To ensure that vLLM initializes CUDA correctly, you should avoid calling related functions (e.g. {func}`torch.cuda.set_device`)
+before initializing vLLM. Otherwise, you may run into an error like `RuntimeError: Cannot re-initialize CUDA in forked subprocess`.
+
+To control which devices are used, please instead set the `CUDA_VISIBLE_DEVICES` environment variable.
+```
+
+#### Quantization
+
+Quantized models take less memory at the cost of lower precision.
+
+Statically quantized models can be downloaded from HF Hub (some popular ones are available at [Neural Magic](https://huggingface.co/neuralmagic))
+and used directly without extra configuration.
+
+Dynamic quantization is also supported via the `quantization` option -- see [here](#quantization-index) for more details.
+
+#### Context length and batch size
+
+You can further reduce memory usage by limit the context length of the model (`max_model_len` option)
+and the maximum batch size (`max_num_seqs` option).
+
+```python
+llm = LLM(model="adept/fuyu-8b",
+          max_model_len=2048,
+          max_num_seqs=2)
+```
+
+### Performance optimization and tuning
+
+You can potentially improve the performance of vLLM by finetuning various options.
+Please refer to [this guide](#optimization-and-tuning) for more details.
diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md
index 97e9879075570..1e5ea6357d202 100644
--- a/docs/source/serving/openai_compatible_server.md
+++ b/docs/source/serving/openai_compatible_server.md
@@ -1,8 +1,10 @@
-# OpenAI Compatible Server
+(openai-compatible-server)=
 
-vLLM provides an HTTP server that implements OpenAI's [Completions](https://platform.openai.com/docs/api-reference/completions) and [Chat](https://platform.openai.com/docs/api-reference/chat) API, and more!
+# OpenAI-Compatible Server
 
-You can start the server via the [`vllm serve`](#vllm-serve) command, or through [Docker](deploying_with_docker.md):
+vLLM provides an HTTP server that implements OpenAI's [Completions API](https://platform.openai.com/docs/api-reference/completions), [Chat API](https://platform.openai.com/docs/api-reference/chat), and more!
+
+You can start the server via the [`vllm serve`](#vllm-serve) command, or through [Docker](#deployment-docker):
 ```bash
 vllm serve NousResearch/Meta-Llama-3-8B-Instruct --dtype auto --api-key token-abc123
 ```
diff --git a/docs/source/serving/usage_stats.md b/docs/source/serving/usage_stats.md
index 3d02fbab9216e..cfc3cb2576873 100644
--- a/docs/source/serving/usage_stats.md
+++ b/docs/source/serving/usage_stats.md
@@ -45,7 +45,7 @@ You can preview the collected data by running the following command:
 tail ~/.config/vllm/usage_stats.json
 ```
 
-## Opt-out of Usage Stats Collection
+## Opting out
 
 You can opt-out of usage stats collection by setting the `VLLM_NO_USAGE_STATS` or `DO_NOT_TRACK` environment variable, or by creating a `~/.config/vllm/do_not_track` file:
 

From b278557935d78b337fb5e82a32b02f75678b4101 Mon Sep 17 00:00:00 2001
From: Jee Jee Li <pandaleefree@gmail.com>
Date: Tue, 7 Jan 2025 12:01:39 +0800
Subject: [PATCH 100/462] [Kernel][LoRA]Punica prefill  kernels fusion (#11234)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Signed-off-by: Abatom <abzhonghua@gmail.com>
Co-authored-by: Zhonghua Deng <abatom@163.com>
---
 .buildkite/test-pipeline.yaml          |   3 +-
 tests/lora/test_minicpmv.py            |  77 --------
 tests/lora/test_minicpmv_tp.py         |  63 +++++--
 tests/lora/test_punica_sizes.py        | 166 ++++++++---------
 tests/lora/test_punica_variation.py    | 168 +++++++++--------
 tests/lora/utils.py                    | 144 ++++++++++++---
 vllm/lora/ops/sgmv_expand.py           | 205 +++++++++++++--------
 vllm/lora/ops/sgmv_expand_slice.py     | 241 -------------------------
 vllm/lora/ops/sgmv_shrink.py           | 129 +++++++------
 vllm/lora/ops/utils.py                 | 121 ++++++++++++-
 vllm/lora/punica_wrapper/punica_gpu.py | 154 ++++++----------
 11 files changed, 707 insertions(+), 764 deletions(-)
 delete mode 100644 tests/lora/test_minicpmv.py
 delete mode 100644 vllm/lora/ops/sgmv_expand_slice.py

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 529daf54faecf..dcfe228ce8eae 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -242,7 +242,7 @@ steps:
   source_file_dependencies:
   - vllm/lora
   - tests/lora
-  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py
+  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py
   parallelism: 4
 
 - label: "PyTorch Fullgraph Smoke Test" # 9min
@@ -535,6 +535,7 @@ steps:
     # requires multi-GPU testing for validation.
     - pytest -v -s -x lora/test_chatglm3_tp.py
     - pytest -v -s -x lora/test_llama_tp.py
+    - pytest -v -s -x lora/test_minicpmv_tp.py
 
 
 - label: Weight Loading Multiple GPU Test  # 33min
diff --git a/tests/lora/test_minicpmv.py b/tests/lora/test_minicpmv.py
deleted file mode 100644
index 78bf5a1617233..0000000000000
--- a/tests/lora/test_minicpmv.py
+++ /dev/null
@@ -1,77 +0,0 @@
-from typing import List
-
-import pytest
-
-import vllm
-from vllm.assets.image import ImageAsset
-from vllm.lora.request import LoRARequest
-from vllm.platforms import current_platform
-
-MODEL_PATH = "openbmb/MiniCPM-Llama3-V-2_5"
-
-PROMPT_TEMPLATE = (
-    "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"
-    "(<image>./</image>)\nWhat is in the image?<|eot_id|>"
-    "<|start_header_id|>assistant<|end_header_id|>\n\n")
-
-IMAGE_ASSETS = [
-    ImageAsset("stop_sign"),
-    ImageAsset("cherry_blossom"),
-]
-
-# After fine-tuning with LoRA, all generated content should start begin `A`.
-EXPECTED_OUTPUT = [
-    "A red and white stop sign with a Chinese archway in the background featuring red lanterns and gold accents.",  # noqa: E501
-    "A pink cherry blossom tree with a blue sky in the background.",
-]
-
-
-def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
-    sampling_params = vllm.SamplingParams(
-        temperature=0,
-        max_tokens=5,
-        stop_token_ids=[128001, 128009],  # eos_id, eot_id
-    )
-
-    inputs = [{
-        "prompt": PROMPT_TEMPLATE,
-        "multi_modal_data": {
-            "image": asset.pil_image
-        },
-    } for asset in IMAGE_ASSETS]
-
-    outputs = llm.generate(
-        inputs,
-        sampling_params,
-        lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
-        if lora_id else None,
-    )
-    # Print the outputs.
-    generated_texts: List[str] = []
-    for output in outputs:
-        prompt = output.prompt
-        generated_text = output.outputs[0].text.strip()
-        generated_texts.append(generated_text)
-        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-    return generated_texts
-
-
-@pytest.mark.xfail(
-    current_platform.is_rocm(),
-    reason="MiniCPM-V dependency xformers incompatible with ROCm")
-def test_minicpmv_lora(minicpmv_lora_files):
-    llm = vllm.LLM(
-        MODEL_PATH,
-        max_num_seqs=2,
-        enable_lora=True,
-        max_loras=4,
-        max_lora_rank=64,
-        trust_remote_code=True,
-        enable_chunked_prefill=True,
-    )
-    output1 = do_sample(llm, minicpmv_lora_files, lora_id=1)
-    for i in range(len(EXPECTED_OUTPUT)):
-        assert EXPECTED_OUTPUT[i].startswith(output1[i])
-    output2 = do_sample(llm, minicpmv_lora_files, lora_id=2)
-    for i in range(len(EXPECTED_OUTPUT)):
-        assert EXPECTED_OUTPUT[i].startswith(output2[i])
diff --git a/tests/lora/test_minicpmv_tp.py b/tests/lora/test_minicpmv_tp.py
index 930f177953a5f..3b0f18325a40b 100644
--- a/tests/lora/test_minicpmv_tp.py
+++ b/tests/lora/test_minicpmv_tp.py
@@ -3,10 +3,10 @@
 import pytest
 
 import vllm
+from tests.utils import fork_new_process_for_each_test
 from vllm.assets.image import ImageAsset
 from vllm.lora.request import LoRARequest
-
-from ..utils import multi_gpu_test
+from vllm.platforms import current_platform
 
 MODEL_PATH = "openbmb/MiniCPM-Llama3-V-2_5"
 
@@ -17,13 +17,11 @@
 
 IMAGE_ASSETS = [
     ImageAsset("stop_sign"),
-    ImageAsset("cherry_blossom"),
 ]
 
 # After fine-tuning with LoRA, all generated content should start begin `A`.
 EXPECTED_OUTPUT = [
     "A red and white stop sign with a Chinese archway in the background featuring red lanterns and gold accents.",  # noqa: E501
-    "A pink cherry blossom tree with a blue sky in the background.",
 ]
 
 
@@ -50,48 +48,75 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
     # Print the outputs.
     generated_texts: List[str] = []
     for output in outputs:
-        prompt = output.prompt
         generated_text = output.outputs[0].text.strip()
         generated_texts.append(generated_text)
-        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+        print(f"Generated text: {generated_text!r}")
     return generated_texts
 
 
-@multi_gpu_test(num_gpus=2)
-@pytest.mark.parametrize("fully_sharded", [True, False])
-def test_minicpmv_tp2(minicpmv_lora_files, fully_sharded):
+@pytest.mark.xfail(
+    current_platform.is_rocm(),
+    reason="MiniCPM-V dependency xformers incompatible with ROCm")
+@fork_new_process_for_each_test
+def test_minicpmv_lora(minicpmv_lora_files):
+    llm = vllm.LLM(
+        MODEL_PATH,
+        max_num_seqs=2,
+        enable_lora=True,
+        max_loras=2,
+        max_lora_rank=8,
+        enforce_eager=True,
+        trust_remote_code=True,
+        enable_chunked_prefill=True,
+    )
+    output1 = do_sample(llm, minicpmv_lora_files, lora_id=1)
+    for i in range(len(EXPECTED_OUTPUT)):
+        assert EXPECTED_OUTPUT[i].startswith(output1[i])
+    output2 = do_sample(llm, minicpmv_lora_files, lora_id=2)
+    for i in range(len(EXPECTED_OUTPUT)):
+        assert EXPECTED_OUTPUT[i].startswith(output2[i])
+
+
+@pytest.mark.xfail(
+    current_platform.is_rocm(),
+    reason="MiniCPM-V dependency xformers incompatible with ROCm")
+@fork_new_process_for_each_test
+def test_minicpmv_tp4_wo_fully_sharded_loras(minicpmv_lora_files):
     llm = vllm.LLM(
         MODEL_PATH,
         enable_lora=True,
         max_num_seqs=2,
         max_loras=4,
         max_lora_rank=64,
-        tensor_parallel_size=2,
+        tensor_parallel_size=4,
         trust_remote_code=True,
-        fully_sharded_loras=fully_sharded,
+        enforce_eager=True,
         enable_chunked_prefill=True,
     )
-
     output_tp = do_sample(llm, minicpmv_lora_files, lora_id=1)
-
     for i in range(len(EXPECTED_OUTPUT)):
         assert EXPECTED_OUTPUT[i].startswith(output_tp[i])
 
 
-@multi_gpu_test(num_gpus=4)
-@pytest.mark.parametrize("fully_sharded", [True, False])
-def test_minicpmv_tp4(minicpmv_lora_files, fully_sharded):
+@pytest.mark.xfail(
+    current_platform.is_rocm(),
+    reason="MiniCPM-V dependency xformers incompatible with ROCm")
+@fork_new_process_for_each_test
+def test_minicpmv_tp4_fully_sharded_loras(minicpmv_lora_files):
     llm = vllm.LLM(
         MODEL_PATH,
         enable_lora=True,
         max_num_seqs=2,
-        max_loras=4,
-        max_lora_rank=64,
+        max_loras=2,
+        max_lora_rank=8,
         tensor_parallel_size=4,
         trust_remote_code=True,
-        fully_sharded_loras=fully_sharded,
+        fully_sharded_loras=True,
         enable_chunked_prefill=True,
     )
     output_tp = do_sample(llm, minicpmv_lora_files, lora_id=1)
     for i in range(len(EXPECTED_OUTPUT)):
         assert EXPECTED_OUTPUT[i].startswith(output_tp[i])
+    output_tp = do_sample(llm, minicpmv_lora_files, lora_id=2)
+    for i in range(len(EXPECTED_OUTPUT)):
+        assert EXPECTED_OUTPUT[i].startswith(output_tp[i])
diff --git a/tests/lora/test_punica_sizes.py b/tests/lora/test_punica_sizes.py
index 66b5f82bbb97d..0351fedd1cfa5 100644
--- a/tests/lora/test_punica_sizes.py
+++ b/tests/lora/test_punica_sizes.py
@@ -4,6 +4,8 @@
 whether the corresponding Triton kernel can run normally when tensor parallelism
 is set to [1, 2, 4, 8, 16, 32, 64].
 """
+from threading import Lock
+
 import pytest
 import torch
 
@@ -11,12 +13,13 @@
 from vllm.lora.ops.bgmv_expand_slice import bgmv_expand_slice
 from vllm.lora.ops.bgmv_shrink import bgmv_shrink
 from vllm.lora.ops.sgmv_expand import sgmv_expand
-from vllm.lora.ops.sgmv_expand_slice import sgmv_expand_slice
 from vllm.lora.ops.sgmv_shrink import sgmv_shrink
+from vllm.lora.ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT
 from vllm.platforms import current_platform
 
-from .utils import (generate_data, generate_data_for_expand_nslices,
-                    ref_torch_groupgemm)
+from .utils import (assert_close, generate_data,
+                    generate_data_for_expand_nslices,
+                    generate_data_for_nslices, ref_torch_groupgemm)
 
 HIDDEN_SIZES = [
     128,
@@ -112,14 +115,7 @@
 SEED = [0]
 CUDA_DEVICES = [f"cuda:{0}"]
 
-
-def assert_close(a, b):
-    rtol, atol = {
-        torch.float16: (6e-2, 6e-2),
-        torch.bfloat16: (6e-2, 6e-2),
-        torch.float32: (1e-2, 1e-2),
-    }[a.dtype]
-    torch.testing.assert_close(a, b, rtol=rtol, atol=atol)
+_dict_lock = Lock()
 
 
 @pytest.mark.parametrize("batches", BATCHES)
@@ -127,6 +123,7 @@ def assert_close(a, b):
 @pytest.mark.parametrize("rank", MAX_RANKS)
 @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
 @pytest.mark.parametrize("scaling", SCALES)
+@pytest.mark.parametrize("nslices", [1, 2, 3])
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("op_type", ["shrink", "expand"])
 @pytest.mark.parametrize("seed", SEED)
@@ -137,6 +134,7 @@ def test_punica_sgmv(
     rank: int,
     hidden_size: int,
     scaling: float,
+    nslices: int,
     dtype: torch.dtype,
     op_type: str,
     seed: int,
@@ -148,19 +146,20 @@ def test_punica_sgmv(
     seq_length = 128
     (
         inputs_tensor,
-        lora_weights,
+        lora_weights_lst,
         our_out_tensor,
         ref_out_tensor,
         b_seq_start_loc,
         lora_indices_tensor,
         seq_len_tensor,
         indices,
-    ) = generate_data(
+    ) = generate_data_for_nslices(
         batches,
         hidden_size,
         num_loras,
         rank,
         seq_length,
+        nslices,
         dtype,
         op_type,
         device,
@@ -172,43 +171,64 @@ def test_punica_sgmv(
     else:
         max_seq_length = max_seq_length.item()
     if op_type == "shrink":
-        sgmv_shrink(
-            inputs_tensor,
-            lora_weights,
-            our_out_tensor,
-            b_seq_start_loc,
-            seq_len_tensor,
-            lora_indices_tensor,
-            batches,
-            max_seq_length,
-            token_nums,
-            scaling,
-        )
+        # Preventing cache error pointer.
+        with _dict_lock:
+            _LORA_A_PTR_DICT.clear()
+            sgmv_shrink(
+                inputs_tensor,
+                lora_weights_lst,
+                our_out_tensor,
+                b_seq_start_loc,
+                seq_len_tensor,
+                lora_indices_tensor,
+                batches,
+                max_seq_length,
+                token_nums,
+                scaling,
+            )
+        for index in range(nslices):
+            ref_torch_groupgemm(
+                ref_out_tensor[index],
+                inputs_tensor,
+                lora_weights_lst[index],
+                lora_indices_tensor,
+                seq_len_tensor,
+                batches,
+                scaling,
+                op_type,
+            )
     else:
-        sgmv_expand(
-            inputs_tensor,
-            lora_weights,
-            our_out_tensor,
-            b_seq_start_loc,
-            seq_len_tensor,
-            lora_indices_tensor,
-            batches,
-            max_seq_length,
-            token_nums,
-            add_inputs=True,
-        )
-    ref_torch_groupgemm(
-        ref_out_tensor,
-        inputs_tensor,
-        lora_weights,
-        lora_indices_tensor,
-        seq_len_tensor,
-        batches,
-        scaling if op_type == "shrink" else 1.0,
-        op_type,
-    )
-    if op_type == "shrink":
-        ref_out_tensor = ref_out_tensor.to(torch.float32)
+        with _dict_lock:
+            _LORA_B_PTR_DICT.clear()
+            sgmv_expand(
+                inputs_tensor,
+                lora_weights_lst,
+                our_out_tensor,
+                b_seq_start_loc,
+                seq_len_tensor,
+                lora_indices_tensor,
+                batches,
+                max_seq_length,
+                token_nums,
+                offset_start=0,
+                add_inputs=True,
+            )
+
+        slice_offset = 0
+        for index in range(nslices):
+            lora_weights = lora_weights_lst[index]
+            ref_torch_groupgemm(
+                ref_out_tensor[:, slice_offset:slice_offset + hidden_size],
+                inputs_tensor[index],
+                lora_weights,
+                lora_indices_tensor,
+                seq_len_tensor,
+                batches,
+                1.0,
+                op_type,
+            )
+            slice_offset += hidden_size
+
     assert_close(our_out_tensor, ref_out_tensor)
 
 
@@ -292,25 +312,22 @@ def test_punica_bgmv(
 @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
 @pytest.mark.parametrize("nslices", [2, 3])
 @pytest.mark.parametrize("dtype", DTYPES)
-@pytest.mark.parametrize("op_type", ["sgmv", "bgmv"])
 @pytest.mark.parametrize("seed", SEED)
 @pytest.mark.parametrize("device", CUDA_DEVICES)
-def test_punica_expand_nslices(
+def test_punica_bgmv_expand_nslices(
     batches: int,
     num_loras: int,
     rank: int,
     hidden_size: int,
     nslices: int,
     dtype: torch.dtype,
-    op_type: str,
     seed: int,
     device: str,
 ):
-
     torch.set_default_device(device)
     current_platform.seed_everything(seed)
 
-    seq_length = 128 if op_type == "sgmv" else 1
+    seq_length = 1
     (
         inputs_tensor,
         lora_weights_lst,
@@ -330,41 +347,18 @@ def test_punica_expand_nslices(
         nslices,
         device,
     )
-    max_seq_length = seq_len_tensor.max()
-    token_nums = seq_len_tensor.sum().item()
-    if isinstance(max_seq_length, tuple):
-        max_seq_length = max_seq_length[0].item()
-    else:
-        max_seq_length = max_seq_length.item()
     slice_offset = 0
     for index in range(nslices):
         lora_weights = lora_weights_lst[index]
-        if op_type == "sgmv":
-            sgmv_expand_slice(
-                inputs_tensor,
-                lora_weights,
-                our_outputs,
-                b_seq_start_loc,
-                seq_len_tensor,
-                lora_indices_tensor,
-                batches,
-                max_seq_length,
-                token_nums,
-                slice_offset,
-                hidden_size,
-                add_inputs=True,
-            )
-        else:
-
-            bgmv_expand_slice(
-                inputs_tensor,
-                lora_weights,
-                our_outputs,
-                indices,
-                slice_offset,
-                slice_size=hidden_size,
-                add_inputs=True,
-            )
+        bgmv_expand_slice(
+            inputs_tensor,
+            lora_weights,
+            our_outputs,
+            indices,
+            slice_offset,
+            slice_size=hidden_size,
+            add_inputs=True,
+        )
         ref_torch_groupgemm(
             ref_outputs[:, slice_offset:slice_offset + hidden_size],
             inputs_tensor,
diff --git a/tests/lora/test_punica_variation.py b/tests/lora/test_punica_variation.py
index 3b20033271d26..9ee10e7c23ee6 100644
--- a/tests/lora/test_punica_variation.py
+++ b/tests/lora/test_punica_variation.py
@@ -3,6 +3,8 @@
 under different conditions, including various batches, numbers of LoRA , and
 maximum ranks.
 """
+from threading import Lock
+
 import pytest
 import torch
 
@@ -11,12 +13,13 @@
 import vllm.lora.ops.bgmv_expand_slice
 import vllm.lora.ops.bgmv_shrink
 import vllm.lora.ops.sgmv_expand
-import vllm.lora.ops.sgmv_expand_slice
 import vllm.lora.ops.sgmv_shrink  # noqa: F401
+from vllm.lora.ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT
 from vllm.platforms import current_platform
 
-from .utils import (generate_data, generate_data_for_expand_nslices,
-                    ref_torch_groupgemm)
+from .utils import (assert_close, generate_data,
+                    generate_data_for_expand_nslices,
+                    generate_data_for_nslices, ref_torch_groupgemm)
 
 HIDDEN_SIZES = [4097]
 
@@ -28,31 +31,23 @@
 SEED = [0]
 CUDA_DEVICES = [f"cuda:{0}"]
 
-
-def assert_close(a, b):
-    rtol, atol = {
-        torch.float16: (6e-2, 6e-2),
-        torch.bfloat16: (6e-2, 6e-2),
-        torch.float32: (1e-2, 1e-2),
-    }[a.dtype]
-    torch.testing.assert_close(a, b, rtol=rtol, atol=atol)
-
-
 # Unlike test_punica_sizes.py, we directly utilize custom op for
 # testing, which verifies the correct registration of these ops.
 bgmv_expand = torch.ops.vllm.bgmv_expand
 bgmv_expand_slice = torch.ops.vllm.bgmv_expand_slice
 bgmv_shrink = torch.ops.vllm.bgmv_shrink
 sgmv_expand = torch.ops.vllm.sgmv_expand
-sgmv_expand_slice = torch.ops.vllm.sgmv_expand_slice
 sgmv_shrink = torch.ops.vllm.sgmv_shrink
 
+_dict_lock = Lock()
+
 
 @pytest.mark.parametrize("batches", BATCHES)
 @pytest.mark.parametrize("num_loras", NUM_LORA)
 @pytest.mark.parametrize("rank", MAX_RANKS)
 @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
 @pytest.mark.parametrize("scaling", SCALES)
+@pytest.mark.parametrize("nslices", [1, 2, 3])
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("op_type", ["shrink", "expand"])
 @pytest.mark.parametrize("seed", SEED)
@@ -63,6 +58,7 @@ def test_punica_sgmv(
     rank: int,
     hidden_size: int,
     scaling: float,
+    nslices: int,
     dtype: torch.dtype,
     op_type: str,
     seed: int,
@@ -74,19 +70,20 @@ def test_punica_sgmv(
     seq_length = 128
     (
         inputs_tensor,
-        lora_weights,
+        lora_weights_lst,
         our_out_tensor,
         ref_out_tensor,
         b_seq_start_loc,
         lora_indices_tensor,
         seq_len_tensor,
         indices,
-    ) = generate_data(
+    ) = generate_data_for_nslices(
         batches,
         hidden_size,
         num_loras,
         rank,
         seq_length,
+        nslices,
         dtype,
         op_type,
         device,
@@ -98,43 +95,64 @@ def test_punica_sgmv(
     else:
         max_seq_length = max_seq_length.item()
     if op_type == "shrink":
-        sgmv_shrink(
-            inputs_tensor,
-            lora_weights,
-            our_out_tensor,
-            b_seq_start_loc,
-            seq_len_tensor,
-            lora_indices_tensor,
-            batches,
-            max_seq_length,
-            token_nums,
-            scaling,
-        )
+        # Preventing cache error pointer.
+        with _dict_lock:
+            _LORA_A_PTR_DICT.clear()
+            sgmv_shrink(
+                inputs_tensor,
+                lora_weights_lst,
+                our_out_tensor,
+                b_seq_start_loc,
+                seq_len_tensor,
+                lora_indices_tensor,
+                batches,
+                max_seq_length,
+                token_nums,
+                scaling,
+            )
+        for index in range(nslices):
+            ref_torch_groupgemm(
+                ref_out_tensor[index],
+                inputs_tensor,
+                lora_weights_lst[index],
+                lora_indices_tensor,
+                seq_len_tensor,
+                batches,
+                scaling,
+                op_type,
+            )
     else:
-        sgmv_expand(
-            inputs_tensor,
-            lora_weights,
-            our_out_tensor,
-            b_seq_start_loc,
-            seq_len_tensor,
-            lora_indices_tensor,
-            batches,
-            max_seq_length,
-            token_nums,
-            add_inputs=True,
-        )
-    ref_torch_groupgemm(
-        ref_out_tensor,
-        inputs_tensor,
-        lora_weights,
-        lora_indices_tensor,
-        seq_len_tensor,
-        batches,
-        scaling if op_type == "shrink" else 1.0,
-        op_type,
-    )
-    if op_type == "shrink":
-        ref_out_tensor = ref_out_tensor.to(torch.float32)
+        with _dict_lock:
+            _LORA_B_PTR_DICT.clear()
+            sgmv_expand(
+                inputs_tensor,
+                lora_weights_lst,
+                our_out_tensor,
+                b_seq_start_loc,
+                seq_len_tensor,
+                lora_indices_tensor,
+                batches,
+                max_seq_length,
+                token_nums,
+                offset_start=0,
+                add_inputs=True,
+            )
+
+        slice_offset = 0
+        for index in range(nslices):
+            lora_weights = lora_weights_lst[index]
+            ref_torch_groupgemm(
+                ref_out_tensor[:, slice_offset:slice_offset + hidden_size],
+                inputs_tensor[index],
+                lora_weights,
+                lora_indices_tensor,
+                seq_len_tensor,
+                batches,
+                1.0,
+                op_type,
+            )
+            slice_offset += hidden_size
+
     assert_close(our_out_tensor, ref_out_tensor)
 
 
@@ -220,24 +238,22 @@ def test_punica_bgmv(
 @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
 @pytest.mark.parametrize("nslices", [2, 3])
 @pytest.mark.parametrize("dtype", DTYPES)
-@pytest.mark.parametrize("op_type", ["sgmv", "bgmv"])
 @pytest.mark.parametrize("seed", SEED)
 @pytest.mark.parametrize("device", CUDA_DEVICES)
-def test_punica_expand_nslices(
+def test_punica_bgmv_expand_nslices(
     batches: int,
     num_loras: int,
     rank: int,
     hidden_size: int,
     nslices: int,
     dtype: torch.dtype,
-    op_type: str,
     seed: int,
     device: str,
 ):
     torch.set_default_device(device)
     current_platform.seed_everything(seed)
 
-    seq_length = 128 if op_type == "sgmv" else 1
+    seq_length = 1
     (
         inputs_tensor,
         lora_weights_lst,
@@ -257,40 +273,18 @@ def test_punica_expand_nslices(
         nslices,
         device,
     )
-    max_seq_length = seq_len_tensor.max()
-    token_nums = seq_len_tensor.sum().item()
-    if isinstance(max_seq_length, tuple):
-        max_seq_length = max_seq_length[0].item()
-    else:
-        max_seq_length = max_seq_length.item()
     slice_offset = 0
     for index in range(nslices):
         lora_weights = lora_weights_lst[index]
-        if op_type == "sgmv":
-            sgmv_expand_slice(
-                inputs_tensor,
-                lora_weights,
-                our_outputs,
-                b_seq_start_loc,
-                seq_len_tensor,
-                lora_indices_tensor,
-                batches,
-                max_seq_length,
-                token_nums,
-                slice_offset,
-                hidden_size,
-                add_inputs=True,
-            )
-        else:
-            bgmv_expand_slice(
-                inputs_tensor,
-                lora_weights,
-                our_outputs,
-                indices,
-                slice_offset,
-                slice_size=hidden_size,
-                add_inputs=True,
-            )
+        bgmv_expand_slice(
+            inputs_tensor,
+            lora_weights,
+            our_outputs,
+            indices,
+            slice_offset,
+            slice_size=hidden_size,
+            add_inputs=True,
+        )
         ref_torch_groupgemm(
             ref_outputs[:, slice_offset:slice_offset + hidden_size],
             inputs_tensor,
diff --git a/tests/lora/utils.py b/tests/lora/utils.py
index e394c33b3f9ea..b66d18074a7bf 100644
--- a/tests/lora/utils.py
+++ b/tests/lora/utils.py
@@ -18,11 +18,13 @@ def set_module_lora(self, module_name: str, lora: LoRALayerWeights):
     def get_module_lora(self, module_name: str) -> LoRALayerWeights:
         return self._loras[module_name]
 
-    def init_random_lora(self,
-                         module_name: str,
-                         weight: torch.Tensor,
-                         rank: int = 8,
-                         generate_embeddings_tensor: int = 0):
+    def init_random_lora(
+        self,
+        module_name: str,
+        weight: torch.Tensor,
+        rank: int = 8,
+        generate_embeddings_tensor: int = 0,
+    ):
         lora = LoRALayerWeights(
             module_name,
             rank=rank,
@@ -35,21 +37,25 @@ def init_random_lora(self,
                               device=self._device),
         )
         if generate_embeddings_tensor:
-            lora.embeddings_tensor = torch.rand(5,
-                                                generate_embeddings_tensor,
-                                                dtype=weight.dtype,
-                                                device=self._device)
+            lora.embeddings_tensor = torch.rand(
+                5,
+                generate_embeddings_tensor,
+                dtype=weight.dtype,
+                device=self._device,
+            )
         self.set_module_lora(module_name, lora)
 
         return lora
 
-    def init_lora(self,
-                  module_name: str,
-                  input_dim: int,
-                  output_dim: int,
-                  rank=8,
-                  noop=False,
-                  embeddings_tensor=None):
+    def init_lora(
+        self,
+        module_name: str,
+        input_dim: int,
+        output_dim: int,
+        rank=8,
+        noop=False,
+        embeddings_tensor=None,
+    ):
         lora = LoRALayerWeights(
             module_name,
             rank=rank,
@@ -125,8 +131,16 @@ def ref_torch_groupgemm(
     return
 
 
-def generate_data(batches, hidden_size, lora_nums, max_rank, seq_length, dtype,
-                  op_type, device):
+def generate_data(
+    batches,
+    hidden_size,
+    lora_nums,
+    max_rank,
+    seq_length,
+    dtype,
+    op_type,
+    device,
+):
     seq_len_tensor = torch.randint(seq_length, seq_length + 1,
                                    (batches, )).to(device)
     b_seq_start_loc = torch.cumsum(
@@ -187,8 +201,16 @@ def generate_data(batches, hidden_size, lora_nums, max_rank, seq_length, dtype,
     )
 
 
-def generate_data_for_expand_nslices(batches, hidden_size, lora_nums, max_rank,
-                                     seq_length, dtype, nslices, device):
+def generate_data_for_expand_nslices(
+    batches,
+    hidden_size,
+    lora_nums,
+    max_rank,
+    seq_length,
+    dtype,
+    nslices,
+    device,
+):
     seq_len_tensor = torch.randint(seq_length, seq_length + 1,
                                    (batches, )).to(device)
     b_seq_start_loc = torch.cumsum(
@@ -221,7 +243,87 @@ def generate_data_for_expand_nslices(batches, hidden_size, lora_nums, max_rank,
     for b_id in range(batches):
         lora_index = lora_indices_tensor[b_id]
         indices[current_offset:current_offset +
-                seq_len_tensor[b_id]] = lora_index.item()
+                seq_len_tensor[b_id]] = (lora_index.item())
+        current_offset += seq_len_tensor[b_id].item()
+
+    lora_indices_tensor = lora_indices_tensor.to(device)
+    return (
+        inputs_tensor,
+        lora_weights_lst,
+        our_out_tensor,
+        ref_out_tensor,
+        b_seq_start_loc,
+        lora_indices_tensor,
+        seq_len_tensor,
+        indices,
+    )
+
+
+def generate_data_for_nslices(
+    batches,
+    hidden_size,
+    lora_nums,
+    max_rank,
+    seq_length,
+    nslices,
+    dtype,
+    op_type,
+    device,
+):
+    seq_len_tensor = torch.randint(seq_length, seq_length + 1,
+                                   (batches, )).to(device)
+    b_seq_start_loc = torch.cumsum(
+        torch.tensor([0] + seq_len_tensor[:-1].tolist(), dtype=torch.long),
+        dim=0,
+    ).to(device)
+    total_tokens = seq_len_tensor.sum()
+
+    lora_weights_lst = []
+    if op_type == "shrink":
+
+        inputs_tensor = torch.rand((total_tokens, hidden_size),
+                                   dtype=dtype).to(device)
+
+        for _ in range(nslices):
+            if op_type == "shrink":
+                lora_weights_lst.append(
+                    torch.rand(
+                        (lora_nums, max_rank, hidden_size),  # col-major
+                        dtype=dtype,
+                    ).to(device))
+        # NOTE  shrink kernel using torch.float32 as output type
+        # shrink op need atomic_add, so output is initinized by 0
+        our_out_tensor = torch.zeros(
+            (nslices, total_tokens, max_rank),
+            dtype=torch.float32,
+        ).to(device)
+    else:
+        inputs_tensor = torch.rand(
+            (nslices, total_tokens, max_rank),
+            dtype=dtype,
+        ).to(device)
+        for _ in range(nslices):
+            lora_weights_lst.append(
+                torch.rand(
+                    (lora_nums, hidden_size, max_rank),  # col-major
+                    dtype=dtype,
+                ).to(device))
+        # expand op needs to complete y+=a@lora_b, so output is
+        # initinized randomly
+        our_out_tensor = torch.rand((total_tokens, hidden_size * nslices),
+                                    dtype=dtype).to(device)
+
+    # Ensure the same input.
+    ref_out_tensor = our_out_tensor.clone()
+    lora_indices_tensor = torch.randint(0,
+                                        lora_nums - 1 if lora_nums > 1 else 1,
+                                        (batches, ))
+    indices = torch.zeros((total_tokens), dtype=torch.long).to(device)
+    current_offset = 0
+    for b_id in range(batches):
+        lora_index = lora_indices_tensor[b_id]
+        indices[current_offset:current_offset +
+                seq_len_tensor[b_id]] = (lora_index.item())
         current_offset += seq_len_tensor[b_id].item()
 
     lora_indices_tensor = lora_indices_tensor.to(device)
diff --git a/vllm/lora/ops/sgmv_expand.py b/vllm/lora/ops/sgmv_expand.py
index 77c5178493c44..8af44b703810b 100644
--- a/vllm/lora/ops/sgmv_expand.py
+++ b/vllm/lora/ops/sgmv_expand.py
@@ -1,66 +1,109 @@
 """
 Based on:
-Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). 
-Punica: Multi-Tenant LoRA Serving. 
+Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023).
+Punica: Multi-Tenant LoRA Serving.
 https://arxiv.org/abs/2310.18547
 """
 
+from typing import List
+
 import torch
 import triton
 import triton.language as tl
 
 from vllm.utils import direct_register_custom_op
 
+from .utils import _get_lora_b_ptr
+
 
 @triton.jit
 def _sgmv_expand_kernel(
-    input_ptr,
-    lora_ptr,
-    out_ptr,
-    N,
-    K,
-    b_seq_start_loc,
-    seq_lens,
-    lora_indices,
-    xm_stride,
-    xk_stride,  # 1
-    l0_stride,  # hidden_size*max_rank
-    lora_k_stride,
-    lora_n_stride,
-    cm_stride,
-    cn_stride,
-    BLOCK_M: tl.constexpr,
-    BLOCK_N: tl.constexpr,
-    BLOCK_K: tl.constexpr,
-    EVEN_K: tl.constexpr,
-    ADD_INPUTS: tl.constexpr,
-    CAST_TYPE: tl.constexpr,
-):
+        input_ptr,
+        lora_ptr,
+        out_ptr,
+        N,
+        K,
+        b_seq_start_loc,
+        seq_lens,
+        lora_indices,
+        slice_start_loc,
+        input_d0_stride,
+        input_d1_stride,
+        input_d2_stride,  # 1
+        ls_d0_ptr,
+        ls_d1_ptr,
+        ls_d2_ptr,  # 1
+        output_d0_stride,
+        output_d1_stride,  # 1
+        output_hs_ptr,
+        BLOCK_M: tl.constexpr,
+        BLOCK_N: tl.constexpr,
+        BLOCK_K: tl.constexpr,
+        EVEN_K: tl.constexpr,
+        ADD_INPUTS: tl.constexpr,
+        CAST_TYPE: tl.constexpr,
+        SLICE_NUM: tl.constexpr,
+        SAME_STRIDE: tl.constexpr):
     """
-    The sgmv's expand triton kernel is based on GroupGEMM.
+
+    Similar to the 'sgmv_expand' operator, but with an added parameter
+    'slice_offset'. The reason for not reusing the 'sgmv_expand' operator
+    might be that in the future, we could implement a fusion operator to
+    achieve the current functionality instead of having to call it multiple
+    times.
     """
     pid = tl.program_id(axis=0)
     cur_batch = tl.program_id(axis=1)
+    slice_id = tl.program_id(axis=2)
     cta_n_num = tl.cdiv(N, BLOCK_N)
+    # When the output dimensions of each slice are the same,cur_n=N, otherwise
+    # cur_n=tl.load(output_hs_ptr + slice_id), this situation exists in GQA's
+    # qkv linear.
+    curr_N = N if SAME_STRIDE else tl.load(output_hs_ptr + slice_id)
     pid_m = pid // cta_n_num
     pid_n = pid % cta_n_num
     M = tl.load(seq_lens + cur_batch)
     if pid_m * BLOCK_M > M:
         return
+    if pid_n * BLOCK_N > curr_N:
+        return
     lora_index = tl.load(lora_indices + cur_batch)
     if lora_index == -1:
         return
+
     cur_seq_start = tl.load(b_seq_start_loc + cur_batch)
     offset_m = tl.arange(0, BLOCK_M) + pid_m * BLOCK_M
     offset_n = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N
     offset_k = tl.arange(0, BLOCK_K)
     ram = tl.max_contiguous(tl.multiple_of(offset_m % M, BLOCK_M), BLOCK_M)
-    rbn = tl.max_contiguous(tl.multiple_of(offset_n % N, BLOCK_N), BLOCK_N)
+    rbn = tl.max_contiguous(tl.multiple_of(offset_n % curr_N, BLOCK_N),
+                            BLOCK_N)
+    # ls_d*_ptr can be either an integer or a pointer
+    if SAME_STRIDE:
+        # integer
+        cur_lora_d0_stride = ls_d0_ptr
+        cur_lora_d1_stride = ls_d1_ptr
+        cur_lora_d2_stride = ls_d2_ptr
+    else:
+        # pointer
+        cur_lora_d0_stride = tl.load(ls_d0_ptr + slice_id)
+        cur_lora_d1_stride = tl.load(ls_d1_ptr + slice_id)
+        cur_lora_d2_stride = tl.load(ls_d2_ptr + slice_id)
+    if SLICE_NUM == 1:
+        cur_input_ptr = input_ptr
+        cur_lora_ptr = lora_ptr
 
-    a_ptr = (input_ptr + cur_seq_start * xm_stride + ram[:, None] * xm_stride +
-             offset_k[None, :] * xk_stride, )
-    b_ptr = (lora_ptr + l0_stride * lora_index +
-             offset_k[:, None] * lora_n_stride + rbn[None, :] * lora_k_stride)
+    else:
+        cur_input_ptr = input_ptr + slice_id * input_d0_stride
+        cur_lora_ptr = tl.load(lora_ptr + slice_id).to(
+            tl.pointer_type(out_ptr.dtype.element_ty))
+
+    a_ptr = (cur_input_ptr + cur_seq_start * input_d1_stride +
+             ram[:, None] * input_d1_stride +
+             offset_k[None, :] * input_d2_stride, )
+    b_ptr = (cur_lora_ptr + cur_lora_d0_stride * lora_index +
+             offset_k[:, None] * cur_lora_d2_stride +
+             rbn[None, :] * cur_lora_d1_stride)
     accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
     for k in range(tl.cdiv(K, BLOCK_K)):
         if EVEN_K:
@@ -74,26 +117,30 @@ def _sgmv_expand_kernel(
                               mask=offset_k[:, None] < K - k * BLOCK_K,
                               other=0)
         if CAST_TYPE:
-            tiled_a = tiled_a.to(lora_ptr.dtype.element_ty)
+            tiled_a = tiled_a.to(cur_lora_ptr.dtype.element_ty)
         accumulator += tl.dot(
             tiled_a,
             tiled_b,
         )
-        a_ptr += BLOCK_K * xk_stride
-        b_ptr += BLOCK_K * lora_n_stride
-    tiled_c = accumulator.to(lora_ptr.dtype.element_ty)
+        a_ptr += BLOCK_K * input_d2_stride
+        b_ptr += BLOCK_K * cur_lora_d2_stride
+
+    tiled_c = accumulator.to(cur_lora_ptr.dtype.element_ty)
+    if SLICE_NUM == 1:
+        cur_slice_start = slice_start_loc
+    else:
+        cur_slice_start = tl.load(slice_start_loc + slice_id)
+
     offset_cm = cur_seq_start + tl.arange(0, BLOCK_M) + pid_m * BLOCK_M
-    offset_cn = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N
-    c_ptr = (out_ptr + offset_cm[:, None] * cm_stride +
-             offset_cn[None, :] * cn_stride)
+    offset_cn = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N + cur_slice_start
+    c_ptr = (out_ptr + offset_cm[:, None] * output_d0_stride +
+             offset_cn[None, :] * output_d1_stride)
     M = tl.load(seq_lens + cur_batch)
     c_mask = (offset_cm[:, None] <
-              (cur_seq_start + M)) & (offset_cn[None, :] < N)
+              (cur_seq_start + M)) & (offset_cn[None, :] <
+                                      (cur_slice_start + curr_N))
     if ADD_INPUTS:
-        # explicitly pass in other=None to tell triton that masked values
-        # can be uninitialized. This is OK because the later tl.store operation
-        # uses the same mask, eliminating the risk of garbage values propagating
-        tiled_out = tl.load(c_ptr, mask=c_mask, other=None)
+        tiled_out = tl.load(c_ptr, mask=c_mask)
         tiled_c += tiled_out
     tl.store(c_ptr, tiled_c, mask=c_mask)
 
@@ -101,7 +148,7 @@ def _sgmv_expand_kernel(
 @torch.inference_mode()
 def _sgmv_expand(
     inputs: torch.Tensor,
-    lora_b_weights: torch.Tensor,
+    lora_b_weights: List[torch.Tensor],
     output_tensor: torch.Tensor,
     b_seq_start_loc: torch.Tensor,
     seq_len_tensor: torch.Tensor,
@@ -109,17 +156,18 @@ def _sgmv_expand(
     batches: int,
     max_seq_length: int,
     token_nums: int,
+    offset_start: int = 0,
     add_inputs: bool = False,
 ) -> None:
     """
     Args:
         inputs (torch.Tensor): input tensor
-        lora_b_weights (torch.Tensor): lora'a weight
+        lora_b_weights (List[torch.Tensor]): lora'b weight
         output_tensor (torch.Tensor): output tensor
         b_seq_start_loc (torch.Tensor): (batch_size,). The cumulative
             sequence lengths of the sequences in the batch, used to index
             into sequence. E.g., if the sequence length is [4, 6], it is
-            [0, 4, 10].
+            [0, 4].
         seq_len_tensor (torch.Tensor): (batch_size,). Record the sequence
             length of the sequences in the batch.
         lora_indices_tensor (torch.Tensor): (batch_size,). The LoRA index
@@ -130,77 +178,80 @@ def _sgmv_expand(
             batch.
         token_nums (int): The token numbers in the batch. Used to verify if the 
             token numbers in the inputs matches the one in the metadata.
-        add_inputs (bool, optional): Defaults to False, adds the final lora 
-            results to the output.
+        offset_start (int, optional): Offset start for output_tensor. 
+            Defaults to 0.
+        add_inputs (bool, optional): Whether to add the input tensor to the 
+            output tensor. Defaults to False.
     """
-
     assert inputs.dtype in [torch.float16, torch.bfloat16, torch.float32]
-    assert lora_b_weights.dtype in [
-        torch.float16,
-        torch.bfloat16,
-    ]
-    assert inputs.size(0) == token_nums
-    assert inputs.size(1) == lora_b_weights.size(-1)
+    for weight in lora_b_weights:
+        assert weight.dtype in [torch.float16, torch.bfloat16]
+
+    assert inputs.size(1) == token_nums
+    assert inputs.size(0) == len(lora_b_weights)
+
     assert b_seq_start_loc.size(0) == batches
     assert lora_indices_tensor.size(0) == batches
-    assert inputs.is_contiguous()
     assert output_tensor.is_contiguous()
-
-    if lora_b_weights.ndim == 4:  # shape:(lora_num,1,size,rank)
-        assert lora_b_weights.size(1) == 1
-        lora_b_weights = lora_b_weights.squeeze(dim=1)
-    else:
-        assert lora_b_weights.ndim == 3  # shape:(lora_num,size,rank)
-
-    assert lora_b_weights.is_contiguous()
+    (slice_start_tensor, lora_ptr_tensor, lora_strides_d0_tensor,
+     lora_strides_d1_tensor, lora_strides_d2_tensor, hidden_sizes_tensor,
+     same_stride, MAX_N) = _get_lora_b_ptr(lora_b_weights, offset_start,
+                                           b_seq_start_loc.device)
 
     # TODO tuning this config
+    K = lora_b_weights[0].shape[-1]  # K= rank
 
-    N, K = lora_b_weights.shape[-2:]  # K= rank,N=hidden_size
-    BLOCK_M = 32
-    BLOCK_N = 32
+    BLOCK_M = 64
+    BLOCK_N = 128
     BLOCK_K = 16
     EVEN_K = K % BLOCK_K == 0
     ADD_INPUTS = add_inputs
     CAST_TYPE = False
-    if inputs.dtype == torch.float32 and lora_b_weights.dtype in [
+
+    if inputs.dtype == torch.float32 and lora_b_weights[0].dtype in [
             torch.float16,
             torch.bfloat16,
     ]:
         CAST_TYPE = True
     grid = (
-        triton.cdiv(max_seq_length, BLOCK_M) * triton.cdiv(N, BLOCK_N),
+        triton.cdiv(max_seq_length, BLOCK_M) * triton.cdiv(MAX_N, BLOCK_N),
         batches,
+        len(lora_b_weights),
     )
     _sgmv_expand_kernel[grid](
         inputs,
-        lora_b_weights,
+        lora_ptr_tensor,
         output_tensor,
-        N,
+        MAX_N,
         K,
         b_seq_start_loc,
         seq_len_tensor,
         lora_indices_tensor,
+        slice_start_tensor,
         inputs.stride(0),
         inputs.stride(1),
-        lora_b_weights.stride(0),
-        lora_b_weights.stride(1),
-        lora_b_weights.stride(2),
+        inputs.stride(2),
+        lora_strides_d0_tensor,
+        lora_strides_d1_tensor,
+        lora_strides_d2_tensor,
         output_tensor.stride(0),
         output_tensor.stride(1),
+        hidden_sizes_tensor,
         BLOCK_M,
         BLOCK_N,
         BLOCK_K,
         EVEN_K,
         ADD_INPUTS,
         CAST_TYPE,
+        len(lora_b_weights),
+        same_stride,
     )
     return
 
 
-def sgmv_expand_fake(
+def _sgmv_expand_fake(
     inputs: torch.Tensor,
-    lora_b_weights: torch.Tensor,
+    lora_b_weights: List[torch.Tensor],
     output_tensor: torch.Tensor,
     b_seq_start_loc: torch.Tensor,
     seq_len_tensor: torch.Tensor,
@@ -208,18 +259,18 @@ def sgmv_expand_fake(
     batches: int,
     max_seq_length: int,
     token_nums: int,
+    offset_start: int = 0,
     add_inputs: bool = False,
 ) -> None:
     return
 
 
 try:
-
     direct_register_custom_op(
         op_name="sgmv_expand",
         op_func=_sgmv_expand,
         mutates_args=["output_tensor"],
-        fake_impl=sgmv_expand_fake,
+        fake_impl=_sgmv_expand_fake,
     )
     sgmv_expand = torch.ops.vllm.sgmv_expand
 
diff --git a/vllm/lora/ops/sgmv_expand_slice.py b/vllm/lora/ops/sgmv_expand_slice.py
deleted file mode 100644
index 55c4fb68ed128..0000000000000
--- a/vllm/lora/ops/sgmv_expand_slice.py
+++ /dev/null
@@ -1,241 +0,0 @@
-"""
-Based on:
-Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). 
-Punica: Multi-Tenant LoRA Serving. 
-https://arxiv.org/abs/2310.18547
-"""
-
-import torch
-import triton
-import triton.language as tl
-
-from vllm.utils import direct_register_custom_op
-
-
-@triton.jit
-def _sgmv_expand_slice_kernel(
-    input_ptr,
-    lora_ptr,
-    out_ptr,
-    N,
-    K,
-    b_seq_start_loc,
-    seq_lens,
-    lora_indices,
-    xm_stride,
-    xk_stride,  # 1
-    l0_stride,  # hidden_size*max_rank
-    lora_k_stride,
-    lora_n_stride,
-    cm_stride,
-    cn_stride,
-    slice_offset,
-    BLOCK_M: tl.constexpr,
-    BLOCK_N: tl.constexpr,
-    BLOCK_K: tl.constexpr,
-    EVEN_K: tl.constexpr,
-    ADD_INPUTS: tl.constexpr,
-    CAST_TYPE: tl.constexpr,
-):
-    """
-
-    Similar to the 'sgmv_expand' operator, but with an added parameter 
-    'slice_offset'. The reason for not reusing the 'sgmv_expand' operator 
-    might be that in the future, we could implement a fusion operator to 
-    achieve the current functionality instead of having to call it multiple 
-    times.
-    """
-    pid = tl.program_id(axis=0)
-    cur_batch = tl.program_id(axis=1)
-    cta_n_num = tl.cdiv(N, BLOCK_N)
-    pid_m = pid // cta_n_num
-    pid_n = pid % cta_n_num
-    M = tl.load(seq_lens + cur_batch)
-    if pid_m * BLOCK_M > M:
-        return
-    lora_index = tl.load(lora_indices + cur_batch)
-    if lora_index == -1:
-        return
-    cur_seq_start = tl.load(b_seq_start_loc + cur_batch)
-    offset_m = tl.arange(0, BLOCK_M) + pid_m * BLOCK_M
-    offset_n = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N
-    offset_k = tl.arange(0, BLOCK_K)
-    ram = tl.max_contiguous(tl.multiple_of(offset_m % M, BLOCK_M), BLOCK_M)
-    rbn = tl.max_contiguous(tl.multiple_of(offset_n % N, BLOCK_N), BLOCK_N)
-
-    a_ptr = (input_ptr + cur_seq_start * xm_stride + ram[:, None] * xm_stride +
-             offset_k[None, :] * xk_stride, )
-    b_ptr = (lora_ptr + l0_stride * lora_index +
-             offset_k[:, None] * lora_n_stride + rbn[None, :] * lora_k_stride)
-    accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
-    for k in range(tl.cdiv(K, BLOCK_K)):
-        if EVEN_K:
-            tiled_a = tl.load(a_ptr)
-            tiled_b = tl.load(b_ptr)
-        else:
-            tiled_a = tl.load(a_ptr,
-                              mask=offset_k[None, :] < K - k * BLOCK_K,
-                              other=0)
-            tiled_b = tl.load(b_ptr,
-                              mask=offset_k[:, None] < K - k * BLOCK_K,
-                              other=0)
-        if CAST_TYPE:
-            tiled_a = tiled_a.to(lora_ptr.dtype.element_ty)
-        accumulator += tl.dot(
-            tiled_a,
-            tiled_b,
-        )
-        a_ptr += BLOCK_K * xk_stride
-        b_ptr += BLOCK_K * lora_n_stride
-    tiled_c = accumulator.to(lora_ptr.dtype.element_ty)
-    offset_cm = cur_seq_start + tl.arange(0, BLOCK_M) + pid_m * BLOCK_M
-    offset_cn = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N + slice_offset
-    c_ptr = (out_ptr + offset_cm[:, None] * cm_stride +
-             offset_cn[None, :] * cn_stride)
-    M = tl.load(seq_lens + cur_batch)
-    c_mask = (offset_cm[:, None] < (cur_seq_start + M)) & (offset_cn[None, :] <
-                                                           (slice_offset + N))
-    if ADD_INPUTS:
-        # explicitly pass in other=None to tell triton that masked values
-        # can be uninitialized. This is OK because the later tl.store operation
-        # uses the same mask, eliminating the risk of garbage values propagating
-        tiled_out = tl.load(c_ptr, mask=c_mask, other=None)
-        tiled_c += tiled_out
-    tl.store(c_ptr, tiled_c, mask=c_mask)
-
-
-@torch.inference_mode()
-def _sgmv_expand_slice(
-    inputs: torch.Tensor,
-    lora_b_weights: torch.Tensor,
-    output_tensor: torch.Tensor,
-    b_seq_start_loc: torch.Tensor,
-    seq_len_tensor: torch.Tensor,
-    lora_indices_tensor: torch.Tensor,
-    batches: int,
-    max_seq_length: int,
-    token_nums: int,
-    slice_offset: int,
-    slice_size: int,
-    add_inputs: bool = False,
-) -> None:
-    """_summary_
-
-    Args:
-        inputs (torch.Tensor): input tensor
-        lora_b_weights (torch.Tensor): lora'a weight
-        output_tensor (torch.Tensor): output tensor
-        b_seq_start_loc (torch.Tensor): (batch_size,). The cumulative
-            sequence lengths of the sequences in the batch, used to index
-            into sequence. E.g., if the sequence length is [4, 6], it is
-            [0, 4, 10].
-        seq_len_tensor (torch.Tensor): (batch_size,). Record the sequence
-            length of the sequences in the batch
-        lora_indices_tensor (torch.Tensor): (batch_size,). The LoRA index
-            corresponding to each batch. An index of -1 means no lora should be
-            applied.
-        batches (int): batch size
-        max_seq_length (int): The max sequence lengths of the sequences
-            in the batch
-        token_nums (int): The token numbers in the batch. Used to verify if the 
-            token numbers in the inputs matches the one in the metadata.
-        slice_offset (int): output_tensor's offset
-        slice_size (int): current output_tensor's size
-        add_inputs (bool, optional): Defaults to False, adds the final lora 
-            results to the output.
-    """
-
-    assert inputs.dtype in [torch.float16, torch.bfloat16, torch.float32]
-    assert lora_b_weights.dtype in [
-        torch.float16,
-        torch.bfloat16,
-    ]
-    assert inputs.size(0) == token_nums
-    assert inputs.size(1) == lora_b_weights.size(-1)
-    assert b_seq_start_loc.size(0) == batches
-    assert lora_indices_tensor.size(0) == batches
-    assert slice_size == lora_b_weights.size(-2)
-    assert inputs.is_contiguous()
-    assert output_tensor.is_contiguous()
-
-    if lora_b_weights.ndim == 4:  # shape:(lora_num,1,size,rank)
-        assert lora_b_weights.size(1) == 1
-        lora_b_weights = lora_b_weights.squeeze(dim=1)
-    else:
-        assert lora_b_weights.ndim == 3  # shape:(lora_num,size,rank)
-
-    assert lora_b_weights.is_contiguous()
-
-    # TODO tuning this config
-    N, K = lora_b_weights.shape[-2:]  # K= rank,N=hidden_size
-
-    BLOCK_M = 32
-    BLOCK_N = 32
-    BLOCK_K = 16
-    EVEN_K = K % BLOCK_K == 0
-    ADD_INPUTS = add_inputs
-    CAST_TYPE = False
-    if inputs.dtype == torch.float32 and lora_b_weights.dtype in [
-            torch.float16,
-            torch.bfloat16,
-    ]:
-        CAST_TYPE = True
-    grid = (
-        triton.cdiv(max_seq_length, BLOCK_M) * triton.cdiv(N, BLOCK_N),
-        batches,
-    )
-    _sgmv_expand_slice_kernel[grid](
-        inputs,
-        lora_b_weights,
-        output_tensor,
-        N,
-        K,
-        b_seq_start_loc,
-        seq_len_tensor,
-        lora_indices_tensor,
-        inputs.stride(0),
-        inputs.stride(1),
-        lora_b_weights.stride(0),
-        lora_b_weights.stride(1),
-        lora_b_weights.stride(2),
-        output_tensor.stride(0),
-        output_tensor.stride(1),
-        slice_offset,
-        BLOCK_M,
-        BLOCK_N,
-        BLOCK_K,
-        EVEN_K,
-        ADD_INPUTS,
-        CAST_TYPE,
-    )
-    return
-
-
-def sgmv_expand_slice_fake(
-    inputs: torch.Tensor,
-    lora_b_weights: torch.Tensor,
-    output_tensor: torch.Tensor,
-    b_seq_start_loc: torch.Tensor,
-    seq_len_tensor: torch.Tensor,
-    lora_indices_tensor: torch.Tensor,
-    batches: int,
-    max_seq_length: int,
-    token_nums: int,
-    slice_offset: int,
-    slice_size: int,
-    add_inputs: bool = False,
-) -> None:
-    return
-
-
-try:
-    direct_register_custom_op(
-        op_name="sgmv_expand_slice",
-        op_func=_sgmv_expand_slice,
-        mutates_args=["output_tensor"],
-        fake_impl=sgmv_expand_slice_fake,
-    )
-    sgmv_expand_slice = torch.ops.vllm.sgmv_expand_slice
-
-except AttributeError:
-    sgmv_expand_slice = _sgmv_expand_slice
diff --git a/vllm/lora/ops/sgmv_shrink.py b/vllm/lora/ops/sgmv_shrink.py
index 37d1dc84eebca..3d2ebe8286f56 100644
--- a/vllm/lora/ops/sgmv_shrink.py
+++ b/vllm/lora/ops/sgmv_shrink.py
@@ -5,48 +5,60 @@
 https://arxiv.org/abs/2310.18547
 """
 
+from typing import List
+
 import torch
 import triton
 import triton.language as tl
 
 from vllm.utils import direct_register_custom_op
 
+from .utils import _get_lora_a_ptr
+
 
 @triton.jit
 def _sgmv_shrink_kernel(
-    input_ptr,
-    lora_ptr,
-    out_ptr,
-    N,
-    K,
-    b_seq_start_loc,
-    seq_lens,
-    lora_indices,
-    scaling,
-    xm_stride,  # hidden_size
-    xk_stride,  # 1
-    l0_stride,  # hidden_size*max_rank
-    lora_k_stride,
-    lora_n_stride,
-    cm_stride,
-    cn_stride,
-    BLOCK_M: tl.constexpr,
-    BLOCK_N: tl.constexpr,
-    BLOCK_K: tl.constexpr,
-    EVEN_K: tl.constexpr,
-    SPLIT_K: tl.constexpr,
-):
+        input_ptr,
+        lora_ptr,  #1-3
+        out_ptr,
+        N,
+        K,
+        b_seq_start_loc,
+        seq_lens,
+        lora_indices,
+        scaling,
+        input_d0_stride,
+        input_d1_stride,  # 1
+        lora_d0_stride,
+        lora_d1_stride,
+        lora_d2_stride,  # 1
+        output_d0_stride,
+        output_d1_stride,
+        output_d2_stride,  # 1 
+        BLOCK_M: tl.constexpr,
+        BLOCK_N: tl.constexpr,
+        BLOCK_K: tl.constexpr,
+        EVEN_K: tl.constexpr,
+        SPLIT_K: tl.constexpr,
+        SLICE_NUM: tl.constexpr):
     """
     The sgmv's shrink triton kernel is based on GroupGEMM+SPLIT-K.
     The GEMM of Multi-LoRA can be considered as GroupGEMM. Additionally,
     introducing SPLIT-K can improve performance
     """
     pid = tl.program_id(axis=0)
-    pid_sk = tl.program_id(axis=1)
+    pid_mix = tl.program_id(axis=1)
     cur_batch = tl.program_id(axis=2)
     cta_n_num = tl.cdiv(N, BLOCK_N)
     pid_m = pid // cta_n_num
     pid_n = pid % cta_n_num
+    if SLICE_NUM == 1:
+        slice_id: tl.constexpr = 0
+        pid_sk = tl.program_id(axis=1)
+    else:
+        pid_mix = tl.program_id(axis=1)
+        slice_id = pid_mix // SPLIT_K
+        pid_sk = pid_mix % SPLIT_K
 
     M = tl.load(seq_lens + cur_batch)
     if pid_m * BLOCK_M > M:
@@ -61,11 +73,22 @@ def _sgmv_shrink_kernel(
 
     ram = tl.max_contiguous(tl.multiple_of(offset_m % M, BLOCK_M), BLOCK_M)
     rbn = tl.max_contiguous(tl.multiple_of(offset_n % N, BLOCK_N), BLOCK_N)
+    # input ptr
+    a_ptr = (input_ptr + cur_seq_start * input_d0_stride +
+             ram[:, None] * input_d0_stride +
+             offset_k[None, :] * input_d1_stride)
+
+    if SLICE_NUM == 1:
+        # current lora ptr
+        cur_lora_ptr = lora_ptr
+    else:
+        # current lora ptr
+        cur_lora_ptr = tl.load(lora_ptr + slice_id).to(
+            tl.pointer_type(input_ptr.dtype.element_ty))
 
-    a_ptr = (input_ptr + cur_seq_start * xm_stride + ram[:, None] * xm_stride +
-             offset_k[None, :] * xk_stride)
-    b_ptr = (lora_ptr + l0_stride * lora_index + rbn[None, :] * lora_k_stride +
-             offset_k[:, None] * lora_n_stride)
+    b_ptr = (cur_lora_ptr + lora_d0_stride * lora_index +
+             rbn[None, :] * lora_d1_stride +
+             offset_k[:, None] * lora_d2_stride)
 
     accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
     for k in range(0, tl.cdiv(K, BLOCK_K * SPLIT_K)):
@@ -82,13 +105,15 @@ def _sgmv_shrink_kernel(
                               other=0.0)
         accumulator += tl.dot(tiled_a, tiled_b)
 
-        a_ptr += BLOCK_K * SPLIT_K * xk_stride
-        b_ptr += BLOCK_K * SPLIT_K * lora_n_stride
+        a_ptr += BLOCK_K * SPLIT_K * input_d1_stride
+        b_ptr += BLOCK_K * SPLIT_K * lora_d2_stride
     offset_cm = cur_seq_start + tl.arange(0, BLOCK_M) + pid_m * BLOCK_M
 
     offset_cn = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N
-    c_ptr = (out_ptr + offset_cm[:, None] * cm_stride +
-             offset_cn[None, :] * cn_stride)
+    cur_out_ptr = (out_ptr if SLICE_NUM == 1 else out_ptr +
+                   slice_id * output_d0_stride)
+    c_ptr = cur_out_ptr + offset_cm[:, None] * output_d1_stride + offset_cn[
+        None, :] * output_d2_stride
     c_mask = (offset_cm[:, None] <
               (cur_seq_start + M)) & (offset_cn[None, :] < N)
     accumulator *= scaling
@@ -102,7 +127,7 @@ def _sgmv_shrink_kernel(
 @torch.inference_mode()
 def _sgmv_shrink(
     inputs: torch.Tensor,
-    lora_a_weights: torch.Tensor,
+    lora_a_weights: List[torch.Tensor],
     output_tensor: torch.Tensor,
     b_seq_start_loc: torch.Tensor,
     seq_len_tensor: torch.Tensor,
@@ -113,10 +138,9 @@ def _sgmv_shrink(
     scaling: float,
 ) -> None:
     """
-
     Args:
         inputs (torch.Tensor): input tensor
-        lora_a_weights (torch.Tensor): lora'a weight
+        lora_a_weights (List[torch.Tensor]): lora'a weight
         output_tensor (torch.Tensor): output tensor
         b_seq_start_loc (torch.Tensor): (batch_size,). The cumulative
             sequence lengths of the sequences in the batch, used to index
@@ -134,27 +158,21 @@ def _sgmv_shrink(
             token numbers in the inputs matches the one in the metadata.
         scaling (float): Scaling factor.
     """
-    assert inputs.dtype == lora_a_weights.dtype
+    assert inputs.dtype == lora_a_weights[0].dtype
     assert inputs.dtype in [torch.float16, torch.bfloat16]
-    assert lora_a_weights.dtype in [
-        torch.float16,
-        torch.bfloat16,
-    ]
+    for weight in lora_a_weights:
+        assert weight.dtype in [torch.float16, torch.bfloat16]
+
     assert inputs.size(0) == token_nums
-    assert inputs.size(1) == lora_a_weights.size(-1)
+    assert inputs.size(1) == lora_a_weights[0].size(-1)
     assert b_seq_start_loc.size(0) == batches
     assert lora_indices_tensor.size(0) == batches
     assert inputs.is_contiguous()
-
-    if lora_a_weights.ndim == 4:  # shape:(lora_num,1,rank, size)
-        assert lora_a_weights.size(1) == 1
-        lora_a_weights = lora_a_weights.squeeze(dim=1)
-    else:
-        assert lora_a_weights.ndim == 3  # shape:(lora_num,rank, size)
-    assert lora_a_weights.is_contiguous()
     assert output_tensor.is_contiguous()
+    (lora_ptr_tensor, lora_strides_d0, lora_strides_d1,
+     lora_strides_d2) = _get_lora_a_ptr(lora_a_weights, b_seq_start_loc.device)
     # TODO tuning this config
-    N, K = lora_a_weights.shape[-2:]  # K=hidden_size,N=rank
+    N, K = lora_a_weights[0].shape[-2:]  # K=hidden_size,N=rank
     BLOCK_M = 32
     BLOCK_N = 16
     BLOCK_K = 32
@@ -162,13 +180,12 @@ def _sgmv_shrink(
     EVEN_K = K % (BLOCK_K * SPLIT_K) == 0
     grid = (
         triton.cdiv(max_seq_length, BLOCK_M) * triton.cdiv(N, BLOCK_N),
-        SPLIT_K,
+        SPLIT_K * len(lora_a_weights),
         batches,
     )
-
     _sgmv_shrink_kernel[grid](
         inputs,
-        lora_a_weights,
+        lora_ptr_tensor,
         output_tensor,
         N,
         K,
@@ -178,23 +195,25 @@ def _sgmv_shrink(
         scaling,
         inputs.stride(0),
         inputs.stride(1),
-        lora_a_weights.stride(0),
-        lora_a_weights.stride(1),
-        lora_a_weights.stride(2),
+        lora_strides_d0,
+        lora_strides_d1,
+        lora_strides_d2,
         output_tensor.stride(0),
         output_tensor.stride(1),
+        output_tensor.stride(2),
         BLOCK_M,
         BLOCK_N,
         BLOCK_K,
         EVEN_K,
         SPLIT_K,
+        len(lora_a_weights),
     )
     return
 
 
 def sgmv_shrink_fake(
     inputs: torch.Tensor,
-    lora_a_weights: torch.Tensor,
+    lora_a_weights: List[torch.Tensor],
     output_tensor: torch.Tensor,
     b_seq_start_loc: torch.Tensor,
     seq_len_tensor: torch.Tensor,
diff --git a/vllm/lora/ops/utils.py b/vllm/lora/ops/utils.py
index 7c3e27313ad97..7df5bc2c225e5 100644
--- a/vllm/lora/ops/utils.py
+++ b/vllm/lora/ops/utils.py
@@ -1,5 +1,7 @@
 import functools
-from typing import Dict
+from typing import Dict, List, Tuple
+
+import torch
 
 
 @functools.lru_cache
@@ -44,3 +46,120 @@ def get_lora_op_configs(op_type: str, batch: int,
     if not config:
         config = _get_default_config(op_type, batch, hidden_size)
     return config
+
+
+_LORA_A_PTR_DICT: Dict[Tuple[int, ...], Tuple[torch.tensor, ...]] = {}
+_LORA_B_PTR_DICT: Dict[Tuple[int, ...], Tuple[torch.tensor, ...]] = {}
+
+
+def _get_lora_a_ptr(lora_a_weights: List[torch.Tensor], device: str):
+    """
+    `_LORA_A_PTR_DICT` collects the required information during `profile_run`, 
+    After this, it remains constant and subsequent usage is through LUT.
+    Refer to: 
+    https://github.com/triton-lang/triton/blob/release/3.1.x/python/tutorials/08-grouped-gemm.py
+    """
+    key = tuple(lora_weight.data_ptr() for lora_weight in lora_a_weights)
+
+    if values := _LORA_A_PTR_DICT.get(key):
+        return values
+
+    lora_strides_d0 = []
+    lora_strides_d1 = []
+    lora_strides_d2 = []
+    tensor_ptrs = []
+    for lora_a_weight in lora_a_weights:
+        if lora_a_weight.ndim == 4:  # shape:(lora_num,1,size,rank)
+            assert lora_a_weight.size(1) == 1
+            lora_a_weight = lora_a_weight.squeeze(dim=1)
+        else:
+            assert lora_a_weight.ndim == 3  # shape:(lora_num,size,rank)
+        assert lora_a_weight.is_contiguous()
+        tensor_ptrs.append(lora_a_weight.data_ptr())
+        lora_strides_d0.append(lora_a_weight.stride(0))
+        lora_strides_d1.append(lora_a_weight.stride(1))
+        lora_strides_d2.append(lora_a_weight.stride(2))
+    if len(lora_a_weights) > 1:
+        lora_ptr_tensor = torch.tensor(tensor_ptrs, device=device)
+    else:
+        lora_ptr_tensor = lora_a_weights[0]
+
+    if (len(set(lora_strides_d0)) > 1 or len(set(lora_strides_d1)) > 1
+            or len(set(lora_strides_d2)) > 1):
+        raise ValueError("All LoRA weights must have the same stride.")
+
+    _LORA_A_PTR_DICT[key] = (
+        lora_ptr_tensor,
+        lora_strides_d0[0],
+        lora_strides_d1[0],
+        lora_strides_d2[0],
+    )
+    return _LORA_A_PTR_DICT.get(key)
+
+
+def _get_lora_b_ptr(lora_weights: List[torch.Tensor], offset_start: int,
+                    device: str):
+    """ 
+     `_LORA_B_PTR_DICT` collects the required information during `profile_run`, 
+    After this, it remains constant and subsequent usage is through LUT.
+    Refer to: 
+    https://github.com/triton-lang/triton/blob/release/3.1.x/python/tutorials/08-grouped-gemm.py
+
+    """
+
+    key = tuple(lora_weight.data_ptr() for lora_weight in lora_weights)
+    if values := _LORA_B_PTR_DICT.get(key):
+        return values
+    slice_offset_lst = []
+    tensor_ptrs = []
+    lora_strides_d0 = []
+    lora_strides_d1 = []
+    lora_strides_d2 = []
+    hidden_sizes = []
+    slice_offset = offset_start
+    for lora_b_weight in lora_weights:
+        if lora_b_weight.ndim == 4:  # shape:(lora_num,1,size,rank)
+            assert lora_b_weight.size(1) == 1
+            lora_b_weight = lora_b_weight.squeeze(dim=1)
+        else:
+            assert lora_b_weight.ndim == 3  # shape:(lora_num,size,rank)
+        assert lora_b_weight.is_contiguous()
+        tensor_ptrs.append(lora_b_weight.data_ptr())
+        lora_strides_d0.append(lora_b_weight.stride(0))
+        lora_strides_d1.append(lora_b_weight.stride(1))
+        lora_strides_d2.append(lora_b_weight.stride(2))
+        slice_offset_lst.append(slice_offset)
+        slice_offset += lora_b_weight.size(1)
+        hidden_sizes.append(lora_b_weight.size(1))
+
+    if len(lora_weights) > 1:
+        # note these are device tensors
+        lora_ptr_tensor = torch.tensor(tensor_ptrs, device=device)
+        slice_start_tensor = torch.tensor(slice_offset_lst, device=device)
+    else:
+        slice_start_tensor = slice_offset_lst[0]
+        lora_ptr_tensor = lora_b_weight[0]
+
+    # If each lora has the same stride, there's no need to use a
+    # tensor for storage.
+    if (len(set(lora_strides_d0)) == 1 and len(set(lora_strides_d1)) == 1 and
+            len(set(lora_strides_d2)) == 1) and len(set(hidden_sizes)) == 1:
+        lora_strides_d0_tensor = lora_strides_d0[0]
+        lora_strides_d1_tensor = lora_strides_d1[0]
+        lora_strides_d2_tensor = lora_strides_d2[0]
+        hidden_sizes_tensor = hidden_sizes[0]
+        same_stride = True
+
+    else:
+        lora_strides_d0_tensor = torch.tensor(lora_strides_d0, device=device)
+        lora_strides_d1_tensor = torch.tensor(lora_strides_d1, device=device)
+        lora_strides_d2_tensor = torch.tensor(lora_strides_d2, device=device)
+        hidden_sizes_tensor = torch.tensor(hidden_sizes, device=device)
+        same_stride = False
+    # MAX_N is the maximum hidden size among all the lora_b weights
+    MAX_N = max(hidden_sizes)
+    _LORA_B_PTR_DICT[key] = (slice_start_tensor, lora_ptr_tensor,
+                             lora_strides_d0_tensor, lora_strides_d1_tensor,
+                             lora_strides_d2_tensor, hidden_sizes_tensor,
+                             same_stride, MAX_N)
+    return _LORA_B_PTR_DICT.get(key)
diff --git a/vllm/lora/punica_wrapper/punica_gpu.py b/vllm/lora/punica_wrapper/punica_gpu.py
index de378df8b3cfa..278f7b5a8e9f4 100644
--- a/vllm/lora/punica_wrapper/punica_gpu.py
+++ b/vllm/lora/punica_wrapper/punica_gpu.py
@@ -5,7 +5,7 @@
 https://arxiv.org/abs/2310.18547
 """
 
-from typing import Callable, Optional, Tuple, Union, final
+from typing import Optional, Tuple, Union, final
 
 import torch
 
@@ -16,7 +16,6 @@
     from vllm.lora.ops.bgmv_expand_slice import bgmv_expand_slice
     from vllm.lora.ops.bgmv_shrink import bgmv_shrink
     from vllm.lora.ops.sgmv_expand import sgmv_expand
-    from vllm.lora.ops.sgmv_expand_slice import sgmv_expand_slice
     from vllm.lora.ops.sgmv_shrink import sgmv_shrink
 
 from .punica_base import PunicaWrapperBase
@@ -35,11 +34,11 @@ def __init__(self, max_num_batched_tokens: int, max_batches: int,
         PunicaWrapperBase.__init__(self, max_num_batched_tokens, max_batches,
                                    device)
 
-    def _shrink_prefill(
+    def _apply_shrink_prefill(
         self,
         y: torch.Tensor,
         x: torch.Tensor,
-        w_t_all: torch.Tensor,
+        w_t_all: Tuple[torch.Tensor, ...],
         scale: float,
     ):
         #No LoRA request, so return directly
@@ -53,7 +52,7 @@ def _shrink_prefill(
             scale,
         )
 
-    def _shrink_decode(
+    def _apply_shrink_decode(
         self,
         y: torch.Tensor,
         x: torch.Tensor,
@@ -62,56 +61,28 @@ def _shrink_decode(
     ):
         bgmv_shrink(x, w_t_all, y, self.token_lora_indices, scale)
 
-    def _expand_prefill(
+    def _apply_expand_prefill(
         self,
         y: torch.Tensor,
         x: torch.Tensor,
         w_t_all: torch.Tensor,
+        offset_start: int,
         add_inputs: bool,
     ):
         #No LoRA request, so return directly
         if self.no_lora:
             return
-        sgmv_expand(
-            x,
-            w_t_all,
-            y,
-            *self.prefill_metadata,
-            add_inputs,
-        )
-
-    def _expand_decode(
-        self,
-        y: torch.Tensor,
-        x: torch.Tensor,
-        w_t_all: torch.Tensor,
-        add_inputs: bool,
-    ):
-        bgmv_expand(x, w_t_all, y, self.token_lora_indices, add_inputs)
 
-    def _expand_slice_prefill(
-        self,
-        y: torch.Tensor,
-        x: torch.Tensor,
-        w_t_all: torch.Tensor,
-        y_offset: Optional[int],
-        y_slice_size: Optional[int],
-        add_inputs: bool,
-    ):
-        #No LoRA request, so return directly
-        if self.no_lora:
-            return
-        sgmv_expand_slice(
+        sgmv_expand(
             x,
             w_t_all,
             y,
             *self.prefill_metadata,
-            y_offset,
-            y_slice_size,
-            add_inputs,
+            offset_start=offset_start,
+            add_inputs=add_inputs,
         )
 
-    def _expand_slice_decode(
+    def _apply_expand_decode(
         self,
         y: torch.Tensor,
         x: torch.Tensor,
@@ -123,43 +94,6 @@ def _expand_slice_decode(
         bgmv_expand_slice(x, w_t_all, y, self.token_lora_indices, y_offset,
                           y_slice_size, add_inputs)
 
-    def _apply_expand(
-        self,
-        y: torch.Tensor,
-        x: torch.Tensor,
-        w_t_all: torch.Tensor,
-        y_offset: Optional[int],
-        y_slice_size: Optional[int],
-        add_inputs: bool = True,
-    ):
-        """
-        Perform the ` y[:,y_offset:y_offset+y_slice_size]+=x@w_t_all` 
-        computation, which is suitable for the
-        GEMM of lora'b.
-        """
-
-        expand_slice_fun: Callable = (self._expand_slice_prefill
-                                      if self.is_prefill else
-                                      self._expand_slice_decode)
-        expand_slice_fun(y, x, w_t_all, y_offset, y_slice_size, add_inputs)
-
-    def _apply_shrink(self, y: torch.Tensor, x: torch.Tensor,
-                      w_t_all: torch.Tensor, scale: float):
-        """
-        Perform the ` y+=x@w_t_all` computation, which is suitable for the
-        GEMM of lora'a.
-        When `is_prefill is` true, it indicates that it is currently the
-        prefill stage, and the `_shrink_prefill` function should be called.
-        Otherwise, it is the decode stage, and the _shrink_decode function
-        should be called.
-        """
-        y_org = y
-        y = y.view(-1, y.shape[-1])
-        shrink_fun: Callable = (self._shrink_prefill
-                                if self.is_prefill else self._shrink_decode)
-        shrink_fun(y, x, w_t_all, scale)
-        y = y.view_as(y_org)
-
     def add_shrink(self, y: Union[Tuple[torch.Tensor, ...], torch.Tensor],
                    x: torch.Tensor, lora_a_stacked: Tuple[torch.Tensor, ...],
                    scale: float, **kwargs):
@@ -182,10 +116,15 @@ def add_shrink(self, y: Union[Tuple[torch.Tensor, ...], torch.Tensor],
         """
 
         x = x.view(-1, x.shape[-1])
-        # TODO fuse these kernels
-        for slice_idx in range(len(lora_a_stacked)):
-            self._apply_shrink(y[slice_idx], x, lora_a_stacked[slice_idx],
-                               scale)
+
+        if self.is_prefill:
+            # NOTE fused kernel
+            self._apply_shrink_prefill(y, x, lora_a_stacked, scale)
+        else:
+            # TODO fuse these kernels
+            for slice_idx in range(len(lora_a_stacked)):
+                self._apply_shrink_decode(y[slice_idx], x,
+                                          lora_a_stacked[slice_idx], scale)
 
     def add_expand(self,
                    y: torch.Tensor,
@@ -217,20 +156,28 @@ def add_expand(self,
         """
         y_org = y
         y = y.view(-1, y.shape[-1])
-        offset_left = offset_start
         if lora_bias_stacked is not None:
             self._apply_bias(self.token_lora_indices, y, output_slices,
                              lora_bias_stacked)
-        for slice_idx in range(len(lora_b_stacked)):
-            self._apply_expand(
-                y,
-                x[slice_idx],
-                lora_b_stacked[slice_idx],
-                offset_left,
-                output_slices[slice_idx],
-                add_inputs=add_inputs,
-            )
-            offset_left += output_slices[slice_idx]
+        if self.is_prefill:
+            # NOTE fused kernel
+            self._apply_expand_prefill(y,
+                                       x,
+                                       lora_b_stacked,
+                                       offset_start,
+                                       add_inputs=True)
+        else:
+            # TODO fuse these kernels
+            for slice_idx in range(len(lora_b_stacked)):
+                self._apply_expand_decode(
+                    y,
+                    x[slice_idx],
+                    lora_b_stacked[slice_idx],
+                    offset_start,
+                    output_slices[slice_idx],
+                    add_inputs=add_inputs,
+                )
+                offset_start += output_slices[slice_idx]
         y = y.view_as(y_org)
 
     def add_lora_embedding(self,
@@ -252,10 +199,18 @@ def add_lora_embedding(self,
             add_inputs (bool): Default to True.
         """
 
-        # Embedding layer only need expand op
-        expand_fun: Callable = (self._expand_prefill
-                                if self.is_prefill else self._expand_decode)
-        expand_fun(y, x, lora_b_stacked, add_inputs)
+        if self.is_prefill:
+            sgmv_expand(
+                x.unsqueeze(dim=0),
+                [lora_b_stacked],
+                y,
+                *self.prefill_metadata,
+                offset_start=0,
+                add_inputs=add_inputs,
+            )
+        else:
+            bgmv_expand(x, lora_b_stacked, y, self.token_lora_indices,
+                        add_inputs)
 
     def add_lora_linear(self,
                         y: torch.Tensor,
@@ -301,10 +256,11 @@ def add_lora_linear(self,
             r = lora_b_stacked[0].size(-1)
             # We set the buffer to be float32 by default ,refer to:
             # https://github.com/triton-lang/triton/issues/1387
-            buffer = tuple(
-                torch.zeros(
-                    (x.size(0), r), dtype=torch.float32, device=x.device)
-                for _ in range(len(output_slices)))
+            buffer = torch.zeros(
+                (len(output_slices), x.size(0), r),
+                dtype=torch.float32,
+                device=x.device,
+            )
         self.add_shrink(buffer, x, lora_a_stacked, scale, **kwargs)
         self.add_expand(y,
                         buffer,

From 0f3f3c86ec44467fa80b60cb9f971f9ede028f76 Mon Sep 17 00:00:00 2001
From: Roger Wang <136131678+ywang96@users.noreply.github.com>
Date: Mon, 6 Jan 2025 20:36:24 -0800
Subject: [PATCH 101/462] [Bugfix] Update attention interface in `Whisper`
 (#11784)

Signed-off-by: Roger Wang <ywang@roblox.com>
---
 vllm/model_executor/models/whisper.py | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py
index cb54b4c3ba663..c1f3bb0ca33c2 100644
--- a/vllm/model_executor/models/whisper.py
+++ b/vllm/model_executor/models/whisper.py
@@ -106,6 +106,7 @@ def __init__(
             cache_config=cache_config,
             quant_config=quant_config,
             prefix=f"{prefix}.attn",
+            attn_type=self.attn_type,
         )
 
     def _init_qkv(
@@ -134,12 +135,7 @@ def forward(
         qkv, _ = self.qkv_proj(hidden_states)
         q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
 
-        attn_output = self.attn(q,
-                                k,
-                                v,
-                                kv_cache,
-                                attn_metadata,
-                                attn_type=self.attn_type)
+        attn_output = self.attn(q, k, v, kv_cache, attn_metadata)
 
         output, _ = self.out_proj(attn_output)
 
@@ -164,6 +160,7 @@ def __init__(
             cache_config=cache_config,
             quant_config=quant_config,
             prefix=prefix,
+            attn_type=AttentionType.ENCODER_DECODER,
         )
 
     def _init_qkv(
@@ -207,12 +204,13 @@ def forward(
         else:
             k = v = None
 
-        attn_output = self.attn(q,
-                                k,
-                                v,
-                                kv_cache,
-                                attn_metadata,
-                                attn_type=AttentionType.ENCODER_DECODER)
+        attn_output = self.attn(
+            q,
+            k,
+            v,
+            kv_cache,
+            attn_metadata,
+        )
 
         output, _ = self.out_proj(attn_output)
 
@@ -734,4 +732,4 @@ def load_weights(self, weights: Iterable[Tuple[str,
         loaded_weights = [(name, loaded_weight)
                           for name, loaded_weight in weights]
         mapper = WeightsMapper({".fc1.": ".mlp.fc1.", ".fc2.": ".mlp.fc2."})
-        return loader.load_weights(loaded_weights, mapper=mapper)
\ No newline at end of file
+        return loader.load_weights(loaded_weights, mapper=mapper)

From 898cdf033e31dc28042f7181b1565c78d905196e Mon Sep 17 00:00:00 2001
From: Liangfu Chen <liangfc@amazon.com>
Date: Mon, 6 Jan 2025 21:36:10 -0800
Subject: [PATCH 102/462] [CI] Fix neuron CI and run offline tests (#11779)

Signed-off-by: Liangfu Chen <liangfc@amazon.com>
---
 .buildkite/run-neuron-test.sh        | 53 ++++++++++++++--------------
 Dockerfile.neuron                    |  8 +++--
 examples/offline_inference_neuron.py | 11 ++----
 3 files changed, 35 insertions(+), 37 deletions(-)

diff --git a/.buildkite/run-neuron-test.sh b/.buildkite/run-neuron-test.sh
index 9259391aaed49..aa29c434e7cfb 100644
--- a/.buildkite/run-neuron-test.sh
+++ b/.buildkite/run-neuron-test.sh
@@ -3,6 +3,18 @@
 # This script build the Neuron docker image and run the API server inside the container.
 # It serves a sanity check for compilation and basic model usage.
 set -e
+set -v
+
+image_name="neuron/vllm-ci"
+container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
+
+HF_CACHE="$(realpath ~)/huggingface"
+mkdir -p "${HF_CACHE}"
+HF_MOUNT="/root/.cache/huggingface"
+
+NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache"
+mkdir -p "${NEURON_COMPILE_CACHE_URL}"
+NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache"
 
 # Try building the docker image
 aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com
@@ -13,41 +25,30 @@ if [ -f /tmp/neuron-docker-build-timestamp ]; then
     last_build=$(cat /tmp/neuron-docker-build-timestamp)
     current_time=$(date +%s)
     if [ $((current_time - last_build)) -gt 86400 ]; then
+        docker image prune -f
         docker system prune -f
+        rm -rf "${HF_MOUNT:?}/*"
+        rm -rf "${NEURON_COMPILE_CACHE_MOUNT:?}/*"
         echo "$current_time" > /tmp/neuron-docker-build-timestamp
     fi
 else
     date "+%s" > /tmp/neuron-docker-build-timestamp
 fi
 
-docker build -t neuron -f Dockerfile.neuron .
+docker build -t "${image_name}" -f Dockerfile.neuron .
 
 # Setup cleanup
-remove_docker_container() { docker rm -f neuron || true; }
+remove_docker_container() {
+    docker image rm -f "${image_name}" || true;
+}
 trap remove_docker_container EXIT
-remove_docker_container
 
 # Run the image
-docker run --device=/dev/neuron0 --device=/dev/neuron1 --network host --name neuron neuron python3 -m vllm.entrypoints.api_server \
-       --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --max-num-seqs 8 --max-model-len 128 --block-size 128 --device neuron --tensor-parallel-size 2 &
-
-# Wait for the server to start
-wait_for_server_to_start() {
-    timeout=300
-    counter=0
-
-    while [ "$(curl -s -o /dev/null -w '%{http_code}' localhost:8000/health)" != "200" ]; do
-        sleep 1
-        counter=$((counter + 1))
-        if [ $counter -ge $timeout ]; then
-            echo "Timeout after $timeout seconds"
-            break
-        fi
-    done
-}
-wait_for_server_to_start
-
-# Test a simple prompt
-curl -X POST -H "Content-Type: application/json" \
-    localhost:8000/generate \
-    -d '{"prompt": "San Francisco is a"}'
+docker run --rm -it --device=/dev/neuron0 --device=/dev/neuron1 --network host \
+       -v "${HF_CACHE}:${HF_MOUNT}" \
+       -e "HF_HOME=${HF_MOUNT}" \
+       -v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
+       -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
+       --name "${container_name}" \
+       ${image_name} \
+       /bin/bash -c "python3 /workspace/vllm/examples/offline_inference_neuron.py"
diff --git a/Dockerfile.neuron b/Dockerfile.neuron
index 269139fe90f0b..e9cb82889decd 100644
--- a/Dockerfile.neuron
+++ b/Dockerfile.neuron
@@ -15,8 +15,8 @@ RUN apt-get update && \
         ffmpeg libsm6 libxext6 libgl1
 
 ### Mount Point ###
-# When launching the container, mount the code directory to /app
-ARG APP_MOUNT=/app
+# When launching the container, mount the code directory to /workspace
+ARG APP_MOUNT=/workspace
 VOLUME [ ${APP_MOUNT} ]
 WORKDIR ${APP_MOUNT}/vllm
 
@@ -25,6 +25,7 @@ RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
 RUN python3 -m pip install sentencepiece transformers==4.45.2 -U
 RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
 RUN python3 -m pip install neuronx-cc==2.16.345.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
+RUN python3 -m pip install pytest
 
 COPY . .
 ARG GIT_REPO_CHECK=0
@@ -42,4 +43,7 @@ RUN --mount=type=bind,source=.git,target=.git \
 # install development dependencies (for testing)
 RUN python3 -m pip install -e tests/vllm_test_utils
 
+# overwrite entrypoint to run bash script
+RUN echo "import subprocess; import sys; subprocess.check_call(sys.argv[1:])" > /usr/local/bin/dockerd-entrypoint.py
+
 CMD ["/bin/bash"]
diff --git a/examples/offline_inference_neuron.py b/examples/offline_inference_neuron.py
index 2856be7c864ea..f098c8e5fed1e 100644
--- a/examples/offline_inference_neuron.py
+++ b/examples/offline_inference_neuron.py
@@ -1,12 +1,5 @@
-import os
-
 from vllm import LLM, SamplingParams
 
-# creates XLA hlo graphs for all the context length buckets.
-os.environ['NEURON_CONTEXT_LENGTH_BUCKETS'] = "128,512,1024,2048"
-# creates XLA hlo graphs for all the token gen buckets.
-os.environ['NEURON_TOKEN_GEN_BUCKETS'] = "128,512,1024,2048"
-
 # Sample prompts.
 prompts = [
     "Hello, my name is",
@@ -26,8 +19,8 @@
     # Currently, this is a known limitation in continuous batching support
     # in transformers-neuronx.
     # TODO(liangfu): Support paged-attention in transformers-neuronx.
-    max_model_len=2048,
-    block_size=2048,
+    max_model_len=1024,
+    block_size=1024,
     # The device can be automatically detected when AWS Neuron SDK is installed.
     # The device argument can be either unspecified for automated detection,
     # or explicitly assigned.

From e512f76a898d61b8857b36b138bb9cf93ea04d03 Mon Sep 17 00:00:00 2001
From: XiaobingZhang <xiaobingzhangupc@gmail.com>
Date: Tue, 7 Jan 2025 14:12:48 +0800
Subject: [PATCH 103/462] fix init error for MessageQueue when n_local_reader
 is zero (#11768)

---
 vllm/distributed/device_communicators/shm_broadcast.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/distributed/device_communicators/shm_broadcast.py b/vllm/distributed/device_communicators/shm_broadcast.py
index 9f97b0f01ad8a..4ced991f62f66 100644
--- a/vllm/distributed/device_communicators/shm_broadcast.py
+++ b/vllm/distributed/device_communicators/shm_broadcast.py
@@ -247,7 +247,8 @@ def __init__(
         self.handle = Handle(
             connect_ip=connect_ip,
             local_reader_ranks=local_reader_ranks,
-            buffer_handle=self.buffer.handle(),
+            buffer_handle=self.buffer.handle()
+            if self.buffer is not None else None,
             local_subscribe_port=local_subscribe_port,
             remote_subscribe_port=remote_subscribe_port,
         )

From ce1917fcf211458dfbe6bb86d6a9d2d9bd346e63 Mon Sep 17 00:00:00 2001
From: Russell Bryant <rbryant@redhat.com>
Date: Tue, 7 Jan 2025 01:57:32 -0500
Subject: [PATCH 104/462] [Doc] Create a vulnerability management team (#9925)

Signed-off-by: Russell Bryant <rbryant@redhat.com>
---
 SECURITY.md                                   |  2 +-
 .../contributing/vulnerability_management.md  | 43 +++++++++++++++++++
 docs/source/index.md                          |  1 +
 3 files changed, 45 insertions(+), 1 deletion(-)
 create mode 100644 docs/source/contributing/vulnerability_management.md

diff --git a/SECURITY.md b/SECURITY.md
index ad3f1f16ab560..de0032d26c87b 100644
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -4,7 +4,7 @@
 
 If you believe you have found a security vulnerability in vLLM, we encourage you to let us know right away. We will investigate all legitimate reports and do our best to quickly fix the problem.
 
-Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new).
+Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new). Reports will then be triaged by the [vulnerability management team](https://docs.vllm.ai/contributing/vulnerability_management/).
 
 ---
 
diff --git a/docs/source/contributing/vulnerability_management.md b/docs/source/contributing/vulnerability_management.md
new file mode 100644
index 0000000000000..422dc13e6a644
--- /dev/null
+++ b/docs/source/contributing/vulnerability_management.md
@@ -0,0 +1,43 @@
+# Vulnerability Management
+
+## Reporting Vulnerabilities
+
+As mentioned in the [security
+policy](https://github.com/vllm-project/vllm/tree/main/SECURITY.md), security
+vulnerabilities may be reported privately to the project via
+[GitHub](https://github.com/vllm-project/vllm/security/advisories/new).
+
+## Vulnerability Management Team
+
+Once a vulnerability has been reported to the project, the Vulnerability
+Management Team (VMT) is responsible for managing the vulnerability. The VMT is
+responsible for:
+
+- Triaging the vulnerability.
+- Coordinating with reporters and project maintainers on vulnerability analysis
+  and resolution.
+- Drafting of security advisories for confirmed vulnerabilities, as appropriate.
+- Coordination with project maintainers on a coordinated release of the fix and
+  security advisory.
+
+### Security Advisories
+
+Advisories are published via GitHub through the same system used to report
+vulnerabilities. More information on the process can be found in the [GitHub
+documentation](https://docs.github.com/en/code-security/security-advisories/working-with-repository-security-advisories/about-repository-security-advisories).
+
+### Team Members
+
+We prefer to keep all vulnerability-related communication on the security report
+on GitHub. However, if you need to contact the VMT directly for an urgent issue,
+you may contact the following individuals:
+
+- Simon Mo - simon.mo@hey.com
+- Russell Bryant - rbryant@redhat.com
+
+## Slack Discussion
+
+You may use the `#security` channel in the [VLLM Slack](https://slack.vllm.ai)
+to discuss security-related topics. However, please do not disclose any
+vulnerabilities in this channel. If you need to report a vulnerability, please
+use the GitHub security advisory system or contact a VMT member privately.
diff --git a/docs/source/index.md b/docs/source/index.md
index c335155bd6e14..11d3e24a9b60a 100644
--- a/docs/source/index.md
+++ b/docs/source/index.md
@@ -169,6 +169,7 @@ contributing/overview
 contributing/profiling/profiling_index
 contributing/dockerfile/dockerfile
 contributing/model/index
+contributing/vulnerability_management
 ```
 
 # Indices and tables

From 1e4ce295ae70771f8e0eaa50962b3dda29c3f0d6 Mon Sep 17 00:00:00 2001
From: Yuan <yuan.zhou@intel.com>
Date: Tue, 7 Jan 2025 15:28:01 +0800
Subject: [PATCH 105/462] [CI][CPU] adding build number to docker image name
 (#11788)

Signed-off-by: Yuan Zhou <yuan.zhou@intel.com>
---
 .buildkite/run-cpu-test.sh | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh
index 4f1729d46dae2..a4eca078568fd 100644
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -9,31 +9,31 @@ CORE_RANGE=${CORE_RANGE:-48-95}
 NUMA_NODE=${NUMA_NODE:-1}
 
 # Try building the docker image
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test -f Dockerfile.cpu .
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu .
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test-"$BUILDKITE_BUILD_NUMBER" -f Dockerfile.cpu .
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 -f Dockerfile.cpu .
 
 # Setup cleanup
-remove_docker_container() { docker rm -f cpu-test-"$NUMA_NODE" cpu-test-avx2-"$NUMA_NODE" || true; }
+remove_docker_container() { docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; }
 trap remove_docker_container EXIT
 remove_docker_container
 
 # Run the image, setting --shm-size=4g for tensor parallel.
 docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE"  \
- --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test
+ --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
 docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
- --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2-"$NUMA_NODE" cpu-test-avx2
+ --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2
 
 function cpu_tests() {
   set -e
   export NUMA_NODE=$2
 
   # offline inference
-  docker exec cpu-test-avx2-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
     set -e
     python3 examples/offline_inference.py"
 
   # Run basic model test
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
     set -e
     pip install pytest pytest-asyncio \
       decord einops librosa peft Pillow sentence-transformers soundfile \
@@ -46,26 +46,26 @@ function cpu_tests() {
     pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"
 
   # Run compressed-tensor test
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
     set -e
     pytest -s -v \
     tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
     tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
 
   # Run AWQ test
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
     set -e
     pytest -s -v \
     tests/quantization/test_ipex_quant.py"
 
   # Run chunked-prefill and prefix-cache test
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
     set -e
     pytest -s -v -k cpu_model \
     tests/basic_correctness/test_chunked_prefill.py"  
 
   # online inference
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
     set -e
     export VLLM_CPU_KVCACHE_SPACE=10 
     export VLLM_CPU_OMP_THREADS_BIND=$1

From 8082ad7950ad96fdc15e6b5a42e8098dd7087f6f Mon Sep 17 00:00:00 2001
From: Roger Wang <136131678+ywang96@users.noreply.github.com>
Date: Tue, 7 Jan 2025 01:55:39 -0800
Subject: [PATCH 106/462] [V1][Doc] Update V1 support for `LLaVa-NeXT-Video`
 (#11798)

Signed-off-by: Roger Wang <ywang@roblox.com>
---
 docs/source/models/supported_models.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md
index 590bea992d1fc..7777545b8b3c1 100644
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -640,7 +640,7 @@ See [this page](#generative-models) for more information on how to use generativ
   - `llava-hf/LLaVA-NeXT-Video-7B-hf`, etc.
   -
   - ✅︎
-  -
+  - ✅︎
 * - `LlavaOnevisionForConditionalGeneration`
   - LLaVA-Onevision
   - T + I<sup>+</sup> + V<sup>+</sup>

From 8f37be38ebfe0295a4925837c501c87149997a4d Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Tue, 7 Jan 2025 18:25:02 +0800
Subject: [PATCH 107/462] [Bugfix] Comprehensively test and fix LLaVA-NeXT
 feature size calculation (#11800)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 requirements-test.in                          |   1 +
 requirements-test.txt                         |   4 +
 .../processing/test_llava_next.py             | 129 +++++++++++++++---
 .../processing/test_llava_onevision.py        | 129 +++++++++++++++---
 vllm/model_executor/models/llava_next.py      |  37 ++---
 vllm/model_executor/models/llava_onevision.py |  42 +++---
 6 files changed, 253 insertions(+), 89 deletions(-)

diff --git a/requirements-test.in b/requirements-test.in
index fb4179c3d8423..4b4dc376d1fa5 100644
--- a/requirements-test.in
+++ b/requirements-test.in
@@ -13,6 +13,7 @@ einops # required for MPT, qwen-vl and Mamba
 httpx
 librosa # required for audio tests
 peft
+pqdm
 ray[adag]==2.40.0
 sentence-transformers # required for embedding tests
 soundfile # required for audio tests
diff --git a/requirements-test.txt b/requirements-test.txt
index 3771577fe8ed0..f576e42afcbbf 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -48,6 +48,8 @@ botocore==1.35.57
     #   awscli
     #   boto3
     #   s3transfer
+bounded-pool-executor==0.0.3
+    # via pqdm
 buildkite-test-collector==0.1.9
     # via -r requirements-test.in
 certifi==2024.8.30
@@ -342,6 +344,8 @@ pooch==1.8.2
     # via librosa
 portalocker==2.10.1
     # via sacrebleu
+pqdm==0.2.0
+    # via -r requirements-test.in
 propcache==0.2.0
     # via yarl
 protobuf==5.28.3
diff --git a/tests/models/decoder_only/vision_language/processing/test_llava_next.py b/tests/models/decoder_only/vision_language/processing/test_llava_next.py
index 37a6d334ee60c..9fa6a8a10a0f9 100644
--- a/tests/models/decoder_only/vision_language/processing/test_llava_next.py
+++ b/tests/models/decoder_only/vision_language/processing/test_llava_next.py
@@ -1,8 +1,13 @@
+import itertools
+from functools import partial
+
 import pytest
 from PIL import Image
+from pqdm.threads import pqdm
 from transformers import AutoTokenizer
 
 from vllm.inputs import InputProcessingContext
+from vllm.multimodal.parse import ImageSize
 
 from ....utils import build_model_context
 
@@ -15,20 +20,69 @@ def processor_for_llava_next():
     return LlavaNextMultiModalProcessor
 
 
+def _validate_image_prompt_replacements_one(
+    processor,
+    num_imgs: int,
+    failed_size_excs: list[tuple[ImageSize, Exception]],
+    image_size: ImageSize,
+) -> None:
+    prompt = "<image>" * num_imgs
+    image = Image.new("RGB", size=image_size)
+    mm_data = {"image": [image] * num_imgs}
+
+    try:
+        # The processor will throw an error if there is a mismatch
+        # in the prompt replacements
+        processed_inputs = processor.apply(prompt, mm_data, {})
+
+        image_placeholders = processed_inputs["mm_placeholders"]["image"]
+        assert len(image_placeholders) == num_imgs
+
+        first_placeholder = image_placeholders[0]
+
+        # NOTE: There is a BOS token
+        assert first_placeholder["offset"] == 1
+        assert first_placeholder["length"] == (
+            len(processed_inputs["prompt_token_ids"]) - 1) // num_imgs
+
+    except Exception as exc:
+        failed_size_excs.append((image_size, exc))
+
+
+def _test_image_prompt_replacements(
+    processor,
+    *,
+    num_imgs: int,
+    image_sizes: list[ImageSize],
+) -> None:
+    """
+    Ensure LlavaNextMultiModalProcessor
+    handles prompt replacement properly for input images.
+    """
+    failed_size_excs = list[tuple[ImageSize, Exception]]()
+
+    validate_one = partial(
+        _validate_image_prompt_replacements_one,
+        processor,
+        num_imgs,
+        failed_size_excs,
+    )
+    pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")
+
+    if failed_size_excs:
+        msg = "Found failing image sizes:" \
+            + "\n========\n".join(f"[{size}]\n{exc}"
+                                  for size, exc in failed_size_excs)
+        raise AssertionError(msg)
+
+
 @pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
-@pytest.mark.parametrize("image_size", [(1669, 2560), (2560, 1669), (183, 488),
-                                        (488, 183), (198, 176), (176, 198),
-                                        (161, 184), (184, 161)])
 @pytest.mark.parametrize("num_imgs", [1, 2])
-def test_processor_prompt_replacements(
+def test_processor_prompt_replacements_regression(
     processor_for_llava_next,
     model_id: str,
-    image_size: tuple[int, int],
     num_imgs: int,
 ):
-    """
-    Ensure LlavaNextMultiModalProcessor handles prompt replacement properly.
-    """
     ctx = build_model_context(
         model_name=model_id,
         tokenizer_name=model_id,
@@ -37,22 +91,55 @@ def test_processor_prompt_replacements(
     )
     tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
     ctx = InputProcessingContext(ctx.model_config, tokenizer)
+    processor = processor_for_llava_next(ctx)
+
+    image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328),
+                    (488, 183), (2560, 1669)]
+    image_sizes = [
+        size for w, h in image_ratios
+        for size in [ImageSize(w, h), ImageSize(h, w)]
+    ]
+
+    _test_image_prompt_replacements(
+        processor,
+        num_imgs=num_imgs,
+        image_sizes=image_sizes,
+    )
 
-    # Build the image str / prompt based on the number of images we pass
-    prompt = "<image>" * num_imgs
-    mm_data = {"image": [Image.new("RGB", size=image_size)] * num_imgs}
 
-    # The processor will throw an error if there is a mismatch
-    # in the prompt replacements
+@pytest.mark.skip("This test takes around 2 hours to run. "
+                  "Comment this out to run it manually.")
+@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
+@pytest.mark.parametrize("num_imgs", [1])
+def test_processor_prompt_replacements_all(
+    processor_for_llava_next,
+    model_id: str,
+    num_imgs: int,
+):
+    ctx = build_model_context(
+        model_name=model_id,
+        tokenizer_name=model_id,
+        mm_processor_kwargs=None,
+        limit_mm_per_prompt={"image": num_imgs},
+    )
+    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
+    ctx = InputProcessingContext(ctx.model_config, tokenizer)
     processor = processor_for_llava_next(ctx)
-    processed_inputs = processor.apply(prompt, mm_data, {})
 
-    image_placeholders = processed_inputs["mm_placeholders"]["image"]
-    assert len(image_placeholders) == num_imgs
+    seen_aspect_ratios = set[float]()
+    image_sizes = list[ImageSize]()
 
-    first_placeholder = image_placeholders[0]
+    # The aspect ratio of the grid layout is between 1 and 2
+    # NOTE: Assumes that feature size calculation is the same if we
+    # swap the width and height of the image
+    for w, h in itertools.product(range(64, 1024), repeat=2):
+        aspect_ratio = w / h
+        if 1 <= aspect_ratio <= 2 and aspect_ratio not in seen_aspect_ratios:
+            image_sizes.append(ImageSize(w, h))
+            seen_aspect_ratios.add(aspect_ratio)
 
-    # NOTE: There is a BOS token
-    assert first_placeholder["offset"] == 1
-    assert first_placeholder["length"] == (
-        len(processed_inputs["prompt_token_ids"]) - 1) // num_imgs
+    _test_image_prompt_replacements(
+        processor,
+        num_imgs=num_imgs,
+        image_sizes=image_sizes,
+    )
diff --git a/tests/models/decoder_only/vision_language/processing/test_llava_onevision.py b/tests/models/decoder_only/vision_language/processing/test_llava_onevision.py
index ed3e2db799be7..d4cdffa210b6d 100644
--- a/tests/models/decoder_only/vision_language/processing/test_llava_onevision.py
+++ b/tests/models/decoder_only/vision_language/processing/test_llava_onevision.py
@@ -1,8 +1,13 @@
+import itertools
+from functools import partial
+
 import pytest
 from PIL import Image
+from pqdm.threads import pqdm
 from transformers import AutoTokenizer
 
 from vllm.inputs import InputProcessingContext
+from vllm.multimodal.parse import ImageSize
 
 from ....utils import build_model_context
 
@@ -15,22 +20,68 @@ def processor_for_llava_onevision():
     return LlavaOnevisionMultiModalProcessor
 
 
+def _validate_image_prompt_replacements_one(
+    processor,
+    num_imgs: int,
+    failed_size_excs: list[tuple[ImageSize, Exception]],
+    image_size: ImageSize,
+) -> None:
+    prompt = "<image>" * num_imgs
+    image = Image.new("RGB", size=image_size)
+    mm_data = {"image": [image] * num_imgs}
+
+    try:
+        # The processor will throw an error if there is a mismatch
+        # in the prompt replacements
+        processed_inputs = processor.apply(prompt, mm_data, {})
+
+        image_placeholders = processed_inputs["mm_placeholders"]["image"]
+        assert len(image_placeholders) == num_imgs
+
+        first_placeholder = image_placeholders[0]
+
+        assert first_placeholder["offset"] == 0
+        assert first_placeholder["length"] == len(
+            processed_inputs["prompt_token_ids"]) // num_imgs
+    except Exception as exc:
+        failed_size_excs.append((image_size, exc))
+
+
+def _test_image_prompt_replacements(
+    processor,
+    *,
+    num_imgs: int,
+    image_sizes: list[ImageSize],
+) -> None:
+    """
+    Ensure LlavaOnevisionMultiModalProcessor
+    handles prompt replacement properly for input images.
+    """
+    failed_size_excs = list[tuple[ImageSize, Exception]]()
+
+    validate_one = partial(
+        _validate_image_prompt_replacements_one,
+        processor,
+        num_imgs,
+        failed_size_excs,
+    )
+    pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")
+
+    if failed_size_excs:
+        msg = "Found failing image sizes:" \
+            + "\n========\n".join(f"[{size}]\n{exc}"
+                                  for size, exc in failed_size_excs)
+        raise AssertionError(msg)
+
+
 @pytest.mark.parametrize("model_id",
                          ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
-@pytest.mark.parametrize("image_size", [(1669, 2560), (2560, 1669), (183, 488),
-                                        (488, 183), (198, 176), (176, 198),
-                                        (161, 184), (184, 161)])
 @pytest.mark.parametrize("num_imgs", [1, 2])
-def test_processor_prompt_replacements(
+def test_processor_prompt_replacements_regression(
     processor_for_llava_onevision,
     model_id: str,
-    image_size: tuple[int, int],
     num_imgs: int,
 ):
-    """
-    Ensure LlavaOnevisionMultiModalProcessor handles prompt replacement
-    properly.
-    """
     ctx = build_model_context(
         model_name=model_id,
         tokenizer_name=model_id,
@@ -39,22 +90,56 @@ def test_processor_prompt_replacements(
     )
     tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
     ctx = InputProcessingContext(ctx.model_config, tokenizer)
+    processor = processor_for_llava_onevision(ctx)
 
-    # Build the image str / prompt based on the number of images we pass
-    prompt = "<image>" * num_imgs
-    mm_data = {"image": [Image.new("RGB", size=image_size)] * num_imgs}
+    image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328),
+                    (488, 183), (2560, 1669)]
+    image_sizes = [
+        size for w, h in image_ratios
+        for size in [ImageSize(w, h), ImageSize(h, w)]
+    ]
+
+    _test_image_prompt_replacements(
+        processor,
+        num_imgs=num_imgs,
+        image_sizes=image_sizes,
+    )
 
-    # The processor will throw an error if there is a mismatch
-    # in the prompt replacements
+
+@pytest.mark.skip("This test takes around 2 hours to run. "
+                  "Comment this out to run it manually.")
+@pytest.mark.parametrize("model_id",
+                         ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
+@pytest.mark.parametrize("num_imgs", [1])
+def test_processor_prompt_replacements_all(
+    processor_for_llava_onevision,
+    model_id: str,
+    num_imgs: int,
+):
+    ctx = build_model_context(
+        model_name=model_id,
+        tokenizer_name=model_id,
+        mm_processor_kwargs=None,
+        limit_mm_per_prompt={"image": num_imgs},
+    )
+    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
+    ctx = InputProcessingContext(ctx.model_config, tokenizer)
     processor = processor_for_llava_onevision(ctx)
-    processed_inputs = processor.apply(prompt, mm_data, {})
 
-    image_placeholders = processed_inputs["mm_placeholders"]["image"]
-    assert len(image_placeholders) == num_imgs
+    seen_aspect_ratios = set[float]()
+    image_sizes = list[ImageSize]()
 
-    first_placeholder = image_placeholders[0]
+    # The aspect ratio of the grid layout is between 1 and 6
+    # NOTE: Assumes that feature size calculation is the same if we
+    # swap the width and height of the image
+    for w, h in itertools.product(range(64, 1024), repeat=2):
+        aspect_ratio = w / h
+        if 1 <= aspect_ratio <= 6 and aspect_ratio not in seen_aspect_ratios:
+            image_sizes.append(ImageSize(w, h))
+            seen_aspect_ratios.add(aspect_ratio)
 
-    # NOTE: There is a BOS token
-    assert first_placeholder["offset"] == 0
-    assert first_placeholder["length"] == len(
-        processed_inputs["prompt_token_ids"]) // num_imgs
+    _test_image_prompt_replacements(
+        processor,
+        num_imgs=num_imgs,
+        image_sizes=image_sizes,
+    )
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index 258352416d4a7..815456dac2a2f 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -2,7 +2,6 @@
 from typing import (Final, Iterable, List, Literal, Mapping, Optional,
                     Protocol, Set, Tuple, TypedDict, Union)
 
-import numpy as np
 import torch
 import torch.nn as nn
 from transformers import BatchFeature, LlavaNextConfig, LlavaNextProcessor
@@ -74,7 +73,7 @@ def _get_hf_config(self) -> LlavaNextLikeConfig:
     def _get_hf_processor(self):
         return self.ctx.get_hf_processor(LlavaNextProcessor)
 
-    # Based on: https://github.com/huggingface/text-generation-inference/blob/v2.2.0/server/text_generation_server/models/vlm_causal_lm.py#L106
+    # Based on: https://github.com/huggingface/text-generation-inference/blob/v3.0.1/server/text_generation_server/models/vlm_causal_lm.py#L113
     def _get_num_image_tokens(
         self,
         *,
@@ -111,7 +110,7 @@ def _get_num_image_tokens(
 
         return unpadded_feature_size + newline_feature_size + base_feature_size
 
-    # Based on: https://github.com/huggingface/text-generation-inference/blob/v2.2.0/server/text_generation_server/models/vlm_causal_lm.py#L79
+    # Based on: https://github.com/huggingface/text-generation-inference/blob/v3.0.1/server/text_generation_server/models/vlm_causal_lm.py#L86
     def _get_num_unpadded_features(
         self,
         *,
@@ -121,29 +120,23 @@ def _get_num_unpadded_features(
         num_patch_height: int,
         num_patch_width: int,
     ) -> tuple[int, int]:
-        # NOTE: Use float32 to remain consistent with HF output
-        current_height_f = np.float32(npatches * num_patch_height)
-        current_width_f = np.float32(npatches * num_patch_width)
+        current_height = npatches * num_patch_height
+        current_width = npatches * num_patch_width
 
-        original_width_f = np.float32(original_width)
-        original_height_f = np.float32(original_height)
+        aspect_ratio = original_width / original_height
+        current_aspect_ratio = current_width / current_height
 
-        original_aspect_ratio = original_width_f / original_height_f
-        current_aspect_ratio = current_width_f / current_height_f
-
-        if original_aspect_ratio > current_aspect_ratio:
-            scale_factor = current_width_f / original_width_f
-            new_height = int(original_height_f * scale_factor)
-            padding = (current_height_f - new_height) // 2
-            current_height_f -= 2 * padding
+        if aspect_ratio > current_aspect_ratio:
+            new_height = (original_height * current_width) // original_width
+            padding = (current_height - new_height) // 2
+            current_height = current_height - (2 * padding)
         else:
-            scale_factor = current_height_f / original_height_f
-            new_width = int(original_width_f * scale_factor)
-            padding = (current_width_f - new_width) // 2
-            current_width_f -= 2 * padding
+            new_width = (original_width * current_height) // original_height
+            padding = (current_width - new_width) // 2
+            current_width = current_width - (2 * padding)
 
-        unpadded_features = int(current_height_f * current_width_f)
-        newline_features = int(current_height_f)
+        unpadded_features = current_height * current_width
+        newline_features = current_height
 
         return (unpadded_features, newline_features)
 
diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py
index 62dae74e377be..b5e3edba1f01c 100644
--- a/vllm/model_executor/models/llava_onevision.py
+++ b/vllm/model_executor/models/llava_onevision.py
@@ -3,7 +3,6 @@
 from typing import (Final, Iterable, List, Literal, Mapping, Optional,
                     Protocol, Set, Tuple, TypedDict, Union)
 
-import numpy as np
 import torch
 import torch.nn as nn
 from transformers import (BatchFeature, LlavaOnevisionConfig,
@@ -98,6 +97,8 @@ def _get_hf_config(self) -> LlavaOnevisionLikeConfig:
     def _get_hf_processor(self):
         return self.ctx.get_hf_processor(LlavaOnevisionProcessor)
 
+    # Based on: https://github.com/huggingface/text-generation-inference/blob/v3.0.1/server/text_generation_server/models/vlm_causal_lm.py#L86
+    # with additional logic afterwards taken from LlavaOnevisionProcessor
     def _get_num_unpadded_features(
         self,
         *,
@@ -107,35 +108,28 @@ def _get_num_unpadded_features(
         num_patch_height: int,
         num_patch_width: int,
     ) -> tuple[int, int]:
-        # NOTE: Use float32 to remain consistent with HF output
-        current_height_f = np.float32(npatches * num_patch_height)
-        current_width_f = np.float32(npatches * num_patch_width)
+        current_height = npatches * num_patch_height
+        current_width = npatches * num_patch_width
 
-        original_width_f = np.float32(original_width)
-        original_height_f = np.float32(original_height)
+        aspect_ratio = original_width / original_height
+        current_aspect_ratio = current_width / current_height
 
-        original_aspect_ratio = original_width_f / original_height_f
-        current_aspect_ratio = current_width_f / current_height_f
-
-        if original_aspect_ratio > current_aspect_ratio:
-            scale_factor = current_width_f / original_width_f
-            new_height = int(original_height_f * scale_factor)
-            padding = (current_height_f - new_height) // 2
-            current_height_f -= 2 * padding
+        if aspect_ratio > current_aspect_ratio:
+            new_height = (original_height * current_width) // original_width
+            padding = (current_height - new_height) // 2
+            current_height = current_height - (2 * padding)
         else:
-            scale_factor = current_height_f / original_height_f
-            new_width = int(original_width_f * scale_factor)
-            padding = (current_width_f - new_width) // 2
-            current_width_f -= 2 * padding
+            new_width = (original_width * current_height) // original_height
+            padding = (current_width - new_width) // 2
+            current_width = current_width - (2 * padding)
 
-        unpadded_features = int(current_height_f * current_width_f)
-        newline_features = int(current_height_f)
+        unpadded_features = current_height * current_width
+        newline_features = current_height
 
-        ratio = math.sqrt(current_height_f * current_width_f /
-                          (9 * npatches**2))
+        ratio = math.sqrt(current_height * current_width / (9 * npatches**2))
         if ratio > 1.1:
-            height_factor = int(current_height_f // ratio)
-            width_factor = int(current_width_f // ratio)
+            height_factor = int(current_height // ratio)
+            width_factor = int(current_width // ratio)
             unpadded_features = height_factor * width_factor
             newline_features = height_factor
 

From 869e829b853cc35747c5e4bc9d773a4cff704d12 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Tue, 7 Jan 2025 18:41:17 +0800
Subject: [PATCH 108/462] [doc] add doc to explain how to use uv (#11773)

Signed-off-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
---
 .../getting_started/installation/gpu-cuda.md  | 67 ++++++++++++++-----
 1 file changed, 52 insertions(+), 15 deletions(-)

diff --git a/docs/source/getting_started/installation/gpu-cuda.md b/docs/source/getting_started/installation/gpu-cuda.md
index 7ea10bb8b59ff..295555b6c41f0 100644
--- a/docs/source/getting_started/installation/gpu-cuda.md
+++ b/docs/source/getting_started/installation/gpu-cuda.md
@@ -12,24 +12,43 @@ vLLM is a Python library that also contains pre-compiled C++ and CUDA (12.1) bin
 
 ## Install released versions
 
-You can install vLLM using pip:
+### Create a new Python environment
+
+You can create a new Python environment using `conda`:
 
 ```console
 $ # (Recommended) Create a new conda environment.
 $ conda create -n myenv python=3.12 -y
 $ conda activate myenv
-
-$ # Install vLLM with CUDA 12.1.
-$ pip install vllm
 ```
 
 ```{note}
-Although we recommend using `conda` to create and manage Python environments, it is highly recommended to use `pip` to install vLLM. This is because `pip` can install `torch` with separate library packages like `NCCL`, while `conda` installs `torch` with statically linked `NCCL`. This can cause issues when vLLM tries to use `NCCL`. See <gh-issue:8420> for more details.
+[PyTorch has deprecated the conda release channel](https://github.com/pytorch/pytorch/issues/138506). If you use `conda`, please only use it to create Python environment rather than installing packages. In particular, the PyTorch installed via `conda` will statically link `NCCL` library, which can cause issues when vLLM tries to use `NCCL`. See <gh-issue:8420> for more details.
+```
+
+Or you can create a new Python environment using [uv](https://docs.astral.sh/uv/), a very fast Python environment manager. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment using the following command:
+
+```console
+$ # (Recommended) Create a new uv environment. Use `--seed` to install `pip` and `setuptools` in the environment.
+$ uv venv myenv --python 3.12 --seed
+$ source myenv/bin/activate
+```
+
+In order to be performant, vLLM has to compile many cuda kernels. The compilation unfortunately introduces binary incompatibility with other CUDA versions and PyTorch versions, even for the same PyTorch version with different building configurations.
+
+Therefore, it is recommended to install vLLM with a **fresh new** environment. If either you have a different CUDA version or you want to use an existing PyTorch installation, you need to build vLLM from source. See [below](#build-from-source) for more details.
+
+### Install vLLM
+
+You can install vLLM using either `pip` or `uv pip`:
+
+```console
+$ # Install vLLM with CUDA 12.1.
+$ pip install vllm # If you are using pip.
+$ uv pip install vllm # If you are using uv.
 ```
 
-````{note}
-As of now, vLLM's binaries are compiled with CUDA 12.1 and public PyTorch release versions by default.
-We also provide vLLM binaries compiled with CUDA 11.8 and public PyTorch release versions:
+As of now, vLLM's binaries are compiled with CUDA 12.1 and public PyTorch release versions by default. We also provide vLLM binaries compiled with CUDA 11.8 and public PyTorch release versions:
 
 ```console
 $ # Install vLLM with CUDA 11.8.
@@ -38,22 +57,19 @@ $ export PYTHON_VERSION=310
 $ pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118
 ```
 
-In order to be performant, vLLM has to compile many cuda kernels. The compilation unfortunately introduces binary incompatibility with other CUDA versions and PyTorch versions, even for the same PyTorch version with different building configurations.
-
-Therefore, it is recommended to install vLLM with a **fresh new** conda environment. If either you have a different CUDA version or you want to use an existing PyTorch installation, you need to build vLLM from source. See below for instructions.
-````
-
 (install-the-latest-code)=
 
 ## Install the latest code
 
-LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides wheels for Linux running on a x86 platform with CUDA 12 for every commit since `v0.5.3`. You can download and install it with the following command:
+LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides wheels for Linux running on a x86 platform with CUDA 12 for every commit since `v0.5.3`.
+
+### Install the latest code using `pip`
 
 ```console
 $ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
 ```
 
-If you want to access the wheels for previous commits, you can specify the commit hash in the URL:
+If you want to access the wheels for previous commits (e.g. to bisect the behavior change, performance regression), you can specify the commit hash in the URL:
 
 ```console
 $ export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch
@@ -62,6 +78,27 @@ $ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_COMMIT}/vllm
 
 Note that the wheels are built with Python 3.8 ABI (see [PEP 425](https://peps.python.org/pep-0425/) for more details about ABI), so **they are compatible with Python 3.8 and later**. The version string in the wheel file name (`1.0.0.dev`) is just a placeholder to have a unified URL for the wheels. The actual versions of wheels are contained in the wheel metadata. Although we don't support Python 3.8 any more (because PyTorch 2.5 dropped support for Python 3.8), the wheels are still built with Python 3.8 ABI to keep the same wheel name as before.
 
+Due to the limitation of `pip`, you have to specify the full URL of the wheel file.
+
+### Install the latest code using `uv`
+
+Another way to install the latest code is to use `uv`:
+
+```console
+$ uv pip install vllm --extra-index-url https://wheels.vllm.ai/nightly
+```
+
+If you want to access the wheels for previous commits (e.g. to bisect the behavior change, performance regression), you can specify the commit hash in the URL:
+
+```console
+$ export VLLM_COMMIT=72d9c316d3f6ede485146fe5aabd4e61dbc59069 # use full commit hash from the main branch
+$ uv pip install vllm --extra-index-url https://wheels.vllm.ai/${VLLM_COMMIT}
+```
+
+The `uv` approach works for vLLM `v0.6.6` and later and offers an easy-to-remember command. A unique feature of `uv` is that packages in `--extra-index-url` have [higher priority than the default index](https://docs.astral.sh/uv/pip/compatibility/#packages-that-exist-on-multiple-indexes). If the latest public release is `v0.6.6.post1`, `uv`'s behavior allows installing a commit before `v0.6.6.post1` by specifying the `--extra-index-url`. In contrast, `pip` combines packages from `--extra-index-url` and the default index, choosing only the latest version, which makes it difficult to install a development version prior to the released version.
+
+### Install the latest code using `docker`
+
 Another way to access the latest code is to use the docker images:
 
 ```console

From 2de197bdd4b82a004ff99806d054dce1d93b3ced Mon Sep 17 00:00:00 2001
From: Roger Wang <136131678+ywang96@users.noreply.github.com>
Date: Tue, 7 Jan 2025 03:47:36 -0800
Subject: [PATCH 109/462] [V1] Support audio language models on V1 (#11733)

Signed-off-by: Roger Wang <ywang@roblox.com>
---
 docs/source/models/supported_models.md    |  4 ++--
 vllm/model_executor/models/qwen2_audio.py |  9 +++++---
 vllm/model_executor/models/ultravox.py    | 28 +++++++++++++++++------
 3 files changed, 29 insertions(+), 12 deletions(-)

diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md
index 7777545b8b3c1..8c5f6836d6aa8 100644
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -710,7 +710,7 @@ See [this page](#generative-models) for more information on how to use generativ
   - `Qwen/Qwen2-Audio-7B-Instruct`
   -
   - ✅︎
-  -
+  - ✅︎
 * - `Qwen2VLForConditionalGeneration`
   - Qwen2-VL
   - T + I<sup>E+</sup> + V<sup>E+</sup>
@@ -724,7 +724,7 @@ See [this page](#generative-models) for more information on how to use generativ
   - `fixie-ai/ultravox-v0_3`
   -
   - ✅︎
-  -
+  - ✅︎
 ```
 
 <sup>E</sup> Pre-computed embeddings can be inputted for this modality.  
diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py
index 576b01776e5de..7012ddc66cd9c 100644
--- a/vllm/model_executor/models/qwen2_audio.py
+++ b/vllm/model_executor/models/qwen2_audio.py
@@ -335,13 +335,16 @@ def _process_audio_input(self,
         selected_audio_feature = audio_outputs.last_hidden_state
         audio_features = self.multi_modal_projector(selected_audio_feature)
         num_audios, max_audio_tokens, embed_dim = audio_features.shape
+        audio_output_lengths = audio_output_lengths.unsqueeze(1)
         audio_features_mask = torch.arange(max_audio_tokens).expand(
-            num_audios, max_audio_tokens
-        ).to(audio_output_lengths.device) < audio_output_lengths.unsqueeze(1)
+            num_audios, max_audio_tokens).to(
+                audio_output_lengths.device) < audio_output_lengths
         masked_audio_features = audio_features[audio_features_mask].view(
             -1, embed_dim)
 
-        return masked_audio_features
+        # Split to tuple of embeddings for individual audio input.
+        return torch.split(masked_audio_features,
+                           audio_output_lengths.flatten().tolist())
 
     def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
         audio_input = self._parse_and_validate_audio_input(**kwargs)
diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py
index ba823acecbb56..ecafd157b1d61 100644
--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -1,6 +1,5 @@
 # Adapted from https://github.com/fixie-ai/ultravox/blob/ecd58c4041030bae2ad15aa6bcf04ab43199ea02/ultravox/model/ultravox_model.py
 """PyTorch Ultravox model."""
-
 import math
 from functools import cached_property
 from typing import (Any, Iterable, List, Literal, Mapping, Optional, Set,
@@ -14,6 +13,7 @@
 from transformers.models.whisper import WhisperFeatureExtractor
 from transformers.models.whisper.modeling_whisper import WhisperEncoder
 
+from vllm import envs
 from vllm.attention import AttentionMetadata
 from vllm.config import VllmConfig
 from vllm.model_executor.layers.activation import SiluAndMul, get_act_fn
@@ -35,8 +35,11 @@
 from .interfaces import SupportsMultiModal, SupportsPP
 from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,
                     init_vllm_registered_model, maybe_prefix,
+                    merge_multimodal_embeddings,
                     merge_multimodal_embeddings_from_map)
 
+_AUDIO_PLACEHOLDER_OVERRIDE = "<|reserved_special_token_0|>"
+_AUDIO_PLACEHOLDER_TOKEN = 128002
 _AUDIO_TOKENS_PER_SECOND = 6.25
 
 
@@ -64,7 +67,14 @@ def _get_hf_processor(
         # Ignored in initialization
         sampling_rate: Optional[int] = None,
     ) -> ProcessorMixin:
-        return self.ctx.get_hf_processor()
+        hf_processor = self.ctx.get_hf_processor()
+
+        # NOTE: Ultravox processing definition uses '<|eot_id|>' as the
+        # placeholder that will cause confusion with the actual end of turn
+        # token, thus we override placeholder with a reserved special
+        # token.
+        hf_processor.audio_token_replacement = _AUDIO_PLACEHOLDER_OVERRIDE
+        return hf_processor
 
     def _get_feature_extractor(
         self,
@@ -465,11 +475,15 @@ def get_input_embeddings(
         inputs_embeds = self.language_model.get_input_embeddings(input_ids)
         if multimodal_embeddings is not None:
 
-            # TODO(ywang96): use merge_multimodal_embeddings after
-            # v0 is deprecated
-            merge_multimodal_embeddings_from_map(
-                inputs_embeds, multimodal_embeddings,
-                attn_metadata.multi_modal_placeholder_index_maps["audio"])
+            # TODO(ywang96): remove this block after v0 is deprecated.
+            if not envs.VLLM_USE_V1:
+                merge_multimodal_embeddings_from_map(
+                    inputs_embeds, multimodal_embeddings,
+                    attn_metadata.multi_modal_placeholder_index_maps["audio"])
+            else:
+                inputs_embeds = merge_multimodal_embeddings(
+                    input_ids, inputs_embeds, multimodal_embeddings,
+                    _AUDIO_PLACEHOLDER_TOKEN)
         return inputs_embeds
 
     def forward(self,

From d9fa1c05ad7149a43051a283d0cbeeb65bf6b4a3 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Tue, 7 Jan 2025 21:42:58 +0800
Subject: [PATCH 110/462] [doc] update how pip can install nightly wheels
 (#11806)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 docs/source/getting_started/installation/gpu-cuda.md | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/docs/source/getting_started/installation/gpu-cuda.md b/docs/source/getting_started/installation/gpu-cuda.md
index 295555b6c41f0..1cd513177bf0d 100644
--- a/docs/source/getting_started/installation/gpu-cuda.md
+++ b/docs/source/getting_started/installation/gpu-cuda.md
@@ -66,19 +66,19 @@ LLM inference is a fast-evolving field, and the latest code may contain bug fixe
 ### Install the latest code using `pip`
 
 ```console
-$ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
+$ pip install vllm --pre --extra-index-url https://wheels.vllm.ai/nightly
 ```
 
-If you want to access the wheels for previous commits (e.g. to bisect the behavior change, performance regression), you can specify the commit hash in the URL:
+`--pre` is required for `pip` to consider pre-released versions.
+
+If you want to access the wheels for previous commits (e.g. to bisect the behavior change, performance regression), due to the limitation of `pip`, you have to specify the full URL of the wheel file by embedding the commit hash in the URL:
 
 ```console
 $ export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch
 $ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
 ```
 
-Note that the wheels are built with Python 3.8 ABI (see [PEP 425](https://peps.python.org/pep-0425/) for more details about ABI), so **they are compatible with Python 3.8 and later**. The version string in the wheel file name (`1.0.0.dev`) is just a placeholder to have a unified URL for the wheels. The actual versions of wheels are contained in the wheel metadata. Although we don't support Python 3.8 any more (because PyTorch 2.5 dropped support for Python 3.8), the wheels are still built with Python 3.8 ABI to keep the same wheel name as before.
-
-Due to the limitation of `pip`, you have to specify the full URL of the wheel file.
+Note that the wheels are built with Python 3.8 ABI (see [PEP 425](https://peps.python.org/pep-0425/) for more details about ABI), so **they are compatible with Python 3.8 and later**. The version string in the wheel file name (`1.0.0.dev`) is just a placeholder to have a unified URL for the wheels, the actual versions of wheels are contained in the wheel metadata (the wheels listed in the extra index url have correct versions). Although we don't support Python 3.8 any more (because PyTorch 2.5 dropped support for Python 3.8), the wheels are still built with Python 3.8 ABI to keep the same wheel name as before.
 
 ### Install the latest code using `uv`
 
@@ -126,7 +126,7 @@ $ cd vllm
 $ VLLM_USE_PRECOMPILED=1 pip install --editable .
 ```
 
-This will download the latest nightly wheel and use the compiled libraries from there in the install.
+This will download the latest nightly wheel from https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl and use the compiled libraries from there in the installation.
 
 The `VLLM_PRECOMPILED_WHEEL_LOCATION` environment variable can be used instead of `VLLM_USE_PRECOMPILED` to specify a custom path or URL to the wheel file. For example, to use the [0.6.1.post1 PyPi wheel](https://pypi.org/project/vllm/#files):
 

From c0efe92d8b9ef968a5b796fd7d6ebc426d78e726 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Tue, 7 Jan 2025 21:50:58 +0800
Subject: [PATCH 111/462] [Doc] Add note to `gte-Qwen2` models (#11808)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 docs/source/models/supported_models.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md
index 8c5f6836d6aa8..3ba34c77205e5 100644
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -430,6 +430,9 @@ You can set `--hf-overrides '{"is_causal": false}'` to change the attention mask
 
 On the other hand, its 1.5B variant (`Alibaba-NLP/gte-Qwen2-1.5B-instruct`) uses causal attention
 despite being described otherwise on its model card.
+
+Regardless of the variant, you need to enable `--trust-remote-code` for the correct tokenizer to be
+loaded. See [relevant issue on HF Transformers](https://github.com/huggingface/transformers/issues/34882).
 ```
 
 If your model is not in the above list, we will try to automatically convert the model using

From 869579a702cb086cca6bd6ec4500f954a9adec1c Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Wed, 8 Jan 2025 01:04:28 +0800
Subject: [PATCH 112/462] [optimization] remove python function call for custom
 op (#11750)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 vllm/_custom_ops.py                             |  4 ----
 vllm/model_executor/layers/activation.py        | 17 +++++++++++------
 .../layers/fused_moe/fused_marlin_moe.py        |  4 ++--
 .../layers/fused_moe/fused_moe.py               |  3 ++-
 4 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index eb2f69df42624..afb350591e562 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -35,10 +35,6 @@ def register_fake(fn):
 
 
 # activation ops
-def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
-    torch.ops._C.silu_and_mul(out, x)
-
-
 def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
     torch.ops._C.gelu_and_mul(out, x)
 
diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py
index 34d65ed51ef3f..46d4670bfe4f9 100644
--- a/vllm/model_executor/layers/activation.py
+++ b/vllm/model_executor/layers/activation.py
@@ -10,6 +10,7 @@
                               get_tensor_model_parallel_world_size)
 from vllm.model_executor.custom_op import CustomOp
 from vllm.model_executor.utils import set_weight_attrs
+from vllm.platforms import current_platform
 from vllm.utils import LazyDict
 
 
@@ -58,27 +59,31 @@ class SiluAndMul(CustomOp):
         return: (num_tokens, d) or (batch_size, seq_len, d)
     """
 
+    def __init__(self):
+        super().__init__()
+        if current_platform.is_cuda_alike():
+            self.op = torch.ops._C.silu_and_mul
+        elif current_platform.is_xpu():
+            import intel_extension_for_pytorch as ipex
+            self.op = ipex.llm.functional.silu_and_mul
+
     def forward_native(self, x: torch.Tensor) -> torch.Tensor:
         """PyTorch-native implementation equivalent to forward()."""
         d = x.shape[-1] // 2
         return F.silu(x[..., :d]) * x[..., d:]
 
     def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
-        from vllm import _custom_ops as ops
-
         d = x.shape[-1] // 2
         output_shape = (x.shape[:-1] + (d, ))
         out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
-        ops.silu_and_mul(out, x)
+        self.op(out, x)
         return out
 
     def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
-        from vllm._ipex_ops import ipex_ops as ops
-
         d = x.shape[-1] // 2
         output_shape = (x.shape[:-1] + (d, ))
         out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
-        ops.silu_and_mul(out, x)
+        self.op(out, x)
         return out
 
 
diff --git a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
index 4741d69de11ac..87993267c05b5 100644
--- a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
@@ -4,7 +4,6 @@
 
 import torch
 
-from vllm import _custom_ops as ops
 from vllm.model_executor.layers.fused_moe.fused_moe import (
     fused_topk, moe_align_block_size, try_get_optimal_moe_config)
 from vllm.scalar_type import scalar_types
@@ -301,7 +300,8 @@ def fused_marlin_moe(
         False,
     )
 
-    ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, 2 * N))
+    torch.ops._C.silu_and_mul(intermediate_cache2,
+                              intermediate_cache1.view(-1, 2 * N))
 
     intermediate_cache3 = torch.ops._moe_C.marlin_gemm_moe(
         intermediate_cache2,
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index 4101facbe7874..1bb6bc753d37c 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -753,7 +753,8 @@ def fused_experts_impl(hidden_states: torch.Tensor,
                                 use_int8_w8a16=use_int8_w8a16,
                                 block_shape=block_shape)
 
-        ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, N))
+        torch.ops._C.silu_and_mul(intermediate_cache2,
+                                  intermediate_cache1.view(-1, N))
 
         invoke_fused_moe_kernel(intermediate_cache2,
                                 w2,

From c994223d569221652643e897d8402b835ead411d Mon Sep 17 00:00:00 2001
From: jiangjiadi <34134495+jiangjiadi@users.noreply.github.com>
Date: Wed, 8 Jan 2025 02:36:34 +0800
Subject: [PATCH 113/462] [Bugfix] update the prefix for qwen2 (#11795)

Co-authored-by: jiadi.jjd <jiadi.jjd@antgroup.com>
---
 vllm/model_executor/models/qwen2.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py
index 01745b5fd53e1..d20fb150f7e39 100644
--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
@@ -298,7 +298,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
             lambda prefix: Qwen2DecoderLayer(config=config,
                                              cache_config=cache_config,
                                              quant_config=quant_config,
-                                             prefix=f"{prefix}.layers"),
+                                             prefix=prefix),
             prefix=f"{prefix}.layers",
         )
 

From 973f5dc581c35a9c5b9176116e2f42f3f97d0d01 Mon Sep 17 00:00:00 2001
From: sroy745 <142070531+sroy745@users.noreply.github.com>
Date: Tue, 7 Jan 2025 11:19:12 -0800
Subject: [PATCH 114/462] [Doc]Add documentation for using EAGLE in vLLM
 (#11417)

Signed-off-by: Sourashis Roy <sroy@roblox.com>
---
 docs/source/features/spec_decode.md | 66 +++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)

diff --git a/docs/source/features/spec_decode.md b/docs/source/features/spec_decode.md
index bc8a0aa14dc5a..903acadb71426 100644
--- a/docs/source/features/spec_decode.md
+++ b/docs/source/features/spec_decode.md
@@ -159,6 +159,72 @@ A variety of speculative models of this type are available on HF hub:
 - [granite-7b-instruct-accelerator](https://huggingface.co/ibm-granite/granite-7b-instruct-accelerator)
 - [granite-20b-code-instruct-accelerator](https://huggingface.co/ibm-granite/granite-20b-code-instruct-accelerator)
 
+## Speculating using EAGLE based draft models
+
+The following code configures vLLM to use speculative decoding where proposals are generated by
+an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https://arxiv.org/pdf/2401.15077) based draft model.
+
+```python
+from vllm import LLM, SamplingParams
+
+prompts = [
+    "The future of AI is",
+]
+sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+
+llm = LLM(
+    model="meta-llama/Meta-Llama-3-8B-Instruct",
+    tensor_parallel_size=4,
+    speculative_model="path/to/modified/eagle/model",
+    speculative_draft_tensor_parallel_size=1,
+)
+
+outputs = llm.generate(prompts, sampling_params)
+
+for output in outputs:
+    prompt = output.prompt
+    generated_text = output.outputs[0].text
+    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+
+```
+
+A few important things to consider when using the EAGLE based draft models:
+
+1. The EAGLE draft models available in the [HF repository for EAGLE models](https://huggingface.co/yuhuili) cannot be
+   used directly with vLLM due to differences in the expected layer names and model definition.
+   To use these models with vLLM, use the [following script](https://gist.github.com/abhigoyal1997/1e7a4109ccb7704fbc67f625e86b2d6d) 
+   to convert them. Note that this script does not modify the model's weights.
+
+   In the above example, use the script to first convert
+   the [yuhuili/EAGLE-LLaMA3-Instruct-8B](https://huggingface.co/yuhuili/EAGLE-LLaMA3-Instruct-8B) model 
+   and then use the converted checkpoint as the draft model in vLLM.
+
+2. The EAGLE based draft models need to be run without tensor parallelism
+   (i.e. speculative_draft_tensor_parallel_size is set to 1), although
+   it is possible to run the main model using tensor parallelism (see example above).
+
+3. When using EAGLE-based speculators with vLLM, the observed speedup is lower than what is
+   reported in the reference implementation [here](https://github.com/SafeAILab/EAGLE). This issue is under
+   investigation and tracked here: [https://github.com/vllm-project/vllm/issues/9565](https://github.com/vllm-project/vllm/issues/9565).
+
+
+A variety of EAGLE draft models are available on the Hugging Face hub:
+
+| Base Model                                                           | EAGLE on Hugging Face                     | # EAGLE Parameters |
+|---------------------------------------------------------------------|-------------------------------------------|--------------------|
+| Vicuna-7B-v1.3                                                       | yuhuili/EAGLE-Vicuna-7B-v1.3             | 0.24B              |
+| Vicuna-13B-v1.3                                                      | yuhuili/EAGLE-Vicuna-13B-v1.3            | 0.37B              |
+| Vicuna-33B-v1.3                                                      | yuhuili/EAGLE-Vicuna-33B-v1.3            | 0.56B              |
+| LLaMA2-Chat 7B                                                       | yuhuili/EAGLE-llama2-chat-7B             | 0.24B              |
+| LLaMA2-Chat 13B                                                      | yuhuili/EAGLE-llama2-chat-13B            | 0.37B              |
+| LLaMA2-Chat 70B                                                      | yuhuili/EAGLE-llama2-chat-70B            | 0.99B              |
+| Mixtral-8x7B-Instruct-v0.1                                           | yuhuili/EAGLE-mixtral-instruct-8x7B      | 0.28B              |
+| LLaMA3-Instruct 8B                                                   | yuhuili/EAGLE-LLaMA3-Instruct-8B         | 0.25B              |
+| LLaMA3-Instruct 70B                                                  | yuhuili/EAGLE-LLaMA3-Instruct-70B        | 0.99B              |
+| Qwen2-7B-Instruct                                                    | yuhuili/EAGLE-Qwen2-7B-Instruct          | 0.26B              |
+| Qwen2-72B-Instruct                                                   | yuhuili/EAGLE-Qwen2-72B-Instruct         | 1.05B              |
+
+
 ## Lossless guarantees of Speculative Decoding
 
 In vLLM, speculative decoding aims to enhance inference efficiency while maintaining accuracy. This section addresses the lossless guarantees of

From a4e2b268568b335d8fe37f8eaaa894cec3ba9397 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jie=20Fu=20=28=E5=82=85=E6=9D=B0=29?= <jiefu@tencent.com>
Date: Wed, 8 Jan 2025 08:15:50 +0800
Subject: [PATCH 115/462] [Bugfix] Significant performance drop on CPUs with
 --num-scheduler-steps > 1 (#11794)

---
 vllm/engine/arg_utils.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index e94664308cf8d..0850bab6bb7e1 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1157,6 +1157,12 @@ def create_engine_config(self,
             if self.enable_chunked_prefill and self.pipeline_parallel_size > 1:
                 raise ValueError("Multi-Step Chunked-Prefill is not supported "
                                  "for pipeline-parallel-size > 1")
+            from vllm.platforms import current_platform
+            if current_platform.is_cpu():
+                logger.warning("Multi-Step (--num-scheduler-steps > 1) is "
+                               "currently not supported for CPUs and has been "
+                               "disabled.")
+                self.num_scheduler_steps = 1
 
         # make sure num_lookahead_slots is set the higher value depending on
         # if we are using speculative decoding or multi-step

From 5950f555a1d2ce19c30efb24abe03737320d05c1 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Wed, 8 Jan 2025 01:20:12 +0000
Subject: [PATCH 116/462] [Doc] Group examples into categories (#11782)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 .gitignore                                    |   5 +-
 docs/Makefile                                 |   4 +
 docs/requirements-docs.txt                    |   1 +
 docs/source/conf.py                           |   4 +
 docs/source/generate_examples.py              | 264 +++++++++++++++---
 .../examples/examples_index.template.md       |   8 -
 examples/fp8/README.md                        |   6 +-
 .../Otel.md                                   |   0
 .../dummy_client.py                           |   0
 .../README.md                                 |  10 +-
 .../docker-compose.yaml                       |   0
 .../grafana.json                              |   0
 .../prometheus.yaml                           |   0
 13 files changed, 240 insertions(+), 62 deletions(-)
 delete mode 100644 docs/source/getting_started/examples/examples_index.template.md
 rename examples/{production_monitoring => opentelemetry}/Otel.md (100%)
 rename examples/{production_monitoring => opentelemetry}/dummy_client.py (100%)
 rename examples/{production_monitoring => prometheus_grafana}/README.md (95%)
 rename examples/{production_monitoring => prometheus_grafana}/docker-compose.yaml (100%)
 rename examples/{production_monitoring => prometheus_grafana}/grafana.json (100%)
 rename examples/{production_monitoring => prometheus_grafana}/prometheus.yaml (100%)

diff --git a/.gitignore b/.gitignore
index bb7e4d5b244a8..89dab8f13bab1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -79,10 +79,7 @@ instance/
 
 # Sphinx documentation
 docs/_build/
-docs/source/getting_started/examples/*.rst
-!**/*.template.rst
-docs/source/getting_started/examples/*.md
-!**/*.template.md
+docs/source/getting_started/examples/
 
 # PyBuilder
 .pybuilder/
diff --git a/docs/Makefile b/docs/Makefile
index d0c3cbf1020d5..5b801f79d1f26 100644
--- a/docs/Makefile
+++ b/docs/Makefile
@@ -18,3 +18,7 @@ help:
 # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
 %: Makefile
 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+clean:
+	@$(SPHINXBUILD) -M clean "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+	rm -rf "$(SOURCEDIR)/getting_started/examples"
diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt
index 25a700033cc9e..64cf6ef8fc19d 100644
--- a/docs/requirements-docs.txt
+++ b/docs/requirements-docs.txt
@@ -3,6 +3,7 @@ sphinx-book-theme==1.0.1
 sphinx-copybutton==0.5.2
 myst-parser==3.0.1
 sphinx-argparse==0.4.0
+sphinx-togglebutton==0.3.2
 msgspec
 cloudpickle
 
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 71394c5302a39..1ce11fe057071 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -43,6 +43,10 @@
     "sphinx.ext.autosummary",
     "myst_parser",
     "sphinxarg.ext",
+    "sphinx_togglebutton",
+]
+myst_enable_extensions = [
+    "colon_fence",
 ]
 
 # Add any paths that contain templates here, relative to this directory.
diff --git a/docs/source/generate_examples.py b/docs/source/generate_examples.py
index aef32f7559f74..32bb86c469c78 100644
--- a/docs/source/generate_examples.py
+++ b/docs/source/generate_examples.py
@@ -1,54 +1,234 @@
+import itertools
 import re
+from dataclasses import dataclass, field
 from pathlib import Path
 
+ROOT_DIR = Path(__file__).parent.parent.parent.resolve()
+ROOT_DIR_RELATIVE = '../../../..'
+EXAMPLE_DIR = ROOT_DIR / "examples"
+EXAMPLE_DOC_DIR = ROOT_DIR / "docs/source/getting_started/examples"
+
 
 def fix_case(text: str) -> str:
-    subs = [
-        ("api", "API"),
-        ("llm", "LLM"),
-        ("vllm", "vLLM"),
-        ("openai", "OpenAI"),
-        ("multilora", "MultiLoRA"),
-    ]
-    for sub in subs:
-        text = re.sub(*sub, text, flags=re.IGNORECASE)
+    subs = {
+        "api": "API",
+        "cpu": "CPU",
+        "llm": "LLM",
+        "tpu": "TPU",
+        "aqlm": "AQLM",
+        "gguf": "GGUF",
+        "lora": "LoRA",
+        "vllm": "vLLM",
+        "openai": "OpenAI",
+        "multilora": "MultiLoRA",
+        "mlpspeculator": "MLPSpeculator",
+        r"fp\d+": lambda x: x.group(0).upper(),  # e.g. fp16, fp32
+        r"int\d+": lambda x: x.group(0).upper(),  # e.g. int8, int16
+    }
+    for pattern, repl in subs.items():
+        text = re.sub(rf'\b{pattern}\b', repl, text, flags=re.IGNORECASE)
     return text
 
 
-def generate_title(filename: str) -> str:
-    # Turn filename into a title
-    title = filename.replace("_", " ").title()
-    # Handle acronyms and names
-    title = fix_case(title)
-    return f"# {title}"
+@dataclass
+class Index:
+    """
+    Index class to generate a structured document index.
+
+    Attributes:
+        path (Path): The path save the index file to.
+        title (str): The title of the index.
+        description (str): A brief description of the index.
+        caption (str): An optional caption for the table of contents.
+        maxdepth (int): The maximum depth of the table of contents. Defaults to 1.
+        documents (list[str]): A list of document paths to include in the index. Defaults to an empty list.
+
+    Methods:
+        generate() -> str:
+            Generates the index content as a string in the specified format.
+    """ # noqa: E501
+    path: Path
+    title: str
+    description: str
+    caption: str
+    maxdepth: int = 1
+    documents: list[str] = field(default_factory=list)
+
+    def generate(self) -> str:
+        content = f"# {self.title}\n\n{self.description}\n\n"
+        content += "```{toctree}\n"
+        content += f":caption: {self.caption}\n:maxdepth: {self.maxdepth}\n"
+        content += "\n".join(sorted(self.documents)) + "\n```\n"
+        return content
+
+
+@dataclass
+class Example:
+    """
+    Example class for generating documentation content from a given path.
+
+    Attributes:
+        path (Path): The path to the main directory or file.
+        category (str): The category of the document.
+        main_file (Path): The main file in the directory.
+        other_files (list[Path]): List of other files in the directory.
+        title (str): The title of the document.
+
+    Methods:
+        __post_init__(): Initializes the main_file, other_files, and title attributes.
+        determine_main_file() -> Path: Determines the main file in the given path.
+        determine_other_files() -> list[Path]: Determines other files in the directory excluding the main file.
+        determine_title() -> str: Determines the title of the document.
+        generate() -> str: Generates the documentation content.
+    """ # noqa: E501
+    path: Path
+    category: str = None
+    main_file: Path = field(init=False)
+    other_files: list[Path] = field(init=False)
+    title: str = field(init=False)
+
+    def __post_init__(self):
+        self.main_file = self.determine_main_file()
+        self.other_files = self.determine_other_files()
+        self.title = self.determine_title()
+
+    def determine_main_file(self) -> Path:
+        """
+        Determines the main file in the given path.
+        If the path is a file, it returns the path itself. Otherwise, it searches
+        for Markdown files (*.md) in the directory and returns the first one found.
+        Returns:
+            Path: The main file path, either the original path if it's a file or the first
+            Markdown file found in the directory.
+        Raises:
+            IndexError: If no Markdown files are found in the directory.
+        """ # noqa: E501
+        return self.path if self.path.is_file() else list(
+            self.path.glob("*.md")).pop()
+
+    def determine_other_files(self) -> list[Path]:
+        """
+        Determine other files in the directory excluding the main file.
+
+        This method checks if the given path is a file. If it is, it returns an empty list.
+        Otherwise, it recursively searches through the directory and returns a list of all
+        files that are not the main file.
+
+        Returns:
+            list[Path]: A list of Path objects representing the other files in the directory.
+        """ # noqa: E501
+        if self.path.is_file():
+            return []
+        is_other_file = lambda file: file.is_file() and file != self.main_file
+        return [file for file in self.path.rglob("*") if is_other_file(file)]
+
+    def determine_title(self) -> str:
+        return fix_case(self.path.stem.replace("_", " ").title())
+
+    def generate(self) -> str:
+        # Convert the path to a relative path from __file__
+        make_relative = lambda path: ROOT_DIR_RELATIVE / path.relative_to(
+            ROOT_DIR)
+
+        content = f"Source <gh-file:{self.path.relative_to(ROOT_DIR)}>.\n\n"
+        if self.main_file.suffix == ".py":
+            content += f"# {self.title}\n\n"
+        include = "include" if self.main_file.suffix == ".md" else \
+            "literalinclude"
+        content += f":::{{{include}}} {make_relative(self.main_file)}\n:::\n\n"
+
+        if not self.other_files:
+            return content
+
+        content += "## Example materials\n\n"
+        for file in self.other_files:
+            include = "include" if file.suffix == ".md" else "literalinclude"
+            content += f":::{{admonition}} {file.relative_to(self.path)}\n"
+            content += ":class: dropdown\n\n"
+            content += f":::{{{include}}} {make_relative(file)}\n:::\n"
+            content += ":::\n\n"
+
+        return content
 
 
 def generate_examples():
-    root_dir = Path(__file__).parent.parent.parent.resolve()
-
-    # Source paths
-    script_dir = root_dir / "examples"
-    script_paths = sorted(script_dir.glob("*.py"))
-
-    # Destination paths
-    doc_dir = root_dir / "docs/source/getting_started/examples"
-    doc_paths = [doc_dir / f"{path.stem}.md" for path in script_paths]
-
-    # Generate the example docs for each example script
-    for script_path, doc_path in zip(script_paths, doc_paths):
-        # Make script_path relative to doc_path and call it include_path
-        include_path = '../../../..' / script_path.relative_to(root_dir)
-        content = (f"{generate_title(doc_path.stem)}\n\n"
-                   f"Source: <gh-file:examples/{script_path.name}>.\n\n"
-                   f"```{{literalinclude}} {include_path}\n"
-                   ":language: python\n"
-                   ":linenos:\n```")
+    # Create the EXAMPLE_DOC_DIR if it doesn't exist
+    if not EXAMPLE_DOC_DIR.exists():
+        EXAMPLE_DOC_DIR.mkdir(parents=True)
+
+    # Create empty indices
+    examples_index = Index(
+        path=EXAMPLE_DOC_DIR / "examples_index.md",
+        title="Examples",
+        description=
+        "A collection of examples demonstrating usage of vLLM.\nAll documented examples are autogenerated using <gh-file:docs/source/generate_examples.py> from examples found in <gh-file:examples>.",  # noqa: E501
+        caption="Examples",
+        maxdepth=1)  # TODO change to 2 when examples start being categorised
+    category_indices = {
+        "offline_inference":
+        Index(
+            path=EXAMPLE_DOC_DIR / "examples_offline_inference_index.md",
+            title="Offline Inference",
+            description=
+            "Offline inference examples demonstrate how to use vLLM in an offline setting, where the model is queried for predictions in batches.",  # noqa: E501
+            caption="Examples",
+        ),
+        "online_serving":
+        Index(
+            path=EXAMPLE_DOC_DIR / "examples_online_serving_index.md",
+            title="Online Serving",
+            description=
+            "Online serving examples demonstrate how to use vLLM in an online setting, where the model is queried for predictions in real-time.",  # noqa: E501
+            caption="Examples",
+        ),
+        "other":
+        Index(
+            path=EXAMPLE_DOC_DIR / "examples_other_index.md",
+            title="Other",
+            description=
+            "Other examples that don't strongly fit into the online or offline serving categories.",  # noqa: E501
+            caption="Examples",
+        ),
+    }
+
+    examples = []
+    # Find categorised examples
+    for category in category_indices:
+        category_dir = EXAMPLE_DIR / category
+        py = category_dir.glob("*.py")
+        md = category_dir.glob("*.md")
+        for path in itertools.chain(py, md):
+            examples.append(Example(path, category))
+        # Find examples in subdirectories
+        for path in category_dir.glob("*/*.md"):
+            examples.append(Example(path.parent, category))
+    # Find uncategorised examples
+    py = EXAMPLE_DIR.glob("*.py")
+    md = EXAMPLE_DIR.glob("*.md")
+    for path in itertools.chain(py, md):
+        examples.append(Example(path))
+    # Find examples in subdirectories
+    for path in EXAMPLE_DIR.glob("*/*.md"):
+        # Skip categorised examples
+        if path.parent.name in category_indices:
+            continue
+        examples.append(Example(path.parent))
+
+    # Generate the example documentation
+    for example in examples:
+        doc_path = EXAMPLE_DOC_DIR / f"{example.path.stem}.md"
         with open(doc_path, "w+") as f:
-            f.write(content)
-
-    # Generate the toctree for the example scripts
-    with open(doc_dir / "examples_index.template.md") as f:
-        examples_index = f.read()
-    with open(doc_dir / "examples_index.md", "w+") as f:
-        example_docs = "\n".join(path.stem + ".md" for path in script_paths)
-        f.write(examples_index.replace(r"%EXAMPLE_DOCS%", example_docs))
+            f.write(example.generate())
+        # Add the example to the appropriate index
+        index = category_indices.get(example.category, examples_index)
+        index.documents.append(example.path.stem)
+
+    # Generate the index files
+    for category_index in category_indices.values():
+        if category_index.documents:
+            examples_index.documents.insert(0, category_index.path.name)
+            with open(category_index.path, "w+") as f:
+                f.write(category_index.generate())
+
+    with open(examples_index.path, "w+") as f:
+        f.write(examples_index.generate())
diff --git a/docs/source/getting_started/examples/examples_index.template.md b/docs/source/getting_started/examples/examples_index.template.md
deleted file mode 100644
index de7a91c0ffa48..0000000000000
--- a/docs/source/getting_started/examples/examples_index.template.md
+++ /dev/null
@@ -1,8 +0,0 @@
-# Examples
-
-```{toctree}
-:maxdepth: 1
-:caption: Scripts
-
-%EXAMPLE_DOCS%
-```
\ No newline at end of file
diff --git a/examples/fp8/README.md b/examples/fp8/README.md
index 181c36558fcff..5492872cae93a 100644
--- a/examples/fp8/README.md
+++ b/examples/fp8/README.md
@@ -56,7 +56,7 @@ python3 examples/fp8/extract_scales.py --quantized_model <QUANTIZED_MODEL_DIR> -
 ```
 ### 4. Load KV Cache Scaling Factors into VLLM.
 This script evaluates the inference throughput of language models using various backends such as vLLM. It measures the time taken to process a given number of prompts and generate sequences for each prompt. The recently generated KV cache scaling factors are now integrated into the benchmarking process and allow for KV cache scaling factors to be utilized for FP8.
-```python
+```
 # prerequisites:
 # -  LLaMa 2 kv_cache_scales.json file
 
@@ -90,7 +90,7 @@ optional arguments:
   --kv-cache-dtype {auto,fp8} Data type for kv cache storage. If "auto", will use model data type. FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead supported ```for common inference criteria.
   --quantization-param-path QUANT_PARAM_JSON Path to the JSON file containing the KV cache scaling factors. This should generally be supplied, when KV cache dtype is FP8. Otherwise, KV cache scaling factors default to 1.0, which may cause accuracy issues. FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria.
 ```
-```
 Example:
+```console
 python3 benchmarks/benchmark_throughput.py --input-len <INPUT_LEN> --output-len <OUTPUT_LEN> -tp <TENSOR_PARALLEL_SIZE> --kv-cache-dtype fp8 --quantization-param-path <path/to/kv_cache_scales.json> --model <path-to-llama2>
-```python
+```
diff --git a/examples/production_monitoring/Otel.md b/examples/opentelemetry/Otel.md
similarity index 100%
rename from examples/production_monitoring/Otel.md
rename to examples/opentelemetry/Otel.md
diff --git a/examples/production_monitoring/dummy_client.py b/examples/opentelemetry/dummy_client.py
similarity index 100%
rename from examples/production_monitoring/dummy_client.py
rename to examples/opentelemetry/dummy_client.py
diff --git a/examples/production_monitoring/README.md b/examples/prometheus_grafana/README.md
similarity index 95%
rename from examples/production_monitoring/README.md
rename to examples/prometheus_grafana/README.md
index 807c0470e7b30..c49e5306a1cb4 100644
--- a/examples/production_monitoring/README.md
+++ b/examples/prometheus_grafana/README.md
@@ -1,4 +1,4 @@
-# vLLM + Prometheus/Grafana 
+# Prometheus and Grafana 
 
 This is a simple example that shows you how to connect vLLM metric logging to the Prometheus/Grafana stack. For this example, we launch Prometheus and Grafana via Docker. You can checkout other methods through [Prometheus](https://prometheus.io/) and [Grafana](https://grafana.com/) websites. 
 
@@ -6,7 +6,7 @@ Install:
 - [`docker`](https://docs.docker.com/engine/install/)
 - [`docker compose`](https://docs.docker.com/compose/install/linux/#install-using-the-repository)
 
-### Launch
+## Launch
 
 Prometheus metric logging is enabled by default in the OpenAI-compatible server. Launch via the entrypoint:
 ```bash
@@ -35,11 +35,11 @@ python3 ../../benchmarks/benchmark_serving.py \
 
 Navigating to [`http://localhost:8000/metrics`](http://localhost:8000/metrics) will show the raw Prometheus metrics being exposed by vLLM.
 
-### Grafana Dashboard
+## Grafana Dashboard
 
 Navigate to [`http://localhost:3000`](http://localhost:3000). Log in with the default username (`admin`) and password (`admin`).
 
-#### Add Prometheus Data Source
+### Add Prometheus Data Source
 
 Navigate to [`http://localhost:3000/connections/datasources/new`](http://localhost:3000/connections/datasources/new) and select Prometheus. 
 
@@ -47,7 +47,7 @@ On Prometheus configuration page, we need to add the `Prometheus Server URL` in
 
 Click `Save & Test`. You should get a green check saying "Successfully queried the Prometheus API.".
 
-#### Import Dashboard 
+### Import Dashboard 
 
 Navigate to [`http://localhost:3000/dashboard/import`](http://localhost:3000/dashboard/import), upload `grafana.json`, and select the `prometheus` datasource. You should see a screen that looks like the following:
 
diff --git a/examples/production_monitoring/docker-compose.yaml b/examples/prometheus_grafana/docker-compose.yaml
similarity index 100%
rename from examples/production_monitoring/docker-compose.yaml
rename to examples/prometheus_grafana/docker-compose.yaml
diff --git a/examples/production_monitoring/grafana.json b/examples/prometheus_grafana/grafana.json
similarity index 100%
rename from examples/production_monitoring/grafana.json
rename to examples/prometheus_grafana/grafana.json
diff --git a/examples/production_monitoring/prometheus.yaml b/examples/prometheus_grafana/prometheus.yaml
similarity index 100%
rename from examples/production_monitoring/prometheus.yaml
rename to examples/prometheus_grafana/prometheus.yaml

From 91445c7bc8000a6f6f1efed0882076d7001be968 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Wed, 8 Jan 2025 10:17:16 +0800
Subject: [PATCH 117/462] [Bugfix] Fix image input for Pixtral-HF (#11741)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 ...e_inference_vision_language_multi_image.py | 41 ++++++++++++++++---
 vllm/model_executor/models/llava.py           |  6 +++
 vllm/model_executor/models/pixtral.py         |  2 +-
 vllm/model_executor/models/utils.py           |  9 ++++
 4 files changed, 52 insertions(+), 6 deletions(-)

diff --git a/examples/offline_inference_vision_language_multi_image.py b/examples/offline_inference_vision_language_multi_image.py
index 6af8d7768e75d..cf2e90a325c6a 100644
--- a/examples/offline_inference_vision_language_multi_image.py
+++ b/examples/offline_inference_vision_language_multi_image.py
@@ -23,7 +23,7 @@
 class ModelRequestData(NamedTuple):
     llm: LLM
     prompt: str
-    stop_token_ids: Optional[List[str]]
+    stop_token_ids: Optional[List[int]]
     image_data: List[Image]
     chat_template: Optional[str]
 
@@ -44,12 +44,14 @@ def load_aria(question, image_urls: List[str]) -> ModelRequestData:
     prompt = (f"<|im_start|>user\n{placeholders}{question}<|im_end|>\n"
               "<|im_start|>assistant\n")
     stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519]
+
     return ModelRequestData(
         llm=llm,
         prompt=prompt,
         stop_token_ids=stop_token_ids,
         image_data=[fetch_image(url) for url in image_urls],
-        chat_template=None)
+        chat_template=None,
+    )
 
 
 def load_h2onvl(question: str, image_urls: List[str]) -> ModelRequestData:
@@ -166,7 +168,8 @@ def load_mllama(question, image_urls: List[str]) -> ModelRequestData:
         limit_mm_per_prompt={"image": len(image_urls)},
     )
 
-    prompt = f"<|image|><|image|><|begin_of_text|>{question}"
+    placeholders = "<|image|>" * len(image_urls)
+    prompt = f"{placeholders}<|begin_of_text|>{question}"
     return ModelRequestData(
         llm=llm,
         prompt=prompt,
@@ -209,6 +212,31 @@ def load_nvlm_d(question: str, image_urls: List[str]):
     )
 
 
+def load_pixtral_hf(question: str, image_urls: List[str]) -> ModelRequestData:
+    model_name = "mistral-community/pixtral-12b"
+
+    # Adjust this as necessary to fit in GPU
+    llm = LLM(
+        model=model_name,
+        max_model_len=8192,
+        max_num_seqs=2,
+        tensor_parallel_size=2,
+        limit_mm_per_prompt={"image": len(image_urls)},
+    )
+
+    placeholders = "[IMG]" * len(image_urls)
+    prompt = f"<s>[INST]{question}\n{placeholders}[/INST]"
+    stop_token_ids = None
+
+    return ModelRequestData(
+        llm=llm,
+        prompt=prompt,
+        stop_token_ids=stop_token_ids,
+        image_data=[fetch_image(url) for url in image_urls],
+        chat_template=None,
+    )
+
+
 def load_phi3v(question: str, image_urls: List[str]) -> ModelRequestData:
     # num_crops is an override kwarg to the multimodal image processor;
     # For some models, e.g., Phi-3.5-vision-instruct, it is recommended
@@ -244,7 +272,8 @@ def load_phi3v(question: str, image_urls: List[str]) -> ModelRequestData:
     )
 
 
-def load_qwenvl_chat(question: str, image_urls: List[str]) -> ModelRequestData:
+def load_qwen_vl_chat(question: str,
+                      image_urls: List[str]) -> ModelRequestData:
     model_name = "Qwen/Qwen-VL-Chat"
     llm = LLM(
         model=model_name,
@@ -274,6 +303,7 @@ def load_qwenvl_chat(question: str, image_urls: List[str]) -> ModelRequestData:
 
     stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>"]
     stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
+
     return ModelRequestData(
         llm=llm,
         prompt=prompt,
@@ -348,7 +378,8 @@ def load_qwen2_vl(question, image_urls: List[str]) -> ModelRequestData:
     "mllama": load_mllama,
     "NVLM_D": load_nvlm_d,
     "phi3_v": load_phi3v,
-    "qwen_vl_chat": load_qwenvl_chat,
+    "pixtral_hf": load_pixtral_hf,
+    "qwen_vl_chat": load_qwen_vl_chat,
     "qwen2_vl": load_qwen2_vl,
 }
 
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index 4299af8cd03a2..305f1364dba23 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -546,6 +546,12 @@ def _parse_and_validate_image_input(
                 raise ValueError("Incorrect type of pixel values. "
                                  f"Got type: {type(pixel_values)}")
 
+            if self.config.vision_config.model_type == "pixtral":
+                return LlavaImagePixelInputs(
+                    type="pixel_values",
+                    data=flatten_bn(pixel_values),
+                )
+
             return LlavaImagePixelInputs(
                 type="pixel_values",
                 data=self._validate_pixel_values(
diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py
index 9e1d38512c0b4..b74bb3c8a3f88 100644
--- a/vllm/model_executor/models/pixtral.py
+++ b/vllm/model_executor/models/pixtral.py
@@ -774,7 +774,7 @@ def get_num_image_tokens(
     ) -> int:
         return get_pixtral_hf_image_feature_size(
             image_size=self.vision_config.image_size,
-            patch_size=self.get_image_size(),
+            patch_size=self.vision_config.patch_size,
         )
 
     def get_max_image_tokens(self) -> int:
diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py
index 31017f16d3c97..4ed3b237ae0e2 100644
--- a/vllm/model_executor/models/utils.py
+++ b/vllm/model_executor/models/utils.py
@@ -281,6 +281,15 @@ def flatten_bn(
     ...
 
 
+@overload
+def flatten_bn(
+    x: Union[List[torch.Tensor], torch.Tensor],
+    *,
+    concat: bool = False,
+) -> Union[List[torch.Tensor], torch.Tensor]:
+    ...
+
+
 def flatten_bn(
     x: Union[List[torch.Tensor], torch.Tensor],
     *,

From 4d29e91be84d27ca313d657eee92c067439a4c23 Mon Sep 17 00:00:00 2001
From: Divakar Verma <137818590+divakar-amd@users.noreply.github.com>
Date: Tue, 7 Jan 2025 20:57:04 -0600
Subject: [PATCH 118/462] [Misc] sort torch profiler table by kernel timing
 (#11813)

---
 benchmarks/benchmark_latency.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index 0a14aedd5feba..e669ce4db299d 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -52,7 +52,7 @@ def run_to_completion(profile_dir: Optional[str] = None):
                 llm.generate(dummy_prompts,
                              sampling_params=sampling_params,
                              use_tqdm=False)
-            print(p.key_averages())
+            print(p.key_averages().table(sort_by="self_cuda_time_total"))
         else:
             start_time = time.perf_counter()
             llm.generate(dummy_prompts,

From dc71af0a71f347badcd917810440fad136e73ba6 Mon Sep 17 00:00:00 2001
From: WangErXiao <863579016@qq.com>
Date: Wed, 8 Jan 2025 12:09:25 +0800
Subject: [PATCH 119/462] =?UTF-8?q?Remove=20the=20duplicate=20imports=20of?=
 =?UTF-8?q?=20MultiModalKwargs=20and=20PlaceholderRange=E2=80=A6=20(#11824?=
 =?UTF-8?q?)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 vllm/v1/core/scheduler.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py
index baaf3329dc79f..b26716f5c02e6 100644
--- a/vllm/v1/core/scheduler.py
+++ b/vllm/v1/core/scheduler.py
@@ -5,8 +5,6 @@
 
 from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig
 from vllm.logger import init_logger
-from vllm.multimodal import MultiModalKwargs
-from vllm.multimodal.base import PlaceholderRange
 from vllm.sampling_params import SamplingParams
 from vllm.v1.core.encoder_cache_manager import EncoderCacheManager
 from vllm.v1.core.kv_cache_manager import KVCacheManager

From b640b19cc0babe256c5455befe95340f951763d9 Mon Sep 17 00:00:00 2001
From: Nishidha <nishidha.panpaliya@partner.ibm.com>
Date: Wed, 8 Jan 2025 10:35:37 +0530
Subject: [PATCH 120/462] Fixed docker build for ppc64le (#11518)

Signed-off-by: Nishidha Panpaliya <nishidha.panpaliya@partner.ibm.com>
---
 Dockerfile.ppc64le | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/Dockerfile.ppc64le b/Dockerfile.ppc64le
index 971248577983f..d3cd1c7b313bc 100644
--- a/Dockerfile.ppc64le
+++ b/Dockerfile.ppc64le
@@ -4,7 +4,7 @@ USER root
 
 ENV PATH="/usr/local/cargo/bin:$PATH:/opt/conda/bin/"
 
-RUN apt-get update -y && apt-get install -y git wget curl vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential ffmpeg libsm6 libxext6 libgl1 
+RUN apt-get update -y && apt-get install -y git wget curl vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential ffmpeg libsm6 libxext6 libgl1 libssl-dev 
 
 # Some packages in requirements-cpu are installed here
 # IBM provides optimized packages for ppc64le processors in the open-ce project for mamba
@@ -18,9 +18,8 @@ ARG GIT_REPO_CHECK=0
 RUN --mount=type=bind,source=.git,target=.git \
     if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi
 
-# These packages will be in rocketce eventually
 RUN --mount=type=cache,target=/root/.cache/pip  \
-    pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \
+    RUSTFLAGS='-L /opt/conda/lib' pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \
         'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
         torch==2.3.1 \
         -r requirements-cpu.txt \

From f4923cb8bce7d9c3038ad6c597ae1ff3ed90fe93 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@intel.com>
Date: Wed, 8 Jan 2025 09:08:30 +0400
Subject: [PATCH 121/462] [OpenVINO] Fixed Docker.openvino build (#11732)

Signed-off-by: Ilya Lavrenov <ilya.lavrenov@intel.com>
---
 Dockerfile.openvino | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Dockerfile.openvino b/Dockerfile.openvino
index 8bd188ffde408..32bcbfa9cc168 100644
--- a/Dockerfile.openvino
+++ b/Dockerfile.openvino
@@ -14,6 +14,7 @@ ARG GIT_REPO_CHECK=0
 RUN --mount=type=bind,source=.git,target=.git \
     if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
 
+RUN python3 -m pip install -U pip
 # install build requirements
 RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/requirements-build.txt
 # build vLLM with OpenVINO backend

From f645eb69545672d394e9e9e0ce46c725504fd2a0 Mon Sep 17 00:00:00 2001
From: Jee Jee Li <pandaleefree@gmail.com>
Date: Wed, 8 Jan 2025 13:08:48 +0800
Subject: [PATCH 122/462] [Bugfix] Add checks for LoRA and CPU offload (#11810)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
---
 vllm/config.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/vllm/config.py b/vllm/config.py
index 8b824a1fca511..a9b6d6b19127f 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -2051,6 +2051,11 @@ def __post_init__(self):
                 f"max_cpu_loras ({self.max_cpu_loras}) must be >= "
                 f"max_loras ({self.max_loras})")
 
+    def verify_with_cache_config(self, cache_config: CacheConfig):
+        # TODO LoRA supports CPU offload.
+        if cache_config.cpu_offload_gb > 0:
+            raise ValueError("CPU offload is not supported with LoRA yet.")
+
     def verify_with_model_config(self, model_config: ModelConfig):
         if self.lora_dtype in (None, "auto"):
             self.lora_dtype = model_config.dtype
@@ -3138,6 +3143,7 @@ def __post_init__(self):
             self.cache_config.verify_with_parallel_config(self.parallel_config)
 
         if self.lora_config:
+            self.lora_config.verify_with_cache_config(self.cache_config)
             self.lora_config.verify_with_model_config(self.model_config)
             self.lora_config.verify_with_scheduler_config(
                 self.scheduler_config)

From 259abd8953a8fea9abf3c4e66aa7c51391fa5b64 Mon Sep 17 00:00:00 2001
From: Simon Mo <simon.mo@hey.com>
Date: Tue, 7 Jan 2025 21:16:08 -0800
Subject: [PATCH 123/462] [Docs] reorganize sponsorship page (#11639)

Signed-off-by: simon-mo <simon.mo@hey.com>
---
 README.md                         | 15 ++++++++++-----
 docs/source/community/sponsors.md | 14 ++++++++++----
 2 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index 652268ec29cac..8e85b460363fc 100644
--- a/README.md
+++ b/README.md
@@ -90,28 +90,33 @@ vLLM is a community project. Our compute resources for development and testing a
 
 <!-- Note: Please sort them in alphabetical order. -->
 <!-- Note: Please keep these consistent with docs/source/community/sponsors.md -->
-
+Cash Donations:
 - a16z
+- Dropbox
+- Sequoia Capital
+- Skywork AI
+- ZhenFund
+
+Compute Resources:
 - AMD
 - Anyscale
 - AWS
 - Crusoe Cloud
 - Databricks
 - DeepInfra
-- Dropbox
 - Google Cloud
 - Lambda Lab
 - Nebius
+- Novita
 - NVIDIA
 - Replicate
 - Roblox
 - RunPod
-- Sequoia Capital
-- Skywork AI
 - Trainy
 - UC Berkeley
 - UC San Diego
-- ZhenFund
+
+Slack Sponsor: Anyscale
 
 We also have an official fundraising venue through [OpenCollective](https://opencollective.com/vllm). We plan to use the fund to support the development, maintenance, and adoption of vLLM.
 
diff --git a/docs/source/community/sponsors.md b/docs/source/community/sponsors.md
index c6f83b3a92ca0..3d5a57baefbde 100644
--- a/docs/source/community/sponsors.md
+++ b/docs/source/community/sponsors.md
@@ -5,26 +5,32 @@ vLLM is a community project. Our compute resources for development and testing a
 <!-- Note: Please sort them in alphabetical order. -->
 <!-- Note: Please keep these consistent with README.md. -->
 
+Cash Donations:
 - a16z
+- Dropbox
+- Sequoia Capital
+- Skywork AI
+- ZhenFund
+
+Compute Resources:
 - AMD
 - Anyscale
 - AWS
 - Crusoe Cloud
 - Databricks
 - DeepInfra
-- Dropbox
 - Google Cloud
 - Lambda Lab
 - Nebius
+- Novita
 - NVIDIA
 - Replicate
 - Roblox
 - RunPod
-- Sequoia Capital
-- Skywork AI
 - Trainy
 - UC Berkeley
 - UC San Diego
-- ZhenFund
+
+Slack Sponsor: Anyscale
 
 We also have an official fundraising venue through [OpenCollective](https://opencollective.com/vllm). We plan to use the fund to support the development, maintenance, and adoption of vLLM.

From ef68eb28d8d45be6e0defe82245e16be9362e375 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Wed, 8 Jan 2025 13:40:09 +0800
Subject: [PATCH 124/462] [Bug] Fix pickling of `ModelConfig` when RunAI Model
 Streamer is used (#11825)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 vllm/config.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/vllm/config.py b/vllm/config.py
index a9b6d6b19127f..44426489f686a 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -381,16 +381,16 @@ def maybe_pull_model_tokenizer_for_s3(self, model: str,
         """
         if is_s3(model) or is_s3(tokenizer):
             if is_s3(model):
-                self.s3_model = S3Model()
-                self.s3_model.pull_files(model, allow_pattern=["*config.json"])
+                s3_model = S3Model()
+                s3_model.pull_files(model, allow_pattern=["*config.json"])
                 self.model_weights = self.model
-                self.model = self.s3_model.dir
+                self.model = s3_model.dir
 
             if is_s3(tokenizer):
-                self.s3_tokenizer = S3Model()
-                self.s3_tokenizer.pull_files(
+                s3_tokenizer = S3Model()
+                s3_tokenizer.pull_files(
                     model, ignore_pattern=["*.pt", "*.safetensors", "*.bin"])
-                self.tokenizer = self.s3_tokenizer.dir
+                self.tokenizer = s3_tokenizer.dir
 
     def _init_multimodal_config(
         self, limit_mm_per_prompt: Optional[Mapping[str, int]]

From 889e662eae19fe8f30469883c6854ee4df4315a9 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Wed, 8 Jan 2025 14:36:03 +0800
Subject: [PATCH 125/462] [misc] improve memory profiling (#11809)

Signed-off-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
---
 tests/test_utils.py                           | 19 +++++-
 .../vllm_test_utils/__init__.py               |  3 +-
 .../vllm_test_utils/monitor.py                | 68 +++++++++++++++++++
 vllm/utils.py                                 | 12 ++--
 4 files changed, 94 insertions(+), 8 deletions(-)
 create mode 100644 tests/vllm_test_utils/vllm_test_utils/monitor.py

diff --git a/tests/test_utils.py b/tests/test_utils.py
index 32a6b0aed66aa..0285b00d73be1 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -5,6 +5,7 @@
 
 import pytest
 import torch
+from vllm_test_utils import monitor
 
 from vllm.utils import (FlexibleArgumentParser, StoreBoolean, deprecate_kwargs,
                         get_open_port, memory_profiling, merge_async_iterators,
@@ -289,8 +290,16 @@ def test_memory_profiling():
 
     weights_memory_in_bytes = 128 * 1024 * 1024 * 4 # 512 MiB
 
+    def measure_current_non_torch():
+        free, total = torch.cuda.mem_get_info()
+        current_used = total - free
+        current_torch = torch.cuda.memory_reserved()
+        current_non_torch = current_used - current_torch
+        return current_non_torch
+
     with memory_profiling(baseline_memory_in_bytes=baseline_memory_in_bytes,
-    weights_memory_in_bytes=weights_memory_in_bytes) as result:
+    weights_memory_in_bytes=weights_memory_in_bytes) as result, \
+        monitor(measure_current_non_torch) as monitored_values:
         # make a memory spike, 1 GiB
         spike = torch.randn(256, 1024, 1024, device='cuda', dtype=torch.float32)
         del spike
@@ -298,7 +307,15 @@ def test_memory_profiling():
         # Add some extra non-torch memory 256 MiB (simulate NCCL)
         handle2 = lib.cudaMalloc(256 * 1024 * 1024)
 
+    # this is an analytic value, it is exact,
+    # we only have 256 MiB non-torch memory increase
+    measured_diff = monitored_values.values[-1] - monitored_values.values[0]
+    assert measured_diff == 256 * 1024 * 1024
+
     # Check that the memory usage is within 5% of the expected values
+    # 5% tolerance is caused by PyTorch caching allocator,
+    # we cannot control PyTorch's behavior of its internal buffers,
+    # which causes a small error (<10 MiB in practice)
     non_torch_ratio = result.non_torch_increase_in_bytes / (256 * 1024 * 1024) # noqa
     torch_peak_ratio = result.torch_peak_increase_in_bytes / (1024 * 1024 * 1024) # noqa
     assert abs(non_torch_ratio - 1) <= 0.05
diff --git a/tests/vllm_test_utils/vllm_test_utils/__init__.py b/tests/vllm_test_utils/vllm_test_utils/__init__.py
index bf0b62a5b75e3..6505c81546bb0 100644
--- a/tests/vllm_test_utils/vllm_test_utils/__init__.py
+++ b/tests/vllm_test_utils/vllm_test_utils/__init__.py
@@ -4,5 +4,6 @@
 """
 
 from .blame import BlameResult, blame
+from .monitor import MonitoredValues, monitor
 
-__all__ = ["blame", "BlameResult"]
+__all__ = ["blame", "BlameResult", "monitor", "MonitoredValues"]
diff --git a/tests/vllm_test_utils/vllm_test_utils/monitor.py b/tests/vllm_test_utils/vllm_test_utils/monitor.py
new file mode 100644
index 0000000000000..a237f53a75d18
--- /dev/null
+++ b/tests/vllm_test_utils/vllm_test_utils/monitor.py
@@ -0,0 +1,68 @@
+import contextlib
+import dataclasses
+import sys
+import traceback
+from typing import Callable, Generator, Generic, TypeVar
+
+_T = TypeVar("_T")
+
+
+@dataclasses.dataclass
+class MonitoredValues(Generic[_T]):
+    values: list[_T] = dataclasses.field(default_factory=list)
+    trace_stacks: list[str] = dataclasses.field(default_factory=list)
+
+
+@contextlib.contextmanager
+def monitor(
+    measure_func: Callable[[],
+                           _T]) -> Generator[MonitoredValues[_T], None, None]:
+    """
+    Trace the function calls to continuously monitor the change of
+    a value.
+
+    Usage:
+
+    ```python
+
+    def measure_func():
+        ... # measure the current value
+        return current_value
+
+    with monitor(measure_func) as monitored_values:
+        # do something
+    
+        monitored_values.values # all changes of the values
+        monitored_values.trace_stacks # trace stacks of every change
+    ```
+    """
+    monitored_values = MonitoredValues[_T]()
+
+    def _trace_calls(frame, event, arg=None):
+        nonlocal monitored_values
+        if event in ['line']:
+            # triggered by every line of Python code.
+            # only Python functions will trigger it,
+            # c/cpp functions will not trigger it.
+            try:
+                # Temporarily disable the trace function
+                sys.settrace(None)
+                # do a measurement
+                current_value = measure_func()
+                if len(monitored_values.values
+                       ) == 0 or current_value != monitored_values.values[-1]:
+                    monitored_values.values.append(current_value)
+                    monitored_values.trace_stacks.append("".join(
+                        traceback.format_stack()))
+                # Re-enable the trace function
+                sys.settrace(_trace_calls)
+            except NameError:
+                # modules are deleted during shutdown
+                pass
+        return _trace_calls
+
+    try:
+        sys.settrace(_trace_calls)
+        yield monitored_values
+    finally:
+        sys.settrace(None)
diff --git a/vllm/utils.py b/vllm/utils.py
index 63057153f851d..2660b53d7bfb0 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -1742,10 +1742,10 @@ class MemorySnapshot:
     timestamp: float = 0.0
 
     def measure(self):
-        self.torch_peak_in_bytes = torch.cuda.memory_stats(
-        )["allocated_bytes.all.peak"]
-        self.torch_memory_in_bytes = torch.cuda.memory_stats(
-        )["allocated_bytes.all.current"]
+        self.torch_peak_in_bytes = torch.cuda.max_memory_reserved()
+        # torch.cuda.memory_reserved() is how many bytes
+        # PyTorch gets from cuda (by calling cudaMalloc, etc.)
+        self.torch_memory_in_bytes = torch.cuda.memory_reserved()
         self.timestamp = time.time()
 
     def __sub__(self, other: "MemorySnapshot") -> "MemorySnapshot":
@@ -1822,10 +1822,10 @@ def memory_profiling(
 
     The memory used for loading weights (a.) is directly given from the argument `weights_memory_in_bytes`.
 
-    The increase of ``torch.cuda.memory_stats()["allocated_bytes.all.peak"]` after profiling gives (b.).
+    The increase of `torch.cuda.memory_stats()["allocated_bytes.all.peak"]` after profiling gives (b.).
 
     (c.) is tricky. We measure the total memory used in this GPU (`torch.cuda.mem_get_info()[1] - torch.cuda.mem_get_info()[0]`),
-    subtract the baseline memory, the memory used by the model weights, and diff of `torch.cuda.memory_stats()["allocated_bytes.all.current"]`.
+    subtract the baseline memory, the memory used by the model weights, and diff of `torch.cuda.memory_reserved()`.
     """ # noqa
     torch.cuda.reset_peak_memory_stats()
 

From ad9f1aa6796297a00456e715043f3eaad55bed53 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Wed, 8 Jan 2025 14:36:49 +0800
Subject: [PATCH 126/462] [doc] update wheels url (#11830)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 docs/source/getting_started/installation/gpu-cuda.md | 4 ++--
 python_only_dev.py                                   | 2 +-
 setup.py                                             | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/source/getting_started/installation/gpu-cuda.md b/docs/source/getting_started/installation/gpu-cuda.md
index 1cd513177bf0d..419b8163fc034 100644
--- a/docs/source/getting_started/installation/gpu-cuda.md
+++ b/docs/source/getting_started/installation/gpu-cuda.md
@@ -75,7 +75,7 @@ If you want to access the wheels for previous commits (e.g. to bisect the behavi
 
 ```console
 $ export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch
-$ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
+$ pip install https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
 ```
 
 Note that the wheels are built with Python 3.8 ABI (see [PEP 425](https://peps.python.org/pep-0425/) for more details about ABI), so **they are compatible with Python 3.8 and later**. The version string in the wheel file name (`1.0.0.dev`) is just a placeholder to have a unified URL for the wheels, the actual versions of wheels are contained in the wheel metadata (the wheels listed in the extra index url have correct versions). Although we don't support Python 3.8 any more (because PyTorch 2.5 dropped support for Python 3.8), the wheels are still built with Python 3.8 ABI to keep the same wheel name as before.
@@ -126,7 +126,7 @@ $ cd vllm
 $ VLLM_USE_PRECOMPILED=1 pip install --editable .
 ```
 
-This will download the latest nightly wheel from https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl and use the compiled libraries from there in the installation.
+This will download the latest nightly wheel from https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl and use the compiled libraries from there in the installation.
 
 The `VLLM_PRECOMPILED_WHEEL_LOCATION` environment variable can be used instead of `VLLM_USE_PRECOMPILED` to specify a custom path or URL to the wheel file. For example, to use the [0.6.1.post1 PyPi wheel](https://pypi.org/project/vllm/#files):
 
diff --git a/python_only_dev.py b/python_only_dev.py
index f70b4984025b3..7d95ac96e6e4b 100644
--- a/python_only_dev.py
+++ b/python_only_dev.py
@@ -7,7 +7,7 @@
 or
 
 export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch
-export VLLM_PRECOMPILED_WHEEL_LOCATION=https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
+export VLLM_PRECOMPILED_WHEEL_LOCATION=https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
 pip install -e .
 """ # noqa
 
diff --git a/setup.py b/setup.py
index ba6953dbdc174..ef9f4e579e84d 100644
--- a/setup.py
+++ b/setup.py
@@ -252,7 +252,7 @@ def run(self):
 
 class repackage_wheel(build_ext):
     """Extracts libraries and other files from an existing wheel."""
-    default_wheel = "https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
+    default_wheel = "https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
 
     def run(self) -> None:
         wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION",

From a1b2b8606e75ab8fbc066e7f0fae20c1e60244ca Mon Sep 17 00:00:00 2001
From: Simon Mo <simon.mo@hey.com>
Date: Tue, 7 Jan 2025 23:05:46 -0800
Subject: [PATCH 127/462] [Docs] Update sponsor name: 'Novita' to 'Novita AI'
 (#11833)

---
 README.md                         | 2 +-
 docs/source/community/sponsors.md | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 8e85b460363fc..1f82229f39537 100644
--- a/README.md
+++ b/README.md
@@ -107,7 +107,7 @@ Compute Resources:
 - Google Cloud
 - Lambda Lab
 - Nebius
-- Novita
+- Novita AI
 - NVIDIA
 - Replicate
 - Roblox
diff --git a/docs/source/community/sponsors.md b/docs/source/community/sponsors.md
index 3d5a57baefbde..9d2af4c13b088 100644
--- a/docs/source/community/sponsors.md
+++ b/docs/source/community/sponsors.md
@@ -22,7 +22,7 @@ Compute Resources:
 - Google Cloud
 - Lambda Lab
 - Nebius
-- Novita
+- Novita AI
 - NVIDIA
 - Replicate
 - Roblox

From cfd3219f5881e2abea1f7c9d2866ed1838c5057b Mon Sep 17 00:00:00 2001
From: Wallas Henrique <wallashss@users.noreply.github.com>
Date: Wed, 8 Jan 2025 05:35:49 -0300
Subject: [PATCH 128/462] [Hardware][Apple] Native support for macOS Apple
 Silicon (#11696)

Signed-off-by: Wallas Santos <wallashss@ibm.com>
Co-authored-by: Michael Goin <michael@neuralmagic.com>
---
 cmake/cpu_extension.cmake                     | 61 ++++++++++++++-----
 csrc/cpu/cpu_types_arm.hpp                    | 61 ++++++++++++++++++-
 csrc/cpu/utils.cpp                            | 23 +++++--
 .../getting_started/installation/cpu-apple.md | 51 ++++++++++++++++
 .../getting_started/installation/cpu-arm.md   |  4 +-
 .../getting_started/installation/index.md     |  1 +
 requirements-cpu.txt                          |  6 +-
 setup.py                                      |  9 ++-
 vllm/config.py                                | 12 ++++
 vllm/entrypoints/openai/api_server.py         |  3 +
 vllm/utils.py                                 |  7 +++
 11 files changed, 209 insertions(+), 29 deletions(-)
 create mode 100644 docs/source/getting_started/installation/cpu-apple.md

diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake
index 68f7ca1af05ad..714abca2a5ff7 100644
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@@ -4,6 +4,11 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_CXX_EXTENSIONS ON)
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 
+if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
+    set(MACOSX_FOUND TRUE)
+endif()
+
+
 #
 # Define environment variables for special configurations
 #
@@ -13,6 +18,9 @@ endif()
 
 include_directories("${CMAKE_SOURCE_DIR}/csrc")
 
+
+set (ENABLE_NUMA TRUE)
+
 #
 # Check the compile flags
 #
@@ -22,18 +30,28 @@ if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64")
         "-mf16c"
     )
 endif()
-list(APPEND CXX_COMPILE_FLAGS
-    "-fopenmp"
-    "-DVLLM_CPU_EXTENSION")
 
-execute_process(COMMAND cat /proc/cpuinfo
-                RESULT_VARIABLE CPUINFO_RET
-                OUTPUT_VARIABLE CPUINFO)
+if(MACOSX_FOUND)
+    list(APPEND CXX_COMPILE_FLAGS
+        "-Xpreprocessor"
+        "-fopenmp"
+        "-DVLLM_CPU_EXTENSION")
+else()
+    list(APPEND CXX_COMPILE_FLAGS
+        "-fopenmp"
+        "-DVLLM_CPU_EXTENSION")
+endif()
 
-if (NOT CPUINFO_RET EQUAL 0)
-    message(FATAL_ERROR "Failed to check CPU features via /proc/cpuinfo")
+if (NOT MACOSX_FOUND)
+    execute_process(COMMAND cat /proc/cpuinfo
+                    RESULT_VARIABLE CPUINFO_RET
+                    OUTPUT_VARIABLE CPUINFO)
+    if (NOT CPUINFO_RET EQUAL 0)
+        message(FATAL_ERROR "Failed to check CPU features via /proc/cpuinfo")
+    endif()
 endif()
 
+
 function (find_isa CPUINFO TARGET OUT)
     string(FIND ${CPUINFO} ${TARGET} ISA_FOUND)
     if(NOT ISA_FOUND EQUAL -1)
@@ -54,12 +72,17 @@ endfunction()
 
 is_avx512_disabled(AVX512_DISABLED)
 
-find_isa(${CPUINFO} "avx2" AVX2_FOUND)
-find_isa(${CPUINFO} "avx512f" AVX512_FOUND)
-find_isa(${CPUINFO} "POWER10" POWER10_FOUND)
-find_isa(${CPUINFO} "POWER9" POWER9_FOUND)
-find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support
-find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support
+if (MACOSX_FOUND AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64")
+    set(APPLE_SILICON_FOUND TRUE)
+else()
+    find_isa(${CPUINFO} "avx2" AVX2_FOUND)
+    find_isa(${CPUINFO} "avx512f" AVX512_FOUND)
+    find_isa(${CPUINFO} "POWER10" POWER10_FOUND)
+    find_isa(${CPUINFO} "POWER9" POWER9_FOUND)
+    find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support
+    find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support
+endif()
+
 
 if (AVX512_FOUND AND NOT AVX512_DISABLED)
     list(APPEND CXX_COMPILE_FLAGS
@@ -103,6 +126,9 @@ elseif (ASIMD_FOUND)
         set(MARCH_FLAGS "-march=armv8.2-a+dotprod+fp16")  
     endif()
     list(APPEND CXX_COMPILE_FLAGS ${MARCH_FLAGS})     
+elseif(APPLE_SILICON_FOUND)
+    message(STATUS "Apple Silicon Detected")
+    set(ENABLE_NUMA OFF)
 else()
     message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA or ARMv8 support.")
 endif()
@@ -139,7 +165,12 @@ endif()
 
 message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")
 
-list(APPEND LIBS numa)
+if(ENABLE_NUMA)
+    list(APPEND LIBS numa)
+else()
+    message(STATUS "NUMA is disabled")
+    add_compile_definitions(-DVLLM_NUMA_DISABLED)
+endif()
 
 #
 # _C extension
diff --git a/csrc/cpu/cpu_types_arm.hpp b/csrc/cpu/cpu_types_arm.hpp
index 73e0f8cb2e0fb..ae062a5b86892 100644
--- a/csrc/cpu/cpu_types_arm.hpp
+++ b/csrc/cpu/cpu_types_arm.hpp
@@ -91,11 +91,68 @@ struct FP16Vec16 : public Vec<FP16Vec16> {
                 vst1q_f16(reinterpret_cast<__fp16*>(ptr) + 8, reg.val[1]);
             }
         }
+
+        // Note: below is the unrolled version of the following code:
+        // 
+        // for (int i = 0; i < remainder; ++i) {
+        //     reinterpret_cast<__fp16*>(ptr)[full_blocks * 8 + i] = 
+        //          vgetq_lane_f16(temp, i);
+        // }
+        // 
+        // For macOS build (Clang), the arm/neon intrinsics function 
+        // `vgetq_lane_f16` needs the parameter `i` to be constant at compile 
+        // time. 
         
         if (remainder > 0) {
             float16x8_t temp = reg.val[full_blocks];
-            for (int i = 0; i < remainder; ++i) {
-                reinterpret_cast<__fp16*>(ptr)[full_blocks * 8 + i] = vgetq_lane_f16(temp, i);
+            __fp16* fp16_ptr = reinterpret_cast<__fp16*>(ptr);
+            switch (remainder)
+            {
+            case 1:
+              fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0);
+              break;
+            case 2:
+              fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0);
+              fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1);
+              break;
+            case 3:
+              fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0);
+              fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1);
+              fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2);
+              break;
+            case 4:
+              fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0);
+              fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1);
+              fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2);
+              fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3);
+              break;
+            case 5:
+              fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0);
+              fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1);
+              fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2);
+              fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3);
+              fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4);
+              break;
+            case 6:
+              fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0);
+              fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1);
+              fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2);
+              fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3);
+              fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4);
+              fp16_ptr[full_blocks * 8 + 5] = vgetq_lane_f16(temp, 5);
+              break;
+            case 7:
+              fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0);
+              fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1);
+              fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2);
+              fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3);
+              fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4);
+              fp16_ptr[full_blocks * 8 + 5] = vgetq_lane_f16(temp, 5);
+              fp16_ptr[full_blocks * 8 + 6] = vgetq_lane_f16(temp, 6);
+              break;
+            
+            default:
+              break;
             }
         }
     }
diff --git a/csrc/cpu/utils.cpp b/csrc/cpu/utils.cpp
index 1138a55df2f05..42a1c1d924bac 100644
--- a/csrc/cpu/utils.cpp
+++ b/csrc/cpu/utils.cpp
@@ -1,10 +1,22 @@
-#include <numa.h>
-#include <unistd.h>
-#include <string>
-#include <sched.h>
+#ifndef VLLM_NUMA_DISABLED
+  #include <numa.h>
+  #include <unistd.h>
+  #include <string>
+  #include <sched.h>
+#endif
 
 #include "cpu_types.hpp"
 
+#ifdef VLLM_NUMA_DISABLED
+std::string init_cpu_threads_env(const std::string& cpu_ids) {
+  return std::string(
+      "Warning: NUMA is not enabled in this build. `init_cpu_threads_env` has "
+      "no effect to setup thread affinity.");
+}
+
+#endif
+
+#ifndef VLLM_NUMA_DISABLED
 std::string init_cpu_threads_env(const std::string& cpu_ids) {
   bitmask* omp_cpu_mask = numa_parse_cpustring(cpu_ids.c_str());
   TORCH_CHECK(omp_cpu_mask->size > 0);
@@ -57,7 +69,7 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {
   omp_lock_t writelock;
   omp_init_lock(&writelock);
 
-#pragma omp parallel for schedule(static, 1)
+  #pragma omp parallel for schedule(static, 1)
   for (size_t i = 0; i < omp_cpu_ids.size(); ++i) {
     cpu_set_t mask;
     CPU_ZERO(&mask);
@@ -88,3 +100,4 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {
 
   return ss.str();
 }
+#endif
\ No newline at end of file
diff --git a/docs/source/getting_started/installation/cpu-apple.md b/docs/source/getting_started/installation/cpu-apple.md
new file mode 100644
index 0000000000000..b55e4384d064d
--- /dev/null
+++ b/docs/source/getting_started/installation/cpu-apple.md
@@ -0,0 +1,51 @@
+(installation-apple)=
+
+# Installation for macOS
+
+vLLM has experimental support for macOS with Apple Silicon. For now, users shall build from the source vLLM to natively run on macOS. For more details, like running on vLLM in a docker container, see [ARM CPU Documentation](installation-arm)
+
+Currently the CPU implementation for macOS supports FP32 and FP16 datatypes.
+
+## Requirements
+
+- **Operating System**: `macOS Sonoma` or later
+- **SDK** `XCode 15.4` or later with Command Line Tools
+- **Compilers**: `Apple Clang >= 15.0.0`
+
+<!-- (arm-backend-quick-start-dockerfile)= -->
+
+## Build and installation
+
+After installation of XCode and the Command Line Tools, which include Apple Clang, execute the following commands to build and install vLLM from the source.
+
+```
+$ git clone https://github.com/vllm-project/vllm.git
+$ cd vllm
+$ pip install -r requirements-cpu.txt
+$ pip install -e . 
+```
+
+```{note}
+On macOS the `VLLM_TARGET_DEVICE` is automatically set to `cpu`, which currently is the only supported device.
+```
+
+
+
+## Troubleshooting
+
+If the build has error like the following snippet where standard C++ headers cannot be found, try to remove and reinstall your 
+[Command Line Tools for Xcode](https://developer.apple.com/download/all/).
+
+```
+[...] fatal error: 'map' file not found
+          1 | #include <map>
+            |          ^~~~~
+      1 error generated.
+      [2/8] Building CXX object CMakeFiles/_C.dir/csrc/cpu/pos_encoding.cpp.o
+
+[...] fatal error: 'cstddef' file not found
+         10 | #include <cstddef>
+            |          ^~~~~~~~~
+      1 error generated.
+```
+
diff --git a/docs/source/getting_started/installation/cpu-arm.md b/docs/source/getting_started/installation/cpu-arm.md
index a46e2c010600d..e199073ed721f 100644
--- a/docs/source/getting_started/installation/cpu-arm.md
+++ b/docs/source/getting_started/installation/cpu-arm.md
@@ -2,7 +2,7 @@
 
 # Installation for ARM CPUs
 
-vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CPU backend initially developed for the x86 platform. This guide provides installation instructions specific to ARM. For additional details on supported features, refer to the [x86 CPU documentation](#installation-x86) covering:
+vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CPU backend initially developed for the x86 platform. This guide provides installation instructions specific to ARM (which also apply to Apple Silicon, see [Installation for macOS](#installation-apple) for more). For additional details on supported features, refer to the [x86 CPU documentation](#installation-x86) covering:
 
 - CPU backend inference capabilities
 - Relevant runtime environment variables
@@ -20,7 +20,7 @@ Contents:
 ## Requirements
 
 - **Operating System**: Linux or macOS
-- **Compiler**: `gcc/g++ >= 12.3.0` (optional, but recommended)
+- **Compilers**: `gcc/g++ >= 12.3.0` (optional, but recommended) or `Apple Clang >= 15.0.0` for macOS
 - **Instruction Set Architecture (ISA)**: NEON support is required
 
 (arm-backend-quick-start-dockerfile)=
diff --git a/docs/source/getting_started/installation/index.md b/docs/source/getting_started/installation/index.md
index 83de1aff409b2..0ebadca2ccec9 100644
--- a/docs/source/getting_started/installation/index.md
+++ b/docs/source/getting_started/installation/index.md
@@ -11,6 +11,7 @@ gpu-cuda
 gpu-rocm
 cpu-x86
 cpu-arm
+cpu-apple
 hpu-gaudi
 tpu
 xpu
diff --git a/requirements-cpu.txt b/requirements-cpu.txt
index e62f313297762..056fbf5a7adec 100644
--- a/requirements-cpu.txt
+++ b/requirements-cpu.txt
@@ -2,7 +2,7 @@
 -r requirements-common.txt
 
 # Dependencies for CPUs
-torch==2.5.1+cpu; platform_machine != "ppc64le" and platform_machine != "aarch64" 
-torch==2.5.1; platform_machine == "aarch64"
+torch==2.5.1+cpu; platform_machine != "ppc64le" and platform_machine != "aarch64" and platform_system != "Darwin"
+torch==2.5.1; platform_machine == "aarch64" or platform_system == "Darwin" 
 torchvision; platform_machine != "ppc64le"  # required for the image processor of phi3v, this must be updated alongside torch
-datasets # for benchmark scripts
\ No newline at end of file
+datasets # for benchmark scripts
diff --git a/setup.py b/setup.py
index ef9f4e579e84d..b6c1f5bc8ac3f 100644
--- a/setup.py
+++ b/setup.py
@@ -34,9 +34,14 @@ def load_module_from_path(module_name, path):
 
 VLLM_TARGET_DEVICE = envs.VLLM_TARGET_DEVICE
 
-if not sys.platform.startswith("linux"):
+if sys.platform.startswith("darwin") and VLLM_TARGET_DEVICE != "cpu":
     logger.warning(
-        "vLLM only supports Linux platform (including WSL). "
+        "VLLM_TARGET_DEVICE automatically set to `cpu` due to macOS")
+    VLLM_TARGET_DEVICE = "cpu"
+elif not (sys.platform.startswith("linux")
+          or sys.platform.startswith("darwin")):
+    logger.warning(
+        "vLLM only supports Linux platform (including WSL) and MacOS."
         "Building on %s, "
         "so vLLM may not be able to run correctly", sys.platform)
     VLLM_TARGET_DEVICE = "empty"
diff --git a/vllm/config.py b/vllm/config.py
index 44426489f686a..535cbe97a311a 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -4,6 +4,7 @@
 import hashlib
 import json
 import os
+import sys
 import warnings
 from contextlib import contextmanager
 from dataclasses import dataclass, field, replace
@@ -2259,6 +2260,17 @@ def _get_and_verify_dtype(
                     "supported for POWERPC.")
                 torch_dtype = torch.bfloat16
 
+            # TODO: change this condition to check if the platform support bf16
+            # instead of checking the OS. For instance M2 shall supports bf16
+            # already. But we need to modify `cpu_extension.cmake` to activate
+            # the feature in the build.
+            if (current_platform.is_cpu() and sys.platform.startswith("darwin")
+                    and current_platform.get_cpu_architecture()
+                    == CpuArchEnum.ARM and config_dtype == torch.bfloat16):
+                logger.info("For macOS with Apple Silicon, currently bfloat16 "
+                            "is not supported. Setting dtype to float16.")
+                torch_dtype = torch.float16
+
             if current_platform.is_hpu() and config_dtype == torch.float16:
                 logger.info(
                     "For HPU, we cast models to bfloat16 instead of"
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 047f699e4f277..bc1471e1f534d 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -7,6 +7,7 @@
 import re
 import signal
 import socket
+import sys
 import tempfile
 import uuid
 from argparse import Namespace
@@ -805,6 +806,8 @@ def signal_handler(*_) -> None:
             ssl_certfile=args.ssl_certfile,
             ssl_ca_certs=args.ssl_ca_certs,
             ssl_cert_reqs=args.ssl_cert_reqs,
+            # Workaround to work on macOS
+            fd=sock.fileno() if sys.platform.startswith("darwin") else None,
             **uvicorn_kwargs,
         )
 
diff --git a/vllm/utils.py b/vllm/utils.py
index 2660b53d7bfb0..c09cae70e9af8 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -524,6 +524,13 @@ def get_open_port() -> int:
 
 
 def find_process_using_port(port: int) -> Optional[psutil.Process]:
+    # TODO: We can not check for running processes with network
+    # port on macOS. Therefore, we can not have a full graceful shutdown
+    # of vLLM. For now, let's not look for processes in this case.
+    # Ref: https://www.florianreinhard.de/accessdenied-in-psutil/
+    if sys.platform.startswith("darwin"):
+        return None
+
     for conn in psutil.net_connections():
         if conn.laddr.port == port:
             try:

From f12141170a95ad866b3c55762623bc718994e1d7 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Wed, 8 Jan 2025 18:46:43 +0800
Subject: [PATCH 129/462] [torch.compile] consider relevant code in compilation
 cache (#11614)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 vllm/compilation/backends.py   | 70 ++++++++++++++++++++++++++++++----
 vllm/compilation/decorators.py | 28 +++++++++++++-
 vllm/config.py                 | 29 ++------------
 vllm/sequence.py               |  7 ++++
 4 files changed, 99 insertions(+), 35 deletions(-)

diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
index a8dd628b9cd6f..87655530cead4 100644
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -145,6 +145,7 @@ def wrap_inductor(graph: fx.GraphModule,
                   example_inputs,
                   additional_inductor_config,
                   compilation_config: CompilationConfig,
+                  vllm_backend: "VllmBackend",
                   graph_index: int = 0,
                   num_graphs: int = 1,
                   runtime_shape: Optional[int] = None,
@@ -176,7 +177,7 @@ def wrap_inductor(graph: fx.GraphModule,
     # see https://github.com/pytorch/pytorch/issues/138980
     graph = copy.deepcopy(graph)
 
-    cache_data = compilation_config.inductor_hash_cache
+    cache_data = vllm_backend.inductor_hash_cache
     if (runtime_shape, graph_index) in cache_data:
         # we compiled this graph before
         # so we can directly lookup the compiled graph via hash
@@ -196,7 +197,7 @@ def wrap_inductor(graph: fx.GraphModule,
                 hash_str, example_inputs, True, False)
             assert inductor_compiled_graph is not None, (
                 "Inductor cache lookup failed. Please remove"
-                f"the cache file {compilation_config.inductor_hash_cache.cache_file_path} and try again."  # noqa
+                f"the cache file {cache_data.cache_file_path} and try again."  # noqa
             )
 
         # Inductor calling convention (function signature):
@@ -354,7 +355,7 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter):
 
     def __init__(self, module: torch.fx.GraphModule,
                  compile_submod_names: List[str], vllm_config: VllmConfig,
-                 graph_pool):
+                 graph_pool, vllm_backend: "VllmBackend"):
         super().__init__(module)
         from torch._guards import detect_fake_mode
         self.fake_mode = detect_fake_mode()
@@ -362,6 +363,7 @@ def __init__(self, module: torch.fx.GraphModule,
         self.compilation_config = vllm_config.compilation_config
         self.graph_pool = graph_pool
         self.vllm_config = vllm_config
+        self.vllm_backend = vllm_backend
 
     def run(self, *args):
         fake_args = [
@@ -389,6 +391,7 @@ def call_module(self, target: torch.fx.node.Target,
                 args,
                 self.compilation_config.inductor_compile_config,
                 self.compilation_config,
+                self.vllm_backend,
                 graph_index=index,
                 num_graphs=len(self.compile_submod_names),
                 runtime_shape=None,
@@ -397,7 +400,7 @@ def call_module(self, target: torch.fx.node.Target,
             self.module.__dict__[target] = PiecewiseBackend(
                 submod, self.vllm_config, self.graph_pool, index,
                 len(self.compile_submod_names), sym_shape_indices,
-                compiled_graph_for_general_shape)
+                compiled_graph_for_general_shape, self.vllm_backend)
 
             compilation_counter.num_piecewise_capturable_graphs_seen += 1
 
@@ -430,6 +433,7 @@ class VllmBackend:
     post_grad_passes: Sequence[Callable]
     sym_tensor_indices: List[int]
     input_buffers: List[torch.Tensor]
+    inductor_hash_cache: InductorHashCache
 
     def __init__(
         self,
@@ -472,6 +476,53 @@ def configure_post_pass(self):
 
     def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable:
 
+        if not self.compilation_config.cache_dir:
+            # no provided cache dir, generate one based on the known factors
+            # that affects the compilation. if none of the factors change,
+            # the cache dir will be the same so that we can reuse the compiled
+            # graph.
+
+            # 1. factors come from the vllm_config (it mainly summarizes how the
+            #    model is created)
+            vllm_config = self.vllm_config
+            config_hash = vllm_config.compute_hash()
+
+            # 2. factors come from the code files that are traced by Dynamo (
+            #    it mainly summarizes how the model is used in forward pass)
+            forward_code_files = list(
+                sorted(self.compilation_config.traced_files))
+            self.compilation_config.traced_files.clear()
+            logger.debug(
+                "Traced files (to be considered for compilation cache):\n%s",
+                "\n".join(forward_code_files))
+            hash_content = []
+            for filepath in forward_code_files:
+                hash_content.append(filepath)
+                with open(filepath) as f:
+                    hash_content.append(f.read())
+            import hashlib
+            code_hash = hashlib.md5(
+                "\n".join(hash_content).encode()).hexdigest()
+
+            # combine the two hashes to generate the cache dir
+            hash_key = hashlib.md5(
+                f"{config_hash}_{code_hash}".encode()).hexdigest()[:10]
+            cache_dir = os.path.join(
+                envs.VLLM_CACHE_ROOT, "torch_compile_cache", hash_key,
+                f"rank_{vllm_config.parallel_config.rank}")
+        else:
+            cache_dir = self.compilation_config.cache_dir
+        os.makedirs(cache_dir, exist_ok=True)
+
+        disabled = envs.VLLM_DISABLE_COMPILE_CACHE
+        self.inductor_hash_cache: InductorHashCache = InductorHashCache(
+            cache_dir, disabled=disabled)
+        if disabled:
+            logger.info("vLLM's torch.compile cache is disabled.")
+        else:
+            logger.info("Using cache directory: %s for vLLM's torch.compile",
+                        cache_dir)
+
         # when dynamo calls the backend, it means the bytecode
         # transform and analysis are done
         compilation_counter.num_graphs_seen += 1
@@ -507,8 +558,8 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable:
         # propagate the split graph to the piecewise backend,
         # compile submodules with symbolic shapes
         PiecewiseCompileInterpreter(self.split_gm, submod_names_to_compile,
-                                    self.vllm_config,
-                                    self.graph_pool).run(*example_inputs)
+                                    self.vllm_config, self.graph_pool,
+                                    self).run(*example_inputs)
 
         self._called = True
 
@@ -577,7 +628,8 @@ class PiecewiseBackend:
     def __init__(self, graph: fx.GraphModule, vllm_config: VllmConfig,
                  graph_pool: Any, piecewise_compile_index: int,
                  total_piecewise_compiles: int, sym_shape_indices: List[int],
-                 compiled_graph_for_general_shape: Callable):
+                 compiled_graph_for_general_shape: Callable,
+                 vllm_backend: VllmBackend):
         """
         The backend for piecewise compilation.
         It mainly handles the compilation and cudagraph capturing.
@@ -597,6 +649,7 @@ def __init__(self, graph: fx.GraphModule, vllm_config: VllmConfig,
         self.graph_pool = graph_pool
         self.piecewise_compile_index = piecewise_compile_index
         self.total_piecewise_compiles = total_piecewise_compiles
+        self.vllm_backend = vllm_backend
 
         self.is_first_graph = piecewise_compile_index == 0
         self.is_last_graph = (
@@ -634,7 +687,7 @@ def check_for_ending_compilation(self):
         if self.is_last_graph and not self.to_be_compiled_sizes:
             # no specific sizes to compile
             # save the hash of the inductor graph for the next run
-            self.compilation_config.inductor_hash_cache.save_to_file()
+            self.vllm_backend.inductor_hash_cache.save_to_file()
             end_monitoring_torch_compile(self.vllm_config)
 
     def __call__(self, *args) -> Any:
@@ -662,6 +715,7 @@ def __call__(self, *args) -> Any:
                 args,
                 self.compilation_config.inductor_compile_config,
                 self.compilation_config,
+                self.vllm_backend,
                 graph_index=self.piecewise_compile_index,
                 num_graphs=self.total_piecewise_compiles,
                 runtime_shape=runtime_shape,
diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py
index 805a217ee6ca1..10513111ea7f1 100644
--- a/vllm/compilation/decorators.py
+++ b/vllm/compilation/decorators.py
@@ -1,8 +1,10 @@
 import inspect
 from typing import Callable, Dict, List, Optional, TypeVar, Union, overload
+from unittest.mock import patch
 
 import torch
 import torch.nn as nn
+from torch._dynamo.symbolic_convert import InliningInstructionTranslator
 
 from vllm.compilation.counter import compilation_counter
 from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher
@@ -196,7 +198,31 @@ def __call__(self, *args, **kwargs):
             # we need to control all the compilation of the model.
             torch._dynamo.eval_frame.remove_from_cache(
                 self.original_code_object)
-            return self.compiled_callable(*args, **kwargs)
+
+            # collect all relevant files traced by Dynamo,
+            # so that the compilation cache can trigger re-compilation
+            # properly when any of these files change.
+
+            # 1. the file containing the top-level forward function
+            self.vllm_config.compilation_config.traced_files.add(
+                self.original_code_object.co_filename)
+
+            # 2. every time Dynamo sees a function call, it will inline
+            # the function by calling InliningInstructionTranslator.inline_call
+            # we hijack this function to know all the functions called
+            # during Dynamo tracing, and their corresponding files
+            inline_call = InliningInstructionTranslator.inline_call
+
+            def patched_inline_call(parent, func, args, kwargs):
+                code = func.get_code()
+                self.vllm_config.compilation_config.traced_files.add(
+                    code.co_filename)
+                return inline_call(parent, func, args, kwargs)
+
+            with patch.object(InliningInstructionTranslator, 'inline_call',
+                              patched_inline_call):
+                output = self.compiled_callable(*args, **kwargs)
+            return output
 
         # usually, capturing the model once is enough, and then we can
         # dispatch to the compiled code directly, without going through
diff --git a/vllm/config.py b/vllm/config.py
index 535cbe97a311a..6dabeb3861af2 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -3,7 +3,6 @@
 import enum
 import hashlib
 import json
-import os
 import sys
 import warnings
 from contextlib import contextmanager
@@ -2778,9 +2777,8 @@ def model_post_init(self, __context: Any) -> None:
     # keep track of enabled and disabled custom ops
     enabled_custom_ops: Counter[str] = PrivateAttr
     disabled_custom_ops: Counter[str] = PrivateAttr
+    traced_files: Set[str] = PrivateAttr
     compilation_time: float = PrivateAttr
-    # should be InductorHashCache, but Pydantic does not support it
-    inductor_hash_cache: Any = PrivateAttr
 
     # Per-model forward context
     # Mainly used to store attention cls
@@ -2818,6 +2816,7 @@ def __repr__(self) -> str:
             "compilation_time",
             "bs_to_padded_graph_size",
             "pass_config",
+            "traced_files",
         }
         return self.model_dump_json(exclude=exclude, exclude_unset=True)
 
@@ -2877,6 +2876,7 @@ def model_post_init(self, __context: Any) -> None:
 
         self.enabled_custom_ops = Counter()
         self.disabled_custom_ops = Counter()
+        self.traced_files = set()
         self.static_forward_context = {}
         self.compilation_time = 0.0
 
@@ -2899,29 +2899,6 @@ def init_backend(self, vllm_config: "VllmConfig") -> Union[str, Callable]:
         # merge with the config use_inductor
         assert self.level == CompilationLevel.PIECEWISE
 
-        if not self.cache_dir:
-            # no provided cache dir, generate one based on the known factors
-            # that affects the compilation. if none of the factors change,
-            # the cache dir will be the same so that we can reuse the compiled
-            # graph.
-            hash_key = vllm_config.compute_hash()
-            cache_dir = os.path.join(
-                envs.VLLM_CACHE_ROOT, "torch_compile_cache", hash_key,
-                f"rank_{vllm_config.parallel_config.rank}")
-            os.makedirs(cache_dir, exist_ok=True)
-            self.cache_dir = cache_dir
-
-            disabled = envs.VLLM_DISABLE_COMPILE_CACHE
-            from vllm.compilation.backends import InductorHashCache
-            self.inductor_hash_cache: InductorHashCache = InductorHashCache(
-                self.cache_dir, disabled=disabled)
-            if disabled:
-                logger.info("vLLM's torch.compile cache is disabled.")
-            else:
-                logger.info(
-                    "Using cache directory: %s for vLLM's torch.compile",
-                    self.cache_dir)
-
         from vllm.compilation.backends import VllmBackend
         return VllmBackend(vllm_config)
 
diff --git a/vllm/sequence.py b/vllm/sequence.py
index 0157abbd2eed5..5857f656dfc10 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -1108,6 +1108,13 @@ class IntermediateTensors:
 
     tensors: Dict[str, torch.Tensor]
 
+    def __init__(self, tensors):
+        # manually define this function, so that
+        # Dynamo knows `IntermediateTensors()` comes from this file.
+        # Otherwise, dataclass will generate this function by evaluating
+        # a string, and we will lose the information about the source file.
+        self.tensors = tensors
+
     def __getitem__(self, key: Union[str, slice]):
         if isinstance(key, str):
             return self.tensors[key]

From 2a0596bc480bb835dc05a30f5e708ecbfffbcd69 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Wed, 8 Jan 2025 18:59:58 +0800
Subject: [PATCH 130/462] [VLM] Reorganize profiling/processing-related code
 (#11812)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 .../processing/test_llava_next.py             |  41 ++--
 .../processing/test_llava_onevision.py        |  41 ++--
 .../vision_language/processing/test_phi3v.py  |  24 +-
 .../processing/test_qwen2_vl.py               |  22 +-
 tests/multimodal/test_processing.py           |  52 ++---
 .../vllm_add_dummy_model/my_llava.py          |  10 +-
 vllm/inputs/preprocess.py                     |   2 +-
 vllm/inputs/registry.py                       |   4 +-
 vllm/model_executor/models/aria.py            |  47 ++--
 vllm/model_executor/models/blip2.py           |  39 ++--
 vllm/model_executor/models/chameleon.py       |  47 ++--
 vllm/model_executor/models/fuyu.py            |  80 +++----
 vllm/model_executor/models/llava.py           | 175 +++++++-------
 vllm/model_executor/models/llava_next.py      |  55 +++--
 .../model_executor/models/llava_next_video.py | 104 +++++----
 vllm/model_executor/models/llava_onevision.py | 115 +++++----
 vllm/model_executor/models/phi3v.py           |  83 +++----
 vllm/model_executor/models/qwen2_audio.py     |  49 ++--
 vllm/model_executor/models/qwen2_vl.py        | 113 +++++----
 vllm/model_executor/models/ultravox.py        |  46 ++--
 vllm/multimodal/processing.py                 | 219 +++++++-----------
 vllm/multimodal/profiling.py                  | 152 +++++++++---
 vllm/multimodal/registry.py                   |  73 +++++-
 23 files changed, 833 insertions(+), 760 deletions(-)

diff --git a/tests/models/decoder_only/vision_language/processing/test_llava_next.py b/tests/models/decoder_only/vision_language/processing/test_llava_next.py
index 9fa6a8a10a0f9..689d17be81889 100644
--- a/tests/models/decoder_only/vision_language/processing/test_llava_next.py
+++ b/tests/models/decoder_only/vision_language/processing/test_llava_next.py
@@ -4,24 +4,17 @@
 import pytest
 from PIL import Image
 from pqdm.threads import pqdm
-from transformers import AutoTokenizer
 
-from vllm.inputs import InputProcessingContext
+from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.parse import ImageSize
+from vllm.multimodal.processing import BaseMultiModalProcessor
+from vllm.multimodal.utils import cached_get_tokenizer
 
 from ....utils import build_model_context
 
 
-# Fixtures lazy import to avoid initializing CUDA during test collection
-@pytest.fixture()
-def processor_for_llava_next():
-    from vllm.model_executor.models.llava_next import (
-        LlavaNextMultiModalProcessor)
-    return LlavaNextMultiModalProcessor
-
-
 def _validate_image_prompt_replacements_one(
-    processor,
+    processor: BaseMultiModalProcessor,
     num_imgs: int,
     failed_size_excs: list[tuple[ImageSize, Exception]],
     image_size: ImageSize,
@@ -78,20 +71,17 @@ def _test_image_prompt_replacements(
 
 @pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
 @pytest.mark.parametrize("num_imgs", [1, 2])
-def test_processor_prompt_replacements_regression(
-    processor_for_llava_next,
-    model_id: str,
-    num_imgs: int,
-):
+def test_processor_prompt_replacements_regression(model_id, num_imgs):
     ctx = build_model_context(
         model_name=model_id,
         tokenizer_name=model_id,
         mm_processor_kwargs=None,
         limit_mm_per_prompt={"image": num_imgs},
     )
-    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
-    ctx = InputProcessingContext(ctx.model_config, tokenizer)
-    processor = processor_for_llava_next(ctx)
+    processor = MULTIMODAL_REGISTRY.create_processor(
+        ctx.model_config,
+        tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer),
+    )
 
     image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328),
                     (488, 183), (2560, 1669)]
@@ -111,20 +101,17 @@ def test_processor_prompt_replacements_regression(
                   "Comment this out to run it manually.")
 @pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
 @pytest.mark.parametrize("num_imgs", [1])
-def test_processor_prompt_replacements_all(
-    processor_for_llava_next,
-    model_id: str,
-    num_imgs: int,
-):
+def test_processor_prompt_replacements_all(model_id, num_imgs):
     ctx = build_model_context(
         model_name=model_id,
         tokenizer_name=model_id,
         mm_processor_kwargs=None,
         limit_mm_per_prompt={"image": num_imgs},
     )
-    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
-    ctx = InputProcessingContext(ctx.model_config, tokenizer)
-    processor = processor_for_llava_next(ctx)
+    processor = MULTIMODAL_REGISTRY.create_processor(
+        ctx.model_config,
+        tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer),
+    )
 
     seen_aspect_ratios = set[float]()
     image_sizes = list[ImageSize]()
diff --git a/tests/models/decoder_only/vision_language/processing/test_llava_onevision.py b/tests/models/decoder_only/vision_language/processing/test_llava_onevision.py
index d4cdffa210b6d..a033354f0e9b8 100644
--- a/tests/models/decoder_only/vision_language/processing/test_llava_onevision.py
+++ b/tests/models/decoder_only/vision_language/processing/test_llava_onevision.py
@@ -4,24 +4,17 @@
 import pytest
 from PIL import Image
 from pqdm.threads import pqdm
-from transformers import AutoTokenizer
 
-from vllm.inputs import InputProcessingContext
+from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.parse import ImageSize
+from vllm.multimodal.processing import BaseMultiModalProcessor
+from vllm.multimodal.utils import cached_get_tokenizer
 
 from ....utils import build_model_context
 
 
-# Fixtures lazy import to avoid initializing CUDA during test collection
-@pytest.fixture()
-def processor_for_llava_onevision():
-    from vllm.model_executor.models.llava_onevision import (
-        LlavaOnevisionMultiModalProcessor)
-    return LlavaOnevisionMultiModalProcessor
-
-
 def _validate_image_prompt_replacements_one(
-    processor,
+    processor: BaseMultiModalProcessor,
     num_imgs: int,
     failed_size_excs: list[tuple[ImageSize, Exception]],
     image_size: ImageSize,
@@ -77,20 +70,17 @@ def _test_image_prompt_replacements(
 @pytest.mark.parametrize("model_id",
                          ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
 @pytest.mark.parametrize("num_imgs", [1, 2])
-def test_processor_prompt_replacements_regression(
-    processor_for_llava_onevision,
-    model_id: str,
-    num_imgs: int,
-):
+def test_processor_prompt_replacements_regression(model_id, num_imgs):
     ctx = build_model_context(
         model_name=model_id,
         tokenizer_name=model_id,
         mm_processor_kwargs=None,
         limit_mm_per_prompt={"image": num_imgs},
     )
-    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
-    ctx = InputProcessingContext(ctx.model_config, tokenizer)
-    processor = processor_for_llava_onevision(ctx)
+    processor = MULTIMODAL_REGISTRY.create_processor(
+        ctx.model_config,
+        tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer),
+    )
 
     image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328),
                     (488, 183), (2560, 1669)]
@@ -111,20 +101,17 @@ def test_processor_prompt_replacements_regression(
 @pytest.mark.parametrize("model_id",
                          ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
 @pytest.mark.parametrize("num_imgs", [1])
-def test_processor_prompt_replacements_all(
-    processor_for_llava_onevision,
-    model_id: str,
-    num_imgs: int,
-):
+def test_processor_prompt_replacements_all(model_id, num_imgs):
     ctx = build_model_context(
         model_name=model_id,
         tokenizer_name=model_id,
         mm_processor_kwargs=None,
         limit_mm_per_prompt={"image": num_imgs},
     )
-    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
-    ctx = InputProcessingContext(ctx.model_config, tokenizer)
-    processor = processor_for_llava_onevision(ctx)
+    processor = MULTIMODAL_REGISTRY.create_processor(
+        ctx.model_config,
+        tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer),
+    )
 
     seen_aspect_ratios = set[float]()
     image_sizes = list[ImageSize]()
diff --git a/tests/models/decoder_only/vision_language/processing/test_phi3v.py b/tests/models/decoder_only/vision_language/processing/test_phi3v.py
index 249045b3c04ce..c5b77260c6544 100644
--- a/tests/models/decoder_only/vision_language/processing/test_phi3v.py
+++ b/tests/models/decoder_only/vision_language/processing/test_phi3v.py
@@ -1,21 +1,13 @@
 """Tests for phi3v's multimodal preprocessing kwargs."""
 import pytest
-from transformers import AutoTokenizer
 
-from vllm.inputs import InputProcessingContext
-from vllm.model_executor.models.phi3v import _IMAGE_TOKEN_ID
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.utils import cached_get_tokenizer
 
 from .....conftest import _ImageAssets
 from ....utils import build_model_context
 
 
-# Wrap lazy imports to avoid initializing CUDA during test collection
-@pytest.fixture()
-def processor_for_phi3v():
-    from vllm.model_executor.models.phi3v import Phi3VMultiModalProcessor
-    return Phi3VMultiModalProcessor
-
-
 @pytest.mark.parametrize("model_id", ["microsoft/Phi-3.5-vision-instruct"])
 # yapf: disable
 @pytest.mark.parametrize(
@@ -29,7 +21,6 @@ def processor_for_phi3v():
 # yapf: enable
 @pytest.mark.parametrize("num_imgs", [1, 2])
 def test_processor_override(
-    processor_for_phi3v,
     image_assets: _ImageAssets,
     model_id: str,
     mm_processor_kwargs: dict[str, int],
@@ -37,21 +28,26 @@ def test_processor_override(
     num_imgs: int,
 ):
     """Ensure input_processor_for_phi3v handles num_crops properly."""
+    # Avoid initializing CUDA early
+    from vllm.model_executor.models.phi3v import _IMAGE_TOKEN_ID
+
     ctx = build_model_context(
         model_name=model_id,
         tokenizer_name=model_id,
         trust_remote_code=True,
         limit_mm_per_prompt={"image": num_imgs},
     )
-    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
-    ctx = InputProcessingContext(ctx.model_config, tokenizer)
+    tokenizer = cached_get_tokenizer(ctx.model_config.tokenizer)
+    processor = MULTIMODAL_REGISTRY.create_processor(
+        ctx.model_config,
+        tokenizer=tokenizer,
+    )
 
     # Build the image str / prompt based on the number of images we pass
     img_str = "".join([f"<|image_{idx}|>\n" for idx in range(1, num_imgs + 1)])
     prompt = f"<|user|>\n{img_str}<|end|>\n<|assistant|>\n"
     mm_data = {"image": [image_assets[0].pil_image] * num_imgs}
 
-    processor = processor_for_phi3v(ctx)
     processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
 
     # Ensure we have the right number of placeholders per num_crops size
diff --git a/tests/models/decoder_only/vision_language/processing/test_qwen2_vl.py b/tests/models/decoder_only/vision_language/processing/test_qwen2_vl.py
index b9ac887edf90f..0d54802f2b733 100644
--- a/tests/models/decoder_only/vision_language/processing/test_qwen2_vl.py
+++ b/tests/models/decoder_only/vision_language/processing/test_qwen2_vl.py
@@ -1,19 +1,12 @@
 import pytest
-from transformers import AutoTokenizer
 
-from vllm.inputs import InputProcessingContext
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.utils import cached_get_tokenizer
 
 from .....conftest import _ImageAssets
 from ....utils import build_model_context
 
 
-# Fixtures lazy import to avoid initializing CUDA during test collection
-@pytest.fixture()
-def processor_for_qwen2_vl():
-    from vllm.model_executor.models.qwen2_vl import Qwen2VLMultiModalProcessor
-    return Qwen2VLMultiModalProcessor
-
-
 @pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"])
 # yapf: disable
 @pytest.mark.parametrize(
@@ -24,7 +17,6 @@ def processor_for_qwen2_vl():
 # yapf: enable
 @pytest.mark.parametrize("num_imgs", [1, 2])
 def test_processor_override(
-    processor_for_qwen2_vl,
     image_assets: _ImageAssets,
     model_id: str,
     mm_processor_kwargs: dict[str, object],
@@ -39,18 +31,20 @@ def test_processor_override(
         mm_processor_kwargs=None,
         limit_mm_per_prompt={"image": num_imgs},
     )
-    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
-    ctx = InputProcessingContext(ctx.model_config, tokenizer)
+    tokenizer = cached_get_tokenizer(ctx.model_config.tokenizer)
+    processor = MULTIMODAL_REGISTRY.create_processor(
+        ctx.model_config,
+        tokenizer=tokenizer,
+    )
 
     # Build the image str / prompt based on the number of images we pass
     prompt = "<|vision_start|><|image_pad|><|vision_end|>" * num_imgs
     mm_data = {"image": [image_assets[0].pil_image] * num_imgs}
 
-    processor = processor_for_qwen2_vl(ctx)
     processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
 
     # Ensure we have the right number of placeholders per num_crops size
-    hf_processor = processor._get_hf_processor(**mm_processor_kwargs)
+    hf_processor = processor.info.get_hf_processor(**mm_processor_kwargs)
     image_token_id = tokenizer.convert_tokens_to_ids(hf_processor.image_token)
     img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
     pixel_shape = processed_inputs["mm_kwargs"]["pixel_values"].shape
diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py
index 75d878217b657..d98bd9736b65f 100644
--- a/tests/multimodal/test_processing.py
+++ b/tests/multimodal/test_processing.py
@@ -10,12 +10,17 @@
 from vllm.config import ModelConfig
 from vllm.inputs import InputProcessingContext
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.processing import (ProcessingCache, PromptReplacement,
-                                        _PlaceholderInfo, find_mm_placeholders,
+# yapf conflicts with isort for this block
+# yapf: disable
+from vllm.multimodal.processing import (PlaceholderInfo, ProcessingCache,
+                                        PromptReplacement,
+                                        find_mm_placeholders,
                                         find_text_matches, find_token_matches,
                                         iter_token_matches,
                                         replace_text_matches,
                                         replace_token_matches)
+# yapf: enable
+from vllm.multimodal.profiling import MultiModalProfiler
 from vllm.multimodal.utils import cached_get_tokenizer
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.utils import full_groupby
@@ -431,7 +436,7 @@ def test_find_replace_tokens(
             [1, 9833, 28747, 32000, 9833, 28747, 32000, 32000, 918],
             {
                 "pattern_1": [
-                    _PlaceholderInfo(
+                    PlaceholderInfo(
                         modality="pattern_1",
                         item_idx=0,
                         start_idx=6,
@@ -445,13 +450,13 @@ def test_find_replace_tokens(
             [1, 32000, 32000, 9833, 28747, 32000, 32000, 1550, 918, 1550],
             {
                 "pattern_1": [
-                    _PlaceholderInfo(
+                    PlaceholderInfo(
                         modality="pattern_1",
                         item_idx=0,
                         start_idx=1,
                         replacement=[32000, 32000],
                     ),
-                    _PlaceholderInfo(
+                    PlaceholderInfo(
                         modality="pattern_1",
                         item_idx=1,
                         start_idx=5,
@@ -459,7 +464,7 @@ def test_find_replace_tokens(
                     ),
                 ],
                 "pattern_3": [
-                    _PlaceholderInfo(
+                    PlaceholderInfo(
                         modality="pattern_3",
                         item_idx=0,
                         start_idx=7,
@@ -472,13 +477,13 @@ def test_find_replace_tokens(
             [1, 32000, 32000, 32000, 32000, 32000, 1550, 918, 1550],
             {
                 "pattern_1": [
-                    _PlaceholderInfo(
+                    PlaceholderInfo(
                         modality="pattern_1",
                         item_idx=0,
                         start_idx=1,
                         replacement=[32000, 32000],
                     ),
-                    _PlaceholderInfo(
+                    PlaceholderInfo(
                         modality="pattern_1",
                         item_idx=1,
                         start_idx=3,
@@ -486,7 +491,7 @@ def test_find_replace_tokens(
                     ),
                 ],
                 "pattern_3": [
-                    _PlaceholderInfo(
+                    PlaceholderInfo(
                         modality="pattern_3",
                         item_idx=0,
                         start_idx=6,
@@ -577,19 +582,15 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid):
         revision=None,
         limit_mm_per_prompt=limit_mm_per_prompt,
     )
-    model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
 
-    processor_factory = MULTIMODAL_REGISTRY._processor_factories[model_cls]
-    ctx = InputProcessingContext(
+    processor = MULTIMODAL_REGISTRY.create_processor(
         model_config,
         tokenizer=cached_get_tokenizer(model_config.tokenizer),
     )
-
-    processor = processor_factory(ctx, cache=None)
-    profiler = processor.profiling_info
+    profiler = MultiModalProfiler(processor)
 
     mock_supported_mm_limits = MagicMock(return_value={"image": num_supported})
-    profiler.get_supported_mm_limits = mock_supported_mm_limits
+    processor.info.get_supported_mm_limits = mock_supported_mm_limits
 
     if is_valid:
         exc_ctx = nullcontext()
@@ -597,7 +598,7 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid):
         exc_ctx = pytest.raises(ValueError, match="this model only supports")
 
     with exc_ctx:
-        profiler.get_mm_limits()
+        profiler.get_dummy_data(model_config.max_model_len)
 
 
 @pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
@@ -620,16 +621,12 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid):
         revision=None,
         limit_mm_per_prompt=limit_mm_per_prompt,
     )
-    model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
 
-    processor_factory = MULTIMODAL_REGISTRY._processor_factories[model_cls]
-    ctx = InputProcessingContext(
+    processor = MULTIMODAL_REGISTRY.create_processor(
         model_config,
         tokenizer=cached_get_tokenizer(model_config.tokenizer),
     )
 
-    processor = processor_factory(ctx, cache=None)
-
     rng = np.random.RandomState(0)
     image = _rand_img(rng, min_wh=128, max_wh=256)
     if num_images == 0:
@@ -681,9 +678,9 @@ def _test_processing_cache_correctness(
         hf_overrides=hf_overrides,
         limit_mm_per_prompt=limit_mm_per_prompt,
     )
-    model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
 
-    processor_factory = MULTIMODAL_REGISTRY._processor_factories[model_cls]
+    model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
+    factories = MULTIMODAL_REGISTRY._processor_factories[model_cls]
     ctx = InputProcessingContext(
         model_config,
         tokenizer=cached_get_tokenizer(model_config.tokenizer),
@@ -691,8 +688,9 @@ def _test_processing_cache_correctness(
     # Ensure that it can fit all of the data
     cache = ProcessingCache(capacity=1 << 30)
 
-    baseline_processor = processor_factory(ctx, cache=None)
-    cached_processor = processor_factory(ctx, cache=cache)
+    baseline_processor = factories.build_processor(ctx, cache=None)
+    cached_processor = factories.build_processor(ctx, cache=cache)
+    dummy_inputs = baseline_processor.dummy_inputs
 
     rng = np.random.RandomState(0)
 
@@ -724,7 +722,7 @@ def _test_processing_cache_correctness(
         }
 
         mm_counts = {k: len(vs) for k, vs in mm_data.items()}
-        prompt = baseline_processor.profiling_info.get_dummy_processor_inputs(
+        prompt = dummy_inputs.get_dummy_processor_inputs(
             model_config.max_model_len,
             mm_counts,
         ).prompt_text
diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py
index 06dfebbb95527..ac64edfd4ec9d 100644
--- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py
+++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py
@@ -2,13 +2,17 @@
 
 import torch
 
-from vllm.model_executor.models.llava import (LlavaForConditionalGeneration,
-                                              LlavaMultiModalProcessor)
+from vllm.model_executor.models.llava import (LlavaDummyInputsBuilder,
+                                              LlavaForConditionalGeneration,
+                                              LlavaMultiModalProcessor,
+                                              LlavaProcessingInfo)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 
 
-@MULTIMODAL_REGISTRY.register_processor(LlavaMultiModalProcessor)
+@MULTIMODAL_REGISTRY.register_processor(LlavaMultiModalProcessor,
+                                        info=LlavaProcessingInfo,
+                                        dummy_inputs=LlavaDummyInputsBuilder)
 class MyLlava(LlavaForConditionalGeneration):
 
     def compute_logits(
diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py
index b362ee0cac328..6ddc1eb76f10d 100644
--- a/vllm/inputs/preprocess.py
+++ b/vllm/inputs/preprocess.py
@@ -7,7 +7,7 @@
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
-from vllm.multimodal.processing import MultiModalDataDict, MultiModalInputsV2
+from vllm.multimodal.inputs import MultiModalDataDict, MultiModalInputsV2
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup
 from vllm.utils import print_info_once, print_warning_once
diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py
index 2d9d024e03e80..b22b3f1594f24 100644
--- a/vllm/inputs/registry.py
+++ b/vllm/inputs/registry.py
@@ -323,6 +323,7 @@ def dummy_data_for_profiling(
         # Avoid circular import
         from vllm.model_executor.model_loader import get_model_architecture
         from vllm.multimodal import MultiModalKwargs
+        from vllm.multimodal.profiling import MultiModalProfiler
         from vllm.multimodal.utils import cached_get_tokenizer
 
         if mm_registry.has_processor(model_config):
@@ -331,7 +332,8 @@ def dummy_data_for_profiling(
                 trust_remote_code=model_config.trust_remote_code,
             )
             processor = mm_registry.create_processor(model_config, tokenizer)
-            dummy_data = processor.get_dummy_data(seq_len)
+            profiler = MultiModalProfiler(processor)
+            dummy_data = profiler.get_dummy_data(seq_len)
         else:
             model_cls, _ = get_model_architecture(model_config)
             if is_encoder_data:
diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py
index 2e649f10c0765..089062ab53fc3 100644
--- a/vllm/model_executor/models/aria.py
+++ b/vllm/model_executor/models/aria.py
@@ -23,10 +23,10 @@
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
                                     NestedTensors)
+from vllm.multimodal.parse import MultiModalDataItems
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        MultiModalDataItems, ProcessingMixin,
-                                        PromptReplacement)
-from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs
+                                        BaseProcessingInfo, PromptReplacement)
+from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.configs.aria import (AriaMoELMConfig,
                                                   AriaVisionConfig)
@@ -445,33 +445,33 @@ def build_mm_projector(config: PretrainedConfig):
     )
 
 
-class AriaProcessingMixin(ProcessingMixin):
+class AriaProcessingInfo(BaseProcessingInfo):
 
-    def _get_hf_config(self):
+    def get_hf_config(self):
         return self.ctx.get_hf_config()
 
-    def _get_vision_config(self) -> AriaVisionConfig:
-        return self._get_hf_config().vision_config
-
-    def _get_num_image_tokens(self) -> int:
-        hf_config = self._get_hf_config()
-        return max(hf_config.projector_patch_to_query_dict.values())
-
-
-class AriaProfilingInfo(AriaProcessingMixin, BaseProfilingInfo):
+    def get_vision_config(self) -> AriaVisionConfig:
+        return self.get_hf_config().vision_config
 
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": None}
 
     def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
-        return {"image": self._get_num_image_tokens()}
+        return {"image": self.get_num_image_tokens()}
+
+    def get_num_image_tokens(self) -> int:
+        hf_config = self.get_hf_config()
+        return max(hf_config.projector_patch_to_query_dict.values())
+
+
+class AriaDummyInputsBuilder(BaseDummyInputsBuilder[AriaProcessingInfo]):
 
     def get_dummy_processor_inputs(
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
     ) -> ProcessorInputs:
-        vision_config = self._get_vision_config()
+        vision_config = self.info.get_vision_config()
 
         max_image_size = vision_config.image_size
         num_images = mm_counts.get("image", 0)
@@ -483,7 +483,7 @@ def get_dummy_processor_inputs(
                                    num_images=num_images)
         }
 
-        hf_processor = self._get_hf_processor()
+        hf_processor = self.info.get_hf_processor()
         image_token: str = hf_processor.image_token  # type: ignore
 
         return ProcessorInputs(
@@ -492,10 +492,7 @@ def get_dummy_processor_inputs(
         )
 
 
-class AriaMultiModalProcessor(AriaProcessingMixin, BaseMultiModalProcessor):
-
-    def _get_profiling_info(self) -> BaseProfilingInfo:
-        return AriaProfilingInfo(self.ctx)
+class AriaMultiModalProcessor(BaseMultiModalProcessor[AriaProcessingInfo]):
 
     def _get_mm_fields_config(
         self,
@@ -513,10 +510,10 @@ def _get_prompt_replacements(
         hf_processor_mm_kwargs: Mapping[str, object],
         out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
-        hf_config = self._get_hf_config()
+        hf_config = self.info.get_hf_config()
         image_token_id = hf_config.image_token_index
 
-        num_image_tokens = self._get_num_image_tokens()
+        num_image_tokens = self.info.get_num_image_tokens()
 
         return [
             PromptReplacement(
@@ -527,7 +524,9 @@ def _get_prompt_replacements(
         ]
 
 
-@MULTIMODAL_REGISTRY.register_processor(AriaMultiModalProcessor)
+@MULTIMODAL_REGISTRY.register_processor(AriaMultiModalProcessor,
+                                        info=AriaProcessingInfo,
+                                        dummy_inputs=AriaDummyInputsBuilder)
 class AriaForConditionalGeneration(nn.Module, SupportsMultiModal):
     """
     Aria model for conditional generation tasks.
diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py
index fd45783f167b4..7dfc0b687c6e3 100644
--- a/vllm/model_executor/models/blip2.py
+++ b/vllm/model_executor/models/blip2.py
@@ -17,10 +17,10 @@
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
                                     MultiModalInputsV2, MultiModalKwargs,
                                     NestedTensors, PlaceholderRange)
+from vllm.multimodal.parse import MultiModalDataItems
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        MultiModalDataItems, ProcessingMixin,
-                                        PromptReplacement)
-from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs
+                                        BaseProcessingInfo, PromptReplacement)
+from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
 from vllm.sequence import IntermediateTensors
 
 from .blip import BlipVisionModel
@@ -397,30 +397,30 @@ def forward(
         return sequence_output
 
 
-class Blip2ProcessingMixin(ProcessingMixin):
+class Blip2ProcessingInfo(BaseProcessingInfo):
 
-    def _get_hf_config(self):
+    def get_hf_config(self):
         return self.ctx.get_hf_config(Blip2Config)
 
-    def _get_num_image_tokens(self) -> int:
-        hf_config = self._get_hf_config()
-        return hf_config.num_query_tokens
-
-
-class Blip2ProfilingInfo(Blip2ProcessingMixin, BaseProfilingInfo):
-
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": 1}
 
     def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
-        return {"image": self._get_num_image_tokens()}
+        return {"image": self.get_num_image_tokens()}
+
+    def get_num_image_tokens(self) -> int:
+        hf_config = self.get_hf_config()
+        return hf_config.num_query_tokens
+
+
+class Blip2DummyInputsBuilder(BaseDummyInputsBuilder[Blip2ProcessingInfo]):
 
     def get_dummy_processor_inputs(
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
     ) -> ProcessorInputs:
-        hf_config = self._get_hf_config()
+        hf_config = self.info.get_hf_config()
         vision_config = hf_config.vision_config
 
         max_image_size = vision_config.image_size
@@ -439,10 +439,7 @@ def get_dummy_processor_inputs(
         )
 
 
-class Blip2MultiModalProcessor(Blip2ProcessingMixin, BaseMultiModalProcessor):
-
-    def _get_profiling_info(self) -> BaseProfilingInfo:
-        return Blip2ProfilingInfo(self.ctx)
+class Blip2MultiModalProcessor(BaseMultiModalProcessor[Blip2ProcessingInfo]):
 
     def _get_mm_fields_config(
         self,
@@ -460,7 +457,7 @@ def _get_prompt_replacements(
         hf_processor_mm_kwargs: Mapping[str, object],
         out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
-        num_image_tokens = self._get_num_image_tokens()
+        num_image_tokens = self.info.get_num_image_tokens()
 
         return [
             PromptReplacement(
@@ -491,7 +488,9 @@ def apply(
         return result
 
 
-@MULTIMODAL_REGISTRY.register_processor(Blip2MultiModalProcessor)
+@MULTIMODAL_REGISTRY.register_processor(Blip2MultiModalProcessor,
+                                        info=Blip2ProcessingInfo,
+                                        dummy_inputs=Blip2DummyInputsBuilder)
 class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py
index 73ed73b61ebf9..acff926891bbe 100644
--- a/vllm/model_executor/models/chameleon.py
+++ b/vllm/model_executor/models/chameleon.py
@@ -30,10 +30,10 @@
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
                                     MultiModalInputsV2, MultiModalKwargs,
                                     NestedTensors, PlaceholderRange)
+from vllm.multimodal.parse import MultiModalDataItems
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        MultiModalDataItems, ProcessingMixin,
-                                        PromptReplacement)
-from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs
+                                        BaseProcessingInfo, PromptReplacement)
+from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
 from vllm.sequence import IntermediateTensors
 from vllm.utils import print_warning_once
 
@@ -49,33 +49,34 @@ class ChameleonImagePixelInputs(TypedDict):
     """Shape: `(batch_size * num_images, num_channels, height, width)`"""
 
 
-class ChameleonProcessingMixin(ProcessingMixin):
+class ChameleonProcessingInfo(BaseProcessingInfo):
 
-    def _get_hf_config(self):
+    def get_hf_config(self):
         return self.ctx.get_hf_config(ChameleonConfig)
 
-    def _get_hf_processor(self):
+    def get_hf_processor(self):
         return self.ctx.get_hf_processor(ChameleonProcessor)
 
-    def _get_num_image_tokens(self) -> int:
-        processor = self._get_hf_processor()
-        return processor.image_seq_length
-
-
-class ChameleonProfilingInfo(ChameleonProcessingMixin, BaseProfilingInfo):
-
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": 1}
 
     def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
-        return {"image": self._get_num_image_tokens()}
+        return {"image": self.get_num_image_tokens()}
+
+    def get_num_image_tokens(self) -> int:
+        processor = self.get_hf_processor()
+        return processor.image_seq_length
+
+
+class ChameleonDummyInputsBuilder(
+        BaseDummyInputsBuilder[ChameleonProcessingInfo]):
 
     def get_dummy_processor_inputs(
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
     ) -> ProcessorInputs:
-        config = self._get_hf_config()
+        config = self.info.get_hf_config()
 
         width = height = config.vq_config.resolution
         num_images = mm_counts.get("image", 0)
@@ -93,11 +94,8 @@ def get_dummy_processor_inputs(
         )
 
 
-class ChameleonMultiModalProcessor(ChameleonProcessingMixin,
-                                   BaseMultiModalProcessor):
-
-    def _get_profiling_info(self) -> BaseProfilingInfo:
-        return ChameleonProfilingInfo(self.ctx)
+class ChameleonMultiModalProcessor(
+        BaseMultiModalProcessor[ChameleonProcessingInfo]):
 
     def _get_mm_fields_config(
         self,
@@ -112,7 +110,7 @@ def _get_prompt_replacements(
         hf_processor_mm_kwargs: Mapping[str, object],
         out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
-        processor = self._get_hf_processor(**hf_processor_mm_kwargs)
+        processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
 
         return [
             PromptReplacement(
@@ -120,7 +118,7 @@ def _get_prompt_replacements(
                 target="<image>",
                 replacement="".join([
                     processor.image_start_token,
-                    processor.image_token * self._get_num_image_tokens(),
+                    processor.image_token * self.info.get_num_image_tokens(),
                     processor.image_end_token,
                 ]),
             )
@@ -916,7 +914,10 @@ def forward(
         return hidden_states
 
 
-@MULTIMODAL_REGISTRY.register_processor(ChameleonMultiModalProcessor)
+@MULTIMODAL_REGISTRY.register_processor(
+    ChameleonMultiModalProcessor,
+    info=ChameleonProcessingInfo,
+    dummy_inputs=ChameleonDummyInputsBuilder)
 class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal,
                                         SupportsPP):
 
diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py
index c937fcb0978b9..59af5f0b3ae98 100644
--- a/vllm/model_executor/models/fuyu.py
+++ b/vllm/model_executor/models/fuyu.py
@@ -33,11 +33,11 @@
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
                                     MultiModalInputsV2, MultiModalKwargs,
                                     NestedTensors, PlaceholderRange)
-from vllm.multimodal.parse import ImageProcessorItems, ImageSize
+from vllm.multimodal.parse import (ImageProcessorItems, ImageSize,
+                                   MultiModalDataItems)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        MultiModalDataItems, ProcessingMixin,
-                                        PromptReplacement)
-from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs
+                                        BaseProcessingInfo, PromptReplacement)
+from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsMultiModal, SupportsPP
@@ -64,24 +64,38 @@ class FuyuImagePatchInputs(TypedDict):
     """
 
 
-class FuyuProcessingMixin(ProcessingMixin):
+class FuyuProcessingInfo(BaseProcessingInfo):
 
-    def _get_hf_config(self):
+    def get_hf_config(self):
         return self.ctx.get_hf_config(FuyuConfig)
 
-    def _get_hf_processor(self):
+    def get_hf_processor(self):
         return self.ctx.get_hf_processor(FuyuProcessor)
 
-    def _get_image_processor(self) -> FuyuImageProcessor:
-        return self._get_hf_processor().image_processor
+    def get_image_processor(self) -> FuyuImageProcessor:
+        return self.get_hf_processor().image_processor
 
-    def _get_image_feature_grid_size(
+    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+        return {"image": 1}
+
+    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
+        target_width, target_height = self.get_image_size_with_most_features()
+
+        max_ncols, max_nrows = self.get_image_feature_grid_size(
+            image_width=target_width,
+            image_height=target_height,
+        )
+        max_image_tokens = (max_ncols + 1) * max_nrows
+
+        return {"image": max_image_tokens}
+
+    def get_image_feature_grid_size(
         self,
         *,
         image_width: int,
         image_height: int,
     ) -> tuple[int, int]:
-        image_processor = self._get_image_processor()
+        image_processor = self.get_image_processor()
         target_width = image_processor.size["width"]
         target_height = image_processor.size["height"]
 
@@ -97,34 +111,21 @@ def _get_image_feature_grid_size(
         nrows = math.ceil(image_height / 30)
         return ncols, nrows
 
-
-class FuyuProfilingInfo(FuyuProcessingMixin, BaseProfilingInfo):
-
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
-        return {"image": 1}
-
-    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
-        target_width, target_height = self._get_image_size_with_most_features()
-
-        max_ncols, max_nrows = self._get_image_feature_grid_size(
-            image_width=target_width,
-            image_height=target_height,
-        )
-        max_image_tokens = (max_ncols + 1) * max_nrows
-
-        return {"image": max_image_tokens}
-
-    def _get_image_size_with_most_features(self) -> ImageSize:
-        image_processor = self._get_image_processor()
+    def get_image_size_with_most_features(self) -> ImageSize:
+        image_processor = self.get_image_processor()
         return ImageSize(width=image_processor.size["width"],
                          height=image_processor.size["height"])
 
+
+class FuyuDummyInputsBuilder(BaseDummyInputsBuilder[FuyuProcessingInfo]):
+
     def get_dummy_processor_inputs(
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
     ) -> ProcessorInputs:
-        target_width, target_height = self._get_image_size_with_most_features()
+        target_width, target_height = \
+            self.info.get_image_size_with_most_features()
         num_images = mm_counts.get("image", 0)
 
         mm_data = {
@@ -140,10 +141,7 @@ def get_dummy_processor_inputs(
         )
 
 
-class FuyuMultiModalProcessor(FuyuProcessingMixin, BaseMultiModalProcessor):
-
-    def _get_profiling_info(self) -> BaseProfilingInfo:
-        return FuyuProfilingInfo(self.ctx)
+class FuyuMultiModalProcessor(BaseMultiModalProcessor[FuyuProcessingInfo]):
 
     def _call_hf_processor(
         self,
@@ -156,7 +154,7 @@ def _call_hf_processor(
             # Avoid warning from HF logger for text-only input
             # Input_ids format: bos_token_id + prompt_token_ids + boa_token_id
             # Tokenizer won't add boa_token_id by default, we add it manually.
-            tokenizer = self._get_tokenizer()
+            tokenizer = self.info.get_tokenizer()
             boa_token_id: int = tokenizer.vocab["<0x04>"]  # type: ignore
             prompt_ids = tokenizer.encode(prompt) + [boa_token_id]
             return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt")
@@ -196,10 +194,10 @@ def _get_prompt_replacements(
         hf_processor_mm_kwargs: Mapping[str, object],
         out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
-        hf_config = self._get_hf_config()
+        hf_config = self.info.get_hf_config()
         bos_token_id = hf_config.bos_token_id
 
-        tokenizer = self._get_tokenizer()
+        tokenizer = self.info.get_tokenizer()
         eot_token_id = tokenizer.bos_token_id
         assert isinstance(eot_token_id, int)
 
@@ -207,7 +205,7 @@ def get_replacement_fuyu(item_idx: int):
             images = mm_items.get_items("image", ImageProcessorItems)
             image_size = images.get_image_size(item_idx)
 
-            ncols, nrows = self._get_image_feature_grid_size(
+            ncols, nrows = self.info.get_image_feature_grid_size(
                 image_width=image_size.width,
                 image_height=image_size.height,
             )
@@ -244,7 +242,9 @@ def apply(
         return result
 
 
-@MULTIMODAL_REGISTRY.register_processor(FuyuMultiModalProcessor)
+@MULTIMODAL_REGISTRY.register_processor(FuyuMultiModalProcessor,
+                                        info=FuyuProcessingInfo,
+                                        dummy_inputs=FuyuDummyInputsBuilder)
 class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index 305f1364dba23..8d94acf3b21d5 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -1,7 +1,7 @@
-from abc import ABC, abstractmethod
+from abc import abstractmethod
 from functools import cached_property
 from typing import (Final, Iterable, List, Literal, Mapping, Optional,
-                    Protocol, Set, Tuple, TypedDict, Union)
+                    Protocol, Set, Tuple, TypedDict, TypeVar, Union)
 
 import torch
 import torch.nn as nn
@@ -25,11 +25,11 @@
                                     MultiModalInputsV2, MultiModalKwargs,
                                     NestedTensors)
 from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
-                                   ImageSize)
+                                   ImageSize, MultiModalDataItems)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        MultiModalDataItems, ProcessingCache,
-                                        ProcessingMixin, PromptReplacement)
-from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs
+                                        BaseProcessingInfo, ProcessingCache,
+                                        PromptReplacement)
+from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
 from vllm.sequence import IntermediateTensors
 
 from .clip import CLIPVisionModel
@@ -105,34 +105,23 @@ class LlavaLikeProcessor(Protocol):
     image_token: Final[str]
 
 
-class BaseLlavaProcessingMixin(ProcessingMixin, ABC):
+class BaseLlavaProcessingInfo(BaseProcessingInfo):
 
-    def _get_hf_config(self) -> LlavaLikeConfig:
+    def get_hf_config(self) -> LlavaLikeConfig:
         return self.ctx.get_hf_config(LlavaConfig)
 
-    def _get_vision_encoder_info(self):
-        return get_vision_encoder_info(self._get_hf_config())
+    def get_vision_encoder_info(self):
+        return get_vision_encoder_info(self.get_hf_config())
 
     @abstractmethod
-    def _get_hf_processor(self) -> LlavaLikeProcessor:
+    def get_hf_processor(self) -> LlavaLikeProcessor:
         raise NotImplementedError
 
-    def _get_num_image_tokens(
-        self,
-        *,
-        image_width: int,
-        image_height: int,
-    ) -> int:
-        hf_config = self._get_hf_config()
-        vision_encoder_info = self._get_vision_encoder_info()
+    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+        return {"image": None}
 
-        return self._apply_feature_select_strategy(
-            hf_config.vision_feature_select_strategy,
-            vision_encoder_info.get_num_image_tokens(
-                image_width=image_width,
-                image_height=image_height,
-            ),
-        )
+    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
+        return {"image": self.get_max_image_tokens()}
 
     def _apply_feature_select_strategy(
         self,
@@ -147,28 +136,42 @@ def _apply_feature_select_strategy(
         msg = f"Unexpected feature select strategy: {strategy!r}"
         raise NotImplementedError(msg)
 
+    def get_num_image_tokens(
+        self,
+        *,
+        image_width: int,
+        image_height: int,
+    ) -> int:
+        hf_config = self.get_hf_config()
+        vision_encoder_info = self.get_vision_encoder_info()
 
-class BaseLlavaProfilingInfo(BaseLlavaProcessingMixin, BaseProfilingInfo):
-
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
-        return {"image": None}
-
-    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
-        return {"image": self._get_max_image_tokens()}
+        return self._apply_feature_select_strategy(
+            hf_config.vision_feature_select_strategy,
+            vision_encoder_info.get_num_image_tokens(
+                image_width=image_width,
+                image_height=image_height,
+            ),
+        )
 
-    def _get_image_size_with_most_features(self) -> ImageSize:
-        vision_encoder_info = self._get_vision_encoder_info()
+    def get_image_size_with_most_features(self) -> ImageSize:
+        vision_encoder_info = self.get_vision_encoder_info()
         width = height = vision_encoder_info.get_image_size()
         return ImageSize(width=width, height=height)
 
-    def _get_max_image_tokens(self) -> int:
-        target_width, target_height = self._get_image_size_with_most_features()
+    def get_max_image_tokens(self) -> int:
+        target_width, target_height = self.get_image_size_with_most_features()
 
-        return self._get_num_image_tokens(
+        return self.get_num_image_tokens(
             image_width=target_width,
             image_height=target_height,
         )
 
+
+_I = TypeVar("_I", bound=BaseLlavaProcessingInfo)
+
+
+class LlavaDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
+
     def get_dummy_processor_inputs(
         self,
         seq_len: int,
@@ -176,9 +179,10 @@ def get_dummy_processor_inputs(
     ) -> ProcessorInputs:
         num_images = mm_counts.get("image", 0)
 
-        processor = self._get_hf_processor()
+        processor = self.info.get_hf_processor()
         image_token = processor.image_token
-        target_width, target_height = self._get_image_size_with_most_features()
+        target_width, target_height = \
+            self.info.get_image_size_with_most_features()
 
         mm_data = {
             "image":
@@ -193,23 +197,13 @@ def get_dummy_processor_inputs(
         )
 
 
-class LlavaProcessingMixin(BaseLlavaProcessingMixin):
+class LlavaProcessingInfo(BaseLlavaProcessingInfo):
 
-    def _get_hf_processor(self):
+    def get_hf_processor(self):
         return self.ctx.get_hf_processor(LlavaProcessor)
 
 
-class LlavaProfilingInfo(LlavaProcessingMixin, BaseLlavaProfilingInfo):
-    pass
-
-
-class BaseLlavaMultiModalProcessor(LlavaProcessingMixin,
-                                   BaseMultiModalProcessor):
-
-    # Copied from BaseMultiModalProcessor
-    @abstractmethod
-    def _get_profiling_info(self) -> BaseProfilingInfo:
-        raise NotImplementedError
+class BaseLlavaMultiModalProcessor(BaseMultiModalProcessor[_I]):
 
     # Copied from BaseMultiModalProcessor
     @abstractmethod
@@ -226,7 +220,7 @@ def _get_prompt_replacements(
         hf_processor_mm_kwargs: Mapping[str, object],
         out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
-        hf_config = self._get_hf_config()
+        hf_config = self.info.get_hf_config()
         image_token_id = hf_config.image_token_index
 
         def get_replacement(item_idx: int):
@@ -237,7 +231,7 @@ def get_replacement(item_idx: int):
                 num_image_tokens = images.get_feature_size(item_idx)
             else:
                 image_size = images.get_image_size(item_idx)
-                num_image_tokens = self._get_num_image_tokens(
+                num_image_tokens = self.info.get_num_image_tokens(
                     image_width=image_size.width,
                     image_height=image_size.height,
                 )
@@ -253,10 +247,8 @@ def get_replacement(item_idx: int):
         ]
 
 
-class LlavaMultiModalProcessor(BaseLlavaMultiModalProcessor):
-
-    def _get_profiling_info(self) -> BaseProfilingInfo:
-        return LlavaProfilingInfo(self.ctx)
+class LlavaMultiModalProcessor(
+        BaseLlavaMultiModalProcessor[LlavaProcessingInfo]):
 
     def _get_mm_fields_config(
         self,
@@ -269,21 +261,14 @@ def _get_mm_fields_config(
         )
 
 
-class PixtralHFProcessingMixin(BaseLlavaProcessingMixin):
+class PixtralHFProcessingInfo(BaseLlavaProcessingInfo):
 
-    def _get_hf_processor(self):
+    def get_hf_processor(self):
         return self.ctx.get_hf_processor(PixtralProcessor)
 
 
-class PixtralHFProfilingInfo(PixtralHFProcessingMixin, BaseLlavaProfilingInfo):
-    pass
-
-
-class PixtralHFMultiModalProcessor(PixtralHFProcessingMixin,
-                                   BaseMultiModalProcessor):
-
-    def _get_profiling_info(self) -> BaseProfilingInfo:
-        return PixtralHFProfilingInfo(self.ctx)
+class PixtralHFMultiModalProcessor(
+        BaseMultiModalProcessor[PixtralHFProcessingInfo]):
 
     def _call_hf_processor(
         self,
@@ -328,10 +313,10 @@ def _get_prompt_replacements(
         hf_processor_mm_kwargs: Mapping[str, object],
         out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
-        hf_config = self._get_hf_config()
+        hf_config = self.info.get_hf_config()
         image_token_id = hf_config.image_token_index
 
-        processor = self._get_hf_processor()
+        processor = self.info.get_hf_processor()
         image_token = processor.image_token
         image_break_token = processor.image_break_token
         image_end_token = processor.image_end_token
@@ -363,26 +348,40 @@ def get_replacement(item_idx: int):
         ]
 
 
+def _build_llava_or_pixtral_hf_info(
+    ctx: InputProcessingContext, ) -> BaseLlavaProcessingInfo:
+    hf_config = ctx.get_hf_config(LlavaConfig)
+
+    if isinstance(hf_config.vision_config, PixtralVisionConfig):
+        return PixtralHFProcessingInfo(ctx)
+
+    return LlavaProcessingInfo(ctx)
+
+
 def _build_llava_or_pixtral_hf_processor(
-    ctx: InputProcessingContext,
+    info: _I,
+    dummy_inputs: BaseDummyInputsBuilder[_I],
     *,
     cache: Optional[ProcessingCache] = None,
     enable_sanity_checks: bool = True,
 ) -> BaseMultiModalProcessor:
-    hf_config = ctx.get_hf_config(LlavaConfig)
-
-    if isinstance(hf_config.vision_config, PixtralVisionConfig):
+    if isinstance(info, PixtralHFProcessingInfo):
         return PixtralHFMultiModalProcessor(
-            ctx,
+            info,
+            dummy_inputs,  # type: ignore
+            cache=cache,
+            enable_sanity_checks=enable_sanity_checks,
+        )
+
+    if isinstance(info, LlavaProcessingInfo):
+        return LlavaMultiModalProcessor(
+            info,
+            dummy_inputs,  # type: ignore
             cache=cache,
             enable_sanity_checks=enable_sanity_checks,
         )
 
-    return LlavaMultiModalProcessor(
-        ctx,
-        cache=cache,
-        enable_sanity_checks=enable_sanity_checks,
-    )
+    raise NotImplementedError(type(info))
 
 
 def _get_num_hidden_layers(hf_config: LlavaLikeConfig) -> int:
@@ -460,7 +459,9 @@ def init_vision_tower_for_llava(
     raise NotImplementedError(msg)
 
 
-@MULTIMODAL_REGISTRY.register_processor(_build_llava_or_pixtral_hf_processor)
+@MULTIMODAL_REGISTRY.register_processor(_build_llava_or_pixtral_hf_processor,
+                                        info=_build_llava_or_pixtral_hf_info,
+                                        dummy_inputs=LlavaDummyInputsBuilder)
 class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
     # BitandBytes specific attributes
     bitsandbytes_stacked_params_mapping = {
@@ -727,11 +728,11 @@ def apply(
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
     ) -> MultiModalInputsV2:
-        hf_config = self._get_hf_config()
+        hf_config = self.info.get_hf_config()
         image_token_id = hf_config.image_token_index
 
         # Assume that it doesn't depend on the image size
-        num_image_tokens = self._get_num_image_tokens(
+        num_image_tokens = self.info.get_num_image_tokens(
             image_width=-1,
             image_height=-1,
         )
@@ -796,6 +797,8 @@ def get_replacement_mantis(item_idx: int):
 
 # To use this model, please use
 # `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'`
-@MULTIMODAL_REGISTRY.register_processor(MantisMultiModalProcessor)
+@MULTIMODAL_REGISTRY.register_processor(MantisMultiModalProcessor,
+                                        info=LlavaProcessingInfo,
+                                        dummy_inputs=LlavaDummyInputsBuilder)
 class MantisForConditionalGeneration(LlavaForConditionalGeneration):
     pass
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index 815456dac2a2f..fda4f22d366b1 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -1,6 +1,7 @@
+from abc import abstractmethod
 from functools import cached_property
 from typing import (Final, Iterable, List, Literal, Mapping, Optional,
-                    Protocol, Set, Tuple, TypedDict, Union)
+                    Protocol, Set, Tuple, TypedDict, TypeVar, Union)
 
 import torch
 import torch.nn as nn
@@ -16,13 +17,12 @@
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import MultiModalFieldConfig, NestedTensors
 from vllm.multimodal.parse import ImageSize
-from vllm.multimodal.profiling import BaseProfilingInfo
 from vllm.sequence import IntermediateTensors
 
 from .clip import CLIPVisionModel
 from .interfaces import SupportsMultiModal, SupportsPP
-from .llava import (BaseLlavaMultiModalProcessor, BaseLlavaProcessingMixin,
-                    BaseLlavaProfilingInfo, LlavaLikeConfig,
+from .llava import (BaseLlavaMultiModalProcessor, BaseLlavaProcessingInfo,
+                    LlavaDummyInputsBuilder, LlavaLikeConfig,
                     LlavaMultiModalProjector, init_vision_tower_for_llava)
 from .siglip import SiglipVisionModel
 from .utils import (AutoWeightsLoader, embed_multimodal, flatten_bn,
@@ -65,23 +65,23 @@ class LlavaNextLikeConfig(LlavaLikeConfig, Protocol):
     image_grid_pinpoints: Final[list[list[int]]]
 
 
-class LlavaNextProcessingMixin(BaseLlavaProcessingMixin):
+class LlavaNextProcessingInfo(BaseLlavaProcessingInfo):
 
-    def _get_hf_config(self) -> LlavaNextLikeConfig:
+    def get_hf_config(self) -> LlavaNextLikeConfig:
         return self.ctx.get_hf_config(LlavaNextConfig)
 
-    def _get_hf_processor(self):
+    def get_hf_processor(self):
         return self.ctx.get_hf_processor(LlavaNextProcessor)
 
     # Based on: https://github.com/huggingface/text-generation-inference/blob/v3.0.1/server/text_generation_server/models/vlm_causal_lm.py#L113
-    def _get_num_image_tokens(
+    def get_num_image_tokens(
         self,
         *,
         image_width: int,
         image_height: int,
     ) -> int:
-        hf_config = self._get_hf_config()
-        vision_encoder_info = self._get_vision_encoder_info()
+        hf_config = self.get_hf_config()
+        vision_encoder_info = self.get_vision_encoder_info()
 
         base_feature_size = self._apply_feature_select_strategy(
             hf_config.vision_feature_select_strategy,
@@ -140,16 +140,13 @@ def _get_num_unpadded_features(
 
         return (unpadded_features, newline_features)
 
-
-class LlavaNextProfilingInfo(LlavaNextProcessingMixin, BaseLlavaProfilingInfo):
-
-    def _get_image_size_with_most_features(self) -> ImageSize:
-        hf_config = self._get_hf_config()
+    def get_image_size_with_most_features(self) -> ImageSize:
+        hf_config = self.get_hf_config()
 
         largest_feature_size, largest_feature_pinpoint = 0, None
         for (height, width) in hf_config.image_grid_pinpoints:
-            feat_size = self._get_num_image_tokens(image_width=width,
-                                                   image_height=height)
+            feat_size = self.get_num_image_tokens(image_width=width,
+                                                  image_height=height)
             if feat_size > largest_feature_size:
                 largest_feature_size = feat_size
                 largest_feature_pinpoint = ImageSize(width=width,
@@ -161,11 +158,23 @@ def _get_image_size_with_most_features(self) -> ImageSize:
         return largest_feature_pinpoint
 
 
-class LlavaNextMultiModalProcessor(LlavaNextProcessingMixin,
-                                   BaseLlavaMultiModalProcessor):
+_I = TypeVar("_I", bound=LlavaNextProcessingInfo)
+
+
+class BaseLlavaNextMultiModalProcessor(BaseLlavaMultiModalProcessor[_I]):
+
+    # Copied from BaseMultiModalProcessor
+    @abstractmethod
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        raise NotImplementedError
+
 
-    def _get_profiling_info(self) -> BaseProfilingInfo:
-        return LlavaNextProfilingInfo(self.ctx)
+class LlavaNextMultiModalProcessor(
+        BaseLlavaNextMultiModalProcessor[LlavaNextProcessingInfo]):
 
     def _get_mm_fields_config(
         self,
@@ -179,7 +188,9 @@ def _get_mm_fields_config(
         )
 
 
-@MULTIMODAL_REGISTRY.register_processor(LlavaNextMultiModalProcessor)
+@MULTIMODAL_REGISTRY.register_processor(LlavaNextMultiModalProcessor,
+                                        info=LlavaNextProcessingInfo,
+                                        dummy_inputs=LlavaDummyInputsBuilder)
 class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal,
                                         SupportsPP):
 
diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py
index 6e82cee1c95a4..5be85d7c0f033 100644
--- a/vllm/model_executor/models/llava_next_video.py
+++ b/vllm/model_executor/models/llava_next_video.py
@@ -17,12 +17,11 @@
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
                                     NestedTensors)
-from vllm.multimodal.parse import (ImageSize, VideoEmbeddingItems,
-                                   VideoProcessorItems)
+from vllm.multimodal.parse import (ImageSize, MultiModalDataItems,
+                                   VideoEmbeddingItems, VideoProcessorItems)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        MultiModalDataItems, ProcessingMixin,
-                                        PromptReplacement)
-from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs
+                                        BaseProcessingInfo, PromptReplacement)
+from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
 from vllm.sequence import IntermediateTensors
 from vllm.utils import is_list_of
 
@@ -47,33 +46,52 @@ class LlavaNextVideoPixelInputs(TypedDict):
     """
 
 
-class LlavaNextVideoProcessingMixin(ProcessingMixin):
+class LlavaNextVideoProcessingInfo(BaseProcessingInfo):
 
-    def _get_hf_config(self):
+    def get_hf_config(self):
         return self.ctx.get_hf_config(LlavaNextVideoConfig)
 
-    def _get_vision_encoder_info(self):
-        return get_vision_encoder_info(self._get_hf_config())
+    def get_vision_encoder_info(self):
+        return get_vision_encoder_info(self.get_hf_config())
 
-    def _get_hf_processor(self):
+    def get_hf_processor(self):
         return self.ctx.get_hf_processor(LlavaNextVideoProcessor)
 
+    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+        return {"video": 1}
+
+    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
+        target_width, target_height = self.get_image_size_with_most_features()
+
+        max_video_tokens = self.get_num_video_tokens(
+            image_width=target_width,
+            image_height=target_height,
+            num_frames=self.get_num_frames_with_most_features(seq_len),
+        )
+
+        return {"video": max_video_tokens}
+
+    def get_image_size_with_most_features(self) -> ImageSize:
+        vision_encoder_info = self.get_vision_encoder_info()
+        width = height = vision_encoder_info.get_image_size()
+        return ImageSize(width=width, height=height)
+
     def _get_num_frame_tokens(
         self,
         *,
         image_width: int,
         image_height: int,
     ) -> int:
-        hf_config = self._get_hf_config()
+        hf_config = self.get_hf_config()
         spatial_pool_stride = hf_config.spatial_pool_stride
 
-        vision_encoder_info = self._get_vision_encoder_info()
+        vision_encoder_info = self.get_vision_encoder_info()
         patch_grid_length = vision_encoder_info.get_patch_grid_length()
         pooled_grid_length = math.ceil(patch_grid_length / spatial_pool_stride)
 
         return pooled_grid_length * pooled_grid_length
 
-    def _get_num_video_tokens(
+    def get_num_video_tokens(
         self,
         *,
         image_width: int,
@@ -87,37 +105,14 @@ def _get_num_video_tokens(
 
         return num_frame_tokens * num_frames
 
-
-class LlavaNextVideoProfilingInfo(LlavaNextVideoProcessingMixin,
-                                  BaseProfilingInfo):
-
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
-        return {"video": 1}
-
-    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
-        target_width, target_height = self._get_image_size_with_most_features()
-
-        max_video_tokens = self._get_num_video_tokens(
-            image_width=target_width,
-            image_height=target_height,
-            num_frames=self._get_dummy_num_frames(seq_len),
-        )
-
-        return {"video": max_video_tokens}
-
-    def _get_image_size_with_most_features(self) -> ImageSize:
-        vision_encoder_info = self._get_vision_encoder_info()
-        width = height = vision_encoder_info.get_image_size()
-        return ImageSize(width=width, height=height)
-
     def _get_max_video_frames(self, max_tokens: int) -> int:
-        target_width, target_height = self._get_image_size_with_most_features()
+        target_width, target_height = self.get_image_size_with_most_features()
 
         num_frames = 0
 
         while True:
             next_num_frames = num_frames + 1
-            next_max_tokens = self._get_num_video_tokens(
+            next_max_tokens = self.get_num_video_tokens(
                 image_width=target_width,
                 image_height=target_height,
                 num_frames=next_num_frames,
@@ -130,7 +125,7 @@ def _get_max_video_frames(self, max_tokens: int) -> int:
 
         return num_frames
 
-    def _get_dummy_num_frames(self, seq_len: int) -> int:
+    def get_num_frames_with_most_features(self, seq_len: int) -> int:
         mm_config = self.ctx.get_mm_config()
         max_videos = mm_config.limit_per_prompt.get("video", 1)
 
@@ -138,6 +133,10 @@ def _get_dummy_num_frames(self, seq_len: int) -> int:
 
         return max(max_total_frames // max(max_videos, 1), 1)
 
+
+class LlavaNextVideoDummyInputsBuilder(
+        BaseDummyInputsBuilder[LlavaNextVideoProcessingInfo]):
+
     def get_dummy_processor_inputs(
         self,
         seq_len: int,
@@ -145,16 +144,20 @@ def get_dummy_processor_inputs(
     ) -> ProcessorInputs:
         num_videos = mm_counts.get("video", 0)
 
-        processor = self._get_hf_processor()
+        processor = self.info.get_hf_processor()
         video_token = processor.video_token
-        target_width, target_height = self._get_image_size_with_most_features()
+
+        target_width, target_height = \
+            self.info.get_image_size_with_most_features()
+        target_num_frames = \
+            self.info.get_num_frames_with_most_features(seq_len)
 
         mm_data = {
             "video":
             self._get_dummy_videos(
                 width=target_width,
                 height=target_height,
-                num_frames=self._get_dummy_num_frames(seq_len),
+                num_frames=target_num_frames,
                 num_videos=num_videos,
             )
         }
@@ -165,11 +168,8 @@ def get_dummy_processor_inputs(
         )
 
 
-class LlavaNextVideoMultiModalProcessor(LlavaNextVideoProcessingMixin,
-                                        BaseMultiModalProcessor):
-
-    def _get_profiling_info(self) -> BaseProfilingInfo:
-        return LlavaNextVideoProfilingInfo(self.ctx)
+class LlavaNextVideoMultiModalProcessor(
+        BaseMultiModalProcessor[LlavaNextVideoProcessingInfo]):
 
     def _get_mm_fields_config(
         self,
@@ -184,7 +184,7 @@ def _get_prompt_replacements(
         hf_processor_mm_kwargs: Mapping[str, object],
         out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
-        hf_config = self._get_hf_config()
+        hf_config = self.info.get_hf_config()
         video_token_id = hf_config.video_token_index
 
         def get_replacement(item_idx: int):
@@ -195,7 +195,7 @@ def get_replacement(item_idx: int):
                 num_video_tokens = videos.get_feature_size(item_idx)
             else:
                 image_size = videos.get_frame_size(item_idx)
-                num_video_tokens = self._get_num_video_tokens(
+                num_video_tokens = self.info.get_num_video_tokens(
                     image_width=image_size.width,
                     image_height=image_size.height,
                     num_frames=videos.get_num_frames(item_idx),
@@ -269,7 +269,11 @@ def forward(self, image_features: torch.Tensor) -> torch.Tensor:
         return hidden_states
 
 
-@MULTIMODAL_REGISTRY.register_processor(LlavaNextVideoMultiModalProcessor)
+@MULTIMODAL_REGISTRY.register_processor(
+    LlavaNextVideoMultiModalProcessor,
+    info=LlavaNextVideoProcessingInfo,
+    dummy_inputs=LlavaNextVideoDummyInputsBuilder,
+)
 class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal,
                                              SupportsPP):
 
diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py
index b5e3edba1f01c..78a47e64d9afc 100644
--- a/vllm/model_executor/models/llava_onevision.py
+++ b/vllm/model_executor/models/llava_onevision.py
@@ -17,19 +17,20 @@
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors
-from vllm.multimodal.parse import (ImageSize, MultiModalDataItems,
-                                   VideoEmbeddingItems, VideoProcessorItems)
-from vllm.multimodal.processing import MultiModalFieldConfig, PromptReplacement
-from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs
+from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
+                                    NestedTensors)
+from vllm.multimodal.parse import (MultiModalDataItems, VideoEmbeddingItems,
+                                   VideoProcessorItems)
+from vllm.multimodal.processing import PromptReplacement
+from vllm.multimodal.profiling import ProcessorInputs
 from vllm.sequence import IntermediateTensors
 from vllm.utils import is_list_of
 
 from .clip import CLIPVisionModel
 from .interfaces import SupportsMultiModal, SupportsPP
-from .llava import BaseLlavaProfilingInfo, init_vision_tower_for_llava
-from .llava_next import (LlavaNextLikeConfig, LlavaNextMultiModalProcessor,
-                         LlavaNextProcessingMixin)
+from .llava import LlavaDummyInputsBuilder, init_vision_tower_for_llava
+from .llava_next import (BaseLlavaNextMultiModalProcessor, LlavaNextLikeConfig,
+                         LlavaNextProcessingInfo)
 from .siglip import SiglipVisionModel
 from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model,
                     maybe_prefix, merge_multimodal_embeddings)
@@ -89,14 +90,23 @@ class LlavaOnevisionLikeConfig(LlavaNextLikeConfig, Protocol):
     video_token_index: Final[int]
 
 
-class LlavaOnevisionProcessingMixin(LlavaNextProcessingMixin):
+class LlavaOnevisionProcessingInfo(LlavaNextProcessingInfo):
 
-    def _get_hf_config(self) -> LlavaOnevisionLikeConfig:
+    def get_hf_config(self) -> LlavaOnevisionLikeConfig:
         return self.ctx.get_hf_config(LlavaOnevisionConfig)
 
-    def _get_hf_processor(self):
+    def get_hf_processor(self):
         return self.ctx.get_hf_processor(LlavaOnevisionProcessor)
 
+    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+        return {"image": None, "video": None}
+
+    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
+        return {
+            "image": self.get_max_image_tokens(),
+            "video": self.get_max_video_tokens(seq_len),
+        }
+
     # Based on: https://github.com/huggingface/text-generation-inference/blob/v3.0.1/server/text_generation_server/models/vlm_causal_lm.py#L86
     # with additional logic afterwards taken from LlavaOnevisionProcessor
     def _get_num_unpadded_features(
@@ -141,16 +151,16 @@ def _get_num_frame_tokens(
         image_width: int,
         image_height: int,
     ) -> int:
-        hf_config = self._get_hf_config()
+        hf_config = self.get_hf_config()
         spatial_pool_stride = getattr(hf_config, "spatial_pool_stride", 2)
 
-        vision_encoder_info = self._get_vision_encoder_info()
+        vision_encoder_info = self.get_vision_encoder_info()
         patch_grid_length = vision_encoder_info.get_patch_grid_length()
         pooled_grid_length = math.ceil(patch_grid_length / spatial_pool_stride)
 
         return pooled_grid_length * pooled_grid_length
 
-    def _get_num_video_tokens(
+    def get_num_video_tokens(
         self,
         *,
         image_width: int,
@@ -164,43 +174,14 @@ def _get_num_video_tokens(
 
         return num_frame_tokens * num_frames + 1  # Newline token
 
-
-class LlavaOnevisionProfilingInfo(LlavaOnevisionProcessingMixin,
-                                  BaseLlavaProfilingInfo):
-
-    def _get_image_size_with_most_features(self) -> ImageSize:
-        hf_config = self._get_hf_config()
-        largest_feature_size, largest_feature_pinpoint = 0, None
-        for (height, width) in hf_config.image_grid_pinpoints:
-            feat_size = self._get_num_image_tokens(image_width=width,
-                                                   image_height=height)
-            if feat_size > largest_feature_size:
-                largest_feature_size = feat_size
-                largest_feature_pinpoint = ImageSize(width=width,
-                                                     height=height)
-
-        if largest_feature_size == 0 or largest_feature_pinpoint is None:
-            raise ValueError("Cannot have a largest feature size of 0!")
-
-        return largest_feature_pinpoint
-
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
-        return {"image": None, "video": None}
-
-    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
-        return {
-            "image": self._get_max_image_tokens(),
-            "video": self._get_max_video_tokens(seq_len),
-        }
-
     def _get_max_video_frames(self, max_tokens: int) -> int:
-        target_width, target_height = self._get_image_size_with_most_features()
+        target_width, target_height = self.get_image_size_with_most_features()
 
         num_frames = 0
 
         while True:
             next_num_frames = num_frames + 1
-            next_max_tokens = self._get_num_video_tokens(
+            next_max_tokens = self.get_num_video_tokens(
                 image_width=target_width,
                 image_height=target_height,
                 num_frames=next_num_frames,
@@ -213,12 +194,12 @@ def _get_max_video_frames(self, max_tokens: int) -> int:
 
         return num_frames
 
-    def _get_dummy_num_frames(self, seq_len: int) -> int:
+    def get_num_frames_with_most_features(self, seq_len: int) -> int:
         mm_config = self.ctx.get_mm_config()
         max_images = mm_config.limit_per_prompt.get("image", 1)
         max_videos = mm_config.limit_per_prompt.get("video", 1)
 
-        max_image_tokens = self._get_max_image_tokens() * max_images
+        max_image_tokens = self.get_max_image_tokens() * max_images
         max_total_frames = self._get_max_video_frames(seq_len -
                                                       max_image_tokens)
         max_frames_per_video = min(max_total_frames // max(max_videos, 1),
@@ -226,15 +207,19 @@ def _get_dummy_num_frames(self, seq_len: int) -> int:
 
         return max(max_frames_per_video, 1)
 
-    def _get_max_video_tokens(self, seq_len: int) -> int:
-        target_width, target_height = self._get_image_size_with_most_features()
+    def get_max_video_tokens(self, seq_len: int) -> int:
+        target_width, target_height = self.get_image_size_with_most_features()
 
-        return self._get_num_video_tokens(
+        return self.get_num_video_tokens(
             image_width=target_width,
             image_height=target_height,
-            num_frames=self._get_dummy_num_frames(seq_len),
+            num_frames=self.get_num_frames_with_most_features(seq_len),
         )
 
+
+class LlavaOnevisionDummyInputsBuilder(
+        LlavaDummyInputsBuilder[LlavaOnevisionProcessingInfo]):
+
     def get_dummy_processor_inputs(
         self,
         seq_len: int,
@@ -243,10 +228,14 @@ def get_dummy_processor_inputs(
         num_images = mm_counts.get("image", 0)
         num_videos = mm_counts.get("video", 0)
 
-        processor = self._get_hf_processor()
+        processor = self.info.get_hf_processor()
         image_token = processor.image_token
         video_token = processor.video_token
-        target_width, target_height = self._get_image_size_with_most_features()
+
+        target_width, target_height = \
+            self.info.get_image_size_with_most_features()
+        target_num_frames = \
+            self.info.get_num_frames_with_most_features(seq_len)
 
         mm_data = {
             "image":
@@ -257,7 +246,7 @@ def get_dummy_processor_inputs(
             self._get_dummy_videos(
                 width=target_width,
                 height=target_height,
-                num_frames=self._get_dummy_num_frames(seq_len),
+                num_frames=target_num_frames,
                 num_videos=num_videos,
             )
         }
@@ -268,11 +257,8 @@ def get_dummy_processor_inputs(
         )
 
 
-class LlavaOnevisionMultiModalProcessor(LlavaOnevisionProcessingMixin,
-                                        LlavaNextMultiModalProcessor):
-
-    def _get_profiling_info(self) -> BaseProfilingInfo:
-        return LlavaOnevisionProfilingInfo(self.ctx)
+class LlavaOnevisionMultiModalProcessor(
+        BaseLlavaNextMultiModalProcessor[LlavaOnevisionProcessingInfo]):
 
     def _get_mm_fields_config(
         self,
@@ -303,7 +289,7 @@ def _call_hf_processor(
                 mm_kwargs=mm_kwargs,
             )
 
-        processor = self._get_hf_processor()
+        processor = self.info.get_hf_processor()
         video_token = processor.video_token
 
         # LLaVA-OneVision processor doesn't support multiple videos
@@ -345,7 +331,7 @@ def _get_prompt_replacements(
             out_mm_kwargs=out_mm_kwargs,
         )
 
-        hf_config = self._get_hf_config()
+        hf_config = self.info.get_hf_config()
         video_token_id = hf_config.video_token_index
 
         def get_video_replacement(item_idx: int):
@@ -356,7 +342,7 @@ def get_video_replacement(item_idx: int):
                 num_video_tokens = videos.get_feature_size(item_idx)
             else:
                 image_size = videos.get_frame_size(item_idx)
-                num_video_tokens = self._get_num_video_tokens(
+                num_video_tokens = self.info.get_num_video_tokens(
                     image_width=image_size.width,
                     image_height=image_size.height,
                     num_frames=videos.get_num_frames(item_idx),
@@ -393,7 +379,10 @@ def forward(self, image_features: torch.Tensor) -> torch.Tensor:
         return hidden_states
 
 
-@MULTIMODAL_REGISTRY.register_processor(LlavaOnevisionMultiModalProcessor)
+@MULTIMODAL_REGISTRY.register_processor(
+    LlavaOnevisionMultiModalProcessor,
+    info=LlavaOnevisionProcessingInfo,
+    dummy_inputs=LlavaOnevisionDummyInputsBuilder)
 class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal,
                                              SupportsPP):
 
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index c8418c14e5fdf..a1b1af35604db 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -34,13 +34,12 @@
                                     MultiModalInputsV2, MultiModalKwargs,
                                     NestedTensors, PlaceholderRange)
 from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
-                                   ImageSize)
+                                   ImageSize, MultiModalDataItems)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        MultiModalDataItems, ProcessingMixin,
-                                        PromptReplacement,
-                                        _BoundPromptReplacement,
-                                        _PlaceholderInfo)
-from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs
+                                        BaseProcessingInfo,
+                                        BoundPromptReplacement,
+                                        PlaceholderInfo, PromptReplacement)
+from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
 from vllm.sequence import IntermediateTensors
 from vllm.utils import is_list_of
 
@@ -302,9 +301,9 @@ def add_image_newline(self, image_features_hd):
         return image_features_hd_newline
 
 
-class Phi3VProcessingMixin(ProcessingMixin):
+class Phi3VProcessingInfo(BaseProcessingInfo):
 
-    def _get_hf_processor(
+    def get_hf_processor(
         self,
         *,
         num_crops: Optional[int] = None,
@@ -314,39 +313,42 @@ def _get_hf_processor(
 
         return self.ctx.get_hf_processor()
 
-    def _get_num_image_tokens(
-        self,
-        *,
-        image_width: int,
-        image_height: int,
-    ) -> int:
-        processor = self._get_hf_processor()
-
-        return processor.calc_num_image_tokens_from_image_size(  # type: ignore
-            width=image_width,
-            height=image_height,
-        )
-
-
-class Phi3VProfilingInfo(Phi3VProcessingMixin, BaseProfilingInfo):
-
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": None}
 
     def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
-        target_width, target_height = self._get_image_size_with_most_features()
+        target_width, target_height = self.get_image_size_with_most_features()
 
-        max_image_tokens = self._get_num_image_tokens(
+        max_image_tokens = self.get_num_image_tokens(
             image_width=target_width,
             image_height=target_height,
+            processor=None,
         )
 
         return {"image": max_image_tokens}
 
-    def _get_image_size_with_most_features(self) -> ImageSize:
+    def get_num_image_tokens(
+        self,
+        *,
+        image_width: int,
+        image_height: int,
+        processor: Optional[ProcessorMixin],
+    ) -> int:
+        if processor is None:
+            processor = self.get_hf_processor()
+
+        return processor.calc_num_image_tokens_from_image_size(  # type: ignore
+            width=image_width,
+            height=image_height,
+        )
+
+    def get_image_size_with_most_features(self) -> ImageSize:
         # Result in the max possible feature size (h:w = 16:1)
         return ImageSize(height=8000, width=50)
 
+
+class Phi3VDummyInputsBuilder(BaseDummyInputsBuilder[Phi3VProcessingInfo]):
+
     def get_dummy_processor_inputs(
         self,
         seq_len: int,
@@ -354,7 +356,8 @@ def get_dummy_processor_inputs(
     ) -> ProcessorInputs:
         num_images = mm_counts.get("image", 0)
 
-        target_width, target_height = self._get_image_size_with_most_features()
+        target_width, target_height = \
+            self.info.get_image_size_with_most_features()
 
         mm_data = {
             "image":
@@ -363,7 +366,7 @@ def get_dummy_processor_inputs(
                                    num_images=num_images)
         }
 
-        hf_processor = self._get_hf_processor()
+        hf_processor = self.info.get_hf_processor()
         image_tokens: list[str] = hf_processor.img_tokens  # type: ignore
 
         return ProcessorInputs(
@@ -372,10 +375,7 @@ def get_dummy_processor_inputs(
         )
 
 
-class Phi3VMultiModalProcessor(Phi3VProcessingMixin, BaseMultiModalProcessor):
-
-    def _get_profiling_info(self) -> BaseProfilingInfo:
-        return Phi3VProfilingInfo(self.ctx)
+class Phi3VMultiModalProcessor(BaseMultiModalProcessor[Phi3VProcessingInfo]):
 
     def _call_hf_processor(
         self,
@@ -416,10 +416,10 @@ def _get_prompt_replacements(
         hf_processor_mm_kwargs: Mapping[str, Any],
         out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
-        hf_processor = self._get_hf_processor(**hf_processor_mm_kwargs)
+        hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
         image_tokens: list[str] = hf_processor.img_tokens  # type: ignore
 
-        tokenizer = self._get_tokenizer()
+        tokenizer = self.info.get_tokenizer()
         bos_token_id = tokenizer.bos_token_id
         assert isinstance(bos_token_id, int)
 
@@ -431,9 +431,10 @@ def get_replacement_phi3v(item_idx: int):
                 num_image_tokens = images.get_feature_size(item_idx)
             else:
                 image_size = images.get_image_size(item_idx)
-                num_image_tokens = self._get_num_image_tokens(
+                num_image_tokens = self.info.get_num_image_tokens(
                     image_width=image_size.width,
                     image_height=image_size.height,
+                    processor=hf_processor,
                 )
 
             return [_IMAGE_TOKEN_ID] * num_image_tokens + [bos_token_id]
@@ -451,9 +452,9 @@ def get_replacement_phi3v(item_idx: int):
     def _apply_prompt_replacements(
         self,
         token_ids: list[int],
-        mm_prompt_repls: Mapping[str, Sequence[_BoundPromptReplacement]],
+        mm_prompt_repls: Mapping[str, Sequence[BoundPromptReplacement]],
         mm_item_counts: Mapping[str, int],
-    ) -> tuple[list[int], str, Mapping[str, list[_PlaceholderInfo]]]:
+    ) -> tuple[list[int], str, Mapping[str, list[PlaceholderInfo]]]:
         token_ids, text, placeholders = super()._apply_prompt_replacements(
             token_ids=token_ids,
             mm_prompt_repls=mm_prompt_repls,
@@ -466,7 +467,7 @@ def _apply_prompt_replacements(
             token_ids = [token_ids[0], *token_ids[2:]]
             placeholders = {
                 modality: [
-                    _PlaceholderInfo(
+                    PlaceholderInfo(
                         modality=p.modality,
                         item_idx=p.item_idx,
                         start_idx=p.start_idx - 1,
@@ -499,7 +500,9 @@ def apply(
         return result
 
 
-@MULTIMODAL_REGISTRY.register_processor(Phi3VMultiModalProcessor)
+@MULTIMODAL_REGISTRY.register_processor(Phi3VMultiModalProcessor,
+                                        info=Phi3VProcessingInfo,
+                                        dummy_inputs=Phi3VDummyInputsBuilder)
 class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py
index 7012ddc66cd9c..0dff9595c6c08 100644
--- a/vllm/model_executor/models/qwen2_audio.py
+++ b/vllm/model_executor/models/qwen2_audio.py
@@ -38,11 +38,11 @@
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
                                     NestedTensors)
-from vllm.multimodal.parse import AudioProcessorItems, MultiModalDataParser
+from vllm.multimodal.parse import (AudioProcessorItems, MultiModalDataItems,
+                                   MultiModalDataParser)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        MultiModalDataItems, ProcessingMixin,
-                                        PromptReplacement)
-from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs
+                                        BaseProcessingInfo, PromptReplacement)
+from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsMultiModal, SupportsPP
@@ -80,12 +80,12 @@ def _get_feat_extract_output_lengths(input_lengths: torch.Tensor):
     return feat_lengths, output_lengths
 
 
-class Qwen2AudioProcessingMixin(ProcessingMixin):
+class Qwen2AudioProcessingInfo(BaseProcessingInfo):
 
-    def _get_hf_config(self):
+    def get_hf_config(self):
         return self.ctx.get_hf_config(Qwen2AudioConfig)
 
-    def _get_hf_processor(
+    def get_hf_processor(
         self,
         *,
         # Ignored in initialization
@@ -93,36 +93,37 @@ def _get_hf_processor(
     ) -> Qwen2AudioProcessor:
         return self.ctx.get_hf_processor(Qwen2AudioProcessor)
 
-    def _get_feature_extractor(
+    def get_feature_extractor(
         self,
         *,
         # Ignored in initialization
         sampling_rate: Optional[int] = None,
     ) -> WhisperFeatureExtractor:
-        hf_processor = self._get_hf_processor(sampling_rate=sampling_rate)
+        hf_processor = self.get_hf_processor(sampling_rate=sampling_rate)
         feature_extractor = hf_processor.feature_extractor  # type: ignore
         assert isinstance(feature_extractor, WhisperFeatureExtractor)
         return feature_extractor
 
-
-class Qwen2AudioProfilingInfo(Qwen2AudioProcessingMixin, BaseProfilingInfo):
-
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"audio": None}
 
     def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
-        hf_config = self._get_hf_config()
+        hf_config = self.get_hf_config()
         max_source_positions = hf_config.audio_config.max_source_positions
         max_output_lengths = (max_source_positions - 2) // 2 + 1
 
         return {"audio": max_output_lengths}
 
+
+class Qwen2AudioDummyInputsBuilder(
+        BaseDummyInputsBuilder[Qwen2AudioProcessingInfo]):
+
     def get_dummy_processor_inputs(
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
     ) -> ProcessorInputs:
-        feature_extractor = self._get_feature_extractor()
+        feature_extractor = self.info.get_feature_extractor()
 
         sampling_rate = feature_extractor.sampling_rate
         audio_len = feature_extractor.chunk_length * sampling_rate
@@ -139,14 +140,11 @@ def get_dummy_processor_inputs(
         )
 
 
-class Qwen2AudioMultiModalProcessor(Qwen2AudioProcessingMixin,
-                                    BaseMultiModalProcessor):
-
-    def _get_profiling_info(self) -> BaseProfilingInfo:
-        return Qwen2AudioProfilingInfo(self.ctx)
+class Qwen2AudioMultiModalProcessor(
+        BaseMultiModalProcessor[Qwen2AudioProcessingInfo]):
 
     def _get_data_parser(self) -> MultiModalDataParser:
-        feature_extractor = self._get_feature_extractor()
+        feature_extractor = self.info.get_feature_extractor()
         return MultiModalDataParser(target_sr=feature_extractor.sampling_rate)
 
     def _call_hf_processor(
@@ -161,7 +159,7 @@ def _call_hf_processor(
         if audios:
             mm_data["audios"] = audios
 
-            feature_extractor = self._get_feature_extractor(**mm_kwargs)
+            feature_extractor = self.info.get_feature_extractor(**mm_kwargs)
             mm_kwargs = dict(
                 **mm_kwargs,
                 sampling_rate=feature_extractor.sampling_rate,
@@ -194,7 +192,7 @@ def _get_prompt_replacements(
         hf_processor_mm_kwargs: Mapping[str, object],
         out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
-        hf_config = self._get_hf_config()
+        hf_config = self.info.get_hf_config()
         placeholder = hf_config.audio_token_index
 
         feature_attention_mask = out_mm_kwargs.get("feature_attention_mask")
@@ -234,10 +232,13 @@ def _always_apply_prompt_replacements(self) -> bool:
         # has already performed processing for multi-audio input when the input
         # audios are short (the corresponding placeholders may take up fewer
         # tokens than the number of audio items)
-        return not hasattr(self._get_hf_processor(), "audio_token")
+        return not hasattr(self.info.get_hf_processor(), "audio_token")
 
 
-@MULTIMODAL_REGISTRY.register_processor(Qwen2AudioMultiModalProcessor)
+@MULTIMODAL_REGISTRY.register_processor(
+    Qwen2AudioMultiModalProcessor,
+    info=Qwen2AudioProcessingInfo,
+    dummy_inputs=Qwen2AudioDummyInputsBuilder)
 class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal,
                                          SupportsPP):
 
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index a5c2fb9e84df3..8537fec854b6d 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -57,11 +57,10 @@
                                     MultiModalFieldConfig, MultiModalKwargs,
                                     NestedTensors, VideoItem)
 from vllm.multimodal.parse import (ImageSize, ModalityDataItems,
-                                   MultiModalDataParser)
+                                   MultiModalDataItems, MultiModalDataParser)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        MultiModalDataItems, ProcessingMixin,
-                                        PromptReplacement)
-from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs
+                                        BaseProcessingInfo, PromptReplacement)
+from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
 from vllm.platforms import _Backend
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.config import uses_mrope
@@ -709,12 +708,12 @@ def _parse_video_data(
         return super()._parse_video_data(data)
 
 
-class Qwen2VLProcessingMixin(ProcessingMixin):
+class Qwen2VLProcessingInfo(BaseProcessingInfo):
 
-    def _get_hf_config(self):
+    def get_hf_config(self):
         return self.ctx.get_hf_config(Qwen2VLConfig)
 
-    def _get_hf_processor(
+    def get_hf_processor(
         self,
         *,
         min_pixels: Optional[int] = None,
@@ -736,18 +735,27 @@ def _get_hf_processor(
 
         return hf_processor
 
-    def _get_image_processor(
+    def get_image_processor(
         self,
         *,
         min_pixels: Optional[int] = None,
         max_pixels: Optional[int] = None,
     ):
-        hf_processor = self._get_hf_processor(min_pixels=min_pixels,
-                                              max_pixels=max_pixels)
+        hf_processor = self.get_hf_processor(min_pixels=min_pixels,
+                                             max_pixels=max_pixels)
         image_processor = hf_processor.image_processor  # type: ignore
         assert isinstance(image_processor, Qwen2VLImageProcessor)
         return image_processor
 
+    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+        return {"image": None, "video": None}
+
+    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
+        return {
+            "image": self.get_max_image_tokens(),
+            "video": self.get_max_video_tokens(seq_len),
+        }
+
     def _get_vision_info(
         self,
         *,
@@ -755,15 +763,17 @@ def _get_vision_info(
         image_height: int,
         num_frames: int = 1,
         do_resize: bool = True,
+        image_processor: Optional[Qwen2VLImageProcessor],
     ) -> tuple[ImageSize, int]:
-        hf_config = self._get_hf_config()
+        if image_processor is None:
+            image_processor = self.get_image_processor()
+
+        hf_config = self.get_hf_config()
         vision_config = hf_config.vision_config
         patch_size = vision_config.patch_size
         merge_size = vision_config.spatial_merge_size
         temporal_patch_size = vision_config.temporal_patch_size
 
-        image_processor = self._get_image_processor()
-
         if do_resize:
             resized_height, resized_width = smart_resize(
                 height=image_height,
@@ -787,70 +797,65 @@ def _get_vision_info(
 
         return preprocessed_size, num_vision_tokens
 
-    def _get_num_image_tokens(
+    def get_num_image_tokens(
         self,
         *,
         image_width: int,
         image_height: int,
+        image_processor: Optional[Qwen2VLImageProcessor],
     ) -> int:
         _, num_image_tokens = self._get_vision_info(
             image_width=image_width,
             image_height=image_height,
+            image_processor=image_processor,
         )
         return num_image_tokens
 
-    def _get_num_video_tokens(
+    def get_num_video_tokens(
         self,
         *,
         image_width: int,
         image_height: int,
         num_frames: int,
+        image_processor: Optional[Qwen2VLImageProcessor],
     ) -> int:
         _, num_video_tokens = self._get_vision_info(
             image_width=image_width,
             image_height=image_height,
             num_frames=num_frames,
+            image_processor=image_processor,
         )
         return num_video_tokens
 
-
-class Qwen2VLProfilingInfo(Qwen2VLProcessingMixin, BaseProfilingInfo):
-
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
-        return {"image": None, "video": None}
-
-    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
-        return {
-            "image": self._get_max_image_tokens(),
-            "video": self._get_max_video_tokens(seq_len),
-        }
-
-    def _get_image_size_with_most_features(self) -> ImageSize:
+    def get_image_size_with_most_features(self) -> ImageSize:
         max_image_size, _ = self._get_vision_info(
             image_width=9999999,
             image_height=9999999,
+            image_processor=None,
         )
         return max_image_size
 
-    def _get_max_image_tokens(self) -> int:
-        target_width, target_height = self._get_image_size_with_most_features()
+    def get_max_image_tokens(self) -> int:
+        target_width, target_height = self.get_image_size_with_most_features()
 
-        return self._get_num_image_tokens(
+        return self.get_num_image_tokens(
             image_width=target_width,
             image_height=target_height,
+            image_processor=None,
         )
 
     def _get_max_video_frames(self, max_tokens: int) -> int:
-        target_width, target_height = self._get_image_size_with_most_features()
+        target_width, target_height = self.get_image_size_with_most_features()
 
         num_frames = 0
 
         while True:
             next_num_frames = num_frames + 1
-            next_max_tokens = self._get_num_video_tokens(
+            next_max_tokens = self.get_num_video_tokens(
                 image_width=target_width,
                 image_height=target_height,
                 num_frames=next_num_frames,
+                image_processor=None,
             )
 
             if next_max_tokens > max_tokens:
@@ -860,12 +865,12 @@ def _get_max_video_frames(self, max_tokens: int) -> int:
 
         return num_frames
 
-    def _get_dummy_num_frames(self, seq_len: int) -> int:
+    def get_num_frames_with_most_features(self, seq_len: int) -> int:
         mm_config = self.ctx.get_mm_config()
         max_images = mm_config.limit_per_prompt.get("image", 1)
         max_videos = mm_config.limit_per_prompt.get("video", 1)
 
-        max_image_tokens = self._get_max_image_tokens() * max_images
+        max_image_tokens = self.get_max_image_tokens() * max_images
         max_total_frames = self._get_max_video_frames(seq_len -
                                                       max_image_tokens)
 
@@ -877,15 +882,19 @@ def _get_dummy_num_frames(self, seq_len: int) -> int:
 
         return num_frames
 
-    def _get_max_video_tokens(self, seq_len: int) -> int:
-        target_width, target_height = self._get_image_size_with_most_features()
+    def get_max_video_tokens(self, seq_len: int) -> int:
+        target_width, target_height = self.get_image_size_with_most_features()
 
-        return self._get_num_video_tokens(
+        return self.get_num_video_tokens(
             image_width=target_width,
             image_height=target_height,
-            num_frames=self._get_dummy_num_frames(seq_len),
+            num_frames=self.get_num_frames_with_most_features(seq_len),
+            image_processor=None,
         )
 
+
+class Qwen2VLDummyInputsBuilder(BaseDummyInputsBuilder[Qwen2VLProcessingInfo]):
+
     def get_dummy_processor_inputs(
         self,
         seq_len: int,
@@ -894,10 +903,14 @@ def get_dummy_processor_inputs(
         num_images = mm_counts.get("image", 0)
         num_videos = mm_counts.get("video", 0)
 
-        hf_processor = self._get_hf_processor()
+        hf_processor = self.info.get_hf_processor()
         image_token: str = hf_processor.image_token
         video_token: str = hf_processor.video_token
-        target_width, target_height = self._get_image_size_with_most_features()
+
+        target_width, target_height = \
+            self.info.get_image_size_with_most_features()
+        target_num_frames = \
+            self.info.get_num_frames_with_most_features(seq_len)
 
         mm_data = {
             "image":
@@ -908,7 +921,7 @@ def get_dummy_processor_inputs(
             self._get_dummy_videos(
                 width=target_width,
                 height=target_height,
-                num_frames=self._get_dummy_num_frames(seq_len),
+                num_frames=target_num_frames,
                 num_videos=num_videos,
             )
         }
@@ -919,11 +932,8 @@ def get_dummy_processor_inputs(
         )
 
 
-class Qwen2VLMultiModalProcessor(Qwen2VLProcessingMixin,
-                                 BaseMultiModalProcessor):
-
-    def _get_profiling_info(self) -> BaseProfilingInfo:
-        return Qwen2VLProfilingInfo(self.ctx)
+class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo]
+                                 ):
 
     def _get_data_parser(self) -> MultiModalDataParser:
         return Qwen2MultiModalDataParser()
@@ -934,8 +944,9 @@ def _get_prompt_replacements(
         hf_processor_mm_kwargs: Mapping[str, Any],
         out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
-        hf_processor = self._get_hf_processor(**hf_processor_mm_kwargs)
-        image_processor = self._get_image_processor(**hf_processor_mm_kwargs)
+        hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
+        image_processor = self.info.get_image_processor(
+            **hf_processor_mm_kwargs)
 
         # NOTE: Only Qwen2VLProcessor in transformers 4.47.0 has
         # image_token and video_token registered
@@ -991,7 +1002,9 @@ def _get_mm_fields_config(
         )
 
 
-@MULTIMODAL_REGISTRY.register_processor(Qwen2VLMultiModalProcessor)
+@MULTIMODAL_REGISTRY.register_processor(Qwen2VLMultiModalProcessor,
+                                        info=Qwen2VLProcessingInfo,
+                                        dummy_inputs=Qwen2VLDummyInputsBuilder)
 class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
                                       SupportsLoRA, SupportsPP):
     packed_modules_mapping = {
diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py
index ecafd157b1d61..fada22d685dd6 100644
--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -24,11 +24,10 @@
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
                                     NestedTensors)
-from vllm.multimodal.parse import MultiModalDataParser
+from vllm.multimodal.parse import MultiModalDataItems, MultiModalDataParser
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        MultiModalDataItems, ProcessingMixin,
-                                        PromptReplacement)
-from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs
+                                        BaseProcessingInfo, PromptReplacement)
+from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.configs.ultravox import UltravoxConfig
 
@@ -59,9 +58,9 @@ class UltravoxAudioEmbeddingInputs(TypedDict):
                             UltravoxAudioEmbeddingInputs]
 
 
-class UltravoxProcessingMixin(ProcessingMixin):
+class UltravoxProcessingInfo(BaseProcessingInfo):
 
-    def _get_hf_processor(
+    def get_hf_processor(
         self,
         *,
         # Ignored in initialization
@@ -76,37 +75,38 @@ def _get_hf_processor(
         hf_processor.audio_token_replacement = _AUDIO_PLACEHOLDER_OVERRIDE
         return hf_processor
 
-    def _get_feature_extractor(
+    def get_feature_extractor(
         self,
         *,
         # Ignored in initialization
         sampling_rate: Optional[int] = None,
     ) -> WhisperFeatureExtractor:
-        hf_processor = self._get_hf_processor(sampling_rate=sampling_rate)
+        hf_processor = self.get_hf_processor(sampling_rate=sampling_rate)
         audio_processor = hf_processor.audio_processor  # type: ignore
         feature_extractor = audio_processor.feature_extractor  # type: ignore
         assert isinstance(feature_extractor, WhisperFeatureExtractor)
         return feature_extractor
 
-
-class UltravoxProfilingInfo(UltravoxProcessingMixin, BaseProfilingInfo):
-
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"audio": None}
 
     def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
-        feature_extractor = self._get_feature_extractor()
+        feature_extractor = self.get_feature_extractor()
         max_audio_tokens = math.ceil(feature_extractor.chunk_length *
                                      _AUDIO_TOKENS_PER_SECOND)
 
         return {"audio": max_audio_tokens}
 
+
+class UltravoxDummyInputsBuilder(BaseDummyInputsBuilder[UltravoxProcessingInfo]
+                                 ):
+
     def get_dummy_processor_inputs(
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
     ) -> ProcessorInputs:
-        feature_extractor = self._get_feature_extractor()
+        feature_extractor = self.info.get_feature_extractor()
 
         sampling_rate = feature_extractor.sampling_rate
         audio_len = feature_extractor.chunk_length * sampling_rate
@@ -123,14 +123,11 @@ def get_dummy_processor_inputs(
         )
 
 
-class UltravoxMultiModalProcessor(UltravoxProcessingMixin,
-                                  BaseMultiModalProcessor):
-
-    def _get_profiling_info(self) -> BaseProfilingInfo:
-        return UltravoxProfilingInfo(self.ctx)
+class UltravoxMultiModalProcessor(
+        BaseMultiModalProcessor[UltravoxProcessingInfo]):
 
     def _get_data_parser(self) -> MultiModalDataParser:
-        feature_extractor = self._get_feature_extractor()
+        feature_extractor = self.info.get_feature_extractor()
         return MultiModalDataParser(target_sr=feature_extractor.sampling_rate)
 
     def _call_hf_processor(
@@ -141,7 +138,7 @@ def _call_hf_processor(
     ) -> BatchFeature:
         # Text-only input not supported in composite processor
         if not mm_data:
-            tokenizer = self._get_tokenizer()
+            tokenizer = self.info.get_tokenizer()
 
             prompt_ids = tokenizer.encode(
                 prompt,
@@ -160,7 +157,7 @@ def _call_hf_processor(
                 mm_kwargs=mm_kwargs,
             )
 
-        feature_extractor = self._get_feature_extractor()
+        feature_extractor = self.info.get_feature_extractor()
         mm_kwargs = dict(
             **mm_kwargs,
             sampling_rate=feature_extractor.sampling_rate,
@@ -208,7 +205,7 @@ def _get_prompt_replacements(
         hf_processor_mm_kwargs: Mapping[str, Any],
         out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
-        hf_processor = self._get_hf_processor(**hf_processor_mm_kwargs)
+        hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
         placeholder = hf_processor.audio_token_replacement  # type: ignore
 
         def get_replacement_ultravox(item_idx: int):
@@ -342,7 +339,10 @@ def forward(
         return hidden_states
 
 
-@MULTIMODAL_REGISTRY.register_processor(UltravoxMultiModalProcessor)
+@MULTIMODAL_REGISTRY.register_processor(UltravoxMultiModalProcessor,
+                                        info=UltravoxProcessingInfo,
+                                        dummy_inputs=UltravoxDummyInputsBuilder
+                                        )
 class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP):
 
     hf_to_vllm_mapper = WeightsMapper(
diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index 41113cd85bd16..c6a30cacebdd1 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -4,12 +4,13 @@
 from collections.abc import Callable, ItemsView, Iterable, Mapping, Sequence
 from dataclasses import dataclass, field
 from functools import lru_cache
-from typing import Any, NamedTuple, Optional, Protocol, TypeVar, Union
+from typing import (TYPE_CHECKING, Generic, NamedTuple, Optional, Protocol,
+                    TypeVar, Union)
 
 from transformers import BatchFeature, PretrainedConfig, ProcessorMixin
 
-from vllm import envs
-from vllm.inputs import DummyData, InputProcessingContext
+import vllm.envs as envs
+from vllm.inputs import InputProcessingContext
 from vllm.logger import init_logger
 from vllm.transformers_utils.tokenizer import (AnyTokenizer, decode_tokens,
                                                encode_tokens)
@@ -20,7 +21,9 @@
                      MultiModalInputsV2, MultiModalKwargs,
                      MultiModalKwargsItem, PlaceholderRange)
 from .parse import MultiModalDataItems, MultiModalDataParser
-from .profiling import BaseProfilingInfo
+
+if TYPE_CHECKING:
+    from .profiling import BaseDummyInputsBuilder
 
 logger = init_logger(__name__)
 
@@ -46,8 +49,8 @@ class PromptReplacement:
     if it does not depend on the input.
     """
 
-    def bind(self, tokenizer: AnyTokenizer) -> "_BoundPromptReplacement":
-        return _BoundPromptReplacement(
+    def bind(self, tokenizer: AnyTokenizer) -> "BoundPromptReplacement":
+        return BoundPromptReplacement(
             tokenizer=tokenizer,
             modality=self.modality,
             _target=self.target,
@@ -128,7 +131,7 @@ def token_ids(self) -> list[int]:
 
 
 @dataclass
-class _BoundPromptReplacement:
+class BoundPromptReplacement:
     tokenizer: AnyTokenizer = field(repr=False)
     modality: str
 
@@ -207,7 +210,7 @@ def iter_token_matches(
 
 @dataclass(repr=False)
 class _PromptReplacementMatch(ABC):
-    prompt_repl: _BoundPromptReplacement
+    prompt_repl: BoundPromptReplacement
 
     @property
     def modality(self) -> str:
@@ -255,7 +258,7 @@ def end_idx(self) -> int:
 
 
 @dataclass
-class _PlaceholderInfo:
+class PlaceholderInfo:
     modality: str
     item_idx: int
     start_idx: int
@@ -274,7 +277,7 @@ def to_range(self) -> PlaceholderRange:
 
 def find_token_matches(
     prompt: list[int],
-    prompt_repls: Sequence[_BoundPromptReplacement],
+    prompt_repls: Sequence[BoundPromptReplacement],
 ) -> list[_PromptReplacementTokenMatch]:
     """Return each target of :code:`prompt_repls` found in :code:`prompt`."""
     return [
@@ -286,7 +289,7 @@ def find_token_matches(
 
 def find_text_matches(
     prompt: str,
-    prompt_repls: Sequence[_BoundPromptReplacement],
+    prompt_repls: Sequence[BoundPromptReplacement],
 ) -> list[_PromptReplacementTextMatch]:
     """Return each target of :code:`prompt_repls` found in :code:`prompt`."""
     return [
@@ -390,9 +393,9 @@ def replace_text_matches(
 def _iter_modality_placeholders(
     prompt: list[int],
     modality: str,
-    modality_repls: Sequence[_BoundPromptReplacement],
+    modality_repls: Sequence[BoundPromptReplacement],
     modal_item_count: int,
-) -> Iterable[_PlaceholderInfo]:
+) -> Iterable[PlaceholderInfo]:
     if modal_item_count == 0:
         return
 
@@ -413,7 +416,7 @@ def _iter_modality_placeholders(
                 continue
 
             if prompt[start_idx:end_idx] == repl_tokens:
-                yield _PlaceholderInfo(
+                yield PlaceholderInfo(
                     modality=modality,
                     item_idx=item_idx,
                     start_idx=start_idx,
@@ -434,10 +437,10 @@ def _iter_modality_placeholders(
 
 
 def _iter_placeholders(
-    mm_prompt_repls: Mapping[str, Sequence[_BoundPromptReplacement]],
+    mm_prompt_repls: Mapping[str, Sequence[BoundPromptReplacement]],
     prompt: list[int],
     mm_item_counts: Mapping[str, int],
-) -> Iterable[_PlaceholderInfo]:
+) -> Iterable[PlaceholderInfo]:
     """
     For each modality, yield each set of placeholder tokens found in
     :code:`prompt`.
@@ -455,10 +458,10 @@ def _iter_placeholders(
 
 
 def find_mm_placeholders(
-    mm_prompt_repls: Mapping[str, Sequence[_BoundPromptReplacement]],
+    mm_prompt_repls: Mapping[str, Sequence[BoundPromptReplacement]],
     prompt: list[int],
     mm_item_counts: Mapping[str, int],
-) -> Mapping[str, list[_PlaceholderInfo]]:
+) -> Mapping[str, list[PlaceholderInfo]]:
     it = _iter_placeholders(mm_prompt_repls, prompt, mm_item_counts)
     return dict(full_groupby_modality(it))
 
@@ -524,29 +527,59 @@ def put(
         self._cache.put(cache_key, output_kwargs)
 
 
-class ProcessingMixin:
-    """
-    Contains helper functions to perform processing.
+class BaseProcessingInfo:
+    """Base class containing information to perform processing."""
 
-    Not to be confused with :class:`transformers.ProcessorMixin`.
-    """
-    ctx: InputProcessingContext
+    def __init__(self, ctx: InputProcessingContext) -> None:
+        super().__init__()
 
-    def _get_tokenizer(self) -> AnyTokenizer:
+        self.ctx = ctx
+
+    @property
+    def model_id(self) -> str:
+        return self.ctx.model_config.model
+
+    def get_tokenizer(self) -> AnyTokenizer:
         return self.ctx.tokenizer
 
-    def _get_hf_config(self) -> PretrainedConfig:
+    def get_hf_config(self) -> PretrainedConfig:
         return self.ctx.get_hf_config()
 
-    def _get_hf_processor(self, **kwargs: object) -> ProcessorMixin:
+    def get_hf_processor(self, **kwargs: object) -> ProcessorMixin:
         """
         Subclasses can override this method to handle
         specific kwargs from model config or user inputs.
         """
         return self.ctx.get_hf_processor(**kwargs)
 
+    @abstractmethod
+    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+        """
+        Return the maximum supported number of items for each modality.
+
+        A value of `None` means unlimited number of items.
+
+        Omitting a modality from the returned dictionary means that
+        it is not supported at all.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
+        """
+        Get the maximum possible number of tokens per data item
+        for each modality.
+
+        The dictionary returned by this method should have the same
+        keys as that returned by :meth:`get_supported_mm_limits`.
+        """
+        raise NotImplementedError
+
+
+_I = TypeVar("_I", bound=BaseProcessingInfo)
 
-class BaseMultiModalProcessor(ProcessingMixin, ABC):
+
+class BaseMultiModalProcessor(ABC, Generic[_I]):
     """
     Abstract base class to process multi-modal inputs to be used in vLLM.
 
@@ -554,18 +587,19 @@ class BaseMultiModalProcessor(ProcessingMixin, ABC):
     """
 
     def __init__(self,
-                 ctx: InputProcessingContext,
+                 info: _I,
+                 dummy_inputs: "BaseDummyInputsBuilder[_I]",
                  *,
                  cache: Optional[ProcessingCache] = None,
                  enable_sanity_checks: bool = True) -> None:
         super().__init__()
 
-        self.ctx = ctx
+        self.info = info
+        self.dummy_inputs = dummy_inputs
         self.cache = cache
         self.enable_sanity_checks = enable_sanity_checks
 
         self.data_parser = self._get_data_parser()
-        self.profiling_info = self._get_profiling_info()
 
     def __call__(
         self,
@@ -585,13 +619,6 @@ def _get_data_parser(self) -> MultiModalDataParser:
         """
         return MultiModalDataParser()
 
-    def _get_profiling_info(self) -> BaseProfilingInfo:
-        """
-        Get the profiling information to find the worst-case memory usage of
-        the model.
-        """
-        raise NotImplementedError
-
     def _to_mm_items(
         self,
         mm_data: MultiModalDataDict,
@@ -602,7 +629,7 @@ def _to_mm_items(
         """
         mm_items = self.data_parser.parse_mm_data(mm_data)
 
-        mm_limits = self.ctx.get_mm_config().limit_per_prompt
+        mm_limits = self.info.ctx.get_mm_config().limit_per_prompt
         for modality, items in mm_items.items():
             limit = mm_limits.get(modality, 1)
             if len(items) > limit:
@@ -646,19 +673,19 @@ def _get_prompt_replacements(
 
     def _find_mm_placeholders(
         self,
-        mm_prompt_repls: Mapping[str, Sequence[_BoundPromptReplacement]],
+        mm_prompt_repls: Mapping[str, Sequence[BoundPromptReplacement]],
         new_token_ids: list[int],
         mm_item_counts: Mapping[str, int],
-    ) -> Mapping[str, list[_PlaceholderInfo]]:
+    ) -> Mapping[str, list[PlaceholderInfo]]:
         return find_mm_placeholders(mm_prompt_repls, new_token_ids,
                                     mm_item_counts)
 
     def _get_hf_mm_data(
         self,
         mm_items: MultiModalDataItems,
-    ) -> tuple[dict[str, Any], dict[str, Any]]:
-        processor_data = dict[str, Any]()
-        passthrough_data = dict[str, Any]()
+    ) -> tuple[Mapping[str, object], Mapping[str, object]]:
+        processor_data = dict[str, object]()
+        passthrough_data = dict[str, object]()
 
         for items in mm_items.values():
             processor_data.update(items.get_processor_data())
@@ -678,8 +705,8 @@ def _call_hf_processor(
         Call the HF processor on the prompt text and
         associated multi-modal data.
         """
-        return self.ctx.call_hf_processor(
-            self._get_hf_processor(**mm_kwargs),
+        return self.info.ctx.call_hf_processor(
+            self.info.get_hf_processor(**mm_kwargs),
             dict(text=prompt, **mm_data),
             mm_kwargs,
         )
@@ -738,8 +765,8 @@ def _apply_hf_processor_missing(
 
         # Some HF processors (e.g. Qwen2-VL) expect corresponding
         # multi-modal tokens to be in the prompt text
-        dummy_inputs = self.profiling_info.get_dummy_processor_inputs(
-            self.ctx.model_config.max_model_len,
+        dummy_inputs = self.dummy_inputs.get_dummy_processor_inputs(
+            self.info.ctx.model_config.max_model_len,
             mm_missing_counts,
         )
 
@@ -762,7 +789,7 @@ def _cached_apply_hf_processor(
         caching the results and reusing cached results.
         """
         cache = self.cache
-        model_id = self.ctx.model_config.model
+        model_id = self.info.model_id
 
         _, passthrough_data = self._get_hf_mm_data(mm_data_items)
         if cache is None or passthrough_data:
@@ -838,8 +865,8 @@ def _cached_apply_hf_processor(
     def _bind_and_group_repls(
         self,
         prompt_repls: list[PromptReplacement],
-    ) -> dict[str, list[_BoundPromptReplacement]]:
-        tokenizer = self._get_tokenizer()
+    ) -> dict[str, list[BoundPromptReplacement]]:
+        tokenizer = self.info.get_tokenizer()
 
         it = (prompt_repl.bind(tokenizer) for prompt_repl in prompt_repls)
         return dict(full_groupby_modality(it))
@@ -859,10 +886,10 @@ def _always_apply_prompt_replacements(self) -> bool:
     def _apply_prompt_replacements(
         self,
         token_ids: list[int],
-        mm_prompt_repls: Mapping[str, Sequence[_BoundPromptReplacement]],
+        mm_prompt_repls: Mapping[str, Sequence[BoundPromptReplacement]],
         mm_item_counts: Mapping[str, int],
-    ) -> tuple[list[int], str, Mapping[str, list[_PlaceholderInfo]]]:
-        tokenizer = self._get_tokenizer()
+    ) -> tuple[list[int], str, Mapping[str, list[PlaceholderInfo]]]:
+        tokenizer = self.info.get_tokenizer()
 
         mm_token_matches = {
             modality: find_token_matches(token_ids, prompt_repls)
@@ -950,7 +977,7 @@ def _validate_mm_kwargs(
 
     def _validate_mm_placeholders(
         self,
-        mm_placeholders: Mapping[str, list[_PlaceholderInfo]],
+        mm_placeholders: Mapping[str, list[PlaceholderInfo]],
         mm_item_counts: Mapping[str, int],
         *,
         allow_missing: bool = False,
@@ -1001,7 +1028,7 @@ def apply(
         # instead of rehashing.
 
         if envs.VLLM_USE_V1:
-            model_id = self.ctx.model_config.model
+            model_id = self.info.model_id
             mm_hashes = {
                 modality: [
                     MultiModalHasher.hash_kwargs(model_id=model_id,
@@ -1046,7 +1073,7 @@ def apply(
                 allow_missing=True,
             )
 
-            mm_missing_repls = dict[str, list[_BoundPromptReplacement]]()
+            mm_missing_repls = dict[str, list[BoundPromptReplacement]]()
             for modality, missing_repl_count in mm_missing_repl_counts.items():
                 if missing_repl_count == 0:
                     mm_missing_repls[modality] = []
@@ -1059,7 +1086,7 @@ def apply(
         # If HF processor already inserts placeholder tokens,
         # there is no need for us to insert them
         if all(len(repls) == 0 for repls in mm_missing_repls.items()):
-            tokenizer = self._get_tokenizer()
+            tokenizer = self.info.get_tokenizer()
             prompt_text = decode_tokens(tokenizer, prompt_ids)
             mm_placeholders = hf_mm_placeholders
         else:
@@ -1090,79 +1117,3 @@ def apply(
             mm_hashes=mm_hashes,
             mm_placeholders=mm_placeholder_ranges,
         )
-
-    def _get_dummy_mm_inputs(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-    ) -> MultiModalInputsV2:
-        profiling = self.profiling_info
-        processor_inputs = profiling.get_dummy_processor_inputs(
-            seq_len, mm_counts)
-
-        return self.apply(
-            prompt_text=processor_inputs.prompt_text,
-            mm_data=processor_inputs.mm_data,
-            hf_processor_mm_kwargs=processor_inputs.hf_processor_mm_kwargs,
-        )
-
-    def get_dummy_data(self, seq_len: int) -> DummyData:
-        # Avoid circular import
-        from vllm.sequence import SequenceData
-
-        profiling = self.profiling_info
-        mm_counts = profiling.get_mm_limits()
-        mm_max_tokens_per_item = profiling.get_mm_max_tokens_per_item(seq_len)
-        if mm_counts.keys() != mm_max_tokens_per_item.keys():
-            raise AssertionError(
-                "The keys returned by `get_supported_mm_limits`"
-                f"({set(mm_counts.keys())}) should be the same as those "
-                "returned by `get_mm_max_tokens_per_item` "
-                f"({set(mm_max_tokens_per_item.keys())})")
-
-        mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts)
-        prompt_token_ids = mm_inputs["prompt_token_ids"]
-        placeholders_by_modality = mm_inputs["mm_placeholders"]
-
-        total_placeholders_by_modality = {
-            modality: sum(item["length"] for item in placeholders)
-            for modality, placeholders in placeholders_by_modality.items()
-        }
-        expected_placeholders_by_modality = {
-            modality: mm_max_tokens_per_item[modality] * mm_counts[modality]
-            for modality in placeholders_by_modality
-        }
-        if total_placeholders_by_modality != expected_placeholders_by_modality:
-            raise AssertionError(
-                f"The processed dummy data has a total of "
-                f"{total_placeholders_by_modality} placeholder tokens, which "
-                f"is not the expected {expected_placeholders_by_modality} "
-                "tokens.")
-
-        total_len = len(prompt_token_ids)
-
-        # V0 does not support chunked prefill.
-        if total_len > seq_len and not envs.VLLM_USE_V1:
-            logger.warning(
-                "The context length (%d) of the model is too short "
-                "to hold the multi-modal embeddings in the worst case "
-                "(%d tokens in total, out of which %s are reserved for "
-                "multi-modal embeddings). This may cause certain multi-modal "
-                "inputs to fail during inference, even when the input text is "
-                "short. To avoid this, you should increase `max_model_len`, "
-                "reduce `max_num_seqs`, and/or reduce `mm_counts`.", seq_len,
-                total_len, total_placeholders_by_modality)
-
-            return DummyData(
-                seq_data=SequenceData.from_prompt_token_counts((0, seq_len)),
-                multi_modal_data=None,
-                multi_modal_placeholders=None,
-            )
-
-        prompt_token_ids.extend([0] * (seq_len - len(prompt_token_ids)))
-
-        return DummyData(
-            seq_data=SequenceData.from_seqs(prompt_token_ids),
-            multi_modal_data=mm_inputs["mm_kwargs"],
-            multi_modal_placeholders=placeholders_by_modality,
-        )
diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py
index 2ecf0db1a485d..2ac3a6bcf3ddd 100644
--- a/vllm/multimodal/profiling.py
+++ b/vllm/multimodal/profiling.py
@@ -1,16 +1,18 @@
 from abc import ABC, abstractmethod
 from collections.abc import Mapping
 from dataclasses import dataclass, field
-from typing import Optional
+from typing import Generic, TypeVar
 
 import numpy as np
 import numpy.typing as npt
 from PIL import Image
 
-from vllm.inputs import InputProcessingContext
+import vllm.envs as envs
+from vllm.inputs import DummyData
 from vllm.logger import init_logger
 
-from .inputs import MultiModalDataDict
+from .inputs import MultiModalDataDict, MultiModalInputsV2
+from .processing import BaseMultiModalProcessor, BaseProcessingInfo
 
 logger = init_logger(__name__)
 
@@ -23,39 +25,19 @@ class ProcessorInputs:
     hf_processor_mm_kwargs: Mapping[str, object] = field(default_factory=dict)
 
 
-class BaseProfilingInfo(ABC):
+_I = TypeVar("_I", bound=BaseProcessingInfo)
+
+
+class BaseDummyInputsBuilder(ABC, Generic[_I]):
     """
-    Abstract base class that provides the information necessary to profile
+    Abstract base class that constructs the dummy data to profile
     multi-modal models.
     """
 
-    def __init__(self, ctx: InputProcessingContext) -> None:
+    def __init__(self, info: _I) -> None:
         super().__init__()
 
-        self.ctx = ctx
-
-    @abstractmethod
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
-        """
-        Return the maximum supported number of items for each modality.
-
-        A value of `None` means unlimited number of items.
-
-        Omitting a modality from the returned dictionary means that
-        it is not supported at all.
-        """
-        raise NotImplementedError
-
-    @abstractmethod
-    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
-        """
-        Get the maximum possible number of tokens per data item
-        for each modality.
-
-        The dictionary returned by this method should have the same
-        keys as that returned by :meth:`get_supported_mm_limits`.
-        """
-        raise NotImplementedError
+        self.info = info
 
     @abstractmethod
     def get_dummy_processor_inputs(
@@ -64,8 +46,8 @@ def get_dummy_processor_inputs(
         mm_counts: Mapping[str, int],
     ) -> ProcessorInputs:
         """
-        Build the multi-modal portion of the input which, after processing,
-        results in `mm_max_tokens` in :meth:`get_mm_max_tokens_per_item`.
+        Build the input which, after processing, results in
+        `self.info.get_mm_max_tokens_per_item()` placeholder tokens.
         """
         raise NotImplementedError
 
@@ -99,11 +81,33 @@ def _get_dummy_videos(
         video = np.zeros((num_frames, width, height, 3))
         return [video] * num_videos
 
-    def get_mm_limits(self) -> Mapping[str, int]:
-        mm_config = self.ctx.get_mm_config()
+
+class MultiModalProfiler(Generic[_I]):
+    """
+    Contains code for running memory profiling for multi-modal models.
+    """
+
+    def __init__(
+        self,
+        processor: BaseMultiModalProcessor[_I],
+    ) -> None:
+        super().__init__()
+
+        self.processor = processor
+
+    @property
+    def processing_info(self) -> BaseProcessingInfo:
+        return self.processor.info
+
+    @property
+    def dummy_inputs(self) -> BaseDummyInputsBuilder[_I]:
+        return self.processor.dummy_inputs
+
+    def _get_mm_limits(self) -> Mapping[str, int]:
+        mm_config = self.processing_info.ctx.get_mm_config()
         mm_limit_per_prompt = mm_config.limit_per_prompt
 
-        supported_mm_limits = self.get_supported_mm_limits()
+        supported_mm_limits = self.processing_info.get_supported_mm_limits()
 
         mm_limits = {
             modality: mm_limit_per_prompt.get(modality, 1)
@@ -119,3 +123,81 @@ def get_mm_limits(self) -> Mapping[str, int]:
                     f"at most {supported_limit} {modality} items.")
 
         return mm_limits
+
+    def _get_dummy_mm_inputs(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> MultiModalInputsV2:
+        factory = self.dummy_inputs
+        processor_inputs = factory.get_dummy_processor_inputs(
+            seq_len, mm_counts)
+
+        return self.processor.apply(
+            prompt_text=processor_inputs.prompt_text,
+            mm_data=processor_inputs.mm_data,
+            hf_processor_mm_kwargs=processor_inputs.hf_processor_mm_kwargs,
+        )
+
+    def get_dummy_data(self, seq_len: int) -> DummyData:
+        # Avoid circular import
+        from vllm.sequence import SequenceData
+
+        mm_counts = self._get_mm_limits()
+
+        info = self.processing_info
+        mm_max_tokens_per_item = info.get_mm_max_tokens_per_item(seq_len)
+
+        if mm_counts.keys() != mm_max_tokens_per_item.keys():
+            raise AssertionError(
+                "The keys returned by `get_supported_mm_limits`"
+                f"({set(mm_counts.keys())}) should be the same as those "
+                "returned by `get_mm_max_tokens_per_item` "
+                f"({set(mm_max_tokens_per_item.keys())})")
+
+        mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts)
+        prompt_token_ids = mm_inputs["prompt_token_ids"]
+        placeholders_by_modality = mm_inputs["mm_placeholders"]
+
+        total_placeholders_by_modality = {
+            modality: sum(item["length"] for item in placeholders)
+            for modality, placeholders in placeholders_by_modality.items()
+        }
+        expected_placeholders_by_modality = {
+            modality: mm_max_tokens_per_item[modality] * mm_counts[modality]
+            for modality in placeholders_by_modality
+        }
+        if total_placeholders_by_modality != expected_placeholders_by_modality:
+            raise AssertionError(
+                f"The processed dummy data has a total of "
+                f"{total_placeholders_by_modality} placeholder tokens, which "
+                f"is not the expected {expected_placeholders_by_modality} "
+                "tokens.")
+
+        total_len = len(prompt_token_ids)
+
+        # V0 does not support chunked prefill.
+        if total_len > seq_len and not envs.VLLM_USE_V1:
+            logger.warning(
+                "The context length (%d) of the model is too short "
+                "to hold the multi-modal embeddings in the worst case "
+                "(%d tokens in total, out of which %s are reserved for "
+                "multi-modal embeddings). This may cause certain multi-modal "
+                "inputs to fail during inference, even when the input text is "
+                "short. To avoid this, you should increase `max_model_len`, "
+                "reduce `max_num_seqs`, and/or reduce `mm_counts`.", seq_len,
+                total_len, total_placeholders_by_modality)
+
+            return DummyData(
+                seq_data=SequenceData.from_prompt_token_counts((0, seq_len)),
+                multi_modal_data=None,
+                multi_modal_placeholders=None,
+            )
+
+        prompt_token_ids.extend([0] * (seq_len - len(prompt_token_ids)))
+
+        return DummyData(
+            seq_data=SequenceData.from_seqs(prompt_token_ids),
+            multi_modal_data=mm_inputs["mm_kwargs"],
+            multi_modal_placeholders=placeholders_by_modality,
+        )
diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index f75a594a4c4e0..5f01eac4edade 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -1,7 +1,8 @@
 import functools
 from collections import UserDict
-from typing import (TYPE_CHECKING, Any, Dict, Mapping, Optional, Protocol,
-                    Sequence, Type, TypeVar)
+from dataclasses import dataclass
+from typing import (TYPE_CHECKING, Any, Dict, Generic, Mapping, Optional,
+                    Protocol, Sequence, Type, TypeVar)
 
 import torch.nn as nn
 
@@ -14,7 +15,9 @@
 from .base import MultiModalInputMapper, MultiModalPlugin, MultiModalTokensCalc
 from .image import ImagePlugin
 from .inputs import MultiModalDataDict, MultiModalKwargs, NestedTensors
-from .processing import BaseMultiModalProcessor, ProcessingCache
+from .processing import (BaseMultiModalProcessor, BaseProcessingInfo,
+                         ProcessingCache)
+from .profiling import BaseDummyInputsBuilder
 from .utils import cached_get_tokenizer
 from .video import VideoPlugin
 
@@ -27,20 +30,59 @@
 MM_CACHE_SIZE = 256
 
 N = TypeVar("N", bound=Type[nn.Module])
+_I = TypeVar("_I", bound=BaseProcessingInfo)
+_I_co = TypeVar("_I_co", bound=BaseProcessingInfo, covariant=True)
 
 
-class MultiModalProcessorFactory(Protocol):
+class ProcessingInfoFactory(Protocol[_I_co]):
     """Constructs a :class:`MultiModalProcessor` instance from the context."""
 
     def __call__(
         self,
         ctx: InputProcessingContext,
+    ) -> _I_co:
+        ...
+
+
+class DummyInputsBuilderFactory(Protocol[_I]):
+    """
+    Constructs a :class:`BaseDummyInputsBuilder` instance from the context.
+    """
+
+    def __call__(self, info: _I) -> BaseDummyInputsBuilder[_I]:
+        ...
+
+
+class MultiModalProcessorFactory(Protocol[_I]):
+    """Constructs a :class:`MultiModalProcessor` instance from the context."""
+
+    def __call__(
+        self,
+        info: _I,
+        dummy_inputs: BaseDummyInputsBuilder[_I],
         *,
         cache: Optional[ProcessingCache] = None,
-    ) -> BaseMultiModalProcessor:
+    ) -> BaseMultiModalProcessor[_I]:
         ...
 
 
+@dataclass(frozen=True)
+class _ProcessorFactories(Generic[_I]):
+    info: ProcessingInfoFactory[_I]
+    processor: MultiModalProcessorFactory[_I]
+    dummy_inputs: DummyInputsBuilderFactory[_I]
+
+    def build_processor(
+        self,
+        ctx: InputProcessingContext,
+        *,
+        cache: Optional[ProcessingCache] = None,
+    ):
+        info = self.info(ctx)
+        dummy_inputs_builder = self.dummy_inputs(info)
+        return self.processor(info, dummy_inputs_builder, cache=cache)
+
+
 class _MultiModalLimits(UserDict["ModelConfig", Dict[str, int]]):
     """
     Wraps `_limits_by_model` for a more informative error message
@@ -71,7 +113,7 @@ def __init__(
         self._plugins = {p.get_data_key(): p for p in plugins}
 
         self._processor_factories = ClassRegistry[nn.Module,
-                                                  MultiModalProcessorFactory]()
+                                                  _ProcessorFactories]()
 
         # This is used for non-multimodal models
         self._disabled_limits_per_plugin = {k: 0 for k in self._plugins}
@@ -224,7 +266,7 @@ def get_max_tokens_per_item_by_modality(
             tokenizer = cached_get_tokenizer(model_config.tokenizer)
             processor = self.create_processor(model_config, tokenizer)
             seq_len = model_config.max_model_len
-            return processor.profiling_info.get_mm_max_tokens_per_item(seq_len)
+            return processor.info.get_mm_max_tokens_per_item(seq_len)
 
         return {
             key: plugin.get_max_multimodal_tokens(model_config)
@@ -315,7 +357,10 @@ def get_mm_limits_per_prompt(
 
     def register_processor(
         self,
-        factory: MultiModalProcessorFactory,
+        processor: MultiModalProcessorFactory[_I],
+        *,
+        info: ProcessingInfoFactory[_I],
+        dummy_inputs: DummyInputsBuilderFactory[_I],
     ):
         """
         Register a multi-modal processor to a model class. The processor
@@ -336,7 +381,11 @@ def wrapper(model_cls: N) -> N:
                     "registered to %s. It is overwritten by the new one.",
                     model_cls, self)
 
-            self._processor_factories[model_cls] = factory
+            self._processor_factories[model_cls] = _ProcessorFactories(
+                info=info,
+                dummy_inputs=dummy_inputs,
+                processor=processor,
+            )
 
             return model_cls
 
@@ -359,15 +408,15 @@ def create_processor(
         self,
         model_config: "ModelConfig",
         tokenizer: AnyTokenizer,
-    ) -> BaseMultiModalProcessor:
+    ) -> BaseMultiModalProcessor[BaseProcessingInfo]:
         """
         Create a multi-modal processor for a specific model and tokenizer.
         """
         model_cls = self._get_model_cls(model_config)
-        processor_factory = self._processor_factories[model_cls]
+        factories = self._processor_factories[model_cls]
 
         ctx = InputProcessingContext(model_config, tokenizer)
         cache = (None if model_config.disable_mm_preprocessor_cache else
                  self._processing_cache)
 
-        return processor_factory(ctx, cache=cache)
+        return factories.build_processor(ctx, cache=cache)

From aba8d6ee006b78149ac4514f460e4038b2d4f607 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Wed, 8 Jan 2025 13:09:53 +0000
Subject: [PATCH 131/462] [Doc] Move examples into categories (#11840)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 .buildkite/run-cpu-test.sh                    |  2 +-
 .buildkite/run-gh200-test.sh                  |  2 +-
 .buildkite/run-hpu-test.sh                    |  2 +-
 .buildkite/run-neuron-test.sh                 |  2 +-
 .buildkite/run-openvino-test.sh               |  2 +-
 .buildkite/run-tpu-test.sh                    |  2 +-
 .buildkite/run-xpu-test.sh                    |  4 +-
 .buildkite/test-pipeline.yaml                 | 26 +++++------
 .github/workflows/lint-and-deploy.yaml        |  4 +-
 Dockerfile                                    |  2 +-
 .../contributing/profiling/profiling_index.md |  2 +-
 docs/source/deployment/frameworks/skypilot.md |  4 +-
 docs/source/features/disagg_prefill.md        |  2 +-
 docs/source/features/lora.md                  |  2 +-
 docs/source/features/quantization/auto_awq.md |  2 +-
 .../features/quantization/fp8_e4m3_kvcache.md |  2 +-
 docs/source/features/structured_outputs.md    |  4 +-
 docs/source/generate_examples.py              | 45 ++++++++++---------
 .../getting_started/installation/cpu-x86.md   |  4 +-
 .../getting_started/installation/xpu.md       |  2 +-
 docs/source/getting_started/quickstart.md     |  4 +-
 .../source/getting_started/troubleshooting.md |  2 +-
 docs/source/models/extensions/tensorizer.md   |  2 +-
 docs/source/models/generative_models.md       |  4 +-
 docs/source/models/pooling_models.md          |  6 +--
 docs/source/serving/distributed_serving.md    |  2 +-
 docs/source/serving/multimodal_inputs.md      | 16 +++----
 .../serving/openai_compatible_server.md       | 10 ++---
 .../{ => offline_inference}/aqlm_example.py   |  0
 .../{ => offline_inference}/cpu_offload.py    |  0
 .../florence2_inference.py                    |  3 +-
 .../{ => offline_inference}/gguf_inference.py |  0
 .../llm_engine_example.py                     |  0
 .../lora_with_quantization_inference.py       |  0
 .../multilora_inference.py                    |  0
 .../offline_chat_with_tools.py                |  0
 .../offline_inference.py                      |  0
 .../offline_inference_arctic.py               |  0
 .../offline_inference_audio_language.py       |  0
 .../offline_inference_chat.py                 |  0
 .../offline_inference_classification.py       |  0
 .../offline_inference_cli.py                  |  0
 .../offline_inference_distributed.py          |  0
 .../offline_inference_embedding.py            |  0
 .../offline_inference_encoder_decoder.py      |  0
 .../offline_inference_mlpspeculator.py        |  0
 .../offline_inference_neuron.py               |  0
 ...line_inference_neuron_int8_quantization.py |  0
 .../offline_inference_openai.md               | 18 ++++----
 .../openai_example_batch.jsonl                |  0
 .../offline_inference_pixtral.py              |  0
 .../offline_inference_scoring.py              |  0
 .../offline_inference_structured_outputs.py   |  0
 .../offline_inference_tpu.py                  |  0
 .../offline_inference_vision_language.py      |  0
 ...ine_inference_vision_language_embedding.py |  0
 ...e_inference_vision_language_multi_image.py |  0
 .../offline_inference_whisper.py              |  0
 ...nference_with_default_generation_config.py |  0
 .../offline_inference_with_prefix.py          |  0
 .../offline_inference_with_profiler.py        |  0
 .../offline_profile.py                        |  2 +-
 .../save_sharded_state.py                     |  0
 examples/{ => online_serving}/api_client.py   |  0
 .../chart-helm/.helmignore                    |  0
 .../chart-helm/Chart.yaml                     |  0
 examples/online_serving/chart-helm/README.md  | 21 +++++++++
 .../{ => online_serving}/chart-helm/ct.yaml   |  0
 .../chart-helm/lintconf.yaml                  |  0
 .../chart-helm/templates/_helpers.tpl         |  0
 .../chart-helm/templates/configmap.yaml       |  0
 .../chart-helm/templates/custom-objects.yaml  |  0
 .../chart-helm/templates/deployment.yaml      |  0
 .../chart-helm/templates/hpa.yaml             |  0
 .../chart-helm/templates/job.yaml             |  0
 .../templates/poddisruptionbudget.yaml        |  0
 .../chart-helm/templates/pvc.yaml             |  0
 .../chart-helm/templates/secrets.yaml         |  0
 .../chart-helm/templates/service.yaml         |  0
 .../chart-helm/values.schema.json             |  0
 .../chart-helm/values.yaml                    |  0
 .../disaggregated_prefill.sh                  |  0
 .../gradio_openai_chatbot_webserver.py        |  0
 .../{ => online_serving}/gradio_webserver.py  |  0
 .../openai_chat_completion_client.py          |  0
 ...i_chat_completion_client_for_multimodal.py |  0
 ...penai_chat_completion_client_with_tools.py |  0
 ...enai_chat_completion_structured_outputs.py |  0
 ...ai_chat_embedding_client_for_multimodal.py |  0
 .../openai_completion_client.py               |  0
 .../openai_cross_encoder_score.py             |  0
 .../openai_embedding_client.py                |  0
 .../openai_pooling_client.py                  |  0
 .../opentelemetry/Otel.md                     |  0
 .../opentelemetry/dummy_client.py             |  0
 .../prometheus_grafana/README.md              |  0
 .../prometheus_grafana/docker-compose.yaml    |  0
 .../prometheus_grafana/grafana.json           |  0
 .../prometheus_grafana/prometheus.yaml        |  0
 examples/{ => online_serving}/run_cluster.sh  |  0
 .../sagemaker-entrypoint.sh                   |  0
 examples/{ => other}/fp8/README.md            | 10 ++---
 examples/{ => other}/fp8/extract_scales.py    |  0
 examples/{ => other}/fp8/quantizer/README.md  |  0
 .../{ => other}/fp8/quantizer/quantize.py     |  0
 examples/{ => other}/logging_configuration.md |  0
 examples/{ => other}/tensorize_vllm_model.py  | 10 ++---
 pyproject.toml                                |  2 +-
 tests/plugins_tests/test_platform_plugins.py  |  2 +-
 tests/tensorizer_loader/test_tensorizer.py    |  4 +-
 tools/profiler/print_layerwise_table.py       |  2 +-
 tools/profiler/visualize_layerwise_profile.py | 10 ++---
 vllm/distributed/kv_transfer/README.md        |  2 +-
 vllm/model_executor/model_loader/loader.py    | 11 ++---
 .../model_executor/model_loader/tensorizer.py | 14 +++---
 .../model_loader/weight_utils.py              |  3 +-
 116 files changed, 153 insertions(+), 124 deletions(-)
 rename examples/{ => offline_inference}/aqlm_example.py (100%)
 rename examples/{ => offline_inference}/cpu_offload.py (100%)
 rename examples/{ => offline_inference}/florence2_inference.py (92%)
 rename examples/{ => offline_inference}/gguf_inference.py (100%)
 rename examples/{ => offline_inference}/llm_engine_example.py (100%)
 rename examples/{ => offline_inference}/lora_with_quantization_inference.py (100%)
 rename examples/{ => offline_inference}/multilora_inference.py (100%)
 rename examples/{ => offline_inference}/offline_chat_with_tools.py (100%)
 rename examples/{ => offline_inference}/offline_inference.py (100%)
 rename examples/{ => offline_inference}/offline_inference_arctic.py (100%)
 rename examples/{ => offline_inference}/offline_inference_audio_language.py (100%)
 rename examples/{ => offline_inference}/offline_inference_chat.py (100%)
 rename examples/{ => offline_inference}/offline_inference_classification.py (100%)
 rename examples/{ => offline_inference}/offline_inference_cli.py (100%)
 rename examples/{ => offline_inference}/offline_inference_distributed.py (100%)
 rename examples/{ => offline_inference}/offline_inference_embedding.py (100%)
 rename examples/{ => offline_inference}/offline_inference_encoder_decoder.py (100%)
 rename examples/{ => offline_inference}/offline_inference_mlpspeculator.py (100%)
 rename examples/{ => offline_inference}/offline_inference_neuron.py (100%)
 rename examples/{ => offline_inference}/offline_inference_neuron_int8_quantization.py (100%)
 rename examples/{ => offline_inference/offline_inference_openai}/offline_inference_openai.md (90%)
 rename examples/{ => offline_inference/offline_inference_openai}/openai_example_batch.jsonl (100%)
 rename examples/{ => offline_inference}/offline_inference_pixtral.py (100%)
 rename examples/{ => offline_inference}/offline_inference_scoring.py (100%)
 rename examples/{ => offline_inference}/offline_inference_structured_outputs.py (100%)
 rename examples/{ => offline_inference}/offline_inference_tpu.py (100%)
 rename examples/{ => offline_inference}/offline_inference_vision_language.py (100%)
 rename examples/{ => offline_inference}/offline_inference_vision_language_embedding.py (100%)
 rename examples/{ => offline_inference}/offline_inference_vision_language_multi_image.py (100%)
 rename examples/{ => offline_inference}/offline_inference_whisper.py (100%)
 rename examples/{ => offline_inference}/offline_inference_with_default_generation_config.py (100%)
 rename examples/{ => offline_inference}/offline_inference_with_prefix.py (100%)
 rename examples/{ => offline_inference}/offline_inference_with_profiler.py (100%)
 rename examples/{ => offline_inference}/offline_profile.py (99%)
 rename examples/{ => offline_inference}/save_sharded_state.py (100%)
 rename examples/{ => online_serving}/api_client.py (100%)
 rename examples/{ => online_serving}/chart-helm/.helmignore (100%)
 rename examples/{ => online_serving}/chart-helm/Chart.yaml (100%)
 create mode 100644 examples/online_serving/chart-helm/README.md
 rename examples/{ => online_serving}/chart-helm/ct.yaml (100%)
 rename examples/{ => online_serving}/chart-helm/lintconf.yaml (100%)
 rename examples/{ => online_serving}/chart-helm/templates/_helpers.tpl (100%)
 rename examples/{ => online_serving}/chart-helm/templates/configmap.yaml (100%)
 rename examples/{ => online_serving}/chart-helm/templates/custom-objects.yaml (100%)
 rename examples/{ => online_serving}/chart-helm/templates/deployment.yaml (100%)
 rename examples/{ => online_serving}/chart-helm/templates/hpa.yaml (100%)
 rename examples/{ => online_serving}/chart-helm/templates/job.yaml (100%)
 rename examples/{ => online_serving}/chart-helm/templates/poddisruptionbudget.yaml (100%)
 rename examples/{ => online_serving}/chart-helm/templates/pvc.yaml (100%)
 rename examples/{ => online_serving}/chart-helm/templates/secrets.yaml (100%)
 rename examples/{ => online_serving}/chart-helm/templates/service.yaml (100%)
 rename examples/{ => online_serving}/chart-helm/values.schema.json (100%)
 rename examples/{ => online_serving}/chart-helm/values.yaml (100%)
 rename examples/{ => online_serving}/disaggregated_prefill.sh (100%)
 rename examples/{ => online_serving}/gradio_openai_chatbot_webserver.py (100%)
 rename examples/{ => online_serving}/gradio_webserver.py (100%)
 rename examples/{ => online_serving}/openai_chat_completion_client.py (100%)
 rename examples/{ => online_serving}/openai_chat_completion_client_for_multimodal.py (100%)
 rename examples/{ => online_serving}/openai_chat_completion_client_with_tools.py (100%)
 rename examples/{ => online_serving}/openai_chat_completion_structured_outputs.py (100%)
 rename examples/{ => online_serving}/openai_chat_embedding_client_for_multimodal.py (100%)
 rename examples/{ => online_serving}/openai_completion_client.py (100%)
 rename examples/{ => online_serving}/openai_cross_encoder_score.py (100%)
 rename examples/{ => online_serving}/openai_embedding_client.py (100%)
 rename examples/{ => online_serving}/openai_pooling_client.py (100%)
 rename examples/{ => online_serving}/opentelemetry/Otel.md (100%)
 rename examples/{ => online_serving}/opentelemetry/dummy_client.py (100%)
 rename examples/{ => online_serving}/prometheus_grafana/README.md (100%)
 rename examples/{ => online_serving}/prometheus_grafana/docker-compose.yaml (100%)
 rename examples/{ => online_serving}/prometheus_grafana/grafana.json (100%)
 rename examples/{ => online_serving}/prometheus_grafana/prometheus.yaml (100%)
 rename examples/{ => online_serving}/run_cluster.sh (100%)
 rename examples/{ => online_serving}/sagemaker-entrypoint.sh (100%)
 rename examples/{ => other}/fp8/README.md (88%)
 rename examples/{ => other}/fp8/extract_scales.py (100%)
 rename examples/{ => other}/fp8/quantizer/README.md (100%)
 rename examples/{ => other}/fp8/quantizer/quantize.py (100%)
 rename examples/{ => other}/logging_configuration.md (100%)
 rename examples/{ => other}/tensorize_vllm_model.py (96%)

diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh
index a4eca078568fd..87d08c8c7fdcb 100644
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -30,7 +30,7 @@ function cpu_tests() {
   # offline inference
   docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
     set -e
-    python3 examples/offline_inference.py"
+    python3 examples/offline_inference/offline_inference.py"
 
   # Run basic model test
   docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
diff --git a/.buildkite/run-gh200-test.sh b/.buildkite/run-gh200-test.sh
index 4fc6d089cc666..1e5ff77895a38 100644
--- a/.buildkite/run-gh200-test.sh
+++ b/.buildkite/run-gh200-test.sh
@@ -24,5 +24,5 @@ remove_docker_container
 
 # Run the image and test offline inference
 docker run --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
-    python3 examples/offline_inference.py
+    python3 examples/offline_inference/offline_inference.py
 '
diff --git a/.buildkite/run-hpu-test.sh b/.buildkite/run-hpu-test.sh
index fa4f74fca7a11..a50570ab53438 100644
--- a/.buildkite/run-hpu-test.sh
+++ b/.buildkite/run-hpu-test.sh
@@ -13,4 +13,4 @@ trap remove_docker_container EXIT
 remove_docker_container
 
 # Run the image and launch offline inference
-docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference.py
\ No newline at end of file
+docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/offline_inference.py
\ No newline at end of file
diff --git a/.buildkite/run-neuron-test.sh b/.buildkite/run-neuron-test.sh
index aa29c434e7cfb..52d485939b1d0 100644
--- a/.buildkite/run-neuron-test.sh
+++ b/.buildkite/run-neuron-test.sh
@@ -51,4 +51,4 @@ docker run --rm -it --device=/dev/neuron0 --device=/dev/neuron1 --network host \
        -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
        --name "${container_name}" \
        ${image_name} \
-       /bin/bash -c "python3 /workspace/vllm/examples/offline_inference_neuron.py"
+       /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/offline_inference_neuron.py"
diff --git a/.buildkite/run-openvino-test.sh b/.buildkite/run-openvino-test.sh
index 6b12f424fd828..380f7a44a429a 100755
--- a/.buildkite/run-openvino-test.sh
+++ b/.buildkite/run-openvino-test.sh
@@ -13,4 +13,4 @@ trap remove_docker_container EXIT
 remove_docker_container
 
 # Run the image and launch offline inference
-docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference.py
+docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference/offline_inference.py
diff --git a/.buildkite/run-tpu-test.sh b/.buildkite/run-tpu-test.sh
index 770dad6ffa3a1..13605a3e97142 100644
--- a/.buildkite/run-tpu-test.sh
+++ b/.buildkite/run-tpu-test.sh
@@ -14,4 +14,4 @@ remove_docker_container
 # For HF_TOKEN.
 source /etc/environment
 # Run a simple end-to-end example.
-docker run --privileged --net host --shm-size=16G -it -e "HF_TOKEN=$HF_TOKEN" --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && python3 -m pip install lm_eval[api]==0.4.4 && pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference_tpu.py"
+docker run --privileged --net host --shm-size=16G -it -e "HF_TOKEN=$HF_TOKEN" --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && python3 -m pip install lm_eval[api]==0.4.4 && pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference/offline_inference_tpu.py"
diff --git a/.buildkite/run-xpu-test.sh b/.buildkite/run-xpu-test.sh
index e0a12afbe7320..160e10aa3bb9b 100644
--- a/.buildkite/run-xpu-test.sh
+++ b/.buildkite/run-xpu-test.sh
@@ -14,6 +14,6 @@ remove_docker_container
 
 # Run the image and test offline inference/tensor parallel
 docker run --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test sh -c '
-    python3 examples/offline_inference.py
-    python3 examples/offline_inference_cli.py -tp 2
+    python3 examples/offline_inference/offline_inference.py
+    python3 examples/offline_inference/offline_inference_cli.py -tp 2
 '
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index dcfe228ce8eae..b7178b94f481a 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -187,19 +187,19 @@ steps:
   - examples/
   commands:
     - pip install tensorizer # for tensorizer test
-    - python3 offline_inference.py
-    - python3 cpu_offload.py
-    - python3 offline_inference_chat.py
-    - python3 offline_inference_with_prefix.py
-    - python3 llm_engine_example.py
-    - python3 offline_inference_vision_language.py
-    - python3 offline_inference_vision_language_multi_image.py
-    - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
-    - python3 offline_inference_encoder_decoder.py
-    - python3 offline_inference_classification.py
-    - python3 offline_inference_embedding.py
-    - python3 offline_inference_scoring.py
-    - python3 offline_profile.py --model facebook/opt-125m run_num_steps --num-steps 2
+    - python3 offline_inference/offline_inference.py
+    - python3 offline_inference/cpu_offload.py
+    - python3 offline_inference/offline_inference_chat.py
+    - python3 offline_inference/offline_inference_with_prefix.py
+    - python3 offline_inference/llm_engine_example.py
+    - python3 offline_inference/offline_inference_vision_language.py
+    - python3 offline_inference/offline_inference_vision_language_multi_image.py
+    - python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
+    - python3 offline_inference/offline_inference_encoder_decoder.py
+    - python3 offline_inference/offline_inference_classification.py
+    - python3 offline_inference/offline_inference_embedding.py
+    - python3 offline_inference/offline_inference_scoring.py
+    - python3 offline_inference/offline_profile.py --model facebook/opt-125m run_num_steps --num-steps 2
 
 - label: Prefix Caching Test # 9min
   mirror_hardwares: [amd]
diff --git a/.github/workflows/lint-and-deploy.yaml b/.github/workflows/lint-and-deploy.yaml
index ab6f6e5d2060d..ee768db63c96c 100644
--- a/.github/workflows/lint-and-deploy.yaml
+++ b/.github/workflows/lint-and-deploy.yaml
@@ -27,7 +27,7 @@ jobs:
           version: v3.10.1
 
       - name: Run chart-testing (lint)
-        run: ct lint --target-branch ${{ github.event.repository.default_branch }} --chart-dirs examples/chart-helm --charts examples/chart-helm
+        run: ct lint --target-branch ${{ github.event.repository.default_branch }} --chart-dirs examples/online_serving/chart-helm --charts examples/online_serving/chart-helm
 
       - name: Setup minio
         run: |
@@ -64,7 +64,7 @@ jobs:
         run: |
           export AWS_ACCESS_KEY_ID=minioadmin
           export AWS_SECRET_ACCESS_KEY=minioadmin
-          helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/chart-helm -f examples/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"
+          helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"
     
       - name: curl test
         run: |
diff --git a/Dockerfile b/Dockerfile
index 088314eb38dbe..808cf675acf4d 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -250,7 +250,7 @@ ENV VLLM_USAGE_SOURCE production-docker-image
 # define sagemaker first, so it is not default from `docker build`
 FROM vllm-openai-base AS vllm-sagemaker
 
-COPY examples/sagemaker-entrypoint.sh .
+COPY examples/online_serving/sagemaker-entrypoint.sh .
 RUN chmod +x sagemaker-entrypoint.sh
 ENTRYPOINT ["./sagemaker-entrypoint.sh"]
 
diff --git a/docs/source/contributing/profiling/profiling_index.md b/docs/source/contributing/profiling/profiling_index.md
index 46210957c19ec..97de40ff469f1 100644
--- a/docs/source/contributing/profiling/profiling_index.md
+++ b/docs/source/contributing/profiling/profiling_index.md
@@ -26,7 +26,7 @@ Set the env variable VLLM_RPC_TIMEOUT to a big number before you start the serve
 
 ### Offline Inference
 
-Refer to <gh-file:examples/offline_inference_with_profiler.py> for an example.
+Refer to <gh-file:examples/offline_inference/offline_inference_with_profiler.py> for an example.
 
 ### OpenAI Server
 
diff --git a/docs/source/deployment/frameworks/skypilot.md b/docs/source/deployment/frameworks/skypilot.md
index f02a943026922..657e7f2bc72cc 100644
--- a/docs/source/deployment/frameworks/skypilot.md
+++ b/docs/source/deployment/frameworks/skypilot.md
@@ -61,7 +61,7 @@ run: |
 
   echo 'Starting gradio server...'
   git clone https://github.com/vllm-project/vllm.git || true
-  python vllm/examples/gradio_openai_chatbot_webserver.py \
+  python vllm/examples/online_serving/gradio_openai_chatbot_webserver.py \
     -m $MODEL_NAME \
     --port 8811 \
     --model-url http://localhost:8081/v1 \
@@ -321,7 +321,7 @@ run: |
 
   echo 'Starting gradio server...'
   git clone https://github.com/vllm-project/vllm.git || true
-  python vllm/examples/gradio_openai_chatbot_webserver.py \
+  python vllm/examples/online_serving/gradio_openai_chatbot_webserver.py \
     -m $MODEL_NAME \
     --port 8811 \
     --model-url http://$ENDPOINT/v1 \
diff --git a/docs/source/features/disagg_prefill.md b/docs/source/features/disagg_prefill.md
index 645dc60807dd3..efa2efc66192e 100644
--- a/docs/source/features/disagg_prefill.md
+++ b/docs/source/features/disagg_prefill.md
@@ -21,7 +21,7 @@ Disaggregated prefill DOES NOT improve throughput.
 
 ## Usage example
 
-Please refer to `examples/disaggregated_prefill.sh` for the example usage of disaggregated prefilling.
+Please refer to `examples/online_serving/disaggregated_prefill.sh` for the example usage of disaggregated prefilling.
 
 ## Benchmarks
 
diff --git a/docs/source/features/lora.md b/docs/source/features/lora.md
index cf06916d70f44..b00d05147bb32 100644
--- a/docs/source/features/lora.md
+++ b/docs/source/features/lora.md
@@ -47,7 +47,7 @@ outputs = llm.generate(
 )
 ```
 
-Check out <gh-file:examples/multilora_inference.py> for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options.
+Check out <gh-file:examples/offline_inference/multilora_inference.py> for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options.
 
 ## Serving LoRA Adapters
 
diff --git a/docs/source/features/quantization/auto_awq.md b/docs/source/features/quantization/auto_awq.md
index c02fbf0605a8c..3679595e3d4d0 100644
--- a/docs/source/features/quantization/auto_awq.md
+++ b/docs/source/features/quantization/auto_awq.md
@@ -47,7 +47,7 @@ print(f'Model is quantized and saved at "{quant_path}"')
 To run an AWQ model with vLLM, you can use [TheBloke/Llama-2-7b-Chat-AWQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-AWQ) with the following command:
 
 ```console
-$ python examples/llm_engine_example.py --model TheBloke/Llama-2-7b-Chat-AWQ --quantization awq
+$ python examples/offline_inference/llm_engine_example.py --model TheBloke/Llama-2-7b-Chat-AWQ --quantization awq
 ```
 
 AWQ models are also supported directly through the LLM entrypoint:
diff --git a/docs/source/features/quantization/fp8_e4m3_kvcache.md b/docs/source/features/quantization/fp8_e4m3_kvcache.md
index f200c722d1d42..50edaf81fddd3 100644
--- a/docs/source/features/quantization/fp8_e4m3_kvcache.md
+++ b/docs/source/features/quantization/fp8_e4m3_kvcache.md
@@ -28,7 +28,7 @@ Here is an example of how to enable this feature:
 
 ```python
 # two float8_e4m3fn kv cache scaling factor files are provided under tests/fp8_kv, please refer to
-# https://github.com/vllm-project/vllm/blob/main/examples/fp8/README.md to generate kv_cache_scales.json of your own.
+# https://github.com/vllm-project/vllm/blob/main/examples/other/fp8/README.md to generate kv_cache_scales.json of your own.
 
 from vllm import LLM, SamplingParams
 sampling_params = SamplingParams(temperature=1.3, top_p=0.8)
diff --git a/docs/source/features/structured_outputs.md b/docs/source/features/structured_outputs.md
index 26c09bb0d8a0c..ccd9a6a1b1a14 100644
--- a/docs/source/features/structured_outputs.md
+++ b/docs/source/features/structured_outputs.md
@@ -131,7 +131,7 @@ completion = client.chat.completions.create(
 print(completion.choices[0].message.content)
 ```
 
-Full example: <gh-file:examples/openai_chat_completion_structured_outputs.py>
+Full example: <gh-file:examples/online_serving/openai_chat_completion_structured_outputs.py>
 
 ## Experimental Automatic Parsing (OpenAI API)
 
@@ -257,4 +257,4 @@ outputs = llm.generate(
 print(outputs[0].outputs[0].text)
 ```
 
-Full example: <gh-file:examples/offline_inference_structured_outputs.py>
+Full example: <gh-file:examples/offline_inference/offline_inference_structured_outputs.py>
diff --git a/docs/source/generate_examples.py b/docs/source/generate_examples.py
index 32bb86c469c78..aaa13d0fb6d3f 100644
--- a/docs/source/generate_examples.py
+++ b/docs/source/generate_examples.py
@@ -12,6 +12,7 @@
 def fix_case(text: str) -> str:
     subs = {
         "api": "API",
+        "Cli": "CLI",
         "cpu": "CPU",
         "llm": "LLM",
         "tpu": "TPU",
@@ -58,7 +59,7 @@ def generate(self) -> str:
         content = f"# {self.title}\n\n{self.description}\n\n"
         content += "```{toctree}\n"
         content += f":caption: {self.caption}\n:maxdepth: {self.maxdepth}\n"
-        content += "\n".join(sorted(self.documents)) + "\n```\n"
+        content += "\n".join(self.documents) + "\n```\n"
         return content
 
 
@@ -131,11 +132,14 @@ def generate(self) -> str:
             ROOT_DIR)
 
         content = f"Source <gh-file:{self.path.relative_to(ROOT_DIR)}>.\n\n"
-        if self.main_file.suffix == ".py":
-            content += f"# {self.title}\n\n"
         include = "include" if self.main_file.suffix == ".md" else \
             "literalinclude"
-        content += f":::{{{include}}} {make_relative(self.main_file)}\n:::\n\n"
+        if include == "literalinclude":
+            content += f"# {self.title}\n\n"
+        content += f":::{{{include}}} {make_relative(self.main_file)}\n"
+        if include == "literalinclude":
+            content += f":language: {self.main_file.suffix[1:]}\n"
+        content += ":::\n\n"
 
         if not self.other_files:
             return content
@@ -163,14 +167,16 @@ def generate_examples():
         description=
         "A collection of examples demonstrating usage of vLLM.\nAll documented examples are autogenerated using <gh-file:docs/source/generate_examples.py> from examples found in <gh-file:examples>.",  # noqa: E501
         caption="Examples",
-        maxdepth=1)  # TODO change to 2 when examples start being categorised
+        maxdepth=2)
+    # Category indices stored in reverse order because they are inserted into
+    # examples_index.documents at index 0 in order
     category_indices = {
-        "offline_inference":
+        "other":
         Index(
-            path=EXAMPLE_DOC_DIR / "examples_offline_inference_index.md",
-            title="Offline Inference",
+            path=EXAMPLE_DOC_DIR / "examples_other_index.md",
+            title="Other",
             description=
-            "Offline inference examples demonstrate how to use vLLM in an offline setting, where the model is queried for predictions in batches.",  # noqa: E501
+            "Other examples that don't strongly fit into the online or offline serving categories.",  # noqa: E501
             caption="Examples",
         ),
         "online_serving":
@@ -181,31 +187,30 @@ def generate_examples():
             "Online serving examples demonstrate how to use vLLM in an online setting, where the model is queried for predictions in real-time.",  # noqa: E501
             caption="Examples",
         ),
-        "other":
+        "offline_inference":
         Index(
-            path=EXAMPLE_DOC_DIR / "examples_other_index.md",
-            title="Other",
+            path=EXAMPLE_DOC_DIR / "examples_offline_inference_index.md",
+            title="Offline Inference",
             description=
-            "Other examples that don't strongly fit into the online or offline serving categories.",  # noqa: E501
+            "Offline inference examples demonstrate how to use vLLM in an offline setting, where the model is queried for predictions in batches.",  # noqa: E501
             caption="Examples",
         ),
     }
 
     examples = []
+    glob_patterns = ["*.py", "*.md", "*.sh"]
     # Find categorised examples
     for category in category_indices:
         category_dir = EXAMPLE_DIR / category
-        py = category_dir.glob("*.py")
-        md = category_dir.glob("*.md")
-        for path in itertools.chain(py, md):
+        globs = [category_dir.glob(pattern) for pattern in glob_patterns]
+        for path in itertools.chain(*globs):
             examples.append(Example(path, category))
         # Find examples in subdirectories
         for path in category_dir.glob("*/*.md"):
             examples.append(Example(path.parent, category))
     # Find uncategorised examples
-    py = EXAMPLE_DIR.glob("*.py")
-    md = EXAMPLE_DIR.glob("*.md")
-    for path in itertools.chain(py, md):
+    globs = [EXAMPLE_DIR.glob(pattern) for pattern in glob_patterns]
+    for path in itertools.chain(*globs):
         examples.append(Example(path))
     # Find examples in subdirectories
     for path in EXAMPLE_DIR.glob("*/*.md"):
@@ -215,7 +220,7 @@ def generate_examples():
         examples.append(Example(path.parent))
 
     # Generate the example documentation
-    for example in examples:
+    for example in sorted(examples, key=lambda e: e.path.stem):
         doc_path = EXAMPLE_DOC_DIR / f"{example.path.stem}.md"
         with open(doc_path, "w+") as f:
             f.write(example.generate())
diff --git a/docs/source/getting_started/installation/cpu-x86.md b/docs/source/getting_started/installation/cpu-x86.md
index bbb2d1872ef39..bb046dd0fd9dc 100644
--- a/docs/source/getting_started/installation/cpu-x86.md
+++ b/docs/source/getting_started/installation/cpu-x86.md
@@ -95,7 +95,7 @@ $ VLLM_TARGET_DEVICE=cpu python setup.py install
 $ sudo apt-get install libtcmalloc-minimal4 # install TCMalloc library
 $ find / -name *libtcmalloc* # find the dynamic link library path
 $ export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD # prepend the library to LD_PRELOAD
-$ python examples/offline_inference.py # run vLLM
+$ python examples/offline_inference/offline_inference.py # run vLLM
 ```
 
 - When using the online serving, it is recommended to reserve 1-2 CPU cores for the serving framework to avoid CPU oversubscription. For example, on a platform with 32 physical CPU cores, reserving CPU 30 and 31 for the framework and using CPU 0-29 for OpenMP:
@@ -132,7 +132,7 @@ CPU NODE SOCKET CORE L1d:L1i:L2:L3 ONLINE    MAXMHZ   MINMHZ      MHZ
 
 # On this platform, it is recommend to only bind openMP threads on logical CPU cores 0-7 or 8-15
 $ export VLLM_CPU_OMP_THREADS_BIND=0-7
-$ python examples/offline_inference.py
+$ python examples/offline_inference/offline_inference.py
 ```
 
 - If using vLLM CPU backend on a multi-socket machine with NUMA, be aware to set CPU cores using `VLLM_CPU_OMP_THREADS_BIND` to avoid cross NUMA node memory access.
diff --git a/docs/source/getting_started/installation/xpu.md b/docs/source/getting_started/installation/xpu.md
index be4e3b9bd1bc5..c1ab5478eb652 100644
--- a/docs/source/getting_started/installation/xpu.md
+++ b/docs/source/getting_started/installation/xpu.md
@@ -71,4 +71,4 @@ $      --pipeline-parallel-size=2 \
 $      -tp=8
 ```
 
-By default, a ray instance will be launched automatically if no existing one is detected in system, with `num-gpus` equals to `parallel_config.world_size`. We recommend properly starting a ray cluster before execution, referring to the <gh-file:examples/run_cluster.sh> helper script.
+By default, a ray instance will be launched automatically if no existing one is detected in system, with `num-gpus` equals to `parallel_config.world_size`. We recommend properly starting a ray cluster before execution, referring to the <gh-file:examples/online_serving/run_cluster.sh> helper script.
diff --git a/docs/source/getting_started/quickstart.md b/docs/source/getting_started/quickstart.md
index 3f9556165ece4..6b56918ce5638 100644
--- a/docs/source/getting_started/quickstart.md
+++ b/docs/source/getting_started/quickstart.md
@@ -31,7 +31,7 @@ For non-CUDA platforms, please refer [here](#installation-index) for specific in
 
 ## Offline Batched Inference
 
-With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). See the example script: <gh-file:examples/offline_inference.py>
+With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). See the example script: <gh-file:examples/offline_inference/offline_inference.py>
 
 The first line of this example imports the classes {class}`~vllm.LLM` and {class}`~vllm.SamplingParams`:
 
@@ -133,7 +133,7 @@ completion = client.completions.create(model="Qwen/Qwen2.5-1.5B-Instruct",
 print("Completion result:", completion)
 ```
 
-A more detailed client example can be found here: <gh-file:examples/openai_completion_client.py>
+A more detailed client example can be found here: <gh-file:examples/online_serving/openai_completion_client.py>
 
 ### OpenAI Chat Completions API with vLLM
 
diff --git a/docs/source/getting_started/troubleshooting.md b/docs/source/getting_started/troubleshooting.md
index 5a0310da0f2cb..f5efe0bef7506 100644
--- a/docs/source/getting_started/troubleshooting.md
+++ b/docs/source/getting_started/troubleshooting.md
@@ -24,7 +24,7 @@ To isolate the model downloading and loading issue, you can use the `--load-form
 
 ## Model is too large
 
-If the model is too large to fit in a single GPU, you might want to [consider tensor parallelism](#distributed-serving) to split the model across multiple GPUs. In that case, every process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). You can convert the model checkpoint to a sharded checkpoint using <gh-file:examples/save_sharded_state.py>. The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism.
+If the model is too large to fit in a single GPU, you might want to [consider tensor parallelism](#distributed-serving) to split the model across multiple GPUs. In that case, every process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). You can convert the model checkpoint to a sharded checkpoint using <gh-file:examples/offline_inference/save_sharded_state.py>. The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism.
 
 ## Enable more logging
 
diff --git a/docs/source/models/extensions/tensorizer.md b/docs/source/models/extensions/tensorizer.md
index 42ed5c795dd27..ae17e3437bca6 100644
--- a/docs/source/models/extensions/tensorizer.md
+++ b/docs/source/models/extensions/tensorizer.md
@@ -9,7 +9,7 @@ shorter Pod startup times and CPU memory usage. Tensor encryption is also suppor
 
 For more information on CoreWeave's Tensorizer, please refer to
 [CoreWeave's Tensorizer documentation](https://github.com/coreweave/tensorizer). For more information on serializing a vLLM model, as well a general usage guide to using Tensorizer with vLLM, see
-the [vLLM example script](https://docs.vllm.ai/en/stable/getting_started/examples/tensorize_vllm_model.html).
+the [vLLM example script](https://docs.vllm.ai/en/stable/getting_started/examples/offline_inference/tensorize_vllm_model.html).
 
 ```{note}
 Note that to use this feature you will need to install `tensorizer` by running `pip install vllm[tensorizer]`.
diff --git a/docs/source/models/generative_models.md b/docs/source/models/generative_models.md
index 383299d61b5dd..6228c7c2ac957 100644
--- a/docs/source/models/generative_models.md
+++ b/docs/source/models/generative_models.md
@@ -46,7 +46,7 @@ for output in outputs:
     print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
 ```
 
-A code example can be found here: <gh-file:examples/offline_inference.py>
+A code example can be found here: <gh-file:examples/offline_inference/offline_inference.py>
 
 ### `LLM.beam_search`
 
@@ -103,7 +103,7 @@ for output in outputs:
     print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
 ```
 
-A code example can be found here: <gh-file:examples/offline_inference_chat.py>
+A code example can be found here: <gh-file:examples/offline_inference/offline_inference_chat.py>
 
 If the model doesn't have a chat template or you want to specify another one,
 you can explicitly pass a chat template:
diff --git a/docs/source/models/pooling_models.md b/docs/source/models/pooling_models.md
index 12ded68eb30b5..3e4407cfdc233 100644
--- a/docs/source/models/pooling_models.md
+++ b/docs/source/models/pooling_models.md
@@ -65,7 +65,7 @@ embeds = output.outputs.embedding
 print(f"Embeddings: {embeds!r} (size={len(embeds)})")
 ```
 
-A code example can be found here: <gh-file:examples/offline_inference_embedding.py>
+A code example can be found here: <gh-file:examples/offline_inference/offline_inference_embedding.py>
 
 ### `LLM.classify`
 
@@ -80,7 +80,7 @@ probs = output.outputs.probs
 print(f"Class Probabilities: {probs!r} (size={len(probs)})")
 ```
 
-A code example can be found here: <gh-file:examples/offline_inference_classification.py>
+A code example can be found here: <gh-file:examples/offline_inference/offline_inference_classification.py>
 
 ### `LLM.score`
 
@@ -102,7 +102,7 @@ score = output.outputs.score
 print(f"Score: {score}")
 ```
 
-A code example can be found here: <gh-file:examples/offline_inference_scoring.py>
+A code example can be found here: <gh-file:examples/offline_inference/offline_inference_scoring.py>
 
 ## Online Inference
 
diff --git a/docs/source/serving/distributed_serving.md b/docs/source/serving/distributed_serving.md
index b1703249d7224..4e0a9ef6ecf7d 100644
--- a/docs/source/serving/distributed_serving.md
+++ b/docs/source/serving/distributed_serving.md
@@ -51,7 +51,7 @@ $     --pipeline-parallel-size 2
 
 If a single node does not have enough GPUs to hold the model, you can run the model using multiple nodes. It is important to make sure the execution environment is the same on all nodes, including the model path, the Python environment. The recommended way is to use docker images to ensure the same environment, and hide the heterogeneity of the host machines via mapping them into the same docker configuration.
 
-The first step, is to start containers and organize them into a cluster. We have provided the helper script <gh-file:examples/run_cluster.sh> to start the cluster. Please note, this script launches docker without administrative privileges that would be required to access GPU performance counters when running profiling and tracing tools. For that purpose, the script can have `CAP_SYS_ADMIN` to the docker container by using the `--cap-add` option in the docker run command.
+The first step, is to start containers and organize them into a cluster. We have provided the helper script <gh-file:examples/online_serving/run_cluster.sh> to start the cluster. Please note, this script launches docker without administrative privileges that would be required to access GPU performance counters when running profiling and tracing tools. For that purpose, the script can have `CAP_SYS_ADMIN` to the docker container by using the `--cap-add` option in the docker run command.
 
 Pick a node as the head node, and run the following command:
 
diff --git a/docs/source/serving/multimodal_inputs.md b/docs/source/serving/multimodal_inputs.md
index 0efa09f2869ca..9f5e1b908d786 100644
--- a/docs/source/serving/multimodal_inputs.md
+++ b/docs/source/serving/multimodal_inputs.md
@@ -60,7 +60,7 @@ for o in outputs:
     print(generated_text)
 ```
 
-Full example: <gh-file:examples/offline_inference_vision_language.py>
+Full example: <gh-file:examples/offline_inference/offline_inference_vision_language.py>
 
 To substitute multiple images inside the same text prompt, you can pass in a list of images instead:
 
@@ -91,7 +91,7 @@ for o in outputs:
     print(generated_text)
 ```
 
-Full example: <gh-file:examples/offline_inference_vision_language_multi_image.py>
+Full example: <gh-file:examples/offline_inference/offline_inference_vision_language_multi_image.py>
 
 Multi-image input can be extended to perform video captioning. We show this with [Qwen2-VL](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct) as it supports videos:
 
@@ -125,13 +125,13 @@ for o in outputs:
 You can pass a list of NumPy arrays directly to the `'video'` field of the multi-modal dictionary
 instead of using multi-image input.
 
-Full example: <gh-file:examples/offline_inference_vision_language.py>
+Full example: <gh-file:examples/offline_inference/offline_inference_vision_language.py>
 
 ### Audio
 
 You can pass a tuple `(array, sampling_rate)` to the `'audio'` field of the multi-modal dictionary.
 
-Full example: <gh-file:examples/offline_inference_audio_language.py>
+Full example: <gh-file:examples/offline_inference/offline_inference_audio_language.py>
 
 ### Embedding
 
@@ -271,7 +271,7 @@ chat_response = client.chat.completions.create(
 print("Chat completion output:", chat_response.choices[0].message.content)
 ```
 
-Full example: <gh-file:examples/openai_chat_completion_client_for_multimodal.py>
+Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>
 
 ```{tip}
 Loading from local file paths is also supported on vLLM: You can specify the allowed local media path via `--allowed-local-media-path` when launching the API server/engine,
@@ -342,7 +342,7 @@ result = chat_completion_from_url.choices[0].message.content
 print("Chat completion output from image url:", result)
 ```
 
-Full example: <gh-file:examples/openai_chat_completion_client_for_multimodal.py>
+Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>
 
 ````{note}
 By default, the timeout for fetching videos through HTTP URL is `30` seconds.
@@ -445,7 +445,7 @@ result = chat_completion_from_url.choices[0].message.content
 print("Chat completion output from audio url:", result)
 ```
 
-Full example: <gh-file:examples/openai_chat_completion_client_for_multimodal.py>
+Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>
 
 ````{note}
 By default, the timeout for fetching audios through HTTP URL is `10` seconds.
@@ -529,4 +529,4 @@ Also important, `MrLight/dse-qwen2-2b-mrl-v1` requires a placeholder image of th
 example below for details.
 ```
 
-Full example: <gh-file:examples/openai_chat_embedding_client_for_multimodal.py>
+Full example: <gh-file:examples/online_serving/openai_chat_embedding_client_for_multimodal.py>
diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md
index 1e5ea6357d202..022dd3ae8a237 100644
--- a/docs/source/serving/openai_compatible_server.md
+++ b/docs/source/serving/openai_compatible_server.md
@@ -191,7 +191,7 @@ The order of priorities is `command line > config file values > defaults`.
 Our Completions API is compatible with [OpenAI's Completions API](https://platform.openai.com/docs/api-reference/completions);
 you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it.
 
-Code example: <gh-file:examples/openai_completion_client.py>
+Code example: <gh-file:examples/online_serving/openai_completion_client.py>
 
 #### Extra parameters
 
@@ -222,7 +222,7 @@ We support both [Vision](https://platform.openai.com/docs/guides/vision)- and
 see our [Multimodal Inputs](#multimodal-inputs) guide for more information.
 - *Note: `image_url.detail` parameter is not supported.*
 
-Code example: <gh-file:examples/openai_chat_completion_client.py>
+Code example: <gh-file:examples/online_serving/openai_chat_completion_client.py>
 
 #### Extra parameters
 
@@ -255,7 +255,7 @@ which will be treated as a single prompt to the model.
 This enables multi-modal inputs to be passed to embedding models, see [this page](#multimodal-inputs) for details.
 ```
 
-Code example: <gh-file:examples/openai_embedding_client.py>
+Code example: <gh-file:examples/online_serving/openai_embedding_client.py>
 
 #### Extra parameters
 
@@ -299,7 +299,7 @@ Our Pooling API encodes input prompts using a [pooling model](../models/pooling_
 
 The input format is the same as [Embeddings API](#embeddings-api), but the output data can contain an arbitrary nested list, not just a 1-D list of floats.
 
-Code example: <gh-file:examples/openai_pooling_client.py>
+Code example: <gh-file:examples/online_serving/openai_pooling_client.py>
 
 (score-api)=
 ### Score API
@@ -309,7 +309,7 @@ Usually, the score for a sentence pair refers to the similarity between two sent
 
 You can find the documentation for these kind of models at [sbert.net](https://www.sbert.net/docs/package_reference/cross_encoder/cross_encoder.html).
 
-Code example: <gh-file:examples/openai_cross_encoder_score.py>
+Code example: <gh-file:examples/online_serving/openai_cross_encoder_score.py>
 
 #### Single inference
 
diff --git a/examples/aqlm_example.py b/examples/offline_inference/aqlm_example.py
similarity index 100%
rename from examples/aqlm_example.py
rename to examples/offline_inference/aqlm_example.py
diff --git a/examples/cpu_offload.py b/examples/offline_inference/cpu_offload.py
similarity index 100%
rename from examples/cpu_offload.py
rename to examples/offline_inference/cpu_offload.py
diff --git a/examples/florence2_inference.py b/examples/offline_inference/florence2_inference.py
similarity index 92%
rename from examples/florence2_inference.py
rename to examples/offline_inference/florence2_inference.py
index b58ac2e1f7ed4..49dd2c331db5a 100644
--- a/examples/florence2_inference.py
+++ b/examples/offline_inference/florence2_inference.py
@@ -3,7 +3,8 @@
 encoder/decoder models, specifically Florence-2
 '''
 # TODO(Isotr0py):
-# Move to offline_inference_vision_language.py after porting vision backbone
+# Move to offline_inference/offline_inference_vision_language.py
+# after porting vision backbone
 from vllm import LLM, SamplingParams
 
 dtype = "float"
diff --git a/examples/gguf_inference.py b/examples/offline_inference/gguf_inference.py
similarity index 100%
rename from examples/gguf_inference.py
rename to examples/offline_inference/gguf_inference.py
diff --git a/examples/llm_engine_example.py b/examples/offline_inference/llm_engine_example.py
similarity index 100%
rename from examples/llm_engine_example.py
rename to examples/offline_inference/llm_engine_example.py
diff --git a/examples/lora_with_quantization_inference.py b/examples/offline_inference/lora_with_quantization_inference.py
similarity index 100%
rename from examples/lora_with_quantization_inference.py
rename to examples/offline_inference/lora_with_quantization_inference.py
diff --git a/examples/multilora_inference.py b/examples/offline_inference/multilora_inference.py
similarity index 100%
rename from examples/multilora_inference.py
rename to examples/offline_inference/multilora_inference.py
diff --git a/examples/offline_chat_with_tools.py b/examples/offline_inference/offline_chat_with_tools.py
similarity index 100%
rename from examples/offline_chat_with_tools.py
rename to examples/offline_inference/offline_chat_with_tools.py
diff --git a/examples/offline_inference.py b/examples/offline_inference/offline_inference.py
similarity index 100%
rename from examples/offline_inference.py
rename to examples/offline_inference/offline_inference.py
diff --git a/examples/offline_inference_arctic.py b/examples/offline_inference/offline_inference_arctic.py
similarity index 100%
rename from examples/offline_inference_arctic.py
rename to examples/offline_inference/offline_inference_arctic.py
diff --git a/examples/offline_inference_audio_language.py b/examples/offline_inference/offline_inference_audio_language.py
similarity index 100%
rename from examples/offline_inference_audio_language.py
rename to examples/offline_inference/offline_inference_audio_language.py
diff --git a/examples/offline_inference_chat.py b/examples/offline_inference/offline_inference_chat.py
similarity index 100%
rename from examples/offline_inference_chat.py
rename to examples/offline_inference/offline_inference_chat.py
diff --git a/examples/offline_inference_classification.py b/examples/offline_inference/offline_inference_classification.py
similarity index 100%
rename from examples/offline_inference_classification.py
rename to examples/offline_inference/offline_inference_classification.py
diff --git a/examples/offline_inference_cli.py b/examples/offline_inference/offline_inference_cli.py
similarity index 100%
rename from examples/offline_inference_cli.py
rename to examples/offline_inference/offline_inference_cli.py
diff --git a/examples/offline_inference_distributed.py b/examples/offline_inference/offline_inference_distributed.py
similarity index 100%
rename from examples/offline_inference_distributed.py
rename to examples/offline_inference/offline_inference_distributed.py
diff --git a/examples/offline_inference_embedding.py b/examples/offline_inference/offline_inference_embedding.py
similarity index 100%
rename from examples/offline_inference_embedding.py
rename to examples/offline_inference/offline_inference_embedding.py
diff --git a/examples/offline_inference_encoder_decoder.py b/examples/offline_inference/offline_inference_encoder_decoder.py
similarity index 100%
rename from examples/offline_inference_encoder_decoder.py
rename to examples/offline_inference/offline_inference_encoder_decoder.py
diff --git a/examples/offline_inference_mlpspeculator.py b/examples/offline_inference/offline_inference_mlpspeculator.py
similarity index 100%
rename from examples/offline_inference_mlpspeculator.py
rename to examples/offline_inference/offline_inference_mlpspeculator.py
diff --git a/examples/offline_inference_neuron.py b/examples/offline_inference/offline_inference_neuron.py
similarity index 100%
rename from examples/offline_inference_neuron.py
rename to examples/offline_inference/offline_inference_neuron.py
diff --git a/examples/offline_inference_neuron_int8_quantization.py b/examples/offline_inference/offline_inference_neuron_int8_quantization.py
similarity index 100%
rename from examples/offline_inference_neuron_int8_quantization.py
rename to examples/offline_inference/offline_inference_neuron_int8_quantization.py
diff --git a/examples/offline_inference_openai.md b/examples/offline_inference/offline_inference_openai/offline_inference_openai.md
similarity index 90%
rename from examples/offline_inference_openai.md
rename to examples/offline_inference/offline_inference_openai/offline_inference_openai.md
index 2436417cb543a..6278a1943fe4a 100644
--- a/examples/offline_inference_openai.md
+++ b/examples/offline_inference/offline_inference_openai/offline_inference_openai.md
@@ -8,7 +8,7 @@ This is a guide to performing batch inference using the OpenAI batch file format
  
 The OpenAI batch file format consists of a series of json objects on new lines.
  
-[See here for an example file.](https://github.com/vllm-project/vllm/blob/main/examples/openai_example_batch.jsonl)
+[See here for an example file.](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl)
  
 Each line represents a separate request. See the [OpenAI package reference](https://platform.openai.com/docs/api-reference/batch/requestInput) for more details.
  
@@ -31,13 +31,13 @@ We currently only support `/v1/chat/completions` and `/v1/embeddings` endpoints
 To follow along with this example, you can download the example batch, or create your own batch file in your working directory.
 
 ```
-wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/openai_example_batch.jsonl
+wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl
 ```
 
 Once you've created your batch file it should look like this
 
 ```
-$ cat openai_example_batch.jsonl
+$ cat offline_inference/offline_inference_openai/openai_example_batch.jsonl
 {"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
 {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
 ```
@@ -49,7 +49,7 @@ The batch running tool is designed to be used from the command line.
 You can run the batch with the following command, which will write its results to a file called `results.jsonl`
 
 ```
-python -m vllm.entrypoints.openai.run_batch -i openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
+python -m vllm.entrypoints.openai.run_batch -i offline_inference/offline_inference_openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
 ```
 
 ### Step 3: Check your results
@@ -66,10 +66,10 @@ $ cat results.jsonl
 
 The batch runner supports remote input and output urls that are accessible via http/https.
 
-For example, to run against our example input file located at `https://raw.githubusercontent.com/vllm-project/vllm/main/examples/openai_example_batch.jsonl`, you can run
+For example, to run against our example input file located at `https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl`, you can run
 
 ```
-python -m vllm.entrypoints.openai.run_batch -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
+python -m vllm.entrypoints.openai.run_batch -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
 ```
 
 ## Example 3: Integrating with AWS S3
@@ -90,13 +90,13 @@ To integrate with cloud blob storage, we recommend using presigned urls.
 To follow along with this example, you can download the example batch, or create your own batch file in your working directory.
 
 ```
-wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/openai_example_batch.jsonl
+wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl
 ```
 
 Once you've created your batch file it should look like this
 
 ```
-$ cat openai_example_batch.jsonl
+$ cat offline_inference/offline_inference_openai/openai_example_batch.jsonl
 {"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
 {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
 ```
@@ -104,7 +104,7 @@ $ cat openai_example_batch.jsonl
 Now upload your batch file to your S3 bucket.
 
 ```
-aws s3 cp openai_example_batch.jsonl s3://MY_BUCKET/MY_INPUT_FILE.jsonl
+aws s3 cp offline_inference/offline_inference_openai/openai_example_batch.jsonl s3://MY_BUCKET/MY_INPUT_FILE.jsonl
 ```
 
 ### Step 2: Generate your presigned urls
diff --git a/examples/openai_example_batch.jsonl b/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl
similarity index 100%
rename from examples/openai_example_batch.jsonl
rename to examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl
diff --git a/examples/offline_inference_pixtral.py b/examples/offline_inference/offline_inference_pixtral.py
similarity index 100%
rename from examples/offline_inference_pixtral.py
rename to examples/offline_inference/offline_inference_pixtral.py
diff --git a/examples/offline_inference_scoring.py b/examples/offline_inference/offline_inference_scoring.py
similarity index 100%
rename from examples/offline_inference_scoring.py
rename to examples/offline_inference/offline_inference_scoring.py
diff --git a/examples/offline_inference_structured_outputs.py b/examples/offline_inference/offline_inference_structured_outputs.py
similarity index 100%
rename from examples/offline_inference_structured_outputs.py
rename to examples/offline_inference/offline_inference_structured_outputs.py
diff --git a/examples/offline_inference_tpu.py b/examples/offline_inference/offline_inference_tpu.py
similarity index 100%
rename from examples/offline_inference_tpu.py
rename to examples/offline_inference/offline_inference_tpu.py
diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference/offline_inference_vision_language.py
similarity index 100%
rename from examples/offline_inference_vision_language.py
rename to examples/offline_inference/offline_inference_vision_language.py
diff --git a/examples/offline_inference_vision_language_embedding.py b/examples/offline_inference/offline_inference_vision_language_embedding.py
similarity index 100%
rename from examples/offline_inference_vision_language_embedding.py
rename to examples/offline_inference/offline_inference_vision_language_embedding.py
diff --git a/examples/offline_inference_vision_language_multi_image.py b/examples/offline_inference/offline_inference_vision_language_multi_image.py
similarity index 100%
rename from examples/offline_inference_vision_language_multi_image.py
rename to examples/offline_inference/offline_inference_vision_language_multi_image.py
diff --git a/examples/offline_inference_whisper.py b/examples/offline_inference/offline_inference_whisper.py
similarity index 100%
rename from examples/offline_inference_whisper.py
rename to examples/offline_inference/offline_inference_whisper.py
diff --git a/examples/offline_inference_with_default_generation_config.py b/examples/offline_inference/offline_inference_with_default_generation_config.py
similarity index 100%
rename from examples/offline_inference_with_default_generation_config.py
rename to examples/offline_inference/offline_inference_with_default_generation_config.py
diff --git a/examples/offline_inference_with_prefix.py b/examples/offline_inference/offline_inference_with_prefix.py
similarity index 100%
rename from examples/offline_inference_with_prefix.py
rename to examples/offline_inference/offline_inference_with_prefix.py
diff --git a/examples/offline_inference_with_profiler.py b/examples/offline_inference/offline_inference_with_profiler.py
similarity index 100%
rename from examples/offline_inference_with_profiler.py
rename to examples/offline_inference/offline_inference_with_profiler.py
diff --git a/examples/offline_profile.py b/examples/offline_inference/offline_profile.py
similarity index 99%
rename from examples/offline_profile.py
rename to examples/offline_inference/offline_profile.py
index 46afe8aa2604b..187a05e4d70a2 100644
--- a/examples/offline_profile.py
+++ b/examples/offline_inference/offline_profile.py
@@ -363,7 +363,7 @@ def abort_requests():
 
     example:
     ```
-    python examples/offline_profile.py \\
+    python examples/offline_inference/offline_profile.py \\
         --model neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 --batch-size 4 \\
         --prompt-len 512 --max-num-batched-tokens 8196 --json Llama31-8b-FP8 \\
         --enforce-eager run_num_steps -n 2
diff --git a/examples/save_sharded_state.py b/examples/offline_inference/save_sharded_state.py
similarity index 100%
rename from examples/save_sharded_state.py
rename to examples/offline_inference/save_sharded_state.py
diff --git a/examples/api_client.py b/examples/online_serving/api_client.py
similarity index 100%
rename from examples/api_client.py
rename to examples/online_serving/api_client.py
diff --git a/examples/chart-helm/.helmignore b/examples/online_serving/chart-helm/.helmignore
similarity index 100%
rename from examples/chart-helm/.helmignore
rename to examples/online_serving/chart-helm/.helmignore
diff --git a/examples/chart-helm/Chart.yaml b/examples/online_serving/chart-helm/Chart.yaml
similarity index 100%
rename from examples/chart-helm/Chart.yaml
rename to examples/online_serving/chart-helm/Chart.yaml
diff --git a/examples/online_serving/chart-helm/README.md b/examples/online_serving/chart-helm/README.md
new file mode 100644
index 0000000000000..6aa126d4fd22c
--- /dev/null
+++ b/examples/online_serving/chart-helm/README.md
@@ -0,0 +1,21 @@
+# Helm Charts
+
+This directory contains a Helm chart for deploying the vllm application. The chart includes configurations for deployment, autoscaling, resource management, and more.
+
+## Files
+
+- Chart.yaml: Defines the chart metadata including name, version, and maintainers.
+- ct.yaml: Configuration for chart testing.
+- lintconf.yaml: Linting rules for YAML files.
+- values.schema.json: JSON schema for validating values.yaml.
+- values.yaml: Default values for the Helm chart.
+- templates/_helpers.tpl: Helper templates for defining common configurations.
+- templates/configmap.yaml: Template for creating ConfigMaps.
+- templates/custom-objects.yaml: Template for custom Kubernetes objects.
+- templates/deployment.yaml: Template for creating Deployments.
+- templates/hpa.yaml: Template for Horizontal Pod Autoscaler.
+- templates/job.yaml: Template for Kubernetes Jobs.
+- templates/poddisruptionbudget.yaml: Template for Pod Disruption Budget.
+- templates/pvc.yaml: Template for Persistent Volume Claims.
+- templates/secrets.yaml: Template for Kubernetes Secrets.
+- templates/service.yaml: Template for creating Services.
\ No newline at end of file
diff --git a/examples/chart-helm/ct.yaml b/examples/online_serving/chart-helm/ct.yaml
similarity index 100%
rename from examples/chart-helm/ct.yaml
rename to examples/online_serving/chart-helm/ct.yaml
diff --git a/examples/chart-helm/lintconf.yaml b/examples/online_serving/chart-helm/lintconf.yaml
similarity index 100%
rename from examples/chart-helm/lintconf.yaml
rename to examples/online_serving/chart-helm/lintconf.yaml
diff --git a/examples/chart-helm/templates/_helpers.tpl b/examples/online_serving/chart-helm/templates/_helpers.tpl
similarity index 100%
rename from examples/chart-helm/templates/_helpers.tpl
rename to examples/online_serving/chart-helm/templates/_helpers.tpl
diff --git a/examples/chart-helm/templates/configmap.yaml b/examples/online_serving/chart-helm/templates/configmap.yaml
similarity index 100%
rename from examples/chart-helm/templates/configmap.yaml
rename to examples/online_serving/chart-helm/templates/configmap.yaml
diff --git a/examples/chart-helm/templates/custom-objects.yaml b/examples/online_serving/chart-helm/templates/custom-objects.yaml
similarity index 100%
rename from examples/chart-helm/templates/custom-objects.yaml
rename to examples/online_serving/chart-helm/templates/custom-objects.yaml
diff --git a/examples/chart-helm/templates/deployment.yaml b/examples/online_serving/chart-helm/templates/deployment.yaml
similarity index 100%
rename from examples/chart-helm/templates/deployment.yaml
rename to examples/online_serving/chart-helm/templates/deployment.yaml
diff --git a/examples/chart-helm/templates/hpa.yaml b/examples/online_serving/chart-helm/templates/hpa.yaml
similarity index 100%
rename from examples/chart-helm/templates/hpa.yaml
rename to examples/online_serving/chart-helm/templates/hpa.yaml
diff --git a/examples/chart-helm/templates/job.yaml b/examples/online_serving/chart-helm/templates/job.yaml
similarity index 100%
rename from examples/chart-helm/templates/job.yaml
rename to examples/online_serving/chart-helm/templates/job.yaml
diff --git a/examples/chart-helm/templates/poddisruptionbudget.yaml b/examples/online_serving/chart-helm/templates/poddisruptionbudget.yaml
similarity index 100%
rename from examples/chart-helm/templates/poddisruptionbudget.yaml
rename to examples/online_serving/chart-helm/templates/poddisruptionbudget.yaml
diff --git a/examples/chart-helm/templates/pvc.yaml b/examples/online_serving/chart-helm/templates/pvc.yaml
similarity index 100%
rename from examples/chart-helm/templates/pvc.yaml
rename to examples/online_serving/chart-helm/templates/pvc.yaml
diff --git a/examples/chart-helm/templates/secrets.yaml b/examples/online_serving/chart-helm/templates/secrets.yaml
similarity index 100%
rename from examples/chart-helm/templates/secrets.yaml
rename to examples/online_serving/chart-helm/templates/secrets.yaml
diff --git a/examples/chart-helm/templates/service.yaml b/examples/online_serving/chart-helm/templates/service.yaml
similarity index 100%
rename from examples/chart-helm/templates/service.yaml
rename to examples/online_serving/chart-helm/templates/service.yaml
diff --git a/examples/chart-helm/values.schema.json b/examples/online_serving/chart-helm/values.schema.json
similarity index 100%
rename from examples/chart-helm/values.schema.json
rename to examples/online_serving/chart-helm/values.schema.json
diff --git a/examples/chart-helm/values.yaml b/examples/online_serving/chart-helm/values.yaml
similarity index 100%
rename from examples/chart-helm/values.yaml
rename to examples/online_serving/chart-helm/values.yaml
diff --git a/examples/disaggregated_prefill.sh b/examples/online_serving/disaggregated_prefill.sh
similarity index 100%
rename from examples/disaggregated_prefill.sh
rename to examples/online_serving/disaggregated_prefill.sh
diff --git a/examples/gradio_openai_chatbot_webserver.py b/examples/online_serving/gradio_openai_chatbot_webserver.py
similarity index 100%
rename from examples/gradio_openai_chatbot_webserver.py
rename to examples/online_serving/gradio_openai_chatbot_webserver.py
diff --git a/examples/gradio_webserver.py b/examples/online_serving/gradio_webserver.py
similarity index 100%
rename from examples/gradio_webserver.py
rename to examples/online_serving/gradio_webserver.py
diff --git a/examples/openai_chat_completion_client.py b/examples/online_serving/openai_chat_completion_client.py
similarity index 100%
rename from examples/openai_chat_completion_client.py
rename to examples/online_serving/openai_chat_completion_client.py
diff --git a/examples/openai_chat_completion_client_for_multimodal.py b/examples/online_serving/openai_chat_completion_client_for_multimodal.py
similarity index 100%
rename from examples/openai_chat_completion_client_for_multimodal.py
rename to examples/online_serving/openai_chat_completion_client_for_multimodal.py
diff --git a/examples/openai_chat_completion_client_with_tools.py b/examples/online_serving/openai_chat_completion_client_with_tools.py
similarity index 100%
rename from examples/openai_chat_completion_client_with_tools.py
rename to examples/online_serving/openai_chat_completion_client_with_tools.py
diff --git a/examples/openai_chat_completion_structured_outputs.py b/examples/online_serving/openai_chat_completion_structured_outputs.py
similarity index 100%
rename from examples/openai_chat_completion_structured_outputs.py
rename to examples/online_serving/openai_chat_completion_structured_outputs.py
diff --git a/examples/openai_chat_embedding_client_for_multimodal.py b/examples/online_serving/openai_chat_embedding_client_for_multimodal.py
similarity index 100%
rename from examples/openai_chat_embedding_client_for_multimodal.py
rename to examples/online_serving/openai_chat_embedding_client_for_multimodal.py
diff --git a/examples/openai_completion_client.py b/examples/online_serving/openai_completion_client.py
similarity index 100%
rename from examples/openai_completion_client.py
rename to examples/online_serving/openai_completion_client.py
diff --git a/examples/openai_cross_encoder_score.py b/examples/online_serving/openai_cross_encoder_score.py
similarity index 100%
rename from examples/openai_cross_encoder_score.py
rename to examples/online_serving/openai_cross_encoder_score.py
diff --git a/examples/openai_embedding_client.py b/examples/online_serving/openai_embedding_client.py
similarity index 100%
rename from examples/openai_embedding_client.py
rename to examples/online_serving/openai_embedding_client.py
diff --git a/examples/openai_pooling_client.py b/examples/online_serving/openai_pooling_client.py
similarity index 100%
rename from examples/openai_pooling_client.py
rename to examples/online_serving/openai_pooling_client.py
diff --git a/examples/opentelemetry/Otel.md b/examples/online_serving/opentelemetry/Otel.md
similarity index 100%
rename from examples/opentelemetry/Otel.md
rename to examples/online_serving/opentelemetry/Otel.md
diff --git a/examples/opentelemetry/dummy_client.py b/examples/online_serving/opentelemetry/dummy_client.py
similarity index 100%
rename from examples/opentelemetry/dummy_client.py
rename to examples/online_serving/opentelemetry/dummy_client.py
diff --git a/examples/prometheus_grafana/README.md b/examples/online_serving/prometheus_grafana/README.md
similarity index 100%
rename from examples/prometheus_grafana/README.md
rename to examples/online_serving/prometheus_grafana/README.md
diff --git a/examples/prometheus_grafana/docker-compose.yaml b/examples/online_serving/prometheus_grafana/docker-compose.yaml
similarity index 100%
rename from examples/prometheus_grafana/docker-compose.yaml
rename to examples/online_serving/prometheus_grafana/docker-compose.yaml
diff --git a/examples/prometheus_grafana/grafana.json b/examples/online_serving/prometheus_grafana/grafana.json
similarity index 100%
rename from examples/prometheus_grafana/grafana.json
rename to examples/online_serving/prometheus_grafana/grafana.json
diff --git a/examples/prometheus_grafana/prometheus.yaml b/examples/online_serving/prometheus_grafana/prometheus.yaml
similarity index 100%
rename from examples/prometheus_grafana/prometheus.yaml
rename to examples/online_serving/prometheus_grafana/prometheus.yaml
diff --git a/examples/run_cluster.sh b/examples/online_serving/run_cluster.sh
similarity index 100%
rename from examples/run_cluster.sh
rename to examples/online_serving/run_cluster.sh
diff --git a/examples/sagemaker-entrypoint.sh b/examples/online_serving/sagemaker-entrypoint.sh
similarity index 100%
rename from examples/sagemaker-entrypoint.sh
rename to examples/online_serving/sagemaker-entrypoint.sh
diff --git a/examples/fp8/README.md b/examples/other/fp8/README.md
similarity index 88%
rename from examples/fp8/README.md
rename to examples/other/fp8/README.md
index 5492872cae93a..4e8031d954113 100644
--- a/examples/fp8/README.md
+++ b/examples/other/fp8/README.md
@@ -20,12 +20,12 @@ Before incorporating the FP8 datatype for inference workloads, you must adhere t
 ### 2. Convert HF model into a quantized HF model.
 Note: The following steps are adapted from the [TensorRT-LLM repository](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/quantization/README.md).
 
-`quantize.py` (examples/fp8/quantizer/quantize.py) uses the quantization toolkit  (AMMO) to calibrate the PyTorch models and export TensorRT-LLM checkpoints. Each TensorRT-LLM checkpoint contains a config file (in .json format) and one or several rank weight files (in .safetensors format).
+`quantize.py` (examples/other/fp8/quantizer/quantize.py) uses the quantization toolkit  (AMMO) to calibrate the PyTorch models and export TensorRT-LLM checkpoints. Each TensorRT-LLM checkpoint contains a config file (in .json format) and one or several rank weight files (in .safetensors format).
 
-The detailed quantization toolkit (AMMO) conversion guide for FP8 can be found at `examples/fp8/quantizer/README.md`.
+The detailed quantization toolkit (AMMO) conversion guide for FP8 can be found at `examples/other/fp8/quantizer/README.md`.
 
 ### 3. Extract KV Cache Scaling Factors from quantized HF model.
-`extract_scales.py` (examples/fp8/extract_scales.py) can be utilized to extract the KV cache scaling factors from your quantized HF model, however at the moment, this tool exclusively supports Llama 2 models. It is also important to note the following:
+`extract_scales.py` (examples/other/fp8/extract_scales.py) can be utilized to extract the KV cache scaling factors from your quantized HF model, however at the moment, this tool exclusively supports Llama 2 models. It is also important to note the following:
 1. **File Structure**: The utility operates under the assumption that all parameters, including KV cache scaling factors, corresponding to a particular Tensor Parallelism (TP) rank are stored in a single file. These files must adhere to a specific naming convention where the TP rank is immediately identified after a specific keyword (e.g., "rank") in the filename.
 
 2. **TP Decomposition**: The utility assumes consistency between the TP decomposition employed by the quantizer tool and that used by vLLM.
@@ -35,7 +35,7 @@ The detailed quantization toolkit (AMMO) conversion guide for FP8 can be found a
 ```python
 # prerequisites:
 # - Quantized HF LLaMa 2 model 
-python3 examples/fp8/extract_scales.py --help
+python3 examples/other/fp8/extract_scales.py --help
 Usage: extract_scales.py [-h] --quantized_model QUANTIZED_MODEL [--load_format {auto,safetensors,npz,pt}] [--output_dir OUTPUT_DIR] [--output_name OUTPUT_NAME] [--tp_size TP_SIZE]
 
 KV Scale Extraction Example
@@ -52,7 +52,7 @@ Optional arguments:
 ```
 ```python
 Example:
-python3 examples/fp8/extract_scales.py --quantized_model <QUANTIZED_MODEL_DIR> --tp_size <TENSOR_PARALLEL_SIZE> --output_dir <PATH_TO_OUTPUT_DIR>
+python3 examples/other/fp8/extract_scales.py --quantized_model <QUANTIZED_MODEL_DIR> --tp_size <TENSOR_PARALLEL_SIZE> --output_dir <PATH_TO_OUTPUT_DIR>
 ```
 ### 4. Load KV Cache Scaling Factors into VLLM.
 This script evaluates the inference throughput of language models using various backends such as vLLM. It measures the time taken to process a given number of prompts and generate sequences for each prompt. The recently generated KV cache scaling factors are now integrated into the benchmarking process and allow for KV cache scaling factors to be utilized for FP8.
diff --git a/examples/fp8/extract_scales.py b/examples/other/fp8/extract_scales.py
similarity index 100%
rename from examples/fp8/extract_scales.py
rename to examples/other/fp8/extract_scales.py
diff --git a/examples/fp8/quantizer/README.md b/examples/other/fp8/quantizer/README.md
similarity index 100%
rename from examples/fp8/quantizer/README.md
rename to examples/other/fp8/quantizer/README.md
diff --git a/examples/fp8/quantizer/quantize.py b/examples/other/fp8/quantizer/quantize.py
similarity index 100%
rename from examples/fp8/quantizer/quantize.py
rename to examples/other/fp8/quantizer/quantize.py
diff --git a/examples/logging_configuration.md b/examples/other/logging_configuration.md
similarity index 100%
rename from examples/logging_configuration.md
rename to examples/other/logging_configuration.md
diff --git a/examples/tensorize_vllm_model.py b/examples/other/tensorize_vllm_model.py
similarity index 96%
rename from examples/tensorize_vllm_model.py
rename to examples/other/tensorize_vllm_model.py
index dd77a4ad0c6b7..5fff1fdf502c9 100644
--- a/examples/tensorize_vllm_model.py
+++ b/examples/other/tensorize_vllm_model.py
@@ -25,7 +25,7 @@
 To serialize a model, install vLLM from source, then run something 
 like this from the root level of this repository:
 
-python -m examples.tensorize_vllm_model \
+python -m examples.offline_inference.tensorize_vllm_model \
    --model facebook/opt-125m \
    serialize \
    --serialized-directory s3://my-bucket \
@@ -45,7 +45,7 @@
 To deserialize a model, you can run something like this from the root 
 level of this repository:
 
-python -m examples.tensorize_vllm_model \
+python -m examples.offline_inference.tensorize_vllm_model \
    --model EleutherAI/gpt-j-6B \
    --dtype float16 \
    deserialize \
@@ -63,11 +63,11 @@
 model-rank-%03d.tensors
 
 For more information on the available arguments for serializing, run 
-`python -m examples.tensorize_vllm_model serialize --help`.
+`python -m examples.offline_inference.tensorize_vllm_model serialize --help`.
 
 Or for deserializing:
 
-`python -m examples.tensorize_vllm_model deserialize --help`.
+`python -m examples.offline_inference.tensorize_vllm_model deserialize --help`.
 
 Once a model is serialized, tensorizer can be invoked with the `LLM` class 
 directly to load models:
@@ -88,7 +88,7 @@
 In order to see all of the available arguments usable to configure 
 loading with tensorizer that are given to `TensorizerConfig`, run:
 
-`python -m examples.tensorize_vllm_model deserialize --help`
+`python -m examples.offline_inference.tensorize_vllm_model deserialize --help`
 
 under the `tensorizer options` section. These can also be used for
 deserialization in this example script, although `--tensorizer-uri` and
diff --git a/pyproject.toml b/pyproject.toml
index 45fa4bff4e680..0ac3f39ef7a5f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -20,7 +20,7 @@ build-backend = "setuptools.build_meta"
 line-length = 80
 exclude = [
     # External file, leaving license intact
-    "examples/fp8/quantizer/quantize.py"
+    "examples/other/fp8/quantizer/quantize.py"
 ]
 
 [tool.ruff.lint.per-file-ignores]
diff --git a/tests/plugins_tests/test_platform_plugins.py b/tests/plugins_tests/test_platform_plugins.py
index 0d27cf9f152e0..57518bd3e8299 100644
--- a/tests/plugins_tests/test_platform_plugins.py
+++ b/tests/plugins_tests/test_platform_plugins.py
@@ -5,7 +5,7 @@ def test_platform_plugins():
     import os
     example_file = os.path.join(
         os.path.dirname(os.path.dirname(os.path.dirname(current_file))),
-        "examples", "offline_inference.py")
+        "examples", "offline_inference/offline_inference.py")
     runpy.run_path(example_file)
 
     # check if the plugin is loaded correctly
diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py
index 0b0792b6b845f..bf409d2d97aa1 100644
--- a/tests/tensorizer_loader/test_tensorizer.py
+++ b/tests/tensorizer_loader/test_tensorizer.py
@@ -163,8 +163,8 @@ def test_deserialized_hf_model_has_same_outputs(hf_runner, vllm_runner,
 
 def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path):
     multilora_inference = import_from_path(
-        "examples.multilora_inference",
-        EXAMPLES_PATH / "multilora_inference.py",
+        "examples.offline_inference.multilora_inference",
+        EXAMPLES_PATH / "offline_inference/multilora_inference.py",
     )
 
     model_ref = "meta-llama/Llama-2-7b-hf"
diff --git a/tools/profiler/print_layerwise_table.py b/tools/profiler/print_layerwise_table.py
index 394ca8663e189..49366abc7fb56 100644
--- a/tools/profiler/print_layerwise_table.py
+++ b/tools/profiler/print_layerwise_table.py
@@ -31,7 +31,7 @@ def get_entries(node, curr_depth=0):
                         type=str,
                         required=True,
                         help="json trace file output by "
-                        "examples/offline_profile.py")
+                        "examples/offline_inference/offline_profile.py")
     parser.add_argument("--phase",
                         type=str,
                         required=True,
diff --git a/tools/profiler/visualize_layerwise_profile.py b/tools/profiler/visualize_layerwise_profile.py
index da7a28da15c19..fa88ed4204d8f 100644
--- a/tools/profiler/visualize_layerwise_profile.py
+++ b/tools/profiler/visualize_layerwise_profile.py
@@ -534,11 +534,11 @@ def make_plot_title_suffix(profile_json: dict) -> str:
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
 
-    parser.add_argument(
-        "--json-trace",
-        type=str,
-        required=True,
-        help="json trace file output by examples/offline_profile.py")
+    parser.add_argument("--json-trace",
+                        type=str,
+                        required=True,
+                        help="json trace file output by \
+                              examples/offline_inference/offline_profile.py")
     parser.add_argument("--output-directory",
                         type=str,
                         required=False,
diff --git a/vllm/distributed/kv_transfer/README.md b/vllm/distributed/kv_transfer/README.md
index dab2d10c4c9d0..e20c992a381a3 100644
--- a/vllm/distributed/kv_transfer/README.md
+++ b/vllm/distributed/kv_transfer/README.md
@@ -22,7 +22,7 @@ NOTE: If you want to not only transfer KV caches, but adjust the model execution
 
 ## Disaggregated prefilling
 
-The example usage is in [this file](../../../examples/disaggregated_prefill.sh).
+The example usage is in [this file](../../../examples/online_serving/disaggregated_prefill.sh).
 
 Here is the diagram of how we run disaggretgated prefilling.
 
diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index a9c1fa7221217..0033fbff0e9ac 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -452,9 +452,9 @@ def _load_model_serialized_cpu(
         """Load a serialized model with tensorizer to the CPU.
 
         This is only necessary when the model isn't vLLM-tensorized (see
-        examples/tensorize_vllm_model.py) This should still be faster than
-        default HuggingFace loading, but will be slower than loading a
-        vLLM-tensorized model.
+        examples/other/tensorize_vllm_model.py) This should still
+        be faster than default HuggingFace loading, but will be slower than
+        loading a vLLM-tensorized model.
         """
         device_config = vllm_config.device_config
         model_config = vllm_config.model_config
@@ -472,7 +472,7 @@ def _load_model_serialized(
         """Load a serialized model with tensorizer.
 
         Expects a vLLM-tensorized model. See the
-        examples/tensorize_vllm_model.py example script
+        examples/other/tensorize_vllm_model.py example script
         for serializing vLLM models."""
 
         device_config = vllm_config.device_config
@@ -529,7 +529,8 @@ class ShardedStateLoader(BaseModelLoader):
     Model loader that directly loads each worker's model state dict, which
     enables a fast load path for large tensor-parallel models where each worker
     only needs to read its own shard rather than the entire checkpoint. See
-    `examples/save_sharded_state.py` for creating a sharded checkpoint.
+    `examples/offline_inference/save_sharded_state.py` for creating a sharded
+    checkpoint.
     """
 
     DEFAULT_PATTERN = "model-rank-{rank}-part-{part}.safetensors"
diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py
index 8b929f299c8d8..fbd4937112e11 100644
--- a/vllm/model_executor/model_loader/tensorizer.py
+++ b/vllm/model_executor/model_loader/tensorizer.py
@@ -155,7 +155,7 @@ class TensorizerArgs:
       encryption_keyfile: File path to a binary file containing a  
           binary key to use for decryption. `None` (the default) means 
           no decryption. See the example script in 
-          examples/tensorize_vllm_model.py. 
+          examples/other/tensorize_vllm_model.py. 
       s3_access_key_id: The access key for the S3 bucket. Can also be set via
           the S3_ACCESS_KEY_ID environment variable.
       s3_secret_access_key: The secret access key for the S3 bucket. Can also
@@ -363,12 +363,12 @@ def deserialize(self):
 def tensorizer_weights_iterator(
     tensorizer_args: "TensorizerArgs"
 ) -> Generator[Tuple[str, torch.Tensor], None, None]:
-    logger.warning(
-        "Deserializing HuggingFace models is not optimized for "
-        "loading on vLLM, as tensorizer is forced to load to CPU. "
-        "Consider deserializing a vLLM model instead for faster "
-        "load times. See the examples/tensorize_vllm_model.py example "
-        "script for serializing vLLM models.")
+    logger.warning("Deserializing HuggingFace models is not optimized for "
+                   "loading on vLLM, as tensorizer is forced to load to CPU. "
+                   "Consider deserializing a vLLM model instead for faster "
+                   "load times. See the "
+                   "examples/other/tensorize_vllm_model.py example script "
+                   "for serializing vLLM models.")
 
     deserializer_args = tensorizer_args.deserializer_params
     stream_params = tensorizer_args.stream_params
diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
index 8aa0c98df70d2..a2c991cfdb74e 100644
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -503,7 +503,8 @@ def kv_cache_scales_loader(
     KV cache scaling factors. The serialization should represent a dictionary
     whose keys are the TP ranks and values are another dictionary mapping layers
     to their KV cache scaling factors.
-    Keep this function in sync with the output of examples/fp8/extract_scales.py
+    Keep this function in sync with the output of
+    examples/other/fp8/extract_scales.py
     """
     try:
         with open(filename) as f:

From 6cd40a5bfed24ef0ceca83b0450be6920d8ca6d4 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Wed, 8 Jan 2025 21:34:44 +0800
Subject: [PATCH 132/462] [Doc][4/N] Reorganize API Reference (#11843)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 .buildkite/test-pipeline.yaml                 |   2 +-
 Dockerfile                                    |   4 ++--
 .../{dev => api}/engine/async_llm_engine.md   |   0
 .../engine_index.md => api/engine/index.md}   |   0
 docs/source/{dev => api}/engine/llm_engine.md |   0
 .../multimodal/index.md}                      |  10 --------
 .../offline_inference/index.md}               |   0
 .../{dev => api}/offline_inference/llm.md     |   0
 .../offline_inference/llm_inputs.md           |   0
 docs/source/api/params.md                     |  22 ++++++++++++++++++
 .../dockerfile-stages-dependency.png          | Bin
 .../contributing/dockerfile/dockerfile.md     |   2 +-
 docs/source/design/arch_overview.md           |   2 +-
 .../multimodal/adding_multimodal_plugin.md    |  16 -------------
 docs/source/dev/pooling_params.md             |   6 -----
 docs/source/dev/sampling_params.md            |   6 -----
 docs/source/getting_started/quickstart.md     |   2 +-
 docs/source/index.md                          |   9 ++++---
 docs/source/serving/offline_inference.md      |   2 +-
 .../serving/openai_compatible_server.md       |   8 +++----
 vllm/multimodal/base.py                       |   3 ---
 vllm/multimodal/inputs.py                     |   6 -----
 vllm/multimodal/registry.py                   |   3 ---
 vllm/pooling_params.py                        |   2 +-
 24 files changed, 38 insertions(+), 67 deletions(-)
 rename docs/source/{dev => api}/engine/async_llm_engine.md (100%)
 rename docs/source/{dev/engine/engine_index.md => api/engine/index.md} (100%)
 rename docs/source/{dev => api}/engine/llm_engine.md (100%)
 rename docs/source/{design/multimodal/multimodal_index.md => api/multimodal/index.md} (84%)
 rename docs/source/{dev/offline_inference/offline_index.md => api/offline_inference/index.md} (100%)
 rename docs/source/{dev => api}/offline_inference/llm.md (100%)
 rename docs/source/{dev => api}/offline_inference/llm_inputs.md (100%)
 create mode 100644 docs/source/api/params.md
 rename docs/source/assets/{dev => contributing}/dockerfile-stages-dependency.png (100%)
 delete mode 100644 docs/source/design/multimodal/adding_multimodal_plugin.md
 delete mode 100644 docs/source/dev/pooling_params.md
 delete mode 100644 docs/source/dev/sampling_params.md

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index b7178b94f481a..f883595f6d9ad 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -38,7 +38,7 @@ steps:
   - pip install -r requirements-docs.txt
   - SPHINXOPTS=\"-W\" make html
   # Check API reference (if it fails, you may have missing mock imports)
-  - grep \"sig sig-object py\" build/html/dev/sampling_params.html
+  - grep \"sig sig-object py\" build/html/api/params.html
 
 - label: Async Engine, Inputs, Utils, Worker Test # 24min
   fast_check: true
diff --git a/Dockerfile b/Dockerfile
index 808cf675acf4d..4542bc9cf0bd2 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -2,8 +2,8 @@
 # to run the OpenAI compatible server.
 
 # Please update any changes made here to
-# docs/source/dev/dockerfile/dockerfile.md and
-# docs/source/assets/dev/dockerfile-stages-dependency.png
+# docs/source/contributing/dockerfile/dockerfile.md and
+# docs/source/assets/contributing/dockerfile-stages-dependency.png
 
 ARG CUDA_VERSION=12.4.1
 #################### BASE BUILD IMAGE ####################
diff --git a/docs/source/dev/engine/async_llm_engine.md b/docs/source/api/engine/async_llm_engine.md
similarity index 100%
rename from docs/source/dev/engine/async_llm_engine.md
rename to docs/source/api/engine/async_llm_engine.md
diff --git a/docs/source/dev/engine/engine_index.md b/docs/source/api/engine/index.md
similarity index 100%
rename from docs/source/dev/engine/engine_index.md
rename to docs/source/api/engine/index.md
diff --git a/docs/source/dev/engine/llm_engine.md b/docs/source/api/engine/llm_engine.md
similarity index 100%
rename from docs/source/dev/engine/llm_engine.md
rename to docs/source/api/engine/llm_engine.md
diff --git a/docs/source/design/multimodal/multimodal_index.md b/docs/source/api/multimodal/index.md
similarity index 84%
rename from docs/source/design/multimodal/multimodal_index.md
rename to docs/source/api/multimodal/index.md
index e4f2171e84ff7..0046b73ea825e 100644
--- a/docs/source/design/multimodal/multimodal_index.md
+++ b/docs/source/api/multimodal/index.md
@@ -11,18 +11,8 @@ vLLM provides experimental support for multi-modal models through the {mod}`vllm
 Multi-modal inputs can be passed alongside text and token prompts to [supported models](#supported-mm-models)
 via the `multi_modal_data` field in {class}`vllm.inputs.PromptType`.
 
-Currently, vLLM only has built-in support for image data. You can extend vLLM to process additional modalities
-by following [this guide](#adding-multimodal-plugin).
-
 Looking to add your own multi-modal model? Please follow the instructions listed [here](#enabling-multimodal-inputs).
 
-## Guides
-
-```{toctree}
-:maxdepth: 1
-
-adding_multimodal_plugin
-```
 
 ## Module Contents
 
diff --git a/docs/source/dev/offline_inference/offline_index.md b/docs/source/api/offline_inference/index.md
similarity index 100%
rename from docs/source/dev/offline_inference/offline_index.md
rename to docs/source/api/offline_inference/index.md
diff --git a/docs/source/dev/offline_inference/llm.md b/docs/source/api/offline_inference/llm.md
similarity index 100%
rename from docs/source/dev/offline_inference/llm.md
rename to docs/source/api/offline_inference/llm.md
diff --git a/docs/source/dev/offline_inference/llm_inputs.md b/docs/source/api/offline_inference/llm_inputs.md
similarity index 100%
rename from docs/source/dev/offline_inference/llm_inputs.md
rename to docs/source/api/offline_inference/llm_inputs.md
diff --git a/docs/source/api/params.md b/docs/source/api/params.md
new file mode 100644
index 0000000000000..a3b4d9cbb44ec
--- /dev/null
+++ b/docs/source/api/params.md
@@ -0,0 +1,22 @@
+# Optional Parameters
+
+Optional parameters for vLLM APIs.
+
+(sampling-params)=
+
+## Sampling Parameters
+
+```{eval-rst}
+.. autoclass:: vllm.SamplingParams
+    :members:
+```
+
+(pooling-params)=
+
+## Pooling Parameters
+
+```{eval-rst}
+.. autoclass:: vllm.PoolingParams
+    :members:
+```
+
diff --git a/docs/source/assets/dev/dockerfile-stages-dependency.png b/docs/source/assets/contributing/dockerfile-stages-dependency.png
similarity index 100%
rename from docs/source/assets/dev/dockerfile-stages-dependency.png
rename to docs/source/assets/contributing/dockerfile-stages-dependency.png
diff --git a/docs/source/contributing/dockerfile/dockerfile.md b/docs/source/contributing/dockerfile/dockerfile.md
index 38ea956ba8dfb..cb142318b8724 100644
--- a/docs/source/contributing/dockerfile/dockerfile.md
+++ b/docs/source/contributing/dockerfile/dockerfile.md
@@ -17,7 +17,7 @@ The edges of the build graph represent:
 
 - `RUN --mount=(.\*)from=...` dependencies (with a dotted line and an empty diamond arrow head)
 
-  > ```{figure} ../../assets/dev/dockerfile-stages-dependency.png
+  > ```{figure} /assets/contributing/dockerfile-stages-dependency.png
   > :align: center
   > :alt: query
   > :width: 100%
diff --git a/docs/source/design/arch_overview.md b/docs/source/design/arch_overview.md
index 5e0dd021ad02e..cec503ef2f77d 100644
--- a/docs/source/design/arch_overview.md
+++ b/docs/source/design/arch_overview.md
@@ -53,7 +53,7 @@ for output in outputs:
 ```
 
 More API details can be found in the {doc}`Offline Inference
-</dev/offline_inference/offline_index>` section of the API docs.
+</api/offline_inference/index>` section of the API docs.
 
 The code for the `LLM` class can be found in <gh-file:vllm/entrypoints/llm.py>.
 
diff --git a/docs/source/design/multimodal/adding_multimodal_plugin.md b/docs/source/design/multimodal/adding_multimodal_plugin.md
deleted file mode 100644
index bcccd284879bb..0000000000000
--- a/docs/source/design/multimodal/adding_multimodal_plugin.md
+++ /dev/null
@@ -1,16 +0,0 @@
-(adding-multimodal-plugin)=
-
-# Adding a Multimodal Plugin
-
-This document teaches you how to add a new modality to vLLM.
-
-Each modality in vLLM is represented by a {class}`~vllm.multimodal.MultiModalPlugin` and registered to {data}`~vllm.multimodal.MULTIMODAL_REGISTRY`.
-For vLLM to recognize a new modality type, you have to create a new plugin and then pass it to {meth}`~vllm.multimodal.MultiModalRegistry.register_plugin`.
-
-The remainder of this document details how to define custom {class}`~vllm.multimodal.MultiModalPlugin` s.
-
-```{note}
-This article is a work in progress.
-```
-
-% TODO: Add more instructions on how to add new plugins once embeddings is in.
diff --git a/docs/source/dev/pooling_params.md b/docs/source/dev/pooling_params.md
deleted file mode 100644
index 74b2c57443e4b..0000000000000
--- a/docs/source/dev/pooling_params.md
+++ /dev/null
@@ -1,6 +0,0 @@
-# Pooling Parameters
-
-```{eval-rst}
-.. autoclass:: vllm.PoolingParams
-    :members:
-```
diff --git a/docs/source/dev/sampling_params.md b/docs/source/dev/sampling_params.md
deleted file mode 100644
index bdc36af5153db..0000000000000
--- a/docs/source/dev/sampling_params.md
+++ /dev/null
@@ -1,6 +0,0 @@
-# Sampling Parameters
-
-```{eval-rst}
-.. autoclass:: vllm.SamplingParams
-    :members:
-```
diff --git a/docs/source/getting_started/quickstart.md b/docs/source/getting_started/quickstart.md
index 6b56918ce5638..2808e1b386801 100644
--- a/docs/source/getting_started/quickstart.md
+++ b/docs/source/getting_started/quickstart.md
@@ -42,7 +42,7 @@ The first line of this example imports the classes {class}`~vllm.LLM` and {class
 from vllm import LLM, SamplingParams
 ```
 
-The next section defines a list of input prompts and sampling parameters for text generation. The [sampling temperature](https://arxiv.org/html/2402.05201v1) is set to `0.8` and the [nucleus sampling probability](https://en.wikipedia.org/wiki/Top-p_sampling) is set to `0.95`. You can find more information about the sampling parameters [here](https://docs.vllm.ai/en/stable/dev/sampling_params.html).
+The next section defines a list of input prompts and sampling parameters for text generation. The [sampling temperature](https://arxiv.org/html/2402.05201v1) is set to `0.8` and the [nucleus sampling probability](https://en.wikipedia.org/wiki/Top-p_sampling) is set to `0.95`. You can find more information about the sampling parameters [here](#sampling-params).
 
 ```python
 prompts = [
diff --git a/docs/source/index.md b/docs/source/index.md
index 11d3e24a9b60a..6747a7fcce4fe 100644
--- a/docs/source/index.md
+++ b/docs/source/index.md
@@ -137,10 +137,10 @@ community/sponsors
 :caption: API Reference
 :maxdepth: 2
 
-dev/sampling_params
-dev/pooling_params
-dev/offline_inference/offline_index
-dev/engine/engine_index
+api/offline_inference/index
+api/engine/index
+api/multimodal/index
+api/params
 ```
 
 % Design Documents: Details about vLLM internals
@@ -154,7 +154,6 @@ design/huggingface_integration
 design/plugin_system
 design/kernel/paged_attention
 design/input_processing/model_inputs_index
-design/multimodal/multimodal_index
 design/automatic_prefix_caching
 design/multiprocessing
 ```
diff --git a/docs/source/serving/offline_inference.md b/docs/source/serving/offline_inference.md
index 83178f7811825..79092ab208784 100644
--- a/docs/source/serving/offline_inference.md
+++ b/docs/source/serving/offline_inference.md
@@ -23,7 +23,7 @@ The available APIs depend on the type of model that is being run:
 Please refer to the above pages for more details about each API.
 
 ```{seealso}
-[API Reference](/dev/offline_inference/offline_index)
+[API Reference](/api/offline_inference/index)
 ```
 
 ## Configuration Options
diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md
index 022dd3ae8a237..ec5a367594743 100644
--- a/docs/source/serving/openai_compatible_server.md
+++ b/docs/source/serving/openai_compatible_server.md
@@ -195,7 +195,7 @@ Code example: <gh-file:examples/online_serving/openai_completion_client.py>
 
 #### Extra parameters
 
-The following [sampling parameters (click through to see documentation)](../dev/sampling_params.md) are supported.
+The following [sampling parameters](#sampling-params) are supported.
 
 ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
 :language: python
@@ -226,7 +226,7 @@ Code example: <gh-file:examples/online_serving/openai_chat_completion_client.py>
 
 #### Extra parameters
 
-The following [sampling parameters (click through to see documentation)](../dev/sampling_params.md) are supported.
+The following [sampling parameters](#sampling-params) are supported.
 
 ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
 :language: python
@@ -259,7 +259,7 @@ Code example: <gh-file:examples/online_serving/openai_embedding_client.py>
 
 #### Extra parameters
 
-The following [pooling parameters (click through to see documentation)](../dev/pooling_params.md) are supported.
+The following [pooling parameters](#pooling-params) are supported.
 
 ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
 :language: python
@@ -447,7 +447,7 @@ Response:
 
 #### Extra parameters
 
-The following [pooling parameters (click through to see documentation)](../dev/pooling_params.md) are supported.
+The following [pooling parameters](#pooling-params) are supported.
 
 ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
 :language: python
diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py
index 7f4029e726332..4941fbac963ca 100644
--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
@@ -49,9 +49,6 @@ class MultiModalPlugin(ABC):
     process the same data differently). This registry is in turn used by
     :class:`~MultiModalRegistry` which acts at a higher level
     (i.e., the modality of the data).
-
-    See also:
-        :ref:`adding-multimodal-plugin`
     """
 
     def __init__(self) -> None:
diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py
index 8fdcc4b524035..d542461874866 100644
--- a/vllm/multimodal/inputs.py
+++ b/vllm/multimodal/inputs.py
@@ -99,12 +99,6 @@ class MultiModalDataBuiltins(TypedDict, total=False):
 MultiModalDataDict: TypeAlias = Mapping[str, ModalityData[Any]]
 """
 A dictionary containing an entry for each modality type to input.
-
-Note:
-    This dictionary also accepts modality keys defined outside
-    :class:`MultiModalDataBuiltins` as long as a customized plugin
-    is registered through the :class:`~vllm.multimodal.MULTIMODAL_REGISTRY`.
-    Read more on that :ref:`here <adding-multimodal-plugin>`.
 """
 
 
diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index 5f01eac4edade..9eceefb08c93f 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -125,9 +125,6 @@ def __init__(
     def register_plugin(self, plugin: MultiModalPlugin) -> None:
         """
         Register a multi-modal plugin so it can be recognized by vLLM.
-
-        See also:
-            :ref:`adding-multimodal-plugin`
         """
         data_type_key = plugin.get_data_key()
 
diff --git a/vllm/pooling_params.py b/vllm/pooling_params.py
index 2635c0bccd1c4..b24b7e91a7ae7 100644
--- a/vllm/pooling_params.py
+++ b/vllm/pooling_params.py
@@ -7,7 +7,7 @@ class PoolingParams(
         msgspec.Struct,
         omit_defaults=True,  # type: ignore[call-arg]
         array_like=True):  # type: ignore[call-arg]
-    """Pooling parameters for embeddings API.
+    """API parameters for pooling models. This is currently a placeholder.
 
     Attributes:
         additional_data: Any additional data needed for pooling.

From 2f7024987e582b85b280909b87287668cd97c92f Mon Sep 17 00:00:00 2001
From: "Li, Jiang" <jiang1.li@intel.com>
Date: Wed, 8 Jan 2025 23:18:28 +0800
Subject: [PATCH 133/462] [CI/Build][Bugfix] Fix CPU CI image clean up (#11836)

Signed-off-by: jiang1.li <jiang1.li@intel.com>
---
 .buildkite/run-cpu-test.sh               | 7 ++-----
 vllm/model_executor/layers/activation.py | 2 +-
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh
index 87d08c8c7fdcb..1a4dae8f65e99 100644
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -13,7 +13,7 @@ numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test-"$BUILDKITE_BU
 numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 -f Dockerfile.cpu .
 
 # Setup cleanup
-remove_docker_container() { docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; }
+remove_docker_container() { set -e; docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; }
 trap remove_docker_container EXIT
 remove_docker_container
 
@@ -35,10 +35,7 @@ function cpu_tests() {
   # Run basic model test
   docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
     set -e
-    pip install pytest pytest-asyncio \
-      decord einops librosa peft Pillow sentence-transformers soundfile \
-      transformers_stream_generator matplotlib datamodel_code_generator
-    pip install torchvision --index-url https://download.pytorch.org/whl/cpu
+    pip install -r vllm/requirements-test.txt
     pytest -v -s tests/models/decoder_only/language -m cpu_model
     pytest -v -s tests/models/embedding/language -m cpu_model
     pytest -v -s tests/models/encoder_decoder/language -m cpu_model
diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py
index 46d4670bfe4f9..b8a302cf5087f 100644
--- a/vllm/model_executor/layers/activation.py
+++ b/vllm/model_executor/layers/activation.py
@@ -61,7 +61,7 @@ class SiluAndMul(CustomOp):
 
     def __init__(self):
         super().__init__()
-        if current_platform.is_cuda_alike():
+        if current_platform.is_cuda_alike() or current_platform.is_cpu():
             self.op = torch.ops._C.silu_and_mul
         elif current_platform.is_xpu():
             import intel_extension_for_pytorch as ipex

From 78f4590b60161dee1a444870ae682ba45f633502 Mon Sep 17 00:00:00 2001
From: Yan Ma <yan.ma@intel.com>
Date: Thu, 9 Jan 2025 00:11:50 +0800
Subject: [PATCH 134/462] [Bugfix][XPU] fix silu_and_mul (#11823)

Signed-off-by: yan ma <yan.ma@intel.com>
---
 vllm/model_executor/layers/activation.py | 4 ++--
 vllm/plugins/__init__.py                 | 5 ++---
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py
index b8a302cf5087f..32456fee06a28 100644
--- a/vllm/model_executor/layers/activation.py
+++ b/vllm/model_executor/layers/activation.py
@@ -64,8 +64,8 @@ def __init__(self):
         if current_platform.is_cuda_alike() or current_platform.is_cpu():
             self.op = torch.ops._C.silu_and_mul
         elif current_platform.is_xpu():
-            import intel_extension_for_pytorch as ipex
-            self.op = ipex.llm.functional.silu_and_mul
+            from vllm._ipex_ops import ipex_ops
+            self.op = ipex_ops.silu_and_mul
 
     def forward_native(self, x: torch.Tensor) -> torch.Tensor:
         """PyTorch-native implementation equivalent to forward()."""
diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py
index c50eb2cef4cd5..e5fa4f0e4a2f6 100644
--- a/vllm/plugins/__init__.py
+++ b/vllm/plugins/__init__.py
@@ -63,8 +63,8 @@ def load_general_plugins():
     from vllm.platforms import current_platform
 
     if current_platform.is_xpu():
-        # see https://github.com/pytorch/pytorch/blob/8cada5cbe5450e17c26fb8b358116785324537b2/torch/_dynamo/config.py#L158  # noqa
-        os.environ['TORCH_COMPILE_DISABLE'] = 'True'
+        # see https://github.com/pytorch/pytorch/blob/43c5f59/torch/_dynamo/config.py#L158
+        torch._dynamo.config.disable = True
     if current_platform.is_hpu():
         # NOTE(kzawora): PT HPU lazy backend (PT_HPU_LAZY_MODE = 1)
         # does not support torch.compile
@@ -72,7 +72,6 @@ def load_general_plugins():
         # torch.compile support
         is_lazy = os.environ.get('PT_HPU_LAZY_MODE', '1') == '1'
         if is_lazy:
-            # see https://github.com/pytorch/pytorch/blob/43c5f59/torch/_dynamo/config.py#L158
             torch._dynamo.config.disable = True
             # NOTE(kzawora) multi-HPU inference with HPUGraphs (lazy-only)
             # requires enabling lazy collectives

From ca47e176af9e0a4fa9f02325cdad5f11b40aedab Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Thu, 9 Jan 2025 01:04:46 +0800
Subject: [PATCH 135/462] [Misc] Move some model utils into vision file
 (#11848)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 vllm/model_executor/models/clip.py     |  5 +-
 vllm/model_executor/models/pixtral.py  |  5 +-
 vllm/model_executor/models/qwen2_vl.py |  3 +-
 vllm/model_executor/models/siglip.py   |  5 +-
 vllm/model_executor/models/utils.py    | 37 +-----------
 vllm/model_executor/models/vision.py   | 83 +++++++++++++++++++++++++-
 vllm/multimodal/inputs.py              |  4 +-
 vllm/multimodal/utils.py               | 44 --------------
 8 files changed, 94 insertions(+), 92 deletions(-)

diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py
index 1bde45cb140cb..dd69f6c9a5aff 100644
--- a/vllm/model_executor/models/clip.py
+++ b/vllm/model_executor/models/clip.py
@@ -20,11 +20,10 @@
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.multimodal.utils import (cached_get_tokenizer,
                                    consecutive_placeholder_ranges,
-                                   repeat_and_pad_placeholder_tokens,
-                                   resolve_visual_encoder_outputs)
+                                   repeat_and_pad_placeholder_tokens)
 from vllm.sequence import SequenceData
 
-from .vision import VisionEncoderInfo
+from .vision import VisionEncoderInfo, resolve_visual_encoder_outputs
 
 
 def get_clip_patch_grid_length(*, image_size: int, patch_size: int) -> int:
diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py
index b74bb3c8a3f88..37b9989e489ec 100644
--- a/vllm/model_executor/models/pixtral.py
+++ b/vllm/model_executor/models/pixtral.py
@@ -31,14 +31,13 @@
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
 from vllm.multimodal.inputs import NestedTensors, PlaceholderRange
 from vllm.multimodal.utils import (cached_get_tokenizer,
-                                   consecutive_placeholder_ranges,
-                                   resolve_visual_encoder_outputs)
+                                   consecutive_placeholder_ranges)
 from vllm.sequence import IntermediateTensors, SequenceData
 
 from .interfaces import SupportsMultiModal, SupportsPP
 from .utils import (init_vllm_registered_model, maybe_prefix,
                     merge_multimodal_embeddings)
-from .vision import VisionEncoderInfo
+from .vision import VisionEncoderInfo, resolve_visual_encoder_outputs
 
 try:
     from xformers import ops as xops
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index 8537fec854b6d..76a810e8f0c20 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -66,8 +66,9 @@
 from vllm.transformers_utils.config import uses_mrope
 
 from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP
-from .utils import (AutoWeightsLoader, WeightsMapper, get_vit_attn_backend,
+from .utils import (AutoWeightsLoader, WeightsMapper,
                     init_vllm_registered_model, maybe_prefix)
+from .vision import get_vit_attn_backend
 
 logger = init_logger(__name__)
 
diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py
index 7ea177e94afc0..cca42842bc06e 100644
--- a/vllm/model_executor/models/siglip.py
+++ b/vllm/model_executor/models/siglip.py
@@ -24,11 +24,10 @@
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.multimodal.utils import (cached_get_tokenizer,
                                    consecutive_placeholder_ranges,
-                                   repeat_and_pad_placeholder_tokens,
-                                   resolve_visual_encoder_outputs)
+                                   repeat_and_pad_placeholder_tokens)
 from vllm.sequence import SequenceData
 
-from .vision import VisionEncoderInfo
+from .vision import VisionEncoderInfo, resolve_visual_encoder_outputs
 
 
 def get_siglip_patch_grid_length(*, image_size: int, patch_size: int) -> int:
diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py
index 4ed3b237ae0e2..43b3c973c97b8 100644
--- a/vllm/model_executor/models/utils.py
+++ b/vllm/model_executor/models/utils.py
@@ -8,16 +8,12 @@
 from torch.func import functional_call
 from transformers import PretrainedConfig
 
-import vllm.envs as envs
-from vllm.attention.selector import (backend_name_to_enum,
-                                     get_global_forced_attn_backend)
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.multimodal import MultiModalPlaceholderMap, NestedTensors
-from vllm.platforms import _Backend, current_platform
 from vllm.sequence import IntermediateTensors
-from vllm.utils import is_pin_memory_available, print_warning_once
+from vllm.utils import is_pin_memory_available
 
 logger = init_logger(__name__)
 
@@ -612,37 +608,6 @@ def make_empty_intermediate_tensors(
     return make_empty_intermediate_tensors
 
 
-def get_vit_attn_backend(support_fa: bool = False) -> _Backend:
-    """
-    Get the available attention backend for Vision Transformer.
-    """
-    # TODO(Isotr0py): Remove `support_fa` after support FA for all ViTs attn.
-    selected_backend: Optional[_Backend] = get_global_forced_attn_backend()
-    if selected_backend is None:
-        backend_by_env_var: Optional[str] = envs.VLLM_ATTENTION_BACKEND
-        if backend_by_env_var is not None:
-            selected_backend = backend_name_to_enum(backend_by_env_var)
-    if selected_backend is None:
-        # For Volta and Turing GPUs, use xformers instead.
-        device_available = current_platform.has_device_capability(80)
-        if device_available and support_fa:
-            from transformers.utils import is_flash_attn_2_available
-            if is_flash_attn_2_available():
-                selected_backend = _Backend.FLASH_ATTN
-            else:
-                print_warning_once(
-                    "Current `vllm-flash-attn` has a bug inside vision module, "
-                    "so we use xformers backend instead. You can run "
-                    "`pip install flash-attn` to use flash-attention backend.")
-                selected_backend = _Backend.XFORMERS
-        elif current_platform.is_cpu() or current_platform.is_rocm():
-            # ROCM doesn't support xformers
-            selected_backend = _Backend.TORCH_SDPA
-        else:
-            selected_backend = _Backend.XFORMERS
-    return selected_backend
-
-
 def maybe_prefix(prefix: str, name: str) -> str:
     """Add a prefix to a name if the prefix is non-empty.
 
diff --git a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py
index 8516c9f7066f7..e6a9e153d9107 100644
--- a/vllm/model_executor/models/vision.py
+++ b/vllm/model_executor/models/vision.py
@@ -1,8 +1,15 @@
 from abc import ABC, abstractmethod
-from typing import Final, Generic, Protocol, TypeVar
+from typing import Final, Generic, Optional, Protocol, TypeVar, Union
 
+import torch
 from transformers import PretrainedConfig
 
+import vllm.envs as envs
+from vllm.attention.selector import (backend_name_to_enum,
+                                     get_global_forced_attn_backend)
+from vllm.platforms import _Backend, current_platform
+from vllm.utils import print_warning_once
+
 _C = TypeVar("_C", bound=PretrainedConfig)
 
 
@@ -60,3 +67,77 @@ def get_vision_encoder_info(
 
     msg = f"Unsupported vision config: {type(vision_config)}"
     raise NotImplementedError(msg)
+
+
+def get_vit_attn_backend(support_fa: bool = False) -> _Backend:
+    """
+    Get the available attention backend for Vision Transformer.
+    """
+    # TODO(Isotr0py): Remove `support_fa` after support FA for all ViTs attn.
+    selected_backend: Optional[_Backend] = get_global_forced_attn_backend()
+    if selected_backend is None:
+        backend_by_env_var: Optional[str] = envs.VLLM_ATTENTION_BACKEND
+        if backend_by_env_var is not None:
+            selected_backend = backend_name_to_enum(backend_by_env_var)
+    if selected_backend is None:
+        # For Volta and Turing GPUs, use xformers instead.
+        device_available = current_platform.has_device_capability(80)
+        if device_available and support_fa:
+            from transformers.utils import is_flash_attn_2_available
+            if is_flash_attn_2_available():
+                selected_backend = _Backend.FLASH_ATTN
+            else:
+                print_warning_once(
+                    "Current `vllm-flash-attn` has a bug inside vision module, "
+                    "so we use xformers backend instead. You can run "
+                    "`pip install flash-attn` to use flash-attention backend.")
+                selected_backend = _Backend.XFORMERS
+        elif current_platform.is_cpu() or current_platform.is_rocm():
+            # ROCM doesn't support xformers
+            selected_backend = _Backend.TORCH_SDPA
+        else:
+            selected_backend = _Backend.XFORMERS
+    return selected_backend
+
+
+def resolve_visual_encoder_outputs(
+    encoder_outputs: Union[torch.Tensor, list[torch.Tensor]],
+    feature_sample_layers: Optional[list[int]],
+    post_layer_norm: Optional[torch.nn.LayerNorm],
+    max_possible_layers: int,
+) -> torch.Tensor:
+    """Given the outputs a visual encoder module that may correspond to the
+    output of the last layer, or a list of hidden states to be stacked,
+    handle post normalization and resolve it into a single output tensor.
+
+    Args:
+        encoder_outputs: Output of encoder's last layer or all hidden states.
+        feature_sample_layers: Optional layer indices to grab from the encoder
+            outputs; if provided, encoder outputs must be a list.
+        post_layer_norm: Post norm to apply to the output of the encoder.
+        max_possible_layers: Total layers in the fully loaded visual encoder.
+
+    """
+    if feature_sample_layers is None:
+        if post_layer_norm is not None:
+            return post_layer_norm(encoder_outputs)
+        return encoder_outputs
+
+    # Get the hidden states corresponding to the layer indices.
+    # Negative values are relative to the full visual encoder,
+    # so offset them depending on how many layers were loaded.
+    # NOTE: this assumes that encoder_outputs contains a list
+    # of hidden states in the same order as the encoder layers
+    # that produced them.
+    offset = max_possible_layers - len(encoder_outputs)
+    hs_pool = [
+        encoder_outputs[layer_idx]
+        if layer_idx >= 0 else encoder_outputs[layer_idx + offset]
+        for layer_idx in feature_sample_layers
+    ]
+
+    # Apply post-norm on the final hidden state if we are using it
+    uses_last_layer = feature_sample_layers[-1] in (len(hs_pool) - 1, -1)
+    if post_layer_norm is not None and uses_last_layer:
+        hs_pool[-1] = post_layer_norm(encoder_outputs)
+    return torch.cat(hs_pool, dim=-1)
diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py
index d542461874866..8680e4175593b 100644
--- a/vllm/multimodal/inputs.py
+++ b/vllm/multimodal/inputs.py
@@ -99,6 +99,8 @@ class MultiModalDataBuiltins(TypedDict, total=False):
 MultiModalDataDict: TypeAlias = Mapping[str, ModalityData[Any]]
 """
 A dictionary containing an entry for each modality type to input.
+
+The built-in modalities are defined by :class:`MultiModalDataBuiltins`.
 """
 
 
@@ -485,7 +487,7 @@ def get_items(self, modality: str) -> Sequence[MultiModalKwargsItem]:
 
 MultiModalPlaceholderDict = Mapping[str, Sequence[PlaceholderRange]]
 """
-A dictionary containing placeholder ranges.
+A dictionary containing placeholder ranges for each modality.
 """
 
 
diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py
index f4a514ba55d0c..1c6bbf77b926f 100644
--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -5,7 +5,6 @@
 
 import numpy as np
 import numpy.typing as npt
-import torch
 from PIL import Image
 
 import vllm.envs as envs
@@ -285,49 +284,6 @@ def encode_video_base64(frames: npt.NDArray) -> str:
     return video_io.encode_base64(frames)
 
 
-def resolve_visual_encoder_outputs(
-    encoder_outputs: Union[torch.Tensor, list[torch.Tensor]],
-    feature_sample_layers: Optional[list[int]],
-    post_layer_norm: Optional[torch.nn.LayerNorm],
-    max_possible_layers: int,
-) -> torch.Tensor:
-    """Given the outputs a visual encoder module that may correspond to the
-    output of the last layer, or a list of hidden states to be stacked,
-    handle post normalization and resolve it into a single output tensor.
-
-    Args:
-        encoder_outputs: Output of encoder's last layer or all hidden states.
-        feature_sample_layers: Optional layer indices to grab from the encoder
-            outputs; if provided, encoder outputs must be a list.
-        post_layer_norm: Post norm to apply to the output of the encoder.
-        max_possible_layers: Total layers in the fully loaded visual encoder.
-
-    """
-    if feature_sample_layers is None:
-        if post_layer_norm is not None:
-            return post_layer_norm(encoder_outputs)
-        return encoder_outputs
-
-    # Get the hidden states corresponding to the layer indices.
-    # Negative values are relative to the full visual encoder,
-    # so offset them depending on how many layers were loaded.
-    # NOTE: this assumes that encoder_outputs contains a list
-    # of hidden states in the same order as the encoder layers
-    # that produced them.
-    offset = max_possible_layers - len(encoder_outputs)
-    hs_pool = [
-        encoder_outputs[layer_idx]
-        if layer_idx >= 0 else encoder_outputs[layer_idx + offset]
-        for layer_idx in feature_sample_layers
-    ]
-
-    # Apply post-norm on the final hidden state if we are using it
-    uses_last_layer = feature_sample_layers[-1] in (len(hs_pool) - 1, -1)
-    if post_layer_norm is not None and uses_last_layer:
-        hs_pool[-1] = post_layer_norm(encoder_outputs)
-    return torch.cat(hs_pool, dim=-1)
-
-
 # Utilities for input processors
 _T = TypeVar("_T", str, int)
 

From 5984499e473c387c17904aa9933b8ed080621ca6 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Thu, 9 Jan 2025 01:14:14 +0800
Subject: [PATCH 136/462] [Doc] Expand Multimodal API Reference (#11852)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 docs/source/api/multimodal/index.md      | 61 ++++--------------------
 docs/source/api/multimodal/inputs.md     | 49 +++++++++++++++++++
 docs/source/api/multimodal/parse.md      |  9 ++++
 docs/source/api/multimodal/processing.md |  9 ++++
 docs/source/api/multimodal/profiling.md  |  9 ++++
 docs/source/api/multimodal/registry.md   |  9 ++++
 vllm/multimodal/parse.py                 | 31 ++++++++----
 vllm/multimodal/processing.py            | 26 +++++++---
 vllm/multimodal/profiling.py             |  7 ++-
 9 files changed, 139 insertions(+), 71 deletions(-)
 create mode 100644 docs/source/api/multimodal/inputs.md
 create mode 100644 docs/source/api/multimodal/parse.md
 create mode 100644 docs/source/api/multimodal/processing.md
 create mode 100644 docs/source/api/multimodal/profiling.md
 create mode 100644 docs/source/api/multimodal/registry.md

diff --git a/docs/source/api/multimodal/index.md b/docs/source/api/multimodal/index.md
index 0046b73ea825e..51e24795a34cf 100644
--- a/docs/source/api/multimodal/index.md
+++ b/docs/source/api/multimodal/index.md
@@ -2,10 +2,6 @@
 
 # Multi-Modality
 
-```{eval-rst}
-.. currentmodule:: vllm.multimodal
-```
-
 vLLM provides experimental support for multi-modal models through the {mod}`vllm.multimodal` package.
 
 Multi-modal inputs can be passed alongside text and token prompts to [supported models](#supported-mm-models)
@@ -13,61 +9,20 @@ via the `multi_modal_data` field in {class}`vllm.inputs.PromptType`.
 
 Looking to add your own multi-modal model? Please follow the instructions listed [here](#enabling-multimodal-inputs).
 
-
 ## Module Contents
 
-```{eval-rst}
-.. automodule:: vllm.multimodal
-```
-
-### Registry
-
 ```{eval-rst}
 .. autodata:: vllm.multimodal.MULTIMODAL_REGISTRY
 ```
 
-```{eval-rst}
-.. autoclass:: vllm.multimodal.MultiModalRegistry
-    :members:
-    :show-inheritance:
-```
-
-### Base Classes
-
-```{eval-rst}
-.. automodule:: vllm.multimodal.base
-    :members:
-    :show-inheritance:
-```
+## Submodules
 
-### Input Classes
+```{toctree}
+:maxdepth: 1
 
-```{eval-rst}
-.. automodule:: vllm.multimodal.inputs
-    :members:
-    :show-inheritance:
-```
-
-### Audio Classes
-
-```{eval-rst}
-.. automodule:: vllm.multimodal.audio
-    :members:
-    :show-inheritance:
-```
-
-### Image Classes
-
-```{eval-rst}
-.. automodule:: vllm.multimodal.image
-    :members:
-    :show-inheritance:
-```
-
-### Video Classes
-
-```{eval-rst}
-.. automodule:: vllm.multimodal.video
-    :members:
-    :show-inheritance:
+inputs
+parse
+processing
+profiling
+registry
 ```
diff --git a/docs/source/api/multimodal/inputs.md b/docs/source/api/multimodal/inputs.md
new file mode 100644
index 0000000000000..3d89666113229
--- /dev/null
+++ b/docs/source/api/multimodal/inputs.md
@@ -0,0 +1,49 @@
+# Input Definitions
+
+## User-facing inputs
+
+```{eval-rst}
+.. autodata:: vllm.multimodal.MultiModalDataDict
+```
+
+## Internal data structures
+
+```{eval-rst}
+.. autoclass:: vllm.multimodal.inputs.PlaceholderRange
+    :members:
+    :show-inheritance:
+```
+
+```{eval-rst}
+.. autodata:: vllm.multimodal.inputs.NestedTensors
+```
+
+```{eval-rst}
+.. autoclass:: vllm.multimodal.inputs.MultiModalFieldElem
+    :members:
+    :show-inheritance:
+```
+
+```{eval-rst}
+.. autoclass:: vllm.multimodal.inputs.MultiModalFieldConfig
+    :members:
+    :show-inheritance:
+```
+
+```{eval-rst}
+.. autoclass:: vllm.multimodal.inputs.MultiModalKwargsItem
+    :members:
+    :show-inheritance:
+```
+
+```{eval-rst}
+.. autoclass:: vllm.multimodal.inputs.MultiModalKwargs
+    :members:
+    :show-inheritance:
+```
+
+```{eval-rst}
+.. autoclass:: vllm.multimodal.inputs.MultiModalInputsV2
+    :members:
+    :show-inheritance:
+```
diff --git a/docs/source/api/multimodal/parse.md b/docs/source/api/multimodal/parse.md
new file mode 100644
index 0000000000000..4676139efe626
--- /dev/null
+++ b/docs/source/api/multimodal/parse.md
@@ -0,0 +1,9 @@
+# Data Parsing
+
+## Module Contents
+
+```{eval-rst}
+.. automodule:: vllm.multimodal.parse
+    :members:
+    :member-order: bysource
+```
diff --git a/docs/source/api/multimodal/processing.md b/docs/source/api/multimodal/processing.md
new file mode 100644
index 0000000000000..0d81c8d3966ee
--- /dev/null
+++ b/docs/source/api/multimodal/processing.md
@@ -0,0 +1,9 @@
+# Data Processing
+
+## Module Contents
+
+```{eval-rst}
+.. automodule:: vllm.multimodal.processing
+    :members:
+    :member-order: bysource
+```
diff --git a/docs/source/api/multimodal/profiling.md b/docs/source/api/multimodal/profiling.md
new file mode 100644
index 0000000000000..b455145212202
--- /dev/null
+++ b/docs/source/api/multimodal/profiling.md
@@ -0,0 +1,9 @@
+# Memory Profiling
+
+## Module Contents
+
+```{eval-rst}
+.. automodule:: vllm.multimodal.profiling
+    :members:
+    :member-order: bysource
+```
diff --git a/docs/source/api/multimodal/registry.md b/docs/source/api/multimodal/registry.md
new file mode 100644
index 0000000000000..0737a4385cf32
--- /dev/null
+++ b/docs/source/api/multimodal/registry.md
@@ -0,0 +1,9 @@
+# Registry
+
+## Module Contents
+
+```{eval-rst}
+.. automodule:: vllm.multimodal.registry
+    :members:
+    :member-order: bysource
+```
diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py
index 6be046ba77ca7..ccff0e857eec4 100644
--- a/vllm/multimodal/parse.py
+++ b/vllm/multimodal/parse.py
@@ -13,14 +13,16 @@
 
 from .audio import resample_audio
 from .inputs import (AudioItem, HfAudioItem, HfImageItem, HfVideoItem,
-                     ImageItem, ModalityData, MultiModalDataDict,
-                     NestedTensors, VideoItem)
+                     ImageItem, ModalityData, MultiModalDataDict, VideoItem)
 
 _T = TypeVar("_T")
 _I = TypeVar("_I")
 
 
 class ModalityDataItems(ABC, Generic[_T, _I]):
+    """
+    Represents data items for a modality in :class:`MultiModalDataItems`.
+    """
 
     def __init__(self, data: _T, modality: str) -> None:
         super().__init__()
@@ -69,6 +71,7 @@ def get_passthrough_data(self) -> Mapping[str, object]:
 
 
 class ProcessorBatchItems(ModalityDataItems[Sequence[_T], _T]):
+    """Base class for data items that are arranged in a list."""
 
     def get_count(self) -> int:
         return len(self.data)
@@ -83,7 +86,12 @@ def get_passthrough_data(self) -> Mapping[str, object]:
         return {}
 
 
-class EmbeddingItems(ModalityDataItems[NestedTensors, torch.Tensor]):
+class EmbeddingItems(ModalityDataItems[Union[torch.Tensor, list[torch.Tensor]],
+                                       torch.Tensor]):
+    """
+    Base class for data items that are expressed as a batched embedding tensor,
+    or a list of embedding tensors (one per item).
+    """
 
     def get_count(self) -> int:
         return len(self.data)
@@ -109,7 +117,7 @@ def __init__(self, data: Sequence[HfAudioItem]) -> None:
 
 class AudioEmbeddingItems(EmbeddingItems):
 
-    def __init__(self, data: NestedTensors) -> None:
+    def __init__(self, data: Union[torch.Tensor, list[torch.Tensor]]) -> None:
         super().__init__(data, "audio")
 
 
@@ -137,7 +145,7 @@ def get_image_size(self, item_idx: int) -> ImageSize:
 
 class ImageEmbeddingItems(EmbeddingItems):
 
-    def __init__(self, data: NestedTensors) -> None:
+    def __init__(self, data: Union[torch.Tensor, list[torch.Tensor]]) -> None:
         super().__init__(data, "image")
 
 
@@ -163,7 +171,7 @@ def get_frame_size(self, item_idx: int) -> ImageSize:
 
 class VideoEmbeddingItems(EmbeddingItems):
 
-    def __init__(self, data: NestedTensors) -> None:
+    def __init__(self, data: Union[torch.Tensor, list[torch.Tensor]]) -> None:
         super().__init__(data, "video")
 
 
@@ -172,8 +180,8 @@ def __init__(self, data: NestedTensors) -> None:
 
 class MultiModalDataItems(UserDict[str, ModalityDataItems[Any, Any]]):
     """
-    As :class:`MultiModalDataDict`, but normalized such that each entry
-    corresponds to a list.
+    As :data:`~vllm.multimodal.inputs.MultiModalDataDict`, but normalized
+    such that each entry corresponds to a list.
     """
 
     def get_count(self, modality: str, *, strict: bool = True) -> int:
@@ -226,7 +234,8 @@ def get_items(
 
 class MultiModalDataParser:
     """
-    Parses :class:`MultiModalDataDict` into :class:`MultiModalDataItems`.
+    Parses :data:`~vllm.multimodal.inputs.MultiModalDataDict` into
+    :class:`MultiModalDataItems`.
 
     Args:
         target_sr (float, optional): Enables automatic resampling of audio
@@ -238,7 +247,9 @@ def __init__(self, *, target_sr: Optional[float] = None) -> None:
 
         self.target_sr = target_sr
 
-    def _is_embeddings(self, data: object) -> TypeGuard[NestedTensors]:
+    def _is_embeddings(
+            self, data: object
+    ) -> TypeGuard[Union[torch.Tensor, list[torch.Tensor]]]:
         if isinstance(data, torch.Tensor):
             return data.ndim == 3
         if is_list_of(data, torch.Tensor):
diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index c6a30cacebdd1..07d883d5d7295 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -33,20 +33,24 @@
 
 @dataclass
 class PromptReplacement:
+    """
+    Defines how to replace portions of an input prompt with placeholder tokens.
+    """
+
     modality: str
     """The modality for which the replacement is made."""
 
     target: _PromptSeq
-    """The text or token sequence to find and replace."""
+    """The token sequence (or text) to find and replace."""
 
     replacement: Union[Callable[[int], _PromptSeq],
                        _PromptSeq] = field(repr=False)
     """
-    Given the index of the processed item within :attr:`modality`, output the
-    replacement text or token sequence.
+    Given the index of the processed item within :attr:`modality`,
+    output the replacement token sequence (or text).
 
-    For convenience, you can pass in the replacement instead of a function
-    if it does not depend on the input.
+    For convenience, you can directly pass in the replacement token sequence
+    (or text) instead of a function if it does not depend on the input.
     """
 
     def bind(self, tokenizer: AnyTokenizer) -> "BoundPromptReplacement":
@@ -132,6 +136,11 @@ def token_ids(self) -> list[int]:
 
 @dataclass
 class BoundPromptReplacement:
+    """
+    A :class:`PromptReplacement` bound to a tokenizer to automatically
+    convert :attr:`target` and the result of :meth:`get_replacement` between
+    token sequence and text representations.
+    """
     tokenizer: AnyTokenizer = field(repr=False)
     modality: str
 
@@ -144,6 +153,7 @@ def __post_init__(self) -> None:
 
     @property
     def target(self) -> _BoundPromptSequence:
+        """The token sequence (or text) to find and replace."""
         target = self._target
 
         return _BoundPromptSequence(
@@ -153,6 +163,10 @@ def target(self) -> _BoundPromptSequence:
         )
 
     def get_replacement(self, item_idx: int) -> _BoundPromptSequence:
+        """
+        Given the index of the processed item within :attr:`modality`,
+        output the replacement token sequence (or text).
+        """
         replacement = self._replacement
         if callable(replacement):
             cache_key = item_idx
@@ -528,7 +542,7 @@ def put(
 
 
 class BaseProcessingInfo:
-    """Base class containing information to perform processing."""
+    """Base class to provide the information necessary for data processing."""
 
     def __init__(self, ctx: InputProcessingContext) -> None:
         super().__init__()
diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py
index 2ac3a6bcf3ddd..6f7da1509990f 100644
--- a/vllm/multimodal/profiling.py
+++ b/vllm/multimodal/profiling.py
@@ -19,7 +19,10 @@
 
 @dataclass
 class ProcessorInputs:
-    """Keyword arguments to :meth:`BaseMultiModalProcessor`."""
+    """
+    Represents the keyword arguments to
+    :meth:`vllm.multimodal.processing.BaseMultiModalProcessor.apply`.
+    """
     prompt_text: str
     mm_data: MultiModalDataDict
     hf_processor_mm_kwargs: Mapping[str, object] = field(default_factory=dict)
@@ -47,7 +50,7 @@ def get_dummy_processor_inputs(
     ) -> ProcessorInputs:
         """
         Build the input which, after processing, results in
-        `self.info.get_mm_max_tokens_per_item()` placeholder tokens.
+        :code:`self.info.get_mm_max_tokens_per_item()` placeholder tokens.
         """
         raise NotImplementedError
 

From 47de8821d3cdd32fce7df6312318223aee591fd2 Mon Sep 17 00:00:00 2001
From: WangErXiao <863579016@qq.com>
Date: Thu, 9 Jan 2025 02:21:30 +0800
Subject: [PATCH 137/462] [Misc]add some explanations for BlockHashType
 (#11847)

---
 vllm/v1/core/kv_cache_utils.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py
index 84ff48bf428a0..22a5d2fb08a48 100644
--- a/vllm/v1/core/kv_cache_utils.py
+++ b/vllm/v1/core/kv_cache_utils.py
@@ -11,8 +11,10 @@
 
 class BlockHashType(NamedTuple):
     """Hash value of a block (int), the token IDs in the block, and extra keys.
-    The reason we keep a tuple of token IDs and extra keys is to make sure
-    no hash collision happens when the hash value is the same.
+    We keep a tuple of token IDs and extra keys to reduce the likelihood of
+    hash collisions when the hash value is the same. But please note that 
+    hash collisions can still theoretically occur, albeit with an extremely 
+    low probability.
     """
     # Hash value of the block in an integer.
     hash_value: int

From 56fe4c297c7d9d872eccc19e3edbf1d75e1a30e2 Mon Sep 17 00:00:00 2001
From: Robert Shaw
 <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
Date: Wed, 8 Jan 2025 14:33:29 -0500
Subject: [PATCH 138/462] [TPU][Quantization] TPU `W8A8` (#11785)

Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
---
 .buildkite/run-tpu-test.sh                    |  11 +-
 tests/tpu/test_quantization_accuracy.py       |  49 +++++++
 .../schemes/compressed_tensors_w8a8_int8.py   | 105 ++++----------
 .../schemes/compressed_tensors_wNa16.py       |   2 +-
 .../layers/quantization/gptq_marlin.py        |   2 +-
 .../layers/quantization/kernels/__init__.py   |  74 ----------
 .../{ => mixed_precision}/MPLinearKernel.py   |   0
 .../kernels/mixed_precision/__init__.py       |  74 ++++++++++
 .../kernels/{ => mixed_precision}/exllama.py  |   0
 .../kernels/{ => mixed_precision}/machete.py  |   0
 .../kernels/{ => mixed_precision}/marlin.py   |   0
 .../kernels/scaled_mm/ScaledMMLinearKernel.py |  64 +++++++++
 .../kernels/scaled_mm/__init__.py             |  84 +++++++++++
 .../quantization/kernels/scaled_mm/cutlass.py | 134 ++++++++++++++++++
 .../quantization/kernels/scaled_mm/xla.py     | 101 +++++++++++++
 .../layers/quantization/utils/w8a8_utils.py   |  38 -----
 vllm/model_executor/parameter.py              |  13 ++
 vllm/platforms/tpu.py                         |   4 +-
 18 files changed, 565 insertions(+), 190 deletions(-)
 create mode 100644 tests/tpu/test_quantization_accuracy.py
 rename vllm/model_executor/layers/quantization/kernels/{ => mixed_precision}/MPLinearKernel.py (100%)
 create mode 100644 vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py
 rename vllm/model_executor/layers/quantization/kernels/{ => mixed_precision}/exllama.py (100%)
 rename vllm/model_executor/layers/quantization/kernels/{ => mixed_precision}/machete.py (100%)
 rename vllm/model_executor/layers/quantization/kernels/{ => mixed_precision}/marlin.py (100%)
 create mode 100644 vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py
 create mode 100644 vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py
 create mode 100644 vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py
 create mode 100644 vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py

diff --git a/.buildkite/run-tpu-test.sh b/.buildkite/run-tpu-test.sh
index 13605a3e97142..a8f021890f742 100644
--- a/.buildkite/run-tpu-test.sh
+++ b/.buildkite/run-tpu-test.sh
@@ -14,4 +14,13 @@ remove_docker_container
 # For HF_TOKEN.
 source /etc/environment
 # Run a simple end-to-end example.
-docker run --privileged --net host --shm-size=16G -it -e "HF_TOKEN=$HF_TOKEN" --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && python3 -m pip install lm_eval[api]==0.4.4 && pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference/offline_inference_tpu.py"
+docker run --privileged --net host --shm-size=16G -it \
+    -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
+    vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
+    && python3 -m pip install pytest \
+    && python3 -m pip install lm_eval[api]==0.4.4 \
+    && pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py \
+    && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
+    && python3 /workspace/vllm/tests/tpu/test_compilation.py \
+    && python3 /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
+    && python3 /workspace/vllm/examples/offline_inference/offline_inference_tpu.py"
diff --git a/tests/tpu/test_quantization_accuracy.py b/tests/tpu/test_quantization_accuracy.py
new file mode 100644
index 0000000000000..6cd5615c44e1e
--- /dev/null
+++ b/tests/tpu/test_quantization_accuracy.py
@@ -0,0 +1,49 @@
+from dataclasses import dataclass
+
+import lm_eval
+import pytest
+
+TASK = "gsm8k"
+FILTER = "exact_match,strict-match"
+RTOL = 0.03
+
+
+@dataclass
+class GSM8KAccuracyTestConfig:
+    model_name: str
+    excepted_value: float
+
+    def get_model_args(self) -> str:
+        return (f"pretrained={self.model_name},"
+                "max_model_len=4096,max_num_seqs=32")
+
+
+# NOTE: Accuracy scores measured on GPUs.
+ACCURACY_CONFIGS = [
+    GSM8KAccuracyTestConfig(
+        model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+        excepted_value=0.76),  # no bias
+    # NOTE(rob): We cannot re-initialize VLLM in the same process for TPU,
+    # so only one of these tests can run in a single call to pytest. As
+    # a follow up, move this into the LM-EVAL section of the CI.
+    # GSM8KAccuracyTestConfig(
+    #     model_name="neuralmagic/Qwen2-7B-Instruct-quantized.w8a8",
+    #     excepted_value=0.66),  # bias in QKV layers
+]
+
+
+@pytest.mark.parametrize("config", ACCURACY_CONFIGS)
+def test_gsm8k_correctness(config: GSM8KAccuracyTestConfig):
+
+    results = lm_eval.simple_evaluate(
+        model="vllm",
+        model_args=config.get_model_args(),
+        tasks="gsm8k",
+        batch_size="auto",
+    )
+
+    EXPECTED_VALUE = config.excepted_value
+    measured_value = results["results"][TASK][FILTER]
+    assert (measured_value - RTOL < EXPECTED_VALUE
+            and measured_value + RTOL > EXPECTED_VALUE
+            ), f"Expected: {EXPECTED_VALUE} |  Measured: {measured_value}"
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
index 6cbc58d61e970..0e3f4731775c5 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
@@ -1,14 +1,13 @@
-from typing import Callable, List, Optional
+from typing import Callable, List, Optional, Set
 
 import torch
 from compressed_tensors.quantization import QuantizationStrategy
-from torch.nn import Parameter
 
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
     CompressedTensorsScheme)
-from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
-    apply_int8_linear, convert_to_channelwise)
+from vllm.model_executor.layers.quantization.kernels.scaled_mm import (
+    ScaledMMLinearLayerConfig, choose_scaled_mm_linear_kernel)
 from vllm.model_executor.parameter import (BasevLLMParameter,
                                            ChannelQuantScaleParameter,
                                            ModelWeightParameter,
@@ -18,6 +17,7 @@
 
 
 class CompressedTensorsW8A8Int8(CompressedTensorsScheme):
+    _kernel_backends_being_used: Set[str] = set()
 
     def __init__(self, strategy: str, is_static_input_scheme: bool,
                  input_symmetric: bool):
@@ -30,74 +30,25 @@ def get_min_capability(cls) -> int:
         # turing and up
         return 75
 
-    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
-        # WEIGHT
-        # Cutlass kernels need transposed weight.
-        weight = layer.weight
-        layer.weight = Parameter(weight.t(), requires_grad=False)
-
-        # WEIGHT SCALE
-        # Cutlass kernels support only per-tensor and per-channel.
-        # If we have a fused module (QKV, MLP) with per tensor scales (thus N
-        # scales being passed to the kernel), convert to the per-channel case.
-        is_fused_module = len(self.logical_widths) > 1
-        if is_fused_module and self.strategy == QuantizationStrategy.TENSOR:
-            ws_channelwise = convert_to_channelwise(layer.weight_scale,
-                                                    self.logical_widths)
-            layer.weight_scale = Parameter(ws_channelwise, requires_grad=False)
-        else:
-            layer.weight_scale = Parameter(layer.weight_scale.data,
-                                           requires_grad=False)
-        # INPUT SCALE
-        if self.is_static_input_scheme:
-            if self.input_symmetric:
-                layer.input_scale = Parameter(layer.input_scale.max(),
-                                              requires_grad=False)
-                layer.input_zero_point = None
-            else:
-                # reconstruct the ranges
-                int8_traits = torch.iinfo(torch.int8)
-                azps = layer.input_zero_point.to(dtype=torch.int32)
-                range_max = (layer.input_scale *
-                             (int8_traits.max - azps)).max()
-                range_min = (layer.input_scale *
-                             (int8_traits.min - azps)).min()
-
-                scale = (range_max - range_min) / (int8_traits.max -
-                                                   int8_traits.min)
-                layer.input_scale = Parameter(scale, requires_grad=False)
-
-                # AZP loaded as int8 but used as int32
-                azp = (int8_traits.min -
-                       range_min / scale).to(dtype=torch.int32)
-                layer.input_zero_point = Parameter(azp, requires_grad=False)
-
-        else:
-            layer.input_scale = None
-            layer.input_zero_point = None
-
-        # azp_adj is the AZP adjustment term, used to account for weights.
-        # It does not depend on scales or azp, so it is the same for
-        # static and dynamic quantization.
-        # For more details, see csrc/quantization/cutlass_w8a8/Epilogues.md
-        # https://github.com/vllm-project/vllm/blob/8d59dbb00044a588cab96bcdc028006ed922eb06/csrc/quantization/cutlass_w8a8/Epilogues.md
-        if not self.input_symmetric:
-            azp_adj = layer.weight.sum(dim=0, keepdim=True, dtype=torch.int32)
-            if self.is_static_input_scheme:
-                # cutlass_w8a8 requires azp to be folded into azp_adj
-                #  in the per-tensor case
-                azp_adj = layer.input_zero_point * azp_adj
-
-            layer.azp_adj = azp_adj
-        else:
-            layer.azp_adj = None
-
     def create_weights(self, layer: torch.nn.Module,
                        output_partition_sizes: List[int],
                        input_size_per_partition: int,
                        params_dtype: torch.dtype, weight_loader: Callable,
                        **kwargs):
-        self.logical_widths = output_partition_sizes
+        layer.logical_widths = output_partition_sizes
+
+        scaled_mm_linear_kernel_config = ScaledMMLinearLayerConfig(
+            is_channelwise=(self.strategy == QuantizationStrategy.CHANNEL),
+            is_static_input_scheme=self.is_static_input_scheme,
+            input_symmetric=self.input_symmetric)
+
+        kernel_type = choose_scaled_mm_linear_kernel(
+            scaled_mm_linear_kernel_config)
+
+        if kernel_type.__name__ not in self._kernel_backends_being_used:
+            logger.info("Using %s for CompressedTensorsW8A8Int8",
+                        kernel_type.__name__)
+            self._kernel_backends_being_used.add(kernel_type.__name__)
 
         # WEIGHT
         weight = ModelWeightParameter(data=torch.empty(
@@ -140,12 +91,18 @@ def create_weights(self, layer: torch.nn.Module,
                     weight_loader=weight_loader)
                 layer.register_parameter("input_zero_point", input_zero_point)
 
+        self.kernel = kernel_type(c=scaled_mm_linear_kernel_config,
+                                  w_q_param_name="weight",
+                                  w_s_param_name="weight_scale",
+                                  i_s_param_name="input_scale",
+                                  i_zp_param_name="input_zero_point",
+                                  azp_adj_param_name="azp_adj")
+
+    # Checkpoints are serialized in compressed-tensors format, which is
+    # different from the format the kernel may want. Handle repacking here.
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        self.kernel.process_weights_after_loading(layer)
+
     def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor,
                       bias: Optional[torch.Tensor]) -> torch.Tensor:
-        return apply_int8_linear(input=x,
-                                 weight=layer.weight,
-                                 weight_scale=layer.weight_scale,
-                                 input_scale=layer.input_scale,
-                                 input_zero_point=layer.input_zero_point,
-                                 azp_adj=layer.azp_adj,
-                                 bias=bias)
+        return self.kernel.apply_weights(layer, x, bias)
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py
index a515738017781..2dd243b9c3109 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py
@@ -6,7 +6,7 @@
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
     CompressedTensorsScheme)
-from vllm.model_executor.layers.quantization.kernels import (
+from vllm.model_executor.layers.quantization.kernels.mixed_precision import (
     MPLinearLayerConfig, choose_mp_linear_kernel)
 from vllm.model_executor.layers.quantization.utils.marlin_utils import (
     marlin_repeat_scales_on_all_ranks)
diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py
index a006d729cc627..2dbfca9b07690 100644
--- a/vllm/model_executor/layers/quantization/gptq_marlin.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -11,7 +11,7 @@
                                                set_weight_attrs)
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
-from vllm.model_executor.layers.quantization.kernels import (
+from vllm.model_executor.layers.quantization.kernels.mixed_precision import (
     MPLinearLayerConfig, choose_mp_linear_kernel)
 from vllm.model_executor.layers.quantization.utils import replace_parameter
 from vllm.model_executor.layers.quantization.utils.marlin_utils import (
diff --git a/vllm/model_executor/layers/quantization/kernels/__init__.py b/vllm/model_executor/layers/quantization/kernels/__init__.py
index 94a3dc2584d6b..e69de29bb2d1d 100644
--- a/vllm/model_executor/layers/quantization/kernels/__init__.py
+++ b/vllm/model_executor/layers/quantization/kernels/__init__.py
@@ -1,74 +0,0 @@
-from typing import List, Optional, Type
-
-import vllm.envs as envs
-from vllm.model_executor.layers.quantization.kernels.exllama import (
-    ExllamaLinearKernel)
-from vllm.model_executor.layers.quantization.kernels.machete import (
-    MacheteLinearKernel)
-from vllm.model_executor.layers.quantization.kernels.marlin import (
-    MarlinLinearKernel)
-from vllm.model_executor.layers.quantization.kernels.MPLinearKernel import (
-    MPLinearKernel, MPLinearLayerConfig)
-from vllm.platforms import current_platform
-
-# in priority/performance order (when available)
-_POSSIBLE_KERNELS: List[Type[MPLinearKernel]] = [
-    MacheteLinearKernel,
-    MarlinLinearKernel,
-    ExllamaLinearKernel,
-]
-
-
-def choose_mp_linear_kernel(
-        config: MPLinearLayerConfig,
-        compute_capability: Optional[int] = None) -> Type[MPLinearKernel]:
-    """
-    Choose an MPLinearKernel that can implement the given config for the given
-     compute capability. Attempts to choose the best kernel in terms of 
-     performance.
-
-    Args:
-        config (MPLinearLayerConfig): Description of the linear layer to be 
-          implemented.
-        compute_capability (Optional[int], optional): The compute capability of
-          the target device, if None uses `current_platform` to get the compute 
-          capability. Defaults to None.
-
-    Raises:
-        ValueError: If no kernel can implement the given config.
-
-    Returns:
-        Type[MPLinearKernel]: Chosen kernel.
-    """
-    if compute_capability is None:
-        if current_platform is None:
-            raise ValueError("Cannot determine compute capability")
-        _cc = current_platform.get_device_capability()
-        compute_capability = _cc[0] * 10 + _cc[1]
-
-    failure_reasons = []
-    for kernel in _POSSIBLE_KERNELS:
-        if kernel.__name__ in envs.VLLM_DISABLED_KERNELS:
-            failure_reasons.append(
-                f' {kernel.__name__} disabled by environment variable')
-            continue
-
-        if kernel.get_min_capability() > compute_capability:
-            failure_reasons.append(
-                f"{kernel.__name__} requires capability "
-                f"{kernel.get_min_capability()}, current compute capability "
-                f"is {compute_capability}")
-            continue
-
-        can_implement, failure_reason = kernel.can_implement(config)
-        if can_implement:
-            return kernel
-        else:
-            failure_reasons.append(
-                f' {kernel.__name__} cannot implement due to: {failure_reason}'
-            )
-
-    raise ValueError(
-        "Failed to find a kernel that can implement the "\
-        "WNA16 linear layer. Reasons: \n"
-        + '\n'.join(failure_reasons))
diff --git a/vllm/model_executor/layers/quantization/kernels/MPLinearKernel.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py
similarity index 100%
rename from vllm/model_executor/layers/quantization/kernels/MPLinearKernel.py
rename to vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py
diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py
new file mode 100644
index 0000000000000..83549870e3f0b
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py
@@ -0,0 +1,74 @@
+from typing import List, Optional, Type
+
+import vllm.envs as envs
+from vllm.model_executor.layers.quantization.kernels.mixed_precision.exllama import (  # noqa: E501
+    ExllamaLinearKernel)
+from vllm.model_executor.layers.quantization.kernels.mixed_precision.machete import (  # noqa: E501
+    MacheteLinearKernel)
+from vllm.model_executor.layers.quantization.kernels.mixed_precision.marlin import (  # noqa: E501
+    MarlinLinearKernel)
+from vllm.model_executor.layers.quantization.kernels.mixed_precision.MPLinearKernel import (  # noqa: E501
+    MPLinearKernel, MPLinearLayerConfig)
+from vllm.platforms import current_platform
+
+# in priority/performance order (when available)
+_POSSIBLE_KERNELS: List[Type[MPLinearKernel]] = [
+    MacheteLinearKernel,
+    MarlinLinearKernel,
+    ExllamaLinearKernel,
+]
+
+
+def choose_mp_linear_kernel(
+        config: MPLinearLayerConfig,
+        compute_capability: Optional[int] = None) -> Type[MPLinearKernel]:
+    """
+    Choose an MPLinearKernel that can implement the given config for the given
+     compute capability. Attempts to choose the best kernel in terms of 
+     performance.
+
+    Args:
+        config (MPLinearLayerConfig): Description of the linear layer to be 
+          implemented.
+        compute_capability (Optional[int], optional): The compute capability of
+          the target device, if None uses `current_platform` to get the compute 
+          capability. Defaults to None.
+
+    Raises:
+        ValueError: If no kernel can implement the given config.
+
+    Returns:
+        Type[MPLinearKernel]: Chosen kernel.
+    """
+    if compute_capability is None:
+        if current_platform is None:
+            raise ValueError("Cannot determine compute capability")
+        _cc = current_platform.get_device_capability()
+        compute_capability = _cc[0] * 10 + _cc[1]
+
+    failure_reasons = []
+    for kernel in _POSSIBLE_KERNELS:
+        if kernel.__name__ in envs.VLLM_DISABLED_KERNELS:
+            failure_reasons.append(
+                f' {kernel.__name__} disabled by environment variable')
+            continue
+
+        if kernel.get_min_capability() > compute_capability:
+            failure_reasons.append(
+                f"{kernel.__name__} requires capability "
+                f"{kernel.get_min_capability()}, current compute capability "
+                f"is {compute_capability}")
+            continue
+
+        can_implement, failure_reason = kernel.can_implement(config)
+        if can_implement:
+            return kernel
+        else:
+            failure_reasons.append(
+                f' {kernel.__name__} cannot implement due to: {failure_reason}'
+            )
+
+    raise ValueError(
+        "Failed to find a kernel that can implement the "\
+        "WNA16 linear layer. Reasons: \n"
+        + '\n'.join(failure_reasons))
diff --git a/vllm/model_executor/layers/quantization/kernels/exllama.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py
similarity index 100%
rename from vllm/model_executor/layers/quantization/kernels/exllama.py
rename to vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py
diff --git a/vllm/model_executor/layers/quantization/kernels/machete.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py
similarity index 100%
rename from vllm/model_executor/layers/quantization/kernels/machete.py
rename to vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py
diff --git a/vllm/model_executor/layers/quantization/kernels/marlin.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py
similarity index 100%
rename from vllm/model_executor/layers/quantization/kernels/marlin.py
rename to vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py
diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py
new file mode 100644
index 0000000000000..75cf91f191136
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py
@@ -0,0 +1,64 @@
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import Optional, Tuple
+
+import torch
+
+
+@dataclass
+class ScaledMMLinearLayerConfig:
+    is_channelwise: bool
+    is_static_input_scheme: bool
+    input_symmetric: bool
+
+
+class ScaledMMLinearKernel(ABC):
+
+    @classmethod
+    @abstractmethod
+    def get_min_capability(cls) -> int:
+        raise NotImplementedError
+
+    @classmethod
+    @abstractmethod
+    def can_implement(
+            cls, c: ScaledMMLinearLayerConfig) -> Tuple[bool, Optional[str]]:
+        raise NotImplementedError
+
+    def __init__(self, c: ScaledMMLinearLayerConfig, w_q_param_name: str,
+                 w_s_param_name: str, i_s_param_name: str,
+                 i_zp_param_name: str, azp_adj_param_name: str) -> None:
+        assert self.can_implement(c)
+        self.config = c
+        self.w_q_name = w_q_param_name
+        self.w_s_name = w_s_param_name
+        self.i_s_name = i_s_param_name
+        self.i_zp_name = i_zp_param_name
+        self.azp_adj_name = azp_adj_param_name
+
+    @abstractmethod
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        raise NotImplementedError
+
+    @abstractmethod
+    def apply_weights(self,
+                      layer: torch.nn.Module,
+                      x: torch.Tensor,
+                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+        raise NotImplementedError
+
+    def _get_weight_params(
+            self, layer: torch.nn.Module
+    ) -> Tuple[torch.Tensor,  # weight
+               torch.Tensor,  # weight_scale
+               Optional[torch.Tensor],  # input_scale, 
+               Optional[torch.Tensor],  # input_zp
+               Optional[torch.Tensor],  # azp_adj
+               ]:
+        return (
+            getattr(layer, self.w_q_name),
+            getattr(layer, self.w_s_name),
+            getattr(layer, self.i_s_name),
+            getattr(layer, self.i_zp_name),
+            getattr(layer, self.azp_adj_name),
+        )
diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py
new file mode 100644
index 0000000000000..586752d3d34e3
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py
@@ -0,0 +1,84 @@
+import os
+from typing import Dict, List, Optional, Type
+
+from vllm.model_executor.layers.quantization.kernels.scaled_mm.cutlass import (
+    CutlassScaledMMLinearKernel)
+from vllm.model_executor.layers.quantization.kernels.scaled_mm.ScaledMMLinearKernel import (  # noqa: E501
+    ScaledMMLinearKernel, ScaledMMLinearLayerConfig)
+# from vllm.model_executor.layers.quantization.kernels.scaled_mm.triton import (
+#     TritonScaledMMLinear)
+from vllm.model_executor.layers.quantization.kernels.scaled_mm.xla import (
+    XLAScaledMMLinearKernel)
+from vllm.platforms import PlatformEnum, current_platform
+
+# in priority/performance order (when available)
+_POSSIBLE_KERNELS: Dict[PlatformEnum, List[Type[ScaledMMLinearKernel]]] = {
+    PlatformEnum.CPU: [CutlassScaledMMLinearKernel],
+    PlatformEnum.CUDA: [CutlassScaledMMLinearKernel],
+    # TODO(rob): Create TritonScaledMMLinear kernel. ROCM will
+    # incorrectly attempt to run AZP models if prompted to.
+    PlatformEnum.ROCM: [CutlassScaledMMLinearKernel],
+    PlatformEnum.TPU: [XLAScaledMMLinearKernel],
+}
+
+
+def choose_scaled_mm_linear_kernel(
+        config: ScaledMMLinearLayerConfig,
+        compute_capability: Optional[int] = None
+) -> Type[ScaledMMLinearKernel]:
+    """
+    Choose an ScalledMMLinearKernel that can implement the given config for the 
+    given compute capability. Attempts to choose the best kernel in terms of 
+    performance.
+
+    Args:
+        config (ScaledMMLinearLayerConfig): Description of the linear layer 
+            to be implemented.
+        compute_capability (Optional[int], optional): The compute capability of
+            the target device, if None uses `current_platform` to get the 
+            compute capability. Defaults to None.
+
+    Raises:
+        ValueError: If no kernel can implement the given config.
+
+    Returns:
+        Type[ScaledMMLinearKernel]: Chosen kernel.
+    """
+
+    if compute_capability is None:
+        _cc = current_platform.get_device_capability()
+        if _cc is not None:
+            compute_capability = _cc[0] * 10 + _cc[1]
+
+    failure_reasons = []
+    for kernel in _POSSIBLE_KERNELS[current_platform._enum]:
+        if kernel.__name__ in os.environ.get("VLLM_DISABLED_KERNELS", "")\
+            .split(","):
+            failure_reasons.append(
+                f' {kernel.__name__} disabled by environment variable')
+            continue
+
+        # If the current platform uses compute_capability,
+        # make sure the kernel supports the compute cability.
+        if compute_capability is not None:
+            kernel_min_capability = kernel.get_min_capability()
+            if (kernel_min_capability is not None
+                    and kernel_min_capability > compute_capability):
+                failure_reasons.append(
+                    f"{kernel.__name__} requires capability "
+                    f"{kernel_min_capability}, current compute capability "
+                    f"is {compute_capability}")
+                continue
+
+        can_implement, failure_reason = kernel.can_implement(config)
+        if can_implement:
+            return kernel
+        else:
+            failure_reasons.append(
+                f' {kernel.__name__} cannot implement due to: {failure_reason}'
+            )
+
+    raise ValueError(
+        "Failed to find a kernel that can implement the "\
+        "ScaledMM linear layer. Reasons: \n"
+        + '\n'.join(failure_reasons))
diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py
new file mode 100644
index 0000000000000..2e83a04286a0d
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py
@@ -0,0 +1,134 @@
+from typing import Optional, Tuple
+
+import torch
+
+from vllm import _custom_ops as ops
+from vllm.model_executor.layers.quantization.utils import replace_parameter
+from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
+    convert_to_channelwise)
+from vllm.platforms import current_platform
+
+from .ScaledMMLinearKernel import (ScaledMMLinearKernel,
+                                   ScaledMMLinearLayerConfig)
+
+
+class CutlassScaledMMLinearKernel(ScaledMMLinearKernel):
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        return 75
+
+    @classmethod
+    def can_implement(
+            cls, c: ScaledMMLinearLayerConfig) -> Tuple[bool, Optional[str]]:
+
+        if (not current_platform.is_cuda() and not current_platform.is_cpu()):
+            return False, "CutlassScaledMM requires running on CUDA or CPU."
+
+        return True, None
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        # WEIGHT
+        # Cutlass kernels need transposed weight.
+        weight = getattr(layer, self.w_q_name)
+        replace_parameter(
+            layer, self.w_q_name,
+            torch.nn.Parameter(weight.t().data, requires_grad=False))
+
+        # WEIGHT SCALE
+        # Cutlass kernels support only per-tensor and per-channel.
+        # If we have a fused module (QKV, MLP) with per tensor scales (thus N
+        # scales being passed to the kernel), convert to the per-channel case.
+        is_fused_module = len(layer.logical_widths) > 1
+        weight_scale = getattr(layer, self.w_s_name)
+        if is_fused_module and not self.config.is_channelwise:
+            weight_scale = convert_to_channelwise(weight_scale,
+                                                  layer.logical_widths)
+        replace_parameter(
+            layer, self.w_s_name,
+            torch.nn.Parameter(weight_scale.data, requires_grad=False))
+
+        # INPUT SCALE
+        if self.config.is_static_input_scheme:
+            input_scale = getattr(layer, self.i_s_name)
+
+            if self.config.input_symmetric:
+                replace_parameter(
+                    layer, self.i_s_name,
+                    torch.nn.Parameter(input_scale.max(), requires_grad=False))
+                setattr(layer, self.i_zp_name, None)
+            else:
+                input_zero_point = getattr(layer, self.i_zp_name)
+
+                # reconstruct the ranges
+                int8_traits = torch.iinfo(torch.int8)
+                azps = input_zero_point.to(dtype=torch.int32)
+                range_max = (input_scale * (int8_traits.max - azps)).max()
+                range_min = (input_scale * (int8_traits.min - azps)).min()
+
+                scale = (range_max - range_min) / (int8_traits.max -
+                                                   int8_traits.min)
+                replace_parameter(
+                    layer, self.i_s_name,
+                    torch.nn.Parameter(scale, requires_grad=False))
+
+                # AZP loaded as int8 but used as int32
+                azp = (int8_traits.min -
+                       range_min / scale).to(dtype=torch.int32)
+                replace_parameter(layer, self.i_zp_name,
+                                  torch.nn.Parameter(azp, requires_grad=False))
+
+        else:
+            setattr(layer, self.i_s_name, None)
+            setattr(layer, self.i_zp_name, None)
+
+        # azp_adj is the AZP adjustment term, used to account for weights.
+        # It does not depend on scales or azp, so it is the same for
+        # static and dynamic quantization.
+        # For more details, see csrc/quantization/cutlass_w8a8/Epilogues.md
+        # https://github.com/vllm-project/vllm/blob/8d59dbb00044a588cab96bcdc028006ed922eb06/csrc/quantization/cutlass_w8a8/Epilogues.md
+        if not self.config.input_symmetric:
+            weight = getattr(layer, self.w_q_name)
+            azp_adj = weight.sum(dim=0, keepdim=True, dtype=torch.int32)
+            if self.config.is_static_input_scheme:
+                # cutlass_w8a8 requires azp to be folded into azp_adj
+                # in the per-tensor case
+                azp_adj = getattr(layer, self.i_zp_name) * azp_adj
+            setattr(layer, self.azp_adj_name,
+                    torch.nn.Parameter(azp_adj, requires_grad=False))
+        else:
+            setattr(layer, self.azp_adj_name, None)
+
+    def apply_weights(self,
+                      layer: torch.nn.Module,
+                      x: torch.Tensor,
+                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+        w_q, w_s, i_s, i_zp, azp_adj = self._get_weight_params(layer)
+
+        # ops.scaled_int8_quant supports both dynamic and static quant:
+        # * dynamic, i_s is None and x_s computed from x.
+        # * static, i_s is scalar and x_s is i_s.
+        symmetric = azp_adj is None
+        x_q, x_s, x_zp = ops.scaled_int8_quant(x,
+                                               i_s,
+                                               i_zp,
+                                               symmetric=symmetric)
+
+        if x_zp is not None:
+            # Currently, static is always per-tensor and dynamic is per-token
+            static = i_zp is not None
+            azp = None if static else x_zp
+            return ops.cutlass_scaled_mm_azp(x_q,
+                                             w_q,
+                                             scale_a=x_s,
+                                             scale_b=w_s,
+                                             out_dtype=x.dtype,
+                                             azp_adj=azp_adj,
+                                             azp=azp,
+                                             bias=bias)
+        return ops.cutlass_scaled_mm(x_q,
+                                     w_q,
+                                     scale_a=x_s,
+                                     scale_b=w_s,
+                                     out_dtype=x.dtype,
+                                     bias=bias)
diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py
new file mode 100644
index 0000000000000..9de668e658826
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py
@@ -0,0 +1,101 @@
+import warnings
+from typing import Optional, Tuple
+
+import torch
+from functorch.experimental.control_flow import cond  # noqa: F401
+
+from vllm.model_executor.layers.quantization.utils import replace_parameter
+from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
+    convert_to_channelwise)
+from vllm.platforms import current_platform
+
+from .ScaledMMLinearKernel import (ScaledMMLinearKernel,
+                                   ScaledMMLinearLayerConfig)
+
+
+class XLAScaledMMLinearKernel(ScaledMMLinearKernel):
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        raise NotImplementedError(
+            "TPU platform does have a concept of compute capability, "
+            "this method should not be called.")
+
+    @classmethod
+    def can_implement(
+            cls, c: ScaledMMLinearLayerConfig) -> Tuple[bool, Optional[str]]:
+
+        if not current_platform.is_tpu():
+            return False, "ScaledMMXLA requires running on TPU."
+
+        if c.is_static_input_scheme:
+            return False, "ScaledMMXLA requires dynamic activation scales."
+
+        if not c.input_symmetric:
+            return False, "ScaledMMXLA requires symmetric activation scales."
+
+        if not c.is_channelwise:
+            return False, "ScaledMMXLA requires channelwise weight scales"
+
+        return True, None
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        # WEIGHT
+        # [out, in] (different than cutlass_scaled_mm)
+        weight = getattr(layer, self.w_q_name)
+        replace_parameter(layer, self.w_q_name,
+                          torch.nn.Parameter(weight.data, requires_grad=False))
+
+        # WEIGHT SCALE
+        # XLA kernels support only per-tensor and per-channel.
+        # If we have a fused module (QKV, MLP) with per tensor scales (thus N
+        # scales being passed to the kernel), convert to the per-channel case.
+        is_fused_module = len(layer.logical_widths) > 1
+        weight_scale = getattr(layer, self.w_s_name)
+        if is_fused_module and not self.config.is_channelwise:
+            weight_scale = convert_to_channelwise(weight_scale,
+                                                  layer.logical_widths)
+
+        # [out_channel,] (different than cutlass_scaled_mm)
+        weight_scale = weight_scale.squeeze(-1)
+        replace_parameter(
+            layer, self.w_s_name,
+            torch.nn.Parameter(weight_scale.data, requires_grad=False))
+
+        # Only support symmetric dynamic activation quantization.
+        setattr(layer, self.i_s_name, None)
+        setattr(layer, self.i_zp_name, None)
+        setattr(layer, self.azp_adj_name, None)
+
+        # Filter warning for cond usage in apply_weights. It is okay
+        # to specialize the graph since bias is not dynamic.
+        warnings.filterwarnings(
+            "ignore",
+            message=
+            "Pred is a Python constant. When used with torch.cond, it specializes on one of the branches."  # noqa: E501
+        )
+
+    def no_add_bias(self, x: torch.Tensor, bias: Optional[torch.Tensor]):
+        return x
+
+    def add_bias(self, x: torch.Tensor, bias: Optional[torch.Tensor]):
+        return x + bias
+
+    def apply_weights(self,
+                      layer: torch.nn.Module,
+                      x: torch.Tensor,
+                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+        w_q, w_s, _, _, _ = self._get_weight_params(layer)
+
+        import torch_xla.experimental.xla_quantized_matmul  # noqa: F401
+        out = torch.ops.xla.quantized_matmul(x,
+                                             w_q,
+                                             w_s,
+                                             zero_point=None,
+                                             block_size=-1,
+                                             int4_weight=False,
+                                             quantize_activation=True)
+
+        # Explicitly capture control flow to make dynamo happy.
+        # https://pytorch.org/docs/main/generated/exportdb/index.html#cond-branch-class-method # noqa: E501
+        return cond(bias is None, self.no_add_bias, self.add_bias, [out, bias])
diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
index d89071f30a549..7cdce67cf1677 100644
--- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
@@ -201,44 +201,6 @@ def apply_fp8_linear(
             return output.to(dtype=input.dtype).view(*output_shape)
 
 
-def apply_int8_linear(
-    input: torch.Tensor,
-    weight: torch.Tensor,
-    weight_scale: torch.Tensor,
-    input_scale: Optional[torch.Tensor] = None,
-    input_zero_point: Optional[torch.Tensor] = None,
-    azp_adj: Optional[torch.Tensor] = None,
-    bias: Optional[torch.Tensor] = None,
-):
-    # ops.scaled_int8_quant supports both dynamic and static quant.
-    # * dynamic, layer.input_scale is None and x_scale computed from x.
-    # * static, layer.input_scale is scalar and x_scale is input_scale.
-    symmetric = azp_adj is None
-    x_q, x_scale, x_zp = ops.scaled_int8_quant(input,
-                                               input_scale,
-                                               input_zero_point,
-                                               symmetric=symmetric)
-
-    if x_zp is not None:
-        # Currently, static is always per-tensor and dynamic is per-token
-        static = input_zero_point is not None
-        azp = None if static else x_zp
-        return ops.cutlass_scaled_mm_azp(x_q,
-                                         weight,
-                                         scale_a=x_scale,
-                                         scale_b=weight_scale,
-                                         out_dtype=input.dtype,
-                                         azp_adj=azp_adj,
-                                         azp=azp,
-                                         bias=bias)
-    return ops.cutlass_scaled_mm(x_q,
-                                 weight,
-                                 scale_a=x_scale,
-                                 scale_b=weight_scale,
-                                 out_dtype=input.dtype,
-                                 bias=bias)
-
-
 def normalize_e4m3fn_to_e4m3fnuz(
     weight: torch.Tensor,
     weight_scale: torch.Tensor,
diff --git a/vllm/model_executor/parameter.py b/vllm/model_executor/parameter.py
index 02d22a5ca62c0..fc5a3e7fba674 100644
--- a/vllm/model_executor/parameter.py
+++ b/vllm/model_executor/parameter.py
@@ -6,6 +6,7 @@
 
 from vllm.distributed import get_tensor_model_parallel_rank
 from vllm.logger import init_logger
+from vllm.model_executor.utils import _make_synced_weight_loader
 
 __all__ = [
     "BasevLLMParameter", "PackedvLLMParameter", "PerTensorScaleParameter",
@@ -37,6 +38,18 @@ def __init__(self, data: torch.Tensor, weight_loader: Callable):
         :returns: a torch.nn.parameter
         """
 
+        # During weight loading, we often do something like:
+        # narrowed_tensor = param.data.narrow(0, offset, len)
+        # narrowed_tensor.copy_(real_weight)
+        # expecting narrowed_tensor and param.data to share the same storage.
+        # However, on TPUs, narrowed_tensor will lazily propagate to the base
+        # tensor, which is param.data, leading to the redundant memory usage.
+        # This sometimes causes OOM errors during model loading. To avoid this,
+        # we sync the param tensor after its weight loader is called.
+        from vllm.platforms import current_platform
+        if current_platform.is_tpu():
+            weight_loader = _make_synced_weight_loader(weight_loader)
+
         self._weight_loader = weight_loader
 
     @property
diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py
index 77f5c8401424b..d488daf056f1a 100644
--- a/vllm/platforms/tpu.py
+++ b/vllm/platforms/tpu.py
@@ -19,7 +19,9 @@ class TpuPlatform(Platform):
     device_name: str = "tpu"
     device_type: str = "tpu"
     dispatch_key: str = "XLA"
-    supported_quantization: list[str] = ["tpu_int8"]
+    supported_quantization: list[str] = [
+        "tpu_int8", "compressed-tensors", "compressed_tensors"
+    ]
 
     @classmethod
     def get_default_attn_backend(cls, selected_backend: _Backend) -> _Backend:

From 526de822d501c792b051c864ba873a836d78d5bf Mon Sep 17 00:00:00 2001
From: rasmith <Randall.Smith@amd.com>
Date: Wed, 8 Jan 2025 14:23:15 -0600
Subject: [PATCH 139/462] [Kernel][Triton][AMD] Use block size heuristic for
 avg 2.8x speedup for int8 models (#11698)

Signed-off-by: Randall Smith <Randall.Smith@amd.com>
---
 .../compressed_tensors/triton_scaled_mm.py      | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py b/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py
index 3ff162170f255..2659afcdc74a9 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py
@@ -128,7 +128,8 @@ def triton_scaled_mm(input: torch.Tensor,
                      bias: Optional[torch.Tensor] = None,
                      block_size_m: int = 32,
                      block_size_n: int = 32,
-                     block_size_k: int = 32) -> torch.Tensor:
+                     block_size_k: int = 32,
+                     use_heuristic=True) -> torch.Tensor:
     M, K = input.shape
     N = weight.shape[1]
 
@@ -152,6 +153,20 @@ def triton_scaled_mm(input: torch.Tensor,
 
     has_scalar = lambda x: x.shape[0] == 1 and x.shape[1] == 1
 
+    if use_heuristic:
+        is_small_N = N < 8192
+        next_power_of_2_M = max(32, triton.next_power_of_2(M))
+        if next_power_of_2_M <= 32:
+            tile_shape = (64, 64, 256) if is_small_N else (64, 128, 256)
+        elif next_power_of_2_M <= 64:
+            tile_shape = (64, 64, 256)
+        elif next_power_of_2_M <= 128:
+            tile_shape = (64, 128, 128)
+        else:
+            tile_shape = (128, 128, 128)
+
+    block_size_m, block_size_n, block_size_k = tile_shape
+
     block_size_sa = 1 if has_scalar(scale_a) else block_size_m
     block_size_sb = 1 if has_scalar(scale_b) else block_size_n
 

From 3db0cafdf1fe7f4cd7e41a145f78e8a568b4d63c Mon Sep 17 00:00:00 2001
From: Simon Mo <simon.mo@hey.com>
Date: Wed, 8 Jan 2025 12:38:28 -0800
Subject: [PATCH 140/462] [Docs] Add Google Cloud Meetup (#11864)

---
 README.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/README.md b/README.md
index 1f82229f39537..253a0bb913e37 100644
--- a/README.md
+++ b/README.md
@@ -15,6 +15,10 @@ Easy, fast, and cheap LLM serving for everyone
 
 ---
 
+The first vLLM meetup in 2025 is happening on January 22nd, Wednesday, with Google Cloud in San Francisco! We will talk about vLLM's performant V1 architecture, Q1 roadmap, Google Cloud's innovation around vLLM: networking, Cloud Run, Vertex, and TPU! [Register Now](https://lu.ma/zep56hui)
+
+---
+
 *Latest News* 🔥
 - [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone!
 - [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing), and Snowflake team [here](https://docs.google.com/presentation/d/1qF3RkDAbOULwz9WK5TOltt2fE9t6uIc_hVNLFAaQX6A/edit?usp=sharing).

From 615e4a54017136649db275b68932af80168781f8 Mon Sep 17 00:00:00 2001
From: Tyler Michael Smith <tyler@neuralmagic.com>
Date: Wed, 8 Jan 2025 21:20:44 -0500
Subject: [PATCH 141/462] [CI] Turn on basic correctness tests for V1 (#10864)

---
 tests/basic_correctness/test_basic_correctness.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py
index 1c2193bb17a55..31a101e48e026 100644
--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@@ -44,7 +44,6 @@ def test_vllm_gc_ed():
     assert weak_llm() is None
 
 
-@pytest.mark.skip_v1
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS", "FLASHINFER"])
 @pytest.mark.parametrize("dtype", ["half"])

From 1fe554bac32419a6d64a5c977849806a1efd9725 Mon Sep 17 00:00:00 2001
From: Maximilien de Bayser <mbayser@br.ibm.com>
Date: Thu, 9 Jan 2025 00:05:43 -0300
Subject: [PATCH 142/462] treat do_lower_case in the same way as the
 sentence-transformers library (#11815)

Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
---
 tests/entrypoints/openai/test_serving_chat.py       | 1 +
 tests/models/embedding/language/test_embedding.py   | 1 +
 vllm/entrypoints/openai/serving_engine.py           | 5 +++++
 vllm/inputs/preprocess.py                           | 6 ++++++
 vllm/transformers_utils/tokenizer_group/__init__.py | 5 -----
 5 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py
index 97248f1150979..f431d1065e0eb 100644
--- a/tests/entrypoints/openai/test_serving_chat.py
+++ b/tests/entrypoints/openai/test_serving_chat.py
@@ -35,6 +35,7 @@ class MockModelConfig:
     logits_processor_pattern = None
     diff_sampling_param: Optional[dict] = None
     allowed_local_media_path: str = ""
+    encoder_config = None
 
     def get_diff_sampling_param(self):
         return self.diff_sampling_param or {}
diff --git a/tests/models/embedding/language/test_embedding.py b/tests/models/embedding/language/test_embedding.py
index f458ef5ef556d..7749806548cd9 100644
--- a/tests/models/embedding/language/test_embedding.py
+++ b/tests/models/embedding/language/test_embedding.py
@@ -15,6 +15,7 @@
         # [Encoder-only]
         pytest.param("BAAI/bge-base-en-v1.5",
                      marks=[pytest.mark.core_model, pytest.mark.cpu_model]),
+        pytest.param("sentence-transformers/all-MiniLM-L12-v2"),
         pytest.param("intfloat/multilingual-e5-large"),
         # [Encoder-decoder]
         pytest.param("intfloat/e5-mistral-7b-instruct",
diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py
index 319f869240036..88859255f202a 100644
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -160,6 +160,11 @@ def _normalize_prompt_text_to_input(
         truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]],
         add_special_tokens: bool,
     ) -> TextTokensPrompt:
+        if (self.model_config.encoder_config is not None
+                and self.model_config.encoder_config.get(
+                    "do_lower_case", False)):
+            prompt = prompt.lower()
+
         if truncate_prompt_tokens is None:
             encoded = tokenizer(prompt, add_special_tokens=add_special_tokens)
         else:
diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py
index 6ddc1eb76f10d..3e92d5821e645 100644
--- a/vllm/inputs/preprocess.py
+++ b/vllm/inputs/preprocess.py
@@ -190,6 +190,12 @@ def _tokenize_prompt(
             # on the task and language of their request. Also needed to avoid
             # appending an EOS token to the prompt which disrupts generation.
             add_special_tokens = False
+
+        if (self.model_config.encoder_config is not None
+                and self.model_config.encoder_config.get(
+                    "do_lower_case", False)):
+            prompt = prompt.lower()
+
         return tokenizer.encode(request_id=request_id,
                                 prompt=prompt,
                                 lora_request=lora_request,
diff --git a/vllm/transformers_utils/tokenizer_group/__init__.py b/vllm/transformers_utils/tokenizer_group/__init__.py
index c0b3d2585a962..d400276796996 100644
--- a/vllm/transformers_utils/tokenizer_group/__init__.py
+++ b/vllm/transformers_utils/tokenizer_group/__init__.py
@@ -26,11 +26,6 @@ def init_tokenizer_from_configs(model_config: ModelConfig,
                        trust_remote_code=model_config.trust_remote_code,
                        revision=model_config.tokenizer_revision)
 
-    if (model_config.encoder_config is not None
-            and "do_lower_case" in model_config.encoder_config):
-        init_kwargs["do_lower_case"] = model_config.encoder_config[
-            "do_lower_case"]
-
     return get_tokenizer_group(parallel_config.tokenizer_pool_config,
                                **init_kwargs)
 

From 730e9592e97c643474aa44e9d3dbe6f55c4b9ad9 Mon Sep 17 00:00:00 2001
From: Michael Goin <michael@neuralmagic.com>
Date: Wed, 8 Jan 2025 22:37:48 -0500
Subject: [PATCH 143/462] [Doc] Recommend uv and python 3.12 for quickstart
 guide (#11849)

Signed-off-by: mgoin <michael@neuralmagic.com>
---
 docs/source/getting_started/quickstart.md | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/docs/source/getting_started/quickstart.md b/docs/source/getting_started/quickstart.md
index 2808e1b386801..ea15d9ef065fa 100644
--- a/docs/source/getting_started/quickstart.md
+++ b/docs/source/getting_started/quickstart.md
@@ -15,10 +15,19 @@ This guide will help you quickly get started with vLLM to perform:
 ## Installation
 
 If you are using NVIDIA GPUs, you can install vLLM using [pip](https://pypi.org/project/vllm/) directly.
-It's recommended to use [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/getting-started.html) to create and manage Python environments.
+
+It's recommended to use [uv](https://docs.astral.sh/uv/), a very fast Python environment manager, to create and manage Python environments. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment and install vLLM using the following commands:
+
+```console
+$ uv venv myenv --python 3.12 --seed
+$ source myenv/bin/activate
+$ uv pip install vllm
+```
+
+You can also use [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/getting-started.html) to create and manage Python environments.
 
 ```console
-$ conda create -n myenv python=3.10 -y
+$ conda create -n myenv python=3.12 -y
 $ conda activate myenv
 $ pip install vllm
 ```

From d848800e884f581eeed9f154d6c2aeb38eac24de Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Thu, 9 Jan 2025 12:48:12 +0800
Subject: [PATCH 144/462] [Misc] Move `print_*_once` from utils to logger
 (#11298)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com>
Co-authored-by: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com>
---
 .github/workflows/lint-and-deploy.yaml        |  1 +
 vllm/attention/backends/torch_sdpa.py         |  9 ++-
 vllm/attention/backends/xformers.py           |  8 ++-
 vllm/config.py                                |  9 ++-
 vllm/entrypoints/chat_utils.py                |  7 +--
 vllm/inputs/preprocess.py                     | 20 ++++---
 vllm/inputs/registry.py                       |  4 +-
 vllm/logger.py                                | 57 +++++++++++++++++--
 vllm/lora/peft_helper.py                      |  6 +-
 vllm/lora/punica_wrapper/punica_selector.py   |  8 ++-
 vllm/model_executor/custom_op.py              |  3 +-
 .../compressed_tensors_moe.py                 |  8 ++-
 .../model_executor/layers/quantization/fp8.py |  5 +-
 .../layers/quantization/kv_cache.py           |  6 +-
 .../quantization/utils/marlin_utils_fp8.py    |  6 +-
 .../model_loader/weight_utils.py              |  8 +--
 vllm/model_executor/models/chameleon.py       |  6 +-
 vllm/model_executor/models/olmoe.py           |  6 +-
 vllm/model_executor/models/qwen2_moe.py       |  6 +-
 vllm/model_executor/models/vision.py          |  6 +-
 vllm/utils.py                                 | 12 ----
 21 files changed, 129 insertions(+), 72 deletions(-)

diff --git a/.github/workflows/lint-and-deploy.yaml b/.github/workflows/lint-and-deploy.yaml
index ee768db63c96c..556b60d2fca12 100644
--- a/.github/workflows/lint-and-deploy.yaml
+++ b/.github/workflows/lint-and-deploy.yaml
@@ -64,6 +64,7 @@ jobs:
         run: |
           export AWS_ACCESS_KEY_ID=minioadmin
           export AWS_SECRET_ACCESS_KEY=minioadmin
+          sleep 30 && kubectl -n ns-vllm logs -f "$(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}')" &
           helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"
     
       - name: curl test
diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py
index c14f7754596dd..ca1c4618615de 100644
--- a/vllm/attention/backends/torch_sdpa.py
+++ b/vllm/attention/backends/torch_sdpa.py
@@ -13,9 +13,12 @@
 from vllm.attention.backends.utils import CommonAttentionState
 from vllm.attention.ops.ipex_attn import PagedAttention
 from vllm.attention.ops.paged_attn import PagedAttentionMetadata
-from vllm.utils import make_tensor_with_pad, print_warning_once
+from vllm.logger import init_logger
+from vllm.utils import make_tensor_with_pad
 from vllm.worker.cpu_model_runner import ModelInputForCPUBuilder
 
+logger = init_logger(__name__)
+
 
 class TorchSDPABackend(AttentionBackend):
 
@@ -396,8 +399,8 @@ def __init__(
             raise ValueError(
                 "Torch SPDA does not support block-sparse attention.")
         if logits_soft_cap is not None:
-            print_warning_once("Torch SPDA does not support logits soft cap. "
-                               "Outputs may be slightly off.")
+            logger.warning_once("Torch SPDA does not support logits soft cap. "
+                                "Outputs may be slightly off.")
         self.num_heads = num_heads
         self.head_size = head_size
         self.scale = float(scale)
diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py
index 694c7cc1bc36a..8c8ca8520a9db 100644
--- a/vllm/attention/backends/xformers.py
+++ b/vllm/attention/backends/xformers.py
@@ -17,7 +17,9 @@
     is_all_cross_attn_metadata_set, is_all_encoder_attn_metadata_set)
 from vllm.attention.ops.paged_attn import (PagedAttention,
                                            PagedAttentionMetadata)
-from vllm.utils import print_warning_once
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
 
 
 class XFormersBackend(AttentionBackend):
@@ -385,8 +387,8 @@ def __init__(
             raise ValueError(
                 "XFormers does not support block-sparse attention.")
         if logits_soft_cap is not None:
-            print_warning_once("XFormers does not support logits soft cap. "
-                               "Outputs may be slightly off.")
+            logger.warning_once("XFormers does not support logits soft cap. "
+                                "Outputs may be slightly off.")
         self.num_heads = num_heads
         self.head_size = head_size
         self.scale = float(scale)
diff --git a/vllm/config.py b/vllm/config.py
index 6dabeb3861af2..19609085cc960 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -32,8 +32,7 @@
 from vllm.transformers_utils.s3_utils import S3Model
 from vllm.transformers_utils.utils import is_s3
 from vllm.utils import (GiB_bytes, LayerBlockType, cuda_device_count_stateless,
-                        get_cpu_memory, print_warning_once, random_uuid,
-                        resolve_obj_by_qualname)
+                        get_cpu_memory, random_uuid, resolve_obj_by_qualname)
 
 if TYPE_CHECKING:
     from ray.util.placement_group import PlacementGroup
@@ -314,7 +313,7 @@ def __init__(self,
                 sliding_window_len_min = get_min_sliding_window(
                     self.hf_text_config.sliding_window)
 
-                print_warning_once(
+                logger.warning_once(
                     f"{self.hf_text_config.model_type} has interleaved "
                     "attention, which is currently not supported by the "
                     "XFORMERS backend. Disabling sliding window and capping "
@@ -2758,7 +2757,7 @@ def uuid(self):
 
         def model_post_init(self, __context: Any) -> None:
             if not self.enable_reshape and self.enable_fusion:
-                print_warning_once(
+                logger.warning_once(
                     "Fusion enabled but reshape elimination disabled."
                     "RMSNorm + quant (fp8) fusion might not work")
 
@@ -3151,7 +3150,7 @@ def __post_init__(self):
             self.scheduler_config.chunked_prefill_enabled and \
             self.model_config.dtype == torch.float32 and \
             current_platform.get_device_capability() == (7, 5):
-            print_warning_once(
+            logger.warning_once(
                 "Turing devices tensor cores do not support float32 matmul. "
                 "To workaround this limitation, vLLM will set 'ieee' input "
                 "precision for chunked prefill triton kernels.")
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index a492d5496e025..923c7459f6948 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -35,7 +35,6 @@
 from vllm.multimodal import MultiModalDataDict
 from vllm.multimodal.utils import MediaConnector
 from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
-from vllm.utils import print_warning_once
 
 logger = init_logger(__name__)
 
@@ -985,14 +984,14 @@ def apply_mistral_chat_template(
     **kwargs: Any,
 ) -> List[int]:
     if chat_template is not None:
-        print_warning_once(
+        logger.warning_once(
             "'chat_template' cannot be overridden for mistral tokenizer.")
     if "add_generation_prompt" in kwargs:
-        print_warning_once(
+        logger.warning_once(
             "'add_generation_prompt' is not supported for mistral tokenizer, "
             "so it will be ignored.")
     if "continue_final_message" in kwargs:
-        print_warning_once(
+        logger.warning_once(
             "'continue_final_message' is not supported for mistral tokenizer, "
             "so it will be ignored.")
 
diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py
index 3e92d5821e645..a738ffe18e3ae 100644
--- a/vllm/inputs/preprocess.py
+++ b/vllm/inputs/preprocess.py
@@ -10,7 +10,6 @@
 from vllm.multimodal.inputs import MultiModalDataDict, MultiModalInputsV2
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup
-from vllm.utils import print_info_once, print_warning_once
 
 from .data import (DecoderOnlyInputs, EncoderDecoderInputs, ProcessorInputs,
                    PromptType, SingletonInputs, SingletonPrompt, token_inputs)
@@ -68,21 +67,24 @@ def get_decoder_start_token_id(self) -> Optional[int]:
         '''
 
         if not self.model_config.is_encoder_decoder:
-            print_warning_once("Using None for decoder start token id because "
-                               "this is not an encoder/decoder model.")
+            logger.warning_once(
+                "Using None for decoder start token id because "
+                "this is not an encoder/decoder model.")
             return None
 
         if (self.model_config is None or self.model_config.hf_config is None):
-            print_warning_once("Using None for decoder start token id because "
-                               "model config is not available.")
+            logger.warning_once(
+                "Using None for decoder start token id because "
+                "model config is not available.")
             return None
 
         dec_start_token_id = getattr(self.model_config.hf_config,
                                      'decoder_start_token_id', None)
         if dec_start_token_id is None:
-            print_warning_once("Falling back on <BOS> for decoder start token "
-                               "id because decoder start token id is not "
-                               "available.")
+            logger.warning_once(
+                "Falling back on <BOS> for decoder start token "
+                "id because decoder start token id is not "
+                "available.")
             dec_start_token_id = self.get_bos_token_id()
 
         return dec_start_token_id
@@ -231,7 +233,7 @@ def _can_process_multimodal(self) -> bool:
         # updated to use the new multi-modal processor
         can_process_multimodal = self.mm_registry.has_processor(model_config)
         if not can_process_multimodal:
-            print_info_once(
+            logger.info_once(
                 "Your model uses the legacy input pipeline instead of the new "
                 "multi-modal processor. Please note that the legacy pipeline "
                 "will be removed in a future release. For more details, see: "
diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py
index b22b3f1594f24..aad0dfab94a01 100644
--- a/vllm/inputs/registry.py
+++ b/vllm/inputs/registry.py
@@ -12,7 +12,7 @@
 from vllm.transformers_utils.processor import cached_get_processor
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.utils import (ClassRegistry, get_allowed_kwarg_only_overrides,
-                        print_warning_once, resolve_mm_processor_kwargs)
+                        resolve_mm_processor_kwargs)
 
 from .data import ProcessorInputs, SingletonInputs
 from .parse import is_encoder_decoder_inputs
@@ -352,7 +352,7 @@ def dummy_data_for_profiling(
         num_tokens = dummy_data.seq_data.prompt_token_ids
         if len(num_tokens) < seq_len:
             if is_encoder_data:
-                print_warning_once(
+                logger.warning_once(
                     f"Expected at least {seq_len} dummy encoder tokens for "
                     f"profiling, but found {len(num_tokens)} tokens instead.")
             else:
diff --git a/vllm/logger.py b/vllm/logger.py
index 538db0dcf19aa..cac174f7ba02a 100644
--- a/vllm/logger.py
+++ b/vllm/logger.py
@@ -4,11 +4,12 @@
 import logging
 import os
 import sys
-from functools import partial
+from functools import lru_cache, partial
 from logging import Logger
 from logging.config import dictConfig
 from os import path
-from typing import Dict, Optional
+from types import MethodType
+from typing import Any, Optional, cast
 
 import vllm.envs as envs
 
@@ -49,8 +50,44 @@
 }
 
 
+@lru_cache
+def _print_info_once(logger: Logger, msg: str) -> None:
+    # Set the stacklevel to 2 to print the original caller's line info
+    logger.info(msg, stacklevel=2)
+
+
+@lru_cache
+def _print_warning_once(logger: Logger, msg: str) -> None:
+    # Set the stacklevel to 2 to print the original caller's line info
+    logger.warning(msg, stacklevel=2)
+
+
+class _VllmLogger(Logger):
+    """
+    Note:
+        This class is just to provide type information.
+        We actually patch the methods directly on the :class:`logging.Logger`
+        instance to avoid conflicting with other libraries such as
+        `intel_extension_for_pytorch.utils._logger`.
+    """
+
+    def info_once(self, msg: str) -> None:
+        """
+        As :meth:`info`, but subsequent calls with the same message
+        are silently dropped.
+        """
+        _print_info_once(self, msg)
+
+    def warning_once(self, msg: str) -> None:
+        """
+        As :meth:`warning`, but subsequent calls with the same message
+        are silently dropped.
+        """
+        _print_warning_once(self, msg)
+
+
 def _configure_vllm_root_logger() -> None:
-    logging_config: Dict = {}
+    logging_config = dict[str, Any]()
 
     if not VLLM_CONFIGURE_LOGGING and VLLM_LOGGING_CONFIG_PATH:
         raise RuntimeError(
@@ -84,12 +121,22 @@ def _configure_vllm_root_logger() -> None:
         dictConfig(logging_config)
 
 
-def init_logger(name: str) -> Logger:
+def init_logger(name: str) -> _VllmLogger:
     """The main purpose of this function is to ensure that loggers are
     retrieved in such a way that we can be sure the root vllm logger has
     already been configured."""
 
-    return logging.getLogger(name)
+    logger = logging.getLogger(name)
+
+    methods_to_patch = {
+        "info_once": _print_info_once,
+        "warning_once": _print_warning_once,
+    }
+
+    for method_name, method in methods_to_patch.items():
+        setattr(logger, method_name, MethodType(method, logger))
+
+    return cast(_VllmLogger, logger)
 
 
 # The root logger is initialized when the module is imported.
diff --git a/vllm/lora/peft_helper.py b/vllm/lora/peft_helper.py
index ddd42ae93d290..dacfb9ebd1480 100644
--- a/vllm/lora/peft_helper.py
+++ b/vllm/lora/peft_helper.py
@@ -4,7 +4,9 @@
 from dataclasses import MISSING, dataclass, field, fields
 from typing import Literal, Optional, Union
 
-from vllm.utils import print_info_once
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
 
 
 @dataclass
@@ -42,7 +44,7 @@ def _validate_features(self):
     def __post_init__(self):
         self._validate_features()
         if self.use_rslora:
-            print_info_once("Loading LoRA weights trained with rsLoRA.")
+            logger.info_once("Loading LoRA weights trained with rsLoRA.")
             self.vllm_lora_scaling_factor = self.lora_alpha / math.sqrt(self.r)
         else:
             self.vllm_lora_scaling_factor = self.lora_alpha / self.r
diff --git a/vllm/lora/punica_wrapper/punica_selector.py b/vllm/lora/punica_wrapper/punica_selector.py
index cd64878d95ae3..9791d492d8e48 100644
--- a/vllm/lora/punica_wrapper/punica_selector.py
+++ b/vllm/lora/punica_wrapper/punica_selector.py
@@ -1,19 +1,21 @@
+from vllm.logger import init_logger
 from vllm.platforms import current_platform
-from vllm.utils import print_info_once
 
 from .punica_base import PunicaWrapperBase
 
+logger = init_logger(__name__)
+
 
 def get_punica_wrapper(*args, **kwargs) -> PunicaWrapperBase:
     if current_platform.is_cuda_alike():
         # Lazy import to avoid ImportError
         from vllm.lora.punica_wrapper.punica_gpu import PunicaWrapperGPU
-        print_info_once("Using PunicaWrapperGPU.")
+        logger.info_once("Using PunicaWrapperGPU.")
         return PunicaWrapperGPU(*args, **kwargs)
     elif current_platform.is_hpu():
         # Lazy import to avoid ImportError
         from vllm.lora.punica_wrapper.punica_hpu import PunicaWrapperHPU
-        print_info_once("Using PunicaWrapperHPU.")
+        logger.info_once("Using PunicaWrapperHPU.")
         return PunicaWrapperHPU(*args, **kwargs)
     else:
         raise NotImplementedError
diff --git a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py
index fddc8bad09ef5..401606e8c76f0 100644
--- a/vllm/model_executor/custom_op.py
+++ b/vllm/model_executor/custom_op.py
@@ -5,7 +5,6 @@
 from vllm.config import get_current_vllm_config
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
-from vllm.utils import print_warning_once
 
 logger = init_logger(__name__)
 
@@ -91,7 +90,7 @@ def enabled(cls) -> bool:
         compilation_config = get_current_vllm_config().compilation_config
         custom_ops = compilation_config.custom_ops
         if not hasattr(cls, "name"):
-            print_warning_once(
+            logger.warning_once(
                 f"Custom op {cls.__name__} was not registered, "
                 f"which means it won't appear in the op registry. "
                 f"It will be enabled/disabled based on the global settings.")
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index 5fd6b017f444b..4fb8fd84e92d4 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -8,6 +8,7 @@
 
 import vllm.model_executor.layers.fused_moe  # noqa
 from vllm import _custom_ops as ops
+from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEMethodBase,
                                                   FusedMoeWeightScaleSupported)
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
@@ -16,7 +17,8 @@
     all_close_1d, normalize_e4m3fn_to_e4m3fnuz, per_tensor_dequantize)
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.platforms import current_platform
-from vllm.utils import print_warning_once
+
+logger = init_logger(__name__)
 
 
 class GPTQMarlinState(Enum):
@@ -142,10 +144,10 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
                     "activation scales are None.")
             if (not all_close_1d(layer.w13_input_scale)
                     or not all_close_1d(layer.w2_input_scale)):
-                print_warning_once(
+                logger.warning_once(
                     "Found input_scales that are not equal for "
                     "fp8 MoE layer. Using the maximum across experts "
-                    "for each layer. ")
+                    "for each layer.")
             layer.w13_input_scale = torch.nn.Parameter(
                 layer.w13_input_scale.max(), requires_grad=False)
             layer.w2_input_scale = torch.nn.Parameter(
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index 2fe22903a385b..a1be45a49e94a 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -28,7 +28,6 @@
                                            PerTensorScaleParameter)
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.platforms import current_platform
-from vllm.utils import print_warning_once
 
 ACTIVATION_SCHEMES = ["static", "dynamic"]
 
@@ -539,10 +538,10 @@ def process_weights_after_loading(self, layer: Module) -> None:
                         "activation scales are None.")
                 if (not all_close_1d(layer.w13_input_scale)
                         or not all_close_1d(layer.w2_input_scale)):
-                    print_warning_once(
+                    logger.warning_once(
                         "Found input_scales that are not equal for "
                         "fp8 MoE layer. Using the maximum across experts "
-                        "for each layer. ")
+                        "for each layer.")
                 layer.w13_input_scale = torch.nn.Parameter(
                     layer.w13_input_scale.max(), requires_grad=False)
                 layer.w2_input_scale = torch.nn.Parameter(
diff --git a/vllm/model_executor/layers/quantization/kv_cache.py b/vllm/model_executor/layers/quantization/kv_cache.py
index d79536d196b92..a74f5415c8a51 100644
--- a/vllm/model_executor/layers/quantization/kv_cache.py
+++ b/vllm/model_executor/layers/quantization/kv_cache.py
@@ -1,8 +1,10 @@
 import torch
 
+from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig, QuantizeMethodBase)
-from vllm.utils import print_warning_once
+
+logger = init_logger(__name__)
 
 
 class BaseKVCacheMethod(QuantizeMethodBase):
@@ -67,7 +69,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
             layer._v_scale = v_scale
             if (layer._k_scale == 1.0 and layer._v_scale == 1.0
                     and "e5m2" not in layer.kv_cache_dtype):
-                print_warning_once(
+                logger.warning_once(
                     "Using KV cache scaling factor 1.0 for fp8_e4m3. This "
                     "may cause accuracy issues. Please make sure k/v_scale "
                     "scaling factors are available in the fp8 checkpoint.")
diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py
index 8b3dfaae971c3..245fe9238e421 100644
--- a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py
@@ -3,11 +3,13 @@
 import torch
 
 import vllm._custom_ops as ops
+from vllm.logger import init_logger
 from vllm.platforms import current_platform
-from vllm.utils import print_warning_once
 
 from .marlin_utils import marlin_make_workspace, marlin_permute_scales
 
+logger = init_logger(__name__)
+
 
 def is_fp8_marlin_supported():
     return current_platform.has_device_capability(80)
@@ -47,7 +49,7 @@ def apply_fp8_marlin_linear(
 
 def prepare_fp8_layer_for_marlin(layer: torch.nn.Module,
                                  strategy: str = "tensor") -> None:
-    print_warning_once(
+    logger.warning_once(
         "Your GPU does not have native support for FP8 computation but "
         "FP8 quantization is being used. Weight-only FP8 compression will "
         "be used leveraging the Marlin kernel. This may degrade "
diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
index a2c991cfdb74e..11d5fd7135d9e 100644
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -25,7 +25,7 @@
                                                      get_quantization_config)
 from vllm.model_executor.layers.quantization.schema import QuantParamSchema
 from vllm.platforms import current_platform
-from vllm.utils import PlaceholderModule, print_warning_once
+from vllm.utils import PlaceholderModule
 
 try:
     from runai_model_streamer import SafetensorsStreamer
@@ -673,7 +673,7 @@ def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> Optional[str]:
         None: If the remapped name is not found in params_dict.
     """
     if name.endswith(".kv_scale"):
-        print_warning_once(
+        logger.warning_once(
             "DEPRECATED. Found kv_scale in the checkpoint. "
             "This format is deprecated in favor of separate k_scale and "
             "v_scale tensors and will be removed in a future release. "
@@ -682,7 +682,7 @@ def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> Optional[str]:
         # NOTE: we remap the deprecated kv_scale to k_scale
         remapped_name = name.replace(".kv_scale", ".attn.k_scale")
         if remapped_name not in params_dict:
-            print_warning_once(
+            logger.warning_once(
                 f"Found kv_scale in the checkpoint (e.g. {name}), "
                 "but not found the expected name in the model "
                 f"(e.g. {remapped_name}). kv_scale is "
@@ -695,7 +695,7 @@ def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> Optional[str]:
         if name.endswith(scale_name):
             remapped_name = name.replace(scale_name, f".attn{scale_name}")
             if remapped_name not in params_dict:
-                print_warning_once(
+                logger.warning_once(
                     f"Found {scale_name} in the checkpoint (e.g. {name}), "
                     "but not found the expected name in the model "
                     f"(e.g. {remapped_name}). {scale_name} is "
diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py
index acff926891bbe..452fe727875fe 100644
--- a/vllm/model_executor/models/chameleon.py
+++ b/vllm/model_executor/models/chameleon.py
@@ -11,6 +11,7 @@
 from vllm.attention import Attention, AttentionMetadata
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
+from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
@@ -35,13 +36,14 @@
                                         BaseProcessingInfo, PromptReplacement)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
 from vllm.sequence import IntermediateTensors
-from vllm.utils import print_warning_once
 
 from .interfaces import SupportsMultiModal, SupportsPP
 from .utils import (is_pp_missing_parameter,
                     make_empty_intermediate_tensors_factory, make_layers,
                     maybe_prefix, merge_multimodal_embeddings)
 
+logger = init_logger(__name__)
+
 
 class ChameleonImagePixelInputs(TypedDict):
     type: Literal["pixel_values"]
@@ -1111,7 +1113,7 @@ def load_weights(self, weights: Iterable[Tuple[str,
                         remapped_kv_scale_name = name.replace(
                             ".kv_scale", ".attn.kv_scale")
                         if remapped_kv_scale_name not in params_dict:
-                            print_warning_once(
+                            logger.warning_once(
                                 "Found kv scale in the checkpoint (e.g. "
                                 f"{name}), but not found the expected name in "
                                 f"the model (e.g. {remapped_kv_scale_name}). "
diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py
index 5d9091cfb9311..fbe5d1aee04b3 100644
--- a/vllm/model_executor/models/olmoe.py
+++ b/vllm/model_executor/models/olmoe.py
@@ -20,6 +20,7 @@
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
+from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (QKVParallelLinear,
@@ -34,13 +35,14 @@
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
-from vllm.utils import print_warning_once
 
 from .interfaces import SupportsPP
 from .utils import (is_pp_missing_parameter,
                     make_empty_intermediate_tensors_factory, make_layers,
                     maybe_prefix)
 
+logger = init_logger(__name__)
+
 
 class OlmoeMoE(nn.Module):
     """A tensor-parallel MoE implementation for Olmoe that shards each expert
@@ -446,7 +448,7 @@ def load_weights(self, weights: Iterable[Tuple[str,
                         remapped_kv_scale_name = name.replace(
                             ".kv_scale", ".attn.kv_scale")
                         if remapped_kv_scale_name not in params_dict:
-                            print_warning_once(
+                            logger.warning_once(
                                 "Found kv scale in the checkpoint "
                                 f"(e.g. {name}), but not found the expected "
                                 f"name in the model "
diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py
index ba70243c6533d..95de6c21871bf 100644
--- a/vllm/model_executor/models/qwen2_moe.py
+++ b/vllm/model_executor/models/qwen2_moe.py
@@ -34,6 +34,7 @@
 from vllm.distributed import (get_pp_group,
                               get_tensor_model_parallel_world_size,
                               tensor_model_parallel_all_reduce)
+from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
@@ -50,13 +51,14 @@
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
-from vllm.utils import print_warning_once
 
 from .interfaces import SupportsPP
 from .utils import (extract_layer_index, is_pp_missing_parameter,
                     make_empty_intermediate_tensors_factory, make_layers,
                     maybe_prefix)
 
+logger = init_logger(__name__)
+
 
 class Qwen2MoeMLP(nn.Module):
 
@@ -524,7 +526,7 @@ def load_weights(self, weights: Iterable[Tuple[str,
                         remapped_kv_scale_name = name.replace(
                             ".kv_scale", ".attn.kv_scale")
                         if remapped_kv_scale_name not in params_dict:
-                            print_warning_once(
+                            logger.warning_once(
                                 "Found kv scale in the checkpoint "
                                 f"(e.g. {name}), but not found the expected "
                                 f"name in the model "
diff --git a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py
index e6a9e153d9107..a1395982af44c 100644
--- a/vllm/model_executor/models/vision.py
+++ b/vllm/model_executor/models/vision.py
@@ -7,8 +7,10 @@
 import vllm.envs as envs
 from vllm.attention.selector import (backend_name_to_enum,
                                      get_global_forced_attn_backend)
+from vllm.logger import init_logger
 from vllm.platforms import _Backend, current_platform
-from vllm.utils import print_warning_once
+
+logger = init_logger(__name__)
 
 _C = TypeVar("_C", bound=PretrainedConfig)
 
@@ -87,7 +89,7 @@ def get_vit_attn_backend(support_fa: bool = False) -> _Backend:
             if is_flash_attn_2_available():
                 selected_backend = _Backend.FLASH_ATTN
             else:
-                print_warning_once(
+                logger.warning_once(
                     "Current `vllm-flash-attn` has a bug inside vision module, "
                     "so we use xformers backend instead. You can run "
                     "`pip install flash-attn` to use flash-attention backend.")
diff --git a/vllm/utils.py b/vllm/utils.py
index c09cae70e9af8..a92b77efd9fd8 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -696,18 +696,6 @@ def create_kv_caches_with_random(
     return key_caches, value_caches
 
 
-@lru_cache
-def print_info_once(msg: str) -> None:
-    # Set the stacklevel to 2 to print the caller's line info
-    logger.info(msg, stacklevel=2)
-
-
-@lru_cache
-def print_warning_once(msg: str) -> None:
-    # Set the stacklevel to 2 to print the caller's line info
-    logger.warning(msg, stacklevel=2)
-
-
 @lru_cache(maxsize=None)
 def is_pin_memory_available() -> bool:
     from vllm.platforms import current_platform

From a732900efc4eb0d4393e3885d5df8ef3516d4834 Mon Sep 17 00:00:00 2001
From: Guspan Tanadi <36249910+guspan-tanadi@users.noreply.github.com>
Date: Thu, 9 Jan 2025 12:39:39 +0700
Subject: [PATCH 145/462] [Doc] Intended links Python multiprocessing library
 (#11878)

---
 docs/source/design/multiprocessing.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/design/multiprocessing.md b/docs/source/design/multiprocessing.md
index da87638e5b743..c2cdb75ea08a7 100644
--- a/docs/source/design/multiprocessing.md
+++ b/docs/source/design/multiprocessing.md
@@ -21,7 +21,7 @@ This document describes how vLLM deals with these challenges.
 
 ## Multiprocessing Methods
 
-[Python multiprocessing methods](https://docs.python.org/3/library/multiprocessing.html.md#contexts-and-start-methods) include:
+[Python multiprocessing methods](https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods) include:
 
 - `spawn` - spawn a new Python process. This will be the default as of Python
   3.14.

From 310aca88c984983189a57f1b72e3b1dde89fb92f Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Thu, 9 Jan 2025 15:18:21 +0800
Subject: [PATCH 146/462] [perf]fix current stream (#11870)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 .../device_communicators/pynccl.py            | 15 +++++----
 vllm/distributed/parallel_state.py            |  5 +--
 vllm/utils.py                                 | 33 +++++++++++++++++++
 vllm/worker/multi_step_model_runner.py        |  8 ++---
 4 files changed, 46 insertions(+), 15 deletions(-)

diff --git a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py
index fda4d007ceb5b..efc59987195f5 100644
--- a/vllm/distributed/device_communicators/pynccl.py
+++ b/vllm/distributed/device_communicators/pynccl.py
@@ -10,6 +10,7 @@
     ncclRedOpTypeEnum, ncclUniqueId)
 from vllm.distributed.utils import StatelessProcessGroup
 from vllm.logger import init_logger
+from vllm.utils import current_stream
 
 logger = init_logger(__name__)
 
@@ -96,7 +97,7 @@ def __init__(
             self.comm: ncclComm_t = self.nccl.ncclCommInitRank(
                 self.world_size, self.unique_id, self.rank)
 
-            stream = torch.cuda.current_stream()
+            stream = current_stream()
             # A small all_reduce for warmup.
             data = torch.zeros(1, device=device)
             self.all_reduce(data)
@@ -119,7 +120,7 @@ def all_reduce(self,
         out_tensor = torch.empty_like(in_tensor)
 
         if stream is None:
-            stream = torch.cuda.current_stream()
+            stream = current_stream()
         self.nccl.ncclAllReduce(buffer_type(in_tensor.data_ptr()),
                                 buffer_type(out_tensor.data_ptr()),
                                 in_tensor.numel(),
@@ -141,7 +142,7 @@ def all_gather(self,
             f"this nccl communicator is created to work on {self.device}, "
             f"but the input tensor is on {input_tensor.device}")
         if stream is None:
-            stream = torch.cuda.current_stream()
+            stream = current_stream()
         self.nccl.ncclAllGather(
             buffer_type(input_tensor.data_ptr()),
             buffer_type(output_tensor.data_ptr()), input_tensor.numel(),
@@ -162,7 +163,7 @@ def reduce_scatter(self,
             f"this nccl communicator is created to work on {self.device}, "
             f"but the input tensor is on {input_tensor.device}")
         if stream is None:
-            stream = torch.cuda.current_stream()
+            stream = current_stream()
         self.nccl.ncclReduceScatter(
             buffer_type(input_tensor.data_ptr()),
             buffer_type(output_tensor.data_ptr()), output_tensor.numel(),
@@ -177,7 +178,7 @@ def send(self, tensor: torch.Tensor, dst: int, stream=None):
             f"this nccl communicator is created to work on {self.device}, "
             f"but the input tensor is on {tensor.device}")
         if stream is None:
-            stream = torch.cuda.current_stream()
+            stream = current_stream()
         self.nccl.ncclSend(buffer_type(tensor.data_ptr()), tensor.numel(),
                            ncclDataTypeEnum.from_torch(tensor.dtype), dst,
                            self.comm, cudaStream_t(stream.cuda_stream))
@@ -189,7 +190,7 @@ def recv(self, tensor: torch.Tensor, src: int, stream=None):
             f"this nccl communicator is created to work on {self.device}, "
             f"but the input tensor is on {tensor.device}")
         if stream is None:
-            stream = torch.cuda.current_stream()
+            stream = current_stream()
         self.nccl.ncclRecv(buffer_type(tensor.data_ptr()), tensor.numel(),
                            ncclDataTypeEnum.from_torch(tensor.dtype), src,
                            self.comm, cudaStream_t(stream.cuda_stream))
@@ -201,7 +202,7 @@ def broadcast(self, tensor: torch.Tensor, src: int, stream=None):
             f"this nccl communicator is created to work on {self.device}, "
             f"but the input tensor is on {tensor.device}")
         if stream is None:
-            stream = torch.cuda.current_stream()
+            stream = current_stream()
         if src == self.rank:
             sendbuff = buffer_type(tensor.data_ptr())
             # NCCL requires the sender also to have a receive buffer
diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index a837c1dc5953b..be7f16ef52a47 100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -357,10 +357,7 @@ def _all_reduce_out_place(self, input_: torch.Tensor) -> torch.Tensor:
             return out
         pynccl_comm = self.pynccl_comm
         assert pynccl_comm is not None
-        # TODO: pynccl should not use `stream=`
-        # it can just always use the current stream.
-        out = pynccl_comm.all_reduce(input_,
-                                     stream=torch.cuda.current_stream())
+        out = pynccl_comm.all_reduce(input_)
         if out is None:
             # fall back to the default all-reduce using PyTorch.
             # this usually happens during testing.
diff --git a/vllm/utils.py b/vllm/utils.py
index a92b77efd9fd8..0b0905e675245 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -944,6 +944,39 @@ def find_nccl_library() -> str:
     return so_file
 
 
+prev_set_stream = torch.cuda.set_stream
+
+_current_stream = None
+
+
+def _patched_set_stream(stream: torch.cuda.Stream) -> None:
+    global _current_stream
+    _current_stream = stream
+    prev_set_stream(stream)
+
+
+torch.cuda.set_stream = _patched_set_stream
+
+
+def current_stream() -> torch.cuda.Stream:
+    """
+    replace `torch.cuda.current_stream()` with `vllm.utils.current_stream()`.
+    it turns out that `torch.cuda.current_stream()` is quite expensive,
+    as it will construct a new stream object at each call.
+    here we patch `torch.cuda.set_stream` to keep track of the current stream
+    directly, so that we can avoid calling `torch.cuda.current_stream()`.
+
+    the underlying hypothesis is that we do not call `torch._C._cuda_setStream`
+    from C/C++ code.
+    """
+    global _current_stream
+    if _current_stream is None:
+        # when this function is called before any stream is set,
+        # we return the default stream.
+        _current_stream = torch.cuda.current_stream()
+    return _current_stream
+
+
 def enable_trace_function_call_for_thread(vllm_config: "VllmConfig") -> None:
     """Set up function tracing for the current thread,
     if enabled via the VLLM_TRACE_FUNCTION environment variable
diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py
index a2c2cebf8d1f6..acce923498d7e 100644
--- a/vllm/worker/multi_step_model_runner.py
+++ b/vllm/worker/multi_step_model_runner.py
@@ -14,7 +14,7 @@
                                                 get_pythonized_sample_results)
 from vllm.sequence import (CompletionSequenceGroupOutput, IntermediateTensors,
                            Logprob, SequenceGroupMetadata, SequenceOutput)
-from vllm.utils import PyObjectCache, async_tensor_h2d
+from vllm.utils import PyObjectCache, async_tensor_h2d, current_stream
 from vllm.worker.model_runner import (GPUModelRunnerBase,
                                       ModelInputForGPUWithSamplingMetadata)
 from vllm.worker.model_runner_base import (
@@ -498,7 +498,7 @@ def execute_model(
         #   appended sampler output from last iteration
         #   - also maybe pythonize if CPU is ahead of GPU
 
-        current_stream = torch.cuda.current_stream()
+        stream = current_stream()
         if not model_input.is_first_multi_step:
             # Explicitly block on the previous step's forward to make sure we
             # don't clobber any GPU tensors still in use.
@@ -541,7 +541,7 @@ def execute_model(
                                                        num_steps=1)
 
         # record the event for the current step so that the next step can sync
-        model_input.record_step_event(current_stream)
+        model_input.record_step_event(stream)
 
         if get_pp_group().is_last_rank and self.is_driver_worker:
             assert isinstance(output, list)
@@ -552,7 +552,7 @@ def execute_model(
             # event for the pythonization so that we only pythonize if the
             # tensors are ready. May be able to be combined with the step event
             output_ready_event = torch.cuda.Event()
-            output_ready_event.record(current_stream)
+            output_ready_event.record(stream)
             if self.parallel_config.pipeline_parallel_size > 1:
                 output[0].sampled_token_ids_cpu = output[
                     0].sampled_token_ids.cpu()

From 0bd1ff43469f867f92786a3596c3e4a64df43400 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Thu, 9 Jan 2025 17:02:53 +0800
Subject: [PATCH 147/462] [Bugfix] Override dunder methods of placeholder
 modules (#11882)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 tests/test_utils.py |  47 ++++++++++-
 vllm/utils.py       | 189 +++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 220 insertions(+), 16 deletions(-)

diff --git a/tests/test_utils.py b/tests/test_utils.py
index 0285b00d73be1..14d2fbd63b90d 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -7,9 +7,9 @@
 import torch
 from vllm_test_utils import monitor
 
-from vllm.utils import (FlexibleArgumentParser, StoreBoolean, deprecate_kwargs,
-                        get_open_port, memory_profiling, merge_async_iterators,
-                        supports_kw)
+from vllm.utils import (FlexibleArgumentParser, PlaceholderModule,
+                        StoreBoolean, deprecate_kwargs, get_open_port,
+                        memory_profiling, merge_async_iterators, supports_kw)
 
 from .utils import error_on_warning, fork_new_process_for_each_test
 
@@ -323,3 +323,44 @@ def measure_current_non_torch():
     del weights
     lib.cudaFree(handle1)
     lib.cudaFree(handle2)
+
+
+def test_placeholder_module_error_handling():
+    placeholder = PlaceholderModule("placeholder_1234")
+
+    def build_ctx():
+        return pytest.raises(ModuleNotFoundError,
+                             match="No module named")
+
+    with build_ctx():
+        int(placeholder)
+
+    with build_ctx():
+        placeholder()
+
+    with build_ctx():
+        _ = placeholder.some_attr
+
+    with build_ctx():
+        # Test conflict with internal __name attribute
+        _ = placeholder.name
+
+    # OK to print the placeholder or use it in a f-string
+    _ = repr(placeholder)
+    _ = str(placeholder)
+
+    # No error yet; only error when it is used downstream
+    placeholder_attr = placeholder.placeholder_attr("attr")
+
+    with build_ctx():
+        int(placeholder_attr)
+
+    with build_ctx():
+        placeholder_attr()
+
+    with build_ctx():
+        _ = placeholder_attr.some_attr
+
+    with build_ctx():
+        # Test conflict with internal __module attribute
+        _ = placeholder_attr.module
diff --git a/vllm/utils.py b/vllm/utils.py
index 0b0905e675245..487088591ebc2 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -46,7 +46,7 @@
 import zmq.asyncio
 from packaging.version import Version
 from torch.library import Library
-from typing_extensions import ParamSpec, TypeIs, assert_never
+from typing_extensions import Never, ParamSpec, TypeIs, assert_never
 
 import vllm.envs as envs
 from vllm.logger import enable_trace_function_call, init_logger
@@ -1627,24 +1627,183 @@ def get_vllm_optional_dependencies():
     }
 
 
-@dataclass(frozen=True)
-class PlaceholderModule:
+class _PlaceholderBase:
+    """
+    Disallows downstream usage of placeholder modules.
+
+    We need to explicitly override each dunder method because
+    :meth:`__getattr__` is not called when they are accessed.
+
+    See also:
+        [Special method lookup](https://docs.python.org/3/reference/datamodel.html#special-lookup)
+    """
+
+    def __getattr__(self, key: str) -> Never:
+        """
+        The main class should implement this to throw an error
+        for attribute accesses representing downstream usage.
+        """
+        raise NotImplementedError
+
+    # [Basic customization]
+
+    def __lt__(self, other: object):
+        return self.__getattr__("__lt__")
+
+    def __le__(self, other: object):
+        return self.__getattr__("__le__")
+
+    def __eq__(self, other: object):
+        return self.__getattr__("__eq__")
+
+    def __ne__(self, other: object):
+        return self.__getattr__("__ne__")
+
+    def __gt__(self, other: object):
+        return self.__getattr__("__gt__")
+
+    def __ge__(self, other: object):
+        return self.__getattr__("__ge__")
+
+    def __hash__(self):
+        return self.__getattr__("__hash__")
+
+    def __bool__(self):
+        return self.__getattr__("__bool__")
+
+    # [Callable objects]
+
+    def __call__(self, *args: object, **kwargs: object):
+        return self.__getattr__("__call__")
+
+    # [Container types]
+
+    def __len__(self):
+        return self.__getattr__("__len__")
+
+    def __getitem__(self, key: object):
+        return self.__getattr__("__getitem__")
+
+    def __setitem__(self, key: object, value: object):
+        return self.__getattr__("__setitem__")
+
+    def __delitem__(self, key: object):
+        return self.__getattr__("__delitem__")
+
+    # __missing__ is optional according to __getitem__ specification,
+    # so it is skipped
+
+    # __iter__ and __reversed__ have a default implementation
+    # based on __len__ and __getitem__, so they are skipped.
+
+    # [Numeric Types]
+
+    def __add__(self, other: object):
+        return self.__getattr__("__add__")
+
+    def __sub__(self, other: object):
+        return self.__getattr__("__sub__")
+
+    def __mul__(self, other: object):
+        return self.__getattr__("__mul__")
+
+    def __matmul__(self, other: object):
+        return self.__getattr__("__matmul__")
+
+    def __truediv__(self, other: object):
+        return self.__getattr__("__truediv__")
+
+    def __floordiv__(self, other: object):
+        return self.__getattr__("__floordiv__")
+
+    def __mod__(self, other: object):
+        return self.__getattr__("__mod__")
+
+    def __divmod__(self, other: object):
+        return self.__getattr__("__divmod__")
+
+    def __pow__(self, other: object, modulo: object = ...):
+        return self.__getattr__("__pow__")
+
+    def __lshift__(self, other: object):
+        return self.__getattr__("__lshift__")
+
+    def __rshift__(self, other: object):
+        return self.__getattr__("__rshift__")
+
+    def __and__(self, other: object):
+        return self.__getattr__("__and__")
+
+    def __xor__(self, other: object):
+        return self.__getattr__("__xor__")
+
+    def __or__(self, other: object):
+        return self.__getattr__("__or__")
+
+    # r* and i* methods have lower priority than
+    # the methods for left operand so they are skipped
+
+    def __neg__(self):
+        return self.__getattr__("__neg__")
+
+    def __pos__(self):
+        return self.__getattr__("__pos__")
+
+    def __abs__(self):
+        return self.__getattr__("__abs__")
+
+    def __invert__(self):
+        return self.__getattr__("__invert__")
+
+    # __complex__, __int__ and __float__ have a default implementation
+    # based on __index__, so they are skipped.
+
+    def __index__(self):
+        return self.__getattr__("__index__")
+
+    def __round__(self, ndigits: object = ...):
+        return self.__getattr__("__round__")
+
+    def __trunc__(self):
+        return self.__getattr__("__trunc__")
+
+    def __floor__(self):
+        return self.__getattr__("__floor__")
+
+    def __ceil__(self):
+        return self.__getattr__("__ceil__")
+
+    # [Context managers]
+
+    def __enter__(self):
+        return self.__getattr__("__enter__")
+
+    def __exit__(self, *args: object, **kwargs: object):
+        return self.__getattr__("__exit__")
+
+
+class PlaceholderModule(_PlaceholderBase):
     """
     A placeholder object to use when a module does not exist.
 
     This enables more informative errors when trying to access attributes
     of a module that does not exists.
     """
-    name: str
+
+    def __init__(self, name: str) -> None:
+        super().__init__()
+
+        # Apply name mangling to avoid conflicting with module attributes
+        self.__name = name
 
     def placeholder_attr(self, attr_path: str):
         return _PlaceholderModuleAttr(self, attr_path)
 
     def __getattr__(self, key: str):
-        name = self.name
+        name = self.__name
 
         try:
-            importlib.import_module(self.name)
+            importlib.import_module(name)
         except ImportError as exc:
             for extra, names in get_vllm_optional_dependencies().items():
                 if name in names:
@@ -1657,17 +1816,21 @@ def __getattr__(self, key: str):
                              "when the original module can be imported")
 
 
-@dataclass(frozen=True)
-class _PlaceholderModuleAttr:
-    module: PlaceholderModule
-    attr_path: str
+class _PlaceholderModuleAttr(_PlaceholderBase):
+
+    def __init__(self, module: PlaceholderModule, attr_path: str) -> None:
+        super().__init__()
+
+        # Apply name mangling to avoid conflicting with module attributes
+        self.__module = module
+        self.__attr_path = attr_path
 
     def placeholder_attr(self, attr_path: str):
-        return _PlaceholderModuleAttr(self.module,
-                                      f"{self.attr_path}.{attr_path}")
+        return _PlaceholderModuleAttr(self.__module,
+                                      f"{self.__attr_path}.{attr_path}")
 
     def __getattr__(self, key: str):
-        getattr(self.module, f"{self.attr_path}.{key}")
+        getattr(self.__module, f"{self.__attr_path}.{key}")
 
         raise AssertionError("PlaceholderModule should not be used "
                              "when the original module can be imported")

From 1d967acb45d5d18434409b822f105f087e379eee Mon Sep 17 00:00:00 2001
From: "Ye (Charlotte) Qi" <ye.charlotte.qi@gmail.com>
Date: Thu, 9 Jan 2025 01:36:39 -0800
Subject: [PATCH 148/462] [Bugfix] fix beam search input errors and latency
 benchmark script (#11875)

Signed-off-by: Ye Qi <yeq@meta.com>
Co-authored-by: yeq <yeq@devgpu004.lla3.facebook.com>
---
 benchmarks/benchmark_latency.py | 23 +++++++++++++++++------
 vllm/entrypoints/llm.py         | 10 ++++++----
 2 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index e669ce4db299d..77c4f6aa927e4 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -13,6 +13,7 @@
 from vllm import LLM, SamplingParams
 from vllm.engine.arg_utils import EngineArgs
 from vllm.inputs import PromptType
+from vllm.sampling_params import BeamSearchParams
 from vllm.utils import FlexibleArgumentParser
 
 
@@ -40,6 +41,20 @@ def main(args: argparse.Namespace):
         "prompt_token_ids": batch
     } for batch in dummy_prompt_token_ids.tolist()]
 
+    def llm_generate():
+        if not args.use_beam_search:
+            llm.generate(dummy_prompts,
+                         sampling_params=sampling_params,
+                         use_tqdm=False)
+        else:
+            llm.beam_search(
+                dummy_prompts,
+                BeamSearchParams(
+                    beam_width=args.n,
+                    max_tokens=args.output_len,
+                    ignore_eos=True,
+                ))
+
     def run_to_completion(profile_dir: Optional[str] = None):
         if profile_dir:
             with torch.profiler.profile(
@@ -49,15 +64,11 @@ def run_to_completion(profile_dir: Optional[str] = None):
                     ],
                     on_trace_ready=torch.profiler.tensorboard_trace_handler(
                         str(profile_dir))) as p:
-                llm.generate(dummy_prompts,
-                             sampling_params=sampling_params,
-                             use_tqdm=False)
+                llm_generate()
             print(p.key_averages().table(sort_by="self_cuda_time_total"))
         else:
             start_time = time.perf_counter()
-            llm.generate(dummy_prompts,
-                         sampling_params=sampling_params,
-                         use_tqdm=False)
+            llm_generate()
             end_time = time.perf_counter()
             latency = end_time - start_time
             return latency
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index e48fd1a4fa5e9..acb4db85632a8 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -21,7 +21,7 @@
                                          parse_chat_messages,
                                          resolve_chat_template_content_format)
 from vllm.inputs import PromptType, SingletonPrompt, TextPrompt, TokensPrompt
-from vllm.inputs.parse import parse_and_batch_prompt
+from vllm.inputs.parse import is_token_prompt, parse_and_batch_prompt
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.model_executor.guided_decoding.guided_fields import (
@@ -457,7 +457,7 @@ def generate(
 
     def beam_search(
         self,
-        prompts: List[Union[str, List[int]]],
+        prompts: List[Union[TokensPrompt, TextPrompt]],
         params: BeamSearchParams,
     ) -> List[BeamSearchOutput]:
         """
@@ -493,8 +493,10 @@ def sort_beams_key(x: BeamSearchSequence) -> float:
         instances: List[BeamSearchInstance] = []
 
         for prompt in prompts:
-            prompt_tokens = prompt if isinstance(
-                prompt, list) else tokenizer.encode(prompt)
+            if is_token_prompt(prompt):
+                prompt_tokens = prompt["prompt_token_ids"]
+            else:
+                prompt_tokens = tokenizer.encode(prompt["prompt"])
             instances.append(BeamSearchInstance(prompt_tokens))
 
         for _ in range(max_tokens):

From 65097ca0af5c1d7caa3d9d8224fa8b4790a5f7bc Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Thu, 9 Jan 2025 17:43:40 +0800
Subject: [PATCH 149/462] [Doc] Add model development API Reference (#11884)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 .buildkite/test-pipeline.yaml                      |  2 +-
 docs/source/api/{params.md => inference_params.md} |  5 ++---
 docs/source/api/model/adapters.md                  |  9 +++++++++
 docs/source/api/model/index.md                     | 12 ++++++++++++
 docs/source/api/model/interfaces.md                |  9 +++++++++
 docs/source/api/model/interfaces_base.md           |  9 +++++++++
 docs/source/index.md                               |  3 ++-
 vllm/model_executor/models/interfaces.py           | 11 +++++++----
 vllm/model_executor/models/interfaces_base.py      |  3 +++
 9 files changed, 54 insertions(+), 9 deletions(-)
 rename docs/source/api/{params.md => inference_params.md} (79%)
 create mode 100644 docs/source/api/model/adapters.md
 create mode 100644 docs/source/api/model/index.md
 create mode 100644 docs/source/api/model/interfaces.md
 create mode 100644 docs/source/api/model/interfaces_base.md

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index f883595f6d9ad..e288f8f30159a 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -38,7 +38,7 @@ steps:
   - pip install -r requirements-docs.txt
   - SPHINXOPTS=\"-W\" make html
   # Check API reference (if it fails, you may have missing mock imports)
-  - grep \"sig sig-object py\" build/html/api/params.html
+  - grep \"sig sig-object py\" build/html/api/inference_params.html
 
 - label: Async Engine, Inputs, Utils, Worker Test # 24min
   fast_check: true
diff --git a/docs/source/api/params.md b/docs/source/api/inference_params.md
similarity index 79%
rename from docs/source/api/params.md
rename to docs/source/api/inference_params.md
index a3b4d9cbb44ec..181c30cab9c4a 100644
--- a/docs/source/api/params.md
+++ b/docs/source/api/inference_params.md
@@ -1,6 +1,6 @@
-# Optional Parameters
+# Inference Parameters
 
-Optional parameters for vLLM APIs.
+Inference parameters for vLLM APIs.
 
 (sampling-params)=
 
@@ -19,4 +19,3 @@ Optional parameters for vLLM APIs.
 .. autoclass:: vllm.PoolingParams
     :members:
 ```
-
diff --git a/docs/source/api/model/adapters.md b/docs/source/api/model/adapters.md
new file mode 100644
index 0000000000000..e103a51d0070d
--- /dev/null
+++ b/docs/source/api/model/adapters.md
@@ -0,0 +1,9 @@
+# Model Adapters
+
+## Module Contents
+
+```{eval-rst}
+.. automodule:: vllm.model_executor.models.adapters
+    :members:
+    :member-order: bysource
+```
diff --git a/docs/source/api/model/index.md b/docs/source/api/model/index.md
new file mode 100644
index 0000000000000..b8437e3c3517a
--- /dev/null
+++ b/docs/source/api/model/index.md
@@ -0,0 +1,12 @@
+# Model Development
+
+## Submodules
+
+```{toctree}
+:maxdepth: 1
+
+interfaces_base
+interfaces
+adapters
+```
+
diff --git a/docs/source/api/model/interfaces.md b/docs/source/api/model/interfaces.md
new file mode 100644
index 0000000000000..55bee57f64faa
--- /dev/null
+++ b/docs/source/api/model/interfaces.md
@@ -0,0 +1,9 @@
+# Optional Interfaces
+
+## Module Contents
+
+```{eval-rst}
+.. automodule:: vllm.model_executor.models.interfaces
+    :members:
+    :member-order: bysource
+```
diff --git a/docs/source/api/model/interfaces_base.md b/docs/source/api/model/interfaces_base.md
new file mode 100644
index 0000000000000..75d58d34228e9
--- /dev/null
+++ b/docs/source/api/model/interfaces_base.md
@@ -0,0 +1,9 @@
+# Base Model Interfaces
+
+## Module Contents
+
+```{eval-rst}
+.. automodule:: vllm.model_executor.models.interfaces_base
+    :members:
+    :member-order: bysource
+```
diff --git a/docs/source/index.md b/docs/source/index.md
index 6747a7fcce4fe..23e4304fe29d9 100644
--- a/docs/source/index.md
+++ b/docs/source/index.md
@@ -139,8 +139,9 @@ community/sponsors
 
 api/offline_inference/index
 api/engine/index
+api/inference_params
 api/multimodal/index
-api/params
+api/model/index
 ```
 
 % Design Documents: Details about vLLM internals
diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py
index 6f26603046483..b51cba86ec1a4 100644
--- a/vllm/model_executor/models/interfaces.py
+++ b/vllm/model_executor/models/interfaces.py
@@ -38,13 +38,15 @@ def get_multimodal_embeddings(self, **kwargs) -> Optional[T]:
         to be merged with text embeddings.
 
         The output embeddings must be one of the following formats:
+    
         - A list or tuple of 2D tensors, where each tensor corresponds to 
-          each input multimodal data item (e.g, image).
+            each input multimodal data item (e.g, image).
         - A single 3D tensor, with the batch dimension grouping the 2D tensors.
 
-        NOTE: The returned multimodal embeddings must be in the same order as 
-        the appearances of their corresponding multimodal data item in the 
-        input prompt.
+        Note:
+            The returned multimodal embeddings must be in the same order as 
+            the appearances of their corresponding multimodal data item in the 
+            input prompt.
         """
         ...
 
@@ -59,6 +61,7 @@ def get_input_embeddings(
     ) -> torch.Tensor:
         ...
 
+    @overload
     def get_input_embeddings(
         self,
         input_ids: torch.Tensor,
diff --git a/vllm/model_executor/models/interfaces_base.py b/vllm/model_executor/models/interfaces_base.py
index de733b6d49a53..4c353ae6ffc13 100644
--- a/vllm/model_executor/models/interfaces_base.py
+++ b/vllm/model_executor/models/interfaces_base.py
@@ -35,6 +35,7 @@
 
 @runtime_checkable
 class VllmModel(Protocol[C_co, T_co]):
+    """The interface required for all models in vLLM."""
 
     def __init__(
         self,
@@ -97,6 +98,7 @@ def is_vllm_model(
 
 @runtime_checkable
 class VllmModelForTextGeneration(VllmModel[C_co, T], Protocol[C_co, T]):
+    """The interface required for all generative models in vLLM."""
 
     def compute_logits(
         self,
@@ -142,6 +144,7 @@ def is_text_generation_model(
 
 @runtime_checkable
 class VllmModelForPooling(VllmModel[C_co, T], Protocol[C_co, T]):
+    """The interface required for all pooling models in vLLM."""
 
     def pooler(
         self,

From 405eb8e3967eb9bd263b3919796cb3b45a2931d3 Mon Sep 17 00:00:00 2001
From: wangxiyuan <wangxiyuan1007@gmail.com>
Date: Thu, 9 Jan 2025 21:46:50 +0800
Subject: [PATCH 150/462] [platform] Allow platform specify attention backend
 (#11609)

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Signed-off-by: Mengqing Cao <cmq0113@163.com>
Co-authored-by: Mengqing Cao <cmq0113@163.com>
---
 tests/kernels/test_attention_selector.py |  74 ++++++------
 vllm/attention/selector.py               | 139 ++---------------------
 vllm/platforms/cpu.py                    |   7 +-
 vllm/platforms/cuda.py                   |  77 ++++++++++++-
 vllm/platforms/hpu.py                    |   7 +-
 vllm/platforms/interface.py              |   8 +-
 vllm/platforms/openvino.py               |   7 +-
 vllm/platforms/rocm.py                   |   6 +-
 vllm/platforms/tpu.py                    |   7 +-
 vllm/platforms/xpu.py                    |   7 +-
 10 files changed, 164 insertions(+), 175 deletions(-)

diff --git a/tests/kernels/test_attention_selector.py b/tests/kernels/test_attention_selector.py
index 916cc2efa3895..a08c874407e3f 100644
--- a/tests/kernels/test_attention_selector.py
+++ b/tests/kernels/test_attention_selector.py
@@ -1,10 +1,10 @@
-from unittest.mock import patch
+from unittest.mock import Mock, patch
 
 import pytest
 import torch
 
 from tests.kernels.utils import override_backend_env_variable
-from vllm.attention.selector import which_attn_to_use
+from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend
 from vllm.platforms.cpu import CpuPlatform
 from vllm.platforms.cuda import CudaPlatform
 from vllm.platforms.openvino import OpenVinoPlatform
@@ -12,6 +12,13 @@
 from vllm.utils import STR_FLASH_ATTN_VAL, STR_INVALID_VAL
 
 
+@pytest.fixture(autouse=True)
+def clear_cache():
+    """Clear lru cache to ensure each test case runs without caching.
+    """
+    _cached_get_attn_backend.cache_clear()
+
+
 @pytest.mark.parametrize(
     "name", ["TORCH_SDPA", "ROCM_FLASH", "XFORMERS", "FLASHINFER", "OPENVINO"])
 @pytest.mark.parametrize("device", ["cpu", "openvino", "hip", "cuda"])
@@ -24,67 +31,70 @@ def test_env(name: str, device: str, monkeypatch):
 
     if device == "cpu":
         with patch("vllm.attention.selector.current_platform", CpuPlatform()):
-            backend = which_attn_to_use(16, torch.float16, torch.float16, 16,
-                                        False)
-        assert backend.name == "TORCH_SDPA"
+            backend = get_attn_backend(16, torch.float16, torch.float16, 16,
+                                       False)
+        assert backend.get_name() == "TORCH_SDPA"
     elif device == "hip":
         with patch("vllm.attention.selector.current_platform", RocmPlatform()):
-            backend = which_attn_to_use(16, torch.float16, torch.float16, 16,
-                                        False)
-        assert backend.name == "ROCM_FLASH"
+            backend = get_attn_backend(16, torch.float16, torch.float16, 16,
+                                       False)
+        assert backend.get_name() == "ROCM_FLASH"
     elif device == "openvino":
         with patch("vllm.attention.selector.current_platform",
-                   OpenVinoPlatform()):
-            backend = which_attn_to_use(16, torch.float16, torch.float16, 16,
-                                        False)
-        assert backend.name == "OPENVINO"
+                   OpenVinoPlatform()), patch.dict('sys.modules',
+                                                   {'openvino': Mock()}):
+            backend = get_attn_backend(16, torch.float16, torch.float16, 16,
+                                       False)
+        assert backend.get_name() == "OPENVINO"
     else:
-        with patch("vllm.attention.selector.current_platform", CudaPlatform()):
-            backend = which_attn_to_use(16, torch.float16, torch.float16, 16,
-                                        False)
-        assert backend.name == name
+        if name in ["XFORMERS", "FLASHINFER"]:
+            with patch("vllm.attention.selector.current_platform",
+                       CudaPlatform()):
+                backend = get_attn_backend(16, torch.float16, torch.float16,
+                                           16, False)
+            assert backend.get_name() == name
 
 
 def test_flash_attn(monkeypatch):
     """Test FlashAttn validation."""
     # TODO: When testing for v1, pipe in `use_v1` as an argument to
-    # which_attn_to_use
+    # get_attn_backend
 
     override_backend_env_variable(monkeypatch, STR_FLASH_ATTN_VAL)
 
     # Unsupported CUDA arch
     with patch("torch.cuda.get_device_capability", return_value=(7, 5)):
-        backend = which_attn_to_use(16, torch.float16, None, 16, False)
-        assert backend.name != STR_FLASH_ATTN_VAL
+        backend = get_attn_backend(16, torch.float16, None, 16, False)
+        assert backend.get_name() != STR_FLASH_ATTN_VAL
 
     # Unsupported data type
-    backend = which_attn_to_use(16, torch.float8_e4m3fn, None, 16, False)
-    assert backend.name != STR_FLASH_ATTN_VAL
+    backend = get_attn_backend(16, torch.float8_e4m3fn, None, 16, False)
+    assert backend.get_name() != STR_FLASH_ATTN_VAL
 
     # Unsupported kv cache data type
-    backend = which_attn_to_use(16, torch.float16, "fp8", 16, False)
-    assert backend.name != STR_FLASH_ATTN_VAL
+    backend = get_attn_backend(16, torch.float16, "fp8", 16, False)
+    assert backend.get_name() != STR_FLASH_ATTN_VAL
 
     # Unsupported block size
-    backend = which_attn_to_use(16, torch.float16, None, 8, False)
-    assert backend.name != STR_FLASH_ATTN_VAL
+    backend = get_attn_backend(16, torch.float16, None, 8, False)
+    assert backend.get_name() != STR_FLASH_ATTN_VAL
 
     # flash-attn is not installed
     with patch.dict('sys.modules', {'vllm_flash_attn': None}):
-        backend = which_attn_to_use(16, torch.float16, None, 16, False)
-        assert backend.name != STR_FLASH_ATTN_VAL
+        backend = get_attn_backend(16, torch.float16, None, 16, False)
+        assert backend.get_name() != STR_FLASH_ATTN_VAL
 
     # Unsupported head size
-    backend = which_attn_to_use(17, torch.float16, None, 16, False)
-    assert backend.name != STR_FLASH_ATTN_VAL
+    backend = get_attn_backend(17, torch.float16, None, 16, False)
+    assert backend.get_name() != STR_FLASH_ATTN_VAL
 
     # Attention-free models should bypass env and use PlaceholderAttention
-    backend = which_attn_to_use(16, torch.float16, torch.float16, 16, True)
-    assert backend.name != STR_FLASH_ATTN_VAL
+    backend = get_attn_backend(16, torch.float16, torch.float16, 16, True)
+    assert backend.get_name() != STR_FLASH_ATTN_VAL
 
 
 def test_invalid_env(monkeypatch):
     """Throw an exception if the backend name is invalid."""
     override_backend_env_variable(monkeypatch, STR_INVALID_VAL)
     with pytest.raises(ValueError):
-        which_attn_to_use(16, torch.float16, None, 16, False)
+        get_attn_backend(16, torch.float16, None, 16, False)
diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py
index d263839705690..0ff007c87b1c9 100644
--- a/vllm/attention/selector.py
+++ b/vllm/attention/selector.py
@@ -9,7 +9,7 @@
 from vllm.attention.backends.abstract import AttentionBackend
 from vllm.logger import init_logger
 from vllm.platforms import _Backend, current_platform
-from vllm.utils import STR_BACKEND_ENV_VAR
+from vllm.utils import STR_BACKEND_ENV_VAR, resolve_obj_by_qualname
 
 logger = init_logger(__name__)
 
@@ -114,83 +114,19 @@ def _cached_get_attn_backend(
             BlocksparseFlashAttentionBackend)
         return BlocksparseFlashAttentionBackend
 
-    backend = which_attn_to_use(head_size, dtype, kv_cache_dtype, block_size,
-                                is_attention_free, use_v1)
-    if backend == _Backend.FLASH_ATTN:
-        logger.info("Using Flash Attention backend.")
-        from vllm.attention.backends.flash_attn import (  # noqa: F401
-            FlashAttentionBackend)
-        return FlashAttentionBackend
-    if backend == _Backend.FLASH_ATTN_VLLM_V1:
-        from vllm.v1.attention.backends.flash_attn import (  # noqa: F401
-            FlashAttentionBackend as FlashAttentionBackendV1)
-        return FlashAttentionBackendV1
-    if backend == _Backend.XFORMERS:
-        logger.info("Using XFormers backend.")
-        from vllm.attention.backends.xformers import (  # noqa: F401
-            XFormersBackend)
-        return XFormersBackend
-    elif backend == _Backend.ROCM_FLASH:
-        logger.info("Using ROCmFlashAttention backend.")
-        from vllm.attention.backends.rocm_flash_attn import (  # noqa: F401
-            ROCmFlashAttentionBackend)
-        return ROCmFlashAttentionBackend
-    elif backend == _Backend.TORCH_SDPA:
-        assert current_platform.is_cpu(), RuntimeError(
-            "Torch SDPA backend is only used for the CPU device.")
-        logger.info("Using Torch SDPA backend.")
-        from vllm.attention.backends.torch_sdpa import TorchSDPABackend
-        return TorchSDPABackend
-    elif backend == _Backend.OPENVINO:
-        logger.info("Using OpenVINO Attention backend.")
-        from vllm.attention.backends.openvino import OpenVINOAttentionBackend
-        return OpenVINOAttentionBackend
-    elif backend == _Backend.IPEX:
-        assert current_platform.is_xpu(), RuntimeError(
-            "IPEX attention backend is only used for the XPU device.")
-        logger.info("Using IPEX attention backend.")
-        from vllm.attention.backends.ipex_attn import IpexAttnBackend
-        return IpexAttnBackend
-    elif backend == _Backend.FLASHINFER:
-        logger.info("Using Flashinfer backend.")
-        from vllm.attention.backends.flashinfer import FlashInferBackend
-        return FlashInferBackend
-    elif backend == _Backend.HPU_ATTN:
-        logger.info("Using HPUAttention backend.")
-        from vllm.attention.backends.hpu_attn import HPUAttentionBackend
-        return HPUAttentionBackend
-    elif backend == _Backend.PALLAS:
-        logger.info("Using Pallas backend.")
-        from vllm.attention.backends.pallas import PallasAttentionBackend
-        return PallasAttentionBackend
-    elif backend == _Backend.NO_ATTENTION:
-        from vllm.attention.backends.placeholder_attn import (
-            PlaceholderAttentionBackend)
-        return PlaceholderAttentionBackend
-    else:
-        raise ValueError("Invalid attention backend.")
-
-
-def which_attn_to_use(head_size: int,
-                      dtype: torch.dtype,
-                      kv_cache_dtype: Optional[str],
-                      block_size: int,
-                      is_attention_free: bool,
-                      use_v1: bool = False) -> _Backend:
-    """Returns which flash attention backend to use."""
-    # Default case.
-    selected_backend = _Backend.FLASH_ATTN
-
     # If there are no attention layers (e.g. we are running Mamba),
     # use the placeholder NO_ATTENTION
     if is_attention_free:
-        return _Backend.NO_ATTENTION
+        from vllm.attention.backends.placeholder_attn import (
+            PlaceholderAttentionBackend)
+        return PlaceholderAttentionBackend
 
     # Check whether a particular choice of backend was
     # previously forced.
     #
     # THIS SELECTION OVERRIDES THE VLLM_ATTENTION_BACKEND
     # ENVIRONMENT VARIABLE.
+    selected_backend = None
     backend_by_global_setting: Optional[_Backend] = (
         get_global_forced_attn_backend())
     if backend_by_global_setting is not None:
@@ -201,64 +137,13 @@ def which_attn_to_use(head_size: int,
         if backend_by_env_var is not None:
             selected_backend = backend_name_to_enum(backend_by_env_var)
 
-    # get device-specific default attn_backend
-    default_backend = current_platform.get_default_attn_backend(
-        selected_backend)
-    if default_backend is not None:
-        return default_backend
-
-    if use_v1:
-        return _Backend.FLASH_ATTN_VLLM_V1
-
-    # FlashAttn in NVIDIA GPUs.
-    if selected_backend == _Backend.FLASH_ATTN:
-        if not current_platform.has_device_capability(80):
-            # Volta and Turing NVIDIA GPUs.
-            logger.info(
-                "Cannot use FlashAttention-2 backend for Volta and Turing "
-                "GPUs.")
-            selected_backend = _Backend.XFORMERS
-        elif dtype not in (torch.float16, torch.bfloat16):
-            logger.info(
-                "Cannot use FlashAttention-2 backend for dtype other than "
-                "torch.float16 or torch.bfloat16.")
-            selected_backend = _Backend.XFORMERS
-        elif kv_cache_dtype is not None and kv_cache_dtype.startswith("fp8"):
-            logger.info(
-                "Cannot use FlashAttention-2 backend for FP8 KV cache.")
-            logger.warning(
-                "Please use FlashInfer backend with FP8 KV Cache for "
-                "better performance by setting environment variable  "
-                "VLLM_ATTENTION_BACKEND=FLASHINFER")
-            selected_backend = _Backend.XFORMERS
-        elif block_size % 16 != 0:
-            logger.info(
-                "Cannot use FlashAttention-2 backend for block size not "
-                "divisible by 16.")
-            selected_backend = _Backend.XFORMERS
-
-    # FlashAttn is valid for the model, checking if the package is installed.
-    if selected_backend == _Backend.FLASH_ATTN:
-        try:
-            import vllm.vllm_flash_attn  # noqa: F401
-            from vllm.attention.backends.flash_attn import (  # noqa: F401
-                FlashAttentionBackend)
-
-            supported_sizes = FlashAttentionBackend.get_supported_head_sizes()
-            if head_size not in supported_sizes:
-                logger.info(
-                    "Cannot use FlashAttention-2 backend for head size %d.",
-                    head_size)
-                selected_backend = _Backend.XFORMERS
-        except ImportError:
-            logger.info(
-                "Cannot use FlashAttention-2 backend because the "
-                "vllm.vllm_flash_attn package is not found. "
-                "Make sure that vllm_flash_attn was built and installed "
-                "(on by default).")
-            selected_backend = _Backend.XFORMERS
-
-    return selected_backend
+    # get device-specific attn_backend
+    attention_cls = current_platform.get_attn_backend_cls(
+        selected_backend, head_size, dtype, kv_cache_dtype, block_size, use_v1)
+    if not attention_cls:
+        raise ValueError(
+            f"Invalid attention backend for {current_platform.device_name}")
+    return resolve_obj_by_qualname(attention_cls)
 
 
 @contextmanager
diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py
index 7ba7f5150150c..eb3e269cac285 100644
--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
@@ -28,10 +28,13 @@ def get_device_name(cls, device_id: int = 0) -> str:
         return "cpu"
 
     @classmethod
-    def get_default_attn_backend(cls, selected_backend: _Backend) -> _Backend:
+    def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
+                             dtype: torch.dtype, kv_cache_dtype: Optional[str],
+                             block_size: int, use_v1: bool) -> str:
         if selected_backend != _Backend.TORCH_SDPA:
             logger.info("Cannot use %s backend on CPU.", selected_backend)
-        return _Backend.TORCH_SDPA
+        logger.info("Using Torch SDPA backend.")
+        return "vllm.attention.backends.torch_sdpa.TorchSDPABackend"
 
     @classmethod
     def get_device_total_memory(cls, device_id: int = 0) -> int:
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index 3c5350b778345..23ceac83e49de 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -16,7 +16,7 @@
 import vllm.envs as envs
 from vllm.logger import init_logger
 
-from .interface import DeviceCapability, Platform, PlatformEnum
+from .interface import DeviceCapability, Platform, PlatformEnum, _Backend
 
 if TYPE_CHECKING:
     from vllm.config import VllmConfig
@@ -141,6 +141,81 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         if cache_config and cache_config.block_size is None:
             cache_config.block_size = 16
 
+    @classmethod
+    def get_attn_backend_cls(cls, selected_backend, head_size, dtype,
+                             kv_cache_dtype, block_size, use_v1) -> str:
+        if use_v1:
+            logger.info("Using Flash Attention backend on V1 engine.")
+            return "vllm.v1.attention.backends.flash_attn.FlashAttentionBackend"
+        if selected_backend == _Backend.FLASHINFER:
+            logger.info("Using FlashInfer backend.")
+            return "vllm.attention.backends.flashinfer.FlashInferBackend"
+        elif selected_backend == _Backend.XFORMERS:
+            logger.info("Using XFormers backend.")
+            return "vllm.attention.backends.xformers.XFormersBackend"
+        elif selected_backend == _Backend.FLASH_ATTN:
+            pass
+        elif selected_backend:
+            raise ValueError(
+                f"Invalid attention backend for {cls.device_name}")
+
+        target_backend = _Backend.FLASH_ATTN
+        if not cls.has_device_capability(80):
+            # Volta and Turing NVIDIA GPUs.
+            logger.info(
+                "Cannot use FlashAttention-2 backend for Volta and Turing "
+                "GPUs.")
+            target_backend = _Backend.XFORMERS
+        elif dtype not in (torch.float16, torch.bfloat16):
+            logger.info(
+                "Cannot use FlashAttention-2 backend for dtype other than "
+                "torch.float16 or torch.bfloat16.")
+            target_backend = _Backend.XFORMERS
+        elif kv_cache_dtype is not None and \
+            kv_cache_dtype.startswith("fp8"):
+            logger.info(
+                "Cannot use FlashAttention-2 backend for FP8 KV cache.")
+            logger.warning(
+                "Please use FlashInfer backend with FP8 KV Cache for "
+                "better performance by setting environment variable  "
+                "VLLM_ATTENTION_BACKEND=FLASHINFER")
+            target_backend = _Backend.XFORMERS
+        elif block_size % 16 != 0:
+            logger.info(
+                "Cannot use FlashAttention-2 backend for block size not "
+                "divisible by 16.")
+            target_backend = _Backend.XFORMERS
+
+        # FlashAttn is valid for the model, checking if the package is
+        # installed.
+        if target_backend == _Backend.FLASH_ATTN:
+            try:
+                import vllm.vllm_flash_attn  # noqa: F401
+                from vllm.attention.backends.flash_attn import (  # noqa: F401
+                    FlashAttentionBackend)
+
+                supported_sizes = \
+                    FlashAttentionBackend.get_supported_head_sizes()
+                if head_size not in supported_sizes:
+                    logger.info(
+                        "Cannot use FlashAttention-2 backend for head size %d.",
+                        head_size)
+                    target_backend = _Backend.XFORMERS
+            except ImportError:
+                logger.info(
+                    "Cannot use FlashAttention-2 backend because the "
+                    "vllm.vllm_flash_attn package is not found. "
+                    "Make sure that vllm_flash_attn was built and installed "
+                    "(on by default).")
+                target_backend = _Backend.XFORMERS
+
+        if target_backend == _Backend.XFORMERS:
+            logger.info("Using XFormers backend.")
+            return "vllm.attention.backends.xformers.XFormersBackend"
+
+        logger.info("Using Flash Attention backend.")
+        return "vllm.attention.backends.flash_attn.FlashAttentionBackend"
+
 
 # NVML utils
 # Note that NVML is not affected by `CUDA_VISIBLE_DEVICES`,
diff --git a/vllm/platforms/hpu.py b/vllm/platforms/hpu.py
index 0a44f2b74163a..8152d881fa8d9 100644
--- a/vllm/platforms/hpu.py
+++ b/vllm/platforms/hpu.py
@@ -21,8 +21,11 @@ class HpuPlatform(Platform):
     dispatch_key: str = "HPU"
 
     @classmethod
-    def get_default_attn_backend(cls, selected_backend: _Backend) -> _Backend:
-        return _Backend.HPU_ATTN
+    def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
+                             dtype: torch.dtype, kv_cache_dtype: Optional[str],
+                             block_size: int, use_v1: bool) -> str:
+        logger.info("Using HPUAttention backend.")
+        return "vllm.attention.backends.hpu_attn.HPUAttentionBackend"
 
     @classmethod
     def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
index ddccaa2ce0148..f440358f65fbb 100644
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -112,9 +112,11 @@ def is_cuda_alike(self) -> bool:
         return self._enum in (PlatformEnum.CUDA, PlatformEnum.ROCM)
 
     @classmethod
-    def get_default_attn_backend(cls, selected_backend: _Backend):
-        """Get the default attention backend of a device."""
-        return None
+    def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
+                             dtype: torch.dtype, kv_cache_dtype: Optional[str],
+                             block_size: int, use_v1: bool) -> str:
+        """Get the attention backend class of a device."""
+        return ""
 
     @classmethod
     def get_device_capability(
diff --git a/vllm/platforms/openvino.py b/vllm/platforms/openvino.py
index 16eb8dc81efc2..9390eda535c8f 100644
--- a/vllm/platforms/openvino.py
+++ b/vllm/platforms/openvino.py
@@ -28,10 +28,13 @@ class OpenVinoPlatform(Platform):
     dispatch_key: str = "CPU"
 
     @classmethod
-    def get_default_attn_backend(cls, selected_backend: _Backend) -> _Backend:
+    def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
+                             dtype: torch.dtype, kv_cache_dtype: Optional[str],
+                             block_size: int, use_v1: bool) -> str:
         if selected_backend != _Backend.OPENVINO:
             logger.info("Cannot use %s backend on OpenVINO.", selected_backend)
-        return _Backend.OPENVINO
+        logger.info("Using OpenVINO Attention backend.")
+        return "vllm.attention.backends.openvino.OpenVINOAttentionBackend"
 
     @classmethod
     def get_device_name(cls, device_id: int = 0) -> str:
diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index aa779f265135f..1c2f602efc856 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -70,7 +70,8 @@ class RocmPlatform(Platform):
     ]
 
     @classmethod
-    def get_default_attn_backend(cls, selected_backend: _Backend) -> _Backend:
+    def get_attn_backend_cls(cls, selected_backend, head_size, dtype,
+                             kv_cache_dtype, block_size, use_v1) -> str:
         selected_backend = (_Backend.ROCM_FLASH if selected_backend
                             == _Backend.FLASH_ATTN else selected_backend)
         if selected_backend == _Backend.ROCM_FLASH:
@@ -79,7 +80,8 @@ def get_default_attn_backend(cls, selected_backend: _Backend) -> _Backend:
                 logger.info("flash_attn is not supported on NAVI GPUs.")
         else:
             logger.info("%s is not supported in AMD GPUs.", selected_backend)
-        return _Backend.ROCM_FLASH
+        logger.info("Using ROCmFlashAttention backend.")
+        return "vllm.attention.backends.rocm_flash_attn.ROCmFlashAttentionBackend"  # noqa: E501
 
     @classmethod
     @lru_cache(maxsize=8)
diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py
index d488daf056f1a..8a59b53ca4b15 100644
--- a/vllm/platforms/tpu.py
+++ b/vllm/platforms/tpu.py
@@ -24,10 +24,13 @@ class TpuPlatform(Platform):
     ]
 
     @classmethod
-    def get_default_attn_backend(cls, selected_backend: _Backend) -> _Backend:
+    def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
+                             dtype: torch.dtype, kv_cache_dtype: Optional[str],
+                             block_size: int, use_v1: bool) -> str:
         if selected_backend != _Backend.PALLAS:
             logger.info("Cannot use %s backend on TPU.", selected_backend)
-        return _Backend.PALLAS
+        logger.info("Using Pallas backend.")
+        return "vllm.attention.backends.pallas.PallasAttentionBackend"
 
     @classmethod
     def get_device_name(cls, device_id: int = 0) -> str:
diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py
index 78e17c2afec65..00692a5d23031 100644
--- a/vllm/platforms/xpu.py
+++ b/vllm/platforms/xpu.py
@@ -21,10 +21,13 @@ class XPUPlatform(Platform):
     dispatch_key: str = "XPU"
 
     @classmethod
-    def get_default_attn_backend(cls, selected_backend: _Backend) -> _Backend:
+    def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
+                             dtype: torch.dtype, kv_cache_dtype: Optional[str],
+                             block_size: int, use_v1: bool) -> str:
         if selected_backend != _Backend.IPEX:
             logger.info("Cannot use %s backend on XPU.", selected_backend)
-        return _Backend.IPEX
+        logger.info("Using IPEX attention backend.")
+        return "vllm.attention.backends.ipex_attn.IpexAttnBackend"
 
     @staticmethod
     def get_device_capability(device_id: int = 0) -> DeviceCapability:

From bd8287221187279c668ac10c3edd5242b8d8b429 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Thu, 9 Jan 2025 22:47:29 +0800
Subject: [PATCH 151/462] [ci]try to fix flaky multi-step tests (#11894)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 tests/multi_step/test_correctness_async_llm.py | 3 +--
 tests/utils.py                                 | 9 +++++++--
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/tests/multi_step/test_correctness_async_llm.py b/tests/multi_step/test_correctness_async_llm.py
index 7203d635c2fa8..8456a463adeeb 100644
--- a/tests/multi_step/test_correctness_async_llm.py
+++ b/tests/multi_step/test_correctness_async_llm.py
@@ -16,7 +16,6 @@
 NUM_PROMPTS = [10]
 
 DEFAULT_SERVER_ARGS: List[str] = [
-    "--disable-log-requests",
     "--worker-use-ray",
     "--gpu-memory-utilization",
     "0.85",
@@ -110,7 +109,7 @@ async def test_multi_step(
 
     # Spin up client/server & issue completion API requests.
     # Default `max_wait_seconds` is 240 but was empirically
-    # was raised 3x to 720 *just for this test* due to
+    # was raised 5x to 1200 *just for this test* due to
     # observed timeouts in GHA CI
     ref_completions = await completions_with_server_args(
         prompts,
diff --git a/tests/utils.py b/tests/utils.py
index bf3d88194e4ca..f4eecf19e8c64 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -157,13 +157,19 @@ def url_root(self) -> str:
     def url_for(self, *parts: str) -> str:
         return self.url_root + "/" + "/".join(parts)
 
-    def get_client(self):
+    def get_client(self, **kwargs):
+        if "timeout" not in kwargs:
+            kwargs["timeout"] = 600
         return openai.OpenAI(
             base_url=self.url_for("v1"),
             api_key=self.DUMMY_API_KEY,
+            max_retries=0,
+            **kwargs,
         )
 
     def get_async_client(self, **kwargs):
+        if "timeout" not in kwargs:
+            kwargs["timeout"] = 600
         return openai.AsyncOpenAI(base_url=self.url_for("v1"),
                                   api_key=self.DUMMY_API_KEY,
                                   max_retries=0,
@@ -780,7 +786,6 @@ async def completions_with_server_args(
     assert len(max_tokens) == len(prompts)
 
     outputs = None
-    max_wait_seconds = 240 * 3  # 240 is default
     with RemoteOpenAIServer(model_name,
                             server_cli_args,
                             max_wait_seconds=max_wait_seconds) as server:

From 9a228348d2f9a2c85dfc67d6b9fe883bf10a4680 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Fri, 10 Jan 2025 01:19:37 +0800
Subject: [PATCH 152/462] [Misc] Provide correct Pixtral-HF chat template
 (#11891)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 docs/source/models/supported_models.md | 61 ++++++++++++++------------
 examples/template_pixtral_hf.jinja     | 38 ++++++++++++++++
 tests/entrypoints/test_chat_utils.py   |  1 +
 3 files changed, 73 insertions(+), 27 deletions(-)
 create mode 100644 examples/template_pixtral_hf.jinja

diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md
index 3ba34c77205e5..acbe27a22a679 100644
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -322,7 +322,7 @@ See [this page](#generative-models) for more information on how to use generativ
   - ✅︎
   - ✅︎
 * - `Qwen2ForCausalLM`
-  - Qwen2
+  - QwQ, Qwen2
   - `Qwen/QwQ-32B-Preview`, `Qwen/Qwen2-7B-Instruct`, `Qwen/Qwen2-7B`, etc.
   - ✅︎
   - ✅︎
@@ -436,7 +436,7 @@ loaded. See [relevant issue on HF Transformers](https://github.com/huggingface/t
 ```
 
 If your model is not in the above list, we will try to automatically convert the model using
-{func}`vllm.model_executor.models.adapters.as_embedding_model`. By default, the embeddings
+{func}`~vllm.model_executor.models.adapters.as_embedding_model`. By default, the embeddings
 of the whole prompt are extracted from the normalized hidden state corresponding to the last token.
 
 #### Reward Modeling (`--task reward`)
@@ -468,7 +468,7 @@ of the whole prompt are extracted from the normalized hidden state corresponding
 ```
 
 If your model is not in the above list, we will try to automatically convert the model using
-{func}`vllm.model_executor.models.adapters.as_reward_model`. By default, we return the hidden states of each token directly.
+{func}`~vllm.model_executor.models.adapters.as_reward_model`. By default, we return the hidden states of each token directly.
 
 ```{important}
 For process-supervised reward models such as `peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly,
@@ -499,7 +499,7 @@ e.g.: `--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "r
 ```
 
 If your model is not in the above list, we will try to automatically convert the model using
-{func}`vllm.model_executor.models.adapters.as_classification_model`. By default, the class probabilities are extracted from the softmaxed hidden state corresponding to the last token.
+{func}`~vllm.model_executor.models.adapters.as_classification_model`. By default, the class probabilities are extracted from the softmaxed hidden state corresponding to the last token.
 
 #### Sentence Pair Scoring (`--task score`)
 
@@ -550,6 +550,28 @@ On the other hand, modalities separated by `/` are mutually exclusive.
 
 See [this page](#multimodal-inputs) on how to pass multi-modal inputs to the model.
 
+````{important}
+To enable multiple multi-modal items per text prompt, you have to set `limit_mm_per_prompt` (offline inference)
+or `--limit-mm-per-prompt` (online inference). For example, to enable passing up to 4 images per text prompt:
+
+Offline inference:
+```python
+llm = LLM(
+    model="Qwen/Qwen2-VL-7B-Instruct",
+    limit_mm_per_prompt={"image": 4},
+)
+```
+
+Online inference:
+```bash
+vllm serve Qwen/Qwen2-VL-7B-Instruct --limit-mm-per-prompt image=4
+```
+````
+
+```{note}
+vLLM currently only supports adding LoRA to the language backbone of multimodal models.
+```
+
 ### Generative Models
 
 See [this page](#generative-models) for more information on how to use generative models.
@@ -689,14 +711,14 @@ See [this page](#generative-models) for more information on how to use generativ
 * - `Phi3VForCausalLM`
   - Phi-3-Vision, Phi-3.5-Vision
   - T + I<sup>E+</sup>
-  - `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct` etc.
+  - `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc.
   -
   - ✅︎
   - ✅︎
 * - `PixtralForConditionalGeneration`
   - Pixtral
   - T + I<sup>+</sup>
-  - `mistralai/Pixtral-12B-2409`, `mistral-community/pixtral-12b` etc.
+  - `mistralai/Pixtral-12B-2409`, `mistral-community/pixtral-12b` (see note), etc.
   -
   - ✅︎
   - ✅︎
@@ -715,7 +737,7 @@ See [this page](#generative-models) for more information on how to use generativ
   - ✅︎
   - ✅︎
 * - `Qwen2VLForConditionalGeneration`
-  - Qwen2-VL
+  - QVQ, Qwen2-VL
   - T + I<sup>E+</sup> + V<sup>E+</sup>
   - `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc.
   - ✅︎
@@ -733,26 +755,6 @@ See [this page](#generative-models) for more information on how to use generativ
 <sup>E</sup> Pre-computed embeddings can be inputted for this modality.  
 <sup>+</sup> Multiple items can be inputted per text prompt for this modality.
 
-````{important}
-To enable multiple multi-modal items per text prompt, you have to set `limit_mm_per_prompt` (offline inference)
-or `--limit-mm-per-prompt` (online inference). For example, to enable passing up to 4 images per text prompt:
-
-```python
-llm = LLM(
-    model="Qwen/Qwen2-VL-7B-Instruct",
-    limit_mm_per_prompt={"image": 4},
-)
-```
-
-```bash
-vllm serve Qwen/Qwen2-VL-7B-Instruct --limit-mm-per-prompt image=4
-```
-````
-
-```{note}
-vLLM currently only supports adding LoRA to the language backbone of multimodal models.
-```
-
 ```{note}
 To use `TIGER-Lab/Mantis-8B-siglip-llama3`, you have pass `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM.
 ```
@@ -762,6 +764,11 @@ The official `openbmb/MiniCPM-V-2` doesn't work yet, so we need to use a fork (`
 For more details, please see: <gh-pr:4087#issuecomment-2250397630>
 ```
 
+```{note}
+The chat template for Pixtral-HF is incorrect (see [discussion](https://huggingface.co/mistral-community/pixtral-12b/discussions/22)).
+A corrected version is available at <gh-file:examples/template_pixtral_hf.jinja>.
+```
+
 ### Pooling Models
 
 See [this page](pooling-models) for more information on how to use pooling models.
diff --git a/examples/template_pixtral_hf.jinja b/examples/template_pixtral_hf.jinja
new file mode 100644
index 0000000000000..e94661cb39071
--- /dev/null
+++ b/examples/template_pixtral_hf.jinja
@@ -0,0 +1,38 @@
+{%- if messages[0]["role"] == "system" %}
+    {%- set system_message = messages[0]["content"] %}
+    {%- set loop_messages = messages[1:] %}
+{%- else %}
+    {%- set loop_messages = messages %}
+{%- endif %}
+
+{{- bos_token }}
+{%- for message in loop_messages %}
+    {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}
+        {{- raise_exception('After the optional system message, conversation roles must alternate user/assistant/user/assistant/...') }}
+    {%- endif %}
+    {%- if message["role"] == "user" %}
+        {%- if loop.last and system_message is defined %}
+            {{- "[INST]" + system_message + "\n" }}
+        {%- else %}
+            {{- "[INST]" }}
+        {%- endif %}
+        {%- if message["content"] is not string %}
+            {%- for chunk in message["content"] %}
+                {%- if chunk["type"] == "text" %}
+                    {{- chunk["text"] }}
+                {%- elif chunk["type"] == "image" %}
+                    {{- "[IMG]" }}
+                {%- else %}
+                    {{- raise_exception("Unrecognized content type!") }}
+                {%- endif %}
+            {%- endfor %}
+        {%- else %}
+            {{- message["content"] }}
+        {%- endif %}
+        {{- "[/INST]" }}
+    {%- elif message["role"] == "assistant" %}
+        {{- message["content"] + eos_token}}
+    {%- else %}
+        {{- raise_exception("Only user and assistant roles are supported, with the exception of an initial optional system message!") }}
+    {%- endif %}
+{%- endfor %}
diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py
index d63b963522e73..8f242df4a60e3 100644
--- a/tests/entrypoints/test_chat_utils.py
+++ b/tests/entrypoints/test_chat_utils.py
@@ -758,6 +758,7 @@ def test_resolve_content_format_hf_defined(model, expected_format):
      ("template_falcon.jinja", "string"),
      ("template_inkbot.jinja", "string"),
      ("template_llava.jinja", "string"),
+     ("template_pixtral_hf.jinja", "openai"),
      ("template_vlm2vec.jinja", "openai"),
      ("tool_chat_template_granite_20b_fc.jinja", "string"),
      ("tool_chat_template_hermes.jinja", "string"),

From 36f5303578397d122693a19007be38ba2f02bcbc Mon Sep 17 00:00:00 2001
From: Charles Frye <cfrye59@gmail.com>
Date: Thu, 9 Jan 2025 15:26:37 -0800
Subject: [PATCH 153/462] [Docs] Add Modal to deployment frameworks (#11907)

---
 docs/source/deployment/frameworks/bentoml.md | 2 +-
 docs/source/deployment/frameworks/index.md   | 1 +
 docs/source/deployment/frameworks/modal.md   | 7 +++++++
 3 files changed, 9 insertions(+), 1 deletion(-)
 create mode 100644 docs/source/deployment/frameworks/modal.md

diff --git a/docs/source/deployment/frameworks/bentoml.md b/docs/source/deployment/frameworks/bentoml.md
index ea0b5d1d4c93b..2bf435bda8380 100644
--- a/docs/source/deployment/frameworks/bentoml.md
+++ b/docs/source/deployment/frameworks/bentoml.md
@@ -2,6 +2,6 @@
 
 # BentoML
 
-[BentoML](https://github.com/bentoml/BentoML) allows you to deploy a large language model (LLM) server with vLLM as the backend, which exposes OpenAI-compatible endpoints. You can serve the model locally or containerize it as an OCI-complicant image and deploy it on Kubernetes.
+[BentoML](https://github.com/bentoml/BentoML) allows you to deploy a large language model (LLM) server with vLLM as the backend, which exposes OpenAI-compatible endpoints. You can serve the model locally or containerize it as an OCI-compliant image and deploy it on Kubernetes.
 
 For details, see the tutorial [vLLM inference in the BentoML documentation](https://docs.bentoml.com/en/latest/use-cases/large-language-models/vllm.html).
diff --git a/docs/source/deployment/frameworks/index.md b/docs/source/deployment/frameworks/index.md
index 6a59131d36618..964782763f6b3 100644
--- a/docs/source/deployment/frameworks/index.md
+++ b/docs/source/deployment/frameworks/index.md
@@ -8,6 +8,7 @@ cerebrium
 dstack
 helm
 lws
+modal
 skypilot
 triton
 ```
diff --git a/docs/source/deployment/frameworks/modal.md b/docs/source/deployment/frameworks/modal.md
new file mode 100644
index 0000000000000..e7c42088e36a9
--- /dev/null
+++ b/docs/source/deployment/frameworks/modal.md
@@ -0,0 +1,7 @@
+(deployment-modal)=
+
+# Modal
+
+vLLM can be run on cloud GPUs with [Modal](https://modal.com), a serverless computing platform designed for fast auto-scaling.
+
+For details on how to deploy vLLM on Modal, see [this tutorial in the Modal documentation](https://modal.com/docs/examples/vllm_inference).

From c3cf54dda4df200bc8913ed69d210a7108dfa320 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Fri, 10 Jan 2025 11:10:12 +0800
Subject: [PATCH 154/462] [Doc][5/N] Move Community and API Reference to the
 bottom (#11896)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: Simon Mo <simon.mo@hey.com>
---
 README.md                                     |  2 +-
 .../source/design/automatic_prefix_caching.md |  2 +-
 docs/source/index.md                          | 62 ++++++++++++-------
 3 files changed, 40 insertions(+), 26 deletions(-)

diff --git a/README.md b/README.md
index 253a0bb913e37..67c557bfe13a9 100644
--- a/README.md
+++ b/README.md
@@ -41,7 +41,7 @@ vLLM is a fast and easy-to-use library for LLM inference and serving.
 vLLM is fast with:
 
 - State-of-the-art serving throughput
-- Efficient management of attention key and value memory with **PagedAttention**
+- Efficient management of attention key and value memory with [**PagedAttention**](https://blog.vllm.ai/2023/06/20/vllm.html)
 - Continuous batching of incoming requests
 - Fast model execution with CUDA/HIP graph
 - Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), INT4, INT8, and FP8.
diff --git a/docs/source/design/automatic_prefix_caching.md b/docs/source/design/automatic_prefix_caching.md
index 4398536b2b4ad..6d3dd056e6a60 100644
--- a/docs/source/design/automatic_prefix_caching.md
+++ b/docs/source/design/automatic_prefix_caching.md
@@ -2,7 +2,7 @@
 
 # Automatic Prefix Caching
 
-The core idea of [PagedAttention](#design-paged-attention) is to partition the KV cache of each request into KV Blocks. Each block contains the attention keys and values for a fixed number of tokens. The PagedAttention algorithm allows these blocks to be stored in non-contiguous physical memory so that we can eliminate memory fragmentation by allocating the memory on demand.
+The core idea of [PagedAttention](https://blog.vllm.ai/2023/06/20/vllm.html) is to partition the KV cache of each request into KV Blocks. Each block contains the attention keys and values for a fixed number of tokens. The PagedAttention algorithm allows these blocks to be stored in non-contiguous physical memory so that we can eliminate memory fragmentation by allocating the memory on demand.
 
 To automatically cache the KV cache, we utilize the following key observation: Each KV block can be uniquely identified by the tokens within the block and the tokens in the prefix before the block.
 
diff --git a/docs/source/index.md b/docs/source/index.md
index 23e4304fe29d9..356fa4b7fd573 100644
--- a/docs/source/index.md
+++ b/docs/source/index.md
@@ -26,7 +26,7 @@ vLLM is a fast and easy-to-use library for LLM inference and serving.
 vLLM is fast with:
 
 - State-of-the-art serving throughput
-- Efficient management of attention key and value memory with **PagedAttention**
+- Efficient management of attention key and value memory with [**PagedAttention**](https://blog.vllm.ai/2023/06/20/vllm.html)
 - Continuous batching of incoming requests
 - Fast model execution with CUDA/HIP graph
 - Quantization: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), INT4, INT8, and FP8
@@ -54,6 +54,8 @@ For more information, check out the following:
 
 ## Documentation
 
+% How to start using vLLM?
+
 ```{toctree}
 :caption: Getting Started
 :maxdepth: 1
@@ -65,6 +67,8 @@ getting_started/troubleshooting
 getting_started/faq
 ```
 
+% What does vLLM support?
+
 ```{toctree}
 :caption: Models
 :maxdepth: 1
@@ -75,6 +79,8 @@ models/supported_models
 models/extensions/index
 ```
 
+% Additional capabilities
+
 ```{toctree}
 :caption: Features
 :maxdepth: 1
@@ -89,6 +95,8 @@ features/spec_decode
 features/compatibility_matrix
 ```
 
+% Details about running vLLM
+
 ```{toctree}
 :caption: Inference and Serving
 :maxdepth: 1
@@ -104,6 +112,8 @@ serving/usage_stats
 serving/integrations/index
 ```
 
+% Scaling up vLLM for production
+
 ```{toctree}
 :caption: Deployment
 :maxdepth: 1
@@ -115,6 +125,8 @@ deployment/frameworks/index
 deployment/integrations/index
 ```
 
+% Making the most out of vLLM
+
 ```{toctree}
 :caption: Performance
 :maxdepth: 1
@@ -123,28 +135,7 @@ performance/optimization
 performance/benchmarks
 ```
 
-% Community: User community resources
-
-```{toctree}
-:caption: Community
-:maxdepth: 1
-
-community/meetups
-community/sponsors
-```
-
-```{toctree}
-:caption: API Reference
-:maxdepth: 2
-
-api/offline_inference/index
-api/engine/index
-api/inference_params
-api/multimodal/index
-api/model/index
-```
-
-% Design Documents: Details about vLLM internals
+% Explanation of vLLM internals
 
 ```{toctree}
 :caption: Design Documents
@@ -159,7 +150,7 @@ design/automatic_prefix_caching
 design/multiprocessing
 ```
 
-% Developer Guide: How to contribute to the vLLM project
+% How to contribute to the vLLM project
 
 ```{toctree}
 :caption: Developer Guide
@@ -172,6 +163,29 @@ contributing/model/index
 contributing/vulnerability_management
 ```
 
+% Technical API specifications
+
+```{toctree}
+:caption: API Reference
+:maxdepth: 2
+
+api/offline_inference/index
+api/engine/index
+api/inference_params
+api/multimodal/index
+api/model/index
+```
+
+% Latest news and acknowledgements
+
+```{toctree}
+:caption: Community
+:maxdepth: 1
+
+community/meetups
+community/sponsors
+```
+
 # Indices and tables
 
 - {ref}`genindex`

From b844b99ad309b05f37b1acb5360c82be7b16281d Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Fri, 10 Jan 2025 11:24:00 +0800
Subject: [PATCH 155/462] [VLM] Enable tokenized inputs for merged multi-modal
 processor (#11900)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 tests/multimodal/test_processing.py      |  31 ++++--
 vllm/inputs/data.py                      |   4 +-
 vllm/inputs/preprocess.py                |   4 -
 vllm/model_executor/models/blip2.py      |  22 +++-
 vllm/model_executor/models/chameleon.py  |  32 +++++-
 vllm/model_executor/models/fuyu.py       |  24 +++--
 vllm/model_executor/models/interfaces.py |   8 +-
 vllm/model_executor/models/llava.py      |   8 +-
 vllm/model_executor/models/phi3v.py      |   4 +-
 vllm/model_executor/models/ultravox.py   |  18 ++--
 vllm/multimodal/processing.py            | 127 ++++++++++++++++-------
 vllm/multimodal/profiling.py             |   2 +-
 12 files changed, 207 insertions(+), 77 deletions(-)

diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py
index d98bd9736b65f..d18909a4197b6 100644
--- a/tests/multimodal/test_processing.py
+++ b/tests/multimodal/test_processing.py
@@ -649,7 +649,7 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid):
         )
 
 
-def _test_processing_cache_correctness(
+def _test_processing_correctness(
     model_id: str,
     modalities: dict[str, bool],
     hit_rate: float,
@@ -691,6 +691,7 @@ def _test_processing_cache_correctness(
     baseline_processor = factories.build_processor(ctx, cache=None)
     cached_processor = factories.build_processor(ctx, cache=cache)
     dummy_inputs = baseline_processor.dummy_inputs
+    tokenizer = baseline_processor.info.get_tokenizer()
 
     rng = np.random.RandomState(0)
 
@@ -747,7 +748,25 @@ def _test_processing_cache_correctness(
         )
 
         assert baseline_result == cached_result, (
-            f"Failed ({batch_idx=}, {mm_data=})")
+            f"Failed ({batch_idx=}, {prompt=}, {mm_data=})")
+
+        baseline_tokenized_result = baseline_processor.apply(
+            tokenizer.encode(prompt),
+            mm_data=mm_data,
+            hf_processor_mm_kwargs={},
+        )
+
+        assert baseline_result == baseline_tokenized_result, (
+            f"Failed ({batch_idx=}, {prompt=}, {mm_data=})")
+
+        cached_tokenized_result = cached_processor.apply(
+            tokenizer.encode(prompt),
+            mm_data=mm_data,
+            hf_processor_mm_kwargs={},
+        )
+
+        assert cached_result == cached_tokenized_result, (
+            f"Failed ({batch_idx=}, {prompt=}, {mm_data=})")
 
 
 # yapf: disable
@@ -771,14 +790,14 @@ def _test_processing_cache_correctness(
 @pytest.mark.parametrize("num_batches", [32])
 @pytest.mark.parametrize("simplify_rate", [1.0])
 # yapf: enable
-def test_processing_cache_correctness(
+def test_processing_correctness(
     model_id: str,
     modalities: dict[str, bool],
     hit_rate: float,
     num_batches: int,
     simplify_rate: float,
 ):
-    _test_processing_cache_correctness(
+    _test_processing_correctness(
         model_id,
         modalities,
         hit_rate=hit_rate,
@@ -795,7 +814,7 @@ def test_processing_cache_correctness(
 @pytest.mark.parametrize("num_batches", [32])
 @pytest.mark.parametrize("simplify_rate", [1.0])
 # yapf: enable
-def test_processing_cache_correctness_phi3v(
+def test_processing_correctness_phi3v(
     model_id: str,
     modalities: dict[str, bool],
     hit_rate: float,
@@ -809,7 +828,7 @@ def test_processing_cache_correctness_phi3v(
 
     AutoImageProcessor.from_pretrained(model_id, trust_remote_code=True)
 
-    _test_processing_cache_correctness(
+    _test_processing_correctness(
         model_id,
         modalities,
         hit_rate=hit_rate,
diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py
index cdaf6dd76eaa1..b8163a7acde1d 100644
--- a/vllm/inputs/data.py
+++ b/vllm/inputs/data.py
@@ -44,13 +44,13 @@ class TokensPrompt(TypedDict):
 
     multi_modal_data: NotRequired["MultiModalDataDict"]
     """
-    DEPRECATED: Optional multi-modal data to pass to the model,
+    Optional multi-modal data to pass to the model,
     if the model supports it.
     """
 
     mm_processor_kwargs: NotRequired[Dict[str, Any]]
     """
-    DEPRECATED: Optional multi-modal processor kwargs to be forwarded to the
+    Optional multi-modal processor kwargs to be forwarded to the
     multimodal input mapper & processor. Note that if multiple modalities
     have registered mappers etc for the model being considered, we attempt
     to pass the mm_processor_kwargs to each of them.
diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py
index a738ffe18e3ae..0890883cc984f 100644
--- a/vllm/inputs/preprocess.py
+++ b/vllm/inputs/preprocess.py
@@ -279,10 +279,6 @@ async def _process_multimodal_async(
 
         mm_processor = self.mm_registry.create_processor(
             self.model_config, tokenizer)
-        if isinstance(prompt, list):
-            logger.warning("Passing `multi_modal_data` in TokensPrompt is"
-                           "deprecated and will be removed in a future update")
-            prompt = tokenizer.decode(prompt)
         if mm_processor_kwargs is None:
             mm_processor_kwargs = {}
 
diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py
index 7dfc0b687c6e3..917b88e802071 100644
--- a/vllm/model_executor/models/blip2.py
+++ b/vllm/model_executor/models/blip2.py
@@ -441,6 +441,24 @@ def get_dummy_processor_inputs(
 
 class Blip2MultiModalProcessor(BaseMultiModalProcessor[Blip2ProcessingInfo]):
 
+    def _call_hf_processor(
+        self,
+        prompt: str,
+        mm_data: Mapping[str, object],
+        mm_kwargs: Mapping[str, object],
+    ) -> BatchFeature:
+        if not mm_data:
+            # HF processor always adds placeholders even when there's no image
+            tokenizer = self.info.get_tokenizer()
+            prompt_ids = tokenizer.encode(prompt)
+            return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt")
+
+        return super()._call_hf_processor(
+            prompt=prompt,
+            mm_data=mm_data,
+            mm_kwargs=mm_kwargs,
+        )
+
     def _get_mm_fields_config(
         self,
         hf_inputs: BatchFeature,
@@ -469,11 +487,11 @@ def _get_prompt_replacements(
 
     def apply(
         self,
-        prompt_text: str,
+        prompt: Union[str, list[int]],
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
     ) -> MultiModalInputsV2:
-        result = super().apply(prompt_text, mm_data, hf_processor_mm_kwargs)
+        result = super().apply(prompt, mm_data, hf_processor_mm_kwargs)
 
         # Only <image> tokens should be considered as placeholders,
         # so we ignore the trailing bos_token
diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py
index 452fe727875fe..a6634204699c9 100644
--- a/vllm/model_executor/models/chameleon.py
+++ b/vllm/model_executor/models/chameleon.py
@@ -99,6 +99,34 @@ def get_dummy_processor_inputs(
 class ChameleonMultiModalProcessor(
         BaseMultiModalProcessor[ChameleonProcessingInfo]):
 
+    def _call_hf_processor(
+        self,
+        prompt: str,
+        mm_data: Mapping[str, object],
+        mm_kwargs: Mapping[str, object],
+    ) -> BatchFeature:
+        if not mm_data:
+            prompt_ids = self.info.get_tokenizer().encode(prompt)
+            prompt_ids = self._apply_hf_processor_tokens_only(prompt_ids)
+            return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt")
+
+        return super()._call_hf_processor(
+            prompt=prompt,
+            mm_data=mm_data,
+            mm_kwargs=mm_kwargs,
+        )
+
+    def _apply_hf_processor_tokens_only(
+        self,
+        prompt_tokens: list[int],
+    ) -> list[int]:
+        # HF processor adds sep token for chat mode
+        tokenizer = self.info.get_tokenizer()
+        sep_token_id: int = \
+            tokenizer.vocab[tokenizer.sep_token]  # type: ignore
+
+        return prompt_tokens + [sep_token_id]
+
     def _get_mm_fields_config(
         self,
         hf_inputs: BatchFeature,
@@ -128,11 +156,11 @@ def _get_prompt_replacements(
 
     def apply(
         self,
-        prompt_text: str,
+        prompt: Union[str, list[int]],
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
     ) -> MultiModalInputsV2:
-        result = super().apply(prompt_text, mm_data, hf_processor_mm_kwargs)
+        result = super().apply(prompt, mm_data, hf_processor_mm_kwargs)
 
         # Only <image> tokens should be considered as placeholders,
         # so we ignore the image_start_token and image_end_token
diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py
index 59af5f0b3ae98..63e7147f84e03 100644
--- a/vllm/model_executor/models/fuyu.py
+++ b/vllm/model_executor/models/fuyu.py
@@ -16,7 +16,7 @@
 """ PyTorch Fuyu model."""
 import math
 from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple,
-                    TypedDict)
+                    TypedDict, Union)
 
 import torch
 import torch.nn as nn
@@ -149,14 +149,10 @@ def _call_hf_processor(
         mm_data: Mapping[str, object],
         mm_kwargs: Mapping[str, object],
     ) -> BatchFeature:
-
         if not mm_data:
             # Avoid warning from HF logger for text-only input
-            # Input_ids format: bos_token_id + prompt_token_ids + boa_token_id
-            # Tokenizer won't add boa_token_id by default, we add it manually.
-            tokenizer = self.info.get_tokenizer()
-            boa_token_id: int = tokenizer.vocab["<0x04>"]  # type: ignore
-            prompt_ids = tokenizer.encode(prompt) + [boa_token_id]
+            prompt_ids = self.info.get_tokenizer().encode(prompt)
+            prompt_ids = self._apply_hf_processor_tokens_only(prompt_ids)
             return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt")
 
         processed_outputs = super()._call_hf_processor(
@@ -181,6 +177,16 @@ def _call_hf_processor(
 
         return processed_outputs
 
+    def _apply_hf_processor_tokens_only(
+        self,
+        prompt_tokens: list[int],
+    ) -> list[int]:
+        # HF processor adds boa_token_id
+        tokenizer = self.info.get_tokenizer()
+        boa_token_id: int = tokenizer.vocab["<0x04>"]  # type: ignore
+
+        return prompt_tokens + [boa_token_id]
+
     def _get_mm_fields_config(
         self,
         hf_inputs: BatchFeature,
@@ -223,11 +229,11 @@ def get_replacement_fuyu(item_idx: int):
 
     def apply(
         self,
-        prompt_text: str,
+        prompt: Union[str, list[int]],
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
     ) -> MultiModalInputsV2:
-        result = super().apply(prompt_text, mm_data, hf_processor_mm_kwargs)
+        result = super().apply(prompt, mm_data, hf_processor_mm_kwargs)
 
         # Only |SPEAKER| (image) tokens should be considered as placeholders,
         # so we ignore the trailing bos_token_id
diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py
index b51cba86ec1a4..c5fd0d9332379 100644
--- a/vllm/model_executor/models/interfaces.py
+++ b/vllm/model_executor/models/interfaces.py
@@ -39,13 +39,13 @@ def get_multimodal_embeddings(self, **kwargs) -> Optional[T]:
 
         The output embeddings must be one of the following formats:
     
-        - A list or tuple of 2D tensors, where each tensor corresponds to 
-            each input multimodal data item (e.g, image).
+        - A list or tuple of 2D tensors, where each tensor corresponds to
+          each input multimodal data item (e.g, image).
         - A single 3D tensor, with the batch dimension grouping the 2D tensors.
 
         Note:
-            The returned multimodal embeddings must be in the same order as 
-            the appearances of their corresponding multimodal data item in the 
+            The returned multimodal embeddings must be in the same order as
+            the appearances of their corresponding multimodal data item in the
             input prompt.
         """
         ...
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index 8d94acf3b21d5..bb3db60c7d8ed 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -724,7 +724,7 @@ class MantisMultiModalProcessor(LlavaMultiModalProcessor):
 
     def apply(
         self,
-        prompt_text: str,
+        prompt: Union[str, list[int]],
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
     ) -> MultiModalInputsV2:
@@ -737,7 +737,7 @@ def apply(
             image_height=-1,
         )
 
-        result = super().apply(prompt_text, mm_data, hf_processor_mm_kwargs)
+        result = super().apply(prompt, mm_data, hf_processor_mm_kwargs)
 
         mm_items = self._to_mm_items(mm_data)
         mm_item_counts = mm_items.get_all_counts()
@@ -760,7 +760,7 @@ def get_replacement_mantis(item_idx: int):
             )
         ])
 
-        prompt_ids, prompt_text, _ = self._apply_prompt_replacements(
+        prompt_ids, prompt, _ = self._apply_prompt_replacements(
             result["prompt_token_ids"],
             mantis_mm_repls,
             mm_item_counts,
@@ -788,7 +788,7 @@ def get_replacement_mantis(item_idx: int):
 
         return MultiModalInputsV2(
             type="multimodal",
-            prompt=prompt_text,
+            prompt=prompt,
             prompt_token_ids=prompt_ids,
             mm_kwargs=mm_kwargs,
             mm_placeholders=mm_placeholder_ranges,
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index a1b1af35604db..7a230e5beb367 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -481,11 +481,11 @@ def _apply_prompt_replacements(
 
     def apply(
         self,
-        prompt_text: str,
+        prompt: Union[str, list[int]],
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
     ) -> MultiModalInputsV2:
-        result = super().apply(prompt_text, mm_data, hf_processor_mm_kwargs)
+        result = super().apply(prompt, mm_data, hf_processor_mm_kwargs)
 
         # Only <|image|> tokens should be considered as placeholders,
         # so we ignore the trailing bos_token_id
diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py
index fada22d685dd6..3edfb5107683a 100644
--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -138,12 +138,8 @@ def _call_hf_processor(
     ) -> BatchFeature:
         # Text-only input not supported in composite processor
         if not mm_data:
-            tokenizer = self.info.get_tokenizer()
-
-            prompt_ids = tokenizer.encode(
-                prompt,
-                add_special_tokens=False,  # type: ignore
-            )
+            prompt_ids = self.info.get_tokenizer().encode(prompt)
+            prompt_ids = self._apply_hf_processor_tokens_only(prompt_ids)
             return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt")
 
         mm_data = dict(mm_data)
@@ -188,6 +184,16 @@ def _call_hf_processor(
         )
         return BatchFeature(combined_outputs)
 
+    def _apply_hf_processor_tokens_only(
+        self,
+        prompt_tokens: list[int],
+    ) -> list[int]:
+        # HF processor omits bos_token_id by setting add_special_tokens=False
+        tokenizer = self.info.get_tokenizer()
+        assert prompt_tokens[0] == tokenizer.bos_token_id
+
+        return prompt_tokens[1:]
+
     def _get_mm_fields_config(
         self,
         hf_inputs: BatchFeature,
diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index 07d883d5d7295..8b47dfb07387f 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -725,15 +725,15 @@ def _call_hf_processor(
             mm_kwargs,
         )
 
-    def _apply_hf_processor(
+    def _apply_hf_processor_text_mm(
         self,
         prompt_text: str,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
     ) -> tuple[list[int], MultiModalKwargs]:
         """
-        Wrapper of :meth:`_call_hf_processor` that applies
-        additional pre-processing and post-processing.
+        Apply the HF processor on the prompt text and multi-modal data
+        together.
         """
         processor_data, passthrough_data = self._get_hf_mm_data(mm_items)
 
@@ -753,40 +753,93 @@ def _apply_hf_processor(
 
         return prompt_ids, mm_kwargs
 
-    def _apply_hf_processor_missing(
-        self,
-        prompt_text: str,
-        mm_missing_data_items: MultiModalDataItems,
-        hf_processor_mm_kwargs: Mapping[str, object],
-    ):
+    def _apply_hf_processor_text_only(self, prompt_text: str) -> list[int]:
         """
-        Apply the HF processor on the full prompt text, but only on the
-        multi-modal data that are missing from the cache.
+        Apply the HF processor on the prompt text only.
 
-        Note:
-            We pass prompt text and multi-modal data into the HF processor
-            in separate calls to avoid HF prompt replacement being done for
-            cached items; instead, we rely on our own prompt replacement logic
-            (:meth:`_get_prompt_replacements`) for the full text.
+        Since HF processor requires that text and multi-modal items
+        correspond to each other, we create dummy multi-modal items
+        to go along with the text.
         """
-        mm_missing_counts = mm_missing_data_items.get_all_counts()
-
-        prompt_ids, _ = self._apply_hf_processor(
+        prompt_ids, _ = self._apply_hf_processor_text_mm(
             prompt_text=prompt_text,
             mm_items=MultiModalDataItems({}),
             hf_processor_mm_kwargs={},
         )
 
-        # Some HF processors (e.g. Qwen2-VL) expect corresponding
-        # multi-modal tokens to be in the prompt text
+        return prompt_ids
+
+    def _apply_hf_processor_tokens_only(
+        self,
+        prompt_tokens: list[int],
+    ) -> list[int]:
+        """
+        Apply the HF processor on the prompt tokens only.
+
+        Most HF processors accept prompt text but not prompt tokens.
+        If the HF processor adds or removes tokens that are not related to
+        multi-modal data, you should override this method so it is consistent
+        with the output of :meth:`_apply_hf_processor_text_only` on the
+        corresponding text.
+        """
+        return prompt_tokens
+
+    def _apply_hf_processor_mm_only(
+        self,
+        mm_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> MultiModalKwargs:
+        """
+        Apply the HF processor on the multi-modal data only.
+
+        Since HF processor requires that text and multi-modal items
+        correspond to each other, we generate dummy text using
+        :class:`DummyInputsBuilder` to go along with the multi-modal data.
+        """
+        mm_counts = mm_items.get_all_counts()
+
         dummy_inputs = self.dummy_inputs.get_dummy_processor_inputs(
             self.info.ctx.model_config.max_model_len,
-            mm_missing_counts,
+            mm_counts,
         )
 
-        _, mm_missing_kwargs = self._apply_hf_processor(
+        _, mm_kwargs = self._apply_hf_processor_text_mm(
             prompt_text=dummy_inputs.prompt_text,
-            mm_items=mm_missing_data_items,
+            mm_items=mm_items,
+            hf_processor_mm_kwargs=hf_processor_mm_kwargs,
+        )
+
+        return mm_kwargs
+
+    def _apply_hf_processor_main(
+        self,
+        prompt: Union[str, list[int]],
+        mm_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, object],
+        *,
+        enable_hf_prompt_replacement: bool,
+    ) -> tuple[list[int], MultiModalKwargs]:
+        """
+        Apply the HF processor on the prompt text and multi-modal data.
+
+        Note:
+            If :code:`enable_hf_prompt_replacement=False`, the prompt should
+            correspond to the multi-modal items.
+        """
+        if isinstance(prompt, str):
+            if enable_hf_prompt_replacement:
+                return self._apply_hf_processor_text_mm(
+                    prompt_text=prompt,
+                    mm_items=mm_items,
+                    hf_processor_mm_kwargs=hf_processor_mm_kwargs,
+                )
+
+            prompt_ids = self._apply_hf_processor_text_only(prompt)
+        else:
+            prompt_ids = self._apply_hf_processor_tokens_only(prompt)
+
+        mm_missing_kwargs = self._apply_hf_processor_mm_only(
+            mm_items=mm_items,
             hf_processor_mm_kwargs=hf_processor_mm_kwargs,
         )
 
@@ -794,7 +847,7 @@ def _apply_hf_processor_missing(
 
     def _cached_apply_hf_processor(
         self,
-        prompt_text: str,
+        prompt: Union[str, list[int]],
         mm_data_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
     ) -> tuple[list[int], MultiModalKwargs]:
@@ -807,10 +860,11 @@ def _cached_apply_hf_processor(
 
         _, passthrough_data = self._get_hf_mm_data(mm_data_items)
         if cache is None or passthrough_data:
-            return self._apply_hf_processor(
-                prompt_text=prompt_text,
+            return self._apply_hf_processor_main(
+                prompt=prompt,
                 mm_items=mm_data_items,
                 hf_processor_mm_kwargs=hf_processor_mm_kwargs,
+                enable_hf_prompt_replacement=True,
             )
 
         mm_maybe_cached_kw_items = {
@@ -832,10 +886,13 @@ def _cached_apply_hf_processor(
         }
         mm_missing_data_items = self._to_mm_items(mm_missing_data)
 
-        prompt_ids, mm_missing_kwargs = self._apply_hf_processor_missing(
-            prompt_text=prompt_text,
-            mm_missing_data_items=mm_missing_data_items,
+        # NOTE: `prompt` does not correspond to `mm_missing_data_items`,
+        # so we need to pass `enable_hf_prompt_replacement=False`
+        prompt_ids, mm_missing_kwargs = self._apply_hf_processor_main(
+            prompt=prompt,
+            mm_items=mm_missing_data_items,
             hf_processor_mm_kwargs=hf_processor_mm_kwargs,
+            enable_hf_prompt_replacement=False,
         )
 
         mm_missing_next_idx = {
@@ -1018,7 +1075,7 @@ def _validate_mm_placeholders(
 
     def apply(
         self,
-        prompt_text: str,
+        prompt: Union[str, list[int]],
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
     ) -> MultiModalInputsV2:
@@ -1056,7 +1113,7 @@ def apply(
             mm_hashes = None
 
         prompt_ids, mm_kwargs = self._cached_apply_hf_processor(
-            prompt_text,
+            prompt,
             mm_items,
             hf_processor_mm_kwargs,
         )
@@ -1101,12 +1158,12 @@ def apply(
         # there is no need for us to insert them
         if all(len(repls) == 0 for repls in mm_missing_repls.items()):
             tokenizer = self.info.get_tokenizer()
-            prompt_text = decode_tokens(tokenizer, prompt_ids)
+            prompt = decode_tokens(tokenizer, prompt_ids)
             mm_placeholders = hf_mm_placeholders
         else:
             (
                 prompt_ids,
-                prompt_text,
+                prompt,
                 missing_mm_placeholders,
             ) = self._apply_prompt_replacements(
                 prompt_ids,
@@ -1125,7 +1182,7 @@ def apply(
 
         return MultiModalInputsV2(
             type="multimodal",
-            prompt=prompt_text,
+            prompt=prompt,
             prompt_token_ids=prompt_ids,
             mm_kwargs=mm_kwargs,
             mm_hashes=mm_hashes,
diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py
index 6f7da1509990f..ec580cd6ecddd 100644
--- a/vllm/multimodal/profiling.py
+++ b/vllm/multimodal/profiling.py
@@ -137,7 +137,7 @@ def _get_dummy_mm_inputs(
             seq_len, mm_counts)
 
         return self.processor.apply(
-            prompt_text=processor_inputs.prompt_text,
+            prompt=processor_inputs.prompt_text,
             mm_data=processor_inputs.mm_data,
             hf_processor_mm_kwargs=processor_inputs.hf_processor_mm_kwargs,
         )

From 3de2b1eafb12e420c563cb7153d4d2f0e8451ca9 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Fri, 10 Jan 2025 11:25:20 +0800
Subject: [PATCH 156/462] [Doc] Show default pooling method in a table (#11904)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 docs/source/models/generative_models.md |  8 ++--
 docs/source/models/pooling_models.md    | 59 +++++++++++++++++--------
 2 files changed, 45 insertions(+), 22 deletions(-)

diff --git a/docs/source/models/generative_models.md b/docs/source/models/generative_models.md
index 6228c7c2ac957..a9f74c4d3fbb8 100644
--- a/docs/source/models/generative_models.md
+++ b/docs/source/models/generative_models.md
@@ -8,14 +8,14 @@ In vLLM, generative models implement the {class}`~vllm.model_executor.models.Vll
 Based on the final hidden states of the input, these models output log probabilities of the tokens to generate,
 which are then passed through {class}`~vllm.model_executor.layers.Sampler` to obtain the final text.
 
+For generative models, the only supported `--task` option is `"generate"`.
+Usually, this is automatically inferred so you don't have to specify it.
+
 ## Offline Inference
 
 The {class}`~vllm.LLM` class provides various methods for offline inference.
 See [Engine Arguments](#engine-args) for a list of options when initializing the model.
 
-For generative models, the only supported {code}`task` option is {code}`"generate"`.
-Usually, this is automatically inferred so you don't have to specify it.
-
 ### `LLM.generate`
 
 The {class}`~vllm.LLM.generate` method is available to all generative models in vLLM.
@@ -33,7 +33,7 @@ for output in outputs:
 ```
 
 You can optionally control the language generation by passing {class}`~vllm.SamplingParams`.
-For example, you can use greedy sampling by setting {code}`temperature=0`:
+For example, you can use greedy sampling by setting `temperature=0`:
 
 ```python
 llm = LLM(model="facebook/opt-125m")
diff --git a/docs/source/models/pooling_models.md b/docs/source/models/pooling_models.md
index 3e4407cfdc233..745f3fd81980d 100644
--- a/docs/source/models/pooling_models.md
+++ b/docs/source/models/pooling_models.md
@@ -14,30 +14,53 @@ As shown in the [Compatibility Matrix](#compatibility-matrix), most vLLM feature
 pooling models as they only work on the generation or decode stage, so performance may not improve as much.
 ```
 
-## Offline Inference
-
-The {class}`~vllm.LLM` class provides various methods for offline inference.
-See [Engine Arguments](#engine-args) for a list of options when initializing the model.
-
-For pooling models, we support the following {code}`task` options:
-
-- Embedding ({code}`"embed"` / {code}`"embedding"`)
-- Classification ({code}`"classify"`)
-- Sentence Pair Scoring ({code}`"score"`)
-- Reward Modeling ({code}`"reward"`)
+For pooling models, we support the following `--task` options.
+The selected option sets the default pooler used to extract the final hidden states:
+
+```{list-table}
+:widths: 50 25 25 25
+:header-rows: 1
+
+* - Task
+  - Pooling Type
+  - Normalization
+  - Softmax
+* - Embedding (`embed`)
+  - `LAST`
+  - ✅︎
+  - ✗
+* - Classification (`classify`)
+  - `LAST`
+  - ✗
+  - ✅︎
+* - Sentence Pair Scoring (`score`)
+  - \*
+  - \*
+  - \*
+* - Reward Modeling (`reward`)
+  - `ALL`
+  - ✗
+  - ✗
+```
 
-The selected task determines the default {class}`~vllm.model_executor.layers.Pooler` that is used:
+\*The default pooler is always defined by the model.
 
-- Embedding: Extract only the hidden states corresponding to the last token, and apply normalization.
-- Classification: Extract only the hidden states corresponding to the last token, and apply softmax.
-- Sentence Pair Scoring: Extract only the hidden states corresponding to the last token, and apply softmax.
-- Reward Modeling: Extract all of the hidden states and return them directly.
+```{note}
+If the model's implementation in vLLM defines its own pooler, the default pooler is set to that instead of the one specified in this table.
+```
 
 When loading [Sentence Transformers](https://huggingface.co/sentence-transformers) models,
-we attempt to override the default pooler based on its Sentence Transformers configuration file ({code}`modules.json`).
+we attempt to override the default pooler based on its Sentence Transformers configuration file (`modules.json`).
 
-You can customize the model's pooling method via the {code}`override_pooler_config` option,
+```{tip}
+You can customize the model's pooling method via the `--override-pooler-config` option,
 which takes priority over both the model's and Sentence Transformers's defaults.
+```
+
+## Offline Inference
+
+The {class}`~vllm.LLM` class provides various methods for offline inference.
+See [Engine Arguments](#engine-args) for a list of options when initializing the model.
 
 ### `LLM.encode`
 

From cf5f000d218fbcbc4bf404de8ed9d9607a128c3b Mon Sep 17 00:00:00 2001
From: Chen Zhang <zhangch99@outlook.com>
Date: Fri, 10 Jan 2025 13:14:42 +0800
Subject: [PATCH 157/462] [torch.compile] Hide KV cache behind torch.compile
 boundary (#11677)

Signed-off-by: Chen Zhang <zhangch99@outlook.com>
---
 tests/kernels/test_encoder_decoder_attn.py | 18 +++--
 tests/test_utils.py                        | 85 +++++++++++++++++++++-
 tests/v1/engine/test_engine_core.py        |  3 +
 tests/v1/engine/test_engine_core_client.py |  3 +
 vllm/attention/layer.py                    | 29 +++++---
 vllm/config.py                             |  1 -
 vllm/forward_context.py                    | 33 +++++----
 vllm/utils.py                              | 35 +++++++++
 vllm/v1/worker/gpu_model_runner.py         |  6 +-
 vllm/worker/cpu_enc_dec_model_runner.py    |  3 +-
 vllm/worker/cpu_model_runner.py            |  3 +-
 vllm/worker/cpu_pooling_model_runner.py    |  3 +-
 vllm/worker/cpu_worker.py                  |  4 +-
 vllm/worker/enc_dec_model_runner.py        |  3 +-
 vllm/worker/model_runner.py                |  5 +-
 vllm/worker/pooling_model_runner.py        |  3 +-
 vllm/worker/worker.py                      |  4 +-
 vllm/worker/worker_base.py                 |  1 +
 18 files changed, 198 insertions(+), 44 deletions(-)

diff --git a/tests/kernels/test_encoder_decoder_attn.py b/tests/kernels/test_encoder_decoder_attn.py
index 614674375786e..e008a56de6208 100644
--- a/tests/kernels/test_encoder_decoder_attn.py
+++ b/tests/kernels/test_encoder_decoder_attn.py
@@ -142,12 +142,18 @@ class that Attention will automatically select when it is constructed.
             torch.tensor([], dtype=torch.float32, device=CUDA_DEVICE))
 
     # Construct KV cache
-    kv_cache = make_kv_cache(test_pt.num_blocks,
-                             test_pt.num_heads,
-                             test_pt.head_size,
-                             test_pt.block_size,
-                             device=CUDA_DEVICE,
-                             backend=test_pt.backend_name)
+    if test_pt.attn_type in (AttentionType.DECODER,
+                             AttentionType.ENCODER_DECODER):
+        kv_cache = make_kv_cache(test_pt.num_blocks,
+                                 test_pt.num_heads,
+                                 test_pt.head_size,
+                                 test_pt.block_size,
+                                 device=CUDA_DEVICE,
+                                 backend=test_pt.backend_name)
+    else:
+        kv_cache = torch.tensor([])
+
+    attn.kv_cache = [kv_cache]
     return TestResources(scale, attn, kv_cache)
 
 
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 14d2fbd63b90d..6810e0302f897 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -7,9 +7,11 @@
 import torch
 from vllm_test_utils import monitor
 
+from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
 from vllm.utils import (FlexibleArgumentParser, PlaceholderModule,
-                        StoreBoolean, deprecate_kwargs, get_open_port,
-                        memory_profiling, merge_async_iterators, supports_kw)
+                        StoreBoolean, bind_kv_cache, deprecate_kwargs,
+                        get_open_port, memory_profiling, merge_async_iterators,
+                        supports_kw)
 
 from .utils import error_on_warning, fork_new_process_for_each_test
 
@@ -325,6 +327,85 @@ def measure_current_non_torch():
     lib.cudaFree(handle2)
 
 
+def test_bind_kv_cache():
+    from vllm.attention import Attention
+
+    ctx = {
+        'layers.0.self_attn': Attention(32, 128, 0.1),
+        'layers.1.self_attn': Attention(32, 128, 0.1),
+        'layers.2.self_attn': Attention(32, 128, 0.1),
+        'layers.3.self_attn': Attention(32, 128, 0.1),
+    }
+    kv_cache = [
+        torch.zeros((1, )),
+        torch.zeros((1, )),
+        torch.zeros((1, )),
+        torch.zeros((1, )),
+    ]
+    bind_kv_cache(ctx, [kv_cache])
+    assert ctx['layers.0.self_attn'].kv_cache[0] is kv_cache[0]
+    assert ctx['layers.1.self_attn'].kv_cache[0] is kv_cache[1]
+    assert ctx['layers.2.self_attn'].kv_cache[0] is kv_cache[2]
+    assert ctx['layers.3.self_attn'].kv_cache[0] is kv_cache[3]
+
+def test_bind_kv_cache_non_attention():
+    from vllm.attention import Attention
+
+    # example from Jamba PP=2
+    ctx = {
+        'model.layers.20.attn': Attention(32, 128, 0.1),
+        'model.layers.28.attn': Attention(32, 128, 0.1),
+    }
+    kv_cache = [
+        torch.zeros((1, )),
+        torch.zeros((1, )),
+    ]
+    bind_kv_cache(ctx, [kv_cache])
+    assert ctx['model.layers.20.attn'].kv_cache[0] is kv_cache[0]
+    assert ctx['model.layers.28.attn'].kv_cache[0] is kv_cache[1]
+
+
+def test_bind_kv_cache_encoder_decoder():
+    from vllm.attention import Attention, AttentionType
+
+    # example from bart
+    ctx = {
+        'encoder.layers.0.self_attn.attn':
+            Attention(32, 128, 0.1, attn_type=AttentionType.ENCODER),
+        'decoder.layers.0.encoder_attn.attn':
+            Attention(32, 128, 0.1, attn_type=AttentionType.ENCODER_DECODER),
+        'decoder.layers.0.self_attn.attn':
+            Attention(32, 128, 0.1, attn_type=AttentionType.DECODER),
+    }
+
+    kv_cache = [
+        torch.zeros((1, )),
+    ]
+    encoder_kv_cache = ctx['encoder.layers.0.self_attn.attn'].kv_cache
+
+    bind_kv_cache(ctx, [kv_cache])
+    assert ctx['encoder.layers.0.self_attn.attn'].kv_cache is encoder_kv_cache
+    assert ctx['decoder.layers.0.encoder_attn.attn'].kv_cache[0] is kv_cache[0]
+    assert ctx['decoder.layers.0.self_attn.attn'].kv_cache[0] is kv_cache[0]
+
+
+def test_bind_kv_cache_pp():
+    cfg = VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=2))
+    with set_current_vllm_config(cfg):
+        from vllm.attention import Attention
+
+        ctx = {
+            'layers.0.self_attn': Attention(32, 128, 0.1),
+        }
+        kv_cache = [
+            [torch.zeros((1, ))],
+            [torch.zeros((1, ))]
+        ]
+        bind_kv_cache(ctx, kv_cache)
+        assert ctx['layers.0.self_attn'].kv_cache[0] is kv_cache[0][0]
+        assert ctx['layers.0.self_attn'].kv_cache[1] is kv_cache[1][0]
+
+
 def test_placeholder_module_error_handling():
     placeholder = PlaceholderModule("placeholder_1234")
 
diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py
index 8dd9b23fbdd5f..5b1732036e807 100644
--- a/tests/v1/engine/test_engine_core.py
+++ b/tests/v1/engine/test_engine_core.py
@@ -4,6 +4,7 @@
 import pytest
 from transformers import AutoTokenizer
 
+from tests.utils import fork_new_process_for_each_test
 from vllm import SamplingParams
 from vllm.engine.arg_utils import EngineArgs
 from vllm.platforms import current_platform
@@ -36,6 +37,7 @@ def make_request() -> EngineCoreRequest:
     )
 
 
+@fork_new_process_for_each_test
 def test_engine_core(monkeypatch):
 
     with monkeypatch.context() as m:
@@ -138,6 +140,7 @@ def test_engine_core(monkeypatch):
         assert len(engine_core.scheduler.running) == 0
 
 
+@fork_new_process_for_each_test
 def test_engine_core_advanced_sampling(monkeypatch):
     """
     A basic end-to-end test to verify that the engine functions correctly 
diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py
index 5a21806e57a11..7eac16f2cf542 100644
--- a/tests/v1/engine/test_engine_core_client.py
+++ b/tests/v1/engine/test_engine_core_client.py
@@ -6,6 +6,7 @@
 import pytest
 from transformers import AutoTokenizer
 
+from tests.utils import fork_new_process_for_each_test
 from vllm import SamplingParams
 from vllm.engine.arg_utils import EngineArgs
 from vllm.platforms import current_platform
@@ -75,6 +76,7 @@ async def loop_until_done_async(client: EngineCoreClient, outputs: Dict):
             break
 
 
+@fork_new_process_for_each_test
 @pytest.mark.parametrize("multiprocessing_mode", [True, False])
 def test_engine_core_client(monkeypatch, multiprocessing_mode: bool):
 
@@ -143,6 +145,7 @@ def test_engine_core_client(monkeypatch, multiprocessing_mode: bool):
         client.abort_requests([request.request_id])
 
 
+@fork_new_process_for_each_test
 @pytest.mark.asyncio
 async def test_engine_core_client_asyncio(monkeypatch):
 
diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index f1b3598e60b54..55e4e14027f79 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -121,6 +121,13 @@ def __init__(
         compilation_config.static_forward_context[prefix] = self
         self.layer_name = prefix
         self.attn_type = attn_type
+        # use a placeholder kv cache tensor during init, which will be replaced
+        # by bind_kv_cache
+        # this variable will not be accessed if use_direct_call is True
+        self.kv_cache = [
+            torch.tensor([]) for _ in range(get_current_vllm_config(
+            ).parallel_config.pipeline_parallel_size)
+        ]
 
     def forward(
         self,
@@ -148,11 +155,11 @@ def forward(
             if value is not None:
                 value = value.view(-1, self.num_kv_heads, self.head_size)
             torch.ops.vllm.unified_attention_with_output(
-                query, key, value, output, kv_cache, self.layer_name)
+                query, key, value, output, self.layer_name)
             return output.view(-1, hidden_size)
         else:
             return torch.ops.vllm.unified_attention(query, key, value,
-                                                    kv_cache, self.layer_name)
+                                                    self.layer_name)
 
     def extra_repr(self) -> str:
         s = f"head_size={self.impl.head_size}"  # type: ignore
@@ -230,12 +237,12 @@ def unified_attention(
     query: torch.Tensor,
     key: torch.Tensor,
     value: torch.Tensor,
-    kv_cache: torch.Tensor,
     layer_name: str,
 ) -> torch.Tensor:
     forward_context: ForwardContext = get_forward_context()
-    attn_metadata = forward_context.dynamic_forward_context
-    self = forward_context.static_forward_context[layer_name]
+    attn_metadata = forward_context.attn_metadata
+    self = forward_context.attn_layers[layer_name]
+    kv_cache = self.kv_cache[forward_context.virtual_engine]
     return self.impl.forward(query, key, value, kv_cache, attn_metadata,
                              self._k_scale, self._v_scale)
 
@@ -244,7 +251,6 @@ def unified_attention_fake(
     query: torch.Tensor,
     key: torch.Tensor,
     value: torch.Tensor,
-    kv_cache: torch.Tensor,
     layer_name: str,
 ) -> torch.Tensor:
     return torch.empty_like(query).contiguous()
@@ -253,7 +259,7 @@ def unified_attention_fake(
 direct_register_custom_op(
     op_name="unified_attention",
     op_func=unified_attention,
-    mutates_args=["kv_cache"],
+    mutates_args=[],
     fake_impl=unified_attention_fake,
     dispatch_key=current_platform.dispatch_key,
 )
@@ -264,12 +270,12 @@ def unified_attention_with_output(
     key: torch.Tensor,
     value: torch.Tensor,
     output: torch.Tensor,
-    kv_cache: torch.Tensor,
     layer_name: str,
 ) -> None:
     forward_context: ForwardContext = get_forward_context()
-    attn_metadata = forward_context.dynamic_forward_context
-    self = forward_context.static_forward_context[layer_name]
+    attn_metadata = forward_context.attn_metadata
+    self = forward_context.attn_layers[layer_name]
+    kv_cache = self.kv_cache[forward_context.virtual_engine]
     self.impl.forward(query,
                       key,
                       value,
@@ -285,7 +291,6 @@ def unified_attention_with_output_fake(
     key: torch.Tensor,
     value: torch.Tensor,
     output: torch.Tensor,
-    kv_cache: torch.Tensor,
     layer_name: str,
 ) -> None:
     return
@@ -294,7 +299,7 @@ def unified_attention_with_output_fake(
 direct_register_custom_op(
     op_name="unified_attention_with_output",
     op_func=unified_attention_with_output,
-    mutates_args=["kv_cache", "output"],
+    mutates_args=["output"],
     fake_impl=unified_attention_with_output_fake,
     dispatch_key=current_platform.dispatch_key,
 )
diff --git a/vllm/config.py b/vllm/config.py
index 19609085cc960..13b5390008a35 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -2780,7 +2780,6 @@ def model_post_init(self, __context: Any) -> None:
     compilation_time: float = PrivateAttr
 
     # Per-model forward context
-    # Mainly used to store attention cls
     # Map from layer name to the attention cls
     static_forward_context: Dict[str, Any] = PrivateAttr
 
diff --git a/vllm/forward_context.py b/vllm/forward_context.py
index 7f56575279e9b..828b394ec5d21 100644
--- a/vllm/forward_context.py
+++ b/vllm/forward_context.py
@@ -2,7 +2,7 @@
 from collections import defaultdict
 from contextlib import contextmanager
 from dataclasses import dataclass
-from typing import Any, Dict, Optional
+from typing import TYPE_CHECKING, Any, Dict, Optional
 
 import torch
 
@@ -10,6 +10,9 @@
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
 
+if TYPE_CHECKING:
+    from vllm.attention.backends.abstract import AttentionMetadata
+
 logger = init_logger(__name__)
 
 track_batchsize: bool = envs.VLLM_LOG_BATCHSIZE_INTERVAL >= 0
@@ -21,9 +24,12 @@
 
 @dataclass
 class ForwardContext:
-    static_forward_context: Dict[str, Any]
+    # copy from vllm_config.compilation_config.static_forward_context
+    attn_layers: Dict[str, Any]
     # TODO: extend to support per-layer dynamic forward context
-    dynamic_forward_context: Any
+    attn_metadata: "AttentionMetadata"  # set dynamically for each forward pass
+    # TODO: remove after making all virtual_engines share the same kv cache
+    virtual_engine: int  # set dynamically for each forward pass
 
 
 _forward_context: Optional[ForwardContext] = None
@@ -38,34 +44,35 @@ def get_forward_context() -> ForwardContext:
 
 
 @contextmanager
-def set_forward_context(context: Any, vllm_config: VllmConfig):
+def set_forward_context(attn_metadata: Any,
+                        vllm_config: VllmConfig,
+                        virtual_engine: int = 0):
     """A context manager that stores the current forward context,
     can be attention metadata, etc.
     Here we can inject common logic for every model forward pass.
     """
     global forward_start_time
-    need_to_track_batchsize = track_batchsize and context is not None
+    need_to_track_batchsize = track_batchsize and attn_metadata is not None
     if need_to_track_batchsize:
         forward_start_time = time.perf_counter()
     global _forward_context
     prev_context = _forward_context
     _forward_context = ForwardContext(
-        static_forward_context=vllm_config.compilation_config.
-        static_forward_context,
-        dynamic_forward_context=context)
+        attn_layers=vllm_config.compilation_config.static_forward_context,
+        virtual_engine=virtual_engine,
+        attn_metadata=attn_metadata)
     try:
         yield
     finally:
-        global batchsize_counter
         global last_logging_time, batchsize_logging_interval
         if need_to_track_batchsize:
-            if hasattr(context, "num_prefill_tokens"):
+            if hasattr(attn_metadata, "num_prefill_tokens"):
                 # for v0 attention backends
-                batchsize = context.num_prefill_tokens + \
-                    context.num_decode_tokens
+                batchsize = attn_metadata.num_prefill_tokens + \
+                    attn_metadata.num_decode_tokens
             else:
                 # for v1 attention backends
-                batchsize = context.num_input_tokens
+                batchsize = attn_metadata.num_input_tokens
             # we use synchronous scheduling right now,
             # adding a sync point here should not affect
             # scheduling of the next batch
diff --git a/vllm/utils.py b/vllm/utils.py
index 487088591ebc2..8c3e5200b3d98 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -2138,3 +2138,38 @@ def get_mp_context():
     _check_multiproc_method()
     mp_method = envs.VLLM_WORKER_MULTIPROC_METHOD
     return multiprocessing.get_context(mp_method)
+
+
+def bind_kv_cache(
+        ctx: Dict[str, Any],
+        kv_cache: List[List[torch.Tensor]],  # [virtual_engine][layer_index]
+) -> None:
+    # Bind the kv_cache tensor to Attention modules, similar to
+    # ctx[layer_name].kv_cache[ve]=kv_cache[ve][extract_layer_index(layer_name)]
+    # Special things handled here:
+    # 1. Some models have non-attention layers, e.g., Jamba
+    # 2. Pipeline parallelism, each rank only has a subset of layers
+    # 3. Encoder attention has no kv cache
+    # 4. Encoder-decoder models, encoder-decoder attention and decoder-only
+    #    attention of the same layer (e.g., bart's decoder.layers.1.self_attn
+    #    and decoder.layers.1.encoder_attn) is mapped to the same kv cache
+    #    tensor
+    from vllm.attention import AttentionType
+    from vllm.model_executor.models.utils import extract_layer_index
+    layer_need_kv_cache = [
+        layer_name for layer_name in ctx
+        if ctx[layer_name].attn_type in (AttentionType.DECODER,
+                                         AttentionType.ENCODER_DECODER)
+    ]
+    layer_index_sorted = sorted(
+        set(
+            extract_layer_index(layer_name)
+            for layer_name in layer_need_kv_cache))
+    for layer_name in layer_need_kv_cache:
+        kv_cache_idx = layer_index_sorted.index(
+            extract_layer_index(layer_name))
+        forward_ctx = ctx[layer_name]
+        assert len(forward_ctx.kv_cache) == len(kv_cache)
+        for ve, ve_kv_cache in enumerate(kv_cache):
+            assert forward_ctx.kv_cache[ve].numel() == 0
+            forward_ctx.kv_cache[ve] = ve_kv_cache[kv_cache_idx]
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index a1d4f9b135789..fb87dc5a8222a 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -16,7 +16,8 @@
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
 from vllm.sampling_params import SamplingType
 from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler,
-                        LayerBlockType, cdiv, is_pin_memory_available)
+                        LayerBlockType, bind_kv_cache, cdiv,
+                        is_pin_memory_available)
 from vllm.v1.attention.backends.flash_attn import (FlashAttentionBackend,
                                                    FlashAttentionMetadata)
 from vllm.v1.engine.mm_input_mapper import MMInputMapperClient
@@ -860,3 +861,6 @@ def initialize_kv_cache(self, num_blocks: int) -> None:
                 torch.zeros(kv_cache_shape,
                             dtype=self.kv_cache_dtype,
                             device=self.device))
+        bind_kv_cache(
+            self.vllm_config.compilation_config.static_forward_context,
+            [self.kv_caches])
diff --git a/vllm/worker/cpu_enc_dec_model_runner.py b/vllm/worker/cpu_enc_dec_model_runner.py
index cc24cfe04d2ba..fa6775cbd6c66 100644
--- a/vllm/worker/cpu_enc_dec_model_runner.py
+++ b/vllm/worker/cpu_enc_dec_model_runner.py
@@ -305,7 +305,8 @@ def execute_model(
             intermediate_tensors,
         }
 
-        with set_forward_context(model_input.attn_metadata, self.vllm_config):
+        with set_forward_context(model_input.attn_metadata, self.vllm_config,
+                                 model_input.virtual_engine):
             hidden_states = model_executable(**execute_model_kwargs)
 
         # Compute the logits.
diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py
index f1531e0fc0675..d99db4e0c6c40 100644
--- a/vllm/worker/cpu_model_runner.py
+++ b/vllm/worker/cpu_model_runner.py
@@ -526,7 +526,8 @@ def execute_model(
             execute_model_kwargs.update(
                 {"previous_hidden_states": previous_hidden_states})
 
-        with set_forward_context(model_input.attn_metadata, self.vllm_config):
+        with set_forward_context(model_input.attn_metadata, self.vllm_config,
+                                 model_input.virtual_engine):
             hidden_states = model_executable(
                 input_ids=model_input.input_tokens,
                 positions=model_input.input_positions,
diff --git a/vllm/worker/cpu_pooling_model_runner.py b/vllm/worker/cpu_pooling_model_runner.py
index 17b2fd2564a04..d31ba89e12375 100644
--- a/vllm/worker/cpu_pooling_model_runner.py
+++ b/vllm/worker/cpu_pooling_model_runner.py
@@ -69,7 +69,8 @@ def execute_model(
             intermediate_tensors,
         }
 
-        with set_forward_context(model_input.attn_metadata, self.vllm_config):
+        with set_forward_context(model_input.attn_metadata, self.vllm_config,
+                                 model_input.virtual_engine):
             hidden_states = model_executable(**execute_model_kwargs)
 
         # Only perform pooling in the driver worker.
diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py
index b5dfebfce6f75..494c6506f3c0f 100644
--- a/vllm/worker/cpu_worker.py
+++ b/vllm/worker/cpu_worker.py
@@ -13,7 +13,7 @@
 from vllm.logger import init_logger
 from vllm.model_executor import set_random_seed
 from vllm.sequence import ExecuteModelRequest
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
+from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, bind_kv_cache
 from vllm.worker.cpu_enc_dec_model_runner import CPUEncoderDecoderModelRunner
 from vllm.worker.cpu_model_runner import CPUModelRunner, CPUModelRunnerBase
 from vllm.worker.cpu_pooling_model_runner import CPUPoolingModelRunner
@@ -293,6 +293,8 @@ def _init_cache_engine(self) -> None:
             self.cache_engine[ve].cpu_cache
             for ve in range(self.parallel_config.pipeline_parallel_size)
         ]
+        bind_kv_cache(self.compilation_config.static_forward_context,
+                      self.cpu_cache)
         self.model_runner.block_size = self.cache_engine[0].block_size
 
         assert all(
diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py
index 4d5d918087be8..8a161b740042d 100644
--- a/vllm/worker/enc_dec_model_runner.py
+++ b/vllm/worker/enc_dec_model_runner.py
@@ -175,7 +175,8 @@ def execute_model(
         } if self.has_inner_state else {}
 
         multi_modal_kwargs = model_input.multi_modal_kwargs or {}
-        with set_forward_context(model_input.attn_metadata, self.vllm_config):
+        with set_forward_context(model_input.attn_metadata, self.vllm_config,
+                                 model_input.virtual_engine):
             hidden_or_intermediate_states = model_executable(
                 input_ids=model_input.input_tokens,
                 positions=model_input.input_positions,
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 1c6d1bbee78ee..2b918483d3675 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -1527,7 +1527,8 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None:
                         self._update_inputs_to_capture_for_enc_dec_model(
                             capture_inputs)
 
-                    with set_forward_context(attn_metadata, self.vllm_config):
+                    with set_forward_context(attn_metadata, self.vllm_config,
+                                             virtual_engine):
                         graph_runner.capture(**capture_inputs)
                     self.graph_memory_pool = graph_runner.graph.pool()
                     self.graph_runners[virtual_engine][batch_size] = (
@@ -1695,7 +1696,7 @@ def execute_model(
 
         if not bypass_model_exec:
             with set_forward_context(model_input.attn_metadata,
-                                     self.vllm_config):
+                                     self.vllm_config, virtual_engine):
                 hidden_or_intermediate_states = model_executable(
                     input_ids=model_input.input_tokens,
                     positions=model_input.input_positions,
diff --git a/vllm/worker/pooling_model_runner.py b/vllm/worker/pooling_model_runner.py
index f79b3773bcbd2..6de227f3cb2b9 100644
--- a/vllm/worker/pooling_model_runner.py
+++ b/vllm/worker/pooling_model_runner.py
@@ -105,7 +105,8 @@ def execute_model(
         if model_input.token_types is not None:
             cross_enc_kwargs["token_type_ids"] = model_input.token_types
 
-        with set_forward_context(model_input.attn_metadata, self.vllm_config):
+        with set_forward_context(model_input.attn_metadata, self.vllm_config,
+                                 virtual_engine):
             hidden_or_intermediate_states = model_executable(
                 input_ids=model_input.input_tokens,
                 positions=model_input.input_positions,
diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py
index f51b51d433d3d..0f12549e3f3fd 100644
--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -21,7 +21,7 @@
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sequence import (ExecuteModelRequest, IntermediateTensors,
                            SequenceGroupMetadata, SequenceGroupMetadataDelta)
-from vllm.utils import GiB_bytes, memory_profiling
+from vllm.utils import GiB_bytes, bind_kv_cache, memory_profiling
 from vllm.worker.cache_engine import CacheEngine
 from vllm.worker.enc_dec_model_runner import EncoderDecoderModelRunner
 from vllm.worker.model_runner import GPUModelRunnerBase, ModelRunner
@@ -285,6 +285,8 @@ def _init_cache_engine(self):
             self.cache_engine[ve].gpu_cache
             for ve in range(self.parallel_config.pipeline_parallel_size)
         ]
+        bind_kv_cache(self.compilation_config.static_forward_context,
+                      self.gpu_cache)
 
     def _warm_up_model(self) -> None:
         if not self.model_config.enforce_eager:
diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py
index 249b3ed2dfd37..a835718e1db19 100644
--- a/vllm/worker/worker_base.py
+++ b/vllm/worker/worker_base.py
@@ -43,6 +43,7 @@ def __init__(
         self.prompt_adapter_config = vllm_config.prompt_adapter_config
         self.observability_config = vllm_config.observability_config
         self.kv_transfer_config = vllm_config.kv_transfer_config
+        self.compilation_config = vllm_config.compilation_config
         from vllm.platforms import current_platform
         self.current_platform = current_platform
 

From ac2f3f7fee93cf9cd97c0078e362feab7b6c8299 Mon Sep 17 00:00:00 2001
From: Joe Runde <Joseph.Runde@ibm.com>
Date: Fri, 10 Jan 2025 00:56:36 -0700
Subject: [PATCH 158/462] [Bugfix] Validate lora adapters to avoid crashing
 server (#11727)

Signed-off-by: Joe Runde <Joseph.Runde@ibm.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
---
 .../entrypoints/openai/test_lora_adapters.py  | 269 ++++++++++++++++++
 tests/entrypoints/openai/test_lora_lineage.py | 109 -------
 tests/entrypoints/openai/test_serving_chat.py |   8 +-
 .../entrypoints/openai/test_serving_models.py |  10 +-
 tests/entrypoints/openai/test_shutdown.py     |  27 +-
 vllm/engine/async_llm_engine.py               |   4 +
 vllm/engine/multiprocessing/__init__.py       |  20 +-
 vllm/engine/multiprocessing/client.py         |  42 ++-
 vllm/engine/multiprocessing/engine.py         |  27 +-
 vllm/engine/protocol.py                       |   5 +
 vllm/entrypoints/openai/api_server.py         |   7 +-
 vllm/entrypoints/openai/run_batch.py          |   1 +
 vllm/entrypoints/openai/serving_models.py     |  78 +++--
 vllm/lora/worker_manager.py                   |  19 +-
 vllm/v1/engine/async_llm.py                   |   4 +
 15 files changed, 459 insertions(+), 171 deletions(-)
 create mode 100644 tests/entrypoints/openai/test_lora_adapters.py
 delete mode 100644 tests/entrypoints/openai/test_lora_lineage.py

diff --git a/tests/entrypoints/openai/test_lora_adapters.py b/tests/entrypoints/openai/test_lora_adapters.py
new file mode 100644
index 0000000000000..46a064f6d9e68
--- /dev/null
+++ b/tests/entrypoints/openai/test_lora_adapters.py
@@ -0,0 +1,269 @@
+import asyncio
+import json
+import shutil
+from contextlib import suppress
+
+import openai  # use the official client for correctness check
+import pytest
+import pytest_asyncio
+# downloading lora to test lora requests
+from huggingface_hub import snapshot_download
+
+from ...utils import RemoteOpenAIServer
+
+# any model with a chat template should work here
+MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
+# technically this needs Mistral-7B-v0.1 as base, but we're not testing
+# generation quality here
+LORA_NAME = "typeof/zephyr-7b-beta-lora"
+
+
+@pytest.fixture(scope="module")
+def zephyr_lora_files():
+    return snapshot_download(repo_id=LORA_NAME)
+
+
+@pytest.fixture(scope="module")
+def server_with_lora_modules_json(zephyr_lora_files):
+    # Define the json format LoRA module configurations
+    lora_module_1 = {
+        "name": "zephyr-lora",
+        "path": zephyr_lora_files,
+        "base_model_name": MODEL_NAME
+    }
+
+    lora_module_2 = {
+        "name": "zephyr-lora2",
+        "path": zephyr_lora_files,
+        "base_model_name": MODEL_NAME
+    }
+
+    args = [
+        # use half precision for speed and memory savings in CI environment
+        "--dtype",
+        "bfloat16",
+        "--max-model-len",
+        "8192",
+        "--enforce-eager",
+        # lora config below
+        "--enable-lora",
+        "--lora-modules",
+        json.dumps(lora_module_1),
+        json.dumps(lora_module_2),
+        "--max-lora-rank",
+        "64",
+        "--max-cpu-loras",
+        "2",
+        "--max-num-seqs",
+        "64",
+    ]
+
+    # Enable the /v1/load_lora_adapter endpoint
+    envs = {"VLLM_ALLOW_RUNTIME_LORA_UPDATING": "True"}
+
+    with RemoteOpenAIServer(MODEL_NAME, args, env_dict=envs) as remote_server:
+        yield remote_server
+
+
+@pytest_asyncio.fixture
+async def client(server_with_lora_modules_json):
+    async with server_with_lora_modules_json.get_async_client(
+    ) as async_client:
+        yield async_client
+
+
+@pytest.mark.asyncio
+async def test_static_lora_lineage(client: openai.AsyncOpenAI,
+                                   zephyr_lora_files):
+    models = await client.models.list()
+    models = models.data
+    served_model = models[0]
+    lora_models = models[1:]
+    assert served_model.id == MODEL_NAME
+    assert served_model.root == MODEL_NAME
+    assert served_model.parent is None
+    assert all(lora_model.root == zephyr_lora_files
+               for lora_model in lora_models)
+    assert all(lora_model.parent == MODEL_NAME for lora_model in lora_models)
+    assert lora_models[0].id == "zephyr-lora"
+    assert lora_models[1].id == "zephyr-lora2"
+
+
+@pytest.mark.asyncio
+async def test_dynamic_lora_lineage(client: openai.AsyncOpenAI,
+                                    zephyr_lora_files):
+
+    response = await client.post("load_lora_adapter",
+                                 cast_to=str,
+                                 body={
+                                     "lora_name": "zephyr-lora-3",
+                                     "lora_path": zephyr_lora_files
+                                 })
+    # Ensure adapter loads before querying /models
+    assert "success" in response
+
+    models = await client.models.list()
+    models = models.data
+    dynamic_lora_model = models[-1]
+    assert dynamic_lora_model.root == zephyr_lora_files
+    assert dynamic_lora_model.parent == MODEL_NAME
+    assert dynamic_lora_model.id == "zephyr-lora-3"
+
+
+@pytest.mark.asyncio
+async def test_dynamic_lora_not_found(client: openai.AsyncOpenAI):
+    with pytest.raises(openai.NotFoundError):
+        await client.post("load_lora_adapter",
+                          cast_to=str,
+                          body={
+                              "lora_name": "notfound",
+                              "lora_path": "/not/an/adapter"
+                          })
+
+
+@pytest.mark.asyncio
+async def test_dynamic_lora_invalid_files(client: openai.AsyncOpenAI,
+                                          tmp_path):
+    invalid_files = tmp_path / "invalid_files"
+    invalid_files.mkdir()
+    (invalid_files / "adapter_config.json").write_text("this is not json")
+
+    with pytest.raises(openai.BadRequestError):
+        await client.post("load_lora_adapter",
+                          cast_to=str,
+                          body={
+                              "lora_name": "invalid-json",
+                              "lora_path": str(invalid_files)
+                          })
+
+
+@pytest.mark.asyncio
+async def test_dynamic_lora_invalid_lora_rank(client: openai.AsyncOpenAI,
+                                              tmp_path, zephyr_lora_files):
+    invalid_rank = tmp_path / "invalid_rank"
+
+    # Copy adapter from zephyr_lora_files to invalid_rank
+    shutil.copytree(zephyr_lora_files, invalid_rank)
+
+    with open(invalid_rank / "adapter_config.json") as f:
+        adapter_config = json.load(f)
+
+    print(adapter_config)
+
+    # assert False
+
+    # Change rank to invalid value
+    adapter_config["r"] = 1024
+    with open(invalid_rank / "adapter_config.json", "w") as f:
+        json.dump(adapter_config, f)
+
+    with pytest.raises(openai.BadRequestError,
+                       match="is greater than max_lora_rank"):
+        await client.post("load_lora_adapter",
+                          cast_to=str,
+                          body={
+                              "lora_name": "invalid-json",
+                              "lora_path": str(invalid_rank)
+                          })
+
+
+@pytest.mark.asyncio
+async def test_multiple_lora_adapters(client: openai.AsyncOpenAI, tmp_path,
+                                      zephyr_lora_files):
+    """Validate that many loras can be dynamically registered and inferenced 
+    with concurrently"""
+
+    # This test file configures the server with --max-cpu-loras=2 and this test
+    # will concurrently load 10 adapters, so it should flex the LRU cache
+    async def load_and_run_adapter(adapter_name: str):
+        await client.post("load_lora_adapter",
+                          cast_to=str,
+                          body={
+                              "lora_name": adapter_name,
+                              "lora_path": str(zephyr_lora_files)
+                          })
+        for _ in range(3):
+            await client.completions.create(
+                model=adapter_name,
+                prompt=["Hello there", "Foo bar bazz buzz"],
+                max_tokens=5,
+            )
+
+    lora_tasks = []
+    for i in range(10):
+        lora_tasks.append(
+            asyncio.create_task(load_and_run_adapter(f"adapter_{i}")))
+
+    results, _ = await asyncio.wait(lora_tasks)
+
+    for r in results:
+        assert not isinstance(r, Exception), f"Got exception {r}"
+
+
+@pytest.mark.asyncio
+async def test_loading_invalid_adapters_does_not_break_others(
+        client: openai.AsyncOpenAI, tmp_path, zephyr_lora_files):
+
+    invalid_files = tmp_path / "invalid_files"
+    invalid_files.mkdir()
+    (invalid_files / "adapter_config.json").write_text("this is not json")
+
+    stop_good_requests_event = asyncio.Event()
+
+    async def run_good_requests(client):
+        # Run chat completions requests until event set
+
+        results = []
+
+        while not stop_good_requests_event.is_set():
+            try:
+                batch = await client.completions.create(
+                    model="zephyr-lora",
+                    prompt=["Hello there", "Foo bar bazz buzz"],
+                    max_tokens=5,
+                )
+                results.append(batch)
+            except Exception as e:
+                results.append(e)
+
+        return results
+
+    # Create task to run good requests
+    good_task = asyncio.create_task(run_good_requests(client))
+
+    # Run a bunch of bad adapter loads
+    for _ in range(25):
+        with suppress(openai.NotFoundError):
+            await client.post("load_lora_adapter",
+                              cast_to=str,
+                              body={
+                                  "lora_name": "notfound",
+                                  "lora_path": "/not/an/adapter"
+                              })
+    for _ in range(25):
+        with suppress(openai.BadRequestError):
+            await client.post("load_lora_adapter",
+                              cast_to=str,
+                              body={
+                                  "lora_name": "invalid",
+                                  "lora_path": str(invalid_files)
+                              })
+
+    # Ensure all the running requests with lora adapters succeeded
+    stop_good_requests_event.set()
+    results = await good_task
+    for r in results:
+        assert not isinstance(r, Exception), f"Got exception {r}"
+
+    # Ensure we can load another adapter and run it
+    await client.post("load_lora_adapter",
+                      cast_to=str,
+                      body={
+                          "lora_name": "valid",
+                          "lora_path": zephyr_lora_files
+                      })
+    await client.completions.create(
+        model="valid",
+        prompt=["Hello there", "Foo bar bazz buzz"],
+        max_tokens=5,
+    )
diff --git a/tests/entrypoints/openai/test_lora_lineage.py b/tests/entrypoints/openai/test_lora_lineage.py
deleted file mode 100644
index ce4f85c13fff9..0000000000000
--- a/tests/entrypoints/openai/test_lora_lineage.py
+++ /dev/null
@@ -1,109 +0,0 @@
-import json
-
-import openai  # use the official client for correctness check
-import pytest
-import pytest_asyncio
-# downloading lora to test lora requests
-from huggingface_hub import snapshot_download
-
-from ...utils import RemoteOpenAIServer
-
-# any model with a chat template should work here
-MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
-# technically this needs Mistral-7B-v0.1 as base, but we're not testing
-# generation quality here
-LORA_NAME = "typeof/zephyr-7b-beta-lora"
-
-
-@pytest.fixture(scope="module")
-def zephyr_lora_files():
-    return snapshot_download(repo_id=LORA_NAME)
-
-
-@pytest.fixture(scope="module")
-def server_with_lora_modules_json(zephyr_lora_files):
-    # Define the json format LoRA module configurations
-    lora_module_1 = {
-        "name": "zephyr-lora",
-        "path": zephyr_lora_files,
-        "base_model_name": MODEL_NAME
-    }
-
-    lora_module_2 = {
-        "name": "zephyr-lora2",
-        "path": zephyr_lora_files,
-        "base_model_name": MODEL_NAME
-    }
-
-    args = [
-        # use half precision for speed and memory savings in CI environment
-        "--dtype",
-        "bfloat16",
-        "--max-model-len",
-        "8192",
-        "--enforce-eager",
-        # lora config below
-        "--enable-lora",
-        "--lora-modules",
-        json.dumps(lora_module_1),
-        json.dumps(lora_module_2),
-        "--max-lora-rank",
-        "64",
-        "--max-cpu-loras",
-        "2",
-        "--max-num-seqs",
-        "64",
-    ]
-
-    # Enable the /v1/load_lora_adapter endpoint
-    envs = {"VLLM_ALLOW_RUNTIME_LORA_UPDATING": "True"}
-
-    with RemoteOpenAIServer(MODEL_NAME, args, env_dict=envs) as remote_server:
-        yield remote_server
-
-
-@pytest_asyncio.fixture
-async def client_for_lora_lineage(server_with_lora_modules_json):
-    async with server_with_lora_modules_json.get_async_client(
-    ) as async_client:
-        yield async_client
-
-
-@pytest.mark.asyncio
-async def test_static_lora_lineage(client_for_lora_lineage: openai.AsyncOpenAI,
-                                   zephyr_lora_files):
-    models = await client_for_lora_lineage.models.list()
-    models = models.data
-    served_model = models[0]
-    lora_models = models[1:]
-    assert served_model.id == MODEL_NAME
-    assert served_model.root == MODEL_NAME
-    assert served_model.parent is None
-    assert all(lora_model.root == zephyr_lora_files
-               for lora_model in lora_models)
-    assert all(lora_model.parent == MODEL_NAME for lora_model in lora_models)
-    assert lora_models[0].id == "zephyr-lora"
-    assert lora_models[1].id == "zephyr-lora2"
-
-
-@pytest.mark.asyncio
-async def test_dynamic_lora_lineage(
-        client_for_lora_lineage: openai.AsyncOpenAI, zephyr_lora_files):
-
-    response = await client_for_lora_lineage.post("load_lora_adapter",
-                                                  cast_to=str,
-                                                  body={
-                                                      "lora_name":
-                                                      "zephyr-lora-3",
-                                                      "lora_path":
-                                                      zephyr_lora_files
-                                                  })
-    # Ensure adapter loads before querying /models
-    assert "success" in response
-
-    models = await client_for_lora_lineage.models.list()
-    models = models.data
-    dynamic_lora_model = models[-1]
-    assert dynamic_lora_model.root == zephyr_lora_files
-    assert dynamic_lora_model.parent == MODEL_NAME
-    assert dynamic_lora_model.id == "zephyr-lora-3"
diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py
index f431d1065e0eb..85f485364a411 100644
--- a/tests/entrypoints/openai/test_serving_chat.py
+++ b/tests/entrypoints/openai/test_serving_chat.py
@@ -52,7 +52,7 @@ async def _async_serving_chat_init():
     engine = MockEngine()
     model_config = await engine.get_model_config()
 
-    models = OpenAIServingModels(model_config, BASE_MODEL_PATHS)
+    models = OpenAIServingModels(engine, model_config, BASE_MODEL_PATHS)
     serving_completion = OpenAIServingChat(engine,
                                            model_config,
                                            models,
@@ -73,7 +73,8 @@ def test_serving_chat_should_set_correct_max_tokens():
     mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
     mock_engine.errored = False
 
-    models = OpenAIServingModels(base_model_paths=BASE_MODEL_PATHS,
+    models = OpenAIServingModels(engine_client=mock_engine,
+                                 base_model_paths=BASE_MODEL_PATHS,
                                  model_config=MockModelConfig())
     serving_chat = OpenAIServingChat(mock_engine,
                                      MockModelConfig(),
@@ -116,7 +117,8 @@ def test_serving_chat_could_load_correct_generation_config():
     mock_engine.errored = False
 
     # Initialize the serving chat
-    models = OpenAIServingModels(base_model_paths=BASE_MODEL_PATHS,
+    models = OpenAIServingModels(engine_client=mock_engine,
+                                 base_model_paths=BASE_MODEL_PATHS,
                                  model_config=mock_model_config)
     serving_chat = OpenAIServingChat(mock_engine,
                                      mock_model_config,
diff --git a/tests/entrypoints/openai/test_serving_models.py b/tests/entrypoints/openai/test_serving_models.py
index 96897dc730da2..657ea20213ec9 100644
--- a/tests/entrypoints/openai/test_serving_models.py
+++ b/tests/entrypoints/openai/test_serving_models.py
@@ -4,6 +4,7 @@
 import pytest
 
 from vllm.config import ModelConfig
+from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.openai.protocol import (ErrorResponse,
                                               LoadLoraAdapterRequest,
                                               UnloadLoraAdapterRequest)
@@ -21,13 +22,16 @@
 
 async def _async_serving_models_init() -> OpenAIServingModels:
     mock_model_config = MagicMock(spec=ModelConfig)
+    mock_engine_client = MagicMock(spec=EngineClient)
     # Set the max_model_len attribute to avoid missing attribute
     mock_model_config.max_model_len = 2048
 
-    serving_models = OpenAIServingModels(base_model_paths=BASE_MODEL_PATHS,
+    serving_models = OpenAIServingModels(engine_client=mock_engine_client,
+                                         base_model_paths=BASE_MODEL_PATHS,
                                          model_config=mock_model_config,
                                          lora_modules=None,
                                          prompt_adapters=None)
+    await serving_models.init_static_loras()
 
     return serving_models
 
@@ -113,5 +117,5 @@ async def test_unload_lora_adapter_not_found():
     request = UnloadLoraAdapterRequest(lora_name="nonexistent_adapter")
     response = await serving_models.unload_lora_adapter(request)
     assert isinstance(response, ErrorResponse)
-    assert response.type == "InvalidUserInput"
-    assert response.code == HTTPStatus.BAD_REQUEST
+    assert response.type == "NotFoundError"
+    assert response.code == HTTPStatus.NOT_FOUND
diff --git a/tests/entrypoints/openai/test_shutdown.py b/tests/entrypoints/openai/test_shutdown.py
index 6fcc92022855b..090523a836e12 100644
--- a/tests/entrypoints/openai/test_shutdown.py
+++ b/tests/entrypoints/openai/test_shutdown.py
@@ -1,6 +1,3 @@
-import json
-import os
-
 import openai
 import pytest
 
@@ -10,16 +7,7 @@
 
 
 @pytest.mark.asyncio
-async def test_shutdown_on_engine_failure(tmp_path):
-    # Use a bad adapter to crash the engine
-    # (This test will fail when that bug is fixed)
-    adapter_path = tmp_path / "bad_adapter"
-    os.mkdir(adapter_path)
-    with open(adapter_path / "adapter_model_config.json", "w") as f:
-        json.dump({"not": "real"}, f)
-    with open(adapter_path / "adapter_model.safetensors", "wb") as f:
-        f.write(b"this is fake")
-
+async def test_shutdown_on_engine_failure():
     # dtype, max-len etc set so that this can run in CI
     args = [
         "--dtype",
@@ -29,9 +17,6 @@ async def test_shutdown_on_engine_failure(tmp_path):
         "--enforce-eager",
         "--max-num-seqs",
         "128",
-        "--enable-lora",
-        "--lora-modules",
-        f"bad-adapter={tmp_path / 'bad_adapter'}",
     ]
 
     with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
@@ -39,9 +24,13 @@ async def test_shutdown_on_engine_failure(tmp_path):
 
             with pytest.raises(
                 (openai.APIConnectionError, openai.InternalServerError)):
-                # This crashes the engine
-                await client.completions.create(model="bad-adapter",
-                                                prompt="Hello, my name is")
+                # Asking for lots of prompt logprobs will currently crash the
+                # engine. This may change in the future when that bug is fixed
+                prompt = "Hello " * 4000
+                await client.completions.create(
+                    model=MODEL_NAME,
+                    prompt=prompt,
+                    extra_body={"prompt_logprobs": 10})
 
             # Now the server should shut down
             return_code = remote_server.proc.wait(timeout=8)
diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index 66a5089074ff5..da23ed19ef7be 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -1257,6 +1257,10 @@ async def stop_profile(self) -> None:
         else:
             self.engine.model_executor._run_workers("stop_profile")
 
+    async def add_lora(self, lora_request: LoRARequest) -> None:
+        """Load a new LoRA adapter into the engine for future requests."""
+        self.engine.add_lora(lora_request)
+
 
 # TODO(v1): Remove this class proxy when V1 goes default.
 if envs.VLLM_USE_V1:
diff --git a/vllm/engine/multiprocessing/__init__.py b/vllm/engine/multiprocessing/__init__.py
index 420f540d0b5f4..7132f9840001a 100644
--- a/vllm/engine/multiprocessing/__init__.py
+++ b/vllm/engine/multiprocessing/__init__.py
@@ -1,4 +1,5 @@
-from dataclasses import dataclass
+import uuid
+from dataclasses import dataclass, field
 from enum import Enum
 from typing import List, Mapping, Optional, Union, overload
 
@@ -120,10 +121,23 @@ class RPCUProfileRequest(Enum):
     STOP_PROFILE = 2
 
 
+@dataclass
+class RPCLoadAdapterRequest:
+    lora_request: LoRARequest
+    # Set the default value of request_id to a new UUID
+    request_id: str = field(default_factory=lambda: str(uuid.uuid4()))
+
+
+@dataclass
+class RPCAdapterLoadedResponse:
+    request_id: str
+
+
 RPC_REQUEST_T = Union[RPCProcessRequest, RPCAbortRequest, RPCStartupRequest,
-                      RPCUProfileRequest]
+                      RPCUProfileRequest, RPCLoadAdapterRequest]
 
-REQUEST_OUTPUTS_T = Union[List[RequestOutput], RPCError]
+REQUEST_OUTPUTS_T = Union[List[RequestOutput], RPCAdapterLoadedResponse,
+                          RPCError]
 
 
 def ENGINE_DEAD_ERROR(
diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py
index 0a046c71e86e8..a9ab899535180 100644
--- a/vllm/engine/multiprocessing/client.py
+++ b/vllm/engine/multiprocessing/client.py
@@ -25,8 +25,10 @@
                                          IPC_HEALTH_EXT, IPC_INPUT_EXT,
                                          IPC_OUTPUT_EXT, RPC_REQUEST_T,
                                          VLLM_RPC_SUCCESS_STR, RPCAbortRequest,
-                                         RPCError, RPCProcessRequest,
-                                         RPCStartupRequest, RPCStartupResponse,
+                                         RPCAdapterLoadedResponse, RPCError,
+                                         RPCLoadAdapterRequest,
+                                         RPCProcessRequest, RPCStartupRequest,
+                                         RPCStartupResponse,
                                          RPCUProfileRequest)
 from vllm.engine.protocol import EngineClient
 # yapf: enable
@@ -240,17 +242,22 @@ async def run_output_handler_loop(self):
                         queue = self.output_queues.get(request_id)
                         if queue is not None:
                             queue.put_nowait(exception)
+                # Put each output into the appropriate queue.
+                elif isinstance(request_outputs, RPCAdapterLoadedResponse):
+                    self._add_output(request_outputs)
                 else:
-                    # Put each output into the appropriate steam.
                     for request_output in request_outputs:
-                        queue = self.output_queues.get(
-                            request_output.request_id)
-                        if queue is not None:
-                            queue.put_nowait(request_output)
+                        self._add_output(request_output)
 
         except asyncio.CancelledError:
             logger.debug("Shutting down MQLLMEngineClient output handler.")
 
+    def _add_output(self, request_output: Union[RequestOutput,
+                                                RPCAdapterLoadedResponse]):
+        queue = self.output_queues.get(request_output.request_id)
+        if queue is not None:
+            queue.put_nowait(request_output)
+
     async def setup(self):
         """Setup the client before it starts sending server requests."""
 
@@ -659,3 +666,24 @@ async def stop_profile(self) -> None:
 
         await self._send_one_way_rpc_request(
             request=RPCUProfileRequest.STOP_PROFILE, socket=self.input_socket)
+
+    async def add_lora(self, lora_request: LoRARequest) -> None:
+        """Load a new LoRA adapter into the engine for future requests."""
+        # Uses the same I/O as generate requests
+        request = RPCLoadAdapterRequest(lora_request)
+
+        # Create output queue for this requests.
+        queue: asyncio.Queue[Union[None, BaseException]] = asyncio.Queue()
+        self.output_queues[request.request_id] = queue
+
+        # Send the request
+        request_bytes = pickle.dumps(request)
+        await self.input_socket.send_multipart((request_bytes, ), copy=False)
+
+        # Wait for the response
+        request_output = await queue.get()
+        self.output_queues.pop(request.request_id)
+
+        # Raise on error, otherwise happily return None
+        if isinstance(request_output, BaseException):
+            raise request_output
diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py
index 49a90b321dac4..36f4df4b02731 100644
--- a/vllm/engine/multiprocessing/engine.py
+++ b/vllm/engine/multiprocessing/engine.py
@@ -14,8 +14,10 @@
                                          IPC_HEALTH_EXT, IPC_INPUT_EXT,
                                          IPC_OUTPUT_EXT, REQUEST_OUTPUTS_T,
                                          VLLM_RPC_SUCCESS_STR, RPCAbortRequest,
-                                         RPCError, RPCProcessRequest,
-                                         RPCStartupRequest, RPCStartupResponse,
+                                         RPCAdapterLoadedResponse, RPCError,
+                                         RPCLoadAdapterRequest,
+                                         RPCProcessRequest, RPCStartupRequest,
+                                         RPCStartupResponse,
                                          RPCUProfileRequest)
 # yapf: enable
 from vllm.executor.gpu_executor import GPUExecutor
@@ -234,6 +236,8 @@ def handle_new_input(self):
                         self.start_profile()
                     else:
                         self.stop_profile()
+                elif isinstance(request, RPCLoadAdapterRequest):
+                    self._handle_load_adapter_request(request)
                 else:
                     raise ValueError("Unknown RPCRequest Type: "
                                      f"{type(request)}")
@@ -284,6 +288,19 @@ def _handle_abort_request(self, request: RPCAbortRequest):
         if self.log_requests:
             logger.info("Aborted request %s.", request.request_id)
 
+    def _handle_load_adapter_request(self, request: RPCLoadAdapterRequest):
+        try:
+            self.engine.add_lora(request.lora_request)
+        except BaseException as e:
+            # Send back an error if the adater fails to load
+            rpc_err = RPCError(request_id=request.request_id,
+                               is_engine_errored=False,
+                               exception=e)
+            self._send_outputs(rpc_err)
+        # Otherwise, send back the successful load message
+        self._send_outputs(
+            RPCAdapterLoadedResponse(request_id=request.request_id))
+
     def _health_check(self):
         # Send unhealthy if engine has already errored
         if self._errored_with is not None:
@@ -296,7 +313,11 @@ def _health_check(self):
             self._send_unhealthy(e)
 
     def _send_outputs(self, outputs: REQUEST_OUTPUTS_T):
-        """Send List of RequestOutput to RPCClient."""
+        """Send outputs back to the engine client. These can be:
+        - Exceptions
+        - A list of generation outputs
+        - A response from loading a lora adapter
+        """
         if outputs:
             try:
                 from ray.exceptions import RayTaskError
diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py
index a066836b92708..f05ff62c4766b 100644
--- a/vllm/engine/protocol.py
+++ b/vllm/engine/protocol.py
@@ -270,3 +270,8 @@ async def start_profile(self) -> None:
     async def stop_profile(self) -> None:
         """Start profiling the engine"""
         ...
+
+    @abstractmethod
+    async def add_lora(self, lora_request: LoRARequest) -> None:
+        """Load a new LoRA adapter into the engine for future requests."""
+        ...
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index bc1471e1f534d..925d7db43138b 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -662,7 +662,7 @@ async def add_request_id(request: Request, call_next):
     return app
 
 
-def init_app_state(
+async def init_app_state(
     engine_client: EngineClient,
     model_config: ModelConfig,
     state: State,
@@ -690,12 +690,13 @@ def init_app_state(
     logger.info("Using supplied chat template:\n%s", resolved_chat_template)
 
     state.openai_serving_models = OpenAIServingModels(
+        engine_client=engine_client,
         model_config=model_config,
         base_model_paths=base_model_paths,
         lora_modules=args.lora_modules,
         prompt_adapters=args.prompt_adapters,
     )
-    # TODO: The chat template is now broken for lora adapters :(
+    await state.openai_serving_models.init_static_loras()
     state.openai_serving_chat = OpenAIServingChat(
         engine_client,
         model_config,
@@ -794,7 +795,7 @@ def signal_handler(*_) -> None:
         app = build_app(args)
 
         model_config = await engine_client.get_model_config()
-        init_app_state(engine_client, model_config, app.state, args)
+        await init_app_state(engine_client, model_config, app.state, args)
 
         shutdown_task = await serve_http(
             app,
diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py
index 822c0f5f7c211..f8f136f9d5024 100644
--- a/vllm/entrypoints/openai/run_batch.py
+++ b/vllm/entrypoints/openai/run_batch.py
@@ -215,6 +215,7 @@ async def main(args):
 
     # Create the openai serving objects.
     openai_serving_models = OpenAIServingModels(
+        engine_client=engine,
         model_config=model_config,
         base_model_paths=base_model_paths,
         lora_modules=None,
diff --git a/vllm/entrypoints/openai/serving_models.py b/vllm/entrypoints/openai/serving_models.py
index 26966896bc272..a222eafadcb68 100644
--- a/vllm/entrypoints/openai/serving_models.py
+++ b/vllm/entrypoints/openai/serving_models.py
@@ -5,15 +5,19 @@
 from typing import List, Optional, Union
 
 from vllm.config import ModelConfig
+from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.openai.protocol import (ErrorResponse,
                                               LoadLoraAdapterRequest,
                                               ModelCard, ModelList,
                                               ModelPermission,
                                               UnloadLoraAdapterRequest)
+from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.utils import AtomicCounter
 
+logger = init_logger(__name__)
+
 
 @dataclass
 class BaseModelPath:
@@ -45,6 +49,7 @@ class OpenAIServingModels:
 
     def __init__(
         self,
+        engine_client: EngineClient,
         model_config: ModelConfig,
         base_model_paths: List[BaseModelPath],
         *,
@@ -55,20 +60,11 @@ def __init__(
 
         self.base_model_paths = base_model_paths
         self.max_model_len = model_config.max_model_len
+        self.engine_client = engine_client
 
+        self.static_lora_modules = lora_modules
+        self.lora_requests: List[LoRARequest] = []
         self.lora_id_counter = AtomicCounter(0)
-        self.lora_requests = []
-        if lora_modules is not None:
-            self.lora_requests = [
-                LoRARequest(lora_name=lora.name,
-                            lora_int_id=i,
-                            lora_path=lora.path,
-                            base_model_name=lora.base_model_name
-                            if lora.base_model_name
-                            and self.is_base_model(lora.base_model_name) else
-                            self.base_model_paths[0].name)
-                for i, lora in enumerate(lora_modules, start=1)
-            ]
 
         self.prompt_adapter_requests = []
         if prompt_adapters is not None:
@@ -84,6 +80,19 @@ def __init__(
                         prompt_adapter_local_path=prompt_adapter.local_path,
                         prompt_adapter_num_virtual_tokens=num_virtual_tokens))
 
+    async def init_static_loras(self):
+        """Loads all static LoRA modules.
+        Raises if any fail to load"""
+        if self.static_lora_modules is None:
+            return
+        for lora in self.static_lora_modules:
+            load_request = LoadLoraAdapterRequest(lora_path=lora.path,
+                                                  lora_name=lora.name)
+            load_result = await self.load_lora_adapter(
+                request=load_request, base_model_name=lora.base_model_name)
+            if isinstance(load_result, ErrorResponse):
+                raise ValueError(load_result.message)
+
     def is_base_model(self, model_name):
         return any(model.name == model_name for model in self.base_model_paths)
 
@@ -129,17 +138,47 @@ async def show_available_models(self) -> ModelList:
 
     async def load_lora_adapter(
             self,
-            request: LoadLoraAdapterRequest) -> Union[ErrorResponse, str]:
+            request: LoadLoraAdapterRequest,
+            base_model_name: Optional[str] = None
+    ) -> Union[ErrorResponse, str]:
         error_check_ret = await self._check_load_lora_adapter_request(request)
         if error_check_ret is not None:
             return error_check_ret
 
         lora_name, lora_path = request.lora_name, request.lora_path
         unique_id = self.lora_id_counter.inc(1)
-        self.lora_requests.append(
-            LoRARequest(lora_name=lora_name,
-                        lora_int_id=unique_id,
-                        lora_path=lora_path))
+        lora_request = LoRARequest(lora_name=lora_name,
+                                   lora_int_id=unique_id,
+                                   lora_path=lora_path)
+        if base_model_name is not None and self.is_base_model(base_model_name):
+            lora_request.base_model_name = base_model_name
+
+        # Validate that the adapter can be loaded into the engine
+        # This will also pre-load it for incoming requests
+        try:
+            await self.engine_client.add_lora(lora_request)
+        except ValueError as e:
+            # Adapter not found or lora configuration errors
+            if "No adapter found" in str(e):
+                return create_error_response(message=str(e),
+                                             err_type="NotFoundError",
+                                             status_code=HTTPStatus.NOT_FOUND)
+            else:
+                return create_error_response(
+                    message=str(e),
+                    err_type="BadRequestError",
+                    status_code=HTTPStatus.BAD_REQUEST)
+        except BaseException as e:
+            # Some other unexpected problem loading the adapter, e.g. malformed
+            # input files.
+            # More detailed error messages for the user would be nicer here
+            return create_error_response(message=str(e),
+                                         err_type="BadRequestError",
+                                         status_code=HTTPStatus.BAD_REQUEST)
+
+        self.lora_requests.append(lora_request)
+        logger.info("Loaded new LoRA adapter: name '%s', path '%s'", lora_name,
+                    lora_path)
         return f"Success: LoRA adapter '{lora_name}' added successfully."
 
     async def unload_lora_adapter(
@@ -155,6 +194,7 @@ async def unload_lora_adapter(
             lora_request for lora_request in self.lora_requests
             if lora_request.lora_name != lora_name
         ]
+        logger.info("Removed LoRA adapter: name '%s'", lora_name)
         return f"Success: LoRA adapter '{lora_name}' removed successfully."
 
     async def _check_load_lora_adapter_request(
@@ -195,8 +235,8 @@ async def _check_unload_lora_adapter_request(
             return create_error_response(
                 message=
                 f"The lora adapter '{request.lora_name}' cannot be found.",
-                err_type="InvalidUserInput",
-                status_code=HTTPStatus.BAD_REQUEST)
+                err_type="NotFoundError",
+                status_code=HTTPStatus.NOT_FOUND)
 
         return None
 
diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py
index 10976fac23028..eec462743fe9d 100644
--- a/vllm/lora/worker_manager.py
+++ b/vllm/lora/worker_manager.py
@@ -115,6 +115,14 @@ def _load_adapter(self, lora_request: LoRARequest) -> LoRAModel:
                 embedding_padding_modules=self.embedding_padding_modules,
                 weights_mapper=hf_to_vllm_mapper)
 
+        except FileNotFoundError as e:
+            # FileNotFoundError should be raised if both
+            # - No adapter found to download from huggingface (or in
+            #       offline mode)
+            # - No local adapter files found at `lora_request.lora_path`
+            raise ValueError(
+                f"Loading lora {lora_request.lora_name} failed: No adapter "
+                f"found for {lora_path}") from e
         except Exception as e:
             raise RuntimeError(f"Loading lora {lora_path} failed") from e
         if lora.rank > self.lora_config.max_lora_rank:
@@ -209,12 +217,19 @@ def _apply_adapters(self, lora_requests: Set[LoRARequest]) -> None:
 
     def add_adapter(self, lora_request: LoRARequest) -> bool:
         if lora_request.lora_int_id not in self.list_adapters():
-            # Remove before we load the new lora to save memory
+            # Load the new adapter first to ensure it is actually valid, before
+            # evicting any existing adapters.
+            # This may cause the # of loaded lora adapters to very temporarily
+            # exceed `--max-cpu-loras`.
+            lora = self._load_adapter(lora_request)
+
+            # Loading succeeded, now check if we will exceed cache capacity and
+            # evict if the oldest adapter if so
             if len(self._adapter_manager) + 1 > self._adapter_manager.capacity:
                 assert isinstance(self._adapter_manager,
                                   LRUCacheLoRAModelManager)
                 self._adapter_manager.remove_oldest_adapter()
-            lora = self._load_adapter(lora_request)
+            # Then add the new adapter to the cache
             loaded = self._adapter_manager.add_adapter(lora)
         else:
             # If the lora is already loaded, just touch it to
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index b963ba74f13f0..5daae45dee85c 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -339,3 +339,7 @@ def errored(self) -> bool:
     @property
     def dead_error(self) -> BaseException:
         return Exception()  # TODO: implement
+
+    async def add_lora(self, lora_request: LoRARequest) -> None:
+        """Load a new LoRA adapter into the engine for future requests."""
+        raise NotImplementedError("LoRA not yet supported in V1")

From 61af6332565d0093855fee7266699e548b1c0d1c Mon Sep 17 00:00:00 2001
From: Kunshang Ji <kunshang.ji@intel.com>
Date: Fri, 10 Jan 2025 16:20:46 +0800
Subject: [PATCH 159/462] [BUGFIX] Fix `UnspecifiedPlatform` package name
 (#11916)

Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
---
 vllm/platforms/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py
index f6ac14446c021..6ca95b41dbb07 100644
--- a/vllm/platforms/__init__.py
+++ b/vllm/platforms/__init__.py
@@ -179,7 +179,7 @@ def resolve_current_platform_cls_qualname() -> str:
         logger.info("Automatically detected platform %s.",
                     activated_builtin_plugins[0])
     else:
-        platform_cls_qualname = "vllm.interface.UnspecifiedPlatform"
+        platform_cls_qualname = "vllm.platforms.interface.UnspecifiedPlatform"
         logger.info(
             "No platform detected, vLLM is running on UnspecifiedPlatform")
     return platform_cls_qualname

From d53575a5f0e5c0f9003b4ec6e33c8bf761e93cef Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Fri, 10 Jan 2025 16:25:17 +0800
Subject: [PATCH 160/462] [ci] fix gh200 tests (#11919)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 vllm/model_executor/model_loader/weight_utils.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
index 11d5fd7135d9e..9cfcdbf620d2b 100644
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -29,7 +29,9 @@
 
 try:
     from runai_model_streamer import SafetensorsStreamer
-except ImportError:
+except (ImportError, OSError):
+    # see https://github.com/run-ai/runai-model-streamer/issues/26
+    # OSError will be raised on arm64 platform
     runai_model_streamer = PlaceholderModule(
         "runai_model_streamer")  # type: ignore[assignment]
     SafetensorsStreamer = runai_model_streamer.placeholder_attr(

From d907be7dc7926e64d6240bf4425d7399eaed150e Mon Sep 17 00:00:00 2001
From: cennn <61925104+cennn@users.noreply.github.com>
Date: Fri, 10 Jan 2025 17:18:25 +0800
Subject: [PATCH 161/462] [misc] remove python function call for custom
 activation op (#11885)

Co-authored-by: youkaichao <youkaichao@gmail.com>
---
 vllm/_custom_ops.py                      | 27 --------
 vllm/model_executor/layers/activation.py | 79 ++++++++++++++----------
 2 files changed, 46 insertions(+), 60 deletions(-)

diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index afb350591e562..d04cbbc0a9eed 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -34,33 +34,6 @@ def register_fake(fn):
         from torch.library import impl_abstract as register_fake
 
 
-# activation ops
-def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
-    torch.ops._C.gelu_and_mul(out, x)
-
-
-def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
-    torch.ops._C.gelu_tanh_and_mul(out, x)
-
-
-def fatrelu_and_mul(out: torch.Tensor,
-                    x: torch.Tensor,
-                    threshold: float = 0.0) -> None:
-    torch.ops._C.fatrelu_and_mul(out, x, threshold)
-
-
-def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None:
-    torch.ops._C.gelu_fast(out, x)
-
-
-def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None:
-    torch.ops._C.gelu_new(out, x)
-
-
-def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None:
-    torch.ops._C.gelu_quick(out, x)
-
-
 # page attention ops
 def paged_attention_v1(
     out: torch.Tensor,
diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py
index 32456fee06a28..2475190d197d3 100644
--- a/vllm/model_executor/layers/activation.py
+++ b/vllm/model_executor/layers/activation.py
@@ -30,6 +30,8 @@ class FatreluAndMul(CustomOp):
     def __init__(self, threshold: float = 0.):
         super().__init__()
         self.threshold = threshold
+        if current_platform.is_cuda_alike() or current_platform.is_cpu():
+            self.op = torch.ops._C.fatrelu_and_mul
 
     def forward_native(self, x: torch.Tensor) -> torch.Tensor:
         d = x.shape[-1] // 2
@@ -39,12 +41,10 @@ def forward_native(self, x: torch.Tensor) -> torch.Tensor:
         return x1 * x2
 
     def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
-        from vllm import _custom_ops as ops
-
         d = x.shape[-1] // 2
         output_shape = (x.shape[:-1] + (d, ))
         out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
-        ops.fatrelu_and_mul(out, x, self.threshold)
+        self.op(out, x, self.threshold)
         return out
 
 
@@ -103,6 +103,17 @@ def __init__(self, approximate: str = "none"):
         self.approximate = approximate
         if approximate not in ("none", "tanh"):
             raise ValueError(f"Unknown approximate mode: {approximate}")
+        if current_platform.is_cuda_alike() or current_platform.is_cpu():
+            if approximate == "none":
+                self.op = torch.ops._C.gelu_and_mul
+            elif approximate == "tanh":
+                self.op = torch.ops._C.gelu_tanh_and_mul
+        elif current_platform.is_xpu():
+            from vllm._ipex_ops import ipex_ops
+            if approximate == "none":
+                self.op = ipex_ops.gelu_and_mul
+            else:
+                self.op = ipex_ops.gelu_tanh_and_mul
 
     def forward_native(self, x: torch.Tensor) -> torch.Tensor:
         """PyTorch-native implementation equivalent to forward()."""
@@ -110,27 +121,17 @@ def forward_native(self, x: torch.Tensor) -> torch.Tensor:
         return F.gelu(x[..., :d], approximate=self.approximate) * x[..., d:]
 
     def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
-        from vllm import _custom_ops as ops
-
         d = x.shape[-1] // 2
         output_shape = (x.shape[:-1] + (d, ))
         out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
-        if self.approximate == "none":
-            ops.gelu_and_mul(out, x)
-        elif self.approximate == "tanh":
-            ops.gelu_tanh_and_mul(out, x)
+        self.op(out, x)
         return out
 
     def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
-        from vllm._ipex_ops import ipex_ops as ops
-
         d = x.shape[-1] // 2
         output_shape = (x.shape[:-1] + (d, ))
         out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
-        if self.approximate == "none":
-            ops.gelu_and_mul(out, x)
-        elif self.approximate == "tanh":
-            ops.gelu_tanh_and_mul(out, x)
+        self.op(out, x)
         return out
 
     def extra_repr(self) -> str:
@@ -140,6 +141,14 @@ def extra_repr(self) -> str:
 @CustomOp.register("gelu_new")
 class NewGELU(CustomOp):
 
+    def __init__(self):
+        super().__init__()
+        if current_platform.is_cuda_alike() or current_platform.is_cpu():
+            self.op = torch.ops._C.gelu_new
+        elif current_platform.is_xpu():
+            from vllm._ipex_ops import ipex_ops
+            self.op = ipex_ops.gelu_new
+
     def forward_native(self, x: torch.Tensor) -> torch.Tensor:
         """PyTorch-native implementation equivalent to forward()."""
         c = math.sqrt(2.0 / math.pi)
@@ -147,58 +156,62 @@ def forward_native(self, x: torch.Tensor) -> torch.Tensor:
                                            (x + 0.044715 * torch.pow(x, 3.0))))
 
     def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
-        from vllm import _custom_ops as ops
-
         out = torch.empty_like(x)
-        ops.gelu_new(out, x)
+        self.op(out, x)
         return out
 
     def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
-        from vllm._ipex_ops import ipex_ops as ops
-
-        return ops.gelu_new(x)
+        return self.op(x)
 
 
 @CustomOp.register("gelu_fast")
 class FastGELU(CustomOp):
 
+    def __init__(self):
+        super().__init__()
+        if current_platform.is_cuda_alike() or current_platform.is_cpu():
+            self.op = torch.ops._C.gelu_fast
+        elif current_platform.is_xpu():
+            from vllm._ipex_ops import ipex_ops
+            self.op = ipex_ops.gelu_fast
+
     def forward_native(self, x: torch.Tensor) -> torch.Tensor:
         """PyTorch-native implementation equivalent to forward()."""
         return 0.5 * x * (1.0 + torch.tanh(x * 0.7978845608 *
                                            (1.0 + 0.044715 * x * x)))
 
     def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
-        from vllm import _custom_ops as ops
-
         out = torch.empty_like(x)
-        ops.gelu_fast(out, x)
+        self.op(out, x)
         return out
 
     def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
-        from vllm._ipex_ops import ipex_ops as ops
-
-        return ops.gelu_fast(x)
+        return self.op(x)
 
 
 @CustomOp.register("quick_gelu")
 class QuickGELU(CustomOp):
     # https://github.com/huggingface/transformers/blob/main/src/transformers/activations.py#L90
+    def __init__(self):
+        super().__init__()
+        if current_platform.is_cuda_alike() or current_platform.is_cpu():
+            self.op = torch.ops._C.gelu_quick
+        elif current_platform.is_xpu():
+            from vllm._ipex_ops import ipex_ops
+            self.op = ipex_ops.gelu_quick
+
     def forward_native(self, x: torch.Tensor) -> torch.Tensor:
         """PyTorch-native implementation equivalent to forward()."""
         return x * torch.sigmoid(1.702 * x)
 
     def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
-        from vllm import _custom_ops as ops
-
         out = torch.empty_like(x)
-        ops.gelu_quick(out, x)
+        self.op(out, x)
         return out
 
     def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
-        from vllm._ipex_ops import ipex_ops as ops
-
         out = torch.empty_like(x)
-        ops.gelu_quick(out, x)
+        self.op(out, x)
         return out
 
     # TODO implement forward_xpu for QuickGELU

From ef725feafcc1f2d6763cc888751fb2b36840587b Mon Sep 17 00:00:00 2001
From: wangxiyuan <wangxiyuan1007@gmail.com>
Date: Fri, 10 Jan 2025 18:02:38 +0800
Subject: [PATCH 162/462] [platform] support pytorch custom op pluggable
 (#11328)

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
---
 vllm/model_executor/custom_op.py | 7 +++++++
 vllm/platforms/interface.py      | 4 ++++
 2 files changed, 11 insertions(+)

diff --git a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py
index 401606e8c76f0..96995c56bf504 100644
--- a/vllm/model_executor/custom_op.py
+++ b/vllm/model_executor/custom_op.py
@@ -57,6 +57,11 @@ def forward_hpu(self, *args, **kwargs):
         # PyTorch-native implementation.
         return self.forward_native(*args, **kwargs)
 
+    def forward_oot(self, *args, **kwargs):
+        # By default, we assume that OOT ops are compatible with the
+        # PyTorch-native implementation.
+        return self.forward_native(*args, **kwargs)
+
     def dispatch_forward(self):
         # NOTE(woosuk): Here we assume that vLLM was built for only one
         # specific backend. Currently, we do not support dynamic dispatching.
@@ -81,6 +86,8 @@ def dispatch_forward(self):
             return self.forward_tpu
         elif current_platform.is_xpu():
             return self.forward_xpu
+        elif current_platform.is_out_of_tree():
+            return self.forward_oot
         else:
             return self.forward_cuda
 
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
index f440358f65fbb..01d753408e6d0 100644
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -45,6 +45,7 @@ class PlatformEnum(enum.Enum):
     CPU = enum.auto()
     NEURON = enum.auto()
     OPENVINO = enum.auto()
+    OOT = enum.auto()
     UNSPECIFIED = enum.auto()
 
 
@@ -107,6 +108,9 @@ def is_neuron(self) -> bool:
     def is_openvino(self) -> bool:
         return self._enum == PlatformEnum.OPENVINO
 
+    def is_out_of_tree(self) -> bool:
+        return self._enum == PlatformEnum.OOT
+
     def is_cuda_alike(self) -> bool:
         """Stateless version of :func:`torch.cuda.is_available`."""
         return self._enum in (PlatformEnum.CUDA, PlatformEnum.ROCM)

From d85c47d6ad24c286ae55fd9da231808b8ddd7a7f Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Fri, 10 Jan 2025 12:05:56 +0000
Subject: [PATCH 163/462] Replace "online inference" with "online serving"
 (#11923)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 .buildkite/run-cpu-test.sh                                    | 2 +-
 docs/source/features/structured_outputs.md                    | 4 ++--
 docs/source/getting_started/installation/hpu-gaudi.md         | 4 ++--
 docs/source/getting_started/quickstart.md                     | 2 +-
 docs/source/models/generative_models.md                       | 2 +-
 docs/source/models/pooling_models.md                          | 2 +-
 docs/source/models/supported_models.md                        | 4 ++--
 docs/source/serving/multimodal_inputs.md                      | 2 +-
 .../openai_chat_completion_client_for_multimodal.py           | 4 ++--
 tests/models/decoder_only/audio_language/test_ultravox.py     | 4 ++--
 vllm/model_executor/models/molmo.py                           | 2 +-
 11 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh
index 1a4dae8f65e99..5a285be039393 100644
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -61,7 +61,7 @@ function cpu_tests() {
     pytest -s -v -k cpu_model \
     tests/basic_correctness/test_chunked_prefill.py"  
 
-  # online inference
+  # online serving
   docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
     set -e
     export VLLM_CPU_KVCACHE_SPACE=10 
diff --git a/docs/source/features/structured_outputs.md b/docs/source/features/structured_outputs.md
index ccd9a6a1b1a14..a42c3dd64ad10 100644
--- a/docs/source/features/structured_outputs.md
+++ b/docs/source/features/structured_outputs.md
@@ -5,7 +5,7 @@
 vLLM supports the generation of structured outputs using [outlines](https://github.com/dottxt-ai/outlines), [lm-format-enforcer](https://github.com/noamgat/lm-format-enforcer), or [xgrammar](https://github.com/mlc-ai/xgrammar) as backends for the guided decoding.
 This document shows you some examples of the different options that are available to generate structured outputs.
 
-## Online Inference (OpenAI API)
+## Online Serving (OpenAI API)
 
 You can generate structured outputs using the OpenAI's [Completions](https://platform.openai.com/docs/api-reference/completions) and [Chat](https://platform.openai.com/docs/api-reference/chat) API.
 
@@ -239,7 +239,7 @@ The main available options inside `GuidedDecodingParams` are:
 - `backend`
 - `whitespace_pattern`
 
-These parameters can be used in the same way as the parameters from the Online Inference examples above.
+These parameters can be used in the same way as the parameters from the Online Serving examples above.
 One example for the usage of the `choices` parameter is shown below:
 
 ```python
diff --git a/docs/source/getting_started/installation/hpu-gaudi.md b/docs/source/getting_started/installation/hpu-gaudi.md
index 1d50cef3bdc83..21822327c8822 100644
--- a/docs/source/getting_started/installation/hpu-gaudi.md
+++ b/docs/source/getting_started/installation/hpu-gaudi.md
@@ -83,7 +83,7 @@ $ python setup.py develop
 ## Supported Features
 
 - [Offline inference](#offline-inference)
-- Online inference via [OpenAI-Compatible Server](#openai-compatible-server)
+- Online serving via [OpenAI-Compatible Server](#openai-compatible-server)
 - HPU autodetection - no need to manually select device within vLLM
 - Paged KV cache with algorithms enabled for Intel Gaudi accelerators
 - Custom Intel Gaudi implementations of Paged Attention, KV cache ops,
@@ -385,5 +385,5 @@ the below:
   completely. With HPU Graphs disabled, you are trading latency and
   throughput at lower batches for potentially higher throughput on
   higher batches. You can do that by adding `--enforce-eager` flag to
-  server (for online inference), or by passing `enforce_eager=True`
+  server (for online serving), or by passing `enforce_eager=True`
   argument to LLM constructor (for offline inference).
diff --git a/docs/source/getting_started/quickstart.md b/docs/source/getting_started/quickstart.md
index ea15d9ef065fa..d7d43785c6c24 100644
--- a/docs/source/getting_started/quickstart.md
+++ b/docs/source/getting_started/quickstart.md
@@ -5,7 +5,7 @@
 This guide will help you quickly get started with vLLM to perform:
 
 - [Offline batched inference](#quickstart-offline)
-- [Online inference using OpenAI-compatible server](#quickstart-online)
+- [Online serving using OpenAI-compatible server](#quickstart-online)
 
 ## Prerequisites
 
diff --git a/docs/source/models/generative_models.md b/docs/source/models/generative_models.md
index a9f74c4d3fbb8..6a5a58ad74ab7 100644
--- a/docs/source/models/generative_models.md
+++ b/docs/source/models/generative_models.md
@@ -118,7 +118,7 @@ print("Loaded chat template:", custom_template)
 outputs = llm.chat(conversation, chat_template=custom_template)
 ```
 
-## Online Inference
+## Online Serving
 
 Our [OpenAI-Compatible Server](#openai-compatible-server) provides endpoints that correspond to the offline APIs:
 
diff --git a/docs/source/models/pooling_models.md b/docs/source/models/pooling_models.md
index 745f3fd81980d..324b1f550e694 100644
--- a/docs/source/models/pooling_models.md
+++ b/docs/source/models/pooling_models.md
@@ -127,7 +127,7 @@ print(f"Score: {score}")
 
 A code example can be found here: <gh-file:examples/offline_inference/offline_inference_scoring.py>
 
-## Online Inference
+## Online Serving
 
 Our [OpenAI-Compatible Server](#openai-compatible-server) provides endpoints that correspond to the offline APIs:
 
diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md
index acbe27a22a679..72910ea1e2d19 100644
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -552,7 +552,7 @@ See [this page](#multimodal-inputs) on how to pass multi-modal inputs to the mod
 
 ````{important}
 To enable multiple multi-modal items per text prompt, you have to set `limit_mm_per_prompt` (offline inference)
-or `--limit-mm-per-prompt` (online inference). For example, to enable passing up to 4 images per text prompt:
+or `--limit-mm-per-prompt` (online serving). For example, to enable passing up to 4 images per text prompt:
 
 Offline inference:
 ```python
@@ -562,7 +562,7 @@ llm = LLM(
 )
 ```
 
-Online inference:
+Online serving:
 ```bash
 vllm serve Qwen/Qwen2-VL-7B-Instruct --limit-mm-per-prompt image=4
 ```
diff --git a/docs/source/serving/multimodal_inputs.md b/docs/source/serving/multimodal_inputs.md
index 9f5e1b908d786..7e96ed46f2dcc 100644
--- a/docs/source/serving/multimodal_inputs.md
+++ b/docs/source/serving/multimodal_inputs.md
@@ -199,7 +199,7 @@ for o in outputs:
     print(generated_text)
 ```
 
-## Online Inference
+## Online Serving
 
 Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions API](https://platform.openai.com/docs/api-reference/chat).
 
diff --git a/examples/online_serving/openai_chat_completion_client_for_multimodal.py b/examples/online_serving/openai_chat_completion_client_for_multimodal.py
index 213d075542e81..03cc037bb6779 100644
--- a/examples/online_serving/openai_chat_completion_client_for_multimodal.py
+++ b/examples/online_serving/openai_chat_completion_client_for_multimodal.py
@@ -1,5 +1,5 @@
 """An example showing how to use vLLM to serve multimodal models 
-and run online inference with OpenAI client.
+and run online serving with OpenAI client.
 
 Launch the vLLM server with the following command:
 
@@ -309,7 +309,7 @@ def main(args) -> None:
 
 if __name__ == "__main__":
     parser = FlexibleArgumentParser(
-        description='Demo on using OpenAI client for online inference with '
+        description='Demo on using OpenAI client for online serving with '
         'multimodal language models served with vLLM.')
     parser.add_argument('--chat-type',
                         '-c',
diff --git a/tests/models/decoder_only/audio_language/test_ultravox.py b/tests/models/decoder_only/audio_language/test_ultravox.py
index 0bb98df1b58e6..1e329dc4cb22e 100644
--- a/tests/models/decoder_only/audio_language/test_ultravox.py
+++ b/tests/models/decoder_only/audio_language/test_ultravox.py
@@ -237,8 +237,8 @@ def test_models_with_multiple_audios(vllm_runner, audio_assets, dtype: str,
 
 
 @pytest.mark.asyncio
-async def test_online_inference(client, audio_assets):
-    """Exercises online inference with/without chunked prefill enabled."""
+async def test_online_serving(client, audio_assets):
+    """Exercises online serving with/without chunked prefill enabled."""
 
     messages = [{
         "role":
diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py
index 2e60bc719f096..c45ee9b921c9e 100644
--- a/vllm/model_executor/models/molmo.py
+++ b/vllm/model_executor/models/molmo.py
@@ -1068,7 +1068,7 @@ def input_processor_for_molmo(ctx: InputContext, inputs: DecoderOnlyInputs):
         trust_remote_code=model_config.trust_remote_code)
 
     # NOTE: message formatting for raw text prompt is only applied for
-    # offline inference; for online inference, the prompt is always in
+    # offline inference; for online serving, the prompt is always in
     # instruction format and tokenized.
     if prompt is not None and re.match(r"^User:[\s\S]*?(Assistant:)*$",
                                        prompt):

From 241ad7b301facac0728e2b3312d71fe47acc8c9e Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Fri, 10 Jan 2025 20:45:33 +0800
Subject: [PATCH 164/462] [ci] Fix sampler tests (#11922)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 .buildkite/test-pipeline.yaml |  1 +
 tests/conftest.py             | 11 +++++++++--
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index e288f8f30159a..7d13269540864 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -214,6 +214,7 @@ steps:
   - vllm/model_executor/layers
   - vllm/sampling_metadata.py
   - tests/samplers
+  - tests/conftest.py
   commands:
     - pytest -v -s samplers
     - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
diff --git a/tests/conftest.py b/tests/conftest.py
index 917151ddcb8d4..95af4ac1eb17b 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -28,12 +28,13 @@
                               init_distributed_environment,
                               initialize_model_parallel)
 from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt,
-                         to_enc_dec_tuple_list, zip_enc_dec_prompts)
+                         TokensPrompt, to_enc_dec_tuple_list,
+                         zip_enc_dec_prompts)
 from vllm.logger import init_logger
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import BeamSearchParams
 from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, cuda_device_count_stateless,
-                        identity)
+                        identity, is_list_of)
 
 logger = init_logger(__name__)
 
@@ -886,6 +887,12 @@ def generate_beam_search(
         beam_width: int,
         max_tokens: int,
     ) -> List[Tuple[List[List[int]], List[str]]]:
+        if is_list_of(prompts, str, check="all"):
+            prompts = [TextPrompt(prompt=prompt) for prompt in prompts]
+        else:
+            prompts = [
+                TokensPrompt(prompt_token_ids=tokens) for tokens in prompts
+            ]
         outputs = self.model.beam_search(
             prompts,
             BeamSearchParams(beam_width=beam_width, max_tokens=max_tokens))

From 12664ddda522b3a22c5b71eca9b2c907e3a687b3 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Fri, 10 Jan 2025 22:30:25 +0800
Subject: [PATCH 165/462] [Doc] [1/N] Initial guide for merged multi-modal
 processor (#11925)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 docs/requirements-docs.txt                    |   1 +
 docs/source/api/multimodal/index.md           |   2 +-
 docs/source/api/multimodal/inputs.md          |   2 +-
 docs/source/conf.py                           |   1 +
 docs/source/contributing/model/index.md       |   2 +-
 docs/source/contributing/model/multimodal.md  | 380 +++++++++++++++---
 .../source/contributing/model/registration.md |   2 +-
 .../input_processing_pipeline.md              |  19 -
 .../input_processing/model_inputs_index.md    |  43 --
 docs/source/design/mm_processing.md           |  64 +++
 docs/source/index.md                          |   2 +-
 docs/source/serving/multimodal_inputs.md      |   2 +-
 vllm/config.py                                |   3 +-
 vllm/inputs/__init__.py                       |   3 -
 vllm/inputs/registry.py                       |  12 +-
 vllm/multimodal/__init__.py                   |   4 +-
 vllm/multimodal/base.py                       |  14 -
 vllm/multimodal/inputs.py                     |   3 +-
 vllm/multimodal/registry.py                   |  12 +-
 19 files changed, 403 insertions(+), 168 deletions(-)
 delete mode 100644 docs/source/design/input_processing/input_processing_pipeline.md
 delete mode 100644 docs/source/design/input_processing/model_inputs_index.md
 create mode 100644 docs/source/design/mm_processing.md

diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt
index 64cf6ef8fc19d..8217bc3ba3ded 100644
--- a/docs/requirements-docs.txt
+++ b/docs/requirements-docs.txt
@@ -3,6 +3,7 @@ sphinx-book-theme==1.0.1
 sphinx-copybutton==0.5.2
 myst-parser==3.0.1
 sphinx-argparse==0.4.0
+sphinx-design==0.6.1
 sphinx-togglebutton==0.3.2
 msgspec
 cloudpickle
diff --git a/docs/source/api/multimodal/index.md b/docs/source/api/multimodal/index.md
index 51e24795a34cf..14efdb506d76f 100644
--- a/docs/source/api/multimodal/index.md
+++ b/docs/source/api/multimodal/index.md
@@ -7,7 +7,7 @@ vLLM provides experimental support for multi-modal models through the {mod}`vllm
 Multi-modal inputs can be passed alongside text and token prompts to [supported models](#supported-mm-models)
 via the `multi_modal_data` field in {class}`vllm.inputs.PromptType`.
 
-Looking to add your own multi-modal model? Please follow the instructions listed [here](#enabling-multimodal-inputs).
+Looking to add your own multi-modal model? Please follow the instructions listed [here](#supports-multimodal).
 
 ## Module Contents
 
diff --git a/docs/source/api/multimodal/inputs.md b/docs/source/api/multimodal/inputs.md
index 3d89666113229..76b2fb95a5009 100644
--- a/docs/source/api/multimodal/inputs.md
+++ b/docs/source/api/multimodal/inputs.md
@@ -3,7 +3,7 @@
 ## User-facing inputs
 
 ```{eval-rst}
-.. autodata:: vllm.multimodal.MultiModalDataDict
+.. autodata:: vllm.multimodal.inputs.MultiModalDataDict
 ```
 
 ## Internal data structures
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 1ce11fe057071..bff0141ffbce8 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -43,6 +43,7 @@
     "sphinx.ext.autosummary",
     "myst_parser",
     "sphinxarg.ext",
+    "sphinx_design",
     "sphinx_togglebutton",
 ]
 myst_enable_extensions = [
diff --git a/docs/source/contributing/model/index.md b/docs/source/contributing/model/index.md
index a2d601c83cf47..245e13b795ec4 100644
--- a/docs/source/contributing/model/index.md
+++ b/docs/source/contributing/model/index.md
@@ -2,7 +2,7 @@
 
 # Adding a New Model
 
-This section provides more information on how to integrate a [HuggingFace Transformers](https://github.com/huggingface/transformers) model into vLLM.
+This section provides more information on how to integrate a [PyTorch](https://pytorch.org/) model into vLLM.
 
 ```{toctree}
 :caption: Contents
diff --git a/docs/source/contributing/model/multimodal.md b/docs/source/contributing/model/multimodal.md
index e5dcd1223b361..76ab73e43d24b 100644
--- a/docs/source/contributing/model/multimodal.md
+++ b/docs/source/contributing/model/multimodal.md
@@ -1,6 +1,6 @@
-(enabling-multimodal-inputs)=
+(supports-multimodal)=
 
-# Enabling Multimodal Inputs
+# Multi-Modal Support
 
 This document walks you through the steps to extend a basic model so that it accepts [multi-modal inputs](#multimodal-inputs).
 
@@ -37,103 +37,355 @@ Further update the model as follows:
     ) -> SamplerOutput:
   ```
 
-## 2. Register input mappers
+## 2. Specify processing information
 
-For each modality type that the model accepts as input, decorate the model class with {meth}`MULTIMODAL_REGISTRY.register_input_mapper <vllm.multimodal.MultiModalRegistry.register_input_mapper>`.
-This decorator accepts a function that maps multi-modal inputs to the keyword arguments you have previously defined in {meth}`~torch.nn.Module.forward`.
+Next, create a subclass of {class}`~vllm.multimodal.processing.BaseProcessingInfo`
+to provide basic information related to HF processing.
 
-```diff
-  from vllm.model_executor.models.interfaces import SupportsMultiModal
-+ from vllm.multimodal import MULTIMODAL_REGISTRY
+### Maximum number of input items
 
-+ @MULTIMODAL_REGISTRY.register_image_input_mapper()
-  class YourModelForImage2Seq(nn.Module, SupportsMultiModal):
+You need to override the abstract method {meth}`~vllm.multimodal.processing.BaseProcessingInfo.get_supported_mm_limits`
+to return the maximum number of input items for each modality supported by the model.
+
+For example, if the model supports any number of images but only one video per prompt:
+
+```python
+def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    return {"image": None, "video": 1}
 ```
 
-A default mapper is available for each modality in the core vLLM library. This input mapper will be used if you do not provide your own function.
+### Maximum number of placeholder feature tokens
+
+Also, override the abstract method {meth}`~vllm.multimodal.processing.BaseProcessingInfo.get_mm_max_tokens_per_item`
+to return the maximum number of placeholder feature tokens per input item for each modality.
+
+When calling the model, the output embeddings from the visual encoder are assigned to the input positions
+containing placeholder feature tokens. Therefore, the number of placeholder feature tokens should be equal
+to the size of the output embeddings.
+
+::::{tab-set}
+:::{tab-item} Basic example: LLaVA
+:sync: llava
+
+Looking at the code of HF's `LlavaForConditionalGeneration`:
+
+```python
+# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L530-L544
+n_image_tokens = (input_ids == self.config.image_token_index).sum().item()
+n_image_features = image_features.shape[0] * image_features.shape[1]
+
+if n_image_tokens != n_image_features:
+    raise ValueError(
+        f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
+    )
+special_image_mask = (
+    (input_ids == self.config.image_token_index)
+    .unsqueeze(-1)
+    .expand_as(inputs_embeds)
+    .to(inputs_embeds.device)
+)
+image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
+inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
+```
 
-```{seealso}
-[Input Processing Pipeline](#input-processing-pipeline)
+The number of placeholder feature tokens per image is `image_features.shape[1]`.
+`image_features` is calculated inside the `get_image_features` method:
+
+```python
+# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L290-L300
+image_outputs = self.vision_tower(pixel_values, output_hidden_states=True)
+
+selected_image_feature = image_outputs.hidden_states[vision_feature_layer]
+if vision_feature_select_strategy == "default":
+    selected_image_feature = selected_image_feature[:, 1:]
+elif vision_feature_select_strategy == "full":
+    selected_image_feature = selected_image_feature
+else:
+    raise ValueError(f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}")
+image_features = self.multi_modal_projector(selected_image_feature)
+return image_features
 ```
 
-## 3. Register maximum number of multi-modal tokens
+We can infer that `image_features.shape[1]` is based on `image_outputs.hidden_states.shape[1]` from the vision tower
+(`CLIPVisionModel` for the [`llava-hf/llava-1.5-7b-hf`](https://huggingface.co/llava-hf/llava-1.5-7b-hf) model).
+Moreover, we only need the sequence length (the second dimension of the tensor) to get `image_features.shape[1]`.
+The sequence length is determined by the initial hidden states in `CLIPVisionTransformer` since the attention
+mechanism doesn't change the sequence length of the output hidden states.
+
+```python
+# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/modeling_clip.py#L1094-L1102
+hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
+hidden_states = self.pre_layrnorm(hidden_states)
+
+encoder_outputs = self.encoder(
+    inputs_embeds=hidden_states,
+    output_attentions=output_attentions,
+    output_hidden_states=output_hidden_states,
+    return_dict=return_dict,
+)
+```
 
-For each modality type that the model accepts as input, calculate the maximum possible number of tokens per data item
-and register it via {meth}`INPUT_REGISTRY.register_dummy_data <vllm.inputs.registry.InputRegistry.register_max_multimodal_tokens>`.
+To find the sequence length, we turn to the code of `CLIPVisionEmbeddings`:
+
+```python
+# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/modeling_clip.py#L247-L257
+target_dtype = self.patch_embedding.weight.dtype
+patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))  # shape = [*, width, grid, grid]
+patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
+
+class_embeds = self.class_embedding.expand(batch_size, 1, -1)
+embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
+if interpolate_pos_encoding:
+    embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
+else:
+    embeddings = embeddings + self.position_embedding(self.position_ids)
+return embeddings
+```
 
-```diff
-  from vllm.inputs import INPUT_REGISTRY
-  from vllm.model_executor.models.interfaces import SupportsMultiModal
-  from vllm.multimodal import MULTIMODAL_REGISTRY
+We can infer that `embeddings.shape[1] == self.num_positions`, where
 
-  @MULTIMODAL_REGISTRY.register_image_input_mapper()
-+ @MULTIMODAL_REGISTRY.register_max_image_tokens(<your_calculation>)
-  @INPUT_REGISTRY.register_dummy_data(<your_dummy_data_factory>)
-  class YourModelForImage2Seq(nn.Module, SupportsMultiModal):
+```python
+# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/modeling_clip.py#L195-L196
+self.num_patches = (self.image_size // self.patch_size) ** 2
+self.num_positions = self.num_patches + 1
 ```
 
-Here are some examples:
+Overall, the number of placeholder feature tokens for an image can be calculated as:
 
-- Image inputs (static feature size): [LLaVA-1.5 Model](gh-file:vllm/model_executor/models/llava.py)
-- Image inputs (dynamic feature size): [LLaVA-NeXT Model](gh-file:vllm/model_executor/models/llava_next.py)
+```python
+def get_num_image_tokens(
+    self,
+    *,
+    image_width: int,
+    image_height: int,
+) -> int:
+    hf_config = self.get_hf_config()
+    hf_processor = self.get_hf_processor()
 
-```{seealso}
-[Input Processing Pipeline](#input-processing-pipeline)
+    image_size = hf_config.vision_config.image_size
+    patch_size = hf_config.vision_config.patch_size
+
+    num_image_tokens = (image_size // patch_size) ** 2 + 1
+    if hf_processor.vision_feature_select_strategy == "default":
+        num_image_tokens -= 1
+
+    return num_image_tokens
 ```
 
-## 4. (Optional) Register dummy data
+Notice that the number of image tokens doesn't depend on the image width and height.
+So, we can calculate the maximum number of image tokens using any image size:
 
-During startup, dummy data is passed to the vLLM model to allocate memory. This only consists of text input by default, which may not be applicable to multi-modal models.
-In such cases, you can define your own dummy data by registering a factory method via {meth}`INPUT_REGISTRY.register_dummy_data <vllm.inputs.registry.InputRegistry.register_dummy_data>`.
+```python
+def get_image_size_with_most_features(self) -> ImageSize:
+    hf_config = self.get_hf_config()
+    width = height = hf_config.image_size
+    return ImageSize(width=width, height=height)
 
-```diff
-  from vllm.inputs import INPUT_REGISTRY
-  from vllm.model_executor.models.interfaces import SupportsMultiModal
-  from vllm.multimodal import MULTIMODAL_REGISTRY
+def get_max_image_tokens(self) -> int:
+    target_width, target_height = self.get_image_size_with_most_features()
 
-  @MULTIMODAL_REGISTRY.register_image_input_mapper()
-  @MULTIMODAL_REGISTRY.register_max_image_tokens(<your_calculation>)
-+ @INPUT_REGISTRY.register_dummy_data(<your_dummy_data_factory>)
-  class YourModelForImage2Seq(nn.Module, SupportsMultiModal):
+    return self.get_num_image_tokens(
+        image_width=target_width,
+        image_height=target_height,
+    )
+```
+
+And thus, we can override the method as:
+
+```python
+def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
+    return {"image": self.get_max_image_tokens()}
 ```
 
 ```{note}
-The dummy data should have the maximum possible number of multi-modal tokens, as described in the previous step.
+Our [actual code](gh-file:vllm/model_executor/models/llava.py) is more abstracted to support vision encoders other than CLIP.
 ```
+:::
+::::
+
+## 3. Specify dummy inputs
+
+Then, inherit {class}`~vllm.multimodal.profiling.BaseDummyInputsBuilder` to construct dummy inputs for
+HF processing as well as memory profiling.
+
+### For memory profiling
+
+Override the abstract method {meth}`~vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_processor_inputs`
+to construct dummy inputs for memory profiling. This dummy input should result in the worst-case memory usage of
+the model so that vLLM can reserve the correct amount of memory for it.
+
+Assuming that the memory usage increases with the number of tokens, the dummy input can be constructed based
+on the code for {meth}`~vllm.multimodal.processing.BaseProcessingInfo.get_mm_max_tokens_per_item`.
+
+::::{tab-set}
+:::{tab-item} Basic example: LLaVA
+:sync: llava
+Making use of the `get_image_size_with_most_features` method implemented in the previous section:
+
+```python
+def get_dummy_processor_inputs(
+    self,
+    seq_len: int,
+    mm_counts: Mapping[str, int],
+) -> ProcessorInputs:
+    num_images = mm_counts.get("image", 0)
+
+    processor = self.info.get_hf_processor()
+    image_token = processor.image_token
+  
+    hf_config = self.get_hf_config()
+    target_width, target_height = self.info.get_image_size_with_most_features()
+
+    mm_data = {
+        "image":
+        self._get_dummy_images(width=target_width,
+                               height=target_height,
+                               num_images=num_images)
+    }
+
+    return ProcessorInputs(
+        prompt_text=image_token * num_images,
+        mm_data=mm_data,
+    )
+```
+:::
+::::
 
-Here are some examples:
+## 4. Specify processing details
 
-- Image inputs (static feature size): [LLaVA-1.5 Model](gh-file:vllm/model_executor/models/llava.py)
-- Image inputs (dynamic feature size): [LLaVA-NeXT Model](gh-file:vllm/model_executor/models/llava_next.py)
+Afterwards, create a subclass of {class}`~vllm.multimodal.processing.BaseMultiModalProcessor`
+to fill in the missing details about HF processing.
 
 ```{seealso}
-[Input Processing Pipeline](#input-processing-pipeline)
+[Multi-Modal Data Processing](#mm-processing)
 ```
 
-## 5. (Optional) Register input processor
+### Multi-modal fields
+
+Override {class}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_mm_fields_config` to
+return a schema of the tensors outputted by the HF processor that are related to the input multi-modal items.
+
+::::{tab-set}
+:::{tab-item} Basic example: LLaVA
+:sync: llava
+
+Looking at the model's `forward` method:
+
+```python
+# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L387-L404
+def forward(
+    self,
+    input_ids: torch.LongTensor = None,
+    pixel_values: torch.FloatTensor = None,
+    attention_mask: Optional[torch.Tensor] = None,
+    position_ids: Optional[torch.LongTensor] = None,
+    past_key_values: Optional[List[torch.FloatTensor]] = None,
+    inputs_embeds: Optional[torch.FloatTensor] = None,
+    vision_feature_layer: Optional[int] = None,
+    vision_feature_select_strategy: Optional[str] = None,
+    labels: Optional[torch.LongTensor] = None,
+    use_cache: Optional[bool] = None,
+    output_attentions: Optional[bool] = None,
+    output_hidden_states: Optional[bool] = None,
+    return_dict: Optional[bool] = None,
+    cache_position: Optional[torch.LongTensor] = None,
+    num_logits_to_keep: int = 0,
+) -> Union[Tuple, LlavaCausalLMOutputWithPast]:
+```
 
-Sometimes, there is a need to process inputs at the {class}`~vllm.LLMEngine` level before they are passed to the model executor.
-This is often due to the fact that unlike implementations in HuggingFace Transformers, the reshaping and/or expansion of multi-modal embeddings needs to take place outside model's {meth}`~torch.nn.Module.forward` call.
-You can register input processors via {meth}`INPUT_REGISTRY.register_input_processor <vllm.inputs.registry.InputRegistry.register_input_processor>`.
+The only related keyword argument is `pixel_values` which directly corresponds to input images.
+The shape of `pixel_values` is `(N, C, H, W)` where `N` is the number of images.
+So, we override the method as follows:
+
+```python
+def _get_mm_fields_config(
+    self,
+    hf_inputs: BatchFeature,
+    hf_processor_mm_kwargs: Mapping[str, object],
+) -> Mapping[str, MultiModalFieldConfig]:
+    return dict(
+        pixel_values=MultiModalFieldConfig.batched("image"),
+    )
+```
 
-```diff
-  from vllm.inputs import INPUT_REGISTRY
-  from vllm.model_executor.models.interfaces import SupportsMultiModal
-  from vllm.multimodal import MULTIMODAL_REGISTRY
+```{note}
+Our [actual code](gh-file:vllm/model_executor/models/llava.py) additionally supports
+pre-computed image embeddings, which can be passed to be model via the `image_embeds` argument.
+```
+:::
+::::
 
-  @MULTIMODAL_REGISTRY.register_image_input_mapper()
-  @MULTIMODAL_REGISTRY.register_max_image_tokens(<your_calculation>)
-  @INPUT_REGISTRY.register_dummy_data(<your_dummy_data_factory>)
-+ @INPUT_REGISTRY.register_input_processor(<your_input_processor>)
-  class YourModelForImage2Seq(nn.Module, SupportsMultiModal):
+### Prompt replacements
+
+Override {class}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_replacements` to
+return a list of {class}`~vllm.multimodal.processing.PromptReplacement` instances.
+
+Each {class}`~vllm.multimodal.processing.PromptReplacement` instance specifies a find-and-replace
+operation performed by the HF processor.
+
+::::{tab-set}
+:::{tab-item} Basic example: LLaVA
+:sync: llava
+
+Looking at HF's `LlavaProcessor`:
+
+```python
+# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/processing_llava.py#L167-L170
+prompt_strings = []
+for sample in text:
+    sample = sample.replace(self.image_token, self.image_token * num_image_tokens)
+    prompt_strings.append(sample)
 ```
 
-A common use case of input processors is inserting placeholder tokens to leverage the vLLM framework for attention mask generation.
-Here are some examples:
+It simply repeats each input `image_token` a number of times equal to the number of placeholder feature tokens (`num_image_tokens`).
+Based on this, we override the method as follows:
+
+```python
+def _get_prompt_replacements(
+    self,
+    mm_items: MultiModalDataItems,
+    hf_processor_mm_kwargs: Mapping[str, object],
+    out_mm_kwargs: MultiModalKwargs,
+) -> list[PromptReplacement]:
+    hf_config = self.info.get_hf_config()
+    image_token_id = hf_config.image_token_index
+
+    def get_replacement(item_idx: int):
+        images = mm_items.get_items("image", ImageProcessorItems)
+
+        image_size = images.get_image_size(item_idx)
+        num_image_tokens = self.info.get_num_image_tokens(
+            image_width=image_size.width,
+            image_height=image_size.height,
+        )
+
+        return [image_token_id] * num_image_tokens
+
+    return [
+        PromptReplacement(
+            modality="image",
+            target=[image_token_id],
+            replacement=get_replacement,
+        ),
+    ]
+```
+:::
+::::
 
-- Insert static number of image tokens: [LLaVA-1.5 Model](gh-file:vllm/model_executor/models/llava.py)
-- Insert dynamic number of image tokens: [LLaVA-NeXT Model](gh-file:vllm/model_executor/models/llava_next.py)
+## 5. Register processor-related classes
 
-```{seealso}
-[Input Processing Pipeline](#input-processing-pipeline)
+After you have defined {class}`~vllm.multimodal.processing.BaseProcessingInfo` (Step 2),
+{class}`~vllm.multimodal.profiling.BaseDummyInputsBuilder` (Step 3),
+and {class}`~vllm.multimodal.processing.BaseMultiModalProcessor` (Step 4),
+decorate the model class with {meth}`MULTIMODAL_REGISTRY.register_processor <vllm.multimodal.registry.MultiModalRegistry.register_processor>`
+to register them to the multi-modal registry:
+
+```diff
+  from vllm.model_executor.models.interfaces import SupportsMultiModal
++ from vllm.multimodal import MULTIMODAL_REGISTRY
+
++ @MULTIMODAL_REGISTRY.register_processor(YourMultiModalProcessor,
++                                         info=YourProcessingInfo,
++                                         dummy_inputs=YourDummyInputsBuilder)
+  class YourModelForImage2Seq(nn.Module, SupportsMultiModal):
 ```
diff --git a/docs/source/contributing/model/registration.md b/docs/source/contributing/model/registration.md
index fe5aa94c52896..6a9262669cd29 100644
--- a/docs/source/contributing/model/registration.md
+++ b/docs/source/contributing/model/registration.md
@@ -48,7 +48,7 @@ ModelRegistry.register_model("YourModelForCausalLM", "your_code:YourModelForCaus
 
 ```{important}
 If your model is a multimodal model, ensure the model class implements the {class}`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface.
-Read more about that [here](#enabling-multimodal-inputs).
+Read more about that [here](#supports-multimodal).
 ```
 
 ```{note}
diff --git a/docs/source/design/input_processing/input_processing_pipeline.md b/docs/source/design/input_processing/input_processing_pipeline.md
deleted file mode 100644
index bb16920e3d0c0..0000000000000
--- a/docs/source/design/input_processing/input_processing_pipeline.md
+++ /dev/null
@@ -1,19 +0,0 @@
-(input-processing-pipeline)=
-
-# Input Processing Pipeline
-
-1. Input data is passed to {class}`~vllm.LLMEngine` (or {class}`~vllm.AsyncLLMEngine`).
-
-2. Tokenize the data if necessary.
-
-3. Process the inputs using {meth}`INPUT_REGISTRY.process_input <vllm.inputs.registry.InputRegistry.process_input>`.
-
-   - For example, add placeholder tokens to reserve KV cache for multi-modal embeddings.
-
-4. Send the processed inputs to {class}`~vllm.executor.executor_base.ExecutorBase`.
-
-5. Distribute the inputs via {class}`~vllm.worker.worker_base.WorkerBase` to {class}`~vllm.worker.model_runner_base.ModelRunnerBase`.
-
-6. If the data contains multi-modal data, convert it into keyword arguments using {meth}`MULTIMODAL_REGISTRY.map_input <vllm.multimodal.MultiModalRegistry.map_input>`.
-
-   - For example, convert a {class}`PIL.Image.Image` input to its pixel values for a vision model.
diff --git a/docs/source/design/input_processing/model_inputs_index.md b/docs/source/design/input_processing/model_inputs_index.md
deleted file mode 100644
index cb415366e5a66..0000000000000
--- a/docs/source/design/input_processing/model_inputs_index.md
+++ /dev/null
@@ -1,43 +0,0 @@
-(input-processing)=
-
-# Input Processing
-
-```{eval-rst}
-.. currentmodule:: vllm.inputs
-```
-
-Each model can override parts of vLLM's [input processing pipeline](#input-processing-pipeline) via
-{data}`~vllm.inputs.INPUT_REGISTRY` and {data}`~vllm.multimodal.MULTIMODAL_REGISTRY`.
-
-Currently, this mechanism is only utilized in [multi-modal](#multi-modality) models for preprocessing multi-modal input
-data in addition to input prompt, but it can be extended to text-only language models when needed.
-
-## Guides
-
-```{toctree}
-:maxdepth: 1
-
-input_processing_pipeline
-```
-
-## Module Contents
-
-### LLM Engine Inputs
-
-```{eval-rst}
-.. autoclass:: vllm.inputs.DecoderOnlyInputs
-    :members:
-    :show-inheritance:
-```
-
-### Registry
-
-```{eval-rst}
-.. autodata:: vllm.inputs.INPUT_REGISTRY
-```
-
-```{eval-rst}
-.. automodule:: vllm.inputs.registry
-    :members:
-    :show-inheritance:
-```
diff --git a/docs/source/design/mm_processing.md b/docs/source/design/mm_processing.md
new file mode 100644
index 0000000000000..a0d01205e638c
--- /dev/null
+++ b/docs/source/design/mm_processing.md
@@ -0,0 +1,64 @@
+(mm-processing)=
+
+# Multi-Modal Data Processing
+
+To enable various optimizations in vLLM such as [chunked prefill](#chunked-prefill) and [prefix caching](#automatic-prefix-caching), we use {class}`~vllm.multimodal.processing.BaseMultiModalProcessor` to provide the correspondence between placeholder feature tokens (e.g. `<image>`) and multi-modal inputs (e.g. the raw input image) based on the outputs of HF processor.
+
+Here are the main features of {class}`~vllm.multimodal.processing.BaseMultiModalProcessor`:
+
+## Prompt Replacement Detection
+
+One of the main responsibilies of HF processor is to replace input placeholder tokens (e.g. `<image>` for a single image) with feature placeholder tokens (e.g. `<image><image>...<image>`, the number of which equals to the feature size). The information about which tokens have been replaced is key to finding the correspondence between placeholder feature tokens and multi-modal inputs.
+
+In vLLM, this information is specified using {class}`~vllm.multimodal.processing.PromptReplacement` in {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_replacements`. Given this specification, we can automatically detect whether HF has replaced the input placeholder tokens by checking whether the feature placeholder tokens exist in the prompt.
+
+## Tokenized Prompt Inputs
+
+To enable tokenization in a separate process, we support passing input token IDs alongside multi-modal data.
+
+### The problem
+
+Consider that HF processors follow these main steps:
+
+1. Tokenize the text
+2. Process multi-modal inputs
+3. Perform prompt replacement
+
+And we require that:
+
+- For text + multi-modal inputs, apply all steps 1--3.
+- For tokenized + multi-modal inputs, apply only steps 2--3.
+
+How can we achieve this without rewriting HF processors? We can try to call the HF processor several times on different inputs:
+
+- For text + multi-modal inputs, simply call the HF processor directly.
+- For tokenized + multi-modal inputs, call the processor only on the multi-modal inputs.
+
+While HF processors support text + multi-modal inputs natively, this is not so for tokenized + multi-modal inputs: an error is thrown if the number of input placeholder tokens do not correspond to the number of multi-modal inputs.
+
+Moreover, since the tokenized text has not passed through the HF processor, we have to apply Step 3 by ourselves to keep the output tokens and multi-modal data consistent with each other.
+
+(mm-dummy-text)=
+
+### Dummy text
+
+We work around the first issue by requiring each model to define how to generate dummy text based on the number of multi-modal inputs, via {meth}`~vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_processor_inputs`. This lets us generate dummy text corresponding to the multi-modal inputs and input them together to obtain the processed multi-modal data.
+
+(mm-automatic-prompt-replacement)=
+
+### Automatic prompt replacement
+
+We address the second issue by implementing model-agnostic code in
+{meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._apply_prompt_replacements` to automatically replace input placeholder tokens with feature placeholder tokens based on the specification outputted by {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_replacements`.
+
+### Summary
+
+With the help of dummy text and automatic prompt replacement, our multi-modal processor can finally accept both text and token prompts with multi-modal data. The detailed logic is shown in {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._apply_hf_processor_main`.
+
+## Processor Output Caching
+
+Some HF processors, such as the one for Qwen2-VL, are [very slow](gh-issue:9238). To alleviate this problem, we cache the multi-modal outputs of HF processor to avoid processing the same multi-modal input (e.g. image) again.
+
+When new data is passed in, we first check which items are in the cache, and which ones are missing. The missing items are passed into the HF processor in a single batch and cached, before being merged with the existing items in the cache.
+
+Since we only process the missing multi-modal data items, the number of input placeholder tokens no longer corresponds to the number of the multi-modal inputs, so they can't be passed alongside the text prompt to HF processor. Therefore, we process the text and multi-modal inputs separately, using [dummy text](#mm-dummy-text) to avoid HF errors. Since this skips HF's prompt replacement code, we apply [automatic prompt replacement](#mm-automatic-prompt-replacement) afterwards to keep the output tokens and multi-modal data consistent with each other.
diff --git a/docs/source/index.md b/docs/source/index.md
index 356fa4b7fd573..de74276a50fb6 100644
--- a/docs/source/index.md
+++ b/docs/source/index.md
@@ -145,7 +145,7 @@ design/arch_overview
 design/huggingface_integration
 design/plugin_system
 design/kernel/paged_attention
-design/input_processing/model_inputs_index
+design/mm_processing
 design/automatic_prefix_caching
 design/multiprocessing
 ```
diff --git a/docs/source/serving/multimodal_inputs.md b/docs/source/serving/multimodal_inputs.md
index 7e96ed46f2dcc..a06f121a6899a 100644
--- a/docs/source/serving/multimodal_inputs.md
+++ b/docs/source/serving/multimodal_inputs.md
@@ -14,7 +14,7 @@ and [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/ch
 To input multi-modal data, follow this schema in {class}`vllm.inputs.PromptType`:
 
 - `prompt`: The prompt should follow the format that is documented on HuggingFace.
-- `multi_modal_data`: This is a dictionary that follows the schema defined in {class}`vllm.multimodal.MultiModalDataDict`.
+- `multi_modal_data`: This is a dictionary that follows the schema defined in {class}`vllm.multimodal.inputs.MultiModalDataDict`.
 
 ### Image
 
diff --git a/vllm/config.py b/vllm/config.py
index 13b5390008a35..59b509d5a961e 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -2124,8 +2124,7 @@ class MultiModalConfig:
 
     limit_per_prompt: Mapping[str, int] = field(default_factory=dict)
     """
-    The maximum number of multi-modal input instances allowed per prompt
-    for each :class:`~vllm.multimodal.MultiModalPlugin`.
+    The maximum number of input items allowed per prompt for each modality.
     """
 
     def compute_hash(self) -> str:
diff --git a/vllm/inputs/__init__.py b/vllm/inputs/__init__.py
index aaeecab7ffde1..a0dd89f69bacd 100644
--- a/vllm/inputs/__init__.py
+++ b/vllm/inputs/__init__.py
@@ -11,9 +11,6 @@
 """
 The global :class:`~InputRegistry` which is used by :class:`~vllm.LLMEngine`
 to dispatch data processing according to the target model.
-
-See also:
-    :ref:`input-processing-pipeline`
 """
 
 __all__ = [
diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py
index aad0dfab94a01..4b73ade7af5f0 100644
--- a/vllm/inputs/registry.py
+++ b/vllm/inputs/registry.py
@@ -313,9 +313,6 @@ def dummy_data_for_profiling(
 
         The model is identified by ``model_config``.
 
-        See also:
-            :ref:`enabling-multimodal-inputs`
-
         Note:
             This should be called after
             :meth:`~MultiModalRegistry.init_mm_limits_per_prompt`.
@@ -384,10 +381,8 @@ def register_input_processor(self, processor: InputProcessor):
         Register an input processor to a model class.
 
         The provided function is invoked on each input to the model. This
-        happens before :meth:`~vllm.multimodal.MultiModalRegistry.map_input`.
-
-        See also:
-            :ref:`input-processing-pipeline`
+        happens before
+        :meth:`~vllm.multimodal.registry.MultiModalRegistry.map_input`.
         """
 
         def wrapper(model_cls: N) -> N:
@@ -429,9 +424,6 @@ def process_input(self, model_config: "ModelConfig",
         Apply an input processor to an instance of model inputs.
 
         The model is identified by ``model_config``.
-
-        See also:
-            :ref:`input-processing-pipeline`
         """
         # Avoid circular import
         from vllm.model_executor.model_loader import get_model_architecture
diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py
index 343b9322ecc5e..1d7f5d57fa24e 100644
--- a/vllm/multimodal/__init__.py
+++ b/vllm/multimodal/__init__.py
@@ -8,10 +8,10 @@
 MULTIMODAL_REGISTRY = MultiModalRegistry()
 """
 The global :class:`~MultiModalRegistry` is used by model runners to
-dispatch data processing according to its modality and the target model.
+dispatch data processing according to the target model.
 
 See also:
-    :ref:`input-processing-pipeline`
+    :ref:`mm-processing`
 """
 
 __all__ = [
diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py
index 4941fbac963ca..fd3ec7e0ec8ce 100644
--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
@@ -90,10 +90,6 @@ def register_input_mapper(
         invoked to transform the data into a dictionary of model inputs.
 
         If `None` is provided, then the default input mapper is used instead.
-
-        See also:
-            - :ref:`input-processing-pipeline`
-            - :ref:`enabling-multimodal-inputs`
         """
 
         def wrapper(model_cls: N) -> N:
@@ -126,10 +122,6 @@ def map_input(
 
         Raises:
             TypeError: If the data type is not supported.
-
-        See also:
-            - :ref:`input-processing-pipeline`
-            - :ref:`enabling-multimodal-inputs`
         """
 
         # Avoid circular import
@@ -186,9 +178,6 @@ def register_max_multimodal_tokens(
         for a model class.
 
         If `None` is provided, then the default calculation is used instead.
-
-        See also:
-            :ref:`enabling-multimodal-inputs`
         """
 
         def wrapper(model_cls: N) -> N:
@@ -218,9 +207,6 @@ def get_max_multimodal_tokens(self, model_config: "ModelConfig") -> int:
         If this registry is not applicable to the model, `0` is returned.
 
         The model is identified by ``model_config``.
-
-        See also:
-            :ref:`enabling-multimodal-inputs`
         """
         # Avoid circular import
         from vllm.model_executor.model_loader import get_model_architecture
diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py
index 8680e4175593b..4b63703585214 100644
--- a/vllm/multimodal/inputs.py
+++ b/vllm/multimodal/inputs.py
@@ -493,7 +493,8 @@ def get_items(self, modality: str) -> Sequence[MultiModalKwargsItem]:
 
 class MultiModalInputsV2(TypedDict):
     """
-    Represents the outputs of :class:`vllm.multimodal.MultiModalProcessor`,
+    Represents the outputs of
+    :class:`vllm.multimodal.processing.BaseMultiModalProcessor`,
     ready to be passed to vLLM internals.
     """
 
diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index 9eceefb08c93f..804a91da8c889 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -100,8 +100,7 @@ def __getitem__(self, key: "ModelConfig") -> Dict[str, int]:
 
 class MultiModalRegistry:
     """
-    A registry that dispatches data processing to the
-    :class:`~vllm.multimodal.MultiModalPlugin` for each modality.
+    A registry that dispatches data processing according to the model.
     """
 
     DEFAULT_PLUGINS = (ImagePlugin(), AudioPlugin(), VideoPlugin())
@@ -367,8 +366,7 @@ def register_processor(
         invoked to transform the data into a dictionary of model inputs.
 
         See also:
-            - :ref:`input-processing-pipeline`
-            - :ref:`enabling-multimodal-inputs`
+            :ref:`mm-processing`
         """
 
         def wrapper(model_cls: N) -> N:
@@ -398,6 +396,9 @@ def _get_model_cls(self, model_config: "ModelConfig"):
     def has_processor(self, model_config: "ModelConfig") -> bool:
         """
         Test whether a multi-modal processor is defined for a specific model.
+
+        See also:
+            :ref:`mm-processing`
         """
         return self._get_model_cls(model_config) in self._processor_factories
 
@@ -408,6 +409,9 @@ def create_processor(
     ) -> BaseMultiModalProcessor[BaseProcessingInfo]:
         """
         Create a multi-modal processor for a specific model and tokenizer.
+
+        See also:
+            :ref:`mm-processing`
         """
         model_cls = self._get_model_cls(model_config)
         factories = self._processor_factories[model_cls]

From 20410b2fdac1818ead453018fb07c2ff90ee6770 Mon Sep 17 00:00:00 2001
From: wangxiyuan <wangxiyuan1007@gmail.com>
Date: Fri, 10 Jan 2025 23:46:51 +0800
Subject: [PATCH 166/462] [platform] support custom torch.compile backend key
 (#11318)

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Signed-off-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: youkaichao <youkaichao@gmail.com>
---
 vllm/model_executor/layers/rejection_sampler.py        | 3 ++-
 vllm/model_executor/layers/vocab_parallel_embedding.py | 2 +-
 vllm/model_executor/models/commandr.py                 | 3 ++-
 vllm/model_executor/models/phi3_small.py               | 5 +++--
 vllm/platforms/interface.py                            | 6 ++++++
 5 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/vllm/model_executor/layers/rejection_sampler.py b/vllm/model_executor/layers/rejection_sampler.py
index f173cbde03f44..9d6c3797c62fc 100644
--- a/vllm/model_executor/layers/rejection_sampler.py
+++ b/vllm/model_executor/layers/rejection_sampler.py
@@ -9,6 +9,7 @@
 from vllm.logger import init_logger
 from vllm.model_executor.layers.spec_decode_base_sampler import (
     SpecDecodeStochasticBaseSampler)
+from vllm.platforms import current_platform
 
 logger = init_logger(__name__)
 
@@ -368,7 +369,7 @@ def _smallest_positive_value(self) -> float:
 # Note that we always sample with replacement.
 # probs will be modified in place, but this is fine, as we pass
 # in a copy already.
-@torch.compile(dynamic=True)
+@torch.compile(dynamic=True, backend=current_platform.simple_compile_backend)
 def _multinomial(
     probs: torch.Tensor,
     num_samples: int,
diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py
index 30548e656c557..65920aa61ba15 100644
--- a/vllm/model_executor/layers/vocab_parallel_embedding.py
+++ b/vllm/model_executor/layers/vocab_parallel_embedding.py
@@ -133,7 +133,7 @@ def __post_init__(self):
         assert self.num_added_elements <= self.num_added_elements_padded
 
 
-@torch.compile(dynamic=True)
+@torch.compile(dynamic=True, backend=current_platform.simple_compile_backend)
 def get_masked_input_and_mask(
         input_: torch.Tensor, org_vocab_start_index: int,
         org_vocab_end_index: int, num_org_vocab_padding: int,
diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py
index d22d1f3171463..8d61ece289412 100644
--- a/vllm/model_executor/models/commandr.py
+++ b/vllm/model_executor/models/commandr.py
@@ -45,6 +45,7 @@
     row_parallel_weight_loader)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.model_executor.utils import set_weight_attrs
+from vllm.platforms import current_platform
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsLoRA, SupportsPP
@@ -53,7 +54,7 @@
                     maybe_prefix)
 
 
-@torch.compile
+@torch.compile(backend=current_platform.simple_compile_backend)
 def layer_norm_func(hidden_states, weight, variance_epsilon):
     input_dtype = hidden_states.dtype
     hidden_states = hidden_states.to(torch.float32)
diff --git a/vllm/model_executor/models/phi3_small.py b/vllm/model_executor/models/phi3_small.py
index da7e4cdbc6940..f47676b934e4e 100644
--- a/vllm/model_executor/models/phi3_small.py
+++ b/vllm/model_executor/models/phi3_small.py
@@ -20,6 +20,7 @@
     DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.platforms import current_platform
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsPP
@@ -54,12 +55,12 @@ def weight_loader(self, param: torch.nn.Parameter,
         return load_column_parallel_weight(param, loaded_weight)
 
 
-@torch.compile(dynamic=True)
+@torch.compile(dynamic=True, backend=current_platform.simple_compile_backend)
 def quick_gelu(x):
     return x * torch.sigmoid(1.702 * x)
 
 
-@torch.compile(dynamic=True)
+@torch.compile(dynamic=True, backend=current_platform.simple_compile_backend)
 def gegelu(input, limit: Optional[float] = None):
     a_gelu, a_linear = input[..., ::2], input[..., 1::2]
     if limit is not None:
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
index 01d753408e6d0..fe398801c5dd9 100644
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -82,6 +82,12 @@ class Platform:
     # check https://github.com/pytorch/pytorch/blob/313dac6c1ca0fa0cde32477509cce32089f8532a/torchgen/model.py#L134 # noqa
     # use "CPU" as a fallback for platforms not registered in PyTorch
     dispatch_key: str = "CPU"
+    # The torch.compile backend for compiling simple and
+    # standalone functions. The default value is "inductor" to keep
+    # the same behavior as PyTorch.
+    # NOTE: for the forward part of the model, vLLM has another separate
+    # compilation strategy.
+    simple_compile_backend: str = "inductor"
     supported_quantization: list[str] = []
 
     def is_cuda(self) -> bool:

From 482cdc494e608b72303f49b56532f5c50b61cbdb Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Fri, 10 Jan 2025 15:50:29 +0000
Subject: [PATCH 167/462] [Doc] Rename offline inference examples (#11927)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 .buildkite/run-cpu-test.sh                    |  2 +-
 .buildkite/run-gh200-test.sh                  |  2 +-
 .buildkite/run-hpu-test.sh                    |  2 +-
 .buildkite/run-neuron-test.sh                 |  2 +-
 .buildkite/run-openvino-test.sh               |  2 +-
 .buildkite/run-tpu-test.sh                    |  2 +-
 .buildkite/run-xpu-test.sh                    |  4 ++--
 .buildkite/test-pipeline.yaml                 | 20 +++++++++----------
 .../contributing/profiling/profiling_index.md |  2 +-
 docs/source/features/structured_outputs.md    |  2 +-
 .../getting_started/installation/cpu-x86.md   |  4 ++--
 docs/source/getting_started/quickstart.md     |  2 +-
 docs/source/models/generative_models.md       |  4 ++--
 docs/source/models/pooling_models.md          |  6 +++---
 docs/source/serving/multimodal_inputs.md      |  8 ++++----
 ...{offline_inference_arctic.py => arctic.py} |  0
 ...ce_audio_language.py => audio_language.py} |  0
 .../{offline_inference.py => basic.py}        |  0
 ...y => basic_with_model_default_sampling.py} |  0
 .../{offline_inference_chat.py => chat.py}    |  0
 ..._chat_with_tools.py => chat_with_tools.py} |  0
 ...ce_classification.py => classification.py} |  0
 .../{offline_inference_cli.py => cli.py}      |  0
 ...nference_distributed.py => distributed.py} |  0
 ...ne_inference_embedding.py => embedding.py} |  0
 ..._encoder_decoder.py => encoder_decoder.py} |  0
 .../offline_inference/florence2_inference.py  |  2 +-
 ...ence_mlpspeculator.py => mlpspeculator.py} |  0
 ...{offline_inference_neuron.py => neuron.py} |  0
 ...ization.py => neuron_int8_quantization.py} |  0
 .../openai_batch.md}                          | 18 ++++++++---------
 .../openai_example_batch.jsonl                |  0
 ...ffline_inference_pixtral.py => pixtral.py} |  0
 ...rence_with_prefix.py => prefix_caching.py} |  0
 .../{offline_profile.py => profiling.py}      |  2 +-
 ...ffline_inference_scoring.py => scoring.py} |  0
 ...e_with_profiler.py => simple_profiling.py} |  0
 ...tured_outputs.py => structured_outputs.py} |  0
 .../{offline_inference_tpu.py => tpu.py}      |  0
 ..._vision_language.py => vision_language.py} |  0
 ...edding.py => vision_language_embedding.py} |  0
 ...mage.py => vision_language_multi_image.py} |  0
 ...ffline_inference_whisper.py => whisper.py} |  0
 tests/plugins_tests/test_platform_plugins.py  |  2 +-
 tools/profiler/print_layerwise_table.py       |  2 +-
 tools/profiler/visualize_layerwise_profile.py |  2 +-
 46 files changed, 46 insertions(+), 46 deletions(-)
 rename examples/offline_inference/{offline_inference_arctic.py => arctic.py} (100%)
 rename examples/offline_inference/{offline_inference_audio_language.py => audio_language.py} (100%)
 rename examples/offline_inference/{offline_inference.py => basic.py} (100%)
 rename examples/offline_inference/{offline_inference_with_default_generation_config.py => basic_with_model_default_sampling.py} (100%)
 rename examples/offline_inference/{offline_inference_chat.py => chat.py} (100%)
 rename examples/offline_inference/{offline_chat_with_tools.py => chat_with_tools.py} (100%)
 rename examples/offline_inference/{offline_inference_classification.py => classification.py} (100%)
 rename examples/offline_inference/{offline_inference_cli.py => cli.py} (100%)
 rename examples/offline_inference/{offline_inference_distributed.py => distributed.py} (100%)
 rename examples/offline_inference/{offline_inference_embedding.py => embedding.py} (100%)
 rename examples/offline_inference/{offline_inference_encoder_decoder.py => encoder_decoder.py} (100%)
 rename examples/offline_inference/{offline_inference_mlpspeculator.py => mlpspeculator.py} (100%)
 rename examples/offline_inference/{offline_inference_neuron.py => neuron.py} (100%)
 rename examples/offline_inference/{offline_inference_neuron_int8_quantization.py => neuron_int8_quantization.py} (100%)
 rename examples/offline_inference/{offline_inference_openai/offline_inference_openai.md => openai/openai_batch.md} (92%)
 rename examples/offline_inference/{offline_inference_openai => openai}/openai_example_batch.jsonl (100%)
 rename examples/offline_inference/{offline_inference_pixtral.py => pixtral.py} (100%)
 rename examples/offline_inference/{offline_inference_with_prefix.py => prefix_caching.py} (100%)
 rename examples/offline_inference/{offline_profile.py => profiling.py} (99%)
 rename examples/offline_inference/{offline_inference_scoring.py => scoring.py} (100%)
 rename examples/offline_inference/{offline_inference_with_profiler.py => simple_profiling.py} (100%)
 rename examples/offline_inference/{offline_inference_structured_outputs.py => structured_outputs.py} (100%)
 rename examples/offline_inference/{offline_inference_tpu.py => tpu.py} (100%)
 rename examples/offline_inference/{offline_inference_vision_language.py => vision_language.py} (100%)
 rename examples/offline_inference/{offline_inference_vision_language_embedding.py => vision_language_embedding.py} (100%)
 rename examples/offline_inference/{offline_inference_vision_language_multi_image.py => vision_language_multi_image.py} (100%)
 rename examples/offline_inference/{offline_inference_whisper.py => whisper.py} (100%)

diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh
index 5a285be039393..4ae66f6f3215a 100644
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -30,7 +30,7 @@ function cpu_tests() {
   # offline inference
   docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
     set -e
-    python3 examples/offline_inference/offline_inference.py"
+    python3 examples/offline_inference/basic.py"
 
   # Run basic model test
   docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
diff --git a/.buildkite/run-gh200-test.sh b/.buildkite/run-gh200-test.sh
index 1e5ff77895a38..3e4e409466b8a 100644
--- a/.buildkite/run-gh200-test.sh
+++ b/.buildkite/run-gh200-test.sh
@@ -24,5 +24,5 @@ remove_docker_container
 
 # Run the image and test offline inference
 docker run --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
-    python3 examples/offline_inference/offline_inference.py
+    python3 examples/offline_inference/basic.py
 '
diff --git a/.buildkite/run-hpu-test.sh b/.buildkite/run-hpu-test.sh
index a50570ab53438..8f3b08212fd6a 100644
--- a/.buildkite/run-hpu-test.sh
+++ b/.buildkite/run-hpu-test.sh
@@ -13,4 +13,4 @@ trap remove_docker_container EXIT
 remove_docker_container
 
 # Run the image and launch offline inference
-docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/offline_inference.py
\ No newline at end of file
+docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic.py
\ No newline at end of file
diff --git a/.buildkite/run-neuron-test.sh b/.buildkite/run-neuron-test.sh
index 52d485939b1d0..189714ebb6d75 100644
--- a/.buildkite/run-neuron-test.sh
+++ b/.buildkite/run-neuron-test.sh
@@ -51,4 +51,4 @@ docker run --rm -it --device=/dev/neuron0 --device=/dev/neuron1 --network host \
        -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
        --name "${container_name}" \
        ${image_name} \
-       /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/offline_inference_neuron.py"
+       /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py"
diff --git a/.buildkite/run-openvino-test.sh b/.buildkite/run-openvino-test.sh
index 380f7a44a429a..6159b21ff8206 100755
--- a/.buildkite/run-openvino-test.sh
+++ b/.buildkite/run-openvino-test.sh
@@ -13,4 +13,4 @@ trap remove_docker_container EXIT
 remove_docker_container
 
 # Run the image and launch offline inference
-docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference/offline_inference.py
+docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference/basic.py
diff --git a/.buildkite/run-tpu-test.sh b/.buildkite/run-tpu-test.sh
index a8f021890f742..650af0fac4c61 100644
--- a/.buildkite/run-tpu-test.sh
+++ b/.buildkite/run-tpu-test.sh
@@ -23,4 +23,4 @@ docker run --privileged --net host --shm-size=16G -it \
     && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
     && python3 /workspace/vllm/tests/tpu/test_compilation.py \
     && python3 /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
-    && python3 /workspace/vllm/examples/offline_inference/offline_inference_tpu.py"
+    && python3 /workspace/vllm/examples/offline_inference/tpu.py"
diff --git a/.buildkite/run-xpu-test.sh b/.buildkite/run-xpu-test.sh
index 160e10aa3bb9b..4d344e58db8ac 100644
--- a/.buildkite/run-xpu-test.sh
+++ b/.buildkite/run-xpu-test.sh
@@ -14,6 +14,6 @@ remove_docker_container
 
 # Run the image and test offline inference/tensor parallel
 docker run --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test sh -c '
-    python3 examples/offline_inference/offline_inference.py
-    python3 examples/offline_inference/offline_inference_cli.py -tp 2
+    python3 examples/offline_inference/basic.py
+    python3 examples/offline_inference/cli.py -tp 2
 '
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 7d13269540864..d3bd809cfdf24 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -187,19 +187,19 @@ steps:
   - examples/
   commands:
     - pip install tensorizer # for tensorizer test
-    - python3 offline_inference/offline_inference.py
+    - python3 offline_inference/basic.py
     - python3 offline_inference/cpu_offload.py
-    - python3 offline_inference/offline_inference_chat.py
-    - python3 offline_inference/offline_inference_with_prefix.py
+    - python3 offline_inference/chat.py
+    - python3 offline_inference/prefix_caching.py
     - python3 offline_inference/llm_engine_example.py
-    - python3 offline_inference/offline_inference_vision_language.py
-    - python3 offline_inference/offline_inference_vision_language_multi_image.py
+    - python3 offline_inference/vision_language.py
+    - python3 offline_inference/vision_language_multi_image.py
     - python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
-    - python3 offline_inference/offline_inference_encoder_decoder.py
-    - python3 offline_inference/offline_inference_classification.py
-    - python3 offline_inference/offline_inference_embedding.py
-    - python3 offline_inference/offline_inference_scoring.py
-    - python3 offline_inference/offline_profile.py --model facebook/opt-125m run_num_steps --num-steps 2
+    - python3 offline_inference/encoder_decoder.py
+    - python3 offline_inference/classification.py
+    - python3 offline_inference/embedding.py
+    - python3 offline_inference/scoring.py
+    - python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
 
 - label: Prefix Caching Test # 9min
   mirror_hardwares: [amd]
diff --git a/docs/source/contributing/profiling/profiling_index.md b/docs/source/contributing/profiling/profiling_index.md
index 97de40ff469f1..001db86bdf555 100644
--- a/docs/source/contributing/profiling/profiling_index.md
+++ b/docs/source/contributing/profiling/profiling_index.md
@@ -26,7 +26,7 @@ Set the env variable VLLM_RPC_TIMEOUT to a big number before you start the serve
 
 ### Offline Inference
 
-Refer to <gh-file:examples/offline_inference/offline_inference_with_profiler.py> for an example.
+Refer to <gh-file:examples/offline_inference/simple_profiling.py> for an example.
 
 ### OpenAI Server
 
diff --git a/docs/source/features/structured_outputs.md b/docs/source/features/structured_outputs.md
index a42c3dd64ad10..1d77c7339a33f 100644
--- a/docs/source/features/structured_outputs.md
+++ b/docs/source/features/structured_outputs.md
@@ -257,4 +257,4 @@ outputs = llm.generate(
 print(outputs[0].outputs[0].text)
 ```
 
-Full example: <gh-file:examples/offline_inference/offline_inference_structured_outputs.py>
+Full example: <gh-file:examples/offline_inference/structured_outputs.py>
diff --git a/docs/source/getting_started/installation/cpu-x86.md b/docs/source/getting_started/installation/cpu-x86.md
index bb046dd0fd9dc..f4d3eec0377b1 100644
--- a/docs/source/getting_started/installation/cpu-x86.md
+++ b/docs/source/getting_started/installation/cpu-x86.md
@@ -95,7 +95,7 @@ $ VLLM_TARGET_DEVICE=cpu python setup.py install
 $ sudo apt-get install libtcmalloc-minimal4 # install TCMalloc library
 $ find / -name *libtcmalloc* # find the dynamic link library path
 $ export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD # prepend the library to LD_PRELOAD
-$ python examples/offline_inference/offline_inference.py # run vLLM
+$ python examples/offline_inference/basic.py # run vLLM
 ```
 
 - When using the online serving, it is recommended to reserve 1-2 CPU cores for the serving framework to avoid CPU oversubscription. For example, on a platform with 32 physical CPU cores, reserving CPU 30 and 31 for the framework and using CPU 0-29 for OpenMP:
@@ -132,7 +132,7 @@ CPU NODE SOCKET CORE L1d:L1i:L2:L3 ONLINE    MAXMHZ   MINMHZ      MHZ
 
 # On this platform, it is recommend to only bind openMP threads on logical CPU cores 0-7 or 8-15
 $ export VLLM_CPU_OMP_THREADS_BIND=0-7
-$ python examples/offline_inference/offline_inference.py
+$ python examples/offline_inference/basic.py
 ```
 
 - If using vLLM CPU backend on a multi-socket machine with NUMA, be aware to set CPU cores using `VLLM_CPU_OMP_THREADS_BIND` to avoid cross NUMA node memory access.
diff --git a/docs/source/getting_started/quickstart.md b/docs/source/getting_started/quickstart.md
index d7d43785c6c24..6fd0083a9bb7b 100644
--- a/docs/source/getting_started/quickstart.md
+++ b/docs/source/getting_started/quickstart.md
@@ -40,7 +40,7 @@ For non-CUDA platforms, please refer [here](#installation-index) for specific in
 
 ## Offline Batched Inference
 
-With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). See the example script: <gh-file:examples/offline_inference/offline_inference.py>
+With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). See the example script: <gh-file:examples/offline_inference/basic.py>
 
 The first line of this example imports the classes {class}`~vllm.LLM` and {class}`~vllm.SamplingParams`:
 
diff --git a/docs/source/models/generative_models.md b/docs/source/models/generative_models.md
index 6a5a58ad74ab7..e4b4cd03a90d2 100644
--- a/docs/source/models/generative_models.md
+++ b/docs/source/models/generative_models.md
@@ -46,7 +46,7 @@ for output in outputs:
     print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
 ```
 
-A code example can be found here: <gh-file:examples/offline_inference/offline_inference.py>
+A code example can be found here: <gh-file:examples/offline_inference/basic.py>
 
 ### `LLM.beam_search`
 
@@ -103,7 +103,7 @@ for output in outputs:
     print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
 ```
 
-A code example can be found here: <gh-file:examples/offline_inference/offline_inference_chat.py>
+A code example can be found here: <gh-file:examples/offline_inference/chat.py>
 
 If the model doesn't have a chat template or you want to specify another one,
 you can explicitly pass a chat template:
diff --git a/docs/source/models/pooling_models.md b/docs/source/models/pooling_models.md
index 324b1f550e694..91db694be29a4 100644
--- a/docs/source/models/pooling_models.md
+++ b/docs/source/models/pooling_models.md
@@ -88,7 +88,7 @@ embeds = output.outputs.embedding
 print(f"Embeddings: {embeds!r} (size={len(embeds)})")
 ```
 
-A code example can be found here: <gh-file:examples/offline_inference/offline_inference_embedding.py>
+A code example can be found here: <gh-file:examples/offline_inference/embedding.py>
 
 ### `LLM.classify`
 
@@ -103,7 +103,7 @@ probs = output.outputs.probs
 print(f"Class Probabilities: {probs!r} (size={len(probs)})")
 ```
 
-A code example can be found here: <gh-file:examples/offline_inference/offline_inference_classification.py>
+A code example can be found here: <gh-file:examples/offline_inference/classification.py>
 
 ### `LLM.score`
 
@@ -125,7 +125,7 @@ score = output.outputs.score
 print(f"Score: {score}")
 ```
 
-A code example can be found here: <gh-file:examples/offline_inference/offline_inference_scoring.py>
+A code example can be found here: <gh-file:examples/offline_inference/scoring.py>
 
 ## Online Serving
 
diff --git a/docs/source/serving/multimodal_inputs.md b/docs/source/serving/multimodal_inputs.md
index a06f121a6899a..53f5a274e39a3 100644
--- a/docs/source/serving/multimodal_inputs.md
+++ b/docs/source/serving/multimodal_inputs.md
@@ -60,7 +60,7 @@ for o in outputs:
     print(generated_text)
 ```
 
-Full example: <gh-file:examples/offline_inference/offline_inference_vision_language.py>
+Full example: <gh-file:examples/offline_inference/vision_language.py>
 
 To substitute multiple images inside the same text prompt, you can pass in a list of images instead:
 
@@ -91,7 +91,7 @@ for o in outputs:
     print(generated_text)
 ```
 
-Full example: <gh-file:examples/offline_inference/offline_inference_vision_language_multi_image.py>
+Full example: <gh-file:examples/offline_inference/vision_language_multi_image.py>
 
 Multi-image input can be extended to perform video captioning. We show this with [Qwen2-VL](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct) as it supports videos:
 
@@ -125,13 +125,13 @@ for o in outputs:
 You can pass a list of NumPy arrays directly to the `'video'` field of the multi-modal dictionary
 instead of using multi-image input.
 
-Full example: <gh-file:examples/offline_inference/offline_inference_vision_language.py>
+Full example: <gh-file:examples/offline_inference/vision_language.py>
 
 ### Audio
 
 You can pass a tuple `(array, sampling_rate)` to the `'audio'` field of the multi-modal dictionary.
 
-Full example: <gh-file:examples/offline_inference/offline_inference_audio_language.py>
+Full example: <gh-file:examples/offline_inference/audio_language.py>
 
 ### Embedding
 
diff --git a/examples/offline_inference/offline_inference_arctic.py b/examples/offline_inference/arctic.py
similarity index 100%
rename from examples/offline_inference/offline_inference_arctic.py
rename to examples/offline_inference/arctic.py
diff --git a/examples/offline_inference/offline_inference_audio_language.py b/examples/offline_inference/audio_language.py
similarity index 100%
rename from examples/offline_inference/offline_inference_audio_language.py
rename to examples/offline_inference/audio_language.py
diff --git a/examples/offline_inference/offline_inference.py b/examples/offline_inference/basic.py
similarity index 100%
rename from examples/offline_inference/offline_inference.py
rename to examples/offline_inference/basic.py
diff --git a/examples/offline_inference/offline_inference_with_default_generation_config.py b/examples/offline_inference/basic_with_model_default_sampling.py
similarity index 100%
rename from examples/offline_inference/offline_inference_with_default_generation_config.py
rename to examples/offline_inference/basic_with_model_default_sampling.py
diff --git a/examples/offline_inference/offline_inference_chat.py b/examples/offline_inference/chat.py
similarity index 100%
rename from examples/offline_inference/offline_inference_chat.py
rename to examples/offline_inference/chat.py
diff --git a/examples/offline_inference/offline_chat_with_tools.py b/examples/offline_inference/chat_with_tools.py
similarity index 100%
rename from examples/offline_inference/offline_chat_with_tools.py
rename to examples/offline_inference/chat_with_tools.py
diff --git a/examples/offline_inference/offline_inference_classification.py b/examples/offline_inference/classification.py
similarity index 100%
rename from examples/offline_inference/offline_inference_classification.py
rename to examples/offline_inference/classification.py
diff --git a/examples/offline_inference/offline_inference_cli.py b/examples/offline_inference/cli.py
similarity index 100%
rename from examples/offline_inference/offline_inference_cli.py
rename to examples/offline_inference/cli.py
diff --git a/examples/offline_inference/offline_inference_distributed.py b/examples/offline_inference/distributed.py
similarity index 100%
rename from examples/offline_inference/offline_inference_distributed.py
rename to examples/offline_inference/distributed.py
diff --git a/examples/offline_inference/offline_inference_embedding.py b/examples/offline_inference/embedding.py
similarity index 100%
rename from examples/offline_inference/offline_inference_embedding.py
rename to examples/offline_inference/embedding.py
diff --git a/examples/offline_inference/offline_inference_encoder_decoder.py b/examples/offline_inference/encoder_decoder.py
similarity index 100%
rename from examples/offline_inference/offline_inference_encoder_decoder.py
rename to examples/offline_inference/encoder_decoder.py
diff --git a/examples/offline_inference/florence2_inference.py b/examples/offline_inference/florence2_inference.py
index 49dd2c331db5a..c24096e90004b 100644
--- a/examples/offline_inference/florence2_inference.py
+++ b/examples/offline_inference/florence2_inference.py
@@ -3,7 +3,7 @@
 encoder/decoder models, specifically Florence-2
 '''
 # TODO(Isotr0py):
-# Move to offline_inference/offline_inference_vision_language.py
+# Move to offline_inference/vision_language.py
 # after porting vision backbone
 from vllm import LLM, SamplingParams
 
diff --git a/examples/offline_inference/offline_inference_mlpspeculator.py b/examples/offline_inference/mlpspeculator.py
similarity index 100%
rename from examples/offline_inference/offline_inference_mlpspeculator.py
rename to examples/offline_inference/mlpspeculator.py
diff --git a/examples/offline_inference/offline_inference_neuron.py b/examples/offline_inference/neuron.py
similarity index 100%
rename from examples/offline_inference/offline_inference_neuron.py
rename to examples/offline_inference/neuron.py
diff --git a/examples/offline_inference/offline_inference_neuron_int8_quantization.py b/examples/offline_inference/neuron_int8_quantization.py
similarity index 100%
rename from examples/offline_inference/offline_inference_neuron_int8_quantization.py
rename to examples/offline_inference/neuron_int8_quantization.py
diff --git a/examples/offline_inference/offline_inference_openai/offline_inference_openai.md b/examples/offline_inference/openai/openai_batch.md
similarity index 92%
rename from examples/offline_inference/offline_inference_openai/offline_inference_openai.md
rename to examples/offline_inference/openai/openai_batch.md
index 6278a1943fe4a..a4774e57cd9a5 100644
--- a/examples/offline_inference/offline_inference_openai/offline_inference_openai.md
+++ b/examples/offline_inference/openai/openai_batch.md
@@ -8,7 +8,7 @@ This is a guide to performing batch inference using the OpenAI batch file format
  
 The OpenAI batch file format consists of a series of json objects on new lines.
  
-[See here for an example file.](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl)
+[See here for an example file.](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference/openai/openai_example_batch.jsonl)
  
 Each line represents a separate request. See the [OpenAI package reference](https://platform.openai.com/docs/api-reference/batch/requestInput) for more details.
  
@@ -31,13 +31,13 @@ We currently only support `/v1/chat/completions` and `/v1/embeddings` endpoints
 To follow along with this example, you can download the example batch, or create your own batch file in your working directory.
 
 ```
-wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl
+wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl
 ```
 
 Once you've created your batch file it should look like this
 
 ```
-$ cat offline_inference/offline_inference_openai/openai_example_batch.jsonl
+$ cat offline_inference/openai/openai_example_batch.jsonl
 {"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
 {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
 ```
@@ -49,7 +49,7 @@ The batch running tool is designed to be used from the command line.
 You can run the batch with the following command, which will write its results to a file called `results.jsonl`
 
 ```
-python -m vllm.entrypoints.openai.run_batch -i offline_inference/offline_inference_openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
+python -m vllm.entrypoints.openai.run_batch -i offline_inference/openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
 ```
 
 ### Step 3: Check your results
@@ -66,10 +66,10 @@ $ cat results.jsonl
 
 The batch runner supports remote input and output urls that are accessible via http/https.
 
-For example, to run against our example input file located at `https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl`, you can run
+For example, to run against our example input file located at `https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl`, you can run
 
 ```
-python -m vllm.entrypoints.openai.run_batch -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
+python -m vllm.entrypoints.openai.run_batch -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
 ```
 
 ## Example 3: Integrating with AWS S3
@@ -90,13 +90,13 @@ To integrate with cloud blob storage, we recommend using presigned urls.
 To follow along with this example, you can download the example batch, or create your own batch file in your working directory.
 
 ```
-wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl
+wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl
 ```
 
 Once you've created your batch file it should look like this
 
 ```
-$ cat offline_inference/offline_inference_openai/openai_example_batch.jsonl
+$ cat offline_inference/openai/openai_example_batch.jsonl
 {"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
 {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
 ```
@@ -104,7 +104,7 @@ $ cat offline_inference/offline_inference_openai/openai_example_batch.jsonl
 Now upload your batch file to your S3 bucket.
 
 ```
-aws s3 cp offline_inference/offline_inference_openai/openai_example_batch.jsonl s3://MY_BUCKET/MY_INPUT_FILE.jsonl
+aws s3 cp offline_inference/openai/openai_example_batch.jsonl s3://MY_BUCKET/MY_INPUT_FILE.jsonl
 ```
 
 ### Step 2: Generate your presigned urls
diff --git a/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl b/examples/offline_inference/openai/openai_example_batch.jsonl
similarity index 100%
rename from examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl
rename to examples/offline_inference/openai/openai_example_batch.jsonl
diff --git a/examples/offline_inference/offline_inference_pixtral.py b/examples/offline_inference/pixtral.py
similarity index 100%
rename from examples/offline_inference/offline_inference_pixtral.py
rename to examples/offline_inference/pixtral.py
diff --git a/examples/offline_inference/offline_inference_with_prefix.py b/examples/offline_inference/prefix_caching.py
similarity index 100%
rename from examples/offline_inference/offline_inference_with_prefix.py
rename to examples/offline_inference/prefix_caching.py
diff --git a/examples/offline_inference/offline_profile.py b/examples/offline_inference/profiling.py
similarity index 99%
rename from examples/offline_inference/offline_profile.py
rename to examples/offline_inference/profiling.py
index 187a05e4d70a2..8a94b5c2a8623 100644
--- a/examples/offline_inference/offline_profile.py
+++ b/examples/offline_inference/profiling.py
@@ -363,7 +363,7 @@ def abort_requests():
 
     example:
     ```
-    python examples/offline_inference/offline_profile.py \\
+    python examples/offline_inference/profiling.py \\
         --model neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 --batch-size 4 \\
         --prompt-len 512 --max-num-batched-tokens 8196 --json Llama31-8b-FP8 \\
         --enforce-eager run_num_steps -n 2
diff --git a/examples/offline_inference/offline_inference_scoring.py b/examples/offline_inference/scoring.py
similarity index 100%
rename from examples/offline_inference/offline_inference_scoring.py
rename to examples/offline_inference/scoring.py
diff --git a/examples/offline_inference/offline_inference_with_profiler.py b/examples/offline_inference/simple_profiling.py
similarity index 100%
rename from examples/offline_inference/offline_inference_with_profiler.py
rename to examples/offline_inference/simple_profiling.py
diff --git a/examples/offline_inference/offline_inference_structured_outputs.py b/examples/offline_inference/structured_outputs.py
similarity index 100%
rename from examples/offline_inference/offline_inference_structured_outputs.py
rename to examples/offline_inference/structured_outputs.py
diff --git a/examples/offline_inference/offline_inference_tpu.py b/examples/offline_inference/tpu.py
similarity index 100%
rename from examples/offline_inference/offline_inference_tpu.py
rename to examples/offline_inference/tpu.py
diff --git a/examples/offline_inference/offline_inference_vision_language.py b/examples/offline_inference/vision_language.py
similarity index 100%
rename from examples/offline_inference/offline_inference_vision_language.py
rename to examples/offline_inference/vision_language.py
diff --git a/examples/offline_inference/offline_inference_vision_language_embedding.py b/examples/offline_inference/vision_language_embedding.py
similarity index 100%
rename from examples/offline_inference/offline_inference_vision_language_embedding.py
rename to examples/offline_inference/vision_language_embedding.py
diff --git a/examples/offline_inference/offline_inference_vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py
similarity index 100%
rename from examples/offline_inference/offline_inference_vision_language_multi_image.py
rename to examples/offline_inference/vision_language_multi_image.py
diff --git a/examples/offline_inference/offline_inference_whisper.py b/examples/offline_inference/whisper.py
similarity index 100%
rename from examples/offline_inference/offline_inference_whisper.py
rename to examples/offline_inference/whisper.py
diff --git a/tests/plugins_tests/test_platform_plugins.py b/tests/plugins_tests/test_platform_plugins.py
index 57518bd3e8299..69698b34c71a3 100644
--- a/tests/plugins_tests/test_platform_plugins.py
+++ b/tests/plugins_tests/test_platform_plugins.py
@@ -5,7 +5,7 @@ def test_platform_plugins():
     import os
     example_file = os.path.join(
         os.path.dirname(os.path.dirname(os.path.dirname(current_file))),
-        "examples", "offline_inference/offline_inference.py")
+        "examples", "offline_inference/basic.py")
     runpy.run_path(example_file)
 
     # check if the plugin is loaded correctly
diff --git a/tools/profiler/print_layerwise_table.py b/tools/profiler/print_layerwise_table.py
index 49366abc7fb56..54cd60c2bc95b 100644
--- a/tools/profiler/print_layerwise_table.py
+++ b/tools/profiler/print_layerwise_table.py
@@ -31,7 +31,7 @@ def get_entries(node, curr_depth=0):
                         type=str,
                         required=True,
                         help="json trace file output by "
-                        "examples/offline_inference/offline_profile.py")
+                        "examples/offline_inference/profiling.py")
     parser.add_argument("--phase",
                         type=str,
                         required=True,
diff --git a/tools/profiler/visualize_layerwise_profile.py b/tools/profiler/visualize_layerwise_profile.py
index fa88ed4204d8f..cb56ebd69a8c1 100644
--- a/tools/profiler/visualize_layerwise_profile.py
+++ b/tools/profiler/visualize_layerwise_profile.py
@@ -538,7 +538,7 @@ def make_plot_title_suffix(profile_json: dict) -> str:
                         type=str,
                         required=True,
                         help="json trace file output by \
-                              examples/offline_inference/offline_profile.py")
+                              examples/offline_inference/profiling.py")
     parser.add_argument("--output-directory",
                         type=str,
                         required=False,

From f33e033e2782a9258d8ef6a359643944629d4ced Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Fri, 10 Jan 2025 23:51:02 +0800
Subject: [PATCH 168/462] [Docs] Fix docstring in `get_ip` function (#11932)

Signed-off-by: Kuntai Du <kuntai@uchicago.edu>
---
 vllm/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/utils.py b/vllm/utils.py
index 8c3e5200b3d98..217ccb25cef6d 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -446,7 +446,7 @@ def get_ip() -> str:
         logger.warning(
             "The environment variable HOST_IP is deprecated and ignored, as"
             " it is often used by Docker and other software to"
-            "interact with the container's network stack. Please"
+            "interact with the container's network stack. Please "
             "use VLLM_HOST_IP instead to set the IP address for vLLM processes"
             " to communicate with each other.")
     if host_ip:

From 5959564f94180a6a50e0d394e35a035c0c98a7fb Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Fri, 10 Jan 2025 23:51:43 +0800
Subject: [PATCH 169/462] Doc fix in `benchmark_long_document_qa_throughput.py`
 (#11933)

Signed-off-by: Kuntai Du <kuntai@uchicago.edu>
---
 benchmarks/benchmark_long_document_qa_throughput.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/benchmarks/benchmark_long_document_qa_throughput.py b/benchmarks/benchmark_long_document_qa_throughput.py
index 13477ef535e86..0b8fba38156f1 100644
--- a/benchmarks/benchmark_long_document_qa_throughput.py
+++ b/benchmarks/benchmark_long_document_qa_throughput.py
@@ -2,8 +2,7 @@
 Offline benchmark to test the long document QA throughput.
 
 Example usage:
-    # This command run the vllm with 50GB CPU memory for offloading
-    # The workload samples 8 different prompts with a default input
+    # This workload samples 8 different prompts with a default input
     # length of 20000 tokens, then replicates each prompt 2 times 
     # in random order.
     python benchmark_long_document_qa_throughput.py \

From aa1e77a19ce658abcbaa0836f96878a7ae9dea84 Mon Sep 17 00:00:00 2001
From: "Li, Jiang" <jiang1.li@intel.com>
Date: Sat, 11 Jan 2025 00:07:58 +0800
Subject: [PATCH 170/462] [Hardware][CPU] Support MOE models on x86 CPU
 (#11831)

Signed-off-by: jiang1.li <jiang1.li@intel.com>
---
 .../getting_started/installation/cpu-x86.md   |  2 +-
 .../decoder_only/language/test_models.py      |  4 ++
 vllm/model_executor/layers/fused_moe/layer.py | 41 +++++++++++++++++--
 3 files changed, 43 insertions(+), 4 deletions(-)

diff --git a/docs/source/getting_started/installation/cpu-x86.md b/docs/source/getting_started/installation/cpu-x86.md
index f4d3eec0377b1..26bdcd93ad190 100644
--- a/docs/source/getting_started/installation/cpu-x86.md
+++ b/docs/source/getting_started/installation/cpu-x86.md
@@ -5,7 +5,7 @@
 vLLM initially supports basic model inferencing and serving on x86 CPU platform, with data types FP32, FP16 and BF16. vLLM CPU backend supports the following vLLM features:
 
 - Tensor Parallel
-- Model Quantization (`INT8 W8A8, AWQ`)
+- Model Quantization (`INT8 W8A8, AWQ, GPTQ`)
 - Chunked-prefill
 - Prefix-caching
 - FP8-E5M2 KV-Caching (TODO)
diff --git a/tests/models/decoder_only/language/test_models.py b/tests/models/decoder_only/language/test_models.py
index 2a7ed8826d2f3..4e110366a09f3 100644
--- a/tests/models/decoder_only/language/test_models.py
+++ b/tests/models/decoder_only/language/test_models.py
@@ -48,6 +48,10 @@
         ),
         pytest.param("stabilityai/stablelm-3b-4e1t"),  # stablelm
         pytest.param("bigcode/starcoder2-3b"),  # starcoder2
+        pytest.param(
+            "ehristoforu/Falcon3-MoE-2x7B-Insruct",  # mixtral
+            marks=[pytest.mark.cpu_model],
+        )
     ])
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [32])
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index b108cbd52c218..cf5db368926b4 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -13,6 +13,7 @@
     QuantizationConfig, QuantizeMethodBase)
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.platforms import current_platform
+from vllm.platforms.interface import CpuArchEnum
 
 if current_platform.is_cuda_alike():
     from .fused_moe import fused_experts
@@ -83,6 +84,20 @@ def create_weights(self, layer: torch.nn.Module, num_experts: int,
         layer.register_parameter("w2_weight", w2_weight)
         set_weight_attrs(w2_weight, extra_weight_attrs)
 
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        super().process_weights_after_loading(layer)
+
+        if current_platform.is_cpu():
+            if current_platform.get_cpu_architecture() == CpuArchEnum.X86:
+                import intel_extension_for_pytorch as ipex
+                layer.ipex_fusion = ipex.llm.modules.GatedMLPMOE(
+                    layer.w13_weight,
+                    layer.w2_weight,
+                    use_prepack=True,
+                )
+            else:
+                raise NotImplementedError("CPU MOE only supports x86 arch.")
+
     def apply(
         self,
         layer: torch.nn.Module,
@@ -142,9 +157,29 @@ def forward_cuda(
                              topk_ids=topk_ids,
                              inplace=True)
 
-    def forward_cpu(self, *args, **kwargs):
-        raise NotImplementedError(
-            "The CPU backend currently does not support MoE.")
+    def forward_cpu(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        use_grouped_topk: bool,
+        top_k: int,
+        router_logits: torch.Tensor,
+        renormalize: bool,
+        topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
+        custom_routing_function: Optional[Callable] = None,
+        **kwargs,
+    ):
+        assert custom_routing_function is None
+        return layer.ipex_fusion(
+            x,
+            use_grouped_topk,
+            top_k,
+            router_logits,
+            renormalize,
+            topk_group,
+            num_expert_group,
+        )
 
     def forward_tpu(
         self,

From 46fa98ccad444dbacc7f95995b79f65ddab3ff7c Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Sat, 11 Jan 2025 03:19:15 +0800
Subject: [PATCH 171/462] [Misc] Clean up debug code in Deepseek-V3 (#11930)

Signed-off-by: Isotr0py <2037008807@qq.com>
---
 vllm/model_executor/models/deepseek_v3.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/vllm/model_executor/models/deepseek_v3.py b/vllm/model_executor/models/deepseek_v3.py
index 333dc019b4d99..d4710622681b5 100644
--- a/vllm/model_executor/models/deepseek_v3.py
+++ b/vllm/model_executor/models/deepseek_v3.py
@@ -639,9 +639,6 @@ def load_weights(self, weights: Iterable[Tuple[str,
                     if is_pp_missing_parameter(name, self):
                         continue
 
-                    if name not in params_dict:
-                        for key in params_dict:
-                            print(key)
                     param = params_dict[name]
                     weight_loader = getattr(param, "weight_loader",
                                             default_weight_loader)

From 8a579408f33e2f98a89d57418342b53a41622d2f Mon Sep 17 00:00:00 2001
From: minmin <rmm0811@gmail.com>
Date: Sat, 11 Jan 2025 04:39:22 +0800
Subject: [PATCH 172/462] [Misc] Update benchmark_prefix_caching.py fixed
 example usage (#11920)

Signed-off-by: Ren MinMin <renmm6@chinaunicom.cn>
Co-authored-by: Ren MinMin <renmm6@chinaunicom.cn>
---
 benchmarks/benchmark_prefix_caching.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py
index 5e9381f712e10..3ab421a89c935 100644
--- a/benchmarks/benchmark_prefix_caching.py
+++ b/benchmarks/benchmark_prefix_caching.py
@@ -10,7 +10,8 @@
         --model meta-llama/Llama-2-7b-chat-hf \
         --enable-prefix-caching \
         --num-prompts 1 \
-        --repeat-count 100
+        --repeat-count 100 \
+        --input-length-range 128:256
 
 ShareGPT example usage:
     # This command samples 20 prompts with input lengths

From d45cbe70f5bf25bb2f490f4152c256e9acb2a62b Mon Sep 17 00:00:00 2001
From: Travis Johnson <tsjohnso@us.ibm.com>
Date: Fri, 10 Jan 2025 16:26:00 -0700
Subject: [PATCH 173/462] [Bugfix] Check that number of images matches number
 of <|image|> tokens with mllama (#11939)

Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
---
 vllm/model_executor/models/mllama.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py
index c5046e06edecb..593a4d3fb6940 100644
--- a/vllm/model_executor/models/mllama.py
+++ b/vllm/model_executor/models/mllama.py
@@ -123,6 +123,13 @@ def input_processor_for_mllama(
 
     assert is_list_of(image_data, Image.Image)
 
+    num_image_tokens = dec_inputs['prompt_token_ids'].count(
+        MLLAMA_IMAGE_TOKEN_ID)
+    if num_image_tokens != len(image_data):
+        raise ValueError(
+            f"The number of image tokens ({num_image_tokens}) must be"
+            f" the same as the number of images ({len(image_data)})")
+
     # Since only the last group of consecutive images
     # are attended by the decoded tokens, we only need to
     # get the number of tiles for those images.
@@ -1493,6 +1500,8 @@ def convert_sparse_cross_attention_mask_to_dense(
             dense_mask[seq_start + start:seq_start + end,
                        tile_start:tile_start + tile] = 1
             tile_start += tile
+        assert ts != -1
+        assert td != 0
         tile_range_for_decode.append((ts, ts + td))
         seq_start += length
 

From c9f09a4fe83ef13824ea1663214ac7aad08d2b31 Mon Sep 17 00:00:00 2001
From: Fred Reiss <frreiss@us.ibm.com>
Date: Fri, 10 Jan 2025 17:04:58 -0800
Subject: [PATCH 174/462] [mypy] Fix mypy warnings in api_server.py (#11941)

Signed-off-by: Fred Reiss <frreiss@us.ibm.com>
---
 vllm/entrypoints/openai/api_server.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 925d7db43138b..1aeefe86cd05e 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -14,7 +14,7 @@
 from contextlib import asynccontextmanager
 from functools import partial
 from http import HTTPStatus
-from typing import AsyncIterator, Optional, Set, Tuple
+from typing import AsyncIterator, Dict, Optional, Set, Tuple, Union
 
 import uvloop
 from fastapi import APIRouter, FastAPI, HTTPException, Request
@@ -420,6 +420,8 @@ async def create_embedding(request: EmbeddingRequest, raw_request: Request):
             "use the Pooling API (`/pooling`) instead.")
 
         res = await fallback_handler.create_pooling(request, raw_request)
+
+        generator: Union[ErrorResponse, EmbeddingResponse]
         if isinstance(res, PoolingResponse):
             generator = EmbeddingResponse(
                 id=res.id,
@@ -494,7 +496,7 @@ async def create_score_v1(request: ScoreRequest, raw_request: Request):
     return await create_score(request, raw_request)
 
 
-TASK_HANDLERS = {
+TASK_HANDLERS: Dict[str, Dict[str, tuple]] = {
     "generate": {
         "messages": (ChatCompletionRequest, create_chat_completion),
         "default": (CompletionRequest, create_completion),
@@ -652,7 +654,7 @@ async def add_request_id(request: Request, call_next):
         module_path, object_name = middleware.rsplit(".", 1)
         imported = getattr(importlib.import_module(module_path), object_name)
         if inspect.isclass(imported):
-            app.add_middleware(imported)
+            app.add_middleware(imported)  # type: ignore[arg-type]
         elif inspect.iscoroutinefunction(imported):
             app.middleware("http")(imported)
         else:

From 899136b857d510f0e19b0e21ea32b49f8aa117ed Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Sat, 11 Jan 2025 09:07:24 +0800
Subject: [PATCH 175/462] [ci] fix broken distributed-tests-4-gpus (#11937)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 tests/spec_decode/e2e/test_integration_dist_tp4.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/spec_decode/e2e/test_integration_dist_tp4.py b/tests/spec_decode/e2e/test_integration_dist_tp4.py
index 555aef99218c3..2cb10de1c6f55 100644
--- a/tests/spec_decode/e2e/test_integration_dist_tp4.py
+++ b/tests/spec_decode/e2e/test_integration_dist_tp4.py
@@ -108,7 +108,8 @@ def test_skip_speculation(common_llm_kwargs, per_test_common_llm_kwargs,
 
     TODO: fix it to pass without raising Error. (#5814)
     """
-    with pytest.raises(openai.APIConnectionError):
+    with pytest.raises(
+        (openai.APIConnectionError, openai.InternalServerError)):
         run_equality_correctness_test_tp(MAIN_MODEL,
                                          common_llm_kwargs,
                                          per_test_common_llm_kwargs,

From 2118d0565cb52c69e0bbacdcf48af9ecef39e528 Mon Sep 17 00:00:00 2001
From: Sungjae Lee <33976427+llsj14@users.noreply.github.com>
Date: Sat, 11 Jan 2025 13:49:38 +0900
Subject: [PATCH 176/462] [Bugfix][SpecDecode] Adjust Eagle model architecture
 to align with intended design (#11672)

Signed-off-by: Sungjae Lee <33976427+llsj14@users.noreply.github.com>
---
 vllm/model_executor/models/eagle.py | 26 ++++++++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/models/eagle.py b/vllm/model_executor/models/eagle.py
index f138d13630263..eb7b5af19ae96 100644
--- a/vllm/model_executor/models/eagle.py
+++ b/vllm/model_executor/models/eagle.py
@@ -17,14 +17,30 @@
 from .utils import maybe_prefix
 
 
+class DummyInputLayerNorm(nn.Module):
+
+    def forward(self, x):
+        return x
+
+
+class DummyOutputNorm(nn.Module):
+
+    def forward(self, x, residual):
+        if residual is None:
+            return x
+        else:
+            return x, residual
+
+
 class EAGLE(nn.Module):
     """This class implements the EAGLE draft model from the paper: https://arxiv.org/pdf/2401.15077
     Reference implementation: https://github.com/SafeAILab/EAGLE
     
     Differences from reference implementation:
     1. In reference, LlamaDecoderLayer implementation doesn't have 
-       input_layernorm for 1st decoder layer (https://github.com/SafeAILab/EAGLE/blob/7d065d084443fbfd386f88839efd7193c12be869/eagle/model/cnets.py#L427) 
-       but we do as HF implementation also does.
+       input_layernorm for 1st decoder layer (https://github.com/SafeAILab/EAGLE/blob/7d065d084443fbfd386f88839efd7193c12be869/eagle/model/cnets.py#L427).
+       Following this approach, our implementation also disables
+       the input_layernorm for the first decoder layer.
     2. We allow any decoder layer to be used in EAGLE whereas in reference 
        decoder layer is fixed to be LlamaDecoderLayer.
     3. We have an optional token_map which reduces draft vocab to most 
@@ -46,10 +62,16 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
 
         self.model = model_cls(vllm_config=vllm_config,
                                prefix=maybe_prefix(prefix, "model"))
+
         self.fc = nn.Linear(config.model.hidden_size * 2,
                             config.model.hidden_size,
                             bias=getattr(self.config, "eagle_fc_bias", False))
 
+        # Modify layer normalization and residual connections as suggested
+        # in the EAGLE framework: https://github.com/SafeAILab/EAGLE
+        self.model.model.layers[0].input_layernorm = DummyInputLayerNorm()
+        self.model.model.norm = DummyOutputNorm()
+
         self.orig_vocab_size = config.vocab_size
         self.truncated_vocab_size = config.truncated_vocab_size
         self.unpadded_vocab_size = self.truncated_vocab_size

From c32a7c7c0c688ed81d2f4ad701a09d0edd095ffe Mon Sep 17 00:00:00 2001
From: shaochangxu <85155497+shaochangxu@users.noreply.github.com>
Date: Sat, 11 Jan 2025 13:49:39 +0800
Subject: [PATCH 177/462] [Bugfix] fused_experts_impl wrong compute type for
 float32 (#11921)

Signed-off-by: shaochangxu.scx <shaochangxu.scx@antgroup.com>
Co-authored-by: shaochangxu.scx <shaochangxu.scx@antgroup.com>
---
 vllm/model_executor/layers/fused_moe/fused_moe.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index 1bb6bc753d37c..3ea6217d7c0ef 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -701,8 +701,14 @@ def fused_experts_impl(hidden_states: torch.Tensor,
                                       device=hidden_states.device,
                                       dtype=hidden_states.dtype)
 
-    compute_type = (tl.bfloat16
-                    if hidden_states.dtype == torch.bfloat16 else tl.float16)
+    if hidden_states.dtype == torch.bfloat16:
+        compute_type = tl.bfloat16
+    elif hidden_states.dtype == torch.float16:
+        compute_type = tl.float16
+    elif hidden_states.dtype == torch.float32:
+        compute_type = tl.float32
+    else:
+        raise ValueError(f"Unsupported compute_type: {hidden_states.dtype}")
 
     if inplace:
         out_hidden_states = hidden_states

From 7a3a83e3b87f50fe9c0985a5c5bcc1d4cf2e95cd Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Sat, 11 Jan 2025 13:50:05 +0800
Subject: [PATCH 178/462] [CI/Build] Move model-specific multi-modal processing
 tests (#11934)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 .buildkite/test-pipeline.yaml                 |   1 +
 .../processing => multimodal}/__init__.py     |   0
 .../models/multimodal/processing/__init__.py  |   0
 .../multimodal/processing/test_common.py      | 201 +++++++++++++++
 .../processing/test_idefics3.py               |   4 +-
 .../processing/test_internvl.py               |   4 +-
 .../processing/test_llava_next.py             |   2 +-
 .../processing/test_llava_onevision.py        |   2 +-
 .../processing/test_phi3v.py                  |   4 +-
 .../processing/test_qwen.py                   |   4 +-
 .../processing/test_qwen2_vl.py               |   4 +-
 tests/multimodal/test_processing.py           | 232 +-----------------
 tests/multimodal/utils.py                     |  33 +++
 13 files changed, 251 insertions(+), 240 deletions(-)
 rename tests/models/{decoder_only/vision_language/processing => multimodal}/__init__.py (100%)
 create mode 100644 tests/models/multimodal/processing/__init__.py
 create mode 100644 tests/models/multimodal/processing/test_common.py
 rename tests/models/{decoder_only/vision_language => multimodal}/processing/test_idefics3.py (98%)
 rename tests/models/{decoder_only/vision_language => multimodal}/processing/test_internvl.py (98%)
 rename tests/models/{decoder_only/vision_language => multimodal}/processing/test_llava_next.py (99%)
 rename tests/models/{decoder_only/vision_language => multimodal}/processing/test_llava_onevision.py (99%)
 rename tests/models/{decoder_only/vision_language => multimodal}/processing/test_phi3v.py (95%)
 rename tests/models/{decoder_only/vision_language => multimodal}/processing/test_qwen.py (98%)
 rename tests/models/{decoder_only/vision_language => multimodal}/processing/test_qwen2_vl.py (96%)
 create mode 100644 tests/multimodal/utils.py

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index d3bd809cfdf24..cf82210f96ee3 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -368,6 +368,7 @@ steps:
   - tests/models/encoder_decoder/vision_language
   commands:
     - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+    - pytest -v -s models/multimodal
     - pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
     - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model'
     - pytest -v -s models/embedding/vision_language -m core_model
diff --git a/tests/models/decoder_only/vision_language/processing/__init__.py b/tests/models/multimodal/__init__.py
similarity index 100%
rename from tests/models/decoder_only/vision_language/processing/__init__.py
rename to tests/models/multimodal/__init__.py
diff --git a/tests/models/multimodal/processing/__init__.py b/tests/models/multimodal/processing/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py
new file mode 100644
index 0000000000000..0a38779e0e4f0
--- /dev/null
+++ b/tests/models/multimodal/processing/test_common.py
@@ -0,0 +1,201 @@
+from functools import partial
+
+import numpy as np
+import pytest
+from PIL import Image
+
+from vllm.config import ModelConfig
+from vllm.inputs import InputProcessingContext
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.processing import ProcessingCache
+from vllm.multimodal.utils import cached_get_tokenizer
+
+from ....multimodal.utils import random_audio, random_image, random_video
+
+
+def _test_processing_correctness(
+    model_id: str,
+    modalities: dict[str, bool],
+    hit_rate: float,
+    num_batches: int,
+    simplify_rate: float,
+):
+    if model_id == "TIGER-Lab/Mantis-8B-siglip-llama3":
+        hf_overrides = {"architectures": ["MantisForConditionalGeneration"]}
+    else:
+        hf_overrides = {}
+
+    limit_mm_per_prompt = {
+        modality: 3 if supports_multi else 1
+        for modality, supports_multi in modalities.items()
+    }
+
+    model_config = ModelConfig(
+        model_id,
+        task="auto",
+        tokenizer=model_id,
+        tokenizer_mode="auto",
+        trust_remote_code=True,
+        seed=0,
+        dtype="float16",
+        revision=None,
+        hf_overrides=hf_overrides,
+        limit_mm_per_prompt=limit_mm_per_prompt,
+    )
+
+    model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
+    factories = MULTIMODAL_REGISTRY._processor_factories[model_cls]
+    ctx = InputProcessingContext(
+        model_config,
+        tokenizer=cached_get_tokenizer(model_config.tokenizer),
+    )
+    # Ensure that it can fit all of the data
+    cache = ProcessingCache(capacity=1 << 30)
+
+    baseline_processor = factories.build_processor(ctx, cache=None)
+    cached_processor = factories.build_processor(ctx, cache=cache)
+    dummy_inputs = baseline_processor.dummy_inputs
+    tokenizer = baseline_processor.info.get_tokenizer()
+
+    rng = np.random.RandomState(0)
+
+    input_to_hit = {
+        "image": Image.new("RGB", size=(128, 128)),
+        "video": np.zeros((4, 128, 128, 3), dtype=np.uint8),
+        "audio": (np.zeros((512, )), 16000),
+    }
+    input_factory = {
+        "image":
+        partial(random_image, rng, min_wh=128, max_wh=256),
+        "video":
+        partial(random_video,
+                rng,
+                min_frames=2,
+                max_frames=8,
+                min_wh=128,
+                max_wh=256),
+        "audio":
+        partial(random_audio, rng, min_len=512, max_len=1024, sr=16000),
+    }
+
+    for batch_idx in range(num_batches):
+        mm_data = {
+            k:
+            [(input_to_hit[k] if rng.rand() < hit_rate else input_factory[k]())
+             for _ in range(rng.randint(limit_mm_per_prompt[k]))]
+            for k in modalities
+        }
+
+        mm_counts = {k: len(vs) for k, vs in mm_data.items()}
+        prompt = dummy_inputs.get_dummy_processor_inputs(
+            model_config.max_model_len,
+            mm_counts,
+        ).prompt_text
+
+        # Drop unnecessary keys and test single -> multi conversion
+        if rng.rand() < simplify_rate:
+            for k in list(mm_data.keys()):
+                if not mm_data[k]:
+                    del mm_data[k]
+                elif len(mm_data[k]) == 1:
+                    mm_data[k] = mm_data[k][0]
+
+        baseline_result = baseline_processor.apply(
+            prompt,
+            mm_data=mm_data,
+            hf_processor_mm_kwargs={},
+        )
+        cached_result = cached_processor.apply(
+            prompt,
+            mm_data=mm_data,
+            hf_processor_mm_kwargs={},
+        )
+
+        assert baseline_result == cached_result, (
+            f"Failed ({batch_idx=}, {prompt=}, {mm_data=})")
+
+        baseline_tokenized_result = baseline_processor.apply(
+            tokenizer.encode(prompt),
+            mm_data=mm_data,
+            hf_processor_mm_kwargs={},
+        )
+
+        assert baseline_result == baseline_tokenized_result, (
+            f"Failed ({batch_idx=}, {prompt=}, {mm_data=})")
+
+        cached_tokenized_result = cached_processor.apply(
+            tokenizer.encode(prompt),
+            mm_data=mm_data,
+            hf_processor_mm_kwargs={},
+        )
+
+        assert cached_result == cached_tokenized_result, (
+            f"Failed ({batch_idx=}, {prompt=}, {mm_data=})")
+
+
+# yapf: disable
+# True if the model supports multiple data items of the modality per request
+@pytest.mark.parametrize(("model_id", "modalities"), [
+    ("rhymes-ai/Aria", {"image": True}),
+    ("Salesforce/blip2-opt-2.7b", {"image": False}),
+    ("facebook/chameleon-7b", {"image": False}),
+    ("adept/fuyu-8b", {"image": False}),
+    ("llava-hf/llava-1.5-7b-hf", {"image": True}),
+    ("llava-hf/llava-v1.6-mistral-7b-hf", {"image": True}),
+    ("llava-hf/LLaVA-NeXT-Video-7B-hf", {"video": False}),
+    ("llava-hf/llava-onevision-qwen2-0.5b-ov-hf", {"image": True, "video": True}),  # noqa: E501
+    ("TIGER-Lab/Mantis-8B-siglip-llama3", {"image": True}),
+    ("mistral-community/pixtral-12b", {"image": True}),
+    ("Qwen/Qwen2-VL-2B-Instruct", {"image": True, "video": True}),
+    ("Qwen/Qwen2-Audio-7B-Instruct", {"audio": True}),
+    ("fixie-ai/ultravox-v0_3", {"audio": True}),
+])
+@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
+@pytest.mark.parametrize("num_batches", [32])
+@pytest.mark.parametrize("simplify_rate", [1.0])
+# yapf: enable
+def test_processing_correctness(
+    model_id: str,
+    modalities: dict[str, bool],
+    hit_rate: float,
+    num_batches: int,
+    simplify_rate: float,
+):
+    _test_processing_correctness(
+        model_id,
+        modalities,
+        hit_rate=hit_rate,
+        num_batches=num_batches,
+        simplify_rate=simplify_rate,
+    )
+
+
+# yapf: disable
+@pytest.mark.parametrize(("model_id", "modalities"), [
+    ("microsoft/Phi-3-vision-128k-instruct", {"image": True}),
+])
+@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
+@pytest.mark.parametrize("num_batches", [32])
+@pytest.mark.parametrize("simplify_rate", [1.0])
+# yapf: enable
+def test_processing_correctness_phi3v(
+    model_id: str,
+    modalities: dict[str, bool],
+    hit_rate: float,
+    num_batches: int,
+    simplify_rate: float,
+):
+    # HACK - this is an attempted workaround for the following bug
+    # https://github.com/huggingface/transformers/issues/34307
+    from transformers import AutoImageProcessor  # noqa: F401
+    from transformers import AutoProcessor  # noqa: F401
+
+    AutoImageProcessor.from_pretrained(model_id, trust_remote_code=True)
+
+    _test_processing_correctness(
+        model_id,
+        modalities,
+        hit_rate=hit_rate,
+        num_batches=num_batches,
+        simplify_rate=simplify_rate,
+    )
diff --git a/tests/models/decoder_only/vision_language/processing/test_idefics3.py b/tests/models/multimodal/processing/test_idefics3.py
similarity index 98%
rename from tests/models/decoder_only/vision_language/processing/test_idefics3.py
rename to tests/models/multimodal/processing/test_idefics3.py
index c71a2d359043d..69b91ad4a5df8 100644
--- a/tests/models/decoder_only/vision_language/processing/test_idefics3.py
+++ b/tests/models/multimodal/processing/test_idefics3.py
@@ -8,8 +8,8 @@
 from vllm.inputs import InputContext, token_inputs
 from vllm.multimodal import MultiModalRegistry
 
-from .....conftest import _ImageAssets
-from ....utils import build_model_context
+from ....conftest import _ImageAssets
+from ...utils import build_model_context
 
 models = ["HuggingFaceM4/Idefics3-8B-Llama3"]
 
diff --git a/tests/models/decoder_only/vision_language/processing/test_internvl.py b/tests/models/multimodal/processing/test_internvl.py
similarity index 98%
rename from tests/models/decoder_only/vision_language/processing/test_internvl.py
rename to tests/models/multimodal/processing/test_internvl.py
index af0c2aa211998..d6c60595ca5ea 100644
--- a/tests/models/decoder_only/vision_language/processing/test_internvl.py
+++ b/tests/models/multimodal/processing/test_internvl.py
@@ -7,8 +7,8 @@
 from vllm.inputs import InputContext, token_inputs
 from vllm.multimodal import MultiModalRegistry
 
-from .....conftest import _ImageAssets
-from ....utils import build_model_context
+from ....conftest import _ImageAssets
+from ...utils import build_model_context
 
 models = ["OpenGVLab/InternVL2-2B"]
 
diff --git a/tests/models/decoder_only/vision_language/processing/test_llava_next.py b/tests/models/multimodal/processing/test_llava_next.py
similarity index 99%
rename from tests/models/decoder_only/vision_language/processing/test_llava_next.py
rename to tests/models/multimodal/processing/test_llava_next.py
index 689d17be81889..1eec35d9c3c72 100644
--- a/tests/models/decoder_only/vision_language/processing/test_llava_next.py
+++ b/tests/models/multimodal/processing/test_llava_next.py
@@ -10,7 +10,7 @@
 from vllm.multimodal.processing import BaseMultiModalProcessor
 from vllm.multimodal.utils import cached_get_tokenizer
 
-from ....utils import build_model_context
+from ...utils import build_model_context
 
 
 def _validate_image_prompt_replacements_one(
diff --git a/tests/models/decoder_only/vision_language/processing/test_llava_onevision.py b/tests/models/multimodal/processing/test_llava_onevision.py
similarity index 99%
rename from tests/models/decoder_only/vision_language/processing/test_llava_onevision.py
rename to tests/models/multimodal/processing/test_llava_onevision.py
index a033354f0e9b8..94ea604c58b43 100644
--- a/tests/models/decoder_only/vision_language/processing/test_llava_onevision.py
+++ b/tests/models/multimodal/processing/test_llava_onevision.py
@@ -10,7 +10,7 @@
 from vllm.multimodal.processing import BaseMultiModalProcessor
 from vllm.multimodal.utils import cached_get_tokenizer
 
-from ....utils import build_model_context
+from ...utils import build_model_context
 
 
 def _validate_image_prompt_replacements_one(
diff --git a/tests/models/decoder_only/vision_language/processing/test_phi3v.py b/tests/models/multimodal/processing/test_phi3v.py
similarity index 95%
rename from tests/models/decoder_only/vision_language/processing/test_phi3v.py
rename to tests/models/multimodal/processing/test_phi3v.py
index c5b77260c6544..7f82a8f18f0ca 100644
--- a/tests/models/decoder_only/vision_language/processing/test_phi3v.py
+++ b/tests/models/multimodal/processing/test_phi3v.py
@@ -4,8 +4,8 @@
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.utils import cached_get_tokenizer
 
-from .....conftest import _ImageAssets
-from ....utils import build_model_context
+from ....conftest import _ImageAssets
+from ...utils import build_model_context
 
 
 @pytest.mark.parametrize("model_id", ["microsoft/Phi-3.5-vision-instruct"])
diff --git a/tests/models/decoder_only/vision_language/processing/test_qwen.py b/tests/models/multimodal/processing/test_qwen.py
similarity index 98%
rename from tests/models/decoder_only/vision_language/processing/test_qwen.py
rename to tests/models/multimodal/processing/test_qwen.py
index 163220c91a27d..af0ace711ba3e 100644
--- a/tests/models/decoder_only/vision_language/processing/test_qwen.py
+++ b/tests/models/multimodal/processing/test_qwen.py
@@ -9,8 +9,8 @@
 from vllm.multimodal import MultiModalKwargs
 from vllm.multimodal.utils import cached_get_tokenizer
 
-from .....conftest import IMAGE_ASSETS
-from ....utils import build_model_context
+from ....conftest import IMAGE_ASSETS
+from ...utils import build_model_context
 
 ### Multimodal preprocessing tests
 SAMPLE_IMAGE = IMAGE_ASSETS[0].pil_image
diff --git a/tests/models/decoder_only/vision_language/processing/test_qwen2_vl.py b/tests/models/multimodal/processing/test_qwen2_vl.py
similarity index 96%
rename from tests/models/decoder_only/vision_language/processing/test_qwen2_vl.py
rename to tests/models/multimodal/processing/test_qwen2_vl.py
index 0d54802f2b733..de14fbbffe5b7 100644
--- a/tests/models/decoder_only/vision_language/processing/test_qwen2_vl.py
+++ b/tests/models/multimodal/processing/test_qwen2_vl.py
@@ -3,8 +3,8 @@
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.utils import cached_get_tokenizer
 
-from .....conftest import _ImageAssets
-from ....utils import build_model_context
+from ....conftest import _ImageAssets
+from ...utils import build_model_context
 
 
 @pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"])
diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py
index d18909a4197b6..54269c3ef7ce0 100644
--- a/tests/multimodal/test_processing.py
+++ b/tests/multimodal/test_processing.py
@@ -1,30 +1,25 @@
 from contextlib import nullcontext
-from functools import partial
 from typing import cast
 from unittest.mock import MagicMock
 
 import numpy as np
 import pytest
-from PIL import Image
 
 from vllm.config import ModelConfig
-from vllm.inputs import InputProcessingContext
 from vllm.multimodal import MULTIMODAL_REGISTRY
-# yapf conflicts with isort for this block
-# yapf: disable
-from vllm.multimodal.processing import (PlaceholderInfo, ProcessingCache,
-                                        PromptReplacement,
+from vllm.multimodal.processing import (PlaceholderInfo, PromptReplacement,
                                         find_mm_placeholders,
                                         find_text_matches, find_token_matches,
                                         iter_token_matches,
                                         replace_text_matches,
                                         replace_token_matches)
-# yapf: enable
 from vllm.multimodal.profiling import MultiModalProfiler
 from vllm.multimodal.utils import cached_get_tokenizer
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.utils import full_groupby
 
+from .utils import random_image
+
 
 # yapf: disable
 @pytest.mark.parametrize(
@@ -531,37 +526,6 @@ def test_find_mm_placeholders(
     assert result == expected
 
 
-def _rand_img(rng: np.random.RandomState, min_wh: int, max_wh: int):
-    w, h = rng.randint(min_wh, max_wh, size=(2, ))
-    arr = rng.randint(0, 255, size=(w, h, 3), dtype=np.uint8)
-    return Image.fromarray(arr)
-
-
-def _rand_video(
-    rng: np.random.RandomState,
-    min_frames: int,
-    max_frames: int,
-    min_wh: int,
-    max_wh: int,
-):
-    # Temporary workaround for https://github.com/huggingface/transformers/issues/35412
-    num_frames = rng.randint(min_frames, max_frames)
-    num_frames = (num_frames // 2) * 2
-
-    w, h = rng.randint(min_wh, max_wh, size=(2, ))
-    return rng.randint(0, 255, size=(num_frames, w, h, 3), dtype=np.uint8)
-
-
-def _rand_audio(
-    rng: np.random.RandomState,
-    min_len: int,
-    max_len: int,
-    sr: int,
-):
-    audio_len = rng.randint(min_len, max_len)
-    return rng.rand(audio_len), sr
-
-
 @pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
 @pytest.mark.parametrize(
     ("limit", "num_supported", "is_valid"),
@@ -628,7 +592,7 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid):
     )
 
     rng = np.random.RandomState(0)
-    image = _rand_img(rng, min_wh=128, max_wh=256)
+    image = random_image(rng, min_wh=128, max_wh=256)
     if num_images == 0:
         mm_data = {}
     elif num_images == 1:
@@ -647,191 +611,3 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid):
             mm_data=mm_data,
             hf_processor_mm_kwargs={},
         )
-
-
-def _test_processing_correctness(
-    model_id: str,
-    modalities: dict[str, bool],
-    hit_rate: float,
-    num_batches: int,
-    simplify_rate: float,
-):
-    if model_id == "TIGER-Lab/Mantis-8B-siglip-llama3":
-        hf_overrides = {"architectures": ["MantisForConditionalGeneration"]}
-    else:
-        hf_overrides = {}
-
-    limit_mm_per_prompt = {
-        modality: 3 if supports_multi else 1
-        for modality, supports_multi in modalities.items()
-    }
-
-    model_config = ModelConfig(
-        model_id,
-        task="auto",
-        tokenizer=model_id,
-        tokenizer_mode="auto",
-        trust_remote_code=True,
-        seed=0,
-        dtype="float16",
-        revision=None,
-        hf_overrides=hf_overrides,
-        limit_mm_per_prompt=limit_mm_per_prompt,
-    )
-
-    model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
-    factories = MULTIMODAL_REGISTRY._processor_factories[model_cls]
-    ctx = InputProcessingContext(
-        model_config,
-        tokenizer=cached_get_tokenizer(model_config.tokenizer),
-    )
-    # Ensure that it can fit all of the data
-    cache = ProcessingCache(capacity=1 << 30)
-
-    baseline_processor = factories.build_processor(ctx, cache=None)
-    cached_processor = factories.build_processor(ctx, cache=cache)
-    dummy_inputs = baseline_processor.dummy_inputs
-    tokenizer = baseline_processor.info.get_tokenizer()
-
-    rng = np.random.RandomState(0)
-
-    input_to_hit = {
-        "image": Image.new("RGB", size=(128, 128)),
-        "video": np.zeros((4, 128, 128, 3), dtype=np.uint8),
-        "audio": (np.zeros((512, )), 16000),
-    }
-    input_factory = {
-        "image":
-        partial(_rand_img, rng, min_wh=128, max_wh=256),
-        "video":
-        partial(_rand_video,
-                rng,
-                min_frames=2,
-                max_frames=8,
-                min_wh=128,
-                max_wh=256),
-        "audio":
-        partial(_rand_audio, rng, min_len=512, max_len=1024, sr=16000),
-    }
-
-    for batch_idx in range(num_batches):
-        mm_data = {
-            k:
-            [(input_to_hit[k] if rng.rand() < hit_rate else input_factory[k]())
-             for _ in range(rng.randint(limit_mm_per_prompt[k]))]
-            for k in modalities
-        }
-
-        mm_counts = {k: len(vs) for k, vs in mm_data.items()}
-        prompt = dummy_inputs.get_dummy_processor_inputs(
-            model_config.max_model_len,
-            mm_counts,
-        ).prompt_text
-
-        # Drop unnecessary keys and test single -> multi conversion
-        if rng.rand() < simplify_rate:
-            for k in list(mm_data.keys()):
-                if not mm_data[k]:
-                    del mm_data[k]
-                elif len(mm_data[k]) == 1:
-                    mm_data[k] = mm_data[k][0]
-
-        baseline_result = baseline_processor.apply(
-            prompt,
-            mm_data=mm_data,
-            hf_processor_mm_kwargs={},
-        )
-        cached_result = cached_processor.apply(
-            prompt,
-            mm_data=mm_data,
-            hf_processor_mm_kwargs={},
-        )
-
-        assert baseline_result == cached_result, (
-            f"Failed ({batch_idx=}, {prompt=}, {mm_data=})")
-
-        baseline_tokenized_result = baseline_processor.apply(
-            tokenizer.encode(prompt),
-            mm_data=mm_data,
-            hf_processor_mm_kwargs={},
-        )
-
-        assert baseline_result == baseline_tokenized_result, (
-            f"Failed ({batch_idx=}, {prompt=}, {mm_data=})")
-
-        cached_tokenized_result = cached_processor.apply(
-            tokenizer.encode(prompt),
-            mm_data=mm_data,
-            hf_processor_mm_kwargs={},
-        )
-
-        assert cached_result == cached_tokenized_result, (
-            f"Failed ({batch_idx=}, {prompt=}, {mm_data=})")
-
-
-# yapf: disable
-# True if the model supports multiple data items of the modality per request
-@pytest.mark.parametrize(("model_id", "modalities"), [
-    ("rhymes-ai/Aria", {"image": True}),
-    ("Salesforce/blip2-opt-2.7b", {"image": False}),
-    ("facebook/chameleon-7b", {"image": False}),
-    ("adept/fuyu-8b", {"image": False}),
-    ("llava-hf/llava-1.5-7b-hf", {"image": True}),
-    ("llava-hf/llava-v1.6-mistral-7b-hf", {"image": True}),
-    ("llava-hf/LLaVA-NeXT-Video-7B-hf", {"video": False}),
-    ("llava-hf/llava-onevision-qwen2-0.5b-ov-hf", {"image": True, "video": True}),  # noqa: E501
-    ("TIGER-Lab/Mantis-8B-siglip-llama3", {"image": True}),
-    ("mistral-community/pixtral-12b", {"image": True}),
-    ("Qwen/Qwen2-VL-2B-Instruct", {"image": True, "video": True}),
-    ("Qwen/Qwen2-Audio-7B-Instruct", {"audio": True}),
-    ("fixie-ai/ultravox-v0_3", {"audio": True}),
-])
-@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
-@pytest.mark.parametrize("num_batches", [32])
-@pytest.mark.parametrize("simplify_rate", [1.0])
-# yapf: enable
-def test_processing_correctness(
-    model_id: str,
-    modalities: dict[str, bool],
-    hit_rate: float,
-    num_batches: int,
-    simplify_rate: float,
-):
-    _test_processing_correctness(
-        model_id,
-        modalities,
-        hit_rate=hit_rate,
-        num_batches=num_batches,
-        simplify_rate=simplify_rate,
-    )
-
-
-# yapf: disable
-@pytest.mark.parametrize(("model_id", "modalities"), [
-    ("microsoft/Phi-3-vision-128k-instruct", {"image": True}),
-])
-@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
-@pytest.mark.parametrize("num_batches", [32])
-@pytest.mark.parametrize("simplify_rate", [1.0])
-# yapf: enable
-def test_processing_correctness_phi3v(
-    model_id: str,
-    modalities: dict[str, bool],
-    hit_rate: float,
-    num_batches: int,
-    simplify_rate: float,
-):
-    # HACK - this is an attempted workaround for the following bug
-    # https://github.com/huggingface/transformers/issues/34307
-    from transformers import AutoImageProcessor  # noqa: F401
-    from transformers import AutoProcessor  # noqa: F401
-
-    AutoImageProcessor.from_pretrained(model_id, trust_remote_code=True)
-
-    _test_processing_correctness(
-        model_id,
-        modalities,
-        hit_rate=hit_rate,
-        num_batches=num_batches,
-        simplify_rate=simplify_rate,
-    )
diff --git a/tests/multimodal/utils.py b/tests/multimodal/utils.py
new file mode 100644
index 0000000000000..29aeca605109b
--- /dev/null
+++ b/tests/multimodal/utils.py
@@ -0,0 +1,33 @@
+import numpy as np
+from PIL import Image
+
+
+def random_image(rng: np.random.RandomState, min_wh: int, max_wh: int):
+    w, h = rng.randint(min_wh, max_wh, size=(2, ))
+    arr = rng.randint(0, 255, size=(w, h, 3), dtype=np.uint8)
+    return Image.fromarray(arr)
+
+
+def random_video(
+    rng: np.random.RandomState,
+    min_frames: int,
+    max_frames: int,
+    min_wh: int,
+    max_wh: int,
+):
+    # Temporary workaround for https://github.com/huggingface/transformers/issues/35412
+    num_frames = rng.randint(min_frames, max_frames)
+    num_frames = (num_frames // 2) * 2
+
+    w, h = rng.randint(min_wh, max_wh, size=(2, ))
+    return rng.randint(0, 255, size=(num_frames, w, h, 3), dtype=np.uint8)
+
+
+def random_audio(
+    rng: np.random.RandomState,
+    min_len: int,
+    max_len: int,
+    sr: int,
+):
+    audio_len = rng.randint(min_len, max_len)
+    return rng.rand(audio_len), sr

From a991f7d5085e3e9474531f78639766eac3af607c Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Sat, 11 Jan 2025 21:27:24 +0800
Subject: [PATCH 179/462] [Doc] Basic guide for writing unit tests for new
 models (#11951)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 docs/source/contributing/model/basic.md       |  2 +-
 docs/source/contributing/model/index.md       |  1 +
 .../source/contributing/model/registration.md |  3 +-
 docs/source/contributing/model/tests.md       | 63 +++++++++++++++++++
 tests/models/registry.py                      |  5 ++
 tests/models/test_initialization.py           | 10 +++
 6 files changed, 81 insertions(+), 3 deletions(-)
 create mode 100644 docs/source/contributing/model/tests.md

diff --git a/docs/source/contributing/model/basic.md b/docs/source/contributing/model/basic.md
index 002808ac5fbbd..5c2dc486c8bea 100644
--- a/docs/source/contributing/model/basic.md
+++ b/docs/source/contributing/model/basic.md
@@ -1,6 +1,6 @@
 (new-model-basic)=
 
-# Basic Implementation
+# Implementing a Basic Model
 
 This guide walks you through the steps to implement a basic vLLM model.
 
diff --git a/docs/source/contributing/model/index.md b/docs/source/contributing/model/index.md
index 245e13b795ec4..fe018b61b08cf 100644
--- a/docs/source/contributing/model/index.md
+++ b/docs/source/contributing/model/index.md
@@ -10,6 +10,7 @@ This section provides more information on how to integrate a [PyTorch](https://p
 
 basic
 registration
+tests
 multimodal
 ```
 
diff --git a/docs/source/contributing/model/registration.md b/docs/source/contributing/model/registration.md
index 6a9262669cd29..d6c9e4181dfee 100644
--- a/docs/source/contributing/model/registration.md
+++ b/docs/source/contributing/model/registration.md
@@ -1,6 +1,6 @@
 (new-model-registration)=
 
-# Model Registration
+# Registering a Model to vLLM
 
 vLLM relies on a model registry to determine how to run each model.
 A list of pre-registered architectures can be found [here](#supported-models).
@@ -15,7 +15,6 @@ This gives you the ability to modify the codebase and test your model.
 
 After you have implemented your model (see [tutorial](#new-model-basic)), put it into the <gh-dir:vllm/model_executor/models> directory.
 Then, add your model class to `_VLLM_MODELS` in <gh-file:vllm/model_executor/models/registry.py> so that it is automatically registered upon importing vLLM.
-You should also include an example HuggingFace repository for this model in <gh-file:tests/models/registry.py> to run the unit tests.
 Finally, update our [list of supported models](#supported-models) to promote your model!
 
 ```{important}
diff --git a/docs/source/contributing/model/tests.md b/docs/source/contributing/model/tests.md
new file mode 100644
index 0000000000000..74c933b2f45da
--- /dev/null
+++ b/docs/source/contributing/model/tests.md
@@ -0,0 +1,63 @@
+(new-model-tests)=
+
+# Writing Unit Tests
+
+This page explains how to write unit tests to verify the implementation of your model.
+
+## Required Tests
+
+These tests are necessary to get your PR merged into vLLM library.
+Without them, the CI for your PR will fail.
+
+### Model loading
+
+Include an example HuggingFace repository for your model in <gh-file:tests/models/registry.py>.
+This enables a unit test that loads dummy weights to ensure that the model can be initialized in vLLM.
+
+```{important}
+The list of models in each section should be maintained in alphabetical order.
+```
+
+```{tip}
+If your model requires a development version of HF Transformers, you can set
+`min_transformers_version` to skip the test in CI until the model is released.
+```
+
+## Optional Tests
+
+These tests are optional to get your PR merged into vLLM library.
+Passing these tests provides more confidence that your implementation is correct, and helps avoid future regressions.
+
+### Model correctness
+
+These tests compare the model outputs of vLLM against [HF Transformers](https://github.com/huggingface/transformers). You can add new tests under the subdirectories of <gh-dir:tests/models>.
+
+#### Generative models
+
+For [generative models](#generative-models), there are two levels of correctness tests, as defined in <gh-file:tests/models/utils.py>:
+
+- Exact correctness (`check_outputs_equal`): The text outputted by vLLM should exactly match the text outputted by HF.
+- Logprobs similarity (`check_logprobs_close`): The logprobs outputted by vLLM should be in the top-k logprobs outputted by HF, and vice versa.
+
+#### Pooling models
+
+For [pooling models](#pooling-models), we simply check the cosine similarity, as defined in <gh-file:tests/models/embedding/utils.py>.
+
+(mm-processing-tests)=
+
+### Multi-modal processing
+
+#### Common tests
+
+Adding your model to <gh-file:tests/models/multimodal/processing/test_common.py> verifies that the following input combinations result in the same outputs:
+
+- Text + multi-modal data
+- Tokens + multi-modal data
+- Text + cached multi-modal data
+- Tokens + cached multi-modal data
+
+#### Model-specific tests
+
+You can add a new file under <gh-dir:tests/models/multimodal/processing> to run tests that only apply to your model.
+
+For example, if the HF processor for your model accepts user-specified keyword arguments, you can verify that the keyword arguments are being applied correctly, such as in <gh-file:tests/models/multimodal/processing/test_phi3v.py>.
diff --git a/tests/models/registry.py b/tests/models/registry.py
index dcb8bfa0f9510..f5aaa8eb071f9 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -22,6 +22,11 @@ class _HfExamplesInfo:
     for speculative decoding.
     """
 
+    min_transformers_version: Optional[str] = None
+    """
+    The minimum version of HF Transformers that is required to run this model.
+    """
+
     is_available_online: bool = True
     """
     Set this to ``False`` if the name of this architecture no longer exists on
diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py
index 3b728f2744fca..7a564c1f4a1d0 100644
--- a/tests/models/test_initialization.py
+++ b/tests/models/test_initialization.py
@@ -1,7 +1,9 @@
 from unittest.mock import patch
 
 import pytest
+from packaging.version import Version
 from transformers import PretrainedConfig
+from transformers import __version__ as TRANSFORMERS_VERSION
 
 from vllm import LLM
 
@@ -13,6 +15,14 @@ def test_can_initialize(model_arch):
     model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch)
     if not model_info.is_available_online:
         pytest.skip("Model is not available online")
+    if model_info.min_transformers_version is not None:
+        current_version = TRANSFORMERS_VERSION
+        required_version = model_info.min_transformers_version
+        if Version(current_version) < Version(required_version):
+            pytest.skip(
+                f"You have `transformers=={current_version}` installed, but "
+                f"`transformers>={required_version}` is required to run this "
+                "model")
 
     # Avoid OOM
     def hf_overrides(hf_config: PretrainedConfig) -> PretrainedConfig:

From d697dc01b4a25b96c3a1e88d72a058f17a717fd5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= <nlucches@redhat.com>
Date: Sat, 11 Jan 2025 15:05:09 +0100
Subject: [PATCH 180/462] [Bugfix] Fix RobertaModel loading (#11940)

Signed-off-by: NickLucche <nlucches@redhat.com>
---
 .../test_model_load_with_params.py            | 27 +++++++++-
 .../embedding/language/test_embedding.py      |  1 +
 vllm/model_executor/models/roberta.py         | 51 +++++++++++++++----
 3 files changed, 67 insertions(+), 12 deletions(-)

diff --git a/tests/model_executor/test_model_load_with_params.py b/tests/model_executor/test_model_load_with_params.py
index ed321ba9f00c1..0609fd96825e3 100644
--- a/tests/model_executor/test_model_load_with_params.py
+++ b/tests/model_executor/test_model_load_with_params.py
@@ -2,7 +2,7 @@
 
 import pytest
 
-from vllm.model_executor.layers.pooler import PoolingType
+from vllm.model_executor.layers.pooler import CLSPool, PoolingType
 from vllm.model_executor.models.bert import BertEmbeddingModel
 from vllm.model_executor.models.roberta import RobertaEmbeddingModel
 from vllm.platforms import current_platform
@@ -92,3 +92,28 @@ def test_roberta_model_loading_with_params(vllm_runner):
 
         # assert output
         assert output
+
+
+@pytest.mark.skipif(current_platform.is_rocm(),
+                    reason="Xformers backend is not supported on ROCm.")
+def test_facebook_roberta_model_loading_with_params(vllm_runner):
+    """
+    Test loading roberta-base model with no lm_head.
+    """
+    model_name = "FacebookAI/roberta-base"
+    with vllm_runner(model_name=model_name,
+                     dtype="float16",
+                     max_model_len=MAX_MODEL_LEN) as model:
+        output = model.encode("Write a short story about a robot that"
+                              " dreams for the first time.\n")
+
+        model_tokenizer = model.model.llm_engine.tokenizer
+        assert model_tokenizer.tokenizer_id == model_name
+
+        model = model.model.llm_engine.model_executor\
+                     .driver_worker.model_runner.model
+        assert not hasattr(model, "lm_head")
+        assert isinstance(model, RobertaEmbeddingModel)
+        assert isinstance(model._pooler, CLSPool)
+
+        assert output
diff --git a/tests/models/embedding/language/test_embedding.py b/tests/models/embedding/language/test_embedding.py
index 7749806548cd9..04ab4dd7371a3 100644
--- a/tests/models/embedding/language/test_embedding.py
+++ b/tests/models/embedding/language/test_embedding.py
@@ -25,6 +25,7 @@
         pytest.param("ssmits/Qwen2-7B-Instruct-embed-base"),
         pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct"),
         pytest.param("Alibaba-NLP/gte-Qwen2-7B-instruct"),
+        pytest.param("sentence-transformers/stsb-roberta-base-v2"),
     ],
 )
 @pytest.mark.parametrize("dtype", ["half"])
diff --git a/vllm/model_executor/models/roberta.py b/vllm/model_executor/models/roberta.py
index ba1a78ac640fd..5997a76890c9d 100644
--- a/vllm/model_executor/models/roberta.py
+++ b/vllm/model_executor/models/roberta.py
@@ -1,3 +1,4 @@
+import itertools
 from typing import Iterable, List, Optional, Tuple
 
 import torch
@@ -20,6 +21,30 @@
 from .interfaces import SupportsCrossEncoding
 
 
+def roberta_task_weights_filter(
+    all_weights: Iterable[Tuple[str, torch.Tensor]]
+) -> Tuple[Iterable[Tuple[str, torch.Tensor]], Iterable[Tuple[str,
+                                                              torch.Tensor]]]:
+    """
+    Separate task-specific weights that are applied on top
+    of the encoder-decoder bert base.
+    To do so, return two generators over the original iterator.
+    Also, remove the "roberta." prefix to make it loadable
+    from vanilla BertModel.
+    """
+    # Copy of a lazy iterator without in-memory overhead so both
+    # iterators can be iterated upon independently.
+    all_weights1, all_weights2 = itertools.tee(all_weights)
+
+    def encoder_decoder_weights():
+        for name, weight in all_weights1:
+            if name.startswith("roberta."):
+                yield (name[len("roberta."):], weight)
+
+    return encoder_decoder_weights(), ((n, w) for n, w in all_weights2
+                                       if not n.startswith("roberta."))
+
+
 class RobertaEmbedding(nn.Module):
 
     def __init__(self, config: RobertaConfig):
@@ -152,6 +177,18 @@ def _build_model(self,
                          prefix=prefix,
                          embedding_class=RobertaEmbedding)
 
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        weights = self.hf_to_vllm_mapper.apply(weights)
+        # Separate weights in "roberta"-prefixed and all else (not in memory).
+        # For use with models like FacebookAI/roberta-base.
+        bert_weights, task_weights = roberta_task_weights_filter(weights)
+        loaded = self.model.load_weights(bert_weights)
+        if not len(loaded):
+            # Fix for models like `sentence-transformers/stsb-roberta-base-v2`
+            # which use the same architecture, but have no "roberta" prefix.
+            loaded = self.model.load_weights(task_weights)
+        assert len(loaded), "Unable to load RobertaEmbeddingModel"
+
 
 class RobertaForSequenceClassification(nn.Module, SupportsCrossEncoding):
     """A model that uses Roberta to provide embedding functionalities.
@@ -181,20 +218,12 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
 
     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
 
-        self_weights = []
-
-        def weight_filter():
-            for name, weight in weights:
-                if name.startswith("roberta."):
-                    yield (name[len("roberta."):], weight)
-                else:
-                    self_weights.append((name, weight))
-
-        self.roberta.load_weights(weight_filter())
+        bert_weights, task_weights = roberta_task_weights_filter(weights)
+        self.roberta.load_weights(bert_weights)
 
         params_dict = dict(self.named_parameters())
 
-        for name, loaded_weight in self_weights:
+        for name, loaded_weight in task_weights:
             if name.startswith("classifier"):
                 param = params_dict[name]
                 weight_loader = getattr(param, "weight_loader",

From 4b657d32922cb6d3179f901e312715279ad9c728 Mon Sep 17 00:00:00 2001
From: sixgod <evethwillbeok@outlook.com>
Date: Sun, 12 Jan 2025 03:05:56 +0800
Subject: [PATCH 181/462] [Model] Add cogagent model support vLLM (#11742)

Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
---
 vllm/model_executor/models/chatglm.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py
index ffd6891b25965..7e37ce3086e6b 100644
--- a/vllm/model_executor/models/chatglm.py
+++ b/vllm/model_executor/models/chatglm.py
@@ -1,6 +1,6 @@
 # Adapted from
-# https://github.com/THUDM/GLM-4
-"""Inference-only ChatGLM model compatible with THUDM weights."""
+# https://github.com/THUDM/CogAgent
+"""Inference-only CogAgent model compatible with THUDM weights."""
 from argparse import Namespace
 from array import array
 from typing import (Dict, Iterable, List, Mapping, Optional, Set, Tuple,
@@ -201,7 +201,6 @@ def input_processor_for_glmv(ctx: InputContext, inputs: DecoderOnlyInputs):
 
     new_input_ids = []
     final_processed_position = 0
-    final_processed_position = 0
 
     for boi_position, eoi_position in zip(boi_positions, eoi_positions):
         assert boi_position < eoi_position
@@ -275,12 +274,15 @@ def __init__(
         # https://huggingface.co/THUDM/chatglm3-6b-32k/blob/e210410255278dd9d74463cf396ba559c0ef801c/modeling_chatglm.py#L141
         rope_ratio = getattr(config, "rope_ratio", 1.0)
         max_positions = getattr(config, "seq_length", 8192)
+        # NOTE: THUDM/cogagent-9b-20241220 uses original_rope=False,
+        # which is equivalent to is_neox_style=True
+        is_neox_style = not config.original_rope
         self.rotary_emb = get_rope(
             self.head_dim,
             rotary_dim=self.head_dim // 2,
             max_position=max_positions,
             base=10000 * rope_ratio,
-            is_neox_style=False,
+            is_neox_style=is_neox_style,
         )
         self.attn = Attention(self.num_heads,
                               self.head_dim,
@@ -779,4 +781,4 @@ def __new__(
             return ChatGLMV(vllm_config=vllm_config, prefix=prefix)
         # Initialize LLM
         else:
-            return ChatGLM(vllm_config=vllm_config, prefix=prefix)
+            return ChatGLM(vllm_config=vllm_config, prefix=prefix)
\ No newline at end of file

From b25cfab9a03b5c460fb92340b310d2a5c2dbc5da Mon Sep 17 00:00:00 2001
From: Roger Wang <136131678+ywang96@users.noreply.github.com>
Date: Sat, 11 Jan 2025 22:36:38 -0800
Subject: [PATCH 182/462] [V1] Avoid sending text prompt to core engine
 (#11963)

Signed-off-by: Roger Wang <ywang@roblox.com>
---
 vllm/v1/engine/__init__.py    | 4 ++--
 vllm/v1/engine/core_client.py | 6 ++++++
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py
index 5e3c5e327ef63..3ce9db0e47eed 100644
--- a/vllm/v1/engine/__init__.py
+++ b/vllm/v1/engine/__init__.py
@@ -19,8 +19,8 @@ class EngineCoreRequest:
     # due to circular imports and typing we have in data.py
 
     request_id: str
-    #NOTE(Nick): I don't think we need to pass prompt here since it should
-    # always be tokenized?
+    # NOTE(ywang96): original text prompt is needed when a request is added to
+    # Detokenizer, but set to None when it is added to EngineCoreClient.
     prompt: Optional[str]
     prompt_token_ids: List[int]
     mm_inputs: Optional[List[Optional["MultiModalKwargs"]]]
diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py
index a4a45ae05ff9e..4ed7f944b058f 100644
--- a/vllm/v1/engine/core_client.py
+++ b/vllm/v1/engine/core_client.py
@@ -219,6 +219,9 @@ def _send_input(self, request_type: EngineCoreRequestType,
         self.input_socket.send_multipart(msg, copy=False)
 
     def add_request(self, request: EngineCoreRequest) -> None:
+        # NOTE: text prompt is not needed in the core engine as it has been
+        # tokenized.
+        request.prompt = None
         self._send_input(EngineCoreRequestType.ADD, request)
 
     def abort_requests(self, request_ids: List[str]) -> None:
@@ -257,6 +260,9 @@ async def _send_input(self, request_type: EngineCoreRequestType,
         await self.input_socket.send_multipart(msg, copy=False)
 
     async def add_request_async(self, request: EngineCoreRequest) -> None:
+        # NOTE: text prompt is not needed in the core engine as it has been
+        # tokenized.
+        request.prompt = None
         await self._send_input(EngineCoreRequestType.ADD, request)
 
     async def abort_requests_async(self, request_ids: List[str]) -> None:

From 43f3d9e6990811461ecb42bc50a17aad944d30f9 Mon Sep 17 00:00:00 2001
From: Rafael Vasquez <rafvasq21@gmail.com>
Date: Sun, 12 Jan 2025 03:17:13 -0500
Subject: [PATCH 183/462] [CI/Build] Add markdown linter (#11857)

Signed-off-by: Rafael Vasquez <rafvasq21@gmail.com>
---
 .../{sphinx-lint.yml => doc-lint.yml}         |   4 +-
 docs/README.md                                |   1 +
 docs/source/api/model/index.md                |   1 -
 docs/source/community/sponsors.md             |   2 +
 docs/source/contributing/model/multimodal.md  |   4 +
 docs/source/contributing/overview.md          |   2 -
 docs/source/deployment/docker.md              |   4 +-
 .../source/deployment/frameworks/cerebrium.md |  10 +-
 docs/source/deployment/frameworks/dstack.md   |  10 +-
 docs/source/deployment/frameworks/skypilot.md |  12 +-
 .../deployment/integrations/llamastack.md     |   2 +-
 docs/source/deployment/k8s.md                 | 451 +++++++++---------
 .../source/design/automatic_prefix_caching.md |  11 +-
 docs/source/features/quantization/auto_awq.md |   4 +-
 docs/source/features/quantization/bnb.md      |   7 +-
 docs/source/features/quantization/fp8.md      |   4 +-
 .../features/quantization/fp8_e4m3_kvcache.md |   2 +-
 docs/source/features/quantization/gguf.md     |  10 +-
 docs/source/features/quantization/int8.md     |   2 +-
 docs/source/features/spec_decode.md           |  10 +-
 docs/source/features/tool_calling.md          |  44 +-
 docs/source/getting_started/faq.md            |   2 +-
 .../getting_started/installation/cpu-apple.md |  17 +-
 .../getting_started/installation/cpu-x86.md   |  44 +-
 .../getting_started/installation/gpu-cuda.md  |  96 ++--
 .../getting_started/installation/gpu-rocm.md  | 116 ++---
 .../getting_started/installation/hpu-gaudi.md |  64 +--
 .../getting_started/installation/neuron.md    |   8 +-
 .../getting_started/installation/openvino.md  |  14 +-
 .../getting_started/installation/tpu.md       |   6 +-
 .../getting_started/installation/xpu.md       |  24 +-
 docs/source/getting_started/quickstart.md     |  50 +-
 .../source/getting_started/troubleshooting.md |   6 +-
 docs/source/index.md                          |   4 +-
 .../models/extensions/runai_model_streamer.md |  12 +-
 docs/source/models/supported_models.md        |  13 +-
 docs/source/performance/optimization.md       |   2 +-
 docs/source/serving/distributed_serving.md    |  40 +-
 docs/source/serving/integrations/langchain.md |   2 +-
 .../source/serving/integrations/llamaindex.md |   2 +-
 docs/source/serving/metrics.md                |   2 +-
 docs/source/serving/multimodal_inputs.md      |   1 +
 docs/source/serving/offline_inference.md      |   2 +-
 .../serving/openai_compatible_server.md       |  19 +-
 format.sh                                     |  10 +-
 pyproject.toml                                |   6 +
 requirements-lint.txt                         |   2 +-
 tools/doc-lint.sh                             |   3 +
 tools/sphinx-lint.sh                          |   3 -
 49 files changed, 596 insertions(+), 571 deletions(-)
 rename .github/workflows/{sphinx-lint.yml => doc-lint.yml} (93%)
 create mode 100755 tools/doc-lint.sh
 delete mode 100755 tools/sphinx-lint.sh

diff --git a/.github/workflows/sphinx-lint.yml b/.github/workflows/doc-lint.yml
similarity index 93%
rename from .github/workflows/sphinx-lint.yml
rename to .github/workflows/doc-lint.yml
index e0bb24276a653..2f5ee8bbfd8c5 100644
--- a/.github/workflows/sphinx-lint.yml
+++ b/.github/workflows/doc-lint.yml
@@ -13,7 +13,7 @@ on:
       - "docs/**"
 
 jobs:
-  sphinx-lint:
+  doc-lint:
     runs-on: ubuntu-latest
     strategy:
       matrix:
@@ -29,4 +29,4 @@ jobs:
           python -m pip install --upgrade pip
           pip install -r requirements-lint.txt
       - name: Linting docs
-        run: tools/sphinx-lint.sh
+        run: tools/doc-lint.sh
diff --git a/docs/README.md b/docs/README.md
index 46488c9bb0b92..1a44c1341f4fb 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -16,4 +16,5 @@ make html
 ```bash
 python -m http.server -d build/html/
 ```
+
 Launch your browser and open localhost:8000.
diff --git a/docs/source/api/model/index.md b/docs/source/api/model/index.md
index b8437e3c3517a..113792147be7c 100644
--- a/docs/source/api/model/index.md
+++ b/docs/source/api/model/index.md
@@ -9,4 +9,3 @@ interfaces_base
 interfaces
 adapters
 ```
-
diff --git a/docs/source/community/sponsors.md b/docs/source/community/sponsors.md
index 9d2af4c13b088..fb93e65673dff 100644
--- a/docs/source/community/sponsors.md
+++ b/docs/source/community/sponsors.md
@@ -6,6 +6,7 @@ vLLM is a community project. Our compute resources for development and testing a
 <!-- Note: Please keep these consistent with README.md. -->
 
 Cash Donations:
+
 - a16z
 - Dropbox
 - Sequoia Capital
@@ -13,6 +14,7 @@ Cash Donations:
 - ZhenFund
 
 Compute Resources:
+
 - AMD
 - Anyscale
 - AWS
diff --git a/docs/source/contributing/model/multimodal.md b/docs/source/contributing/model/multimodal.md
index 76ab73e43d24b..99f6a1d5462c4 100644
--- a/docs/source/contributing/model/multimodal.md
+++ b/docs/source/contributing/model/multimodal.md
@@ -200,6 +200,7 @@ def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
 ```{note}
 Our [actual code](gh-file:vllm/model_executor/models/llava.py) is more abstracted to support vision encoders other than CLIP.
 ```
+
 :::
 ::::
 
@@ -248,6 +249,7 @@ def get_dummy_processor_inputs(
         mm_data=mm_data,
     )
 ```
+
 :::
 ::::
 
@@ -312,6 +314,7 @@ def _get_mm_fields_config(
 Our [actual code](gh-file:vllm/model_executor/models/llava.py) additionally supports
 pre-computed image embeddings, which can be passed to be model via the `image_embeds` argument.
 ```
+
 :::
 ::::
 
@@ -369,6 +372,7 @@ def _get_prompt_replacements(
         ),
     ]
 ```
+
 :::
 ::::
 
diff --git a/docs/source/contributing/overview.md b/docs/source/contributing/overview.md
index c960790f47a13..e92104399342d 100644
--- a/docs/source/contributing/overview.md
+++ b/docs/source/contributing/overview.md
@@ -37,8 +37,6 @@ pytest tests/
 Currently, the repository is not fully checked by `mypy`.
 ```
 
-# Contribution Guidelines
-
 ## Issues
 
 If you encounter a bug or have a feature request, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible.
diff --git a/docs/source/deployment/docker.md b/docs/source/deployment/docker.md
index 2df1aca27f1e6..c735bfd0e87a7 100644
--- a/docs/source/deployment/docker.md
+++ b/docs/source/deployment/docker.md
@@ -28,8 +28,8 @@ memory to share data between processes under the hood, particularly for tensor p
 You can build and run vLLM from source via the provided <gh-file:Dockerfile>. To build vLLM:
 
 ```console
-$ # optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2
-$ DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai
+# optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2
+DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai
 ```
 
 ```{note}
diff --git a/docs/source/deployment/frameworks/cerebrium.md b/docs/source/deployment/frameworks/cerebrium.md
index be018dfb75d7a..5787c4a407bfb 100644
--- a/docs/source/deployment/frameworks/cerebrium.md
+++ b/docs/source/deployment/frameworks/cerebrium.md
@@ -13,14 +13,14 @@ vLLM can be run on a cloud based GPU machine with [Cerebrium](https://www.cerebr
 To install the Cerebrium client, run:
 
 ```console
-$ pip install cerebrium
-$ cerebrium login
+pip install cerebrium
+cerebrium login
 ```
 
 Next, create your Cerebrium project, run:
 
 ```console
-$ cerebrium init vllm-project
+cerebrium init vllm-project
 ```
 
 Next, to install the required packages, add the following to your cerebrium.toml:
@@ -58,10 +58,10 @@ def run(prompts: list[str], temperature: float = 0.8, top_p: float = 0.95):
 Then, run the following code to deploy it to the cloud:
 
 ```console
-$ cerebrium deploy
+cerebrium deploy
 ```
 
-If successful, you should be returned a CURL command that you can call inference against. Just remember to end the url with the function name you are calling (in our case` /run`)
+If successful, you should be returned a CURL command that you can call inference against. Just remember to end the url with the function name you are calling (in our case`/run`)
 
 ```python
 curl -X POST https://api.cortex.cerebrium.ai/v4/p-xxxxxx/vllm/run \
diff --git a/docs/source/deployment/frameworks/dstack.md b/docs/source/deployment/frameworks/dstack.md
index 4142c1d9f1f60..b42a34125c6d7 100644
--- a/docs/source/deployment/frameworks/dstack.md
+++ b/docs/source/deployment/frameworks/dstack.md
@@ -13,16 +13,16 @@ vLLM can be run on a cloud based GPU machine with [dstack](https://dstack.ai/),
 To install dstack client, run:
 
 ```console
-$ pip install "dstack[all]
-$ dstack server
+pip install "dstack[all]
+dstack server
 ```
 
 Next, to configure your dstack project, run:
 
 ```console
-$ mkdir -p vllm-dstack
-$ cd vllm-dstack
-$ dstack init
+mkdir -p vllm-dstack
+cd vllm-dstack
+dstack init
 ```
 
 Next, to provision a VM instance with LLM of your choice (`NousResearch/Llama-2-7b-chat-hf` for this example), create the following `serve.dstack.yml` file for the dstack `Service`:
diff --git a/docs/source/deployment/frameworks/skypilot.md b/docs/source/deployment/frameworks/skypilot.md
index 657e7f2bc72cc..051fc2f2a8d4e 100644
--- a/docs/source/deployment/frameworks/skypilot.md
+++ b/docs/source/deployment/frameworks/skypilot.md
@@ -334,12 +334,12 @@ run: |
 
 1. Start the chat web UI:
 
-```console
-sky launch -c gui ./gui.yaml --env ENDPOINT=$(sky serve status --endpoint vllm)
-```
+    ```console
+    sky launch -c gui ./gui.yaml --env ENDPOINT=$(sky serve status --endpoint vllm)
+    ```
 
 2. Then, we can access the GUI at the returned gradio link:
 
-```console
-| INFO | stdout | Running on public URL: https://6141e84201ce0bb4ed.gradio.live
-```
+    ```console
+    | INFO | stdout | Running on public URL: https://6141e84201ce0bb4ed.gradio.live
+    ```
diff --git a/docs/source/deployment/integrations/llamastack.md b/docs/source/deployment/integrations/llamastack.md
index 474d2bdfa9580..a6c3569637abf 100644
--- a/docs/source/deployment/integrations/llamastack.md
+++ b/docs/source/deployment/integrations/llamastack.md
@@ -7,7 +7,7 @@ vLLM is also available via [Llama Stack](https://github.com/meta-llama/llama-sta
 To install Llama Stack, run
 
 ```console
-$ pip install llama-stack -q
+pip install llama-stack -q
 ```
 
 ## Inference using OpenAI Compatible API
diff --git a/docs/source/deployment/k8s.md b/docs/source/deployment/k8s.md
index 760214e112fba..cbc95c20ff4b3 100644
--- a/docs/source/deployment/k8s.md
+++ b/docs/source/deployment/k8s.md
@@ -14,234 +14,235 @@ Before you begin, ensure that you have the following:
 
 ## Deployment Steps
 
-1. **Create a PVC , Secret and Deployment for vLLM**
-
-PVC is used to store the model cache and it is optional, you can use hostPath or other storage options
-
-```yaml
-apiVersion: v1
-kind: PersistentVolumeClaim
-metadata:
-  name: mistral-7b
-  namespace: default
-spec:
-  accessModes:
-  - ReadWriteOnce
-  resources:
-    requests:
-      storage: 50Gi
-  storageClassName: default
-  volumeMode: Filesystem
-```
-
-Secret is optional and only required for accessing gated models, you can skip this step if you are not using gated models
-
-```yaml
-apiVersion: v1
-kind: Secret
-metadata:
-  name: hf-token-secret
-  namespace: default
-type: Opaque
-stringData:
-  token: "REPLACE_WITH_TOKEN"
-```
-
-Next to create the deployment file for vLLM to run the model server. The following example deploys the `Mistral-7B-Instruct-v0.3` model.
-
-Here are two examples for using NVIDIA GPU and AMD GPU. 
-
-- NVIDIA GPU
-
-```yaml
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: mistral-7b
-  namespace: default
-  labels:
-    app: mistral-7b
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: mistral-7b
-  template:
-    metadata:
-      labels:
-        app: mistral-7b
-    spec:
-      volumes:
-      - name: cache-volume
-        persistentVolumeClaim:
-          claimName: mistral-7b
-      # vLLM needs to access the host's shared memory for tensor parallel inference.
-      - name: shm
-        emptyDir:
-          medium: Memory
-          sizeLimit: "2Gi"
-      containers:
-      - name: mistral-7b
-        image: vllm/vllm-openai:latest
-        command: ["/bin/sh", "-c"]
-        args: [
-          "vllm serve mistralai/Mistral-7B-Instruct-v0.3 --trust-remote-code --enable-chunked-prefill --max_num_batched_tokens 1024"
-        ]
-        env:
-        - name: HUGGING_FACE_HUB_TOKEN
-          valueFrom:
-            secretKeyRef:
-              name: hf-token-secret
-              key: token
-        ports:
-        - containerPort: 8000
+1. Create a PVC, Secret and Deployment for vLLM
+
+      PVC is used to store the model cache and it is optional, you can use hostPath or other storage options
+
+      ```yaml
+      apiVersion: v1
+      kind: PersistentVolumeClaim
+      metadata:
+        name: mistral-7b
+        namespace: default
+      spec:
+        accessModes:
+        - ReadWriteOnce
         resources:
-          limits:
-            cpu: "10"
-            memory: 20G
-            nvidia.com/gpu: "1"
           requests:
-            cpu: "2"
-            memory: 6G
-            nvidia.com/gpu: "1"
-        volumeMounts:
-        - mountPath: /root/.cache/huggingface
-          name: cache-volume
-        - name: shm
-          mountPath: /dev/shm
-        livenessProbe:
-          httpGet:
-            path: /health
-            port: 8000
-          initialDelaySeconds: 60
-          periodSeconds: 10
-        readinessProbe:
-          httpGet:
-            path: /health
-            port: 8000
-          initialDelaySeconds: 60
-          periodSeconds: 5
-```
-
-- AMD GPU
-
-You can refer to the `deployment.yaml` below if using AMD ROCm GPU like MI300X.
-
-```yaml
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: mistral-7b
-  namespace: default
-  labels:
-    app: mistral-7b
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: mistral-7b
-  template:
-    metadata:
-      labels:
-        app: mistral-7b
-    spec:
-      volumes:
-      # PVC
-      - name: cache-volume
-        persistentVolumeClaim:
-          claimName: mistral-7b
-      # vLLM needs to access the host's shared memory for tensor parallel inference.
-      - name: shm
-        emptyDir:
-          medium: Memory
-          sizeLimit: "8Gi"
-      hostNetwork: true
-      hostIPC: true
-      containers:
-      - name: mistral-7b
-        image: rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4
-        securityContext:
-          seccompProfile:
-            type: Unconfined
-          runAsGroup: 44
-          capabilities:
-            add:
-            - SYS_PTRACE
-        command: ["/bin/sh", "-c"]
-        args: [
-          "vllm serve mistralai/Mistral-7B-v0.3 --port 8000 --trust-remote-code --enable-chunked-prefill --max_num_batched_tokens 1024"
-        ]
-        env:
-        - name: HUGGING_FACE_HUB_TOKEN
-          valueFrom:
-            secretKeyRef:
-              name: hf-token-secret
-              key: token
+            storage: 50Gi
+        storageClassName: default
+        volumeMode: Filesystem
+      ```
+
+      Secret is optional and only required for accessing gated models, you can skip this step if you are not using gated models
+
+      ```yaml
+      apiVersion: v1
+      kind: Secret
+      metadata:
+        name: hf-token-secret
+        namespace: default
+      type: Opaque
+      stringData:
+        token: "REPLACE_WITH_TOKEN"
+      ```
+
+      Next to create the deployment file for vLLM to run the model server. The following example deploys the `Mistral-7B-Instruct-v0.3` model.
+
+      Here are two examples for using NVIDIA GPU and AMD GPU.
+
+      NVIDIA GPU:
+
+      ```yaml
+      apiVersion: apps/v1
+      kind: Deployment
+      metadata:
+        name: mistral-7b
+        namespace: default
+        labels:
+          app: mistral-7b
+      spec:
+        replicas: 1
+        selector:
+          matchLabels:
+            app: mistral-7b
+        template:
+          metadata:
+            labels:
+              app: mistral-7b
+          spec:
+            volumes:
+            - name: cache-volume
+              persistentVolumeClaim:
+                claimName: mistral-7b
+            # vLLM needs to access the host's shared memory for tensor parallel inference.
+            - name: shm
+              emptyDir:
+                medium: Memory
+                sizeLimit: "2Gi"
+            containers:
+            - name: mistral-7b
+              image: vllm/vllm-openai:latest
+              command: ["/bin/sh", "-c"]
+              args: [
+                "vllm serve mistralai/Mistral-7B-Instruct-v0.3 --trust-remote-code --enable-chunked-prefill --max_num_batched_tokens 1024"
+              ]
+              env:
+              - name: HUGGING_FACE_HUB_TOKEN
+                valueFrom:
+                  secretKeyRef:
+                    name: hf-token-secret
+                    key: token
+              ports:
+              - containerPort: 8000
+              resources:
+                limits:
+                  cpu: "10"
+                  memory: 20G
+                  nvidia.com/gpu: "1"
+                requests:
+                  cpu: "2"
+                  memory: 6G
+                  nvidia.com/gpu: "1"
+              volumeMounts:
+              - mountPath: /root/.cache/huggingface
+                name: cache-volume
+              - name: shm
+                mountPath: /dev/shm
+              livenessProbe:
+                httpGet:
+                  path: /health
+                  port: 8000
+                initialDelaySeconds: 60
+                periodSeconds: 10
+              readinessProbe:
+                httpGet:
+                  path: /health
+                  port: 8000
+                initialDelaySeconds: 60
+                periodSeconds: 5
+      ```
+
+      AMD GPU:
+
+      You can refer to the `deployment.yaml` below if using AMD ROCm GPU like MI300X.
+
+      ```yaml
+      apiVersion: apps/v1
+      kind: Deployment
+      metadata:
+        name: mistral-7b
+        namespace: default
+        labels:
+          app: mistral-7b
+      spec:
+        replicas: 1
+        selector:
+          matchLabels:
+            app: mistral-7b
+        template:
+          metadata:
+            labels:
+              app: mistral-7b
+          spec:
+            volumes:
+            # PVC
+            - name: cache-volume
+              persistentVolumeClaim:
+                claimName: mistral-7b
+            # vLLM needs to access the host's shared memory for tensor parallel inference.
+            - name: shm
+              emptyDir:
+                medium: Memory
+                sizeLimit: "8Gi"
+            hostNetwork: true
+            hostIPC: true
+            containers:
+            - name: mistral-7b
+              image: rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4
+              securityContext:
+                seccompProfile:
+                  type: Unconfined
+                runAsGroup: 44
+                capabilities:
+                  add:
+                  - SYS_PTRACE
+              command: ["/bin/sh", "-c"]
+              args: [
+                "vllm serve mistralai/Mistral-7B-v0.3 --port 8000 --trust-remote-code --enable-chunked-prefill --max_num_batched_tokens 1024"
+              ]
+              env:
+              - name: HUGGING_FACE_HUB_TOKEN
+                valueFrom:
+                  secretKeyRef:
+                    name: hf-token-secret
+                    key: token
+              ports:
+              - containerPort: 8000
+              resources:
+                limits:
+                  cpu: "10"
+                  memory: 20G
+                  amd.com/gpu: "1"
+                requests:
+                  cpu: "6"
+                  memory: 6G
+                  amd.com/gpu: "1"
+              volumeMounts:
+              - name: cache-volume
+                mountPath: /root/.cache/huggingface
+              - name: shm
+                mountPath: /dev/shm
+      ```
+
+      You can get the full example with steps and sample yaml files from <https://github.com/ROCm/k8s-device-plugin/tree/master/example/vllm-serve>.
+
+2. Create a Kubernetes Service for vLLM
+
+      Next, create a Kubernetes Service file to expose the `mistral-7b` deployment:
+
+      ```yaml
+      apiVersion: v1
+      kind: Service
+      metadata:
+        name: mistral-7b
+        namespace: default
+      spec:
         ports:
-        - containerPort: 8000
-        resources:
-          limits:
-            cpu: "10"
-            memory: 20G
-            amd.com/gpu: "1"
-          requests:
-            cpu: "6"
-            memory: 6G
-            amd.com/gpu: "1"
-        volumeMounts:
-        - name: cache-volume
-          mountPath: /root/.cache/huggingface
-        - name: shm
-          mountPath: /dev/shm
-```
-You can get the full example with steps and sample yaml files from <https://github.com/ROCm/k8s-device-plugin/tree/master/example/vllm-serve>.
-
-2. **Create a Kubernetes Service for vLLM**
-
-Next, create a Kubernetes Service file to expose the `mistral-7b` deployment:
-
-```yaml
-apiVersion: v1
-kind: Service
-metadata:
-  name: mistral-7b
-  namespace: default
-spec:
-  ports:
-  - name: http-mistral-7b
-    port: 80
-    protocol: TCP
-    targetPort: 8000
-  # The label selector should match the deployment labels & it is useful for prefix caching feature
-  selector:
-    app: mistral-7b
-  sessionAffinity: None
-  type: ClusterIP
-```
-
-3. **Deploy and Test**
-
-Apply the deployment and service configurations using `kubectl apply -f <filename>`:
-
-```console
-kubectl apply -f deployment.yaml
-kubectl apply -f service.yaml
-```
-
-To test the deployment, run the following `curl` command:
-
-```console
-curl http://mistral-7b.default.svc.cluster.local/v1/completions \
-  -H "Content-Type: application/json" \
-  -d '{
-        "model": "mistralai/Mistral-7B-Instruct-v0.3",
-        "prompt": "San Francisco is a",
-        "max_tokens": 7,
-        "temperature": 0
-      }'
-```
-
-If the service is correctly deployed, you should receive a response from the vLLM model.
+        - name: http-mistral-7b
+          port: 80
+          protocol: TCP
+          targetPort: 8000
+        # The label selector should match the deployment labels & it is useful for prefix caching feature
+        selector:
+          app: mistral-7b
+        sessionAffinity: None
+        type: ClusterIP
+      ```
+
+3. Deploy and Test
+
+      Apply the deployment and service configurations using `kubectl apply -f <filename>`:
+
+      ```console
+      kubectl apply -f deployment.yaml
+      kubectl apply -f service.yaml
+      ```
+
+      To test the deployment, run the following `curl` command:
+
+      ```console
+      curl http://mistral-7b.default.svc.cluster.local/v1/completions \
+        -H "Content-Type: application/json" \
+        -d '{
+              "model": "mistralai/Mistral-7B-Instruct-v0.3",
+              "prompt": "San Francisco is a",
+              "max_tokens": 7,
+              "temperature": 0
+            }'
+      ```
+
+      If the service is correctly deployed, you should receive a response from the vLLM model.
 
 ## Conclusion
 
diff --git a/docs/source/design/automatic_prefix_caching.md b/docs/source/design/automatic_prefix_caching.md
index 6d3dd056e6a60..3928e0c16568b 100644
--- a/docs/source/design/automatic_prefix_caching.md
+++ b/docs/source/design/automatic_prefix_caching.md
@@ -6,7 +6,7 @@ The core idea of [PagedAttention](https://blog.vllm.ai/2023/06/20/vllm.html) is
 
 To automatically cache the KV cache, we utilize the following key observation: Each KV block can be uniquely identified by the tokens within the block and the tokens in the prefix before the block.
 
-```
+```text
                     Block 1                  Block 2                  Block 3
          [A gentle breeze stirred] [the leaves as children] [laughed in the distance]
 Block 1: |<--- block tokens ---->|
@@ -14,19 +14,16 @@ Block 2: |<------- prefix ------>| |<--- block tokens --->|
 Block 3: |<------------------ prefix -------------------->| |<--- block tokens ---->|
 ```
 
-
 In the example above, the KV cache in the first block can be uniquely identified with the tokens “A gentle breeze stirred”. The third block can be uniquely identified with the tokens in the block “laughed in the distance”, along with the prefix tokens “A gentle breeze stirred the leaves as children”. Therefore, we can build the following one-to-one mapping:
 
-```
+```text
 hash(prefix tokens + block tokens) <--> KV Block
 ```
 
 With this mapping, we can add another indirection in vLLM’s KV cache management. Previously, each sequence in vLLM maintained a mapping from their logical KV blocks to physical blocks. To achieve automatic caching of KV blocks, we map the logical KV blocks to their hash value and maintain a global hash table of all the physical blocks. In this way, all the KV blocks sharing the same hash value (e.g., shared prefix blocks across two requests) can be mapped to the same physical block and share the memory space.
 
-
 This design achieves automatic prefix caching without the need of maintaining a tree structure among the KV blocks. More specifically, all of the blocks are independent of each other and can be allocated and freed by itself, which enables us to manages the KV cache as ordinary caches in operating system.
 
-
 ## Generalized Caching Policy
 
 Keeping all the KV blocks in a hash table enables vLLM to cache KV blocks from earlier requests to save memory and accelerate the computation of future requests. For example, if a new request shares the system prompt with the previous request, the KV cache of the shared prompt can directly be used for the new request without recomputation. However, the total KV cache space is limited and we have to decide which KV blocks to keep or evict when the cache is full.
@@ -41,5 +38,5 @@ Note that this eviction policy effectively implements the exact policy as in [Ra
 
 However, the hash-based KV cache management gives us the flexibility to handle more complicated serving scenarios and implement more complicated eviction policies beyond the policy above:
 
-- Multi-LoRA serving. When serving requests for multiple LoRA adapters, we can simply let the hash of each KV block to also include the LoRA ID the request is querying for to enable caching for all adapters. In this way, we can jointly manage the KV blocks for different adapters, which simplifies the system implementation and improves the global cache hit rate and efficiency.
-- Multi-modal models. When the user input includes more than just discrete tokens, we can use different hashing methods to handle the caching of inputs of different modalities. For example, perceptual hashing for images to cache similar input images.
+* Multi-LoRA serving. When serving requests for multiple LoRA adapters, we can simply let the hash of each KV block to also include the LoRA ID the request is querying for to enable caching for all adapters. In this way, we can jointly manage the KV blocks for different adapters, which simplifies the system implementation and improves the global cache hit rate and efficiency.
+* Multi-modal models. When the user input includes more than just discrete tokens, we can use different hashing methods to handle the caching of inputs of different modalities. For example, perceptual hashing for images to cache similar input images.
diff --git a/docs/source/features/quantization/auto_awq.md b/docs/source/features/quantization/auto_awq.md
index 3679595e3d4d0..404505eb3890e 100644
--- a/docs/source/features/quantization/auto_awq.md
+++ b/docs/source/features/quantization/auto_awq.md
@@ -15,7 +15,7 @@ The main benefits are lower latency and memory usage.
 You can quantize your own models by installing AutoAWQ or picking one of the [400+ models on Huggingface](https://huggingface.co/models?sort=trending&search=awq).
 
 ```console
-$ pip install autoawq
+pip install autoawq
 ```
 
 After installing AutoAWQ, you are ready to quantize a model. Here is an example of how to quantize `mistralai/Mistral-7B-Instruct-v0.2`:
@@ -47,7 +47,7 @@ print(f'Model is quantized and saved at "{quant_path}"')
 To run an AWQ model with vLLM, you can use [TheBloke/Llama-2-7b-Chat-AWQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-AWQ) with the following command:
 
 ```console
-$ python examples/offline_inference/llm_engine_example.py --model TheBloke/Llama-2-7b-Chat-AWQ --quantization awq
+python examples/offline_inference/llm_engine_example.py --model TheBloke/Llama-2-7b-Chat-AWQ --quantization awq
 ```
 
 AWQ models are also supported directly through the LLM entrypoint:
diff --git a/docs/source/features/quantization/bnb.md b/docs/source/features/quantization/bnb.md
index f7f41726f3725..7525e8e7866c3 100644
--- a/docs/source/features/quantization/bnb.md
+++ b/docs/source/features/quantization/bnb.md
@@ -9,7 +9,7 @@ Compared to other quantization methods, BitsAndBytes eliminates the need for cal
 Below are the steps to utilize BitsAndBytes with vLLM.
 
 ```console
-$ pip install bitsandbytes>=0.45.0
+pip install bitsandbytes>=0.45.0
 ```
 
 vLLM reads the model's config file and supports both in-flight quantization and pre-quantized checkpoint.
@@ -17,7 +17,7 @@ vLLM reads the model's config file and supports both in-flight quantization and
 You can find bitsandbytes quantized models on <https://huggingface.co/models?other=bitsandbytes>.
 And usually, these repositories have a config.json file that includes a quantization_config section.
 
-## Read quantized checkpoint.
+## Read quantized checkpoint
 
 ```python
 from vllm import LLM
@@ -37,10 +37,11 @@ model_id = "huggyllama/llama-7b"
 llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \
 quantization="bitsandbytes", load_format="bitsandbytes")
 ```
+
 ## OpenAI Compatible Server
 
 Append the following to your 4bit model arguments:
 
-```
+```console
 --quantization bitsandbytes --load-format bitsandbytes
 ```
diff --git a/docs/source/features/quantization/fp8.md b/docs/source/features/quantization/fp8.md
index b2eda74fd1e3b..da49cd2747228 100644
--- a/docs/source/features/quantization/fp8.md
+++ b/docs/source/features/quantization/fp8.md
@@ -41,7 +41,7 @@ Currently, we load the model at original precision before quantizing down to 8-b
 To produce performant FP8 quantized models with vLLM, you'll need to install the [llm-compressor](https://github.com/vllm-project/llm-compressor/) library:
 
 ```console
-$ pip install llmcompressor
+pip install llmcompressor
 ```
 
 ## Quantization Process
@@ -98,7 +98,7 @@ tokenizer.save_pretrained(SAVE_DIR)
 Install `vllm` and `lm-evaluation-harness`:
 
 ```console
-$ pip install vllm lm-eval==0.4.4
+pip install vllm lm-eval==0.4.4
 ```
 
 Load and run the model in `vllm`:
diff --git a/docs/source/features/quantization/fp8_e4m3_kvcache.md b/docs/source/features/quantization/fp8_e4m3_kvcache.md
index 50edaf81fddd3..1cd67cb8fd336 100644
--- a/docs/source/features/quantization/fp8_e4m3_kvcache.md
+++ b/docs/source/features/quantization/fp8_e4m3_kvcache.md
@@ -17,7 +17,7 @@ unquantized model through a quantizer tool (e.g. AMD quantizer or NVIDIA AMMO).
 To install AMMO (AlgorithMic Model Optimization):
 
 ```console
-$ pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com nvidia-ammo
+pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com nvidia-ammo
 ```
 
 Studies have shown that FP8 E4M3 quantization typically only minimally degrades inference accuracy. The most recent silicon
diff --git a/docs/source/features/quantization/gguf.md b/docs/source/features/quantization/gguf.md
index eebf11dfc1b2b..640997cf4bc39 100644
--- a/docs/source/features/quantization/gguf.md
+++ b/docs/source/features/quantization/gguf.md
@@ -13,16 +13,16 @@ Currently, vllm only supports loading single-file GGUF models. If you have a mul
 To run a GGUF model with vLLM, you can download and use the local GGUF model from [TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF](https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF) with the following command:
 
 ```console
-$ wget https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf
-$ # We recommend using the tokenizer from base model to avoid long-time and buggy tokenizer conversion.
-$ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0
+wget https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf
+# We recommend using the tokenizer from base model to avoid long-time and buggy tokenizer conversion.
+vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0
 ```
 
 You can also add `--tensor-parallel-size 2` to enable tensor parallelism inference with 2 GPUs:
 
 ```console
-$ # We recommend using the tokenizer from base model to avoid long-time and buggy tokenizer conversion.
-$ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 --tensor-parallel-size 2
+# We recommend using the tokenizer from base model to avoid long-time and buggy tokenizer conversion.
+vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 --tensor-parallel-size 2
 ```
 
 ```{warning}
diff --git a/docs/source/features/quantization/int8.md b/docs/source/features/quantization/int8.md
index 1ac50ba987dda..82a15d76d352f 100644
--- a/docs/source/features/quantization/int8.md
+++ b/docs/source/features/quantization/int8.md
@@ -16,7 +16,7 @@ INT8 computation is supported on NVIDIA GPUs with compute capability > 7.5 (Turi
 To use INT8 quantization with vLLM, you'll need to install the [llm-compressor](https://github.com/vllm-project/llm-compressor/) library:
 
 ```console
-$ pip install llmcompressor
+pip install llmcompressor
 ```
 
 ## Quantization Process
diff --git a/docs/source/features/spec_decode.md b/docs/source/features/spec_decode.md
index 903acadb71426..ab7b2f302bd13 100644
--- a/docs/source/features/spec_decode.md
+++ b/docs/source/features/spec_decode.md
@@ -192,11 +192,11 @@ A few important things to consider when using the EAGLE based draft models:
 
 1. The EAGLE draft models available in the [HF repository for EAGLE models](https://huggingface.co/yuhuili) cannot be
    used directly with vLLM due to differences in the expected layer names and model definition.
-   To use these models with vLLM, use the [following script](https://gist.github.com/abhigoyal1997/1e7a4109ccb7704fbc67f625e86b2d6d) 
+   To use these models with vLLM, use the [following script](https://gist.github.com/abhigoyal1997/1e7a4109ccb7704fbc67f625e86b2d6d)
    to convert them. Note that this script does not modify the model's weights.
 
    In the above example, use the script to first convert
-   the [yuhuili/EAGLE-LLaMA3-Instruct-8B](https://huggingface.co/yuhuili/EAGLE-LLaMA3-Instruct-8B) model 
+   the [yuhuili/EAGLE-LLaMA3-Instruct-8B](https://huggingface.co/yuhuili/EAGLE-LLaMA3-Instruct-8B) model
    and then use the converted checkpoint as the draft model in vLLM.
 
 2. The EAGLE based draft models need to be run without tensor parallelism
@@ -207,7 +207,6 @@ A few important things to consider when using the EAGLE based draft models:
    reported in the reference implementation [here](https://github.com/SafeAILab/EAGLE). This issue is under
    investigation and tracked here: [https://github.com/vllm-project/vllm/issues/9565](https://github.com/vllm-project/vllm/issues/9565).
 
-
 A variety of EAGLE draft models are available on the Hugging Face hub:
 
 | Base Model                                                           | EAGLE on Hugging Face                     | # EAGLE Parameters |
@@ -224,7 +223,6 @@ A variety of EAGLE draft models are available on the Hugging Face hub:
 | Qwen2-7B-Instruct                                                    | yuhuili/EAGLE-Qwen2-7B-Instruct          | 0.26B              |
 | Qwen2-72B-Instruct                                                   | yuhuili/EAGLE-Qwen2-72B-Instruct         | 1.05B              |
 
-
 ## Lossless guarantees of Speculative Decoding
 
 In vLLM, speculative decoding aims to enhance inference efficiency while maintaining accuracy. This section addresses the lossless guarantees of
@@ -250,8 +248,6 @@ speculative decoding, breaking down the guarantees into three key areas:
    same request across runs. For more details, see the FAQ section
    titled *Can the output of a prompt vary across runs in vLLM?* in the [FAQs](#faq).
 
-**Conclusion**
-
 While vLLM strives to ensure losslessness in speculative decoding, variations in generated outputs with and without speculative decoding
 can occur due to following factors:
 
@@ -259,8 +255,6 @@ can occur due to following factors:
 - **Batch Size and Numerical Stability**: Changes in batch size may cause variations in logprobs and output probabilities, potentially
   due to non-deterministic behavior in batched operations or numerical instability.
 
-**Mitigation Strategies**
-
 For mitigation strategies, please refer to the FAQ entry *Can the output of a prompt vary across runs in vLLM?* in the [FAQs](#faq).
 
 ## Resources for vLLM contributors
diff --git a/docs/source/features/tool_calling.md b/docs/source/features/tool_calling.md
index 062f2021eb62a..027ddb6d5eda3 100644
--- a/docs/source/features/tool_calling.md
+++ b/docs/source/features/tool_calling.md
@@ -55,21 +55,24 @@ print(f"Result: {get_weather(**json.loads(tool_call.arguments))}")
 ```
 
 Example output:
-```
+
+```text
 Function called: get_weather
 Arguments: {"location": "San Francisco, CA", "unit": "fahrenheit"}
 Result: Getting the weather for San Francisco, CA in fahrenheit...
 ```
 
 This example demonstrates:
-- Setting up the server with tool calling enabled
-- Defining an actual function to handle tool calls
-- Making a request with `tool_choice="auto"`
-- Handling the structured response and executing the corresponding function
+
+* Setting up the server with tool calling enabled
+* Defining an actual function to handle tool calls
+* Making a request with `tool_choice="auto"`
+* Handling the structured response and executing the corresponding function
 
 You can also specify a particular function using named function calling by setting `tool_choice={"type": "function", "function": {"name": "get_weather"}}`. Note that this will use the guided decoding backend - so the first time this is used, there will be several seconds of latency (or more) as the FSM is compiled for the first time before it is cached for subsequent requests.
 
 Remember that it's the callers responsibility to:
+
 1. Define appropriate tools in the request
 2. Include relevant context in the chat messages
 3. Handle the tool calls in your application logic
@@ -77,20 +80,21 @@ Remember that it's the callers responsibility to:
 For more advanced usage, including parallel tool calls and different model-specific parsers, see the sections below.
 
 ## Named Function Calling
+
 vLLM supports named function calling in the chat completion API by default. It does so using Outlines through guided decoding, so this is
 enabled by default, and will work with any supported model. You are guaranteed a validly-parsable function call - not a
 high-quality one.
 
-vLLM will use guided decoding to ensure the response matches the tool parameter object defined by the JSON schema in the `tools` parameter. 
+vLLM will use guided decoding to ensure the response matches the tool parameter object defined by the JSON schema in the `tools` parameter.
 For best results, we recommend ensuring that the expected output format / schema is specified in the prompt to ensure that the model's intended generation is aligned with the schema that it's being forced to generate by the guided decoding backend.
 
 To use a named function, you need to define the functions in the `tools` parameter of the chat completion request, and
 specify the `name` of one of the tools in the `tool_choice` parameter of the chat completion request.
 
-
 ## Automatic Function Calling
 
 To enable this feature, you should set the following flags:
+
 * `--enable-auto-tool-choice` -- **mandatory** Auto tool choice. tells vLLM that you want to enable the model to generate its own tool calls when it
 deems appropriate.
 * `--tool-call-parser` -- select the tool parser to use (listed below). Additional tool parsers
@@ -104,28 +108,28 @@ from HuggingFace; and you can find an example of this in a `tokenizer_config.jso
 
 If your favorite tool-calling model is not supported, please feel free to contribute a parser & tool use chat template!
 
-
 ### Hermes Models (`hermes`)
 
 All Nous Research Hermes-series models newer than Hermes 2 Pro should be supported.
+
 * `NousResearch/Hermes-2-Pro-*`
 * `NousResearch/Hermes-2-Theta-*`
 * `NousResearch/Hermes-3-*`
 
-
 _Note that the Hermes 2 **Theta** models are known to have degraded tool call quality & capabilities due to the merge
 step in their creation_.
 
 Flags: `--tool-call-parser hermes`
 
-
 ### Mistral Models (`mistral`)
 
 Supported models:
+
 * `mistralai/Mistral-7B-Instruct-v0.3` (confirmed)
 * Additional mistral function-calling models are compatible as well.
 
 Known issues:
+
 1. Mistral 7B struggles to generate parallel tool calls correctly.
 2. Mistral's `tokenizer_config.json` chat template requires tool call IDs that are exactly 9 digits, which is
 much shorter than what vLLM generates. Since an exception is thrown when this condition
@@ -136,13 +140,12 @@ it works with vLLM's tool call IDs (provided `tool_call_id` fields are truncated
 * `examples/tool_chat_template_mistral_parallel.jinja` - this is a "better" version that adds a tool-use system prompt
 when tools are provided, that results in much better reliability when working with parallel tool calling.
 
-
 Recommended flags: `--tool-call-parser mistral --chat-template examples/tool_chat_template_mistral_parallel.jinja`
 
-
 ### Llama Models (`llama3_json`)
 
 Supported models:
+
 * `meta-llama/Meta-Llama-3.1-8B-Instruct`
 * `meta-llama/Meta-Llama-3.1-70B-Instruct`
 * `meta-llama/Meta-Llama-3.1-405B-Instruct`
@@ -152,6 +155,7 @@ The tool calling that is supported is the [JSON based tool calling](https://llam
 Other tool calling formats like the built in python tool calling or custom tool calling are not supported.
 
 Known issues:
+
 1. Parallel tool calls are not supported.
 2. The model can generate parameters with a wrong format, such as generating
    an array serialized as string instead of an array.
@@ -164,6 +168,7 @@ Recommended flags: `--tool-call-parser llama3_json --chat-template examples/tool
 #### IBM Granite
 
 Supported models:
+
 * `ibm-granite/granite-3.0-8b-instruct`
 
 Recommended flags: `--tool-call-parser granite --chat-template examples/tool_chat_template_granite.jinja`
@@ -182,42 +187,45 @@ Recommended flags: `--tool-call-parser granite-20b-fc --chat-template examples/t
 
 `examples/tool_chat_template_granite_20b_fc.jinja`: this is a modified chat template from the original on Huggingface, which is not vLLM compatible. It blends function description elements from the Hermes template and follows the same system prompt as "Response Generation" mode from [the paper](https://arxiv.org/abs/2407.00121). Parallel function calls are supported.
 
-
 ### InternLM Models (`internlm`)
 
 Supported models:
+
 * `internlm/internlm2_5-7b-chat` (confirmed)
 * Additional internlm2.5 function-calling models are compatible as well
 
 Known issues:
+
 * Although this implementation also supports InternLM2, the tool call results are not stable when testing with the `internlm/internlm2-chat-7b` model.
 
 Recommended flags: `--tool-call-parser internlm --chat-template examples/tool_chat_template_internlm2_tool.jinja`
 
-
 ### Jamba Models (`jamba`)
+
 AI21's Jamba-1.5 models are supported.
+
 * `ai21labs/AI21-Jamba-1.5-Mini`
 * `ai21labs/AI21-Jamba-1.5-Large`
 
-
 Flags: `--tool-call-parser jamba`
 
-
 ### Models with Pythonic Tool Calls (`pythonic`)
 
 A growing number of models output a python list to represent tool calls instead of using JSON. This has the advantage of inherently supporting parallel tool calls and removing ambiguity around the JSON schema required for tool calls. The `pythonic` tool parser can support such models.
 
 As a concrete example, these models may look up the weather in San Francisco and Seattle by generating:
+
 ```python
 [get_weather(city='San Francisco', metric='celsius'), get_weather(city='Seattle', metric='celsius')]
 ```
 
 Limitations:
+
 * The model must not generate both text and tool calls in the same generation. This may not be hard to change for a specific model, but the community currently lacks consensus on which tokens to emit when starting and ending tool calls.  (In particular, the Llama 3.2 models emit no such tokens.)
 * Llama's smaller models struggle to use tools effectively.
 
 Example supported models:
+
 * `meta-llama/Llama-3.2-1B-Instruct`\* (use with `examples/tool_chat_template_llama3.2_pythonic.jinja`)
 * `meta-llama/Llama-3.2-3B-Instruct`\* (use with `examples/tool_chat_template_llama3.2_pythonic.jinja`)
 * `Team-ACE/ToolACE-8B` (use with `examples/tool_chat_template_toolace.jinja`)
@@ -231,7 +239,6 @@ Llama's smaller models frequently fail to emit tool calls in the correct format.
 
 ---
 
-
 ## How to write a tool parser plugin
 
 A tool parser plugin is a Python file containing one or more ToolParser implementations. You can write a ToolParser similar to the `Hermes2ProToolParser` in vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py.
@@ -284,7 +291,8 @@ class ExampleToolParser(ToolParser):
 ```
 
 Then you can use this plugin in the command line like this.
-```
+
+```console
     --enable-auto-tool-choice \
     --tool-parser-plugin <absolute path of the plugin file>
     --tool-call-parser example \
diff --git a/docs/source/getting_started/faq.md b/docs/source/getting_started/faq.md
index fde2954f10c59..4751b325e6fc4 100644
--- a/docs/source/getting_started/faq.md
+++ b/docs/source/getting_started/faq.md
@@ -30,7 +30,7 @@ changes in batch size, or batch expansion in speculative decoding. These batchin
 can lead to slightly different logit/logprob values at each step. Such differences can accumulate, potentially resulting in
 different tokens being sampled. Once a different token is sampled, further divergence is likely.
 
-**Mitigation Strategies**
+## Mitigation Strategies
 
 - For improved stability and reduced variance, use `float32`. Note that this will require more memory.
 - If using `bfloat16`, switching to `float16` can also help.
diff --git a/docs/source/getting_started/installation/cpu-apple.md b/docs/source/getting_started/installation/cpu-apple.md
index b55e4384d064d..1068893f5bafa 100644
--- a/docs/source/getting_started/installation/cpu-apple.md
+++ b/docs/source/getting_started/installation/cpu-apple.md
@@ -18,25 +18,23 @@ Currently the CPU implementation for macOS supports FP32 and FP16 datatypes.
 
 After installation of XCode and the Command Line Tools, which include Apple Clang, execute the following commands to build and install vLLM from the source.
 
-```
-$ git clone https://github.com/vllm-project/vllm.git
-$ cd vllm
-$ pip install -r requirements-cpu.txt
-$ pip install -e . 
+```console
+git clone https://github.com/vllm-project/vllm.git
+cd vllm
+pip install -r requirements-cpu.txt
+pip install -e . 
 ```
 
 ```{note}
 On macOS the `VLLM_TARGET_DEVICE` is automatically set to `cpu`, which currently is the only supported device.
 ```
 
-
-
 ## Troubleshooting
 
-If the build has error like the following snippet where standard C++ headers cannot be found, try to remove and reinstall your 
+If the build has error like the following snippet where standard C++ headers cannot be found, try to remove and reinstall your
 [Command Line Tools for Xcode](https://developer.apple.com/download/all/).
 
-```
+```text
 [...] fatal error: 'map' file not found
           1 | #include <map>
             |          ^~~~~
@@ -48,4 +46,3 @@ If the build has error like the following snippet where standard C++ headers can
             |          ^~~~~~~~~
       1 error generated.
 ```
-
diff --git a/docs/source/getting_started/installation/cpu-x86.md b/docs/source/getting_started/installation/cpu-x86.md
index 26bdcd93ad190..c49c8e0f2a18c 100644
--- a/docs/source/getting_started/installation/cpu-x86.md
+++ b/docs/source/getting_started/installation/cpu-x86.md
@@ -32,13 +32,13 @@ Table of contents:
 ## Quick start using Dockerfile
 
 ```console
-$ docker build -f Dockerfile.cpu -t vllm-cpu-env --shm-size=4g .
-$ docker run -it \
-             --rm \
-             --network=host \
-             --cpuset-cpus=<cpu-id-list, optional> \
-             --cpuset-mems=<memory-node, optional> \
-             vllm-cpu-env
+docker build -f Dockerfile.cpu -t vllm-cpu-env --shm-size=4g .
+docker run -it \
+           --rm \
+           --network=host \
+           --cpuset-cpus=<cpu-id-list, optional> \
+           --cpuset-mems=<memory-node, optional> \
+           vllm-cpu-env
 ```
 
 (build-cpu-backend-from-source)=
@@ -48,23 +48,23 @@ $ docker run -it \
 - First, install recommended compiler. We recommend to use `gcc/g++ >= 12.3.0` as the default compiler to avoid potential problems. For example, on Ubuntu 22.4, you can run:
 
 ```console
-$ sudo apt-get update  -y
-$ sudo apt-get install -y gcc-12 g++-12 libnuma-dev
-$ sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
+sudo apt-get update  -y
+sudo apt-get install -y gcc-12 g++-12 libnuma-dev
+sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
 ```
 
 - Second, install Python packages for vLLM CPU backend building:
 
 ```console
-$ pip install --upgrade pip
-$ pip install cmake>=3.26 wheel packaging ninja "setuptools-scm>=8" numpy
-$ pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
+pip install --upgrade pip
+pip install cmake>=3.26 wheel packaging ninja "setuptools-scm>=8" numpy
+pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
 ```
 
 - Finally, build and install vLLM CPU backend:
 
 ```console
-$ VLLM_TARGET_DEVICE=cpu python setup.py install
+VLLM_TARGET_DEVICE=cpu python setup.py install
 ```
 
 ```{note}
@@ -92,18 +92,18 @@ $ VLLM_TARGET_DEVICE=cpu python setup.py install
 - We highly recommend to use TCMalloc for high performance memory allocation and better cache locality. For example, on Ubuntu 22.4, you can run:
 
 ```console
-$ sudo apt-get install libtcmalloc-minimal4 # install TCMalloc library
-$ find / -name *libtcmalloc* # find the dynamic link library path
-$ export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD # prepend the library to LD_PRELOAD
-$ python examples/offline_inference/basic.py # run vLLM
+sudo apt-get install libtcmalloc-minimal4 # install TCMalloc library
+find / -name *libtcmalloc* # find the dynamic link library path
+export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD # prepend the library to LD_PRELOAD
+python examples/offline_inference/basic.py # run vLLM
 ```
 
 - When using the online serving, it is recommended to reserve 1-2 CPU cores for the serving framework to avoid CPU oversubscription. For example, on a platform with 32 physical CPU cores, reserving CPU 30 and 31 for the framework and using CPU 0-29 for OpenMP:
 
 ```console
-$ export VLLM_CPU_KVCACHE_SPACE=40
-$ export VLLM_CPU_OMP_THREADS_BIND=0-29
-$ vllm serve facebook/opt-125m
+export VLLM_CPU_KVCACHE_SPACE=40
+export VLLM_CPU_OMP_THREADS_BIND=0-29
+vllm serve facebook/opt-125m
 ```
 
 - If using vLLM CPU backend on a machine with hyper-threading, it is recommended to bind only one OpenMP thread on each physical CPU core using `VLLM_CPU_OMP_THREADS_BIND`. On a hyper-threading enabled platform with 16 logical CPU cores / 8 physical CPU cores:
@@ -148,7 +148,7 @@ $ python examples/offline_inference/basic.py
   - Using Tensor Parallel for a latency constraints deployment: following GPU backend design, a Megatron-LM's parallel algorithm will be used to shard the model, based on the number of NUMA nodes (e.g. TP = 2 for a two NUMA node system). With [TP feature on CPU](gh-pr:6125) merged, Tensor Parallel is supported for serving and offline inferencing. In general each NUMA node is treated as one GPU card. Below is the example script to enable Tensor Parallel = 2 for serving:
 
     ```console
-    $ VLLM_CPU_KVCACHE_SPACE=40 VLLM_CPU_OMP_THREADS_BIND="0-31|32-63" vllm serve meta-llama/Llama-2-7b-chat-hf -tp=2 --distributed-executor-backend mp
+    VLLM_CPU_KVCACHE_SPACE=40 VLLM_CPU_OMP_THREADS_BIND="0-31|32-63" vllm serve meta-llama/Llama-2-7b-chat-hf -tp=2 --distributed-executor-backend mp
     ```
 
   - Using Data Parallel for maximum throughput: to launch an LLM serving endpoint on each NUMA node along with one additional load balancer to dispatch the requests to those endpoints. Common solutions like [Nginx](#nginxloadbalancer) or HAProxy are recommended. Anyscale Ray project provides the feature on LLM [serving](https://docs.ray.io/en/latest/serve/index.html). Here is the example to setup a scalable LLM serving with [Ray Serve](https://github.com/intel/llm-on-ray/blob/main/docs/setup.md).
diff --git a/docs/source/getting_started/installation/gpu-cuda.md b/docs/source/getting_started/installation/gpu-cuda.md
index 419b8163fc034..727486abbd10f 100644
--- a/docs/source/getting_started/installation/gpu-cuda.md
+++ b/docs/source/getting_started/installation/gpu-cuda.md
@@ -17,9 +17,9 @@ vLLM is a Python library that also contains pre-compiled C++ and CUDA (12.1) bin
 You can create a new Python environment using `conda`:
 
 ```console
-$ # (Recommended) Create a new conda environment.
-$ conda create -n myenv python=3.12 -y
-$ conda activate myenv
+# (Recommended) Create a new conda environment.
+conda create -n myenv python=3.12 -y
+conda activate myenv
 ```
 
 ```{note}
@@ -29,9 +29,9 @@ $ conda activate myenv
 Or you can create a new Python environment using [uv](https://docs.astral.sh/uv/), a very fast Python environment manager. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment using the following command:
 
 ```console
-$ # (Recommended) Create a new uv environment. Use `--seed` to install `pip` and `setuptools` in the environment.
-$ uv venv myenv --python 3.12 --seed
-$ source myenv/bin/activate
+# (Recommended) Create a new uv environment. Use `--seed` to install `pip` and `setuptools` in the environment.
+uv venv myenv --python 3.12 --seed
+source myenv/bin/activate
 ```
 
 In order to be performant, vLLM has to compile many cuda kernels. The compilation unfortunately introduces binary incompatibility with other CUDA versions and PyTorch versions, even for the same PyTorch version with different building configurations.
@@ -43,18 +43,18 @@ Therefore, it is recommended to install vLLM with a **fresh new** environment. I
 You can install vLLM using either `pip` or `uv pip`:
 
 ```console
-$ # Install vLLM with CUDA 12.1.
-$ pip install vllm # If you are using pip.
-$ uv pip install vllm # If you are using uv.
+# Install vLLM with CUDA 12.1.
+pip install vllm # If you are using pip.
+uv pip install vllm # If you are using uv.
 ```
 
 As of now, vLLM's binaries are compiled with CUDA 12.1 and public PyTorch release versions by default. We also provide vLLM binaries compiled with CUDA 11.8 and public PyTorch release versions:
 
 ```console
-$ # Install vLLM with CUDA 11.8.
-$ export VLLM_VERSION=0.6.1.post1
-$ export PYTHON_VERSION=310
-$ pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118
+# Install vLLM with CUDA 11.8.
+export VLLM_VERSION=0.6.1.post1
+export PYTHON_VERSION=310
+pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118
 ```
 
 (install-the-latest-code)=
@@ -66,7 +66,7 @@ LLM inference is a fast-evolving field, and the latest code may contain bug fixe
 ### Install the latest code using `pip`
 
 ```console
-$ pip install vllm --pre --extra-index-url https://wheels.vllm.ai/nightly
+pip install vllm --pre --extra-index-url https://wheels.vllm.ai/nightly
 ```
 
 `--pre` is required for `pip` to consider pre-released versions.
@@ -74,8 +74,8 @@ $ pip install vllm --pre --extra-index-url https://wheels.vllm.ai/nightly
 If you want to access the wheels for previous commits (e.g. to bisect the behavior change, performance regression), due to the limitation of `pip`, you have to specify the full URL of the wheel file by embedding the commit hash in the URL:
 
 ```console
-$ export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch
-$ pip install https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
+export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch
+pip install https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
 ```
 
 Note that the wheels are built with Python 3.8 ABI (see [PEP 425](https://peps.python.org/pep-0425/) for more details about ABI), so **they are compatible with Python 3.8 and later**. The version string in the wheel file name (`1.0.0.dev`) is just a placeholder to have a unified URL for the wheels, the actual versions of wheels are contained in the wheel metadata (the wheels listed in the extra index url have correct versions). Although we don't support Python 3.8 any more (because PyTorch 2.5 dropped support for Python 3.8), the wheels are still built with Python 3.8 ABI to keep the same wheel name as before.
@@ -85,14 +85,14 @@ Note that the wheels are built with Python 3.8 ABI (see [PEP 425](https://peps.p
 Another way to install the latest code is to use `uv`:
 
 ```console
-$ uv pip install vllm --extra-index-url https://wheels.vllm.ai/nightly
+uv pip install vllm --extra-index-url https://wheels.vllm.ai/nightly
 ```
 
 If you want to access the wheels for previous commits (e.g. to bisect the behavior change, performance regression), you can specify the commit hash in the URL:
 
 ```console
-$ export VLLM_COMMIT=72d9c316d3f6ede485146fe5aabd4e61dbc59069 # use full commit hash from the main branch
-$ uv pip install vllm --extra-index-url https://wheels.vllm.ai/${VLLM_COMMIT}
+export VLLM_COMMIT=72d9c316d3f6ede485146fe5aabd4e61dbc59069 # use full commit hash from the main branch
+uv pip install vllm --extra-index-url https://wheels.vllm.ai/${VLLM_COMMIT}
 ```
 
 The `uv` approach works for vLLM `v0.6.6` and later and offers an easy-to-remember command. A unique feature of `uv` is that packages in `--extra-index-url` have [higher priority than the default index](https://docs.astral.sh/uv/pip/compatibility/#packages-that-exist-on-multiple-indexes). If the latest public release is `v0.6.6.post1`, `uv`'s behavior allows installing a commit before `v0.6.6.post1` by specifying the `--extra-index-url`. In contrast, `pip` combines packages from `--extra-index-url` and the default index, choosing only the latest version, which makes it difficult to install a development version prior to the released version.
@@ -102,8 +102,8 @@ The `uv` approach works for vLLM `v0.6.6` and later and offers an easy-to-rememb
 Another way to access the latest code is to use the docker images:
 
 ```console
-$ export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch
-$ docker pull public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:${VLLM_COMMIT}
+export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch
+docker pull public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:${VLLM_COMMIT}
 ```
 
 These docker images are used for CI and testing only, and they are not intended for production use. They will be expired after several days.
@@ -121,18 +121,18 @@ The latest code can contain bugs and may not be stable. Please use it with cauti
 If you only need to change Python code, you can build and install vLLM without compilation. Using `pip`'s [`--editable` flag](https://pip.pypa.io/en/stable/topics/local-project-installs/#editable-installs), changes you make to the code will be reflected when you run vLLM:
 
 ```console
-$ git clone https://github.com/vllm-project/vllm.git
-$ cd vllm
-$ VLLM_USE_PRECOMPILED=1 pip install --editable .
+git clone https://github.com/vllm-project/vllm.git
+cd vllm
+VLLM_USE_PRECOMPILED=1 pip install --editable .
 ```
 
-This will download the latest nightly wheel from https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl and use the compiled libraries from there in the installation.
+This will download the [latest nightly wheel](https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl) and use the compiled libraries from there in the installation.
 
 The `VLLM_PRECOMPILED_WHEEL_LOCATION` environment variable can be used instead of `VLLM_USE_PRECOMPILED` to specify a custom path or URL to the wheel file. For example, to use the [0.6.1.post1 PyPi wheel](https://pypi.org/project/vllm/#files):
 
 ```console
-$ export VLLM_PRECOMPILED_WHEEL_LOCATION=https://files.pythonhosted.org/packages/4a/4c/ee65ba33467a4c0de350ce29fbae39b9d0e7fcd887cc756fa993654d1228/vllm-0.6.3.post1-cp38-abi3-manylinux1_x86_64.whl
-$ pip install --editable .
+export VLLM_PRECOMPILED_WHEEL_LOCATION=https://files.pythonhosted.org/packages/4a/4c/ee65ba33467a4c0de350ce29fbae39b9d0e7fcd887cc756fa993654d1228/vllm-0.6.3.post1-cp38-abi3-manylinux1_x86_64.whl
+pip install --editable .
 ```
 
 You can find more information about vLLM's wheels [above](#install-the-latest-code).
@@ -147,9 +147,9 @@ It is recommended to use the same commit ID for the source code as the vLLM whee
 If you want to modify C++ or CUDA code, you'll need to build vLLM from source. This can take several minutes:
 
 ```console
-$ git clone https://github.com/vllm-project/vllm.git
-$ cd vllm
-$ pip install -e .
+git clone https://github.com/vllm-project/vllm.git
+cd vllm
+pip install -e .
 ```
 
 ```{tip}
@@ -172,11 +172,11 @@ There are scenarios where the PyTorch dependency cannot be easily installed via
 To build vLLM using an existing PyTorch installation:
 
 ```console
-$ git clone https://github.com/vllm-project/vllm.git
-$ cd vllm
-$ python use_existing_torch.py
-$ pip install -r requirements-build.txt
-$ pip install -e . --no-build-isolation
+git clone https://github.com/vllm-project/vllm.git
+cd vllm
+python use_existing_torch.py
+pip install -r requirements-build.txt
+pip install -e . --no-build-isolation
 ```
 
 #### Use the local cutlass for compilation
@@ -185,9 +185,9 @@ Currently, before starting the build process, vLLM fetches cutlass code from Git
 To achieve this, you can set the environment variable VLLM_CUTLASS_SRC_DIR to point to your local cutlass directory.
 
 ```console
-$ git clone https://github.com/vllm-project/vllm.git
-$ cd vllm
-$ VLLM_CUTLASS_SRC_DIR=/path/to/cutlass pip install -e .
+git clone https://github.com/vllm-project/vllm.git
+cd vllm
+VLLM_CUTLASS_SRC_DIR=/path/to/cutlass pip install -e .
 ```
 
 #### Troubleshooting
@@ -196,8 +196,8 @@ To avoid your system being overloaded, you can limit the number of compilation j
 to be run simultaneously, via the environment variable `MAX_JOBS`. For example:
 
 ```console
-$ export MAX_JOBS=6
-$ pip install -e .
+export MAX_JOBS=6
+pip install -e .
 ```
 
 This is especially useful when you are building on less powerful machines. For example, when you use WSL it only [assigns 50% of the total memory by default](https://learn.microsoft.com/en-us/windows/wsl/wsl-config#main-wsl-settings), so using `export MAX_JOBS=1` can avoid compiling multiple files simultaneously and running out of memory.
@@ -206,22 +206,22 @@ A side effect is a much slower build process.
 Additionally, if you have trouble building vLLM, we recommend using the NVIDIA PyTorch Docker image.
 
 ```console
-$ # Use `--ipc=host` to make sure the shared memory is large enough.
-$ docker run --gpus all -it --rm --ipc=host nvcr.io/nvidia/pytorch:23.10-py3
+# Use `--ipc=host` to make sure the shared memory is large enough.
+docker run --gpus all -it --rm --ipc=host nvcr.io/nvidia/pytorch:23.10-py3
 ```
 
 If you don't want to use docker, it is recommended to have a full installation of CUDA Toolkit. You can download and install it from [the official website](https://developer.nvidia.com/cuda-toolkit-archive). After installation, set the environment variable `CUDA_HOME` to the installation path of CUDA Toolkit, and make sure that the `nvcc` compiler is in your `PATH`, e.g.:
 
 ```console
-$ export CUDA_HOME=/usr/local/cuda
-$ export PATH="${CUDA_HOME}/bin:$PATH"
+export CUDA_HOME=/usr/local/cuda
+export PATH="${CUDA_HOME}/bin:$PATH"
 ```
 
 Here is a sanity check to verify that the CUDA Toolkit is correctly installed:
 
 ```console
-$ nvcc --version # verify that nvcc is in your PATH
-$ ${CUDA_HOME}/bin/nvcc --version # verify that nvcc is in your CUDA_HOME
+nvcc --version # verify that nvcc is in your PATH
+${CUDA_HOME}/bin/nvcc --version # verify that nvcc is in your CUDA_HOME
 ```
 
 ### Unsupported OS build
@@ -231,6 +231,6 @@ vLLM can fully run only on Linux but for development purposes, you can still bui
 Simply disable the `VLLM_TARGET_DEVICE` environment variable before installing:
 
 ```console
-$ export VLLM_TARGET_DEVICE=empty
-$ pip install -e .
+export VLLM_TARGET_DEVICE=empty
+pip install -e .
 ```
diff --git a/docs/source/getting_started/installation/gpu-rocm.md b/docs/source/getting_started/installation/gpu-rocm.md
index e36b92513e31d..a8971bb96248c 100644
--- a/docs/source/getting_started/installation/gpu-rocm.md
+++ b/docs/source/getting_started/installation/gpu-rocm.md
@@ -47,13 +47,13 @@ Their values can be passed in when running `docker build` with `--build-arg` opt
 To build vllm on ROCm 6.2 for MI200 and MI300 series, you can use the default:
 
 ```console
-$ DOCKER_BUILDKIT=1 docker build -f Dockerfile.rocm -t vllm-rocm .
+DOCKER_BUILDKIT=1 docker build -f Dockerfile.rocm -t vllm-rocm .
 ```
 
 To build vllm on ROCm 6.2 for Radeon RX7900 series (gfx1100), you should specify `BUILD_FA` as below:
 
 ```console
-$ DOCKER_BUILDKIT=1 docker build --build-arg BUILD_FA="0" -f Dockerfile.rocm -t vllm-rocm .
+DOCKER_BUILDKIT=1 docker build --build-arg BUILD_FA="0" -f Dockerfile.rocm -t vllm-rocm .
 ```
 
 To run the above docker image `vllm-rocm`, use the below command:
@@ -83,81 +83,81 @@ Where the `<path/to/model>` is the location where the model is stored, for examp
 - [ROCm](https://rocm.docs.amd.com/en/latest/deploy/linux/index.html)
 - [PyTorch](https://pytorch.org/)
 
-For installing PyTorch, you can start from a fresh docker image, e.g, `rocm/pytorch:rocm6.2_ubuntu20.04_py3.9_pytorch_release_2.3.0`, `rocm/pytorch-nightly`.
+    For installing PyTorch, you can start from a fresh docker image, e.g, `rocm/pytorch:rocm6.2_ubuntu20.04_py3.9_pytorch_release_2.3.0`, `rocm/pytorch-nightly`.
 
-Alternatively, you can install PyTorch using PyTorch wheels. You can check PyTorch installation guide in PyTorch [Getting Started](https://pytorch.org/get-started/locally/)
+    Alternatively, you can install PyTorch using PyTorch wheels. You can check PyTorch installation guide in PyTorch [Getting Started](https://pytorch.org/get-started/locally/)
 
 1. Install [Triton flash attention for ROCm](https://github.com/ROCm/triton)
 
-Install ROCm's Triton flash attention (the default triton-mlir branch) following the instructions from [ROCm/triton](https://github.com/ROCm/triton/blob/triton-mlir/README.md)
+    Install ROCm's Triton flash attention (the default triton-mlir branch) following the instructions from [ROCm/triton](https://github.com/ROCm/triton/blob/triton-mlir/README.md)
 
-```console
-$ python3 -m pip install ninja cmake wheel pybind11
-$ pip uninstall -y triton
-$ git clone https://github.com/OpenAI/triton.git
-$ cd triton
-$ git checkout e192dba
-$ cd python
-$ pip3 install .
-$ cd ../..
-```
+    ```console
+    python3 -m pip install ninja cmake wheel pybind11
+    pip uninstall -y triton
+    git clone https://github.com/OpenAI/triton.git
+    cd triton
+    git checkout e192dba
+    cd python
+    pip3 install .
+    cd ../..
+    ```
 
-```{note}
-- If you see HTTP issue related to downloading packages during building triton, please try again as the HTTP error is intermittent.
-```
+    ```{note}
+    - If you see HTTP issue related to downloading packages during building triton, please try again as the HTTP error is intermittent.
+    ```
 
 2. Optionally, if you choose to use CK flash attention, you can install [flash attention for ROCm](https://github.com/ROCm/flash-attention/tree/ck_tile)
 
-Install ROCm's flash attention (v2.5.9.post1) following the instructions from [ROCm/flash-attention](https://github.com/ROCm/flash-attention/tree/ck_tile#amd-gpurocm-support)
-Alternatively, wheels intended for vLLM use can be accessed under the releases.
+    Install ROCm's flash attention (v2.5.9.post1) following the instructions from [ROCm/flash-attention](https://github.com/ROCm/flash-attention/tree/ck_tile#amd-gpurocm-support)
+    Alternatively, wheels intended for vLLM use can be accessed under the releases.
 
-For example, for ROCm 6.2, suppose your gfx arch is `gfx90a`. To get your gfx architecture, run `rocminfo |grep gfx`.
+    For example, for ROCm 6.2, suppose your gfx arch is `gfx90a`. To get your gfx architecture, run `rocminfo |grep gfx`.
 
-```console
-$ git clone https://github.com/ROCm/flash-attention.git
-$ cd flash-attention
-$ git checkout 3cea2fb
-$ git submodule update --init
-$ GPU_ARCHS="gfx90a" python3 setup.py install
-$ cd ..
-```
+    ```console
+    git clone https://github.com/ROCm/flash-attention.git
+    cd flash-attention
+    git checkout 3cea2fb
+    git submodule update --init
+    GPU_ARCHS="gfx90a" python3 setup.py install
+    cd ..
+    ```
 
-```{note}
-- You might need to downgrade the "ninja" version to 1.10 it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`)
-```
+    ```{note}
+    - You might need to downgrade the "ninja" version to 1.10 it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`)
+    ```
 
 3. Build vLLM. For example, vLLM on ROCM 6.2 can be built with the following steps:
 
-```bash
-$ pip install --upgrade pip
+    ```bash
+    $ pip install --upgrade pip
 
-# Install PyTorch
-$ pip uninstall torch -y
-$ pip install --no-cache-dir --pre torch==2.6.0.dev20241024 --index-url https://download.pytorch.org/whl/nightly/rocm6.2
+    # Install PyTorch
+    $ pip uninstall torch -y
+    $ pip install --no-cache-dir --pre torch==2.6.0.dev20241024 --index-url https://download.pytorch.org/whl/nightly/rocm6.2
 
-# Build & install AMD SMI
-$ pip install /opt/rocm/share/amd_smi
+    # Build & install AMD SMI
+    $ pip install /opt/rocm/share/amd_smi
 
-# Install dependencies
-$ pip install --upgrade numba scipy huggingface-hub[cli]
-$ pip install "numpy<2"
-$ pip install -r requirements-rocm.txt
+    # Install dependencies
+    $ pip install --upgrade numba scipy huggingface-hub[cli]
+    $ pip install "numpy<2"
+    $ pip install -r requirements-rocm.txt
 
-# Build vLLM for MI210/MI250/MI300.
-$ export PYTORCH_ROCM_ARCH="gfx90a;gfx942"
-$ python3 setup.py develop
-```
+    # Build vLLM for MI210/MI250/MI300.
+    $ export PYTORCH_ROCM_ARCH="gfx90a;gfx942"
+    $ python3 setup.py develop
+    ```
 
-This may take 5-10 minutes. Currently, `pip install .` does not work for ROCm installation.
+    This may take 5-10 minutes. Currently, `pip install .` does not work for ROCm installation.
 
-```{tip}
-- Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers.
-- Triton flash attention does not currently support sliding window attention. If using half precision, please use CK flash-attention for sliding window support.
-- To use CK flash-attention or PyTorch naive attention, please use this flag `export VLLM_USE_TRITON_FLASH_ATTN=0` to turn off triton flash attention.
-- The ROCm version of PyTorch, ideally, should match the ROCm driver version.
-```
+    ```{tip}
+    - Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers.
+    - Triton flash attention does not currently support sliding window attention. If using half precision, please use CK flash-attention for sliding window support.
+    - To use CK flash-attention or PyTorch naive attention, please use this flag `export VLLM_USE_TRITON_FLASH_ATTN=0` to turn off triton flash attention.
+    - The ROCm version of PyTorch, ideally, should match the ROCm driver version.
+    ```
 
-```{tip}
-- For MI300x (gfx942) users, to achieve optimal performance, please refer to [MI300x tuning guide](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/index.html) for performance optimization and tuning tips on system and workflow level.
-  For vLLM, please refer to [vLLM performance optimization](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/workload.html#vllm-performance-optimization).
-```
+    ```{tip}
+    - For MI300x (gfx942) users, to achieve optimal performance, please refer to [MI300x tuning guide](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/index.html) for performance optimization and tuning tips on system and workflow level.
+      For vLLM, please refer to [vLLM performance optimization](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/workload.html#vllm-performance-optimization).
+    ```
diff --git a/docs/source/getting_started/installation/hpu-gaudi.md b/docs/source/getting_started/installation/hpu-gaudi.md
index 21822327c8822..a829b1c9ff996 100644
--- a/docs/source/getting_started/installation/hpu-gaudi.md
+++ b/docs/source/getting_started/installation/hpu-gaudi.md
@@ -22,8 +22,8 @@ Guide](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optim
 ### Quick start using Dockerfile
 
 ```console
-$ docker build -f Dockerfile.hpu -t vllm-hpu-env  .
-$ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --rm vllm-hpu-env
+docker build -f Dockerfile.hpu -t vllm-hpu-env  .
+docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --rm vllm-hpu-env
 ```
 
 ```{tip}
@@ -37,10 +37,10 @@ If you're observing the following error: `docker: Error response from daemon: Un
 To verify that the Intel Gaudi software was correctly installed, run:
 
 ```console
-$ hl-smi # verify that hl-smi is in your PATH and each Gaudi accelerator is visible
-$ apt list --installed | grep habana # verify that habanalabs-firmware-tools, habanalabs-graph, habanalabs-rdma-core, habanalabs-thunk and habanalabs-container-runtime are installed
-$ pip list | grep habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml and habana-media-loader are installed
-$ pip list | grep neural # verify that neural_compressor is installed
+hl-smi # verify that hl-smi is in your PATH and each Gaudi accelerator is visible
+apt list --installed | grep habana # verify that habanalabs-firmware-tools, habanalabs-graph, habanalabs-rdma-core, habanalabs-thunk and habanalabs-container-runtime are installed
+pip list | grep habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml and habana-media-loader are installed
+pip list | grep neural # verify that neural_compressor is installed
 ```
 
 Refer to [Intel Gaudi Software Stack
@@ -57,8 +57,8 @@ for more details.
 Use the following commands to run a Docker image:
 
 ```console
-$ docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
-$ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
+docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
+docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
 ```
 
 #### Build and Install vLLM
@@ -66,18 +66,18 @@ $ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_
 To build and install vLLM from source, run:
 
 ```console
-$ git clone https://github.com/vllm-project/vllm.git
-$ cd vllm
-$ python setup.py develop
+git clone https://github.com/vllm-project/vllm.git
+cd vllm
+python setup.py develop
 ```
 
 Currently, the latest features and performance optimizations are developed in Gaudi's [vLLM-fork](https://github.com/HabanaAI/vllm-fork) and we periodically upstream them to vLLM main repo. To install latest [HabanaAI/vLLM-fork](https://github.com/HabanaAI/vllm-fork), run the following:
 
 ```console
-$ git clone https://github.com/HabanaAI/vllm-fork.git
-$ cd vllm-fork
-$ git checkout habana_main
-$ python setup.py develop
+git clone https://github.com/HabanaAI/vllm-fork.git
+cd vllm-fork
+git checkout habana_main
+python setup.py develop
 ```
 
 ## Supported Features
@@ -181,7 +181,7 @@ Bucketing allows us to reduce the number of required graphs significantly, but i
 
 Bucketing ranges are determined with 3 parameters - `min`, `step` and `max`. They can be set separately for prompt and decode phase, and for batch size and sequence length dimension. These parameters can be observed in logs during vLLM startup:
 
-```
+```text
 INFO 08-01 21:37:59 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024]
 INFO 08-01 21:37:59 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)]
 INFO 08-01 21:37:59 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048]
@@ -192,7 +192,7 @@ INFO 08-01 21:37:59 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 1
 
 Example (with ramp-up)
 
-```
+```text
 min = 2, step = 32, max = 64
 => ramp_up = (2, 4, 8, 16)
 => stable = (32, 64)
@@ -201,7 +201,7 @@ min = 2, step = 32, max = 64
 
 Example (without ramp-up)
 
-```
+```text
 min = 128, step = 128, max = 512
 => ramp_up = ()
 => stable = (128, 256, 384, 512)
@@ -224,7 +224,7 @@ Bucketing is transparent to a client -- padding in sequence length dimension is
 
 Warmup is an optional, but highly recommended step occurring before vLLM server starts listening. It executes a forward pass for each bucket with dummy data. The goal is to pre-compile all graphs and not incur any graph compilation overheads within bucket boundaries during server runtime. Each warmup step is logged during vLLM startup:
 
-```
+```text
 INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:79.16 GiB
 INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][2/24] batch_size:4 seq_len:896 free_mem:55.43 GiB
 INFO 08-01 22:26:48 hpu_model_runner.py:1066] [Warmup][Prompt][3/24] batch_size:4 seq_len:768 free_mem:55.43 GiB
@@ -273,7 +273,7 @@ When there's large amount of requests pending, vLLM scheduler will attempt to fi
 
 Each described step is logged by vLLM server, as follows (negative values correspond to memory being released):
 
-```
+```text
 INFO 08-02 17:37:44 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024]
 INFO 08-02 17:37:44 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)]
 INFO 08-02 17:37:44 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048]
@@ -349,19 +349,19 @@ INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of devi
   - Default values:
 
     - Prompt:
-      : - batch size min (`VLLM_PROMPT_BS_BUCKET_MIN`): `1`
-        - batch size step (`VLLM_PROMPT_BS_BUCKET_STEP`): `min(max_num_seqs, 32)`
-        - batch size max (`VLLM_PROMPT_BS_BUCKET_MAX`): `min(max_num_seqs, 64)`
-        - sequence length min (`VLLM_PROMPT_SEQ_BUCKET_MIN`): `block_size`
-        - sequence length step (`VLLM_PROMPT_SEQ_BUCKET_STEP`): `block_size`
-        - sequence length max (`VLLM_PROMPT_SEQ_BUCKET_MAX`): `max_model_len`
+      - batch size min (`VLLM_PROMPT_BS_BUCKET_MIN`): `1`
+      - batch size step (`VLLM_PROMPT_BS_BUCKET_STEP`): `min(max_num_seqs, 32)`
+      - batch size max (`VLLM_PROMPT_BS_BUCKET_MAX`): `min(max_num_seqs, 64)`
+      - sequence length min (`VLLM_PROMPT_SEQ_BUCKET_MIN`): `block_size`
+      - sequence length step (`VLLM_PROMPT_SEQ_BUCKET_STEP`): `block_size`
+      - sequence length max (`VLLM_PROMPT_SEQ_BUCKET_MAX`): `max_model_len`
     - Decode:
-      : - batch size min (`VLLM_DECODE_BS_BUCKET_MIN`): `1`
-        - batch size step (`VLLM_DECODE_BS_BUCKET_STEP`): `min(max_num_seqs, 32)`
-        - batch size max (`VLLM_DECODE_BS_BUCKET_MAX`): `max_num_seqs`
-        - sequence length min (`VLLM_DECODE_BLOCK_BUCKET_MIN`): `block_size`
-        - sequence length step (`VLLM_DECODE_BLOCK_BUCKET_STEP`): `block_size`
-        - sequence length max (`VLLM_DECODE_BLOCK_BUCKET_MAX`): `max(128, (max_num_seqs*max_model_len)/block_size)`
+      - batch size min (`VLLM_DECODE_BS_BUCKET_MIN`): `1`
+      - batch size step (`VLLM_DECODE_BS_BUCKET_STEP`): `min(max_num_seqs, 32)`
+      - batch size max (`VLLM_DECODE_BS_BUCKET_MAX`): `max_num_seqs`
+      - sequence length min (`VLLM_DECODE_BLOCK_BUCKET_MIN`): `block_size`
+      - sequence length step (`VLLM_DECODE_BLOCK_BUCKET_STEP`): `block_size`
+      - sequence length max (`VLLM_DECODE_BLOCK_BUCKET_MAX`): `max(128, (max_num_seqs*max_model_len)/block_size)`
 
 Additionally, there are HPU PyTorch Bridge environment variables impacting vLLM execution:
 
diff --git a/docs/source/getting_started/installation/neuron.md b/docs/source/getting_started/installation/neuron.md
index 431f90537f543..5581b1940ca46 100644
--- a/docs/source/getting_started/installation/neuron.md
+++ b/docs/source/getting_started/installation/neuron.md
@@ -123,10 +123,10 @@ python -m pip install --upgrade neuronx-cc==2.* --pre torch-neuronx==2.1.* torch
 Once neuronx-cc and transformers-neuronx packages are installed, we will be able to install vllm as follows:
 
 ```console
-$ git clone https://github.com/vllm-project/vllm.git
-$ cd vllm
-$ pip install -U -r requirements-neuron.txt
-$ VLLM_TARGET_DEVICE="neuron" pip install .
+git clone https://github.com/vllm-project/vllm.git
+cd vllm
+pip install -U -r requirements-neuron.txt
+VLLM_TARGET_DEVICE="neuron" pip install .
 ```
 
 If neuron packages are detected correctly in the installation process, `vllm-0.3.0+neuron212` will be installed.
diff --git a/docs/source/getting_started/installation/openvino.md b/docs/source/getting_started/installation/openvino.md
index 60f95fd1c4250..d97d4173bf36b 100644
--- a/docs/source/getting_started/installation/openvino.md
+++ b/docs/source/getting_started/installation/openvino.md
@@ -27,8 +27,8 @@ vLLM powered by OpenVINO supports all LLM models from [vLLM supported models lis
 ## Quick start using Dockerfile
 
 ```console
-$ docker build -f Dockerfile.openvino -t vllm-openvino-env .
-$ docker run -it --rm vllm-openvino-env
+docker build -f Dockerfile.openvino -t vllm-openvino-env .
+docker run -it --rm vllm-openvino-env
 ```
 
 (install-openvino-backend-from-source)=
@@ -38,21 +38,21 @@ $ docker run -it --rm vllm-openvino-env
 - First, install Python. For example, on Ubuntu 22.04, you can run:
 
   ```console
-  $ sudo apt-get update  -y
-  $ sudo apt-get install python3
+  sudo apt-get update  -y
+  sudo apt-get install python3
   ```
 
 - Second, install prerequisites vLLM OpenVINO backend installation:
 
   ```console
-  $ pip install --upgrade pip
-  $ pip install -r requirements-build.txt --extra-index-url https://download.pytorch.org/whl/cpu
+  pip install --upgrade pip
+  pip install -r requirements-build.txt --extra-index-url https://download.pytorch.org/whl/cpu
   ```
 
 - Finally, install vLLM with OpenVINO backend:
 
   ```console
-  $ PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE=openvino python -m pip install -v .
+  PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE=openvino python -m pip install -v .
   ```
 
 - [Optional] To use vLLM OpenVINO backend with a GPU device, ensure your system is properly set up. Follow the instructions provided here: [https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html](https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html).
diff --git a/docs/source/getting_started/installation/tpu.md b/docs/source/getting_started/installation/tpu.md
index bc93c44fead30..1938785ade46a 100644
--- a/docs/source/getting_started/installation/tpu.md
+++ b/docs/source/getting_started/installation/tpu.md
@@ -156,14 +156,14 @@ For more information about using TPUs with GKE, see
 You can use <gh-file:Dockerfile.tpu> to build a Docker image with TPU support.
 
 ```console
-$ docker build -f Dockerfile.tpu -t vllm-tpu .
+docker build -f Dockerfile.tpu -t vllm-tpu .
 ```
 
 Run the Docker image with the following command:
 
 ```console
-$ # Make sure to add `--privileged --net host --shm-size=16G`.
-$ docker run --privileged --net host --shm-size=16G -it vllm-tpu
+# Make sure to add `--privileged --net host --shm-size=16G`.
+docker run --privileged --net host --shm-size=16G -it vllm-tpu
 ```
 
 ```{note}
diff --git a/docs/source/getting_started/installation/xpu.md b/docs/source/getting_started/installation/xpu.md
index c1ab5478eb652..73758f37cf0f6 100644
--- a/docs/source/getting_started/installation/xpu.md
+++ b/docs/source/getting_started/installation/xpu.md
@@ -40,15 +40,15 @@ $ docker run -it \
 - Second, install Python packages for vLLM XPU backend building:
 
 ```console
-$ source /opt/intel/oneapi/setvars.sh
-$ pip install --upgrade pip
-$ pip install -v -r requirements-xpu.txt
+source /opt/intel/oneapi/setvars.sh
+pip install --upgrade pip
+pip install -v -r requirements-xpu.txt
 ```
 
 - Finally, build and install vLLM XPU backend:
 
 ```console
-$ VLLM_TARGET_DEVICE=xpu python setup.py install
+VLLM_TARGET_DEVICE=xpu python setup.py install
 ```
 
 ```{note}
@@ -61,14 +61,14 @@ $ VLLM_TARGET_DEVICE=xpu python setup.py install
 XPU platform supports tensor-parallel inference/serving and also supports pipeline parallel as a beta feature for online serving. We requires Ray as the distributed runtime backend. For example, a reference execution likes following:
 
 ```console
-$ python -m vllm.entrypoints.openai.api_server \
-$      --model=facebook/opt-13b \
-$      --dtype=bfloat16 \
-$      --device=xpu \
-$      --max_model_len=1024 \
-$      --distributed-executor-backend=ray \
-$      --pipeline-parallel-size=2 \
-$      -tp=8
+python -m vllm.entrypoints.openai.api_server \
+     --model=facebook/opt-13b \
+     --dtype=bfloat16 \
+     --device=xpu \
+     --max_model_len=1024 \
+     --distributed-executor-backend=ray \
+     --pipeline-parallel-size=2 \
+     -tp=8
 ```
 
 By default, a ray instance will be launched automatically if no existing one is detected in system, with `num-gpus` equals to `parallel_config.world_size`. We recommend properly starting a ray cluster before execution, referring to the <gh-file:examples/online_serving/run_cluster.sh> helper script.
diff --git a/docs/source/getting_started/quickstart.md b/docs/source/getting_started/quickstart.md
index 6fd0083a9bb7b..8ac80e5e5c553 100644
--- a/docs/source/getting_started/quickstart.md
+++ b/docs/source/getting_started/quickstart.md
@@ -19,17 +19,17 @@ If you are using NVIDIA GPUs, you can install vLLM using [pip](https://pypi.org/
 It's recommended to use [uv](https://docs.astral.sh/uv/), a very fast Python environment manager, to create and manage Python environments. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment and install vLLM using the following commands:
 
 ```console
-$ uv venv myenv --python 3.12 --seed
-$ source myenv/bin/activate
-$ uv pip install vllm
+uv venv myenv --python 3.12 --seed
+source myenv/bin/activate
+uv pip install vllm
 ```
 
 You can also use [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/getting-started.html) to create and manage Python environments.
 
 ```console
-$ conda create -n myenv python=3.12 -y
-$ conda activate myenv
-$ pip install vllm
+conda create -n myenv python=3.12 -y
+conda activate myenv
+pip install vllm
 ```
 
 ```{note}
@@ -94,7 +94,7 @@ By default, it starts the server at `http://localhost:8000`. You can specify the
 Run the following command to start the vLLM server with the [Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct) model:
 
 ```console
-$ vllm serve Qwen/Qwen2.5-1.5B-Instruct
+vllm serve Qwen/Qwen2.5-1.5B-Instruct
 ```
 
 ```{note}
@@ -105,7 +105,7 @@ You can learn about overriding it [here](#chat-template).
 This server can be queried in the same format as OpenAI API. For example, to list the models:
 
 ```console
-$ curl http://localhost:8000/v1/models
+curl http://localhost:8000/v1/models
 ```
 
 You can pass in the argument `--api-key` or environment variable `VLLM_API_KEY` to enable the server to check for API key in the header.
@@ -115,14 +115,14 @@ You can pass in the argument `--api-key` or environment variable `VLLM_API_KEY`
 Once your server is started, you can query the model with input prompts:
 
 ```console
-$ curl http://localhost:8000/v1/completions \
-$     -H "Content-Type: application/json" \
-$     -d '{
-$         "model": "Qwen/Qwen2.5-1.5B-Instruct",
-$         "prompt": "San Francisco is a",
-$         "max_tokens": 7,
-$         "temperature": 0
-$     }'
+curl http://localhost:8000/v1/completions \
+    -H "Content-Type: application/json" \
+    -d '{
+        "model": "Qwen/Qwen2.5-1.5B-Instruct",
+        "prompt": "San Francisco is a",
+        "max_tokens": 7,
+        "temperature": 0
+    }'
 ```
 
 Since this server is compatible with OpenAI API, you can use it as a drop-in replacement for any applications using OpenAI API. For example, another way to query the server is via the `openai` Python package:
@@ -151,15 +151,15 @@ vLLM is designed to also support the OpenAI Chat Completions API. The chat inter
 You can use the [create chat completion](https://platform.openai.com/docs/api-reference/chat/completions/create) endpoint to interact with the model:
 
 ```console
-$ curl http://localhost:8000/v1/chat/completions \
-$     -H "Content-Type: application/json" \
-$     -d '{
-$         "model": "Qwen/Qwen2.5-1.5B-Instruct",
-$         "messages": [
-$             {"role": "system", "content": "You are a helpful assistant."},
-$             {"role": "user", "content": "Who won the world series in 2020?"}
-$         ]
-$     }'
+curl http://localhost:8000/v1/chat/completions \
+    -H "Content-Type: application/json" \
+    -d '{
+        "model": "Qwen/Qwen2.5-1.5B-Instruct",
+        "messages": [
+            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "user", "content": "Who won the world series in 2020?"}
+        ]
+    }'
 ```
 
 Alternatively, you can use the `openai` Python package:
diff --git a/docs/source/getting_started/troubleshooting.md b/docs/source/getting_started/troubleshooting.md
index f5efe0bef7506..1e290d2b4c0bd 100644
--- a/docs/source/getting_started/troubleshooting.md
+++ b/docs/source/getting_started/troubleshooting.md
@@ -48,6 +48,7 @@ If vLLM crashes and the error trace captures it somewhere around `self.graph.rep
 To identify the particular CUDA operation that causes the error, you can add `--enforce-eager` to the command line, or `enforce_eager=True` to the {class}`~vllm.LLM` class to disable the CUDAGraph optimization and isolate the exact CUDA operation that causes the error.
 
 (troubleshooting-incorrect-hardware-driver)=
+
 ## Incorrect hardware/driver
 
 If GPU/CPU communication cannot be established, you can use the following Python script and follow the instructions below to confirm whether the GPU/CPU communication is working correctly.
@@ -118,13 +119,13 @@ dist.destroy_process_group()
 If you are testing with a single node, adjust `--nproc-per-node` to the number of GPUs you want to use:
 
 ```console
-$ NCCL_DEBUG=TRACE torchrun --nproc-per-node=<number-of-GPUs> test.py
+NCCL_DEBUG=TRACE torchrun --nproc-per-node=<number-of-GPUs> test.py
 ```
 
 If you are testing with multi-nodes, adjust `--nproc-per-node` and `--nnodes` according to your setup and set `MASTER_ADDR` to the correct IP address of the master node, reachable from all nodes. Then, run:
 
 ```console
-$ NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=$MASTER_ADDR test.py
+NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=$MASTER_ADDR test.py
 ```
 
 If the script runs successfully, you should see the message `sanity check is successful!`.
@@ -141,6 +142,7 @@ Adjust `--nproc-per-node`, `--nnodes`, and `--node-rank` according to your setup
 ```
 
 (troubleshooting-python-multiprocessing)=
+
 ## Python multiprocessing
 
 ### `RuntimeError` Exception
diff --git a/docs/source/index.md b/docs/source/index.md
index de74276a50fb6..8f9493d77186e 100644
--- a/docs/source/index.md
+++ b/docs/source/index.md
@@ -1,4 +1,4 @@
-# Welcome to vLLM!
+# Welcome to vLLM
 
 ```{figure} ./assets/logos/vllm-logo-text-light.png
 :align: center
@@ -186,7 +186,7 @@ community/meetups
 community/sponsors
 ```
 
-# Indices and tables
+## Indices and tables
 
 - {ref}`genindex`
 - {ref}`modindex`
diff --git a/docs/source/models/extensions/runai_model_streamer.md b/docs/source/models/extensions/runai_model_streamer.md
index fe2701194a604..75f7a9fcad416 100644
--- a/docs/source/models/extensions/runai_model_streamer.md
+++ b/docs/source/models/extensions/runai_model_streamer.md
@@ -9,25 +9,25 @@ vLLM supports loading weights in Safetensors format using the Run:ai Model Strea
 You first need to install vLLM RunAI optional dependency:
 
 ```console
-$ pip3 install vllm[runai]
+pip3 install vllm[runai]
 ```
 
 To run it as an OpenAI-compatible server, add the `--load-format runai_streamer` flag:
 
 ```console
-$ vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer
+vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer
 ```
 
 To run model from AWS S3 object store run:
 
 ```console
-$ vllm serve s3://core-llm/Llama-3-8b --load-format runai_streamer
+vllm serve s3://core-llm/Llama-3-8b --load-format runai_streamer
 ```
 
 To run model from a S3 compatible object store run:
 
 ```console
-$ RUNAI_STREAMER_S3_USE_VIRTUAL_ADDRESSING=0 AWS_EC2_METADATA_DISABLED=true AWS_ENDPOINT_URL=https://storage.googleapis.com vllm serve s3://core-llm/Llama-3-8b --load-format runai_streamer
+RUNAI_STREAMER_S3_USE_VIRTUAL_ADDRESSING=0 AWS_EC2_METADATA_DISABLED=true AWS_ENDPOINT_URL=https://storage.googleapis.com vllm serve s3://core-llm/Llama-3-8b --load-format runai_streamer
 ```
 
 ## Tunable parameters
@@ -38,14 +38,14 @@ You can tune `concurrency` that controls the level of concurrency and number of
 For reading from S3, it will be the number of client instances the host is opening to the S3 server.
 
 ```console
-$ vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer --model-loader-extra-config '{"concurrency":16}'
+vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer --model-loader-extra-config '{"concurrency":16}'
 ```
 
 You can control the size of the CPU Memory buffer to which tensors are read from the file, and limit this size.
 You can read further about CPU buffer memory limiting [here](https://github.com/run-ai/runai-model-streamer/blob/master/docs/src/env-vars.md#runai_streamer_memory_limit).
 
 ```console
-$ vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer --model-loader-extra-config '{"memory_limit":5368709120}'
+vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer --model-loader-extra-config '{"memory_limit":5368709120}'
 ```
 
 ```{note}
diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md
index 72910ea1e2d19..5c96dfdad25f7 100644
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -45,7 +45,7 @@ Alternatively, you can [open an issue on GitHub](https://github.com/vllm-project
 To use models from [ModelScope](https://www.modelscope.cn) instead of HuggingFace Hub, set an environment variable:
 
 ```shell
-$ export VLLM_USE_MODELSCOPE=True
+export VLLM_USE_MODELSCOPE=True
 ```
 
 And use with `trust_remote_code=True`.
@@ -820,19 +820,22 @@ The following table lists those that are tested in vLLM.
 
 _________________
 
-# Model Support Policy
+## Model Support Policy
 
 At vLLM, we are committed to facilitating the integration and support of third-party models within our ecosystem. Our approach is designed to balance the need for robustness and the practical limitations of supporting a wide range of models. Here’s how we manage third-party model support:
 
 1. **Community-Driven Support**: We encourage community contributions for adding new models. When a user requests support for a new model, we welcome pull requests (PRs) from the community. These contributions are evaluated primarily on the sensibility of the output they generate, rather than strict consistency with existing implementations such as those in transformers. **Call for contribution:** PRs coming directly from model vendors are greatly appreciated!
+
 2. **Best-Effort Consistency**: While we aim to maintain a level of consistency between the models implemented in vLLM and other frameworks like transformers, complete alignment is not always feasible. Factors like acceleration techniques and the use of low-precision computations can introduce discrepancies. Our commitment is to ensure that the implemented models are functional and produce sensible results.
 
-```{tip}
-When comparing the output of `model.generate` from HuggingFace Transformers with the output of `llm.generate` from vLLM, note that the former reads the model's generation config file (i.e., [generation_config.json](https://github.com/huggingface/transformers/blob/19dabe96362803fb0a9ae7073d03533966598b17/src/transformers/generation/utils.py#L1945)) and applies the default parameters for generation, while the latter only uses the parameters passed to the function. Ensure all sampling parameters are identical when comparing outputs.
-```
+    ```{tip}
+    When comparing the output of `model.generate` from HuggingFace Transformers with the output of `llm.generate` from vLLM, note that the former reads the model's generation config file (i.e., [generation_config.json](https://github.com/huggingface/transformers/blob/19dabe96362803fb0a9ae7073d03533966598b17/src/transformers/generation/utils.py#L1945)) and applies the default parameters for generation, while the latter only uses the parameters passed to the function. Ensure all sampling parameters are identical when comparing outputs.
+    ```
 
 3. **Issue Resolution and Model Updates**: Users are encouraged to report any bugs or issues they encounter with third-party models. Proposed fixes should be submitted via PRs, with a clear explanation of the problem and the rationale behind the proposed solution. If a fix for one model impacts another, we rely on the community to highlight and address these cross-model dependencies. Note: for bugfix PRs, it is good etiquette to inform the original author to seek their feedback.
+
 4. **Monitoring and Updates**: Users interested in specific models should monitor the commit history for those models (e.g., by tracking changes in the main/vllm/model_executor/models directory). This proactive approach helps users stay informed about updates and changes that may affect the models they use.
+
 5. **Selective Focus**: Our resources are primarily directed towards models with significant user interest and impact. Models that are less frequently used may receive less attention, and we rely on the community to play a more active role in their upkeep and improvement.
 
 Through this approach, vLLM fosters a collaborative environment where both the core development team and the broader community contribute to the robustness and diversity of the third-party models supported in our ecosystem.
diff --git a/docs/source/performance/optimization.md b/docs/source/performance/optimization.md
index 4fcde9b03b887..4fbc376e1aa39 100644
--- a/docs/source/performance/optimization.md
+++ b/docs/source/performance/optimization.md
@@ -8,7 +8,7 @@ Due to the auto-regressive nature of transformer architecture, there are times w
 The vLLM can preempt requests to free up KV cache space for other requests. Preempted requests are recomputed when sufficient KV cache space becomes
 available again. When this occurs, the following warning is printed:
 
-```
+```text
 WARNING 05-09 00:49:33 scheduler.py:1057 Sequence group 0 is preempted by PreemptionMode.SWAP mode because there is not enough KV cache space. This can affect the end-to-end performance. Increase gpu_memory_utilization or tensor_parallel_size to provide more KV cache memory. total_cumulative_preemption_cnt=1
 ```
 
diff --git a/docs/source/serving/distributed_serving.md b/docs/source/serving/distributed_serving.md
index 4e0a9ef6ecf7d..daf6e2f250416 100644
--- a/docs/source/serving/distributed_serving.md
+++ b/docs/source/serving/distributed_serving.md
@@ -35,16 +35,16 @@ output = llm.generate("San Franciso is a")
 To run multi-GPU serving, pass in the `--tensor-parallel-size` argument when starting the server. For example, to run API server on 4 GPUs:
 
 ```console
-$ vllm serve facebook/opt-13b \
-$     --tensor-parallel-size 4
+ vllm serve facebook/opt-13b \
+     --tensor-parallel-size 4
 ```
 
 You can also additionally specify `--pipeline-parallel-size` to enable pipeline parallelism. For example, to run API server on 8 GPUs with pipeline parallelism and tensor parallelism:
 
 ```console
-$ vllm serve gpt2 \
-$     --tensor-parallel-size 4 \
-$     --pipeline-parallel-size 2
+ vllm serve gpt2 \
+     --tensor-parallel-size 4 \
+     --pipeline-parallel-size 2
 ```
 
 ## Running vLLM on multiple nodes
@@ -56,21 +56,21 @@ The first step, is to start containers and organize them into a cluster. We have
 Pick a node as the head node, and run the following command:
 
 ```console
-$ bash run_cluster.sh \
-$                   vllm/vllm-openai \
-$                   ip_of_head_node \
-$                   --head \
-$                   /path/to/the/huggingface/home/in/this/node
+bash run_cluster.sh \
+                vllm/vllm-openai \
+                ip_of_head_node \
+                --head \
+                /path/to/the/huggingface/home/in/this/node
 ```
 
 On the rest of the worker nodes, run the following command:
 
 ```console
-$ bash run_cluster.sh \
-$                   vllm/vllm-openai \
-$                   ip_of_head_node \
-$                   --worker \
-$                   /path/to/the/huggingface/home/in/this/node
+bash run_cluster.sh \
+                vllm/vllm-openai \
+                ip_of_head_node \
+                --worker \
+                /path/to/the/huggingface/home/in/this/node
 ```
 
 Then you get a ray cluster of containers. Note that you need to keep the shells running these commands alive to hold the cluster. Any shell disconnect will terminate the cluster. In addition, please note that the argument `ip_of_head_node` should be the IP address of the head node, which is accessible by all the worker nodes. A common misunderstanding is to use the IP address of the worker node, which is not correct.
@@ -80,16 +80,16 @@ Then, on any node, use `docker exec -it node /bin/bash` to enter the container,
 After that, on any node, you can use vLLM as usual, just as you have all the GPUs on one node. The common practice is to set the tensor parallel size to the number of GPUs in each node, and the pipeline parallel size to the number of nodes. For example, if you have 16 GPUs in 2 nodes (8 GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2:
 
 ```console
-$ vllm serve /path/to/the/model/in/the/container \
-$     --tensor-parallel-size 8 \
-$     --pipeline-parallel-size 2
+ vllm serve /path/to/the/model/in/the/container \
+     --tensor-parallel-size 8 \
+     --pipeline-parallel-size 2
 ```
 
 You can also use tensor parallel without pipeline parallel, just set the tensor parallel size to the number of GPUs in the cluster. For example, if you have 16 GPUs in 2 nodes (8 GPUs per node), you can set the tensor parallel size to 16:
 
 ```console
-$ vllm serve /path/to/the/model/in/the/container \
-$     --tensor-parallel-size 16
+vllm serve /path/to/the/model/in/the/container \
+     --tensor-parallel-size 16
 ```
 
 To make tensor parallel performant, you should make sure the communication between nodes is efficient, e.g. using high-speed network cards like Infiniband. To correctly set up the cluster to use Infiniband, append additional arguments like `--privileged -e NCCL_IB_HCA=mlx5` to the `run_cluster.sh` script. Please contact your system administrator for more information on how to set up the flags. One way to confirm if the Infiniband is working is to run vLLM with `NCCL_DEBUG=TRACE` environment variable set, e.g. `NCCL_DEBUG=TRACE vllm serve ...` and check the logs for the NCCL version and the network used. If you find `[send] via NET/Socket` in the logs, it means NCCL uses raw TCP Socket, which is not efficient for cross-node tensor parallel. If you find `[send] via NET/IB/GDRDMA` in the logs, it means NCCL uses Infiniband with GPU-Direct RDMA, which is efficient.
diff --git a/docs/source/serving/integrations/langchain.md b/docs/source/serving/integrations/langchain.md
index 49ff6e0c32a72..03142d23b145a 100644
--- a/docs/source/serving/integrations/langchain.md
+++ b/docs/source/serving/integrations/langchain.md
@@ -7,7 +7,7 @@ vLLM is also available via [LangChain](https://github.com/langchain-ai/langchain
 To install LangChain, run
 
 ```console
-$ pip install langchain langchain_community -q
+pip install langchain langchain_community -q
 ```
 
 To run inference on a single or multiple GPUs, use `VLLM` class from `langchain`.
diff --git a/docs/source/serving/integrations/llamaindex.md b/docs/source/serving/integrations/llamaindex.md
index 9961c181d7e1c..8c72605202cf5 100644
--- a/docs/source/serving/integrations/llamaindex.md
+++ b/docs/source/serving/integrations/llamaindex.md
@@ -7,7 +7,7 @@ vLLM is also available via [LlamaIndex](https://github.com/run-llama/llama_index
 To install LlamaIndex, run
 
 ```console
-$ pip install llama-index-llms-vllm -q
+pip install llama-index-llms-vllm -q
 ```
 
 To run inference on a single or multiple GPUs, use `Vllm` class from `llamaindex`.
diff --git a/docs/source/serving/metrics.md b/docs/source/serving/metrics.md
index e6ded2e6dd465..6c84f6d1350a6 100644
--- a/docs/source/serving/metrics.md
+++ b/docs/source/serving/metrics.md
@@ -7,7 +7,7 @@ OpenAI compatible API server.
 You can start the server using Python, or using [Docker](#deployment-docker):
 
 ```console
-$ vllm serve unsloth/Llama-3.2-1B-Instruct
+vllm serve unsloth/Llama-3.2-1B-Instruct
 ```
 
 Then query the endpoint to get the latest metrics from the server:
diff --git a/docs/source/serving/multimodal_inputs.md b/docs/source/serving/multimodal_inputs.md
index 53f5a274e39a3..0213b0a3388ea 100644
--- a/docs/source/serving/multimodal_inputs.md
+++ b/docs/source/serving/multimodal_inputs.md
@@ -303,6 +303,7 @@ vllm serve llava-hf/llava-onevision-qwen2-0.5b-ov-hf --task generate --max-model
 ```
 
 Then, you can use the OpenAI client as follows:
+
 ```python
 from openai import OpenAI
 
diff --git a/docs/source/serving/offline_inference.md b/docs/source/serving/offline_inference.md
index 79092ab208784..94703a1c32ade 100644
--- a/docs/source/serving/offline_inference.md
+++ b/docs/source/serving/offline_inference.md
@@ -64,7 +64,7 @@ Dynamic quantization is also supported via the `quantization` option -- see [her
 
 #### Context length and batch size
 
-You can further reduce memory usage by limit the context length of the model (`max_model_len` option)
+You can further reduce memory usage by limiting the context length of the model (`max_model_len` option)
 and the maximum batch size (`max_num_seqs` option).
 
 ```python
diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md
index ec5a367594743..e49bbb06695f8 100644
--- a/docs/source/serving/openai_compatible_server.md
+++ b/docs/source/serving/openai_compatible_server.md
@@ -5,11 +5,13 @@
 vLLM provides an HTTP server that implements OpenAI's [Completions API](https://platform.openai.com/docs/api-reference/completions), [Chat API](https://platform.openai.com/docs/api-reference/chat), and more!
 
 You can start the server via the [`vllm serve`](#vllm-serve) command, or through [Docker](#deployment-docker):
+
 ```bash
 vllm serve NousResearch/Meta-Llama-3-8B-Instruct --dtype auto --api-key token-abc123
 ```
 
 To call the server, you can use the [official OpenAI Python client](https://github.com/openai/openai-python), or any other HTTP client.
+
 ```python
 from openai import OpenAI
 client = OpenAI(
@@ -50,6 +52,7 @@ In addition, we have the following custom APIs:
   - Only applicable to [cross-encoder models](../models/pooling_models.md) (`--task score`).
 
 (chat-template)=
+
 ## Chat Template
 
 In order for the language model to support chat protocol, vLLM requires the model to include
@@ -71,6 +74,7 @@ vLLM community provides a set of chat templates for popular models. You can find
 
 With the inclusion of multi-modal chat APIs, the OpenAI spec now accepts chat messages in a new format which specifies
 both a `type` and a `text` field. An example is provided below:
+
 ```python
 completion = client.chat.completions.create(
   model="NousResearch/Meta-Llama-3-8B-Instruct",
@@ -80,7 +84,7 @@ completion = client.chat.completions.create(
 )
 ```
 
-Most chat templates for LLMs expect the `content` field to be a string, but there are some newer models like 
+Most chat templates for LLMs expect the `content` field to be a string, but there are some newer models like
 `meta-llama/Llama-Guard-3-1B` that expect the content to be formatted according to the OpenAI schema in the
 request. vLLM provides best-effort support to detect this automatically, which is logged as a string like
 *"Detected the chat template content format to be..."*, and internally converts incoming requests to match
@@ -115,12 +119,12 @@ completion = client.chat.completions.create(
 ## Extra HTTP Headers
 
 Only `X-Request-Id` HTTP request header is supported for now. It can be enabled
-with `--enable-request-id-headers`. 
+with `--enable-request-id-headers`.
 
 > Note that enablement of the headers can impact performance significantly at high QPS
 > rates. We recommend implementing HTTP headers at the router level (e.g. via Istio),
 > rather than within the vLLM layer for this reason.
-> See https://github.com/vllm-project/vllm/pull/11529 for more details.
+> See [this PR](https://github.com/vllm-project/vllm/pull/11529) for more details.
 
 ```python
 completion = client.chat.completions.create(
@@ -147,6 +151,7 @@ print(completion._request_id)
 ## CLI Reference
 
 (vllm-serve)=
+
 ### `vllm serve`
 
 The `vllm serve` command is used to launch the OpenAI-compatible server.
@@ -175,7 +180,7 @@ uvicorn-log-level: "info"
 To use the above config file:
 
 ```bash
-$ vllm serve SOME_MODEL --config config.yaml
+vllm serve SOME_MODEL --config config.yaml
 ```
 
 ```{note}
@@ -186,6 +191,7 @@ The order of priorities is `command line > config file values > defaults`.
 ## API Reference
 
 (completions-api)=
+
 ### Completions API
 
 Our Completions API is compatible with [OpenAI's Completions API](https://platform.openai.com/docs/api-reference/completions);
@@ -212,6 +218,7 @@ The following extra parameters are supported:
 ```
 
 (chat-api)=
+
 ### Chat API
 
 Our Chat API is compatible with [OpenAI's Chat Completions API](https://platform.openai.com/docs/api-reference/chat);
@@ -243,6 +250,7 @@ The following extra parameters are supported:
 ```
 
 (embeddings-api)=
+
 ### Embeddings API
 
 Our Embeddings API is compatible with [OpenAI's Embeddings API](https://platform.openai.com/docs/api-reference/embeddings);
@@ -284,6 +292,7 @@ For chat-like input (i.e. if `messages` is passed), these extra parameters are s
 ```
 
 (tokenizer-api)=
+
 ### Tokenizer API
 
 Our Tokenizer API is a simple wrapper over [HuggingFace-style tokenizers](https://huggingface.co/docs/transformers/en/main_classes/tokenizer).
@@ -293,6 +302,7 @@ It consists of two endpoints:
 - `/detokenize` corresponds to calling `tokenizer.decode()`.
 
 (pooling-api)=
+
 ### Pooling API
 
 Our Pooling API encodes input prompts using a [pooling model](../models/pooling_models.md) and returns the corresponding hidden states.
@@ -302,6 +312,7 @@ The input format is the same as [Embeddings API](#embeddings-api), but the outpu
 Code example: <gh-file:examples/online_serving/openai_pooling_client.py>
 
 (score-api)=
+
 ### Score API
 
 Our Score API applies a cross-encoder model to predict scores for sentence pairs.
diff --git a/format.sh b/format.sh
index 0b196de9d0773..2277eef93c745 100755
--- a/format.sh
+++ b/format.sh
@@ -41,7 +41,7 @@ MYPY_VERSION=$(mypy --version | awk '{print $2}')
 CODESPELL_VERSION=$(codespell --version)
 ISORT_VERSION=$(isort --vn)
 CLANGFORMAT_VERSION=$(clang-format --version | awk '{print $3}')
-SPHINX_LINT_VERSION=$(sphinx-lint --version | awk '{print $2}')
+PYMARKDOWNLNT_VERSION=$(pymarkdownlnt version | awk '{print $1}')
 
 # # params: tool name, tool version, required version
 tool_version_check() {
@@ -58,7 +58,7 @@ tool_version_check "mypy" "$MYPY_VERSION"
 tool_version_check "isort" "$ISORT_VERSION"
 tool_version_check "codespell" "$CODESPELL_VERSION"
 tool_version_check "clang-format" "$CLANGFORMAT_VERSION"
-tool_version_check "sphinx-lint" "$SPHINX_LINT_VERSION"
+tool_version_check "pymarkdownlnt" "$PYMARKDOWNLNT_VERSION"
 
 YAPF_FLAGS=(
     '--recursive'
@@ -316,6 +316,6 @@ else
     echo "✨🎉 Format check passed! Congratulations! 🎉✨"
 fi
 
-echo 'vLLM sphinx-lint:'
-tools/sphinx-lint.sh
-echo 'vLLM sphinx-lint: Done'
+echo 'vLLM doc-lint:'
+tools/doc-lint.sh
+echo 'vLLM doc-lint: Done'
diff --git a/pyproject.toml b/pyproject.toml
index 0ac3f39ef7a5f..82275ccafb572 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -101,3 +101,9 @@ markers = [
     "skip_v1: do not run this test with v1",
     "optional: optional tests that are automatically skipped, include --optional to run them",
 ]
+
+[tool.pymarkdown]
+plugins.md013.enabled = false # line-length
+plugins.md041.enabled = false # first-line-h1
+plugins.md033.enabled = false # inline-html
+plugins.md024.allow_different_nesting = true # no-duplicate-headers
diff --git a/requirements-lint.txt b/requirements-lint.txt
index 711bb50a0e936..ffc73f90a0d48 100644
--- a/requirements-lint.txt
+++ b/requirements-lint.txt
@@ -6,7 +6,7 @@ ruff==0.6.5
 codespell==2.3.0
 isort==5.13.2
 clang-format==18.1.5
-sphinx-lint==1.0.0
+pymarkdownlnt==0.9.26
 
 # type checking
 mypy==1.11.1
diff --git a/tools/doc-lint.sh b/tools/doc-lint.sh
new file mode 100755
index 0000000000000..19a55ddfa91c4
--- /dev/null
+++ b/tools/doc-lint.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+pymarkdownlnt scan docs -r
diff --git a/tools/sphinx-lint.sh b/tools/sphinx-lint.sh
deleted file mode 100755
index 04f8075c5527f..0000000000000
--- a/tools/sphinx-lint.sh
+++ /dev/null
@@ -1,3 +0,0 @@
-#!/bin/bash
-
-sphinx-lint --disable trailing-whitespace,missing-final-newline docs

From f967e51f386404c7ead21d3c59ddc195cf946975 Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Sun, 12 Jan 2025 16:17:24 +0800
Subject: [PATCH 184/462] [Model] Initialize support for Deepseek-VL2 models
 (#11578)

Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
---
 .buildkite/test-pipeline.yaml                 |   1 +
 docs/source/models/supported_models.md        |  20 +-
 examples/offline_inference/vision_language.py |  18 +
 .../vision_language_multi_image.py            |  23 +
 .../vision_language/test_models.py            |  27 +
 .../vision_language/vlm_utils/model_utils.py  |  36 +
 tests/models/registry.py                      |   2 +
 tests/models/test_initialization.py           |   3 +
 vllm/entrypoints/chat_utils.py                |   4 +-
 vllm/model_executor/models/deepseek_v2.py     |  18 +-
 vllm/model_executor/models/deepseek_v3.py     |  20 +-
 vllm/model_executor/models/deepseek_vl2.py    | 662 ++++++++++++++++++
 vllm/model_executor/models/minicpmv.py        |   2 +-
 vllm/model_executor/models/registry.py        |   1 +
 vllm/transformers_utils/config.py             |   6 +-
 vllm/transformers_utils/configs/__init__.py   |   2 +
 .../configs/deepseek_vl2.py                   | 214 ++++++
 17 files changed, 1050 insertions(+), 9 deletions(-)
 create mode 100644 vllm/model_executor/models/deepseek_vl2.py
 create mode 100644 vllm/transformers_utils/configs/deepseek_vl2.py

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index cf82210f96ee3..393912881bca3 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -52,6 +52,7 @@ steps:
   - tests/worker
   - tests/standalone_tests/lazy_torch_compile.py
   commands:
+  - pip install git+https://github.com/Isotr0py/DeepSeek-VL2.git  # Used by multimoda processing test
   - python3 standalone_tests/lazy_torch_compile.py
   - pytest -v -s mq_llm_engine # MQLLMEngine
   - pytest -v -s async_engine # AsyncLLMEngine
diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md
index 5c96dfdad25f7..642ef3c9655b8 100644
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -610,6 +610,13 @@ See [this page](#generative-models) for more information on how to use generativ
   -
   - ✅︎
   - ✅︎
+* - `DeepseekVLV2ForCausalLM`
+  - DeepSeek-VL2
+  - T + I<sup>+</sup>
+  - `deepseek-ai/deepseek-vl2-tiny`(WIP), `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2` etc. (see note)
+  -
+  - ✅︎
+  - ✅︎
 * - `FuyuForCausalLM`
   - Fuyu
   - T + I
@@ -755,8 +762,19 @@ See [this page](#generative-models) for more information on how to use generativ
 <sup>E</sup> Pre-computed embeddings can be inputted for this modality.  
 <sup>+</sup> Multiple items can be inputted per text prompt for this modality.
 
+````{note}
+The `deepseek-ai/deepseek-vl2-tiny` is not supported yet.
+
+To use `DeepSeek-VL2` series models, you need to install a fork version `deepseek_vl2` package:
+```shell
+pip install git+https://github.com/Isotr0py/DeepSeek-VL2.git
+```
+
+Besides, to run `DeepSeek-VL2` series models, you have to pass `--hf_overrides '{"architectures": ["DeepseekVLV2ForCausalLM"]}'` when running vLLM.
+````
+
 ```{note}
-To use `TIGER-Lab/Mantis-8B-siglip-llama3`, you have pass `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM.
+To use `TIGER-Lab/Mantis-8B-siglip-llama3`, you have to pass `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM.
 ```
 
 ```{note}
diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py
index b51bfae455267..ad32b9fe242e9 100644
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -66,6 +66,23 @@ def run_chameleon(question: str, modality: str):
     return llm, prompt, stop_token_ids
 
 
+# Deepseek-VL2
+def run_deepseek_vl2(question: str, modality: str):
+    assert modality == "image"
+
+    model_name = "deepseek-ai/deepseek-vl2-small"
+
+    llm = LLM(model=model_name,
+              max_model_len=4096,
+              max_num_seqs=2,
+              disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+              hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]})
+
+    prompt = f"<|User|>: <image>\n{question}\n\n<|Assistant|>:"
+    stop_token_ids = None
+    return llm, prompt, stop_token_ids
+
+
 # Fuyu
 def run_fuyu(question: str, modality: str):
     assert modality == "image"
@@ -498,6 +515,7 @@ def run_qwen2_vl(question: str, modality: str):
     "aria": run_aria,
     "blip-2": run_blip2,
     "chameleon": run_chameleon,
+    "deepseek_vl_v2": run_deepseek_vl2,
     "fuyu": run_fuyu,
     "glm4v": run_glm4v,
     "h2ovl_chat": run_h2ovl,
diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py
index cf2e90a325c6a..c6cf3f30c31cb 100644
--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@@ -54,6 +54,28 @@ def load_aria(question, image_urls: List[str]) -> ModelRequestData:
     )
 
 
+def load_deepseek_vl2(question: str, image_urls: List[str]):
+    model_name = "deepseek-ai/deepseek-vl2-small"
+
+    llm = LLM(model=model_name,
+              max_model_len=4096,
+              max_num_seqs=2,
+              hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
+              limit_mm_per_prompt={"image": len(image_urls)})
+
+    placeholder = "".join(f"image_{i}:<image>\n"
+                          for i, _ in enumerate(image_urls, start=1))
+    prompt = f"<|User|>: {placeholder}{question}\n\n<|Assistant|>:"
+
+    return ModelRequestData(
+        llm=llm,
+        prompt=prompt,
+        stop_token_ids=None,
+        image_data=[fetch_image(url) for url in image_urls],
+        chat_template=None,
+    )
+
+
 def load_h2onvl(question: str, image_urls: List[str]) -> ModelRequestData:
     model_name = "h2oai/h2ovl-mississippi-2b"
 
@@ -372,6 +394,7 @@ def load_qwen2_vl(question, image_urls: List[str]) -> ModelRequestData:
 
 model_example_map = {
     "aria": load_aria,
+    "deepseek_vl2": load_deepseek_vl2,
     "h2ovl_chat": load_h2onvl,
     "idefics3": load_idefics3,
     "internvl_chat": load_internvl,
diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py
index 146685738a1d0..7620ed1107e8f 100644
--- a/tests/models/decoder_only/vision_language/test_models.py
+++ b/tests/models/decoder_only/vision_language/test_models.py
@@ -188,6 +188,33 @@
         max_tokens=8,
         dtype="bfloat16",
     ),
+    "deepseek_vl_v2": VLMTestInfo(
+        models=["deepseek-ai/deepseek-vl2-small"],
+        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
+        dtype="bfloat16",
+        prompt_formatter=lambda img_prompt: f"<|User|>: {img_prompt}\n\n<|Assistant|>: ", # noqa: E501
+        max_model_len=4096,
+        max_num_seqs=2,
+        single_image_prompts=IMAGE_ASSETS.prompts({
+            "stop_sign": "<image>\nWhat's the color of the stop sign and car?",
+            "cherry_blossom": "<image>\nWhat's the color of the tower?",
+        }),
+        multi_image_prompt="image_1:<image>\nimage_2:<image>\nDescribe the two images shortly.",    # noqa: E501
+        vllm_runner_kwargs={"hf_overrides": {"architectures": ["DeepseekVLV2ForCausalLM"]}},  # noqa: E501
+        image_size_factors=[(0.10, 0.15)],
+        patch_hf_runner=model_utils.deepseekvl2_patch_hf_runner,
+        postprocess_inputs=model_utils.cast_dtype_post_processor("images"),
+        hf_output_post_proc=model_utils.deepseekvl2_trunc_hf_output,
+        stop_str=["<｜end▁of▁sentence｜>", "<｜begin▁of▁sentence｜>"],  # noqa: E501
+        num_logprobs=5,
+        marks=[
+            pytest.mark.skipif(
+                not is_flash_attn_2_available(),
+                reason="Model needs flash-attn for numeric convergence.",
+            ),
+            large_gpu_mark(min_gb=48),
+        ],
+    ),
     "fuyu": VLMTestInfo(
         models=["adept/fuyu-8b"],
         test_type=VLMTestType.IMAGE,
diff --git a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
index 6c7a753af787e..1ca85c7bb2056 100644
--- a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
@@ -183,6 +183,14 @@ def paligemma_vllm_to_hf_output(vllm_output: RunnerOutput,
 
 
 ####### Post-processors for HF outputs
+def deepseekvl2_trunc_hf_output(hf_output: RunnerOutput,
+                                model: str) -> RunnerOutput:
+    output_ids, output_str, out_logprobs = hf_output
+    if output_str.endswith("<｜end▁of▁sentence｜>"):
+        output_str = output_str.split("<｜end▁of▁sentence｜>")[0]
+    return output_ids, output_str, out_logprobs
+
+
 def minicpmv_trunc_hf_output(hf_output: RunnerOutput,
                              model: str) -> RunnerOutput:
     output_ids, output_str, out_logprobs = hf_output
@@ -261,6 +269,34 @@ def qwen_prompt_path_encoder(
 
 
 ####### Model-specific HuggingFace runner patchers
+def deepseekvl2_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
+    """Patches and returns an instance of the HfRunner to use for GLM4."""
+    hf_processor = hf_model.processor
+
+    def processor(*args, text="", images=None, **kwargs):
+        if isinstance(images, Image):
+            images = [images]
+        # inputs is a custom class instead of dict or BatchFeature
+        inputs = hf_processor(
+            *args,
+            prompt=text,
+            images=images,
+            **kwargs,
+        )
+        inputs = {
+            k: inputs[k]
+            for k in inputs.keys()  # noqa
+            if k not in ("seq_lens", "sft_format")
+        }
+        inputs = BatchEncoding(data=inputs, tensor_type="pt")
+        return inputs
+
+    hf_model.processor = processor
+    hf_model.model.get_output_embeddings = lambda: \
+        hf_model.model.language.model.embed_tokens
+    return hf_model
+
+
 def glm_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
     """Patches and returns an instance of the HfRunner to use for GLM4."""
     hf_processor = hf_model.processor
diff --git a/tests/models/registry.py b/tests/models/registry.py
index f5aaa8eb071f9..d079725b2f78d 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -179,6 +179,8 @@ class _HfExamplesInfo:
                                     trust_remote_code=True),
     "ChatGLMForConditionalGeneration": _HfExamplesInfo("chatglm2-6b",
                                                        is_available_online=False),
+    # TODO(Isotr0py): Use deepseek-vl2-tiny for test after it's supported
+    "DeepseekVLV2ForCausalLM": _HfExamplesInfo("deepseek-ai/deepseek-vl2-small"),   # noqa: E501
     "FuyuForCausalLM": _HfExamplesInfo("adept/fuyu-8b"),
     "H2OVLChatModel": _HfExamplesInfo("h2oai/h2ovl-mississippi-800m"),
     "InternVLChatModel": _HfExamplesInfo("OpenGVLab/InternVL2-1B",
diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py
index 7a564c1f4a1d0..daece7c93c0ef 100644
--- a/tests/models/test_initialization.py
+++ b/tests/models/test_initialization.py
@@ -26,6 +26,9 @@ def test_can_initialize(model_arch):
 
     # Avoid OOM
     def hf_overrides(hf_config: PretrainedConfig) -> PretrainedConfig:
+        if hf_config.model_type == "deepseek_vl_v2":
+            hf_config.update({"architectures": ["DeepseekVLV2ForCausalLM"]})
+
         if hasattr(hf_config, "text_config"):
             text_config: PretrainedConfig = hf_config.text_config
         else:
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index 923c7459f6948..beedf5d16ab86 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -403,8 +403,8 @@ def _placeholder_str(self, modality: ModalityStr,
             if model_type.startswith("llava"):
                 return self._cached_token_str(self._tokenizer,
                                               hf_config.image_token_index)
-            if model_type in ("chameleon", "internvl_chat", "NVLM_D",
-                              "h2ovl_chat"):
+            if model_type in ("chameleon", "deepseek_vl_v2", "internvl_chat",
+                              "NVLM_D", "h2ovl_chat"):
                 return "<image>"
             if model_type == "mllama":
                 return "<|image|>"
diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py
index 4cf4e6c358bf2..9132040545863 100644
--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -243,7 +243,11 @@ def __init__(
                                         bias=False,
                                         quant_config=quant_config,
                                         prefix=f"{prefix}.o_proj")
-        rope_scaling["rope_type"] = 'deepseek_yarn'
+        if rope_scaling:
+            rope_scaling["rope_type"] = 'deepseek_yarn'
+            self.use_normal_rope = False
+        else:
+            self.use_normal_rope = True
         self.rotary_emb = get_rope(qk_rope_head_dim,
                                    rotary_dim=qk_rope_head_dim,
                                    max_position=max_position_embeddings,
@@ -298,7 +302,18 @@ def forward(
                      self.qk_nope_head_dim + self.v_head_dim)
         k_nope, v = kv.split([self.qk_nope_head_dim, self.v_head_dim], dim=-1)
         k_pe = latent_cache[:, :, self.kv_lora_rank:]
+
+        if self.use_normal_rope:
+            seq_len = positions.size(0)
+            ori_q_pe_shape, ori_k_pe_shape = q_pe.shape, k_pe.shape
+            q_pe = q_pe.reshape(seq_len, -1)
+            k_pe = k_pe.reshape(seq_len, -1)
+
         q_pe, k_pe = self.rotary_emb(positions, q_pe, k_pe)
+
+        if self.use_normal_rope:
+            q_pe, k_pe = q_pe.view(ori_q_pe_shape), k_pe.view(ori_k_pe_shape)
+
         q[..., self.qk_nope_head_dim:] = q_pe
         k = torch.empty_like(q)
         k[..., :self.qk_nope_head_dim] = k_nope
@@ -355,6 +370,7 @@ def __init__(
             quant_config=quant_config,
             prefix=f"{prefix}.self_attn",
         )
+
         if (config.n_routed_experts is not None
                 and layer_idx >= config.first_k_dense_replace
                 and layer_idx % config.moe_layer_freq == 0):
diff --git a/vllm/model_executor/models/deepseek_v3.py b/vllm/model_executor/models/deepseek_v3.py
index d4710622681b5..ca79b14c55fea 100644
--- a/vllm/model_executor/models/deepseek_v3.py
+++ b/vllm/model_executor/models/deepseek_v3.py
@@ -251,7 +251,11 @@ def __init__(
                                         bias=False,
                                         quant_config=quant_config,
                                         prefix=f"{prefix}.o_proj")
-        rope_scaling["rope_type"] = 'deepseek_yarn'
+        if rope_scaling:
+            rope_scaling["rope_type"] = 'deepseek_yarn'
+            self.use_normal_rope = False
+        else:
+            self.use_normal_rope = True
         self.rotary_emb = get_rope(qk_rope_head_dim,
                                    rotary_dim=qk_rope_head_dim,
                                    max_position=max_position_embeddings,
@@ -306,7 +310,18 @@ def forward(
                      self.qk_nope_head_dim + self.v_head_dim)
         k_nope, v = kv.split([self.qk_nope_head_dim, self.v_head_dim], dim=-1)
         k_pe = latent_cache[:, :, self.kv_lora_rank:]
+
+        if self.use_normal_rope:
+            seq_len = positions.size(0)
+            ori_q_pe_shape, ori_k_pe_shape = q_pe.shape, k_pe.shape
+            q_pe = q_pe.reshape(seq_len, -1)
+            k_pe = k_pe.reshape(seq_len, -1)
+
         q_pe, k_pe = self.rotary_emb(positions, q_pe, k_pe)
+
+        if self.use_normal_rope:
+            q_pe, k_pe = q_pe.view(ori_q_pe_shape), k_pe.view(ori_k_pe_shape)
+
         q[..., self.qk_nope_head_dim:] = q_pe
         k = torch.empty_like(q)
         k[..., :self.qk_nope_head_dim] = k_nope
@@ -583,7 +598,8 @@ def load_weights(self, weights: Iterable[Tuple[str,
                 continue
 
             # TODO(simon): support nextn predict layers
-            if self.config.num_nextn_predict_layers > 0:
+            if hasattr(self.config, "num_nextn_predict_layers"
+                       ) and self.config.num_nextn_predict_layers > 0:
                 assert self.config.num_nextn_predict_layers == 1
                 layer_idx = self.config.num_hidden_layers
                 if name.startswith(f"model.layers.{layer_idx}"):
diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py
new file mode 100644
index 0000000000000..99fa941c055d2
--- /dev/null
+++ b/vllm/model_executor/models/deepseek_vl2.py
@@ -0,0 +1,662 @@
+# adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/faf18023f24b962b32d9f0a2d89e402a8d383a78/deepseek_vl2/models/modeling_deepseek_vl_v2.py
+"""Inference-only Deepseek-VL2 model compatible with HuggingFace weights."""
+import math
+from functools import cached_property, partial
+from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple,
+                    TypedDict, Union)
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange, repeat
+from transformers import AutoProcessor, BatchFeature, ProcessorMixin
+
+from vllm.attention import AttentionMetadata
+from vllm.config import VllmConfig
+from vllm.logger import init_logger
+from vllm.model_executor import SamplingMetadata
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
+from vllm.model_executor.model_loader.utils import set_default_torch_dtype
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
+                                    NestedTensors)
+from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
+                                   ImageSize, MultiModalDataItems)
+from vllm.multimodal.processing import (BaseMultiModalProcessor,
+                                        BaseProcessingInfo, PromptReplacement)
+from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
+from vllm.multimodal.utils import cached_get_tokenizer
+from vllm.sequence import IntermediateTensors
+from vllm.transformers_utils.configs.deepseek_vl2 import (DeepseekVLV2Config,
+                                                          MlpProjectorConfig,
+                                                          VisionEncoderConfig)
+from vllm.utils import is_list_of
+
+from .interfaces import SupportsMultiModal, SupportsPP
+from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,
+                    init_vllm_registered_model, maybe_prefix,
+                    merge_multimodal_embeddings)
+
+logger = init_logger(__name__)
+
+# The image token id may be various
+_IMAGE_TOKEN = "<image>"
+
+
+class DeepseekVL2ImagePixelInputs(TypedDict):
+    type: Literal["pixel_values"]
+    data: Union[torch.Tensor, List[torch.Tensor]]
+    """
+    Shape: `(batch_size * num_images, num_channels, height, width)`
+    """
+    images_spatial_crop: torch.Tensor
+    """
+    Shape: `(batch_size * num_images, 2)`
+    """
+
+
+class DeepseekVL2VImageEmbeddingInputs(TypedDict):
+    type: Literal["image_embeds"]
+    data: Union[torch.Tensor, List[torch.Tensor]]
+    """Shape: `(batch_size * num_images, image_feature_size, hidden_size)`
+
+    `hidden_size` must match the hidden size of language model backbone.
+    """
+
+
+DeepseekVL2ImageInputs = Union[DeepseekVL2ImagePixelInputs,
+                               DeepseekVL2VImageEmbeddingInputs]
+
+
+class MlpProjector(nn.Module):
+
+    def __init__(self, cfg: MlpProjectorConfig):
+
+        super().__init__()
+
+        self.cfg = cfg
+        assert not cfg.token_pooling, (
+            "Token pooling is not supported currently.")
+
+        if cfg.projector_type == "downsample_mlp_gelu":
+            mlp_depth = cfg.depth
+            mlp_ratio = cfg.mlp_ratio
+            modules = [
+                nn.Linear(
+                    cfg.input_dim * cfg.downsample_ratio *
+                    cfg.downsample_ratio, cfg.n_embed * mlp_ratio)
+            ]
+            for _ in range(1, mlp_depth - 1):
+                modules.append(nn.GELU())
+                modules.append(
+                    nn.Linear(cfg.n_embed * mlp_ratio,
+                              cfg.n_embed * mlp_ratio))
+            modules.append(nn.GELU())
+            modules.append(nn.Linear(cfg.n_embed * mlp_ratio, cfg.n_embed))
+            modules = nn.Sequential(*modules)
+
+        else:
+            raise NotImplementedError(
+                f"Unsupported projector type: {cfg.projector_type}")
+
+        self.layers = modules
+
+    def forward(self, x):
+        bs, hw, input_dim = x.shape
+        h = w = int((hw)**0.5)
+        """compute padding"""
+        if h % self.cfg.downsample_ratio:
+            pad = self.cfg.downsample_ratio - h % self.cfg.downsample_ratio
+        else:
+            pad = 0
+        x = x.reshape(bs, h, w, input_dim)
+        if pad > 0:
+            x = F.pad(x, (0, 0, 0, pad, 0, pad), "constant", 0)
+        """4 to 1 concat"""
+        x = x.permute(0, 3, 1, 2)  # B, C, H, W
+        x = F.unfold(x,
+                     kernel_size=self.cfg.downsample_ratio,
+                     stride=self.cfg.downsample_ratio,
+                     padding=0)  # B, C*4, HW // 4
+        x = x.permute(0, 2, 1)
+
+        return self.layers(x)
+
+
+class DeepseekVL2ProcessingInfo(BaseProcessingInfo):
+
+    def get_hf_config(self):
+        return self.ctx.get_hf_config(DeepseekVLV2Config)
+
+    def get_hf_processor(self) -> ProcessorMixin:
+        # TODO(Isotr0py): we should get rid of dependency on deepseek_vl2
+        # in the future, because it's flasky and lack of maintenance.
+        try:
+            from deepseek_vl2.models.processing_deepseek_vl_v2 import (
+                DeepseekVLV2Processor, select_best_resolution)
+            AutoProcessor.register("DeepseekVLV2Processor",
+                                   DeepseekVLV2Processor)
+        except ModuleNotFoundError as exc:
+            raise ModuleNotFoundError(
+                "You need to `pip install "
+                "git+https://github.com/deepseek-ai/DeepSeek-VL2.git` "
+                "to use this model") from exc
+
+        processor = self.ctx.get_hf_processor(DeepseekVLV2Processor)
+        processor.select_best_resolution = partial(
+            select_best_resolution,
+            candidate_resolutions=processor.candidate_resolutions)
+        return processor
+
+    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+        return {"image": None}
+
+    def get_num_image_tokens(self, *, image_width: int,
+                             image_height: int) -> int:
+        hf_processor = self.get_hf_processor()
+        image_size = hf_processor.image_size
+        patch_size = hf_processor.patch_size
+        downsample_ratio = hf_processor.downsample_ratio
+
+        best_width, best_height = hf_processor.select_best_resolution(
+            (image_width, image_height))
+
+        num_width_tiles, num_height_tiles = (best_width // image_size,
+                                             best_height // image_size)
+        h = w = math.ceil((image_size // patch_size) / downsample_ratio)
+
+        global_views_tokens = h * (w + 1)
+        local_views_tokens = (num_height_tiles * h) * (num_width_tiles * w + 1)
+        return global_views_tokens + local_views_tokens + 1
+
+    def get_image_size_with_most_features(self) -> ImageSize:
+        hf_config = self.get_hf_config()
+        candidate_resolutions = hf_config.candidate_resolutions
+        height, width = max(candidate_resolutions,
+                            key=lambda x: self.get_num_image_tokens(
+                                image_width=x[1], image_height=x[0]))
+        return ImageSize(width=width, height=height)
+
+    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
+        max_image_size = self.get_image_size_with_most_features()
+        max_image_tokens = self.get_num_image_tokens(
+            image_height=max_image_size.height,
+            image_width=max_image_size.width)
+
+        return {"image": max_image_tokens}
+
+
+class DeepseekVL2DummyInputsBuilder(
+        BaseDummyInputsBuilder[DeepseekVL2ProcessingInfo]):
+
+    def get_dummy_processor_inputs(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> ProcessorInputs:
+        num_images = mm_counts.get("image", 0)
+        hf_processor = self.info.get_hf_processor()
+        image_token: str = hf_processor.image_token
+
+        max_image_size = self.info.get_image_size_with_most_features()
+
+        mm_data = {
+            "image":
+            self._get_dummy_images(width=max_image_size.width,
+                                   height=max_image_size.height,
+                                   num_images=num_images)
+        }
+
+        return ProcessorInputs(
+            prompt_text=image_token * num_images,
+            mm_data=mm_data,
+        )
+
+
+class DeepseekVL2MultiModalProcessor(
+        BaseMultiModalProcessor[DeepseekVL2ProcessingInfo]):
+
+    def _call_hf_processor(
+        self,
+        prompt: str,
+        mm_data: Mapping[str, object],
+        mm_kwargs: Mapping[str, object],
+    ) -> BatchFeature:
+        if mm_data:
+            outputs = self.info.ctx.call_hf_processor(
+                self.info.get_hf_processor(**mm_kwargs),
+                dict(prompt=prompt, **mm_data),
+                mm_kwargs,
+            )
+
+            # Deepseek-vl2 processor don't return BatchFeature,
+            # we need to manually create it
+            processed_outputs = dict(input_ids=outputs["input_ids"])
+            processed_outputs = BatchFeature(data=dict(processed_outputs),
+                                             tensor_type="pt")
+
+            # Remove batch dimension from processor outputs,
+            # because we will try batch to create NestedTensors
+            target_dtype = self.info.ctx.model_config.dtype
+            pixel_values = outputs["images"].to(target_dtype).squeeze(0)
+            images_spatial_crop = outputs["images_spatial_crop"].squeeze(0)
+            patches_per_image = [
+                x.prod().item() + 1 for x in images_spatial_crop
+            ]
+
+            # Rename `images` -> `pixel_values` to avoid confusion
+            processed_outputs["pixel_values"] = list(
+                pixel_values.split(patches_per_image))
+            processed_outputs["images_spatial_crop"] = images_spatial_crop
+        else:
+            tokenizer = self.info.get_tokenizer()
+            processed_outputs = tokenizer(prompt,
+                                          add_special_tokens=True,
+                                          return_tensors="pt")
+
+        return processed_outputs
+
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        return dict(
+            pixel_values=MultiModalFieldConfig.batched("image"),
+            images_spatial_crop=MultiModalFieldConfig.batched("image"),
+            image_embeds=MultiModalFieldConfig.batched("image"),
+        )
+
+    def _get_prompt_replacements(
+        self,
+        mm_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, object],
+        out_mm_kwargs: MultiModalKwargs,
+    ) -> list[PromptReplacement]:
+        hf_processor = self.info.get_hf_processor()
+        image_token_id: int = hf_processor.image_token_id
+
+        def get_replacement_deepseek_vl2(item_idx: int):
+            images = mm_items.get_items(
+                "image", (ImageEmbeddingItems, ImageProcessorItems))
+
+            if isinstance(images, ImageEmbeddingItems):
+                num_image_tokens = images.get_feature_size(item_idx)
+            else:
+                image_size = images.get_image_size(item_idx)
+
+                num_image_tokens = self.info.get_num_image_tokens(
+                    image_width=image_size.width,
+                    image_height=image_size.height,
+                )
+            return [image_token_id] * num_image_tokens
+
+        return [
+            PromptReplacement(
+                modality="image",
+                target=[image_token_id],
+                replacement=get_replacement_deepseek_vl2,
+            )
+        ]
+
+
+@MULTIMODAL_REGISTRY.register_processor(
+    DeepseekVL2MultiModalProcessor,
+    info=DeepseekVL2ProcessingInfo,
+    dummy_inputs=DeepseekVL2DummyInputsBuilder)
+class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
+
+    hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={
+        "language.": "language_model.",
+    })
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+        config: DeepseekVLV2Config = vllm_config.model_config.hf_config
+        quant_config = vllm_config.quant_config
+        multimodal_config = vllm_config.model_config.multimodal_config
+
+        self.config = config
+        self.multimodal_config = multimodal_config
+
+        self.vision_config = config.vision_config
+        self.projector_config = config.projector_config
+        self.text_config = config.text_config
+
+        model_config = vllm_config.model_config
+        tokenizer = cached_get_tokenizer(
+            model_config.tokenizer,
+            tokenizer_mode=model_config.tokenizer_mode,
+            tokenizer_revision=model_config.tokenizer_revision,
+            trust_remote_code=model_config.trust_remote_code,
+        )
+        self.image_token_id = tokenizer.vocab.get(_IMAGE_TOKEN)
+
+        self.vision = self._init_vision_module(self.vision_config,
+                                               quant_config,
+                                               maybe_prefix(prefix, "vision"))
+
+        self.projector = MlpProjector(self.projector_config)
+        self.tile_tag = config.tile_tag
+        self.global_view_pos = config.global_view_pos
+
+        # special token for image token sequence format
+        embed_std = 1 / torch.sqrt(
+            torch.tensor(self.projector_config.n_embed, dtype=torch.float32))
+        if self.tile_tag == "2D":
+            # <|view_separator|>, <|\n|>
+            self.image_newline = nn.Parameter(
+                torch.randn(self.projector_config.n_embed) * embed_std)
+            # This is a typo in original implementation
+            self.view_seperator = nn.Parameter(
+                torch.randn(self.projector_config.n_embed) * embed_std)
+        else:
+            raise ValueError(
+                f"Only 2D tile_tag is supported currently, got: {self.tile_tag}"
+            )
+
+        self.language_model = init_vllm_registered_model(
+            vllm_config=vllm_config,
+            hf_config=self.text_config,
+            prefix=maybe_prefix(prefix, "language"),
+            architectures=["DeepseekV3ForCausalLM"]
+            if self.text_config.topk_method == "noaux_tc" else
+            ["DeepseekV2ForCausalLM"],
+        )
+
+        self.make_empty_intermediate_tensors = (
+            self.language_model.make_empty_intermediate_tensors)
+
+    def _init_vision_module(
+        self,
+        vision_config: VisionEncoderConfig,
+        quant_config: Optional[QuantizationConfig],
+        prefix: str = "",
+    ) -> nn.Module:
+        # TODO: refactor vision model through timm wrapper from transformers
+        try:
+            import timm
+        except ImportError:
+            raise ImportError("Please install timm") from ImportError
+
+        with set_default_torch_dtype(torch.float16):
+            model = timm.create_model(
+                "vit_so400m_patch14_siglip_384.webli",
+                pretrained=False,
+                num_classes=0,
+                dynamic_img_size=True,
+                dynamic_img_pad=True,
+            )
+
+        model = model.to(dtype=torch.get_default_dtype())
+        return model
+
+    @cached_property
+    def sampler(self):
+        if hasattr(self.language_model, "sampler"):
+            return self.language_model.sampler
+
+        return get_sampler()
+
+    def _validate_pixel_values(
+        self, data: Union[torch.Tensor, List[torch.Tensor]]
+    ) -> Union[torch.Tensor, List[torch.Tensor]]:
+
+        h = w = self.vision_config.image_size
+        expected_dims = (3, h, w)
+
+        def _validate_shape(d: torch.Tensor):
+            actual_dims = tuple(d.shape[1:])
+
+            if actual_dims != expected_dims:
+                expected_expr = ("num_patches", *map(str, expected_dims))
+                raise ValueError(
+                    "The expected shape of pixel values per image per batch "
+                    f"is {expected_expr}. You supplied {tuple(d.shape)}.")
+
+        for d in data:
+            _validate_shape(d)
+
+        return data
+
+    def _validate_images_spatial_crop(
+        self, data: Union[torch.Tensor, List[torch.Tensor]]
+    ) -> Union[torch.Tensor, List[torch.Tensor]]:
+        expected_dims = 2
+
+        def _validate_shape(d: torch.Tensor):
+            actual_dims = d.size(-1)
+
+            if actual_dims != expected_dims:
+                expected_expr = str(expected_dims)
+                raise ValueError(
+                    f"The expected shape of image sizes per image per batch "
+                    f"is {expected_expr}. You supplied {tuple(d.shape)}.")
+
+        for d in data:
+            _validate_shape(d)
+
+        return data
+
+    def _parse_and_validate_image_input(
+            self, **kwargs: object) -> Optional[DeepseekVL2ImageInputs]:
+        pixel_values = kwargs.pop("pixel_values", None)
+        images_spatial_crop = kwargs.pop("images_spatial_crop", None)
+        image_embeds = kwargs.pop("image_embeds", None)
+
+        if pixel_values is None and image_embeds is None:
+            return None
+
+        if pixel_values is not None:
+            if not isinstance(pixel_values, (torch.Tensor, list)):
+                raise ValueError("Incorrect type of pixel values. "
+                                 f"Got type: {type(pixel_values)}")
+
+            if not isinstance(images_spatial_crop, (torch.Tensor, list)):
+                raise ValueError("Incorrect type of image sizes. "
+                                 f"Got type: {type(images_spatial_crop)}")
+
+            return DeepseekVL2ImagePixelInputs(
+                type="pixel_values",
+                data=self._validate_pixel_values(flatten_bn(pixel_values)),
+                images_spatial_crop=self._validate_images_spatial_crop(
+                    flatten_bn(images_spatial_crop, concat=True)))
+
+        if image_embeds is not None:
+            if not isinstance(image_embeds, torch.Tensor):
+                raise ValueError("Incorrect type of image embeddings. "
+                                 f"Got type: {type(image_embeds)}")
+
+            return DeepseekVL2VImageEmbeddingInputs(
+                type="image_embeds",
+                data=flatten_bn(image_embeds),
+            )
+
+        raise AssertionError("This line should be unreachable.")
+
+    def _pixel_values_to_embedding(
+        self,
+        pixel_values: NestedTensors,
+        images_spatial_crop: torch.Tensor,
+    ) -> NestedTensors:
+        # Pixel_values: n_image * batch_size * [patch_per_img, 3, height, width]
+        total_tiles = [x for x in pixel_values]
+
+        # [batch_all_tiles, 3, height, width]
+        total_tiles = torch.cat(total_tiles, dim=0)
+
+        # [batch_all_tiles, vit_seq_len, c]
+        images_feature = self.vision.forward_features(total_tiles)
+
+        # [batch_all_tiles, hw, D]
+        images_embeds = self.projector(images_feature)
+
+        _, hw, n_dim = images_embeds.shape
+        h = w = int(hw**0.5)
+
+        # 根据self.tile_tag & self.global_view_pos填充image token sequence
+        tile_index = 0
+        vision_embeddings = []
+        for jdx in range(images_spatial_crop.size(0)):
+            # extra global & local features
+            num_width_tiles, num_height_tiles = images_spatial_crop[jdx]
+            if num_width_tiles == 0 or num_height_tiles == 0:
+                break
+            num_tiles_in_image = num_width_tiles * num_height_tiles
+
+            # [hw, D]
+            global_features = images_embeds[tile_index]
+
+            # [num_height_tiles * num_width_tiles, hw, D]
+            local_features = images_embeds[tile_index + 1:tile_index + 1 +
+                                           num_tiles_in_image]
+            tile_index += num_tiles_in_image + 1
+
+            # format global and local features
+            # ----------------- global view add newline -----------------
+            # [hw, D] -> [h, w, D]
+            global_features = global_features.view(h, w, n_dim)
+
+            # [D]     -> [h, 1, D]
+            new_lines_in_global = repeat(self.image_newline, "d -> h 1 d", h=h)
+
+            # cat([h, w, D], [h, 1, D], dim=1) -> [h, w + 1, D]
+            global_features = torch.cat([global_features, new_lines_in_global],
+                                        dim=1)
+
+            # [h, w + 1, D] -> [h * (w + 1), D]
+            global_features = global_features.view(-1, n_dim)
+
+            # ----------------- local view add newline -----------------
+            # [num_height_tiles * num_width_tiles, h * w, D] ->
+            # [num_height_tiles * h, num_width_tiles * w, D]
+            local_features = rearrange(local_features,
+                                       "(th tw) (h w) d -> (th h) (tw w) d",
+                                       th=num_height_tiles,
+                                       tw=num_width_tiles,
+                                       h=h,
+                                       w=w)
+
+            # [D] -> [num_height_tiles * h, 1, D]
+            new_lines_in_local = repeat(self.image_newline,
+                                        "d -> (th h) 1 d",
+                                        th=num_height_tiles,
+                                        h=h)
+
+            # [num_height_tiles * h, num_width_tiles * w + 1, D]
+            local_features = torch.cat([local_features, new_lines_in_local],
+                                       dim=1)
+
+            # [num_height_tiles * h, num_width_tiles * w + 1, D]
+            #   --> [(num_height_tiles * h) * (num_width_tiles * w + 1), D]
+            local_features = local_features.view(-1, n_dim)
+
+            # merge global and local tiles
+            if self.global_view_pos == "head":
+                global_local_features = torch.cat([
+                    global_features,
+                    self.view_seperator[None, :],
+                    local_features,
+                ])
+            else:
+                global_local_features = torch.cat([
+                    local_features,
+                    self.view_seperator[None, :],
+                    global_features,
+                ])
+
+            vision_embeddings.append(global_local_features)
+        return vision_embeddings
+
+    def _process_image_input(
+            self, image_input: DeepseekVL2ImageInputs) -> torch.Tensor:
+        if image_input["type"] == "image_embeds":
+            image_data = image_input["data"]
+            if is_list_of(image_data, torch.Tensor):
+                # it's already a list of tensors
+                return image_data
+            if len(image_data.shape) == 3:
+                # 3D tensor
+                return list(torch.unbind(image_data, dim=0))
+            raise ValueError(
+                "We expect batched 2D tensors;"
+                "this can be either a list of 2D tensors or a single 3D tensor."
+            )
+
+        pixel_values = image_input["data"]
+        images_spatial_crop = image_input["images_spatial_crop"]
+
+        return self._pixel_values_to_embedding(
+            pixel_values=pixel_values, images_spatial_crop=images_spatial_crop)
+
+    def get_multimodal_embeddings(self, **kwargs: object) -> torch.Tensor:
+        image_input = self._parse_and_validate_image_input(**kwargs)
+        if image_input is None:
+            return None
+        vision_embeddings = self._process_image_input(image_input)
+        return vision_embeddings
+
+    def get_input_embeddings(
+        self,
+        input_ids: torch.Tensor,
+        multimodal_embeddings: Optional[NestedTensors] = None,
+    ) -> torch.Tensor:
+        inputs_embeds = self.language_model.get_input_embeddings(input_ids)
+        if multimodal_embeddings is not None:
+            inputs_embeds = merge_multimodal_embeddings(
+                input_ids, inputs_embeds, multimodal_embeddings,
+                self.image_token_id)
+        return inputs_embeds
+
+    def forward(self,
+                input_ids: torch.Tensor,
+                positions: torch.Tensor,
+                kv_caches: List[torch.Tensor],
+                attn_metadata: AttentionMetadata,
+                intermediate_tensors: Optional[IntermediateTensors] = None,
+                inputs_embeds: Optional[torch.Tensor] = None,
+                **kwargs: object):
+
+        if intermediate_tensors is not None:
+            inputs_embeds = None
+
+        # NOTE: In v1, inputs_embeds is always generated at model runner, this
+        # condition is for v0 compatibility
+        elif inputs_embeds is None:
+            vision_embeddings = self.get_multimodal_embeddings(**kwargs)
+            inputs_embeds = self.get_input_embeddings(input_ids,
+                                                      vision_embeddings)
+            input_ids = None
+
+        hidden_states = self.language_model(input_ids,
+                                            positions,
+                                            kv_caches,
+                                            attn_metadata,
+                                            intermediate_tensors,
+                                            inputs_embeds=inputs_embeds)
+
+        return hidden_states
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[torch.Tensor]:
+        return self.language_model.compute_logits(hidden_states,
+                                                  sampling_metadata)
+
+    def sample(
+        self,
+        logits: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[SamplerOutput]:
+        return self.language_model.sample(logits, sampling_metadata)
+
+    def load_weights(self, weights: Iterable[Tuple[str,
+                                                   torch.Tensor]]) -> Set[str]:
+
+        loader = AutoWeightsLoader(self)
+        autoloaded_weights = loader.load_weights(weights,
+                                                 mapper=self.hf_to_vllm_mapper)
+        return autoloaded_weights
diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py
index 8f36437d47d9e..ff7dab89e4da8 100644
--- a/vllm/model_executor/models/minicpmv.py
+++ b/vllm/model_executor/models/minicpmv.py
@@ -657,7 +657,7 @@ def init_vision_module(
         quant_config: Optional[QuantizationConfig],
         prefix: str = "",
     ) -> nn.Module:
-        # TODO: refactor this vision model
+        # TODO: refactor vision model through timm wrapper from transformers
         try:
             import timm
         except ImportError:
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index 62840b8c1bcda..a7286a9203f67 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -149,6 +149,7 @@
     "ChameleonForConditionalGeneration": ("chameleon", "ChameleonForConditionalGeneration"),  # noqa: E501
     "ChatGLMModel": ("chatglm", "ChatGLMForCausalLM"),
     "ChatGLMForConditionalGeneration": ("chatglm", "ChatGLMForCausalLM"),
+    "DeepseekVLV2ForCausalLM": ("deepseek_vl2", "DeepseekVLV2ForCausalLM"),
     "FuyuForCausalLM": ("fuyu", "FuyuForCausalLM"),
     "H2OVLChatModel": ("h2ovl", "H2OVLChatModel"),
     "InternVLChatModel": ("internvl", "InternVLChatModel"),
diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index 58417980e7b47..c97acffa1a719 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -23,8 +23,9 @@
 # yapf conflicts with isort for this block
 # yapf: disable
 from vllm.transformers_utils.configs import (ChatGLMConfig, Cohere2Config,
-                                             DbrxConfig, EAGLEConfig,
-                                             ExaoneConfig, H2OVLChatConfig,
+                                             DbrxConfig, DeepseekVLV2Config,
+                                             EAGLEConfig, ExaoneConfig,
+                                             H2OVLChatConfig,
                                              InternVLChatConfig, JAISConfig,
                                              MedusaConfig, MllamaConfig,
                                              MLPSpeculatorConfig, MPTConfig,
@@ -54,6 +55,7 @@
     "chatglm": ChatGLMConfig,
     "cohere2": Cohere2Config,
     "dbrx": DbrxConfig,
+    "deepseek_vl_v2": DeepseekVLV2Config,
     "mpt": MPTConfig,
     "RefinedWeb": RWConfig,  # For tiiuae/falcon-40b(-instruct)
     "RefinedWebModel": RWConfig,  # For tiiuae/falcon-7b(-instruct)
diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py
index a41a35c88b3a1..f065c56124605 100644
--- a/vllm/transformers_utils/configs/__init__.py
+++ b/vllm/transformers_utils/configs/__init__.py
@@ -1,6 +1,7 @@
 from vllm.transformers_utils.configs.chatglm import ChatGLMConfig
 from vllm.transformers_utils.configs.cohere2 import Cohere2Config
 from vllm.transformers_utils.configs.dbrx import DbrxConfig
+from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekVLV2Config
 from vllm.transformers_utils.configs.eagle import EAGLEConfig
 from vllm.transformers_utils.configs.exaone import ExaoneConfig
 # RWConfig is for the original tiiuae/falcon-40b(-instruct) and
@@ -25,6 +26,7 @@
     "ChatGLMConfig",
     "Cohere2Config",
     "DbrxConfig",
+    "DeepseekVLV2Config",
     "MPTConfig",
     "RWConfig",
     "H2OVLChatConfig",
diff --git a/vllm/transformers_utils/configs/deepseek_vl2.py b/vllm/transformers_utils/configs/deepseek_vl2.py
new file mode 100644
index 0000000000000..681528c3c0116
--- /dev/null
+++ b/vllm/transformers_utils/configs/deepseek_vl2.py
@@ -0,0 +1,214 @@
+# adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/faf18023f24b962b32d9f0a2d89e402a8d383a78/deepseek_vl2/models/modeling_deepseek_vl_v2.py#L115-L268
+from typing import Tuple
+
+from transformers.configuration_utils import PretrainedConfig
+
+
+class VisionEncoderConfig(PretrainedConfig):
+    model_type: str = "vision"
+
+    model_name: str = "vit_so400m_patch14_siglip_384.webli"
+    image_size: int = 384
+    patch_size: int = 16
+    width: int = 1024
+    layers: int = 24
+    heads: int = 16
+    mlp_ratio: int = 4
+    global_pool: str = "map"
+    ignore_head: bool = True
+    class_token: bool = False
+    num_classes: int = 0
+    use_checkpoint: bool = False
+    weight_init: str = "skip"
+    deterministic: bool = False
+    num_recomputing_layers: int = 0
+
+    def __init__(self,
+                 model_name: str = "vit_so400m_patch14_siglip_384.webli",
+                 image_size: int = 384,
+                 patch_size: int = 16,
+                 width: int = 1024,
+                 layers: int = 24,
+                 heads: int = 16,
+                 mlp_ratio: int = 4,
+                 global_pool: str = "map",
+                 ignore_head: bool = True,
+                 class_token: bool = False,
+                 num_classes: int = 0,
+                 use_checkpoint: bool = False,
+                 **kwargs):
+        self.model_name = model_name
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.width = width
+        self.layers = layers
+        self.heads = heads
+        self.mlp_ratio = mlp_ratio
+        self.global_pool = global_pool
+        self.ignore_head = ignore_head
+        self.class_token = class_token
+        self.num_classes = num_classes
+        self.use_checkpoint = use_checkpoint
+
+        super().__init__(**kwargs)
+
+
+class MlpProjectorConfig(PretrainedConfig):
+    model_type = "mlp_projector"
+    projector_type: str = "downsample_mlp_gelu"
+    input_dim: int = 1152
+    n_embed: int = 2048
+    depth: int = 2
+    mlp_ratio: int = 1
+    downsample_ratio: int = 2
+    token_pooling: bool = False
+
+    def __init__(self,
+                 projector_type: str = "downsample_mlp_gelu",
+                 input_dim: int = 1152,
+                 n_embed: int = 2048,
+                 depth: int = 2,
+                 mlp_ratio: int = 1,
+                 downsample_ratio: int = 2,
+                 **kwargs):
+        self.projector_type = projector_type
+        self.input_dim = input_dim
+        self.n_embed = n_embed
+        self.depth = depth
+        self.mlp_ratio = mlp_ratio
+        self.downsample_ratio = downsample_ratio
+
+        super().__init__(**kwargs)
+
+
+class DeepseekV2Config(PretrainedConfig):
+
+    model_type = "deepseek_v2"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=102400,
+        hidden_size=4096,
+        intermediate_size=11008,
+        moe_intermediate_size=1407,
+        num_hidden_layers=30,
+        num_attention_heads=32,
+        num_key_value_heads=32,
+        n_shared_experts=None,
+        n_routed_experts=None,
+        ep_size=1,
+        routed_scaling_factor=1.0,
+        kv_lora_rank=512,
+        q_lora_rank=1536,
+        qk_rope_head_dim=64,
+        v_head_dim=128,
+        qk_nope_head_dim=128,
+        topk_method='gready',
+        n_group=None,
+        topk_group=None,
+        num_experts_per_tok=None,
+        moe_layer_freq=1,
+        first_k_dense_replace=0,
+        norm_topk_prob=False,
+        scoring_func='softmax',
+        aux_loss_alpha=0.001,
+        seq_aux=True,
+        hidden_act="silu",
+        max_position_embeddings=2048,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        pad_token_id=None,
+        bos_token_id=100000,
+        eos_token_id=100001,
+        pretraining_tp=1,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        attention_bias=False,
+        attention_dropout=0.0,
+        use_mla=True,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.moe_intermediate_size = moe_intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.n_shared_experts = n_shared_experts
+        self.n_routed_experts = n_routed_experts
+        self.ep_size = ep_size
+        self.routed_scaling_factor = routed_scaling_factor
+        self.kv_lora_rank = kv_lora_rank
+        self.q_lora_rank = q_lora_rank
+        self.qk_rope_head_dim = qk_rope_head_dim
+        self.v_head_dim = v_head_dim
+        self.qk_nope_head_dim = qk_nope_head_dim
+        self.topk_method = topk_method
+        self.n_group = n_group
+        self.topk_group = topk_group
+        self.num_experts_per_tok = num_experts_per_tok
+        self.moe_layer_freq = moe_layer_freq
+        self.first_k_dense_replace = first_k_dense_replace
+        self.norm_topk_prob = norm_topk_prob
+        self.scoring_func = scoring_func
+        self.aux_loss_alpha = aux_loss_alpha
+        self.seq_aux = seq_aux
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = float(rms_norm_eps)
+        self.pretraining_tp = pretraining_tp
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        self.use_mla = use_mla
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+
+class DeepseekVLV2Config(PretrainedConfig):
+    model_type = "deepseek_vl_v2"
+    vision_config: VisionEncoderConfig
+    projector_config: MlpProjectorConfig
+
+    tile_tag: str = "2D"
+    global_view_pos: str = "head"
+    candidate_resolutions: Tuple[Tuple[int, int]] = ((384, 384), )
+
+    def __init__(self,
+                 tile_tag: str = "tile_tag",
+                 global_view_pos: str = "head",
+                 candidate_resolutions: Tuple[Tuple[int,
+                                                    int]] = ((384, 384), ),
+                 **kwargs):
+        super().__init__(**kwargs)
+
+        vision_config = kwargs.get("vision_config", {})
+        self.vision_config = VisionEncoderConfig(**vision_config)
+
+        projector_config = kwargs.get("projector_config", {})
+        self.projector_config = MlpProjectorConfig(**projector_config)
+
+        language_config = kwargs.get("language_config", {})
+        self.text_config = DeepseekV2Config(**language_config)
+
+        self.tile_tag = tile_tag
+        self.global_view_pos = global_view_pos
+        self.candidate_resolutions = candidate_resolutions
+        self.vocab_size = self.text_config.vocab_size

From 8bddb735123204872788a8ffe117321de7550e6c Mon Sep 17 00:00:00 2001
From: Akshat Tripathi <Akshat.tripathi6568@gmail.com>
Date: Sun, 12 Jan 2025 13:01:52 +0000
Subject: [PATCH 185/462] [Hardware][CPU] Multi-LoRA implementation for the CPU
 backend (#11100)

Signed-off-by: Akshat Tripathi <akshat@krai.ai>
Signed-off-by: Oleg Mosalov <oleg@krai.ai>
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Oleg Mosalov <oleg@krai.ai>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
---
 .buildkite/run-cpu-test.sh                    |   6 +
 docs/source/features/compatibility_matrix.md  |   2 +-
 tests/lora/conftest.py                        |  32 +-
 tests/lora/test_layers.py                     |  39 +-
 tests/lora/test_lora_manager.py               |  21 +-
 tests/lora/test_mixtral.py                    |   4 +-
 ...nica_sizes.py => test_punica_ops_sizes.py} | 124 ++++---
 ...iation.py => test_punica_ops_variation.py} | 134 ++++---
 tests/lora/test_quant_model.py                |   3 +-
 tests/lora/utils.py                           |  27 --
 vllm/executor/cpu_executor.py                 |   3 -
 vllm/lora/ops/torch_ops/__init__.py           |  13 +
 vllm/lora/ops/torch_ops/lora_ops.py           | 113 ++++++
 vllm/lora/ops/triton_ops/__init__.py          |  13 +
 vllm/lora/ops/{ => triton_ops}/bgmv_expand.py |   0
 .../ops/{ => triton_ops}/bgmv_expand_slice.py |   0
 vllm/lora/ops/{ => triton_ops}/bgmv_shrink.py |   0
 vllm/lora/ops/{ => triton_ops}/sgmv_expand.py |   0
 vllm/lora/ops/{ => triton_ops}/sgmv_shrink.py |   0
 vllm/lora/ops/{ => triton_ops}/utils.py       |   0
 vllm/lora/punica_wrapper/punica_cpu.py        | 346 ++++++++++++++++++
 vllm/lora/punica_wrapper/punica_gpu.py        |  10 +-
 vllm/lora/punica_wrapper/punica_selector.py   |   5 +
 vllm/worker/cpu_model_runner.py               | 133 ++++++-
 vllm/worker/cpu_worker.py                     |  20 +-
 25 files changed, 855 insertions(+), 193 deletions(-)
 rename tests/lora/{test_punica_sizes.py => test_punica_ops_sizes.py} (77%)
 rename tests/lora/{test_punica_variation.py => test_punica_ops_variation.py} (74%)
 create mode 100644 vllm/lora/ops/torch_ops/__init__.py
 create mode 100644 vllm/lora/ops/torch_ops/lora_ops.py
 create mode 100644 vllm/lora/ops/triton_ops/__init__.py
 rename vllm/lora/ops/{ => triton_ops}/bgmv_expand.py (100%)
 rename vllm/lora/ops/{ => triton_ops}/bgmv_expand_slice.py (100%)
 rename vllm/lora/ops/{ => triton_ops}/bgmv_shrink.py (100%)
 rename vllm/lora/ops/{ => triton_ops}/sgmv_expand.py (100%)
 rename vllm/lora/ops/{ => triton_ops}/sgmv_shrink.py (100%)
 rename vllm/lora/ops/{ => triton_ops}/utils.py (100%)
 create mode 100644 vllm/lora/punica_wrapper/punica_cpu.py

diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh
index 4ae66f6f3215a..9925db7bea593 100644
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -75,6 +75,12 @@ function cpu_tests() {
       --num-prompts 20 \
       --endpoint /v1/completions \
       --tokenizer facebook/opt-125m"
+
+  # Run multi-lora tests
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
+    set -e
+    pytest -s -v \
+    tests/lora/test_qwen2vl.py"
 }
 
 # All of CPU tests are expected to be finished less than 25 mins.
diff --git a/docs/source/features/compatibility_matrix.md b/docs/source/features/compatibility_matrix.md
index 8d8f7dca2e5b5..ea1d545ff3d73 100644
--- a/docs/source/features/compatibility_matrix.md
+++ b/docs/source/features/compatibility_matrix.md
@@ -359,7 +359,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar
      - ✅
      - ✅
      - ✅
-     - [✗](gh-pr:4830)
+     - ✅
      - ✅
    * - <abbr title="Prompt Adapter">prmpt adptr</abbr>
      - ✅
diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py
index 57ebaa424fc59..e7378d00765f0 100644
--- a/tests/lora/conftest.py
+++ b/tests/lora/conftest.py
@@ -21,6 +21,7 @@
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.model_executor.model_loader import get_model
+from vllm.platforms import current_platform
 
 
 class ContextIDInfo(TypedDict):
@@ -65,13 +66,16 @@ def cleanup_fixture(should_do_global_cleanup_after_test: bool):
 @pytest.fixture
 def dist_init():
     temp_file = tempfile.mkstemp()[1]
-    init_distributed_environment(
-        world_size=1,
-        rank=0,
-        distributed_init_method=f"file://{temp_file}",
-        local_rank=0,
-        backend="nccl",
-    )
+
+    backend = "nccl"
+    if current_platform.is_cpu():
+        backend = "gloo"
+
+    init_distributed_environment(world_size=1,
+                                 rank=0,
+                                 distributed_init_method=f"file://{temp_file}",
+                                 local_rank=0,
+                                 backend=backend)
     initialize_model_parallel(1, 1)
     yield
     cleanup_dist_env_and_memory(shutdown_ray=True)
@@ -81,13 +85,15 @@ def dist_init():
 def dist_init_torch_only():
     if torch.distributed.is_initialized():
         return
+    backend = "nccl"
+    if current_platform.is_cpu():
+        backend = "gloo"
+
     temp_file = tempfile.mkstemp()[1]
-    torch.distributed.init_process_group(
-        backend="nccl",
-        world_size=1,
-        rank=0,
-        init_method=f"file://{temp_file}",
-    )
+    torch.distributed.init_process_group(world_size=1,
+                                         rank=0,
+                                         init_method=f"file://{temp_file}",
+                                         backend=backend)
 
 
 @pytest.fixture
diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py
index fb8c0b2a7ba26..08a589d7ee29c 100644
--- a/tests/lora/test_layers.py
+++ b/tests/lora/test_layers.py
@@ -48,10 +48,14 @@
     torch.float32: (5e-3, 5e-3),
     torch.bfloat16: (3e-2, 2e-2),
 }
-# TODO: Modify this based on platform
-DEVICES = [
+
+pytestmark = pytest.mark.skipif(
+    not (current_platform.is_cuda_alike() or current_platform.is_cpu()),
+    reason="Backend not supported")
+
+DEVICES = ([
     f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
-]
+] if current_platform.is_cuda_alike() else ["cpu"])
 
 #For GPU, we will launch different triton kernels between the prefill and decode
 # stages, so we need to verify this. prefill stage(True) or decode stage(False)
@@ -198,6 +202,10 @@ def check_punica_wrapper(punica_wrapper) -> bool:
         from vllm.lora.punica_wrapper.punica_gpu import PunicaWrapperGPU
 
         return type(punica_wrapper) is PunicaWrapperGPU
+    elif current_platform.is_cpu():
+        from vllm.lora.punica_wrapper.punica_cpu import PunicaWrapperCPU
+
+        return type(punica_wrapper) is PunicaWrapperCPU
     else:
         return False
 
@@ -211,7 +219,8 @@ def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None:
     # For multi-GPU testing of Triton kernel, we must explicitly set the CUDA
     # device, see: https://github.com/triton-lang/triton/issues/2925
     # Same below.
-    torch.cuda.set_device(device)
+    if current_platform.is_cuda_alike():
+        torch.cuda.set_device(device)
 
     torch.set_default_device(device)
     max_loras = 8
@@ -313,7 +322,9 @@ def create_random_embedding_layer():
 def test_embeddings_with_new_embeddings(dist_init, num_loras, device,
                                         vocab_size, stage) -> None:
 
-    torch.cuda.set_device(device)
+    if current_platform.is_cuda_alike():
+        torch.cuda.set_device(device)
+
     torch.set_default_device(device)
     max_loras = 8
     punica_wrapper = get_punica_wrapper(8192, 256, device)
@@ -450,7 +461,9 @@ def create_random_embedding_layer():
 def test_lm_head_logits_processor(dist_init, num_loras, device, vocab_size,
                                   stage) -> None:
 
-    torch.cuda.set_device(device)
+    if current_platform.is_cuda_alike():
+        torch.cuda.set_device(device)
+
     torch.set_default_device(device)
     max_loras = 8
     punica_wrapper = get_punica_wrapper(8192, 256, device)
@@ -582,7 +595,9 @@ def _pretest():
 def test_linear_replicated(dist_init, num_loras, device, stage,
                            bias_enabled) -> None:
 
-    torch.cuda.set_device(device)
+    if current_platform.is_cuda_alike():
+        torch.cuda.set_device(device)
+
     torch.set_default_device(device)
     punica_wrapper = get_punica_wrapper(8192, 256, device)
     assert check_punica_wrapper(punica_wrapper)
@@ -695,7 +710,9 @@ def create_random_linear_replicated_layer():
 def test_linear_parallel(dist_init, num_loras, orientation, fully_shard,
                          device, stage, bias_enabled) -> None:
 
-    torch.cuda.set_device(device)
+    if current_platform.is_cuda_alike():
+        torch.cuda.set_device(device)
+
     torch.set_default_device(device)
     punica_wrapper = get_punica_wrapper(8192, 256, device)
     assert check_punica_wrapper(punica_wrapper)
@@ -818,7 +835,9 @@ def create_random_linear_parallel_layer():
 def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard,
                                 device, stage, bias_enabled) -> None:
 
-    torch.cuda.set_device(device)
+    if current_platform.is_cuda_alike():
+        torch.cuda.set_device(device)
+
     torch.set_default_device(device)
     punica_wrapper = get_punica_wrapper(8192, 256, device)
     assert check_punica_wrapper(punica_wrapper)
@@ -971,6 +990,8 @@ class FakeConfig:
 @pytest.mark.parametrize("rotary_dim", [None, 32])
 @pytest.mark.parametrize("head_size", [32, 108])
 @pytest.mark.parametrize("seq_len", [11, 1024])
+@pytest.mark.skipif(not current_platform.is_cuda_alike(),
+                    reason="Only CUDA backends are supported")
 def test_rotary_embedding_long_context(dist_init, num_loras, device,
                                        scaling_factors, max_position,
                                        is_neox_style, rotary_dim, head_size,
diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py
index a099f36b0a465..ca523c66abe42 100644
--- a/tests/lora/test_lora_manager.py
+++ b/tests/lora/test_lora_manager.py
@@ -20,6 +20,7 @@
 from vllm.lora.worker_manager import (LRUCacheWorkerLoRAManager,
                                       WorkerLoRAManager)
 from vllm.model_executor.layers.linear import RowParallelLinear
+from vllm.platforms import current_platform
 
 EMBEDDING_MODULES = {
     "embed_tokens": "input_embeddings",
@@ -28,9 +29,9 @@
 
 EMBEDDING_PADDING_MODULES = ["lm_head"]
 
-CUDA_DEVICES = [
+DEVICES = ([
     f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
-]
+] if current_platform.is_cuda_alike() else ["cpu"])
 
 
 def test_peft_helper(sql_lora_files):
@@ -83,7 +84,7 @@ def test_peft_helper(sql_lora_files):
         PEFTHelper.from_dict(config)
 
 
-@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("device", DEVICES)
 def test_from_lora_tensors(sql_lora_files, device):
     tensors = load_file(
         os.path.join(sql_lora_files, "adapter_model.safetensors"))
@@ -171,7 +172,7 @@ def test_replace_submodules(dist_init, dummy_model):
     manager = LoRAModelManager(
         model, 1, 1, 1,
         LoRAConfig(max_lora_rank=8, max_cpu_loras=8, max_loras=8),
-        torch.device("cuda"))
+        torch.device(DEVICES[0]))
     model = manager.model
 
     assert isinstance(model.get_submodule("dense1"),
@@ -183,7 +184,7 @@ def test_replace_submodules(dist_init, dummy_model):
                       RowParallelLinearWithLoRA)
 
 
-@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("device", DEVICES)
 def test_lora_model_manager(dist_init, dummy_model, device):
     model = dummy_model
     model.supported_lora_modules = ["dense1", "dense2", "lm_head"]
@@ -244,7 +245,7 @@ def test_lora_model_manager(dist_init, dummy_model, device):
     assert manager.punica_wrapper.device == device
 
 
-@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("device", DEVICES)
 def test_lora_lru_cache_model_manager(dist_init, dummy_model, device):
     model = dummy_model
     model.supported_lora_modules = ["dense1", "dense2", "lm_head"]
@@ -336,7 +337,7 @@ def test_lora_lru_cache_model_manager(dist_init, dummy_model, device):
     assert manager.device == device
 
 
-@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("device", DEVICES)
 def test_lru_lora_model_manager(dist_init, dummy_model, device):
     # This tests just the LRU cache functionality, everything else is
     # tested in test_lora_model_manager
@@ -466,7 +467,7 @@ def test_lru_lora_model_manager(dist_init, dummy_model, device):
     assert manager.device == device
 
 
-@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("device", DEVICES)
 def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
                                           sql_lora_files, device):
     lora_config = LoRAConfig(max_lora_rank=8, max_cpu_loras=4, max_loras=4)
@@ -545,7 +546,7 @@ def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
             device)
 
 
-@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("device", DEVICES)
 def test_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
                                 sql_lora_files, device):
     # Should remove every LoRA not specified in the request.
@@ -621,7 +622,7 @@ def test_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
             device)
 
 
-@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("device", DEVICES)
 def test_packed_loras(dist_init, dummy_model_gate_up, device):
     model = dummy_model_gate_up
     model.supported_lora_modules = ["gate_up_proj"]
diff --git a/tests/lora/test_mixtral.py b/tests/lora/test_mixtral.py
index 797a495201d33..940a865228806 100644
--- a/tests/lora/test_mixtral.py
+++ b/tests/lora/test_mixtral.py
@@ -5,6 +5,7 @@
 
 import vllm
 from vllm.lora.request import LoRARequest
+from vllm.platforms import current_platform
 
 MODEL_PATH = "mistralai/Mixtral-8x7B-Instruct-v0.1"
 
@@ -31,7 +32,8 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int,
 @pytest.mark.parametrize("tp_size", [4])
 def test_mixtral_lora(mixtral_lora_files, tp_size):
     """Original test, the LoRA model has the common target modules, not all"""
-    if torch.cuda.device_count() < tp_size:
+    if torch.cuda.device_count(
+    ) < tp_size and tp_size > 1 and current_platform.is_cuda_alike():
         pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
 
     prompts = [
diff --git a/tests/lora/test_punica_sizes.py b/tests/lora/test_punica_ops_sizes.py
similarity index 77%
rename from tests/lora/test_punica_sizes.py
rename to tests/lora/test_punica_ops_sizes.py
index 0351fedd1cfa5..433ca7577d084 100644
--- a/tests/lora/test_punica_sizes.py
+++ b/tests/lora/test_punica_ops_sizes.py
@@ -9,17 +9,16 @@
 import pytest
 import torch
 
-from vllm.lora.ops.bgmv_expand import bgmv_expand
-from vllm.lora.ops.bgmv_expand_slice import bgmv_expand_slice
-from vllm.lora.ops.bgmv_shrink import bgmv_shrink
-from vllm.lora.ops.sgmv_expand import sgmv_expand
-from vllm.lora.ops.sgmv_shrink import sgmv_shrink
-from vllm.lora.ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT
+import vllm.lora.ops.triton_ops  # noqa: F401
+from vllm.lora.ops.torch_ops import (bgmv_expand, bgmv_expand_slice,
+                                     bgmv_shrink, sgmv_expand,
+                                     sgmv_expand_slice, sgmv_shrink)
+from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT
 from vllm.platforms import current_platform
 
 from .utils import (assert_close, generate_data,
                     generate_data_for_expand_nslices,
-                    generate_data_for_nslices, ref_torch_groupgemm)
+                    generate_data_for_nslices)
 
 HIDDEN_SIZES = [
     128,
@@ -113,7 +112,7 @@
 MAX_RANKS = [32]
 SCALES = [0.5]
 SEED = [0]
-CUDA_DEVICES = [f"cuda:{0}"]
+DEVICES = [f"cuda:{0}"]
 
 _dict_lock = Lock()
 
@@ -127,7 +126,7 @@
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("op_type", ["shrink", "expand"])
 @pytest.mark.parametrize("seed", SEED)
-@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("device", DEVICES)
 def test_punica_sgmv(
     batches: int,
     num_loras: int,
@@ -174,7 +173,7 @@ def test_punica_sgmv(
         # Preventing cache error pointer.
         with _dict_lock:
             _LORA_A_PTR_DICT.clear()
-            sgmv_shrink(
+            torch.ops.vllm.sgmv_shrink(
                 inputs_tensor,
                 lora_weights_lst,
                 our_out_tensor,
@@ -187,20 +186,23 @@ def test_punica_sgmv(
                 scaling,
             )
         for index in range(nslices):
-            ref_torch_groupgemm(
-                ref_out_tensor[index],
+            sgmv_shrink(
                 inputs_tensor,
                 lora_weights_lst[index],
-                lora_indices_tensor,
+                ref_out_tensor[index],
+                b_seq_start_loc,
                 seq_len_tensor,
+                lora_indices_tensor,
                 batches,
+                max_seq_length,
+                token_nums,
                 scaling,
-                op_type,
             )
+
     else:
         with _dict_lock:
             _LORA_B_PTR_DICT.clear()
-            sgmv_expand(
+            torch.ops.vllm.sgmv_expand(
                 inputs_tensor,
                 lora_weights_lst,
                 our_out_tensor,
@@ -213,21 +215,39 @@ def test_punica_sgmv(
                 offset_start=0,
                 add_inputs=True,
             )
-
-        slice_offset = 0
-        for index in range(nslices):
-            lora_weights = lora_weights_lst[index]
-            ref_torch_groupgemm(
-                ref_out_tensor[:, slice_offset:slice_offset + hidden_size],
-                inputs_tensor[index],
-                lora_weights,
-                lora_indices_tensor,
+        if nslices == 1:
+            # Verify the torch's sgmv_expand op
+            sgmv_expand(
+                inputs_tensor[0],
+                lora_weights_lst[0],
+                ref_out_tensor,
+                b_seq_start_loc,
                 seq_len_tensor,
+                lora_indices_tensor,
                 batches,
-                1.0,
-                op_type,
+                max_seq_length,
+                token_nums,
+                add_inputs=True,
             )
-            slice_offset += hidden_size
+        else:
+            slice_offset = 0
+            for index in range(nslices):
+                lora_weights = lora_weights_lst[index]
+                sgmv_expand_slice(
+                    inputs_tensor[index],
+                    lora_weights,
+                    ref_out_tensor,
+                    b_seq_start_loc,
+                    seq_len_tensor,
+                    lora_indices_tensor,
+                    batches,
+                    max_seq_length,
+                    token_nums,
+                    slice_offset,
+                    hidden_size,
+                    add_inputs=True,
+                )
+                slice_offset += hidden_size
 
     assert_close(our_out_tensor, ref_out_tensor)
 
@@ -240,7 +260,7 @@ def test_punica_sgmv(
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("op_type", ["shrink", "expand"])
 @pytest.mark.parametrize("seed", SEED)
-@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("device", DEVICES)
 def test_punica_bgmv(
     batches: int,
     num_loras: int,
@@ -276,31 +296,38 @@ def test_punica_bgmv(
         device,
     )
     if op_type == "shrink":
-        bgmv_shrink(
+        torch.ops.vllm.bgmv_shrink(
             inputs_tensor,
             lora_weights,
             our_out_tensor,
             indices,
             scaling,
         )
+
+        bgmv_shrink(
+            inputs_tensor,
+            lora_weights,
+            ref_out_tensor,
+            indices,
+            scaling,
+        )
+
     else:
-        bgmv_expand(
+        torch.ops.vllm.bgmv_expand(
             inputs_tensor,
             lora_weights,
             our_out_tensor,
             indices,
             add_inputs=True,
         )
-    ref_torch_groupgemm(
-        ref_out_tensor,
-        inputs_tensor,
-        lora_weights,
-        lora_indices_tensor,
-        seq_len_tensor,
-        batches,
-        scaling if op_type == "shrink" else 1.0,
-        op_type,
-    )
+        bgmv_expand(
+            inputs_tensor,
+            lora_weights,
+            ref_out_tensor,
+            indices,
+            add_inputs=True,
+        )
+
     if op_type == "shrink":
         ref_out_tensor = ref_out_tensor.to(torch.float32)
     assert_close(our_out_tensor, ref_out_tensor)
@@ -313,7 +340,7 @@ def test_punica_bgmv(
 @pytest.mark.parametrize("nslices", [2, 3])
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("seed", SEED)
-@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("device", DEVICES)
 def test_punica_bgmv_expand_nslices(
     batches: int,
     num_loras: int,
@@ -350,7 +377,7 @@ def test_punica_bgmv_expand_nslices(
     slice_offset = 0
     for index in range(nslices):
         lora_weights = lora_weights_lst[index]
-        bgmv_expand_slice(
+        torch.ops.vllm.bgmv_expand_slice(
             inputs_tensor,
             lora_weights,
             our_outputs,
@@ -359,15 +386,14 @@ def test_punica_bgmv_expand_nslices(
             slice_size=hidden_size,
             add_inputs=True,
         )
-        ref_torch_groupgemm(
-            ref_outputs[:, slice_offset:slice_offset + hidden_size],
+        bgmv_expand_slice(
             inputs_tensor,
             lora_weights,
-            lora_indices_tensor,
-            seq_len_tensor,
-            batches,
-            1.0,
-            op_type="expand",
+            ref_outputs,
+            indices,
+            slice_offset,
+            slice_size=hidden_size,
+            add_inputs=True,
         )
 
         slice_offset += hidden_size
diff --git a/tests/lora/test_punica_variation.py b/tests/lora/test_punica_ops_variation.py
similarity index 74%
rename from tests/lora/test_punica_variation.py
rename to tests/lora/test_punica_ops_variation.py
index 9ee10e7c23ee6..2bb84c1cf11e9 100644
--- a/tests/lora/test_punica_variation.py
+++ b/tests/lora/test_punica_ops_variation.py
@@ -9,19 +9,18 @@
 import torch
 
 # Enable custom op register
-import vllm.lora.ops.bgmv_expand
-import vllm.lora.ops.bgmv_expand_slice
-import vllm.lora.ops.bgmv_shrink
-import vllm.lora.ops.sgmv_expand
-import vllm.lora.ops.sgmv_shrink  # noqa: F401
-from vllm.lora.ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT
+import vllm.lora.ops.triton_ops  # noqa: F401
+from vllm.lora.ops.torch_ops import (bgmv_expand, bgmv_expand_slice,
+                                     bgmv_shrink, sgmv_expand,
+                                     sgmv_expand_slice, sgmv_shrink)
+from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT
 from vllm.platforms import current_platform
 
 from .utils import (assert_close, generate_data,
                     generate_data_for_expand_nslices,
-                    generate_data_for_nslices, ref_torch_groupgemm)
+                    generate_data_for_nslices)
 
-HIDDEN_SIZES = [4097]
+HIDDEN_SIZES = [2049]
 
 BATCHES = [1, 4, 16, 32]
 NUM_LORA = [1, 8, 32, 128]
@@ -29,15 +28,7 @@
 MAX_RANKS = [1, 4, 8, 16, 32, 64, 128, 256]
 SCALES = [0.5]
 SEED = [0]
-CUDA_DEVICES = [f"cuda:{0}"]
-
-# Unlike test_punica_sizes.py, we directly utilize custom op for
-# testing, which verifies the correct registration of these ops.
-bgmv_expand = torch.ops.vllm.bgmv_expand
-bgmv_expand_slice = torch.ops.vllm.bgmv_expand_slice
-bgmv_shrink = torch.ops.vllm.bgmv_shrink
-sgmv_expand = torch.ops.vllm.sgmv_expand
-sgmv_shrink = torch.ops.vllm.sgmv_shrink
+DEVICES = [f"cuda:{0}"]
 
 _dict_lock = Lock()
 
@@ -51,7 +42,7 @@
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("op_type", ["shrink", "expand"])
 @pytest.mark.parametrize("seed", SEED)
-@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("device", DEVICES)
 def test_punica_sgmv(
     batches: int,
     num_loras: int,
@@ -98,7 +89,7 @@ def test_punica_sgmv(
         # Preventing cache error pointer.
         with _dict_lock:
             _LORA_A_PTR_DICT.clear()
-            sgmv_shrink(
+            torch.ops.vllm.sgmv_shrink(
                 inputs_tensor,
                 lora_weights_lst,
                 our_out_tensor,
@@ -111,20 +102,23 @@ def test_punica_sgmv(
                 scaling,
             )
         for index in range(nslices):
-            ref_torch_groupgemm(
-                ref_out_tensor[index],
+            sgmv_shrink(
                 inputs_tensor,
                 lora_weights_lst[index],
-                lora_indices_tensor,
+                ref_out_tensor[index],
+                b_seq_start_loc,
                 seq_len_tensor,
+                lora_indices_tensor,
                 batches,
+                max_seq_length,
+                token_nums,
                 scaling,
-                op_type,
             )
+
     else:
         with _dict_lock:
             _LORA_B_PTR_DICT.clear()
-            sgmv_expand(
+            torch.ops.vllm.sgmv_expand(
                 inputs_tensor,
                 lora_weights_lst,
                 our_out_tensor,
@@ -137,21 +131,39 @@ def test_punica_sgmv(
                 offset_start=0,
                 add_inputs=True,
             )
-
         slice_offset = 0
-        for index in range(nslices):
-            lora_weights = lora_weights_lst[index]
-            ref_torch_groupgemm(
-                ref_out_tensor[:, slice_offset:slice_offset + hidden_size],
-                inputs_tensor[index],
-                lora_weights,
-                lora_indices_tensor,
+        if nslices == 1:
+            # Verify the torch's sgmv_expand op
+            sgmv_expand(
+                inputs_tensor[0],
+                lora_weights_lst[0],
+                ref_out_tensor,
+                b_seq_start_loc,
                 seq_len_tensor,
+                lora_indices_tensor,
                 batches,
-                1.0,
-                op_type,
+                max_seq_length,
+                token_nums,
+                add_inputs=True,
             )
-            slice_offset += hidden_size
+        else:
+            for index in range(nslices):
+                lora_weights = lora_weights_lst[index]
+                sgmv_expand_slice(
+                    inputs_tensor[index],
+                    lora_weights,
+                    ref_out_tensor,
+                    b_seq_start_loc,
+                    seq_len_tensor,
+                    lora_indices_tensor,
+                    batches,
+                    max_seq_length,
+                    token_nums,
+                    slice_offset,
+                    hidden_size,
+                    add_inputs=True,
+                )
+                slice_offset += hidden_size
 
     assert_close(our_out_tensor, ref_out_tensor)
 
@@ -164,7 +176,7 @@ def test_punica_sgmv(
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("op_type", ["shrink", "expand"])
 @pytest.mark.parametrize("seed", SEED)
-@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("device", DEVICES)
 def test_punica_bgmv(
     batches: int,
     num_loras: int,
@@ -176,7 +188,6 @@ def test_punica_bgmv(
     seed: int,
     device: str,
 ):
-
     torch.set_default_device(device)
     current_platform.seed_everything(seed)
 
@@ -201,32 +212,38 @@ def test_punica_bgmv(
         device,
     )
     if op_type == "shrink":
-        bgmv_shrink(
+        torch.ops.vllm.bgmv_shrink(
             inputs_tensor,
             lora_weights,
             our_out_tensor,
             indices,
             scaling,
         )
-    else:
 
-        bgmv_expand(
+        bgmv_shrink(
+            inputs_tensor,
+            lora_weights,
+            ref_out_tensor,
+            indices,
+            scaling,
+        )
+
+    else:
+        torch.ops.vllm.bgmv_expand(
             inputs_tensor,
             lora_weights,
             our_out_tensor,
             indices,
             add_inputs=True,
         )
-    ref_torch_groupgemm(
-        ref_out_tensor,
-        inputs_tensor,
-        lora_weights,
-        lora_indices_tensor,
-        seq_len_tensor,
-        batches,
-        scaling if op_type == "shrink" else 1.0,
-        op_type,
-    )
+        bgmv_expand(
+            inputs_tensor,
+            lora_weights,
+            ref_out_tensor,
+            indices,
+            add_inputs=True,
+        )
+
     if op_type == "shrink":
         ref_out_tensor = ref_out_tensor.to(torch.float32)
     assert_close(our_out_tensor, ref_out_tensor)
@@ -239,7 +256,7 @@ def test_punica_bgmv(
 @pytest.mark.parametrize("nslices", [2, 3])
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("seed", SEED)
-@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("device", DEVICES)
 def test_punica_bgmv_expand_nslices(
     batches: int,
     num_loras: int,
@@ -276,7 +293,7 @@ def test_punica_bgmv_expand_nslices(
     slice_offset = 0
     for index in range(nslices):
         lora_weights = lora_weights_lst[index]
-        bgmv_expand_slice(
+        torch.ops.vllm.bgmv_expand_slice(
             inputs_tensor,
             lora_weights,
             our_outputs,
@@ -285,15 +302,14 @@ def test_punica_bgmv_expand_nslices(
             slice_size=hidden_size,
             add_inputs=True,
         )
-        ref_torch_groupgemm(
-            ref_outputs[:, slice_offset:slice_offset + hidden_size],
+        bgmv_expand_slice(
             inputs_tensor,
             lora_weights,
-            lora_indices_tensor,
-            seq_len_tensor,
-            batches,
-            1.0,
-            op_type="expand",
+            ref_outputs,
+            indices,
+            slice_offset,
+            slice_size=hidden_size,
+            add_inputs=True,
         )
 
         slice_offset += hidden_size
diff --git a/tests/lora/test_quant_model.py b/tests/lora/test_quant_model.py
index 026269667b473..26bf770cc0d4a 100644
--- a/tests/lora/test_quant_model.py
+++ b/tests/lora/test_quant_model.py
@@ -72,7 +72,8 @@ def format_prompt_tuples(prompt):
 @pytest.mark.parametrize("tp_size", [1])
 def test_quant_model_lora(tinyllama_lora_files, num_gpus_available, model,
                           tp_size):
-    if num_gpus_available < tp_size:
+    if num_gpus_available < tp_size and \
+        tp_size > 1 and current_platform.is_cuda_alike():
         pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
 
     llm = vllm.LLM(
diff --git a/tests/lora/utils.py b/tests/lora/utils.py
index b66d18074a7bf..ce47546f2154b 100644
--- a/tests/lora/utils.py
+++ b/tests/lora/utils.py
@@ -104,33 +104,6 @@ def assert_close(a, b):
     torch.testing.assert_close(a, b, rtol=rtol, atol=atol)
 
 
-def ref_torch_groupgemm(
-    out_tensor,
-    inputs,
-    lora_weights,
-    lora_indices_tensor,
-    seq_len_tensor,
-    batches,
-    scaling,
-    op_type,
-) -> torch.Tensor:
-    out_list = []
-    current_offset = 0
-    for lora_index, b_length in zip(range(batches), seq_len_tensor):
-        input_weight = inputs[current_offset:b_length + current_offset, :]
-        current_offset += b_length
-        lora_weight = lora_weights[lora_indices_tensor[lora_index]]
-        result = torch.nn.functional.linear(input_weight, lora_weight)
-        result *= scaling
-        out_list.append(result)
-    cat_result = torch.cat(out_list, dim=0)
-    if op_type == "expand":
-        out_tensor += cat_result
-    else:
-        out_tensor.copy_(cat_result)
-    return
-
-
 def generate_data(
     batches,
     hidden_size,
diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py
index c7f018d9a203e..b9a6bee5720fd 100644
--- a/vllm/executor/cpu_executor.py
+++ b/vllm/executor/cpu_executor.py
@@ -22,9 +22,6 @@ class CPUExecutor(ExecutorBase):
 
     def _init_executor(self) -> None:
         assert self.device_config.device_type == "cpu"
-        # Reminder: Please update docs/source/features/compatibility_matrix.md
-        # If the feature combo become valid
-        assert self.lora_config is None, "cpu backend doesn't support LoRA"
 
         #
         # Environment variables for CPU executor
diff --git a/vllm/lora/ops/torch_ops/__init__.py b/vllm/lora/ops/torch_ops/__init__.py
new file mode 100644
index 0000000000000..9c9159b95f308
--- /dev/null
+++ b/vllm/lora/ops/torch_ops/__init__.py
@@ -0,0 +1,13 @@
+from vllm.lora.ops.torch_ops.lora_ops import bgmv_expand  # noqa: F401
+from vllm.lora.ops.torch_ops.lora_ops import (bgmv_expand_slice, bgmv_shrink,
+                                              sgmv_expand, sgmv_expand_slice,
+                                              sgmv_shrink)
+
+__all__ = [
+    "bgmv_expand",
+    "bgmv_expand_slice",
+    "bgmv_shrink",
+    "sgmv_expand",
+    "sgmv_expand_slice",
+    "sgmv_shrink",
+]
diff --git a/vllm/lora/ops/torch_ops/lora_ops.py b/vllm/lora/ops/torch_ops/lora_ops.py
new file mode 100644
index 0000000000000..5f5aafd516159
--- /dev/null
+++ b/vllm/lora/ops/torch_ops/lora_ops.py
@@ -0,0 +1,113 @@
+import torch
+
+
+def sgmv_expand(inputs: torch.Tensor,
+                lora_b_weights: torch.Tensor,
+                output_tensor: torch.Tensor,
+                b_seq_start_loc: torch.Tensor,
+                seq_len_tensor: torch.Tensor,
+                lora_indices_tensor: torch.Tensor,
+                batches: int,
+                max_seq_length: int,
+                token_nums: int,
+                add_inputs: bool = False):
+    exploded_indices = torch.repeat_interleave(lora_indices_tensor,
+                                               seq_len_tensor)
+
+    bgmv_expand(inputs, lora_b_weights, output_tensor, exploded_indices,
+                add_inputs)
+
+
+def bgmv_expand(inputs: torch.Tensor,
+                lora_b_weights: torch.Tensor,
+                output_tensor: torch.Tensor,
+                lora_indices_tensor: torch.Tensor,
+                add_inputs: bool = True):
+    selected_loras = lora_b_weights[lora_indices_tensor].to(
+        dtype=output_tensor.dtype)
+    if len(selected_loras.shape) == 4:
+        selected_loras = selected_loras.squeeze(dim=1)
+    inputs = inputs.to(dtype=output_tensor.dtype)
+    outputs = torch.einsum("bi, boi -> bo", inputs, selected_loras)
+
+    limit = output_tensor.shape[0]
+    if outputs.shape[0] == 1 and output_tensor.shape[0] != 1:
+        limit = 1
+
+    if add_inputs:
+        output_tensor[:, :outputs.shape[1]] += outputs[:limit, :]
+    else:
+        output_tensor[:, :outputs.shape[1]] = outputs[:limit, :]
+
+
+def sgmv_shrink(
+    inputs: torch.Tensor,
+    lora_a_weights: torch.Tensor,
+    output_tensor: torch.Tensor,
+    b_seq_start_loc: torch.Tensor,
+    seq_len_tensor: torch.Tensor,
+    lora_indices_tensor: torch.Tensor,
+    batches: int,
+    max_seq_length: int,
+    token_nums: int,
+    scaling: float,
+):
+    exploded_indices = torch.repeat_interleave(lora_indices_tensor,
+                                               seq_len_tensor)
+
+    bgmv_shrink(inputs, lora_a_weights, output_tensor, exploded_indices,
+                scaling)
+
+
+def bgmv_shrink(inputs: torch.Tensor,
+                lora_b_weights: torch.Tensor,
+                output_tensor: torch.Tensor,
+                lora_indices_tensor: torch.Tensor,
+                scaling: float = 1.0):
+    selected_loras = lora_b_weights[lora_indices_tensor].to(
+        dtype=output_tensor.dtype)
+    if len(selected_loras.shape) == 4:
+        selected_loras = selected_loras.squeeze(dim=1)
+    inputs = inputs.to(dtype=output_tensor.dtype)
+    outputs = torch.einsum("bi, boi -> bo", inputs, selected_loras)
+
+    output_tensor[:, :outputs.shape[1]] = scaling * outputs[:]
+
+
+def sgmv_expand_slice(inputs: torch.Tensor,
+                      lora_b_weights: torch.Tensor,
+                      output_tensor: torch.Tensor,
+                      b_seq_start_loc: torch.Tensor,
+                      seq_len_tensor: torch.Tensor,
+                      lora_indices_tensor: torch.Tensor,
+                      batches: int,
+                      max_seq_length: int,
+                      token_nums: int,
+                      slice_offset: int,
+                      slice_size: int,
+                      add_inputs: bool = False):
+    exploded_indices = torch.repeat_interleave(lora_indices_tensor,
+                                               seq_len_tensor)
+
+    bgmv_expand_slice(inputs, lora_b_weights, output_tensor, exploded_indices,
+                      slice_offset, slice_size, add_inputs)
+
+
+def bgmv_expand_slice(inputs: torch.Tensor,
+                      lora_b_weights: torch.Tensor,
+                      output_tensor: torch.Tensor,
+                      lora_indices_tensor: torch.Tensor,
+                      slice_offset: int,
+                      slice_size: int,
+                      add_inputs: bool = True):
+    selected_loras = lora_b_weights[lora_indices_tensor].to(
+        dtype=output_tensor.dtype)
+    inputs = inputs.to(dtype=output_tensor.dtype)
+    if len(selected_loras.shape) == 4:
+        selected_loras = selected_loras.squeeze(dim=1)
+    outputs = torch.einsum("bi, boi -> bo", inputs, selected_loras)
+
+    if add_inputs:
+        output_tensor[:, slice_offset:slice_offset + slice_size] += outputs[:]
+    else:
+        output_tensor[:, slice_offset:slice_offset + slice_size] = outputs[:]
diff --git a/vllm/lora/ops/triton_ops/__init__.py b/vllm/lora/ops/triton_ops/__init__.py
new file mode 100644
index 0000000000000..9805b6dd5038e
--- /dev/null
+++ b/vllm/lora/ops/triton_ops/__init__.py
@@ -0,0 +1,13 @@
+from vllm.lora.ops.triton_ops.bgmv_expand import bgmv_expand
+from vllm.lora.ops.triton_ops.bgmv_expand_slice import bgmv_expand_slice
+from vllm.lora.ops.triton_ops.bgmv_shrink import bgmv_shrink
+from vllm.lora.ops.triton_ops.sgmv_expand import sgmv_expand
+from vllm.lora.ops.triton_ops.sgmv_shrink import sgmv_shrink  # noqa: F401
+
+__all__ = [
+    "bgmv_expand",
+    "bgmv_expand_slice",
+    "bgmv_shrink",
+    "sgmv_expand",
+    "sgmv_shrink",
+]
diff --git a/vllm/lora/ops/bgmv_expand.py b/vllm/lora/ops/triton_ops/bgmv_expand.py
similarity index 100%
rename from vllm/lora/ops/bgmv_expand.py
rename to vllm/lora/ops/triton_ops/bgmv_expand.py
diff --git a/vllm/lora/ops/bgmv_expand_slice.py b/vllm/lora/ops/triton_ops/bgmv_expand_slice.py
similarity index 100%
rename from vllm/lora/ops/bgmv_expand_slice.py
rename to vllm/lora/ops/triton_ops/bgmv_expand_slice.py
diff --git a/vllm/lora/ops/bgmv_shrink.py b/vllm/lora/ops/triton_ops/bgmv_shrink.py
similarity index 100%
rename from vllm/lora/ops/bgmv_shrink.py
rename to vllm/lora/ops/triton_ops/bgmv_shrink.py
diff --git a/vllm/lora/ops/sgmv_expand.py b/vllm/lora/ops/triton_ops/sgmv_expand.py
similarity index 100%
rename from vllm/lora/ops/sgmv_expand.py
rename to vllm/lora/ops/triton_ops/sgmv_expand.py
diff --git a/vllm/lora/ops/sgmv_shrink.py b/vllm/lora/ops/triton_ops/sgmv_shrink.py
similarity index 100%
rename from vllm/lora/ops/sgmv_shrink.py
rename to vllm/lora/ops/triton_ops/sgmv_shrink.py
diff --git a/vllm/lora/ops/utils.py b/vllm/lora/ops/triton_ops/utils.py
similarity index 100%
rename from vllm/lora/ops/utils.py
rename to vllm/lora/ops/triton_ops/utils.py
diff --git a/vllm/lora/punica_wrapper/punica_cpu.py b/vllm/lora/punica_wrapper/punica_cpu.py
new file mode 100644
index 0000000000000..b9ae3e07492c0
--- /dev/null
+++ b/vllm/lora/punica_wrapper/punica_cpu.py
@@ -0,0 +1,346 @@
+from typing import Callable, Optional, Tuple, Union
+
+import torch
+
+from vllm.lora.ops.torch_ops import (bgmv_expand, bgmv_expand_slice,
+                                     bgmv_shrink, sgmv_expand,
+                                     sgmv_expand_slice, sgmv_shrink)
+
+from .punica_base import PunicaWrapperBase
+
+
+# The platforms that are compatible with the PyTorch-native implementation can
+# inherit this class
+class PunicaWrapperCPU(PunicaWrapperBase):
+    """
+    PunicaWrapperCPU is designed to manage and provide metadata for the punica 
+    kernel. The main function is to maintain the state information for 
+    Multi-LoRA, and to provide the interface for the pytorch punica ops.
+    """
+
+    def __init__(self, max_num_batched_tokens: int, max_batches: int,
+                 device: Union[torch.device, str], **kwargs):
+        PunicaWrapperBase.__init__(self, max_num_batched_tokens, max_batches,
+                                   device)
+
+    def _shrink_prefill(
+        self,
+        y: torch.Tensor,
+        x: torch.Tensor,
+        w_t_all: torch.Tensor,
+        scale: float,
+    ):
+        #No LoRA request, so return directly
+        if self.no_lora:
+            return
+        sgmv_shrink(
+            x,
+            w_t_all,
+            y,
+            *self.prefill_metadata,
+            scale,
+        )
+
+    def _shrink_decode(
+        self,
+        y: torch.Tensor,
+        x: torch.Tensor,
+        w_t_all: torch.Tensor,
+        scale: float,
+    ):
+        bgmv_shrink(x, w_t_all, y, self.token_lora_indices, scale)
+
+    def _expand_prefill(
+        self,
+        y: torch.Tensor,
+        x: torch.Tensor,
+        w_t_all: torch.Tensor,
+        add_inputs: bool,
+    ):
+        #No LoRA request, so return directly
+        if self.no_lora:
+            return
+        sgmv_expand(
+            x,
+            w_t_all,
+            y,
+            *self.prefill_metadata,
+            add_inputs,
+        )
+
+    def _expand_decode(
+        self,
+        y: torch.Tensor,
+        x: torch.Tensor,
+        w_t_all: torch.Tensor,
+        add_inputs: bool,
+    ):
+        bgmv_expand(x, w_t_all, y, self.token_lora_indices, add_inputs)
+
+    def _expand_slice_prefill(
+        self,
+        y: torch.Tensor,
+        x: torch.Tensor,
+        w_t_all: torch.Tensor,
+        y_offset: int,
+        y_slice_size: int,
+        add_inputs: bool,
+    ):
+        #No LoRA request, so return directly
+        if self.no_lora:
+            return
+        sgmv_expand_slice(
+            x,
+            w_t_all,
+            y,
+            *self.prefill_metadata,
+            y_offset,
+            y_slice_size,
+            add_inputs,
+        )
+
+    def _expand_slice_decode(
+        self,
+        y: torch.Tensor,
+        x: torch.Tensor,
+        w_t_all: torch.Tensor,
+        y_offset: int,
+        y_slice_size: int,
+        add_inputs: bool,
+    ):
+        bgmv_expand_slice(x, w_t_all, y, self.token_lora_indices, y_offset,
+                          y_slice_size, add_inputs)
+
+    def _apply_expand(
+        self,
+        y: torch.Tensor,
+        x: torch.Tensor,
+        w_t_all: torch.Tensor,
+        y_offset: int,
+        y_slice_size: int,
+        add_inputs: bool = True,
+    ):
+        """
+        Perform the ` y[:,y_offset:y_offset+y_slice_size]+=x@w_t_all` 
+        computation, which is suitable for the
+        GEMM of lora'b.
+        """
+
+        expand_slice_fun: Callable = (self._expand_slice_prefill
+                                      if self.is_prefill else
+                                      self._expand_slice_decode)
+        expand_slice_fun(y, x, w_t_all, y_offset, y_slice_size, add_inputs)
+
+    def _apply_shrink(self, y: torch.Tensor, x: torch.Tensor,
+                      w_t_all: torch.Tensor, scale: float):
+        """
+        Perform the ` y+=x@w_t_all` computation, which is suitable for the
+        GEMM of lora'a.
+        When `is_prefill is` true, it indicates that it is currently the
+        prefill stage, and the `_shrink_prefill` function should be called.
+        Otherwise, it is the decode stage, and the _shrink_decode function
+        should be called.
+        """
+        y_org = y
+        y = y.view(-1, y.shape[-1])
+        shrink_fun: Callable = (self._shrink_prefill
+                                if self.is_prefill else self._shrink_decode)
+        shrink_fun(y, x, w_t_all, scale)
+        y = y.view_as(y_org)
+
+    def add_shrink(self, y: Union[Tuple[torch.Tensor, ...], torch.Tensor],
+                   x: torch.Tensor, lora_a_stacked: Tuple[torch.Tensor, ...],
+                   scale: float, **kwargs):
+        """
+        Performs GEMM  for multiple slices of lora_a.
+        When `is_prefill is` true, it indicates that it is currently the
+        prefill stage, and the `_shrink_prefill` function should be called.
+        Otherwise, it is the decode stage, and the _shrink_decode function
+        should be called.
+            
+        Semantics:
+        for i in range(len(lora_a_stacked)):
+            y[i] += (x @ lora_a_stacked[i]) * scale
+        
+        Args:
+            y (Union[Tuple[torch.Tensor, ...], torch.Tensor]): Output tensors
+            x (torch.Tensor): Input tensor
+            lora_a_stacked (Tuple[torch.Tensor, ...]): lora_a's weights
+            scale (float): Scaling factor for the operation
+        """
+
+        x = x.view(-1, x.shape[-1])
+        # TODO fuse these kernels
+        for slice_idx in range(len(lora_a_stacked)):
+            self._apply_shrink(y[slice_idx], x, lora_a_stacked[slice_idx],
+                               scale)
+
+    def add_expand(self,
+                   y: torch.Tensor,
+                   x: Union[Tuple[torch.Tensor, ...], torch.Tensor],
+                   lora_b_stacked: Tuple[torch.Tensor, ...],
+                   lora_bias_stacked: Optional[Tuple[torch.Tensor, ...]],
+                   output_slices: Tuple[int, ...],
+                   offset_start: int = 0,
+                   add_inputs=True,
+                   **kwargs) -> None:
+        """
+        Performs GEMM and bias addition for multiple slices of lora_b.
+      
+        Semantics:
+            for i in range(len(lora_b_stacked)):
+                slice = output_slices[i]
+                y[:, offset:offset+slice] += x[i] @ lora_b_stacked[i] + 
+                    lora_bias_stacked[i] 
+                offset += slice
+            
+        Args:
+            y (torch.Tensor): Output tensor.
+            x (Union[Tuple[torch.Tensor, ...], torch.Tensor]): Input tensors
+            lora_b_stacked (Tuple[torch.Tensor, ...]): lora_b's weight
+            lora_bias_stacked (Optional[Tuple[torch.Tensor, ...]]): 
+                bias's weight
+            output_slices (Tuple[int, ...]): Every slice's size
+            add_inputs (bool):  Defaults to True.
+        """
+        y_org = y
+        y = y.view(-1, y.shape[-1])
+        offset_left = offset_start
+        if lora_bias_stacked is not None:
+            self._apply_bias(self.token_lora_indices, y, output_slices,
+                             lora_bias_stacked)
+        for slice_idx in range(len(lora_b_stacked)):
+            self._apply_expand(
+                y,
+                x[slice_idx],
+                lora_b_stacked[slice_idx],
+                offset_left,
+                output_slices[slice_idx],
+                add_inputs=add_inputs,
+            )
+            offset_left += output_slices[slice_idx]
+        y = y.view_as(y_org)
+
+    def add_lora_embedding(self,
+                           y: torch.Tensor,
+                           x: torch.Tensor,
+                           lora_b_stacked: torch.Tensor,
+                           add_inputs: bool = True,
+                           **kwargs) -> None:
+        """
+        Applies lora  specifically for VocabParallelEmbeddingWithLoRA.
+
+        Semantics:
+            y += x @ lora_b_stacked
+
+        Args:
+            y (torch.Tensor): Output tensor.
+            x (torch.Tensor): Input tensor.
+            lora_b_stacked (torch.Tensor): lora_b's weights.
+            add_inputs (bool): Default to True.
+        """
+
+        # Embedding layer only need expand op
+        expand_fun: Callable = (self._expand_prefill
+                                if self.is_prefill else self._expand_decode)
+        expand_fun(y, x, lora_b_stacked, add_inputs)
+
+    def add_lora_linear(self,
+                        y: torch.Tensor,
+                        x: torch.Tensor,
+                        lora_a_stacked: Tuple[torch.Tensor, ...],
+                        lora_b_stacked: Tuple[torch.Tensor, ...],
+                        lora_bias_stacked: Optional[Tuple[torch.Tensor, ...]],
+                        scale: float,
+                        output_slices: Tuple[int, ...],
+                        *,
+                        buffer: Optional[Tuple[torch.Tensor, ...]] = None,
+                        **kwargs) -> None:
+        """
+        Applicable to linear-related lora. 
+
+        Semantics:
+            for i in range(len(lora_a_stacked)):
+                y[i] += (
+                    x[i].unsqueeze(0)
+                    @ lora_a_stacked[indices[i], layer_idx, :, :]
+                    @ lora_b_stacked[indices[i], layer_idx, :, :]
+                    * scale
+                    ).squeeze(0)+lora_bias_stacked[i]
+
+        Args:
+            y (torch.Tensor): Output tensor. Will be changed in-place.
+            x (torch.Tensor): Input tensor
+            lora_a_stacked (Tuple[torch.Tensor, ...]): lora_a's weight.
+            lora_b_stacked (Tuple[torch.Tensor, ...]): lora_b's weight.
+            lora_bias_stacked (Optional[Tuple[torch.Tensor, ...]]): lora's bias.
+            scale (float): Scaling factor.
+            output_slices (Tuple[int, ...]): Every slice's size.
+            buffer (Optional[Tuple[torch.Tensor, ...]]): Defaults to None.
+        """
+
+        assert len(lora_a_stacked) == len(lora_b_stacked) == len(output_slices)
+        if lora_bias_stacked is not None:
+            assert len(lora_bias_stacked) == len(output_slices)
+            y = self._apply_bias(self.token_lora_indices, y, output_slices,
+                                 lora_bias_stacked)
+
+        if buffer is None:
+            r = lora_b_stacked[0].size(-1)
+            # We set the buffer to be float32 by default, consistent with the
+            # triton op
+            buffer = tuple(
+                torch.zeros(
+                    (x.size(0), r), dtype=torch.float32, device=x.device)
+                for _ in range(len(output_slices)))
+        self.add_shrink(buffer, x, lora_a_stacked, scale, **kwargs)
+        self.add_expand(y,
+                        buffer,
+                        lora_b_stacked,
+                        None,
+                        output_slices,
+                        add_inputs=True,
+                        **kwargs)
+
+    def add_lora_logits(self,
+                        y: torch.Tensor,
+                        x: torch.Tensor,
+                        lora_a_stacked: torch.Tensor,
+                        lora_b_stacked: torch.Tensor,
+                        scale,
+                        *,
+                        buffer: Optional[torch.Tensor] = None,
+                        **kwargs) -> None:
+        """
+        Applies lora  specifically for LogitsProcessorWithLoRA.
+        
+        Semantics:
+            buffer = (x @ lora_a_stacked) * scale
+            y += buffer @ lora_b_stacked
+
+        Args:
+            y (torch.Tensor): Output tensor.
+            x (torch.Tensor): Input tensor.
+            lora_a_stacked (torch.Tensor): lora_a's weights.
+            lora_b_stacked (torch.Tensor):lora_b's weights.
+            scale (float): Scaling factor.
+            buffer (Optional[torch.Tensor]):Default to None.
+        """
+        y_org = y
+        y = y.view(-1, y.shape[-1])
+        x = x.view(-1, x.shape[-1])
+        r = lora_b_stacked.size(-1)
+        if buffer is None:
+            # We set the buffer to be float32 by default, consistent with the
+            # triton op
+            buffer = torch.zeros((x.size(0), r),
+                                 dtype=torch.float32,
+                                 device=x.device)
+        # LogitsProcessorWithLoRA always using bgmv.
+        bgmv_shrink(x, lora_a_stacked, buffer, self.sampler_indices, scale)
+        bgmv_expand(buffer,
+                    lora_b_stacked,
+                    y,
+                    self.sampler_indices,
+                    add_inputs=True)
+        y = y.view_as(y_org)
diff --git a/vllm/lora/punica_wrapper/punica_gpu.py b/vllm/lora/punica_wrapper/punica_gpu.py
index 278f7b5a8e9f4..451f23e49f27c 100644
--- a/vllm/lora/punica_wrapper/punica_gpu.py
+++ b/vllm/lora/punica_wrapper/punica_gpu.py
@@ -12,11 +12,11 @@
 from vllm.triton_utils import HAS_TRITON
 
 if HAS_TRITON:
-    from vllm.lora.ops.bgmv_expand import bgmv_expand
-    from vllm.lora.ops.bgmv_expand_slice import bgmv_expand_slice
-    from vllm.lora.ops.bgmv_shrink import bgmv_shrink
-    from vllm.lora.ops.sgmv_expand import sgmv_expand
-    from vllm.lora.ops.sgmv_shrink import sgmv_shrink
+    from vllm.lora.ops.triton_ops import bgmv_expand
+    from vllm.lora.ops.triton_ops import bgmv_expand_slice
+    from vllm.lora.ops.triton_ops import bgmv_shrink
+    from vllm.lora.ops.triton_ops import sgmv_expand
+    from vllm.lora.ops.triton_ops import sgmv_shrink
 
 from .punica_base import PunicaWrapperBase
 
diff --git a/vllm/lora/punica_wrapper/punica_selector.py b/vllm/lora/punica_wrapper/punica_selector.py
index 9791d492d8e48..9f1606e672dea 100644
--- a/vllm/lora/punica_wrapper/punica_selector.py
+++ b/vllm/lora/punica_wrapper/punica_selector.py
@@ -12,6 +12,11 @@ def get_punica_wrapper(*args, **kwargs) -> PunicaWrapperBase:
         from vllm.lora.punica_wrapper.punica_gpu import PunicaWrapperGPU
         logger.info_once("Using PunicaWrapperGPU.")
         return PunicaWrapperGPU(*args, **kwargs)
+    elif current_platform.is_cpu():
+        # Lazy import to avoid ImportError
+        from vllm.lora.punica_wrapper.punica_cpu import PunicaWrapperCPU
+        logger.info_once("Using PunicaWrapperCPU.")
+        return PunicaWrapperCPU(*args, **kwargs)
     elif current_platform.is_hpu():
         # Lazy import to avoid ImportError
         from vllm.lora.punica_wrapper.punica_hpu import PunicaWrapperHPU
diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py
index d99db4e0c6c40..303d9a15e9c3c 100644
--- a/vllm/worker/cpu_model_runner.py
+++ b/vllm/worker/cpu_model_runner.py
@@ -2,8 +2,8 @@
 import weakref
 from collections import defaultdict
 from dataclasses import dataclass
-from typing import (TYPE_CHECKING, Any, Dict, List, Optional, Type, TypeVar,
-                    Union)
+from typing import (TYPE_CHECKING, Any, Dict, List, Optional, Set, Type,
+                    TypeVar, Union)
 
 import torch
 from torch import nn
@@ -12,10 +12,14 @@
 from vllm.config import VllmConfig
 from vllm.forward_context import set_forward_context
 from vllm.logger import init_logger
+from vllm.lora.layers import LoRAMapping
+from vllm.lora.request import LoRARequest
+from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager
 from vllm.model_executor import SamplingMetadata
 from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding
 from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.model_executor.model_loader import get_model
+from vllm.model_executor.models import supports_lora, supports_multimodal
 from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs,
                              MultiModalKwargs, MultiModalPlaceholderMap)
 from vllm.sequence import (IntermediateTensors, SequenceData,
@@ -49,6 +53,8 @@ class ModelInputForCPU(ModelRunnerInputBase):
     virtual_engine: Optional[int] = None
     seq_lens: Optional[List[int]] = None
     query_lens: Optional[List[int]] = None
+    lora_mapping: Optional["LoRAMapping"] = None
+    lora_requests: Optional[Set[LoRARequest]] = None
 
     def as_broadcastable_tensor_dict(
             self) -> Dict[str, Union[int, torch.Tensor]]:
@@ -57,6 +63,8 @@ def as_broadcastable_tensor_dict(
             "input_positions": self.input_positions,
             "token_type_ids": self.token_type_ids,
             "multi_modal_kwargs": self.multi_modal_kwargs,
+            "lora_requests": self.lora_requests,
+            "lora_mapping": self.lora_mapping,
         }
         _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata)
 
@@ -143,7 +151,11 @@ def __init__(self,
                                 or runner.cache_config.enable_prefix_caching)
         self.model_input_cls = self.runner._model_input_cls
         self.attn_backend = self.runner.attn_backend
+        self.sliding_window = self.runner.sliding_window
+        self.block_size = self.runner.block_size
+        self.device = self.runner.device
         self.multi_modal_input_mapper = self.runner.multi_modal_input_mapper
+        self.enable_lora = self.runner.lora_config is not None
         self.input_data = ModelInputForCPUBuilder.ModelInputData(
             self.runner.model_config.uses_mrope)
         self.att_metadata_builder = self.runner.attn_backend.get_builder_cls()(
@@ -183,15 +195,28 @@ def build(self) -> ModelInputForCPU:
         attn_metadata = self.att_metadata_builder.build(
             input_data.seq_lens, input_data.query_lens, -1, -1)
 
-        return self.model_input_cls(
-            input_tokens=input_tokens,
-            input_positions=input_positions,
-            token_type_ids=token_type_ids,
-            seq_lens=input_data.seq_lens,
-            query_lens=input_data.query_lens,
-            attn_metadata=attn_metadata,
-            multi_modal_kwargs=multi_modal_kwargs,
-        )
+        is_prompt = (self.seq_group_metadata_list[0].is_prompt
+                     if self.seq_group_metadata_list else None)
+        # LoRA data.
+        lora_requests = set()
+        lora_mapping = None
+        if self.enable_lora:
+            lora_requests = set(seq.lora_request
+                                for seq in self.seq_group_metadata_list
+                                if seq.lora_request is not None)
+
+            lora_mapping = self._prepare_lora_input(
+                self.seq_group_metadata_list, is_prompt)
+
+        return self.model_input_cls(input_tokens=input_tokens,
+                                    input_positions=input_positions,
+                                    token_type_ids=token_type_ids,
+                                    seq_lens=input_data.seq_lens,
+                                    query_lens=input_data.query_lens,
+                                    attn_metadata=attn_metadata,
+                                    multi_modal_kwargs=multi_modal_kwargs,
+                                    lora_mapping=lora_mapping,
+                                    lora_requests=lora_requests)
 
     def _build_input_data(self):
         for seq_group_metadata in self.seq_group_metadata_list:
@@ -381,6 +406,24 @@ def _compute_multi_modal_input(self,
             self.input_data.multi_modal_placeholder_maps[modality].extend(
                 placeholder_map)
 
+    def _prepare_lora_input(
+            self, seq_group_metadata_list: List[SequenceGroupMetadata],
+            is_prefill: bool) -> LoRAMapping:
+        index_mapping = []
+        prompt_mapping = []
+        for seq in seq_group_metadata_list:
+            lora_id = seq.lora_int_id
+            query_len = seq.token_chunk_size
+
+            index_mapping += [lora_id] * query_len
+            prompt_mapping += [lora_id] * (
+                query_len if seq.sampling_params
+                and seq.sampling_params.prompt_logprobs is not None else 1)
+
+        return LoRAMapping(index_mapping=tuple(index_mapping),
+                           prompt_mapping=tuple(prompt_mapping),
+                           is_prefill=is_prefill)
+
 
 class CPUModelRunnerBase(ModelRunnerBase[TModelInputForCPU]):
     """
@@ -431,10 +474,41 @@ def __init__(
 
         # Lazy initialization.
         self.model: nn.Module  # Set after init_Model
+        # Set after load_model.
+        self.lora_manager: Optional[LRUCacheWorkerLoRAManager] = None
 
     def load_model(self) -> None:
         self.model = get_model(vllm_config=self.vllm_config)
 
+        if self.lora_config:
+            assert supports_lora(
+                self.model
+            ), f"{self.model.__class__.__name__} does not support LoRA yet."
+
+            if supports_multimodal(self.model):
+                logger.warning("Regarding multimodal models, vLLM currently "
+                               "only supports adding LoRA to language model.")
+
+            # It's necessary to distinguish between the max_position_embeddings
+            # of VLMs and LLMs.
+            if hasattr(self.model.config, "max_position_embeddings"):
+                max_pos_embeddings = self.model.config.max_position_embeddings
+            else:
+                max_pos_embeddings = (
+                    self.model.config.text_config.max_position_embeddings)
+
+            self.lora_manager = LRUCacheWorkerLoRAManager(
+                self.scheduler_config.max_num_seqs,
+                self.scheduler_config.max_num_batched_tokens,
+                self.vocab_size,
+                self.lora_config,
+                self.device,
+                self.model.embedding_modules,
+                self.model.embedding_padding_modules,
+                max_position_embeddings=max_pos_embeddings,
+            )
+            self.model = self.lora_manager.create_lora_manager(self.model)
+
     def _prepare_model_input_tensors(
         self,
         seq_group_metadata_list: List[SequenceGroupMetadata],
@@ -459,6 +533,37 @@ def sampler(self):
     def vocab_size(self) -> int:
         return self.model_config.get_vocab_size()
 
+    def remove_all_loras(self):
+        if not self.lora_manager:
+            raise RuntimeError("LoRA is not enabled.")
+        self.lora_manager.remove_all_adapters()
+
+    def set_active_loras(self, lora_requests: Set[LoRARequest],
+                         lora_mapping: LoRAMapping) -> None:
+        if not self.lora_manager:
+            raise RuntimeError("LoRA is not enabled.")
+        self.lora_manager.set_active_adapters(lora_requests, lora_mapping)
+
+    def add_lora(self, lora_request: LoRARequest) -> bool:
+        if not self.lora_manager:
+            raise RuntimeError("LoRA is not enabled.")
+        return self.lora_manager.add_adapter(lora_request)
+
+    def remove_lora(self, lora_id: int) -> bool:
+        if not self.lora_manager:
+            raise RuntimeError("LoRA is not enabled.")
+        return self.lora_manager.remove_adapter(lora_id)
+
+    def pin_lora(self, lora_id: int) -> bool:
+        if not self.lora_manager:
+            raise RuntimeError("LoRA is not enabled.")
+        return self.lora_manager.pin_adapter(lora_id)
+
+    def list_loras(self) -> Set[int]:
+        if not self.lora_manager:
+            raise RuntimeError("LoRA is not enabled.")
+        return self.lora_manager.list_adapters()
+
 
 class CPUModelRunner(CPUModelRunnerBase[ModelInputForCPUWithSamplingMetadata]):
     _model_input_cls: Type[ModelInputForCPUWithSamplingMetadata] = (
@@ -515,6 +620,12 @@ def execute_model(
             raise ValueError(
                 "CPU worker does not support multi-step execution.")
 
+        if self.lora_config:
+            assert model_input.lora_requests is not None
+            assert model_input.lora_mapping is not None
+            self.set_active_loras(model_input.lora_requests,
+                                  model_input.lora_mapping)
+
         model_executable = self.model
 
         multimodal_kwargs = {}
diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py
index 494c6506f3c0f..3e5fcf11b9e16 100644
--- a/vllm/worker/cpu_worker.py
+++ b/vllm/worker/cpu_worker.py
@@ -1,5 +1,5 @@
 """A CPU worker class."""
-from typing import Dict, List, Optional, Tuple, Type
+from typing import Dict, List, Optional, Set, Tuple, Type
 
 import torch
 import torch.distributed
@@ -11,14 +11,14 @@
 from vllm.distributed import (ensure_model_parallel_initialized,
                               init_distributed_environment)
 from vllm.logger import init_logger
+from vllm.lora.request import LoRARequest
 from vllm.model_executor import set_random_seed
 from vllm.sequence import ExecuteModelRequest
 from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, bind_kv_cache
 from vllm.worker.cpu_enc_dec_model_runner import CPUEncoderDecoderModelRunner
 from vllm.worker.cpu_model_runner import CPUModelRunner, CPUModelRunnerBase
 from vllm.worker.cpu_pooling_model_runner import CPUPoolingModelRunner
-from vllm.worker.worker_base import (LocalOrDistributedWorkerBase,
-                                     LoraNotSupportedWorkerBase, WorkerBase,
+from vllm.worker.worker_base import (LocalOrDistributedWorkerBase, WorkerBase,
                                      WorkerInput)
 
 logger = init_logger(__name__)
@@ -111,7 +111,7 @@ def get_cache_block_size(
         return dtype_size * total
 
 
-class CPUWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
+class CPUWorker(LocalOrDistributedWorkerBase):
     """A worker class that executes (a partition of) the model on a CPU socket.
 
     Each worker is associated with a single CPU socket. The worker is 
@@ -266,6 +266,18 @@ def initialize_cache(self, num_gpu_blocks: int,
         # Initialize the cache.
         self._init_cache_engine()
 
+    def add_lora(self, lora_request: LoRARequest) -> bool:
+        return self.model_runner.add_lora(lora_request)
+
+    def remove_lora(self, lora_id: int) -> bool:
+        return self.model_runner.remove_lora(lora_id)
+
+    def pin_lora(self, lora_id: int) -> bool:
+        return self.model_runner.pin_lora(lora_id)
+
+    def list_loras(self) -> Set[int]:
+        return self.model_runner.list_loras()
+
     def _validate_num_cpu_blocks(self, num_cpu_blocks: int) -> None:
         """Raise errors if the num_cpu_blocks is invalid.
         """

From 263a870ee18bd6a90e25dbfa342be32c6b92c33e Mon Sep 17 00:00:00 2001
From: Avshalom Manevich <12231371+avshalomman@users.noreply.github.com>
Date: Sun, 12 Jan 2025 17:53:51 +0200
Subject: [PATCH 186/462] [Hardware][TPU] workaround fix for MoE on TPU
 (#11764)

---
 tests/kernels/test_moe.py                     |  7 +++
 vllm/model_executor/layers/fused_moe/layer.py |  3 +-
 .../layers/fused_moe/moe_torch_iterative.py   | 51 +++++++++++++++++++
 3 files changed, 60 insertions(+), 1 deletion(-)
 create mode 100644 vllm/model_executor/layers/fused_moe/moe_torch_iterative.py

diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py
index 8b23b62826053..7fa5de1984452 100644
--- a/tests/kernels/test_moe.py
+++ b/tests/kernels/test_moe.py
@@ -14,6 +14,8 @@
 from vllm.model_executor.layers.fused_moe import fused_moe
 from vllm.model_executor.layers.fused_moe.fused_moe import (
     fused_topk, moe_align_block_size)
+from vllm.model_executor.layers.fused_moe.moe_torch_iterative import (
+    fused_moe as iterative_moe)
 from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
     marlin_quantize)
 from vllm.model_executor.models.mixtral import MixtralMoE
@@ -46,6 +48,11 @@ def test_fused_moe(
     triton_output = fused_moe(a, w1, w2, score, topk, renormalize=False)
     torch_output = torch_moe(a, w1, w2, score, topk)
     torch.testing.assert_close(triton_output, torch_output, atol=2e-2, rtol=0)
+    iterative_output = iterative_moe(a, w1, w2, score, topk, renormalize=False)
+    torch.testing.assert_close(iterative_output,
+                               torch_output,
+                               atol=2e-2,
+                               rtol=0)
 
 
 @pytest.mark.parametrize("dtype",
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index cf5db368926b4..3d822fc0c7f99 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -20,7 +20,8 @@
 else:
     fused_experts = None  # type: ignore
 if current_platform.is_tpu():
-    from .moe_pallas import fused_moe as fused_moe_pallas
+    # the iterative moe implementation is used until the moe_pallas is fixed
+    from .moe_torch_iterative import fused_moe as fused_moe_pallas
 else:
     fused_moe_pallas = None  # type: ignore
 logger = init_logger(__name__)
diff --git a/vllm/model_executor/layers/fused_moe/moe_torch_iterative.py b/vllm/model_executor/layers/fused_moe/moe_torch_iterative.py
new file mode 100644
index 0000000000000..bcff55f4fdf16
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/moe_torch_iterative.py
@@ -0,0 +1,51 @@
+import torch
+import torch.nn.functional as F
+
+
+def fused_moe(
+    hidden_states: torch.Tensor,
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    gating_output: torch.Tensor,
+    topk: int,
+    renormalize: bool,
+) -> torch.Tensor:
+    """
+    Args:
+        hidden_states: [*, hidden_size]
+        w1: [num_experts, intermediate_size * 2, hidden_size]
+        w2: [num_experts, hidden_size, intermediate_size]
+        gating_output: [*, num_experts]
+    """
+    orig_shape = hidden_states.shape
+    hidden_size = hidden_states.shape[-1]
+    num_tokens = hidden_states.shape[:-1].numel()
+    num_experts = w1.shape[0]
+    intermediate_size = w2.shape[-1]
+    dtype = hidden_states.dtype
+
+    hidden_states = hidden_states.view(num_tokens, hidden_size)
+    gating_output = gating_output.view(num_tokens, num_experts)
+    topk_weights = gating_output.softmax(dim=-1, dtype=torch.float)
+    topk_weights, selected_experts = topk_weights.topk(topk, dim=-1)
+    if renormalize:
+        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
+    topk_weights = topk_weights.to(dtype)
+
+    final_hidden_states = None
+    for expert_idx in range(num_experts):
+        expert_w1 = w1[expert_idx]
+        expert_w2 = w2[expert_idx]
+        expert_mask = (selected_experts == expert_idx)
+        expert_weights = (topk_weights * expert_mask).sum(dim=-1, keepdim=True)
+        x = F.linear(hidden_states, expert_w1)
+        gate = F.silu(x[:, :intermediate_size])
+        x = x[:, intermediate_size:] * gate
+        x = F.linear(x, expert_w2)
+        current_hidden_states = x * expert_weights
+        if final_hidden_states is None:
+            final_hidden_states = current_hidden_states
+        else:
+            final_hidden_states = final_hidden_states + current_hidden_states
+
+    return final_hidden_states.view(orig_shape)  # type: ignore

From 9597a095f2c02670b44f5973635ce4b9852e8eab Mon Sep 17 00:00:00 2001
From: Robert Shaw
 <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
Date: Sun, 12 Jan 2025 16:02:02 -0500
Subject: [PATCH 187/462] [V1][Core][1/n] Logging and Metrics (#11962)

Signed-off-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com>
---
 tests/v1/engine/test_engine_core.py        |  4 +-
 tests/v1/engine/test_engine_core_client.py |  4 +-
 vllm/v1/core/scheduler.py                  | 20 ++++++--
 vllm/v1/engine/__init__.py                 |  3 ++
 vllm/v1/engine/async_llm.py                | 26 ++++++++---
 vllm/v1/engine/core.py                     | 53 +++++++---------------
 vllm/v1/engine/core_client.py              | 48 ++++++++------------
 vllm/v1/engine/llm_engine.py               |  5 +-
 vllm/v1/metrics/__init__.py                |  0
 vllm/v1/metrics/loggers.py                 | 38 ++++++++++++++++
 vllm/v1/metrics/stats.py                   | 12 +++++
 11 files changed, 129 insertions(+), 84 deletions(-)
 create mode 100644 vllm/v1/metrics/__init__.py
 create mode 100644 vllm/v1/metrics/loggers.py
 create mode 100644 vllm/v1/metrics/stats.py

diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py
index 5b1732036e807..cccfd305ac604 100644
--- a/tests/v1/engine/test_engine_core.py
+++ b/tests/v1/engine/test_engine_core.py
@@ -80,7 +80,7 @@ def test_engine_core(monkeypatch):
         assert len(engine_core.scheduler.running) == 4
 
         # Loop through until they are all done.
-        while len(engine_core.step()) > 0:
+        while len(engine_core.step().outputs) > 0:
             pass
 
         assert len(engine_core.scheduler.waiting) == 0
@@ -170,7 +170,7 @@ def test_engine_core_advanced_sampling(monkeypatch):
         assert len(engine_core.scheduler.waiting) == 1
         assert len(engine_core.scheduler.running) == 0
         # Loop through until they are all done.
-        while len(engine_core.step()) > 0:
+        while len(engine_core.step().outputs) > 0:
             pass
 
         assert len(engine_core.scheduler.waiting) == 0
diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py
index 7eac16f2cf542..e2c728b22d481 100644
--- a/tests/v1/engine/test_engine_core_client.py
+++ b/tests/v1/engine/test_engine_core_client.py
@@ -43,7 +43,7 @@ def make_request(params: SamplingParams) -> EngineCoreRequest:
 def loop_until_done(client: EngineCoreClient, outputs: Dict):
 
     while True:
-        engine_core_outputs = client.get_output()
+        engine_core_outputs = client.get_output().outputs
 
         if len(engine_core_outputs) == 0:
             break
@@ -61,7 +61,7 @@ def loop_until_done(client: EngineCoreClient, outputs: Dict):
 async def loop_until_done_async(client: EngineCoreClient, outputs: Dict):
 
     while True:
-        engine_core_outputs = await client.get_output_async()
+        engine_core_outputs = await client.get_output_async().outputs
 
         if len(engine_core_outputs) == 0:
             break
diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py
index b26716f5c02e6..f04e529891287 100644
--- a/vllm/v1/core/scheduler.py
+++ b/vllm/v1/core/scheduler.py
@@ -8,7 +8,8 @@
 from vllm.sampling_params import SamplingParams
 from vllm.v1.core.encoder_cache_manager import EncoderCacheManager
 from vllm.v1.core.kv_cache_manager import KVCacheManager
-from vllm.v1.engine import EngineCoreOutput
+from vllm.v1.engine import EngineCoreOutput, EngineCoreOutputs
+from vllm.v1.metrics.stats import SchedulerStats
 from vllm.v1.outputs import ModelRunnerOutput
 from vllm.v1.request import Request, RequestStatus
 
@@ -394,12 +395,12 @@ def update_from_output(
         self,
         scheduler_output: "SchedulerOutput",
         model_runner_output: "ModelRunnerOutput",
-    ) -> List[EngineCoreOutput]:
+    ) -> EngineCoreOutputs:
         # NOTE(woosuk): This method doesn't consider speculative decoding.
         sampled_token_ids = model_runner_output.sampled_token_ids
         num_scheduled_tokens = scheduler_output.num_scheduled_tokens
         new_running: List[Request] = []
-        engine_core_outputs: List[EngineCoreOutput] = []
+        outputs: List[EngineCoreOutput] = []
         for request in self.running:
             req_id = request.request_id
             request.num_computed_tokens += num_scheduled_tokens[req_id]
@@ -438,7 +439,7 @@ def update_from_output(
                     finished=request.is_finished(),
                     finish_reason=request.get_finished_reason(),
                     stop_reason=request.stop_reason)
-                engine_core_outputs.append(output)
+                outputs.append(output)
 
                 # Breakout of the loop.
                 if stopped:
@@ -446,7 +447,10 @@ def update_from_output(
 
             new_running.append(request)
         self.running = new_running
-        return engine_core_outputs
+        return EngineCoreOutputs(
+            outputs=outputs,
+            scheduler_stats=self.make_stats(),
+        )
 
     def _check_stop(self, request: Request) -> bool:
         if (request.num_tokens >= self.max_model_len
@@ -515,6 +519,12 @@ def get_num_unfinished_requests(self) -> int:
     def has_unfinished_requests(self) -> bool:
         return self.get_num_unfinished_requests() > 0
 
+    def make_stats(self) -> SchedulerStats:
+        return SchedulerStats(
+            num_running_reqs=len(self.running),
+            num_waiting_reqs=len(self.waiting),
+        )
+
 
 @dataclass
 class NewRequestData:
diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py
index 3ce9db0e47eed..6d90c38c72cf5 100644
--- a/vllm/v1/engine/__init__.py
+++ b/vllm/v1/engine/__init__.py
@@ -4,6 +4,8 @@
 
 import msgspec
 
+from vllm.v1.metrics.stats import SchedulerStats
+
 if TYPE_CHECKING:
     from vllm.lora.request import LoRARequest
     from vllm.multimodal import MultiModalKwargs
@@ -56,6 +58,7 @@ class EngineCoreOutputs(
 
     # [num_reqs]
     outputs: List[EngineCoreOutput]
+    scheduler_stats: SchedulerStats
 
 
 @dataclass
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index 5daae45dee85c..e0ceb59dffcbd 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -4,7 +4,6 @@
 
 from vllm.config import ModelConfig, VllmConfig
 from vllm.engine.arg_utils import AsyncEngineArgs
-from vllm.engine.metrics_types import StatLoggerBase
 from vllm.engine.protocol import EngineClient
 from vllm.inputs import INPUT_REGISTRY, InputRegistry, PromptType
 from vllm.inputs.preprocess import InputPreprocessor
@@ -22,6 +21,8 @@
 from vllm.v1.engine.detokenizer import Detokenizer
 from vllm.v1.engine.processor import Processor
 from vllm.v1.executor.abstract import Executor
+from vllm.v1.metrics.loggers import LoggingStatLogger, StatLoggerBase
+from vllm.v1.metrics.stats import SchedulerStats
 
 logger = init_logger(__name__)
 
@@ -34,7 +35,6 @@ def __init__(
         executor_class: Type[Executor],
         log_stats: bool,
         usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
-        stat_loggers: Optional[Dict[str, StatLoggerBase]] = None,
         input_registry: InputRegistry = INPUT_REGISTRY,
         use_cached_outputs: bool = False,
         log_requests: bool = True,
@@ -45,7 +45,10 @@ def __init__(
 
         self.log_requests = log_requests
         self.log_stats = log_stats
-        self.stat_loggers = stat_loggers
+        self.stat_loggers: List[StatLoggerBase] = [
+            LoggingStatLogger(),
+            # TODO(rob): PrometheusStatLogger(),
+        ]
         self.model_config = vllm_config.model_config
 
         # Tokenizer (+ ensure liveness if running in another process).
@@ -82,7 +85,6 @@ def __init__(
             asyncio_mode=True,
             vllm_config=vllm_config,
             executor_class=executor_class,
-            log_stats=self.log_stats,
         )
 
         self.output_handler: Optional[asyncio.Task] = None
@@ -94,7 +96,6 @@ def from_engine_args(
         engine_config: Optional[VllmConfig] = None,
         start_engine_loop: bool = True,
         usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
-        stat_loggers: Optional[Dict[str, StatLoggerBase]] = None,
     ) -> "AsyncLLM":
         """Create an AsyncLLM from the EngineArgs."""
 
@@ -114,7 +115,6 @@ def from_engine_args(
             log_stats=not engine_args.disable_log_stats,
             start_engine_loop=start_engine_loop,
             usage_context=usage_context,
-            stat_loggers=stat_loggers,
         )
 
     def shutdown(self):
@@ -254,7 +254,8 @@ async def _run_output_handler(self):
                 outputs = await self.engine_core.get_output_async()
 
                 # 2) Detokenize based on the output.
-                request_outputs, reqs_to_abort = self.detokenizer.step(outputs)
+                request_outputs, reqs_to_abort = self.detokenizer.step(
+                    outputs.outputs)
 
                 # 3) Put the RequestOutputs into the per-request queues.
                 self._process_request_outputs(request_outputs)
@@ -262,6 +263,9 @@ async def _run_output_handler(self):
                 # 4) Abort any requests that finished due to stop strings.
                 await self.engine_core.abort_requests_async(reqs_to_abort)
 
+                # 5) Log any stats.
+                await self._log_stats(scheduler_stats=outputs.scheduler_stats)
+
         except Exception as e:
             logger.exception("EngineCore output handler hit an error: %s", e)
             kill_process_tree(os.getpid())
@@ -278,6 +282,14 @@ async def abort(self, request_id: str) -> None:
         if request_id in self.rid_to_queue:
             del self.rid_to_queue[request_id]
 
+    async def _log_stats(self, scheduler_stats: SchedulerStats):
+        """Log stats to the stat loggers."""
+        if not self.log_stats:
+            return
+
+        for logger in self.stat_loggers:
+            logger.log(scheduler_stats=scheduler_stats)
+
     def encode(
         self,
         prompt: PromptType,
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index 975ce11fe8aff..e7f90d3c62142 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -17,9 +17,9 @@
     maybe_register_config_serialize_by_value)
 from vllm.utils import get_exception_traceback, zmq_socket_ctx
 from vllm.v1.core.scheduler import Scheduler
-from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs,
-                            EngineCoreProfile, EngineCoreRequest,
-                            EngineCoreRequestType, EngineCoreRequestUnion)
+from vllm.v1.engine import (EngineCoreOutputs, EngineCoreProfile,
+                            EngineCoreRequest, EngineCoreRequestType,
+                            EngineCoreRequestUnion)
 from vllm.v1.engine.mm_input_mapper import MMInputMapperServer
 from vllm.v1.executor.abstract import Executor
 from vllm.v1.request import Request, RequestStatus
@@ -28,9 +28,7 @@
 
 logger = init_logger(__name__)
 
-POLLING_TIMEOUT_MS = 5000
-POLLING_TIMEOUT_S = POLLING_TIMEOUT_MS // 1000
-LOGGING_TIME_S = 5
+POLLING_TIMEOUT_S = 2.5
 
 
 class EngineCore:
@@ -40,10 +38,8 @@ def __init__(
         self,
         vllm_config: VllmConfig,
         executor_class: Type[Executor],
-        log_stats: bool = False,
     ):
         assert vllm_config.model_config.runner_type != "pooling"
-        self.log_stats = log_stats
 
         logger.info("Initializing an LLM engine (v%s) with config: %s",
                     VLLM_VERSION, vllm_config)
@@ -62,8 +58,6 @@ def __init__(
                                    vllm_config.cache_config,
                                    vllm_config.lora_config)
 
-        self._last_logging_time = time.time()
-
         self.mm_input_mapper_server = MMInputMapperServer(
             vllm_config.model_config)
 
@@ -114,11 +108,12 @@ def abort_requests(self, request_ids: List[str]):
         self.scheduler.finish_requests(request_ids,
                                        RequestStatus.FINISHED_ABORTED)
 
-    def step(self) -> List[EngineCoreOutput]:
+    def step(self) -> EngineCoreOutputs:
         """Schedule, execute, and make output."""
 
         if not self.scheduler.has_unfinished_requests():
-            return []
+            return EngineCoreOutputs(
+                outputs=[], scheduler_stats=self.scheduler.make_stats())
 
         scheduler_output = self.scheduler.schedule()
         output = self.model_executor.execute_model(scheduler_output)
@@ -145,7 +140,9 @@ def __init__(
         executor_class: Type[Executor],
         log_stats: bool = False,
     ):
-        super().__init__(vllm_config, executor_class, log_stats)
+        super().__init__(vllm_config, executor_class)
+
+        self.log_stats = log_stats
 
         # Background Threads and Queues for IO. These enable us to
         # overlap ZMQ socket IO with GPU since they release the GIL,
@@ -153,7 +150,7 @@ def __init__(
         # model forward pass.
         # Threads handle Socket <-> Queues and core_busy_loop uses Queue.
         self.input_queue: queue.Queue[EngineCoreRequestUnion] = queue.Queue()
-        self.output_queue: queue.Queue[List[EngineCoreOutput]] = queue.Queue()
+        self.output_queue: queue.Queue[EngineCoreOutputs] = queue.Queue()
         threading.Thread(target=self.process_input_socket,
                          args=(input_path, ),
                          daemon=True).start()
@@ -217,8 +214,10 @@ def run_busy_loop(self):
                         self._handle_client_request(req)
                         break
                     except queue.Empty:
-                        self._log_stats()
                         logger.debug("EngineCore busy loop waiting.")
+                        # Break out the loop so we can log_stats in step().
+                        if self.log_stats:
+                            break
                     except BaseException:
                         raise
 
@@ -230,28 +229,9 @@ def run_busy_loop(self):
             # 3) Step the engine core.
             outputs = self.step()
 
-            # 4) Put EngineCoreOutputs into the output queue.
+            # 5) Put EngineCoreOutputs into the output queue.
             self.output_queue.put_nowait(outputs)
 
-            self._log_stats()
-
-    def _log_stats(self):
-        """Log basic stats every LOGGING_TIME_S"""
-
-        if not self.log_stats:
-            return
-
-        now = time.time()
-
-        if now - self._last_logging_time > LOGGING_TIME_S:
-            logger.info(
-                "RUNNING: %s | WAITING: %s",
-                len(self.scheduler.running),
-                len(self.scheduler.waiting),
-            )
-
-            self._last_logging_time = now
-
     def _handle_client_request(self, request: EngineCoreRequestUnion) -> None:
         """Handle EngineCoreRequest or EngineCoreABORT from Client."""
 
@@ -301,7 +281,6 @@ def process_output_socket(self, output_path: str):
 
         with zmq_socket_ctx(output_path, zmq.constants.PUSH) as socket:
             while True:
-                engine_core_outputs = self.output_queue.get()
-                outputs = EngineCoreOutputs(outputs=engine_core_outputs)
+                outputs = self.output_queue.get()
                 encoder.encode_into(outputs, buffer)
                 socket.send_multipart((buffer, ), copy=False)
diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py
index 4ed7f944b058f..9d6ae725e9d2b 100644
--- a/vllm/v1/engine/core_client.py
+++ b/vllm/v1/engine/core_client.py
@@ -12,9 +12,9 @@
 from vllm.logger import init_logger
 from vllm.utils import (get_open_zmq_ipc_path, kill_process_tree,
                         make_zmq_socket)
-from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs,
-                            EngineCoreProfile, EngineCoreRequest,
-                            EngineCoreRequestType, EngineCoreRequestUnion)
+from vllm.v1.engine import (EngineCoreOutputs, EngineCoreProfile,
+                            EngineCoreRequest, EngineCoreRequestType,
+                            EngineCoreRequestUnion)
 from vllm.v1.engine.core import EngineCore, EngineCoreProc
 from vllm.v1.executor.abstract import Executor
 from vllm.v1.serial_utils import PickleEncoder
@@ -40,7 +40,6 @@ def make_client(
         asyncio_mode: bool,
         vllm_config: VllmConfig,
         executor_class: Type[Executor],
-        log_stats: bool = False,
     ) -> "EngineCoreClient":
 
         # TODO: support this for debugging purposes.
@@ -50,18 +49,18 @@ def make_client(
                 "is not currently supported.")
 
         if multiprocess_mode and asyncio_mode:
-            return AsyncMPClient(vllm_config, executor_class, log_stats)
+            return AsyncMPClient(vllm_config, executor_class)
 
         if multiprocess_mode and not asyncio_mode:
-            return SyncMPClient(vllm_config, executor_class, log_stats)
+            return SyncMPClient(vllm_config, executor_class)
 
-        return InprocClient(vllm_config, executor_class, log_stats)
+        return InprocClient(vllm_config, executor_class)
 
     @abstractmethod
     def shutdown(self):
         ...
 
-    def get_output(self) -> List[EngineCoreOutput]:
+    def get_output(self) -> EngineCoreOutputs:
         raise NotImplementedError
 
     def add_request(self, request: EngineCoreRequest) -> None:
@@ -73,7 +72,7 @@ def profile(self, is_start: bool = True) -> None:
     def abort_requests(self, request_ids: List[str]) -> None:
         raise NotImplementedError
 
-    async def get_output_async(self) -> List[EngineCoreOutput]:
+    async def get_output_async(self) -> EngineCoreOutputs:
         raise NotImplementedError
 
     async def add_request_async(self, request: EngineCoreRequest) -> None:
@@ -99,7 +98,7 @@ class InprocClient(EngineCoreClient):
     def __init__(self, *args, **kwargs):
         self.engine_core = EngineCore(*args, **kwargs)
 
-    def get_output(self) -> List[EngineCoreOutput]:
+    def get_output(self) -> EngineCoreOutputs:
         return self.engine_core.step()
 
     def add_request(self, request: EngineCoreRequest) -> None:
@@ -133,7 +132,7 @@ def __init__(
         asyncio_mode: bool,
         vllm_config: VllmConfig,
         executor_class: Type[Executor],
-        log_stats: bool = False,
+        log_stats: bool,
     ):
         # The child processes will send SIGUSR1 when unrecoverable
         # errors happen. We kill the process tree here so that the
@@ -194,22 +193,19 @@ def shutdown(self):
 class SyncMPClient(MPClient):
     """Synchronous client for multi-proc EngineCore."""
 
-    def __init__(self,
-                 vllm_config: VllmConfig,
-                 executor_class: Type[Executor],
-                 log_stats: bool = False):
+    def __init__(self, vllm_config: VllmConfig,
+                 executor_class: Type[Executor]):
         super().__init__(
             asyncio_mode=False,
             vllm_config=vllm_config,
             executor_class=executor_class,
-            log_stats=log_stats,
+            log_stats=False,
         )
 
-    def get_output(self) -> List[EngineCoreOutput]:
+    def get_output(self) -> EngineCoreOutputs:
 
         (frame, ) = self.output_socket.recv_multipart(copy=False)
-        engine_core_outputs = self.decoder.decode(frame.buffer).outputs
-        return engine_core_outputs
+        return self.decoder.decode(frame.buffer)
 
     def _send_input(self, request_type: EngineCoreRequestType,
                     request: EngineCoreRequestUnion) -> None:
@@ -235,23 +231,19 @@ def profile(self, is_start: bool = True) -> None:
 class AsyncMPClient(MPClient):
     """Asyncio-compatible client for multi-proc EngineCore."""
 
-    def __init__(self,
-                 vllm_config: VllmConfig,
-                 executor_class: Type[Executor],
-                 log_stats: bool = False):
+    def __init__(self, vllm_config: VllmConfig,
+                 executor_class: Type[Executor]):
         super().__init__(
             asyncio_mode=True,
             vllm_config=vllm_config,
             executor_class=executor_class,
-            log_stats=log_stats,
+            log_stats=True,
         )
 
-    async def get_output_async(self) -> List[EngineCoreOutput]:
+    async def get_output_async(self) -> EngineCoreOutputs:
 
         frames = await self.output_socket.recv_multipart(copy=False)
-        engine_core_outputs = self.decoder.decode(frames[0].buffer).outputs
-
-        return engine_core_outputs
+        return self.decoder.decode(frames[0].buffer)
 
     async def _send_input(self, request_type: EngineCoreRequestType,
                           request: EngineCoreRequestUnion) -> None:
diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py
index 8ced3a34d2da3..ac392f5e4f4cf 100644
--- a/vllm/v1/engine/llm_engine.py
+++ b/vllm/v1/engine/llm_engine.py
@@ -74,7 +74,6 @@ def __init__(
             asyncio_mode=False,
             vllm_config=vllm_config,
             executor_class=executor_class,
-            log_stats=False,
         )
 
     @classmethod
@@ -147,11 +146,11 @@ def add_request(
     def step(self) -> List[RequestOutput]:
 
         # 1) Get EngineCoreOutput from the EngineCore.
-        engine_core_outputs = self.engine_core.get_output()
+        outputs = self.engine_core.get_output()
 
         # 2) Detokenizer the EngineCoreOutput.
         request_outputs, requests_to_abort = self.detokenizer.step(
-            engine_core_outputs)
+            outputs.outputs)
 
         # 3) Abort requests that finished due to stopping criteria.
         if requests_to_abort:
diff --git a/vllm/v1/metrics/__init__.py b/vllm/v1/metrics/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py
new file mode 100644
index 0000000000000..8feeef17542e6
--- /dev/null
+++ b/vllm/v1/metrics/loggers.py
@@ -0,0 +1,38 @@
+import time
+from abc import ABC, abstractmethod
+
+from vllm.logger import init_logger
+from vllm.v1.metrics.stats import SchedulerStats
+
+logger = init_logger(__name__)
+
+_LOCAL_LOGGING_INTERVAL_SEC = 5.0
+
+
+class StatLoggerBase(ABC):
+
+    @abstractmethod
+    def log(self, scheduler_stats: SchedulerStats):
+        ...
+
+
+class LoggingStatLogger(StatLoggerBase):
+
+    def __init__(self):
+        self.last_log_time = time.monotonic()
+
+    def log(self, scheduler_stats: SchedulerStats):
+        """Log Stats to standard output."""
+
+        # Log every _LOCAL_LOGGING_INTERVAL_SEC.
+        now = time.monotonic()
+        if now - self.last_log_time < _LOCAL_LOGGING_INTERVAL_SEC:
+            return
+        self.last_log_time = now
+
+        # Format and print output.
+        logger.info(
+            "Running: %d reqs, Waiting: %d reqs ",
+            scheduler_stats.num_running_reqs,
+            scheduler_stats.num_waiting_reqs,
+        )
diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py
new file mode 100644
index 0000000000000..5ebb4fd5b37db
--- /dev/null
+++ b/vllm/v1/metrics/stats.py
@@ -0,0 +1,12 @@
+from dataclasses import dataclass
+
+
+@dataclass
+class SchedulerStats:
+    """Stats associated with the scheduler."""
+
+    num_running_reqs: int = 0
+    num_waiting_reqs: int = 0
+
+    # gpu_cache_usage: float = 0.0
+    # gpu_prefix_cache_hit_rate: float = 0.0

From d14e98d924724b284dc5eaf8070d935e214e50c0 Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Mon, 13 Jan 2025 08:13:44 +0800
Subject: [PATCH 188/462] [Model] Support GGUF models newly added in
 `transformers` 4.46.0 (#9685)

Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
---
 examples/offline_inference/gguf_inference.py  |  22 ++--
 .../models/decoder_only/language/test_gguf.py | 105 ++++++++++++------
 vllm/model_executor/layers/linear.py          |  58 ++++++----
 vllm/model_executor/models/gpt2.py            |  11 +-
 vllm/model_executor/models/llama.py           |   3 +-
 vllm/model_executor/models/stablelm.py        |  31 ++++--
 vllm/model_executor/models/starcoder2.py      |  19 +++-
 7 files changed, 162 insertions(+), 87 deletions(-)

diff --git a/examples/offline_inference/gguf_inference.py b/examples/offline_inference/gguf_inference.py
index 09a5fcc22e553..aa05c4c0bfaa5 100644
--- a/examples/offline_inference/gguf_inference.py
+++ b/examples/offline_inference/gguf_inference.py
@@ -3,27 +3,20 @@
 from vllm import LLM, SamplingParams
 
 
-def run_gguf_inference(model_path):
-    PROMPT_TEMPLATE = "<|system|>\n{system_message}</s>\n<|user|>\n{prompt}</s>\n<|assistant|>\n"  # noqa: E501
-    system_message = "You are a friendly chatbot who always responds in the style of a pirate."  # noqa: E501
+def run_gguf_inference(model_path, tokenizer):
     # Sample prompts.
     prompts = [
         "How many helicopters can a human eat in one sitting?",
         "What's the future of AI?",
     ]
-    prompts = [
-        PROMPT_TEMPLATE.format(system_message=system_message, prompt=prompt)
-        for prompt in prompts
-    ]
+    prompts = [[{"role": "user", "content": prompt}] for prompt in prompts]
     # Create a sampling params object.
     sampling_params = SamplingParams(temperature=0, max_tokens=128)
 
     # Create an LLM.
-    llm = LLM(model=model_path,
-              tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
-              gpu_memory_utilization=0.95)
+    llm = LLM(model=model_path, tokenizer=tokenizer)
 
-    outputs = llm.generate(prompts, sampling_params)
+    outputs = llm.chat(prompts, sampling_params)
     # Print the outputs.
     for output in outputs:
         prompt = output.prompt
@@ -32,7 +25,8 @@ def run_gguf_inference(model_path):
 
 
 if __name__ == "__main__":
-    repo_id = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
-    filename = "tinyllama-1.1b-chat-v1.0.Q4_0.gguf"
+    repo_id = "bartowski/Phi-3-medium-4k-instruct-GGUF"
+    filename = "Phi-3-medium-4k-instruct-IQ2_M.gguf"
+    tokenizer = "microsoft/Phi-3-medium-4k-instruct"
     model = hf_hub_download(repo_id, filename=filename)
-    run_gguf_inference(model)
+    run_gguf_inference(model, tokenizer)
diff --git a/tests/models/decoder_only/language/test_gguf.py b/tests/models/decoder_only/language/test_gguf.py
index 2b8f5e2faa45e..81b93ebdf0fc0 100644
--- a/tests/models/decoder_only/language/test_gguf.py
+++ b/tests/models/decoder_only/language/test_gguf.py
@@ -4,6 +4,7 @@
 """
 
 import os
+from typing import List, NamedTuple, Type
 
 import pytest
 from huggingface_hub import hf_hub_download
@@ -11,6 +12,7 @@
 
 from tests.quantization.utils import is_quant_method_supported
 
+from ....conftest import VllmRunner
 from ...utils import check_logprobs_close
 
 os.environ["TOKENIZERS_PARALLELISM"] = "true"
@@ -18,31 +20,74 @@
 MAX_MODEL_LEN = 1024
 
 
+class GGUFTestConfig(NamedTuple):
+    original_model: str
+    gguf_repo: str
+    gguf_filename: str
+
+    @property
+    def gguf_model(self):
+        return hf_hub_download(self.gguf_repo, filename=self.gguf_filename)
+
+
+LLAMA_CONFIG = GGUFTestConfig(
+    original_model="meta-llama/Llama-3.2-1B-Instruct",
+    gguf_repo="bartowski/Llama-3.2-1B-Instruct-GGUF",
+    gguf_filename="Llama-3.2-1B-Instruct-IQ4_XS.gguf",
+)
+
+QWEN2_CONFIG = GGUFTestConfig(
+    original_model="Qwen/Qwen2.5-1.5B-Instruct",
+    gguf_repo="Qwen/Qwen2.5-1.5B-Instruct-GGUF",
+    gguf_filename="qwen2.5-1.5b-instruct-q6_k.gguf",
+)
+
+PHI3_CONFIG = GGUFTestConfig(
+    original_model="microsoft/Phi-3.5-mini-instruct",
+    gguf_repo="bartowski/Phi-3.5-mini-instruct-GGUF",
+    gguf_filename="Phi-3.5-mini-instruct-IQ4_XS.gguf",
+)
+
+GPT2_CONFIG = GGUFTestConfig(
+    original_model="openai-community/gpt2-large",
+    gguf_repo="QuantFactory/gpt2-large-GGUF",
+    gguf_filename="gpt2-large.Q4_K_M.gguf",
+)
+
+STABLELM_CONFIG = GGUFTestConfig(
+    original_model="stabilityai/stablelm-3b-4e1t",
+    gguf_repo="afrideva/stablelm-3b-4e1t-GGUF",
+    gguf_filename="stablelm-3b-4e1t.q4_k_m.gguf",
+)
+
+STARCODER_CONFIG = GGUFTestConfig(
+    original_model="bigcode/starcoder2-3b",
+    gguf_repo="QuantFactory/starcoder2-3b-GGUF",
+    gguf_filename="starcoder2-3b.Q6_K.gguf",
+)
+
+MODELS = [
+    LLAMA_CONFIG,
+    QWEN2_CONFIG,
+    PHI3_CONFIG,
+    GPT2_CONFIG,
+    STABLELM_CONFIG,
+    # STARCODER_CONFIG, # broken
+]
+
+
 @pytest.mark.skipif(not is_quant_method_supported("gguf"),
                     reason="gguf is not supported on this GPU type.")
-@pytest.mark.parametrize(("original_model", "gguf_id", "gguf_path"), [
-    ("meta-llama/Llama-3.2-1B-Instruct",
-     "bartowski/Llama-3.2-1B-Instruct-GGUF",
-     "Llama-3.2-1B-Instruct-Q4_K_M.gguf"),
-    ("meta-llama/Llama-3.2-1B-Instruct",
-     "bartowski/Llama-3.2-1B-Instruct-GGUF",
-     "Llama-3.2-1B-Instruct-IQ4_XS.gguf"),
-    ("Qwen/Qwen2-1.5B-Instruct", "Qwen/Qwen2-1.5B-Instruct-GGUF",
-     "qwen2-1_5b-instruct-q4_k_m.gguf"),
-    ("Qwen/Qwen2-1.5B-Instruct", "legraphista/Qwen2-1.5B-Instruct-IMat-GGUF",
-     "Qwen2-1.5B-Instruct.IQ4_XS.gguf"),
-])
+@pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [32])
 @pytest.mark.parametrize("num_logprobs", [5])
 @pytest.mark.parametrize("tp_size", [1, 2])
 def test_models(
-    num_gpus_available,
-    vllm_runner,
-    example_prompts,
-    original_model,
-    gguf_id,
-    gguf_path,
+    num_gpus_available: int,
+    vllm_runner: Type[VllmRunner],
+    example_prompts: List[str],
+    model: GGUFTestConfig,
     dtype: str,
     max_tokens: int,
     num_logprobs: int,
@@ -51,28 +96,26 @@ def test_models(
     if num_gpus_available < tp_size:
         pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
 
-    gguf_model = hf_hub_download(gguf_id, filename=gguf_path)
-
-    tokenizer = AutoTokenizer.from_pretrained(original_model)
-    messages = [[{
-        'role': 'user',
-        'content': prompt
-    }] for prompt in example_prompts]
-    example_prompts = tokenizer.apply_chat_template(messages,
-                                                    tokenize=False,
-                                                    add_generation_prompt=True)
+    tokenizer = AutoTokenizer.from_pretrained(model.original_model)
+    if tokenizer.chat_template is not None:
+        messages = [[{
+            'role': 'user',
+            'content': prompt
+        }] for prompt in example_prompts]
+        example_prompts = tokenizer.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True)
 
     # Run unquantized model.
-    with vllm_runner(model_name=original_model,
+    with vllm_runner(model_name=model.original_model,
                      dtype=dtype,
                      max_model_len=MAX_MODEL_LEN,
                      tensor_parallel_size=tp_size) as original_model:
-
         original_outputs = original_model.generate_greedy_logprobs(
             example_prompts[:-1], max_tokens, num_logprobs)
 
     # Run gguf model.
-    with vllm_runner(model_name=gguf_model,
+    with vllm_runner(model_name=model.gguf_model,
+                     tokenizer_name=model.original_model,
                      dtype=dtype,
                      max_model_len=MAX_MODEL_LEN,
                      tensor_parallel_size=tp_size) as gguf_model:
diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index 48cfb1b221720..8876ca72792cf 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -447,8 +447,14 @@ def weight_loader(self,
         is_gguf_weight = getattr(param, "is_gguf_weight", False)
         is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False)
         if is_gguf_weight_type:
-            param.data[loaded_shard_id].copy_(loaded_weight)
-            param.shard_weight_type[loaded_shard_id] = loaded_weight.item()
+            if loaded_shard_id is not None:
+                param.data[loaded_shard_id].copy_(loaded_weight)
+                param.shard_weight_type[loaded_shard_id] = loaded_weight.item()
+            else:
+                param.shard_weight_type = {
+                    i: loaded_weight.item()
+                    for i, _ in enumerate(self.output_sizes)
+                }
             return
 
         if is_gguf_weight:
@@ -459,15 +465,15 @@ def weight_loader(self,
             shard_size = loaded_weight.size(output_dim) // tp_size
             start_idx = tp_rank * shard_size
 
-            loaded_weight = loaded_weight.narrow(output_dim, start_idx,
-                                                 shard_size)
-
-            param.shard_id.append(loaded_shard_id)
-            param.shard_id_map[loaded_shard_id] = len(param.data_container)
-            param.data_container.append(loaded_weight)
-            if len(param.data_container) == 2:
-                self.qweight = param.materialize_nested()
-            return
+            if loaded_shard_id is not None:
+                loaded_weight = loaded_weight.narrow(output_dim, start_idx,
+                                                     shard_size)
+                param.shard_id.append(loaded_shard_id)
+                param.shard_id_map[loaded_shard_id] = len(param.data_container)
+                param.data_container.append(loaded_weight)
+                if len(param.data_container) == 2:
+                    self.qweight = param.materialize_nested()
+                return
 
         param_data = param.data
         output_dim = getattr(param, "output_dim", None)
@@ -811,10 +817,16 @@ def weight_loader(self,
         # initialize GGUF param after we know the quantize type
         is_gguf_weight = getattr(param, "is_gguf_weight", False)
         is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False)
-        if is_gguf_weight_type and loaded_shard_id is not None:
+        if is_gguf_weight_type:
             idx_map = {"q": 0, "k": 1, "v": 2}
-            param.data[idx_map[loaded_shard_id]].copy_(loaded_weight)
-            param.shard_weight_type[loaded_shard_id] = loaded_weight.item()
+            if loaded_shard_id is not None:
+                param.data[idx_map[loaded_shard_id]].copy_(loaded_weight)
+                param.shard_weight_type[loaded_shard_id] = loaded_weight.item()
+            else:
+                param.shard_weight_type = {
+                    k: loaded_weight.item()
+                    for k in idx_map
+                }
             return
 
         if is_gguf_weight:
@@ -825,15 +837,15 @@ def weight_loader(self,
             shard_size = loaded_weight.size(output_dim) // tp_size
             start_idx = tp_rank * shard_size
 
-            loaded_weight = loaded_weight.narrow(output_dim, start_idx,
-                                                 shard_size)
-
-            param.shard_id.append(loaded_shard_id)
-            param.shard_id_map[loaded_shard_id] = len(param.data_container)
-            param.data_container.append(loaded_weight)
-            if len(param.data_container) == 3:
-                self.qweight = param.materialize_nested()
-            return
+            if loaded_shard_id is not None:
+                loaded_weight = loaded_weight.narrow(output_dim, start_idx,
+                                                     shard_size)
+                param.shard_id.append(loaded_shard_id)
+                param.shard_id_map[loaded_shard_id] = len(param.data_container)
+                param.data_container.append(loaded_weight)
+                if len(param.data_container) == 3:
+                    self.qweight = param.materialize_nested()
+                return
 
         param_data = param.data
         output_dim = getattr(param, "output_dim", None)
diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py
index fd926ff0254d4..1656a3cc9e46d 100644
--- a/vllm/model_executor/models/gpt2.py
+++ b/vllm/model_executor/models/gpt2.py
@@ -198,7 +198,10 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         assert not config.scale_attn_by_inverse_layer_idx
         assert not config.reorder_and_upcast_attn
         self.embed_dim = config.hidden_size
-        self.wte = VocabParallelEmbedding(config.vocab_size, self.embed_dim)
+        self.wte = VocabParallelEmbedding(config.vocab_size,
+                                          self.embed_dim,
+                                          quant_config=quant_config,
+                                          prefix=f"{prefix}.wte")
         self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim)
         self.start_layer, self.end_layer, self.h = make_layers(
             config.num_hidden_layers,
@@ -259,7 +262,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
             self.lm_head = self.transformer.wte
         else:
             self.lm_head = ParallelLMHead(self.config.vocab_size,
-                                          self.config.hidden_size)
+                                          self.config.hidden_size,
+                                          quant_config=quant_config,
+                                          prefix=f"{prefix}.lm_head")
         self.logits_processor = LogitsProcessor(config.vocab_size)
         self.sampler = get_sampler()
         self.make_empty_intermediate_tensors = (
@@ -304,7 +309,7 @@ def load_weights(self, weights: Iterable[Tuple[str,
         params_dict = dict(self.named_parameters(remove_duplicate=False))
         loaded_params: Set[str] = set()
         for name, loaded_weight in weights:
-            if "lm_head.weight" in name:
+            if name.startswith("lm_head"):
                 # GPT-2 ties the weights of the embedding layer and the final
                 # linear layer.
                 continue
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index 8623da99574bb..17b0fbb777e8e 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -156,7 +156,8 @@ def __init__(
         )
 
         is_neox_style = True
-        if quant_config is not None and quant_config.get_name() == "gguf":
+        is_gguf = quant_config and quant_config.get_name() == "gguf"
+        if is_gguf and config.model_type == "llama":
             is_neox_style = False
 
         self.rotary_emb = get_rope(
diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py
index 6b2107bef0a66..c9d1af78246a6 100644
--- a/vllm/model_executor/models/stablelm.py
+++ b/vllm/model_executor/models/stablelm.py
@@ -22,7 +22,7 @@
 
 import torch
 from torch import nn
-from transformers import PretrainedConfig
+from transformers import StableLmConfig
 
 from vllm.attention import Attention, AttentionMetadata
 from vllm.config import CacheConfig, VllmConfig
@@ -50,8 +50,9 @@
 class StablelmMLP(nn.Module):
 
     def __init__(self,
-                 config: PretrainedConfig,
-                 quant_config: Optional[QuantizationConfig] = None) -> None:
+                 config: StableLmConfig,
+                 quant_config: Optional[QuantizationConfig] = None,
+                 prefix: str = "") -> None:
         super().__init__()
         self.config = config
         self.hidden_size = config.hidden_size
@@ -59,10 +60,13 @@ def __init__(self,
         self.gate_up_proj = MergedColumnParallelLinear(
             config.hidden_size, [config.intermediate_size] * 2,
             bias=False,
-            quant_config=quant_config)
+            quant_config=quant_config,
+            prefix=f"{prefix}.gate_up_proj")
         self.down_proj = RowParallelLinear(config.intermediate_size,
                                            config.hidden_size,
-                                           bias=False)
+                                           bias=False,
+                                           quant_config=quant_config,
+                                           prefix=f"{prefix}.down_proj")
         self.act_fn = SiluAndMul()
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
@@ -75,7 +79,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
 class StablelmAttention(nn.Module):
 
     def __init__(self,
-                 config: PretrainedConfig,
+                 config: StableLmConfig,
                  cache_config: Optional[CacheConfig] = None,
                  quant_config: Optional[QuantizationConfig] = None,
                  prefix: str = "") -> None:
@@ -116,11 +120,13 @@ def __init__(self,
                                           self.total_num_heads,
                                           self.total_num_key_value_heads,
                                           self.qkv_bias,
-                                          quant_config=quant_config)
+                                          quant_config=quant_config,
+                                          prefix=f"{prefix}.qkv_proj")
         self.o_proj = RowParallelLinear(self.total_num_heads * self.head_dim,
                                         self.hidden_size,
                                         bias=False,
-                                        quant_config=quant_config)
+                                        quant_config=quant_config,
+                                        prefix=f"{prefix}.o_proj")
         self.rotary_emb = get_rope(
             self.head_dim,
             rotary_dim=self.rotary_ndims,
@@ -154,7 +160,7 @@ class StablelmDecoderLayer(nn.Module):
 
     def __init__(
         self,
-        config: PretrainedConfig,
+        config: StableLmConfig,
         cache_config: Optional[CacheConfig] = None,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
@@ -164,7 +170,7 @@ def __init__(
                                            cache_config,
                                            quant_config,
                                            prefix=f"{prefix}.self_attn")
-        self.mlp = StablelmMLP(config, quant_config)
+        self.mlp = StablelmMLP(config, quant_config, prefix=f"{prefix}.mlp")
         norm_eps = getattr(config, "norm_eps",
                            getattr(config, "layer_norm_eps", 1e-05))
         self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=norm_eps)
@@ -210,6 +216,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self.embed_tokens = VocabParallelEmbedding(
             config.vocab_size,
             config.hidden_size,
+            quant_config=quant_config,
+            prefix=f"{prefix}.embed_tokens",
         )
         self.start_layer, self.end_layer, self.layers = make_layers(
             config.num_hidden_layers,
@@ -270,7 +278,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
                                         prefix=maybe_prefix(prefix, "model"))
         self.lm_head = ParallelLMHead(config.vocab_size,
                                       config.hidden_size,
-                                      quant_config=quant_config)
+                                      quant_config=quant_config,
+                                      prefix=f"{prefix}.lm_head")
         if self.config.tie_word_embeddings:
             self.lm_head.weight = self.model.embed_tokens.weight
         self.logits_processor = LogitsProcessor(config.vocab_size)
diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py
index 22189a517d313..1cd0dedfed2cb 100644
--- a/vllm/model_executor/models/starcoder2.py
+++ b/vllm/model_executor/models/starcoder2.py
@@ -88,12 +88,14 @@ def __init__(self,
             self.total_num_kv_heads,
             bias=self.use_bias,
             quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
         )
         self.o_proj = RowParallelLinear(
             self.total_num_heads * self.head_dim,
             self.hidden_size,
             bias=self.use_bias,
             quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
         )
         self.rotary_emb = get_rope(
             self.head_dim,
@@ -129,19 +131,22 @@ class Starcoder2MLP(nn.Module):
 
     def __init__(self,
                  config: Starcoder2Config,
-                 quant_config: Optional[QuantizationConfig] = None):
+                 quant_config: Optional[QuantizationConfig] = None,
+                 prefix: str = ""):
         super().__init__()
         self.c_fc = ColumnParallelLinear(
             config.hidden_size,
             config.intermediate_size,
             bias=config.use_bias,
             quant_config=quant_config,
+            prefix=f"{prefix}.c_fc",
         )
         self.c_proj = RowParallelLinear(
             config.intermediate_size,
             config.hidden_size,
             bias=config.use_bias,
             quant_config=quant_config,
+            prefix=f"{prefix}.c_proj",
         )
         self.act = get_act_fn(config.hidden_act)
 
@@ -165,7 +170,9 @@ def __init__(self,
                                              cache_config,
                                              quant_config=quant_config,
                                              prefix=f"{prefix}.self_attn")
-        self.mlp = Starcoder2MLP(config, quant_config=quant_config)
+        self.mlp = Starcoder2MLP(config,
+                                 quant_config=quant_config,
+                                 prefix=f"{prefix}.mlp")
         self.input_layernorm = nn.LayerNorm(config.hidden_size,
                                             eps=config.norm_epsilon)
         self.post_attention_layernorm = nn.LayerNorm(config.hidden_size,
@@ -213,8 +220,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self.vocab_size = config.vocab_size
 
         # TODO: consider padding_idx (currently removed)
-        self.embed_tokens = VocabParallelEmbedding(config.vocab_size,
-                                                   config.hidden_size)
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=f"{prefix}.embed_tokens")
         self.start_layer, self.end_layer, self.layers = make_layers(
             config.num_hidden_layers,
             lambda prefix: Starcoder2DecoderLayer(
@@ -279,6 +289,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
                 org_num_embeddings=config.vocab_size,
                 padding_size=DEFAULT_VOCAB_PADDING_SIZE,
                 quant_config=quant_config,
+                prefix=f"{prefix}.lm_head",
             )
         self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
                                                 config.vocab_size)

From 619ae268c3dc848a9b2f04579ea78ac5655f190f Mon Sep 17 00:00:00 2001
From: Robert Shaw
 <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
Date: Sun, 12 Jan 2025 23:54:10 -0500
Subject: [PATCH 189/462] [V1] [2/n] Logging and Metrics - `OutputProcessor`
 Abstraction (#11973)

Signed-off-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com>
---
 tests/v1/engine/test_async_llm.py             |  69 +++++-
 ...etokenizer.py => test_output_processor.py} |  99 ++++++++-
 vllm/v1/engine/async_llm.py                   |  89 ++++----
 vllm/v1/engine/core_client.py                 |   6 +-
 vllm/v1/engine/detokenizer.py                 | 137 ++----------
 vllm/v1/engine/llm_engine.py                  |  33 ++-
 vllm/v1/engine/output_processor.py            | 200 ++++++++++++++++++
 vllm/v1/metrics/stats.py                      |  27 +++
 8 files changed, 450 insertions(+), 210 deletions(-)
 rename tests/v1/engine/{test_detokenizer.py => test_output_processor.py} (65%)
 create mode 100644 vllm/v1/engine/output_processor.py

diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py
index fffb5b8100ec7..2c805e18eebae 100644
--- a/tests/v1/engine/test_async_llm.py
+++ b/tests/v1/engine/test_async_llm.py
@@ -1,5 +1,5 @@
 import asyncio
-from typing import Tuple
+from typing import List, Tuple
 
 import pytest
 
@@ -13,6 +13,7 @@
                 allow_module_level=True)
 
 ENGINE_ARGS = AsyncEngineArgs(model="meta-llama/Llama-3.2-1B",
+                              enforce_eager=True,
                               disable_log_requests=True)
 
 
@@ -53,17 +54,63 @@ async def test_load(monkeypatch):
                     generate(engine, request_id, NUM_EXPECTED_TOKENS)))
 
         # Confirm that we got all the EXPECTED tokens from the requests.
-        failed_request_id = None
-        tokens = None
         for task in tasks:
             num_generated_tokens, request_id = await task
-            if (num_generated_tokens != NUM_EXPECTED_TOKENS
-                    and failed_request_id is None):
-                failed_request_id = request_id
-                tokens = num_generated_tokens
-
-        assert failed_request_id is None, (
-            f"{failed_request_id} generated {tokens} but "
-            f"expected {NUM_EXPECTED_TOKENS}")
+            assert num_generated_tokens == NUM_EXPECTED_TOKENS, (
+                f"{request_id} generated {num_generated_tokens} but "
+                f"expected {NUM_EXPECTED_TOKENS}")
+
+        assert not engine.output_processor.has_unfinished_requests()
+        engine.shutdown()
+
+
+@pytest.mark.asyncio
+async def test_abort(monkeypatch):
+
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_V1", "1")
+
+        engine = AsyncLLM.from_engine_args(ENGINE_ARGS)
+
+        NUM_REQUESTS = 100
+        NUM_EXPECTED_TOKENS = 100
+        REQUEST_IDS_TO_ABORT = range(1, 100, 10)
+
+        request_ids = [f"request-{i}" for i in range(NUM_REQUESTS)]
+
+        # Create concurrent requests.
+        tasks: List[asyncio.Task] = []
+        for request_id in request_ids:
+            tasks.append(
+                asyncio.create_task(
+                    generate(engine, request_id, NUM_EXPECTED_TOKENS)))
+
+        # API server cancels requests when they disconnect.
+        for idx in REQUEST_IDS_TO_ABORT:
+            tasks[idx].cancel()
+            await asyncio.sleep(0.1)
+
+        # Confirm the other requests are okay.
+        for idx, task in enumerate(tasks):
+            # Confirm that it was actually canceled.
+            if idx in REQUEST_IDS_TO_ABORT:
+                with pytest.raises(asyncio.CancelledError):
+                    await task
+            else:
+                # Otherwise, make sure the request was not impacted.
+                num_generated_tokens, request_id = await task
+                assert num_generated_tokens == NUM_EXPECTED_TOKENS, (
+                    f"{request_id} generated {num_generated_tokens} but "
+                    f"expected {NUM_EXPECTED_TOKENS}")
+
+        assert not engine.output_processor.has_unfinished_requests()
+
+        # Confirm we can do another generation.
+        request_id = f"request-{REQUEST_IDS_TO_ABORT[0]}"
+        task = asyncio.create_task(
+            generate(engine, request_id, NUM_EXPECTED_TOKENS))
+        num_generated_tokens, request_id = await task
+        assert num_generated_tokens == NUM_EXPECTED_TOKENS
+        assert not engine.output_processor.has_unfinished_requests()
 
         engine.shutdown()
diff --git a/tests/v1/engine/test_detokenizer.py b/tests/v1/engine/test_output_processor.py
similarity index 65%
rename from tests/v1/engine/test_detokenizer.py
rename to tests/v1/engine/test_output_processor.py
index aeae697ca32b0..4735c6f947537 100644
--- a/tests/v1/engine/test_detokenizer.py
+++ b/tests/v1/engine/test_output_processor.py
@@ -3,11 +3,18 @@
 import pytest
 from transformers import AutoTokenizer
 
+from vllm.engine.arg_utils import EngineArgs
 from vllm.sampling_params import RequestOutputKind, SamplingParams
+from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
 from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest
-from vllm.v1.engine.detokenizer import Detokenizer
+from vllm.v1.engine.output_processor import OutputProcessor
 
 TOKENIZER_NAME = "mistralai/Mistral-7B-Instruct-v0.3"
+VLLM_CONFIG = EngineArgs(model=TOKENIZER_NAME).create_engine_config()
+TOKENIZER_GROUP = init_tokenizer_from_configs(VLLM_CONFIG.model_config,
+                                              VLLM_CONFIG.scheduler_config,
+                                              VLLM_CONFIG.parallel_config,
+                                              VLLM_CONFIG.lora_config)
 tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)
 
 FULL_STRINGS = [
@@ -66,7 +73,7 @@ def get_outputs(self) -> List[EngineCoreOutput]:
     "request_output_kind",
     [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY])
 def test_incremental_detokenization(request_output_kind: RequestOutputKind):
-    detokenizer = Detokenizer(TOKENIZER_NAME)
+    output_processor = OutputProcessor(TOKENIZER_GROUP, log_stats=False)
     engine_core = MockEngineCore(GENERATION_TOKENS)
 
     # Make N requests.
@@ -93,7 +100,7 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind):
 
     # Add requests to the detokenizer.
     for request in requests:
-        detokenizer.add_request(request)
+        output_processor.add_request(request)
 
     gen_strings = {}
     gen_tokens = {}
@@ -104,7 +111,9 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind):
             break
 
         # Step the Detokenizer.
-        request_outputs, requests_to_abort = detokenizer.step(outputs)
+        processed_outputs = output_processor.process_outputs(outputs, )
+        request_outputs = processed_outputs.request_outputs
+        requests_to_abort = processed_outputs.reqs_to_abort
         assert len(requests_to_abort) == 0
 
         # Update tracking.
@@ -128,13 +137,13 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind):
         assert gen_str == ref_gen_str, f"{gen_str=}, {ref_gen_str=}"
         assert gen_toks == ref_gen_toks, f"{gen_toks=}, {ref_gen_toks=}"
 
-    assert detokenizer.get_num_unfinished_requests() == 0
-    assert not detokenizer.has_unfinished_requests()
+    assert output_processor.get_num_unfinished_requests() == 0
+    assert not output_processor.has_unfinished_requests()
 
 
 @pytest.mark.parametrize("include_stop_str_in_output", [True, False])
 def test_stop_string(include_stop_str_in_output: bool):
-    detokenizer = Detokenizer(TOKENIZER_NAME)
+    output_processor = OutputProcessor(TOKENIZER_GROUP, log_stats=False)
     engine_core = MockEngineCore(GENERATION_TOKENS)
 
     # Make N requests.
@@ -162,7 +171,7 @@ def test_stop_string(include_stop_str_in_output: bool):
 
     # Add requests to the detokenizer.
     for request in requests:
-        detokenizer.add_request(request)
+        output_processor.add_request(request)
 
     gen_strings = {}
     aborted = []
@@ -173,7 +182,9 @@ def test_stop_string(include_stop_str_in_output: bool):
             break
 
         # Step the Detokenizer.
-        request_outputs, requests_to_abort = detokenizer.step(outputs)
+        processed_outputs = output_processor.process_outputs(outputs)
+        request_outputs = processed_outputs.request_outputs
+        requests_to_abort = processed_outputs.reqs_to_abort
         for request_output in request_outputs:
             # If aborted, we should not get a request output.
             assert request_output.request_id not in aborted
@@ -214,5 +225,71 @@ def test_stop_string(include_stop_str_in_output: bool):
             assert gen_str == ref_str_exc_stop, (
                 f"{gen_str=}, {ref_str_exc_stop=}")
 
-    assert detokenizer.get_num_unfinished_requests() == 0
-    assert not detokenizer.has_unfinished_requests()
+    assert output_processor.get_num_unfinished_requests() == 0
+    assert not output_processor.has_unfinished_requests()
+
+
+def test_iteration_stats():
+    output_processor = OutputProcessor(TOKENIZER_GROUP, log_stats=True)
+    engine_core = MockEngineCore(GENERATION_TOKENS)
+
+    # Make N requests.
+    requests = [
+        EngineCoreRequest(
+            request_id=f"request-{idx}",
+            prompt=prompt,
+            prompt_token_ids=prompt_tokens,
+            arrival_time=0,
+            mm_inputs=None,
+            mm_hashes=None,
+            mm_placeholders=None,
+            eos_token_id=None,
+            lora_request=None,
+            sampling_params=SamplingParams(),
+        ) for idx, (
+            prompt,
+            prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS))
+    ]
+
+    # Add all requests except one to the OutputProcessor.
+    num_active = len(GENERATION_TOKENS) - 1
+    for request in requests[:num_active]:
+        output_processor.add_request(request)
+    inactive_request = requests[num_active]
+
+    # First iteration has 2 prefills.
+    outputs = engine_core.get_outputs()[:num_active]
+    processed_outputs = output_processor.process_outputs(outputs)
+    iteration_stats = processed_outputs.iteration_stats
+    total_prompt_tokens = sum(
+        [len(prompt_tokens) for prompt_tokens in PROMPT_TOKENS[:num_active]])
+
+    assert iteration_stats.num_prompt_tokens == total_prompt_tokens
+    assert iteration_stats.num_generation_tokens == num_active
+
+    # Just decodes in this step.
+    outputs = engine_core.get_outputs()[:num_active]
+    processed_outputs = output_processor.process_outputs(outputs)
+    iteration_stats = processed_outputs.iteration_stats
+
+    assert iteration_stats.num_prompt_tokens == 0
+    assert iteration_stats.num_generation_tokens == num_active
+
+    # Add a new request - prefill and 2 decodes in this step.
+    output_processor.add_request(inactive_request)
+    num_active += 1
+    outputs = engine_core.get_outputs()[:num_active]
+    processed_outputs = output_processor.process_outputs(outputs)
+    iteration_stats = processed_outputs.iteration_stats
+    total_prompt_tokens = len(PROMPT_TOKENS[num_active - 1])
+
+    assert iteration_stats.num_prompt_tokens == total_prompt_tokens
+    assert iteration_stats.num_generation_tokens == num_active
+
+    # Just decodes in this step.
+    outputs = engine_core.get_outputs()[:num_active]
+    processed_outputs = output_processor.process_outputs(outputs)
+    iteration_stats = processed_outputs.iteration_stats
+
+    assert iteration_stats.num_prompt_tokens == 0
+    assert iteration_stats.num_generation_tokens == num_active
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index e0ceb59dffcbd..a74699f7513e6 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -1,6 +1,6 @@
 import asyncio
 import os
-from typing import AsyncGenerator, Dict, List, Mapping, Optional, Type, Union
+from typing import AsyncGenerator, List, Mapping, Optional, Type, Union
 
 from vllm.config import ModelConfig, VllmConfig
 from vllm.engine.arg_utils import AsyncEngineArgs
@@ -18,11 +18,11 @@
 from vllm.usage.usage_lib import UsageContext
 from vllm.utils import kill_process_tree
 from vllm.v1.engine.core_client import EngineCoreClient
-from vllm.v1.engine.detokenizer import Detokenizer
+from vllm.v1.engine.output_processor import OutputProcessor
 from vllm.v1.engine.processor import Processor
 from vllm.v1.executor.abstract import Executor
 from vllm.v1.metrics.loggers import LoggingStatLogger, StatLoggerBase
-from vllm.v1.metrics.stats import SchedulerStats
+from vllm.v1.metrics.stats import IterationStats, SchedulerStats
 
 logger = init_logger(__name__)
 
@@ -59,9 +59,6 @@ def __init__(
             lora_config=vllm_config.lora_config)
         self.tokenizer.ping()
 
-        # Request streams (map of request_id -> queue).
-        self.rid_to_queue: Dict[str, asyncio.Queue] = {}
-
         # Processor (converts Inputs --> EngineCoreRequests).
         self.processor = Processor(
             model_config=vllm_config.model_config,
@@ -71,13 +68,9 @@ def __init__(
             input_registry=input_registry,
         )
 
-        # Detokenizer (converts EngineCoreOutputs --> RequestOutput).
-        self.detokenizer = Detokenizer(
-            tokenizer_name=vllm_config.model_config.tokenizer,
-            tokenizer_mode=vllm_config.model_config.tokenizer_mode,
-            trust_remote_code=vllm_config.model_config.trust_remote_code,
-            revision=vllm_config.model_config.tokenizer_revision,
-        )
+        # OutputProcessor (converts EngineCoreOutputs --> RequestOutput).
+        self.output_processor = OutputProcessor(self.tokenizer,
+                                                log_stats=self.log_stats)
 
         # EngineCore (starts the engine in background process).
         self.engine_core = EngineCoreClient.make_client(
@@ -140,9 +133,9 @@ async def add_request(
         """Add new request to the AsyncLLM."""
 
         # 1) Create a new output queue for the request.
-        if request_id in self.rid_to_queue:
+        if self.output_processor.is_request_active(request_id):
             raise ValueError(f"Request id {request_id} already running.")
-        self.rid_to_queue[request_id] = asyncio.Queue()
+        queue: asyncio.Queue[RequestOutput] = asyncio.Queue()
 
         # 2) Convert Input --> Request.
         request = self.processor.process_inputs(request_id, prompt, params,
@@ -151,8 +144,8 @@ async def add_request(
                                                 prompt_adapter_request,
                                                 priority)
 
-        # 3) Add the request to Detokenizer (this process).
-        self.detokenizer.add_request(request)
+        # 3) Add the request to OutputProcessor (this process).
+        self.output_processor.add_request(request, queue)
 
         # 4) Add the EngineCoreRequest to EngineCore (separate process).
         await self.engine_core.add_request_async(request)
@@ -160,7 +153,7 @@ async def add_request(
         if self.log_requests:
             logger.info("Added request %s.", request_id)
 
-        return self.rid_to_queue[request_id]
+        return queue
 
     # TODO: we should support multiple prompts in one call, as you
     # can do with LLM.generate. So that for multi-prompt completion
@@ -217,10 +210,9 @@ async def generate(
                 # task switching under load which helps performance).
                 out = q.get_nowait() if q.qsize() > 0 else await q.get()
 
-                # Note: both Detokenizer and EngineCore handle their
+                # Note: both OutputProcessor and EngineCore handle their
                 # own request cleanup based on finished.
                 if out.finished:
-                    del self.rid_to_queue[request_id]
                     yield out
                     break
 
@@ -233,57 +225,51 @@ async def generate(
             await self.abort(request_id)
             raise
 
-    def _process_request_outputs(self, request_outputs: List[RequestOutput]):
-        """Process outputs by putting them into per-request queues."""
-
-        for request_output in request_outputs:
-            request_id = request_output.request_id
-
-            # Note: it is possible a request was aborted and removed from
-            # the state due to client cancellations, so if we encounter a
-            # request id not in the state, we skip.
-            if request_id in self.rid_to_queue:
-                self.rid_to_queue[request_id].put_nowait(request_output)
-
     async def _run_output_handler(self):
         """Background loop: pulls from EngineCore and pushes to AsyncStreams."""
 
         try:
             while True:
-                # 1) Pull EngineCoreOutput from the EngineCore.
+                # 1) Pull EngineCoreOutputs from the EngineCore.
                 outputs = await self.engine_core.get_output_async()
 
-                # 2) Detokenize based on the output.
-                request_outputs, reqs_to_abort = self.detokenizer.step(
+                # 2) Process EngineCoreOutputs.
+                processed_outputs = self.output_processor.process_outputs(
                     outputs.outputs)
+                # NOTE: RequestOutputs are pushed to their queues.
+                assert len(processed_outputs.request_outputs) == 0
 
-                # 3) Put the RequestOutputs into the per-request queues.
-                self._process_request_outputs(request_outputs)
+                # 3) Abort any reqs that finished due to stop strings.
+                await self.engine_core.abort_requests_async(
+                    processed_outputs.reqs_to_abort)
 
-                # 4) Abort any requests that finished due to stop strings.
-                await self.engine_core.abort_requests_async(reqs_to_abort)
-
-                # 5) Log any stats.
-                await self._log_stats(scheduler_stats=outputs.scheduler_stats)
+                # 4) Logging.
+                # TODO(rob): make into a coroutine and launch it in
+                # background thread once we add Prometheus.
+                self._log_stats(
+                    scheduler_stats=outputs.scheduler_stats,
+                    iteration_stats=processed_outputs.iteration_stats,
+                )
 
         except Exception as e:
             logger.exception("EngineCore output handler hit an error: %s", e)
             kill_process_tree(os.getpid())
 
     async def abort(self, request_id: str) -> None:
-        """Abort RequestId in self, detokenizer, and engine core."""
+        """Abort RequestId in OutputProcessor and EngineCore."""
 
         request_ids = [request_id]
         await self.engine_core.abort_requests_async(request_ids)
-        self.detokenizer.abort_requests(request_ids)
+        self.output_processor.abort_requests(request_ids)
 
-        # If a request finishes while we await then the request_id
-        # will be removed from the tracked queues before we get here.
-        if request_id in self.rid_to_queue:
-            del self.rid_to_queue[request_id]
+        if self.log_requests:
+            logger.info("Aborted request %s.", request_id)
 
-    async def _log_stats(self, scheduler_stats: SchedulerStats):
-        """Log stats to the stat loggers."""
+    def _log_stats(
+        self,
+        scheduler_stats: SchedulerStats,
+        iteration_stats: IterationStats,
+    ):
         if not self.log_stats:
             return
 
@@ -314,8 +300,7 @@ async def get_tokenizer(
         self,
         lora_request: Optional[LoRARequest] = None,
     ) -> AnyTokenizer:
-        assert lora_request is None
-        return self.detokenizer.tokenizer
+        return self.tokenizer.get_lora_tokenizer(lora_request)
 
     async def is_tracing_enabled(self) -> bool:
         return False
diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py
index 9d6ae725e9d2b..ac0f0f14bf1ab 100644
--- a/vllm/v1/engine/core_client.py
+++ b/vllm/v1/engine/core_client.py
@@ -105,7 +105,8 @@ def add_request(self, request: EngineCoreRequest) -> None:
         self.engine_core.add_request(request)
 
     def abort_requests(self, request_ids: List[str]) -> None:
-        self.engine_core.abort_requests(request_ids)
+        if len(request_ids) > 0:
+            self.engine_core.abort_requests(request_ids)
 
     def shutdown(self):
         self.engine_core.shutdown()
@@ -221,7 +222,8 @@ def add_request(self, request: EngineCoreRequest) -> None:
         self._send_input(EngineCoreRequestType.ADD, request)
 
     def abort_requests(self, request_ids: List[str]) -> None:
-        self._send_input(EngineCoreRequestType.ABORT, request_ids)
+        if len(request_ids) > 0:
+            self._send_input(EngineCoreRequestType.ABORT, request_ids)
 
     def profile(self, is_start: bool = True) -> None:
         self._send_input(EngineCoreRequestType.PROFILE,
diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py
index 65be9e58e03c8..4a8b61beec037 100644
--- a/vllm/v1/engine/detokenizer.py
+++ b/vllm/v1/engine/detokenizer.py
@@ -1,18 +1,25 @@
 from dataclasses import dataclass
-from typing import Dict, Iterable, List, Optional, Tuple, Union
+from typing import List, Optional, Union
 
 from vllm.engine.output_processor.stop_checker import StopChecker
 from vllm.logger import init_logger
-from vllm.outputs import RequestOutput
 from vllm.sampling_params import RequestOutputKind
 from vllm.transformers_utils.detokenizer_utils import (
     AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally)
-from vllm.transformers_utils.tokenizer import get_tokenizer
 from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest
 
 logger = init_logger(__name__)
 
 
+@dataclass
+class DetokenizerOutput:
+    output_text: str
+    token_ids: List[int]
+    finished: bool
+    finish_reason: Optional[str] = None
+    stop_reason: Union[int, str, None] = None
+
+
 @dataclass
 class IncrementalDetokenizer:
 
@@ -20,6 +27,7 @@ class IncrementalDetokenizer:
     output_text: str
     tokens: List[str]
     token_ids: List[int]
+    prompt_len: int
 
     # Stop strings
     stop: List[str]
@@ -34,11 +42,6 @@ class IncrementalDetokenizer:
     spaces_between_special_tokens: bool
     output_kind: RequestOutputKind
 
-    # TODO: Probably decouple these
-    request_id: str
-    prompt: Optional[str]
-    prompt_token_ids: List[int]
-
     # Tokenizer for this request
     tokenizer: AnyTokenizer
 
@@ -48,8 +51,7 @@ class IncrementalDetokenizer:
 
     @property
     def output_token_ids(self) -> List[int]:
-        assert len(self.token_ids) >= len(self.prompt_token_ids)
-        return self.token_ids[len(self.prompt_token_ids):]
+        return self.token_ids[self.prompt_len:]
 
     @classmethod
     def from_new_request(
@@ -87,25 +89,25 @@ def from_new_request(
             spaces_between_special_tokens=request.sampling_params.
             spaces_between_special_tokens,
             output_kind=request.sampling_params.output_kind,
-            request_id=request.request_id,
-            prompt=request.prompt,
-            prompt_token_ids=request.prompt_token_ids,
+            prompt_len=len(request.prompt_token_ids),
             tokenizer=tokenizer,
             stop_buffer_length=stop_buffer_length,
         )
 
-    def add_tokens(
+    def update_from_output(
         self,
-        new_token_ids: List[int],
-        finish_reason: Optional[str],
-        stop_reason: Optional[Union[int, str, None]],
-    ) -> Optional[RequestOutput]:
+        output: EngineCoreOutput,
+    ) -> Optional[DetokenizerOutput]:
         """
         Update RequestState for the request_id by:
             1) Detokenize the new token ids incrementally.
             2) Update the RequestOutput with the new text.
         """
 
+        new_token_ids = output.new_token_ids
+        finish_reason = output.finish_reason
+        stop_reason = output.stop_reason
+
         # 1) Detokenize the new token ids incrementally.
         # TODO(woosuk): This method becomes very inefficient when the number of
         # new_token_ids is more than 1. We need to optimize this.
@@ -158,21 +160,8 @@ def add_tokens(
         output_text = self._get_next_output_text(finished, delta)
         token_ids = new_token_ids if delta else self.output_token_ids
 
-        request_output = RequestOutput.new(
-            self.request_id,
-            self.prompt,
-            self.prompt_token_ids,
-            output_text,
-            token_ids,
-            finished,
-        )
-
-        if finished:
-            completion_output = request_output.outputs[0]
-            completion_output.finish_reason = finish_reason
-            completion_output.stop_reason = stop_reason
-
-        return request_output
+        return DetokenizerOutput(output_text, token_ids, finished,
+                                 finish_reason, stop_reason)
 
     def _get_next_output_text(self, finished: bool, delta: bool) -> str:
         """If delta is True, only new text since the last call to
@@ -189,85 +178,3 @@ def _get_next_output_text(self, finished: bool, delta: bool) -> str:
             self._last_output_text_offset = length
             return self.output_text[last_offset:length]
         return ""
-
-
-class Detokenizer:
-
-    def __init__(self,
-                 tokenizer_name: str,
-                 tokenizer_mode: str = "auto",
-                 trust_remote_code: bool = False,
-                 revision: Optional[str] = None):
-        # TODO: once we support LoRA, we should should pass the tokenizer
-        # here. We currently have two copies (this + in the LLMEngine).
-        self.tokenizer = get_tokenizer(tokenizer_name=tokenizer_name,
-                                       tokenizer_mode=tokenizer_mode,
-                                       trust_remote_code=trust_remote_code,
-                                       revision=revision)
-
-        # Request id -> IncrementalDetokenizer
-        self.request_states: Dict[str, IncrementalDetokenizer] = {}
-
-    def is_request_active(self, request_id: str):
-        return request_id in self.request_states
-
-    def get_num_unfinished_requests(self):
-        return len(self.request_states)
-
-    def has_unfinished_requests(self) -> bool:
-        return len(self.request_states) > 0
-
-    def abort_requests(
-        self,
-        request_ids: Iterable[str],
-    ) -> None:
-        """Remove the request_ids from the Detokenizer."""
-
-        for request_id in request_ids:
-            self.request_states.pop(request_id, None)
-
-    def add_request(
-        self,
-        request: EngineCoreRequest,
-    ):
-        """Add new request to the Detokenizer."""
-
-        assert (request.request_id not in self.request_states)
-
-        request_state = IncrementalDetokenizer.from_new_request(
-            self.tokenizer, request)
-        self.request_states[request.request_id] = request_state
-
-    def step(
-        self, encore_core_outputs: List[EngineCoreOutput]
-    ) -> Tuple[List[RequestOutput], List[str]]:
-        """Update state and request the RequestOutputs to the LLMEngine."""
-
-        request_outputs: List[RequestOutput] = []
-        requests_to_abort: List[str] = []
-        for engine_core_output in encore_core_outputs:
-            request_id = engine_core_output.request_id
-            detokenizer = self.request_states.get(request_id)
-            if detokenizer is None:
-                # Ignore output for already-aborted request.
-                continue
-
-            # Detokenize and update state.
-            request_output = detokenizer.add_tokens(
-                new_token_ids=engine_core_output.new_token_ids,
-                finish_reason=engine_core_output.finish_reason,
-                stop_reason=engine_core_output.stop_reason,
-            )
-
-            if request_output is not None:
-                # Add to RequestOutputs list.
-                request_outputs.append(request_output)
-
-                # Free completed requests.
-                if request_output.finished:
-                    self.request_states.pop(request_id)
-                    if not engine_core_output.finished:
-                        requests_to_abort.append(request_id)
-
-        # Return to EngineClient.
-        return request_outputs, requests_to_abort
diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py
index ac392f5e4f4cf..f5999ccda6447 100644
--- a/vllm/v1/engine/llm_engine.py
+++ b/vllm/v1/engine/llm_engine.py
@@ -18,7 +18,7 @@
     BaseTokenizerGroup, init_tokenizer_from_configs)
 from vllm.usage.usage_lib import UsageContext
 from vllm.v1.engine.core_client import EngineCoreClient
-from vllm.v1.engine.detokenizer import Detokenizer
+from vllm.v1.engine.output_processor import OutputProcessor
 from vllm.v1.engine.processor import Processor
 from vllm.v1.executor.abstract import Executor
 
@@ -60,13 +60,9 @@ def __init__(
                                    input_registry=input_registry,
                                    mm_registry=mm_registry)
 
-        # Detokenizer (converts EngineCoreOutputs --> RequestOutput)
-        self.detokenizer = Detokenizer(
-            tokenizer_name=vllm_config.model_config.tokenizer,
-            tokenizer_mode=vllm_config.model_config.tokenizer_mode,
-            trust_remote_code=vllm_config.model_config.trust_remote_code,
-            revision=vllm_config.model_config.tokenizer_revision,
-        )
+        # OutputProcessor (convert EngineCoreOutputs --> RequestOutput).
+        self.output_processor = OutputProcessor(self.tokenizer,
+                                                log_stats=False)
 
         # EngineCore (gets EngineCoreRequests and gives EngineCoreOutputs)
         self.engine_core = EngineCoreClient.make_client(
@@ -103,10 +99,10 @@ def from_engine_args(
                    multiprocess_mode=enable_multiprocessing)
 
     def get_num_unfinished_requests(self) -> int:
-        return self.detokenizer.get_num_unfinished_requests()
+        return self.output_processor.get_num_unfinished_requests()
 
     def has_unfinished_requests(self) -> bool:
-        return self.detokenizer.has_unfinished_requests()
+        return self.output_processor.has_unfinished_requests()
 
     @classmethod
     def validate_outputs(cls, outputs, output_type):
@@ -116,7 +112,7 @@ def abort_request(self, request_ids: List[str]) -> None:
         """Remove request_ids from EngineCore and Detokenizer."""
 
         self.engine_core.abort_requests(request_ids)
-        self.detokenizer.abort_requests(request_ids)
+        self.output_processor.abort_requests(request_ids)
 
     def add_request(
         self,
@@ -137,8 +133,8 @@ def add_request(
                                                 prompt_adapter_request,
                                                 priority)
 
-        # 2) Add the request to Detokenizer.
-        self.detokenizer.add_request(request)
+        # 2) Make a new RequestState and queue.
+        self.output_processor.add_request(request)
 
         # 3) Add the request to EngineCore.
         self.engine_core.add_request(request)
@@ -148,15 +144,14 @@ def step(self) -> List[RequestOutput]:
         # 1) Get EngineCoreOutput from the EngineCore.
         outputs = self.engine_core.get_output()
 
-        # 2) Detokenizer the EngineCoreOutput.
-        request_outputs, requests_to_abort = self.detokenizer.step(
+        # 2) Process EngineCoreOutputs.
+        processed_outputs = self.output_processor.process_outputs(
             outputs.outputs)
 
-        # 3) Abort requests that finished due to stopping criteria.
-        if requests_to_abort:
-            self.abort_request(requests_to_abort)
+        # 3) Abort any reqs that finished due to stop strings.
+        self.engine_core.abort_requests(processed_outputs.reqs_to_abort)
 
-        return request_outputs
+        return processed_outputs.request_outputs
 
     def get_model_config(self):
         return self.model_config
diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py
new file mode 100644
index 0000000000000..749f4f5043c97
--- /dev/null
+++ b/vllm/v1/engine/output_processor.py
@@ -0,0 +1,200 @@
+import asyncio
+from dataclasses import dataclass
+from typing import Dict, List, Optional
+
+from vllm.outputs import RequestOutput
+from vllm.transformers_utils.detokenizer_utils import AnyTokenizer
+from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup
+from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest
+from vllm.v1.engine.detokenizer import (DetokenizerOutput,
+                                        IncrementalDetokenizer)
+from vllm.v1.metrics.stats import IterationStats
+
+
+@dataclass
+class OutputProcessorOutput:
+
+    request_outputs: List[RequestOutput]
+    reqs_to_abort: List[str]
+    iteration_stats: IterationStats
+
+
+class RequestState:
+
+    def __init__(
+        self,
+        request_id: str,
+        prompt: Optional[str],
+        prompt_token_ids: List[int],
+        detokenizer: IncrementalDetokenizer,
+        queue: Optional[asyncio.Queue[RequestOutput]],
+    ):
+        self.request_id = request_id
+        self.prompt = prompt
+        self.prompt_token_ids = prompt_token_ids
+        self.prompt_len = len(prompt_token_ids)
+        self.detokenizer = detokenizer
+        self.is_prefilling = True
+        self.queue = queue
+
+    @classmethod
+    def from_new_request(
+        cls,
+        tokenizer: AnyTokenizer,
+        request: EngineCoreRequest,
+        queue: Optional[asyncio.Queue[RequestOutput]] = None,
+    ) -> "RequestState":
+        return cls(
+            request_id=request.request_id,
+            prompt=request.prompt,
+            prompt_token_ids=request.prompt_token_ids,
+            detokenizer=IncrementalDetokenizer.from_new_request(
+                tokenizer=tokenizer,
+                request=request,
+            ),
+            queue=queue,
+        )
+
+
+class OutputProcessor:
+    """Process EngineCoreOutputs into RequestOutputs."""
+
+    def __init__(
+        self,
+        tokenizer: BaseTokenizerGroup,
+        log_stats: bool,
+    ):
+        self.log_stats = log_stats
+        self.tokenizer = tokenizer
+        self.request_states: Dict[str, RequestState] = {}
+
+    def is_request_active(self, request_id: str) -> bool:
+        return request_id in self.request_states
+
+    def get_num_unfinished_requests(self):
+        return len(self.request_states)
+
+    def has_unfinished_requests(self) -> bool:
+        return len(self.request_states) > 0
+
+    def abort_requests(
+        self,
+        request_ids: List[str],
+    ) -> None:
+        for request_id in request_ids:
+            self.request_states.pop(request_id, None)
+
+    def add_request(
+        self,
+        request: EngineCoreRequest,
+        queue: Optional[asyncio.Queue[RequestOutput]] = None,
+    ) -> None:
+        request_id = request.request_id
+        if request_id in self.request_states:
+            raise ValueError(f"Request id {request_id} already running.")
+
+        self.request_states[request_id] = RequestState.from_new_request(
+            tokenizer=self.tokenizer.get_lora_tokenizer(request.lora_request),
+            request=request,
+            queue=queue)
+
+    def process_outputs(
+        self,
+        engine_core_outputs: List[EngineCoreOutput],
+    ) -> OutputProcessorOutput:
+        """
+        Process the EngineCoreOutputs:
+        1) Compute stats for logging
+        2) Detokenize
+        3) Create and handle RequestOutput objects:
+            * If there is a queue (for usage with AsyncLLM), 
+              put the RequestOutput objects into the queue for
+              handling by the per-request generate() tasks.
+
+            * If there is no queue (for usage with LLMEngine), 
+              return a list of RequestOutput objects.
+
+        ****************** NOTE FOR DEVELOPERS ******************
+
+        VLLM V1 minimizes the number of python loops over the full
+        batch to ensure system overheads are minimized. This is the 
+        only function that should loop over EngineCoreOutputs.
+
+        If you need to touch every element of the batch, implement a
+        method called XXXClass.update_from_output() to be called
+        within the loop below. For examples, see:
+            * IterationStats.update_from_output()
+            * Detokenizer.update_from_output()
+        
+        TODO(rob): add Protocol makes update_from_output explicit.
+        
+        **********************************************************
+        """
+
+        request_outputs: List[RequestOutput] = []
+        reqs_to_abort: List[str] = []
+        iteration_stats = IterationStats(self.log_stats)
+        for engine_core_output in engine_core_outputs:
+            req_id = engine_core_output.request_id
+            req_state = self.request_states.get(req_id)
+            if req_state is None:
+                # Ignore output for already-aborted request.
+                continue
+
+            # 1) Compute stats for this iteration.
+            iteration_stats.update_from_output(engine_core_output,
+                                               req_state.is_prefilling,
+                                               req_state.prompt_len)
+            req_state.is_prefilling = False
+
+            # 2) Detokenize the token ids into text.
+            detokenizer_output = req_state.detokenizer.update_from_output(
+                engine_core_output)
+
+            # 3) Create and handle RequestOutput objects.
+            if request_output := self._make_request_output(
+                    req_state, detokenizer_output):
+                if req_state.queue is not None:
+                    # AsyncLLM: put into queue for handling by generate().
+                    req_state.queue.put_nowait(request_output)
+                else:
+                    # LLMEngine: return list of RequestOutputs.
+                    request_outputs.append(request_output)
+
+                # Free completed requests.
+                if request_output.finished:
+                    self.request_states.pop(req_id)
+                    if not engine_core_output.finished:
+                        # If req not finished in EngineCore, but Detokenizer
+                        # detected stop string, abort needed in EngineCore.
+                        reqs_to_abort.append(req_id)
+
+        return OutputProcessorOutput(
+            request_outputs=request_outputs,
+            reqs_to_abort=reqs_to_abort,
+            iteration_stats=iteration_stats,
+        )
+
+    def _make_request_output(
+        self,
+        request_state: RequestState,
+        detokenizer_output: Optional[DetokenizerOutput],
+    ) -> Optional[RequestOutput]:
+
+        if detokenizer_output is None:
+            return None
+
+        request_output = RequestOutput.new(
+            request_state.request_id,
+            request_state.prompt,
+            request_state.prompt_token_ids,
+            detokenizer_output.output_text,
+            detokenizer_output.token_ids,
+            detokenizer_output.finished,
+        )
+        if detokenizer_output.finished:
+            completion_output = request_output.outputs[0]
+            completion_output.finish_reason = detokenizer_output.finish_reason
+            completion_output.stop_reason = detokenizer_output.stop_reason
+
+        return request_output
diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py
index 5ebb4fd5b37db..60cb986f8bbce 100644
--- a/vllm/v1/metrics/stats.py
+++ b/vllm/v1/metrics/stats.py
@@ -1,4 +1,8 @@
 from dataclasses import dataclass
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from vllm.v1.engine import EngineCoreOutput
 
 
 @dataclass
@@ -10,3 +14,26 @@ class SchedulerStats:
 
     # gpu_cache_usage: float = 0.0
     # gpu_prefix_cache_hit_rate: float = 0.0
+
+
+class IterationStats:
+    """Stats associated with a single set of EngineCoreOutputs."""
+
+    def __init__(self, log_stats: bool):
+        self.log_stats = log_stats
+        self.num_generation_tokens = 0
+        self.num_prompt_tokens = 0
+
+    def update_from_output(self, output: "EngineCoreOutput",
+                           is_prefilling: bool, prompt_len: int):
+        if not self.log_stats:
+            return
+
+        self.num_generation_tokens += len(output.new_token_ids)
+        if is_prefilling:
+            # This relies on the invariant that EngineCore does
+            # not stream outputs for partially completed prefills
+            # (scheduler.update_from_output makes EngineCoreOutput
+            # iff num_computed_tokens == num_tokens).
+            assert (len(output.new_token_ids) > 0)
+            self.num_prompt_tokens += prompt_len

From f7b3ba82c3eec71f31f8d49f708ab328b5e908f6 Mon Sep 17 00:00:00 2001
From: Yangcheng Li <liyangcheng.lyc@alibaba-inc.com>
Date: Mon, 13 Jan 2025 13:07:48 +0800
Subject: [PATCH 190/462] [MISC] fix typo in kv transfer send recv test
 (#11983)

---
 tests/kv_transfer/test_send_recv.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/kv_transfer/test_send_recv.py b/tests/kv_transfer/test_send_recv.py
index 4beba4dc05dde..1cc1ced9968d7 100644
--- a/tests/kv_transfer/test_send_recv.py
+++ b/tests/kv_transfer/test_send_recv.py
@@ -22,13 +22,13 @@ def test_run(my_rank, pipe):
         x2 = pipe.recv_tensor()
         print(f"rank {my_rank} received x2 = ", x2)
         y2 = pipe.recv_tensor()
-        print(f"rank {my_rank} received y2 = ", x2)
+        print(f"rank {my_rank} received y2 = ", y2)
 
     else:
         x2 = pipe.recv_tensor()
         print(f"rank {my_rank} received x2 = ", x2)
         y2 = pipe.recv_tensor()
-        print(f"rank {my_rank} received y2 = ", x2)
+        print(f"rank {my_rank} received y2 = ", y2)
         pipe.send_tensor(x)
         print(f"rank {my_rank} sent tensor x")
         pipe.send_tensor(y)

From 9dd02d85ca801c99241317a8061bd025c726af93 Mon Sep 17 00:00:00 2001
From: Siyuan Li <94890248+liaoyanqing666@users.noreply.github.com>
Date: Mon, 13 Jan 2025 14:24:10 +0800
Subject: [PATCH 191/462] [Bug] Fix usage of `.transpose()` and `.view()`
 consecutively. (#11979)

---
 vllm/attention/layer.py                  | 2 +-
 vllm/model_executor/models/intern_vit.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index 55e4e14027f79..b8afd428f2cc0 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -230,7 +230,7 @@ def forward(
                                                  value,
                                                  scale=self.scale)
             out = out.transpose(1, 2)
-        return out.view(bsz, q_len, -1)
+        return out.reshape(bsz, q_len, -1)
 
 
 def unified_attention(
diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py
index 7ff68bd60e8ad..8ad009d5101e4 100644
--- a/vllm/model_executor/models/intern_vit.py
+++ b/vllm/model_executor/models/intern_vit.py
@@ -271,7 +271,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         v = v.transpose(1, 2)
 
         x = F.scaled_dot_product_attention(q, k, v, scale=self.scale)
-        x = x.transpose(1, 2).view(B, N, -1)
+        x = x.transpose(1, 2).reshape(B, N, -1)
 
         x = self.proj(x)
         return x

From 80ea3af1a06ff445e6cdede072bda1429c9dac06 Mon Sep 17 00:00:00 2001
From: Sungjae Lee <33976427+llsj14@users.noreply.github.com>
Date: Mon, 13 Jan 2025 15:50:35 +0900
Subject: [PATCH 192/462] [CI][Spec Decode] fix: broken test for EAGLE model
 (#11972)

Signed-off-by: Sungjae Lee <33976427+llsj14@users.noreply.github.com>
---
 .buildkite/test-pipeline.yaml       |  4 +++-
 vllm/model_executor/models/eagle.py | 11 ++++++++++-
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 393912881bca3..74b287c7adbfa 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -231,13 +231,15 @@ steps:
     - pytest -v -s test_logits_processor.py
     - pytest -v -s model_executor/test_guided_processors.py
 
-- label: Speculative decoding tests # 30min
+- label: Speculative decoding tests # 40min
   source_file_dependencies:
   - vllm/spec_decode
   - tests/spec_decode
+  - vllm/model_executor/models/eagle.py
   commands:
     - pytest -v -s spec_decode/e2e/test_multistep_correctness.py
     - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py
+    - pytest -v -s spec_decode/e2e/test_eagle_correctness.py
 
 - label: LoRA Test %N # 15min each
   mirror_hardwares: [amd]
diff --git a/vllm/model_executor/models/eagle.py b/vllm/model_executor/models/eagle.py
index eb7b5af19ae96..948560b4906b8 100644
--- a/vllm/model_executor/models/eagle.py
+++ b/vllm/model_executor/models/eagle.py
@@ -19,6 +19,11 @@
 
 class DummyInputLayerNorm(nn.Module):
 
+    def __init__(self, weight=None, bias=None):
+        super().__init__()
+        self.weight = nn.Parameter(weight) if weight is not None else None
+        self.bias = nn.Parameter(bias) if bias is not None else None
+
     def forward(self, x):
         return x
 
@@ -69,7 +74,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
 
         # Modify layer normalization and residual connections as suggested
         # in the EAGLE framework: https://github.com/SafeAILab/EAGLE
-        self.model.model.layers[0].input_layernorm = DummyInputLayerNorm()
+        # While weights and biases are generally not needed,
+        # they are retained here to support certain unit tests
+        # (e.g., spec_decode/e2e/test_eagle_correctness.py).
+        self.model.model.layers[0].input_layernorm = DummyInputLayerNorm(
+            weight=self.model.model.layers[0].input_layernorm.weight)
         self.model.model.norm = DummyOutputNorm()
 
         self.orig_vocab_size = config.vocab_size

From cf6bbcb49324c24fc0f6f9381400c299c9c2d7ac Mon Sep 17 00:00:00 2001
From: Concurrensee <yida.wu@amd.com>
Date: Mon, 13 Jan 2025 01:05:06 -0600
Subject: [PATCH 193/462] [Misc] Fix Deepseek V2 fp8 kv-scale remapping
 (#11947)

Signed-off-by: Yida Wu <yidawu@alumni.cmu.edu>
---
 vllm/model_executor/models/deepseek_v2.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py
index 9132040545863..d83cafaf998ab 100644
--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -45,7 +45,8 @@
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
-from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.model_loader.weight_utils import (
+    default_weight_loader, maybe_remap_kv_scale_name)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
 
@@ -635,6 +636,11 @@ def load_weights(self, weights: Iterable[Tuple[str,
                     if name.endswith(".bias") and name not in params_dict:
                         continue
 
+                    # Remapping the name of FP8 kv-scale.
+                    name = maybe_remap_kv_scale_name(name, params_dict)
+                    if name is None:
+                        continue
+
                     if is_pp_missing_parameter(name, self):
                         continue
 

From c3f05b09a040b9d13ad62914be3f7a84c535e417 Mon Sep 17 00:00:00 2001
From: Chenguang Li <757486878@qq.com>
Date: Mon, 13 Jan 2025 15:47:05 +0800
Subject: [PATCH 194/462] [Misc]Minor Changes about Worker (#11555)

Signed-off-by: Chenguang Li <757486878@qq.com>
---
 vllm/v1/worker/gpu_worker.py | 1 -
 vllm/worker/worker.py        | 1 -
 2 files changed, 2 deletions(-)

diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index af438f7d5820c..e83bce4283555 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -132,7 +132,6 @@ def determine_num_available_blocks(self) -> Tuple[int, int]:
         # Execute a forward pass with dummy inputs to profile the memory usage
         # of the model.
         self.model_runner.profile_run()
-        torch.cuda.synchronize()
 
         free_gpu_memory, _ = torch.cuda.mem_get_info()
         # NOTE(woosuk): Here we assume that the other processes using the same
diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py
index 0f12549e3f3fd..a3e377ef2b19d 100644
--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -200,7 +200,6 @@ def determine_num_available_blocks(self) -> Tuple[int, int]:
                               weights_memory_in_bytes=self.model_runner.
                               model_memory_usage) as result:
             self.model_runner.profile_run()
-            torch.cuda.synchronize()
 
         self._assert_memory_footprint_increased_during_profiling()
 

From 89ce62a316e68c50121e74d5a832e0cb9a5101d1 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Mon, 13 Jan 2025 16:20:52 +0800
Subject: [PATCH 195/462] [platform] add ray_device_key (#11948)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 vllm/executor/ray_utils.py    | 19 +++++++++++++------
 vllm/platforms/cuda.py        |  1 +
 vllm/platforms/hpu.py         |  1 +
 vllm/platforms/interface.py   |  4 ++++
 vllm/platforms/neuron.py      |  1 +
 vllm/platforms/rocm.py        |  2 ++
 vllm/platforms/tpu.py         |  2 ++
 vllm/platforms/xpu.py         |  3 +++
 vllm/v1/executor/ray_utils.py | 13 +++++++++++--
 9 files changed, 38 insertions(+), 8 deletions(-)

diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py
index 8d766bad1a072..9f40f6a65dcd7 100644
--- a/vllm/executor/ray_utils.py
+++ b/vllm/executor/ray_utils.py
@@ -8,6 +8,7 @@
 from vllm.config import ParallelConfig
 from vllm.executor.msgspec_utils import decode_hook, encode_hook
 from vllm.logger import init_logger
+from vllm.platforms import current_platform
 from vllm.sequence import ExecuteModelRequest, IntermediateTensors
 from vllm.utils import get_ip
 from vllm.worker.worker_base import WorkerWrapperBase
@@ -47,7 +48,12 @@ def get_node_ip(self) -> str:
 
         def get_node_and_gpu_ids(self) -> Tuple[str, List[int]]:
             node_id = ray.get_runtime_context().get_node_id()
-            gpu_ids = ray.get_gpu_ids()
+            device_key = current_platform.ray_device_key
+            if not device_key:
+                raise RuntimeError("current platform %s does not support ray.",
+                                   current_platform.device_name)
+            gpu_ids = ray.get_runtime_context().get_accelerator_ids(
+            )[device_key]
             return node_id, gpu_ids
 
         def execute_model_spmd(
@@ -249,11 +255,12 @@ def initialize_ray_cluster(
         # Placement group is already set.
         return
 
-    device_str = "GPU"
-    if current_platform.is_tpu():
-        device_str = "TPU"
-    elif current_platform.is_hpu():
-        device_str = 'HPU'
+    device_str = current_platform.ray_device_key
+    if not device_str:
+        raise ValueError(
+            f"current platform {current_platform.device_name} does not "
+            "support ray.")
+
     # Create placement group for worker processes
     current_placement_group = ray.util.get_current_placement_group()
     if current_placement_group:
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index 23ceac83e49de..3f77ec50ed31f 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -77,6 +77,7 @@ class CudaPlatformBase(Platform):
     device_name: str = "cuda"
     device_type: str = "cuda"
     dispatch_key: str = "CUDA"
+    ray_device_key: str = "GPU"
 
     @classmethod
     def get_device_capability(cls,
diff --git a/vllm/platforms/hpu.py b/vllm/platforms/hpu.py
index 8152d881fa8d9..0acb2804a5f66 100644
--- a/vllm/platforms/hpu.py
+++ b/vllm/platforms/hpu.py
@@ -19,6 +19,7 @@ class HpuPlatform(Platform):
     device_name: str = "hpu"
     device_type: str = "hpu"
     dispatch_key: str = "HPU"
+    ray_device_key: str = "HPU"
 
     @classmethod
     def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
index fe398801c5dd9..ec917f75689dd 100644
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -82,6 +82,10 @@ class Platform:
     # check https://github.com/pytorch/pytorch/blob/313dac6c1ca0fa0cde32477509cce32089f8532a/torchgen/model.py#L134 # noqa
     # use "CPU" as a fallback for platforms not registered in PyTorch
     dispatch_key: str = "CPU"
+    # available ray device keys:
+    # https://github.com/ray-project/ray/blob/10ba5adadcc49c60af2c358a33bb943fb491a171/python/ray/_private/ray_constants.py#L438 # noqa
+    # empty string means the device does not support ray
+    ray_device_key: str = ""
     # The torch.compile backend for compiling simple and
     # standalone functions. The default value is "inductor" to keep
     # the same behavior as PyTorch.
diff --git a/vllm/platforms/neuron.py b/vllm/platforms/neuron.py
index a4bbbd27c8a89..7f4a867b32ba1 100644
--- a/vllm/platforms/neuron.py
+++ b/vllm/platforms/neuron.py
@@ -16,6 +16,7 @@ class NeuronPlatform(Platform):
     _enum = PlatformEnum.NEURON
     device_name: str = "neuron"
     device_type: str = "neuron"
+    ray_device_key: str = "neuron_cores"
     supported_quantization: list[str] = ["neuron_quant"]
 
     @classmethod
diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index 1c2f602efc856..f12e948113723 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -64,6 +64,8 @@ class RocmPlatform(Platform):
     device_name: str = "rocm"
     device_type: str = "cuda"
     dispatch_key: str = "CUDA"
+    ray_device_key: str = "GPU"
+
     supported_quantization: list[str] = [
         "awq", "gptq", "fp8", "compressed_tensors", "compressed-tensors",
         "fbgemm_fp8", "gguf"
diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py
index 8a59b53ca4b15..460eb170bba34 100644
--- a/vllm/platforms/tpu.py
+++ b/vllm/platforms/tpu.py
@@ -19,6 +19,8 @@ class TpuPlatform(Platform):
     device_name: str = "tpu"
     device_type: str = "tpu"
     dispatch_key: str = "XLA"
+    ray_device_key: str = "TPU"
+
     supported_quantization: list[str] = [
         "tpu_int8", "compressed-tensors", "compressed_tensors"
     ]
diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py
index 00692a5d23031..cb74f79b31794 100644
--- a/vllm/platforms/xpu.py
+++ b/vllm/platforms/xpu.py
@@ -19,6 +19,9 @@ class XPUPlatform(Platform):
     device_name: str = "xpu"
     device_type: str = "xpu"
     dispatch_key: str = "XPU"
+    # Intel XPU's device key is "GPU" for Ray.
+    # see https://github.com/ray-project/ray/blob/6a5eb5865eeb9ccf058a79b44f107e327e360673/python/ray/_private/accelerators/intel_gpu.py#L20 # noqa: E501
+    ray_device_key: str = "GPU"
 
     @classmethod
     def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
diff --git a/vllm/v1/executor/ray_utils.py b/vllm/v1/executor/ray_utils.py
index 7733610e59c7f..fc9715b7a5909 100644
--- a/vllm/v1/executor/ray_utils.py
+++ b/vllm/v1/executor/ray_utils.py
@@ -41,7 +41,12 @@ def get_node_ip(self) -> str:
 
         def get_node_and_gpu_ids(self) -> Tuple[str, List[int]]:
             node_id = ray.get_runtime_context().get_node_id()
-            gpu_ids = ray.get_gpu_ids()
+            device_key = current_platform.ray_device_key
+            if not device_key:
+                raise RuntimeError("current platform %s does not support ray.",
+                                   current_platform.device_name)
+            gpu_ids = ray.get_runtime_context().get_accelerator_ids(
+            )[device_key]
             return node_id, gpu_ids
 
         def setup_device_if_necessary(self):
@@ -211,7 +216,11 @@ def initialize_ray_cluster(
         # Placement group is already set.
         return
 
-    device_str = "GPU" if not current_platform.is_tpu() else "TPU"
+    device_str = current_platform.ray_device_key
+    if not device_str:
+        raise ValueError(
+            f"current platform {current_platform.device_name} does not "
+            "support ray.")
     # Create placement group for worker processes
     current_placement_group = ray.util.get_current_placement_group()
     if current_placement_group:

From 5340a30d0193547a19e236757fec1f3f246642f9 Mon Sep 17 00:00:00 2001
From: Alex Brooks <alex.brooks@ibm.com>
Date: Mon, 13 Jan 2025 01:37:48 -0700
Subject: [PATCH 196/462] Fix Max Token ID for Qwen-VL-Chat (#11980)

Signed-off-by: Alex-Brooks <Alex.brooks@ibm.com>
---
 vllm/transformers_utils/tokenizer.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py
index 97920f42ec52f..294262484f2fb 100644
--- a/vllm/transformers_utils/tokenizer.py
+++ b/vllm/transformers_utils/tokenizer.py
@@ -1,3 +1,4 @@
+import contextlib
 import os
 import warnings
 from pathlib import Path
@@ -67,7 +68,15 @@ def get_cached_tokenizer(tokenizer: AnyTokenizer) -> AnyTokenizer:
         tokenizer.all_special_tokens_extended)
     tokenizer_all_special_tokens = set(tokenizer.all_special_tokens)
     tokenizer_len = len(tokenizer)
+
     max_token_id = max(tokenizer.get_vocab().values())
+    # Some tokenizers (e.g., QwenTokenizer) have special tokens that
+    # are added and included in the implementation of the vocab_size
+    # property, but not in get_vocab(); if there is an implementation
+    # of vocab size, we should take the greater value.
+    if hasattr(tokenizer, "vocab_size"):
+        with contextlib.suppress(NotImplementedError):
+            max_token_id = max(max_token_id, tokenizer.vocab_size)
 
     class CachedTokenizer(tokenizer.__class__):  # type: ignore
 

From 0f8cafe2d1550a33803fb64b2224e6bf3f913449 Mon Sep 17 00:00:00 2001
From: Chen Zhang <zhangch99@outlook.com>
Date: Mon, 13 Jan 2025 19:28:53 +0800
Subject: [PATCH 197/462] [Kernel] unified_attention for Attention.forward
 (#11967)

Signed-off-by: Chen Zhang <zhangch99@outlook.com>
---
 vllm/attention/layer.py              | 26 ++++++++++++++------------
 vllm/utils.py                        |  1 -
 vllm/worker/hpu_model_runner.py      | 13 +++++++++++--
 vllm/worker/hpu_worker.py            |  3 +++
 vllm/worker/neuron_model_runner.py   | 17 ++++++++++-------
 vllm/worker/openvino_model_runner.py |  4 +++-
 vllm/worker/openvino_worker.py       | 13 +++++++++++--
 vllm/worker/tpu_model_runner.py      | 28 ++++++++++++++++++----------
 vllm/worker/tpu_worker.py            |  6 +++++-
 vllm/worker/xpu_model_runner.py      | 21 ++++++++++++---------
 10 files changed, 87 insertions(+), 45 deletions(-)

diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index b8afd428f2cc0..c7e7a4d52e5a7 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -134,15 +134,10 @@ def forward(
         query: torch.Tensor,
         key: torch.Tensor,
         value: torch.Tensor,
-        kv_cache: torch.Tensor,
-        attn_metadata: AttentionMetadata,
+        _kv_cache: torch.Tensor,
+        _attn_metadata: AttentionMetadata,
     ) -> torch.Tensor:
-
-        if self.use_direct_call:
-            return self.impl.forward(query, key, value, kv_cache,
-                                     attn_metadata, self._k_scale,
-                                     self._v_scale)
-        elif self.use_output:
+        if self.use_output:
             output = torch.empty_like(query)
             hidden_size = query.size(-1)
             # Reshape the query, key, and value tensors.
@@ -154,12 +149,19 @@ def forward(
                 key = key.view(-1, self.num_kv_heads, self.head_size)
             if value is not None:
                 value = value.view(-1, self.num_kv_heads, self.head_size)
-            torch.ops.vllm.unified_attention_with_output(
-                query, key, value, output, self.layer_name)
+            if self.use_direct_call:
+                unified_attention_with_output(query, key, value, output,
+                                              self.layer_name)
+            else:
+                torch.ops.vllm.unified_attention_with_output(
+                    query, key, value, output, self.layer_name)
             return output.view(-1, hidden_size)
         else:
-            return torch.ops.vllm.unified_attention(query, key, value,
-                                                    self.layer_name)
+            if self.use_direct_call:
+                return unified_attention(query, key, value, self.layer_name)
+            else:
+                return torch.ops.vllm.unified_attention(
+                    query, key, value, self.layer_name)
 
     def extra_repr(self) -> str:
         s = f"head_size={self.impl.head_size}"  # type: ignore
diff --git a/vllm/utils.py b/vllm/utils.py
index 217ccb25cef6d..9a509da3c1ef1 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -2171,5 +2171,4 @@ def bind_kv_cache(
         forward_ctx = ctx[layer_name]
         assert len(forward_ctx.kv_cache) == len(kv_cache)
         for ve, ve_kv_cache in enumerate(kv_cache):
-            assert forward_ctx.kv_cache[ve].numel() == 0
             forward_ctx.kv_cache[ve] = ve_kv_cache[kv_cache_idx]
diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index 9d479f412af46..3e5105f3b62e3 100644
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -28,6 +28,7 @@
 from vllm.attention import AttentionMetadata, get_attn_backend
 from vllm.config import DeviceConfig, VllmConfig
 from vllm.distributed.parallel_state import get_world_group
+from vllm.forward_context import set_forward_context
 from vllm.logger import init_logger
 from vllm.lora.layers import LoRAMapping
 from vllm.lora.request import LoRARequest
@@ -40,7 +41,8 @@
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import (IntermediateTensors, SequenceData,
                            SequenceGroupMetadata)
-from vllm.utils import is_pin_memory_available, make_tensor_with_pad
+from vllm.utils import (bind_kv_cache, is_pin_memory_available,
+                        make_tensor_with_pad)
 from vllm.worker.model_runner_base import (
     ModelRunnerBase, ModelRunnerInputBase,
     _add_attn_metadata_broadcastable_dict,
@@ -1286,6 +1288,9 @@ def create_dummy_seq_group_metadata(self,
     def profile_run(self) -> None:
         num_layers = self.model_config.get_num_layers(self.parallel_config)
         kv_caches = [None] * num_layers
+        bind_kv_cache(
+            self.vllm_config.compilation_config.static_forward_context,
+            [kv_caches])
         max_seq_len = self.bucketing_global_state.prompt_seq_bucket_cfg[-1]
         max_batch_size = min(self.max_num_batched_tokens // max_seq_len,
                              self.scheduler_config.max_num_seqs)
@@ -1943,7 +1948,11 @@ def execute_model(
                                 f"graphs{'T' if use_graphs else 'F'}")
         else:
             model_event_name = 'model_executable'
-        with self.profiler.record_event('internal', model_event_name):
+        with set_forward_context(
+                model_input.attn_metadata, self.vllm_config,
+                model_input.virtual_engine), \
+            self.profiler.record_event(
+                    'internal', model_event_name):
             hidden_states = self.model.forward(
                 **execute_model_kwargs,
                 selected_token_indices=sampling_metadata.selected_token_indices
diff --git a/vllm/worker/hpu_worker.py b/vllm/worker/hpu_worker.py
index cca7cd50bfc7b..8b2d8aaed2803 100644
--- a/vllm/worker/hpu_worker.py
+++ b/vllm/worker/hpu_worker.py
@@ -20,6 +20,7 @@
 from vllm.model_executor import set_random_seed
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sequence import ExecuteModelRequest
+from vllm.utils import bind_kv_cache
 from vllm.worker.cache_engine import CacheEngine
 from vllm.worker.hpu_model_runner import HPUModelRunner
 from vllm.worker.model_runner_base import ModelRunnerBase
@@ -215,6 +216,8 @@ def _init_cache_engine(self):
             self.cache_engine[ve].gpu_cache
             for ve in range(self.parallel_config.pipeline_parallel_size)
         ]
+        bind_kv_cache(self.compilation_config.static_forward_context,
+                      self.hpu_cache)
 
     def _warm_up_model(self) -> None:
         # NOTE(kzawora): We should use virtual engine index here
diff --git a/vllm/worker/neuron_model_runner.py b/vllm/worker/neuron_model_runner.py
index ae4eb6ba6eaec..a35f5467e1a1f 100644
--- a/vllm/worker/neuron_model_runner.py
+++ b/vllm/worker/neuron_model_runner.py
@@ -8,6 +8,7 @@
 from transformers_neuronx.config import GenerationConfig
 
 from vllm.config import VllmConfig
+from vllm.forward_context import set_forward_context
 from vllm.logger import init_logger
 from vllm.model_executor import SamplingMetadata
 from vllm.model_executor.layers.sampler import SamplerOutput
@@ -314,13 +315,15 @@ def execute_model(
             raise ValueError(
                 "NeuronModelRunner does not support multi-step execution.")
 
-        hidden_states = self.model(
-            input_ids=model_input.input_tokens,
-            positions=model_input.input_positions,
-            input_block_ids=model_input.input_block_ids,
-            **MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs or {},
-                                         device=self.device),
-        )
+        with set_forward_context(None, self.vllm_config, 0):
+            hidden_states = self.model(
+                input_ids=model_input.input_tokens,
+                positions=model_input.input_positions,
+                input_block_ids=model_input.input_block_ids,
+                **MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs
+                                             or {},
+                                             device=self.device),
+            )
 
         # Compute the logits only if the on-device sampling is turned off as
         # on-device sampling outputs the token ids.
diff --git a/vllm/worker/openvino_model_runner.py b/vllm/worker/openvino_model_runner.py
index 6000e5dfe4e30..a38b5a4e6e8d5 100644
--- a/vllm/worker/openvino_model_runner.py
+++ b/vllm/worker/openvino_model_runner.py
@@ -8,6 +8,7 @@
 from vllm.attention import get_attn_backend
 from vllm.attention.backends.openvino import OpenVINOAttentionMetadata
 from vllm.config import VllmConfig
+from vllm.forward_context import set_forward_context
 from vllm.logger import init_logger
 from vllm.model_executor import SamplingMetadata
 from vllm.model_executor.layers.sampler import SamplerOutput
@@ -350,7 +351,8 @@ def execute_model(
                                          device=self.device),
         }
 
-        hidden_states = model_executable(**execute_model_kwargs)
+        with set_forward_context(attn_metadata, self.vllm_config, 0):
+            hidden_states = model_executable(**execute_model_kwargs)
 
         # Compute the logits.
         logits = self.model.compute_logits(hidden_states, sampling_metadata)
diff --git a/vllm/worker/openvino_worker.py b/vllm/worker/openvino_worker.py
index 0bf522d5333ed..3482073566215 100644
--- a/vllm/worker/openvino_worker.py
+++ b/vllm/worker/openvino_worker.py
@@ -20,6 +20,7 @@
 from vllm.platforms import current_platform
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import ExecuteModelRequest, SequenceGroupMetadata
+from vllm.utils import bind_kv_cache
 from vllm.worker.openvino_model_runner import OpenVINOModelRunner
 from vllm.worker.worker_base import LoraNotSupportedWorkerBase, WorkerBase
 
@@ -339,6 +340,8 @@ def _init_cache_engine(self) -> None:
             ov_device,
         )
         self.kv_cache = self.cache_engine.kv_cache
+        bind_kv_cache(self.compilation_config.static_forward_context,
+                      [self.kv_cache])
         self.model_runner.block_size = self.cache_engine.block_size
 
         assert self.kv_cache is not None
@@ -507,12 +510,18 @@ def model_profile_run():
 
             self.model_runner.block_size = tmp_cache_config.block_size
 
+            bind_kv_cache(self.compilation_config.static_forward_context,
+                          profiling_cache_engine.kv_cache)
             # Run the model with the dummy inputs.
             self.model_runner.execute_model(seqs,
                                             profiling_cache_engine.kv_cache)
 
-            # explicitly delete temporary KV cache manager to free KV cache
-            # when real inputs will be passed to OV
+            # Explicitly revert bind_kv_cache and delete temporary KV cache
+            # manager to free KV cache when real inputs will be passed to OV
+            bind_kv_cache(self.compilation_config.static_forward_context, [[
+                torch.tensor([])
+                for _ in range(len(profiling_cache_engine.kv_cache))
+            ]])
             del profiling_cache_engine
 
             logger.info(
diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py
index 7bdb7f0e2d6a9..52c577bccab9c 100644
--- a/vllm/worker/tpu_model_runner.py
+++ b/vllm/worker/tpu_model_runner.py
@@ -13,6 +13,7 @@
 
 from vllm.attention import AttentionMetadata, get_attn_backend
 from vllm.config import VllmConfig
+from vllm.forward_context import set_forward_context
 from vllm.logger import init_logger
 from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.model_executor.model_loader import get_model
@@ -265,8 +266,9 @@ def _dummy_run(
             torch._dynamo.mark_dynamic(t, 0)
             torch._dynamo.mark_dynamic(p, 0)
         # Dummy run.
-        self.model(token_ids, position_ids, attn_metadata, input_lens, t, p,
-                   num_samples, kv_caches)
+        with set_forward_context(attn_metadata, self.vllm_config, 0):
+            self.model(token_ids, position_ids, attn_metadata, input_lens, t,
+                       p, num_samples, kv_caches)
 
     def warmup_model(
         self,
@@ -663,10 +665,13 @@ def execute_model(
                 input_lens = model_input.input_lens[i:i + 1].to(self.device)
                 t = model_input.t[i:i + 1].to(self.device)
                 p = model_input.p[i:i + 1].to(self.device)
-                output_token_ids = self.model(token_ids, position_ids,
-                                              attn_metadata, input_lens, t, p,
-                                              model_input.num_samples,
-                                              kv_caches)
+                with set_forward_context(model_input.attn_metadata,
+                                         self.vllm_config,
+                                         model_input.virtual_engine):
+                    output_token_ids = self.model(token_ids, position_ids,
+                                                  attn_metadata, input_lens, t,
+                                                  p, model_input.num_samples,
+                                                  kv_caches)
                 next_token_ids.append(output_token_ids[0])
                 start_idx = end_idx
 
@@ -711,10 +716,13 @@ def execute_model(
             input_lens = model_input.input_lens.to(self.device)
             for i in range(num_steps):
                 slot_mapping = attn_metadata.slot_mapping
-                output_token_ids = self.model(token_ids, position_ids,
-                                              attn_metadata, input_lens, t, p,
-                                              model_input.num_samples,
-                                              kv_caches)
+                with set_forward_context(model_input.attn_metadata,
+                                         self.vllm_config,
+                                         model_input.virtual_engine):
+                    output_token_ids = self.model(token_ids, position_ids,
+                                                  attn_metadata, input_lens, t,
+                                                  p, model_input.num_samples,
+                                                  kv_caches)
                 self.cached_step_outputs.append(output_token_ids)
 
                 if i < num_steps - 1:
diff --git a/vllm/worker/tpu_worker.py b/vllm/worker/tpu_worker.py
index 8754f7538f251..ea0e700545b16 100644
--- a/vllm/worker/tpu_worker.py
+++ b/vllm/worker/tpu_worker.py
@@ -12,7 +12,7 @@
 from vllm.logger import init_logger
 from vllm.model_executor import set_random_seed
 from vllm.sequence import ExecuteModelRequest
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, get_dtype_size
+from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, bind_kv_cache, get_dtype_size
 from vllm.worker.tpu_model_runner import ExecutionMode, TPUModelRunner
 from vllm.worker.worker_base import (LocalOrDistributedWorkerBase,
                                      LoraNotSupportedWorkerBase, WorkerBase,
@@ -108,6 +108,8 @@ def determine_num_available_blocks(self) -> Tuple[int, int]:
                       torch.tensor([], dtype=torch.float32,
                                    device=self.device))
                      for _ in range(num_layers)]
+        bind_kv_cache(self.compilation_config.static_forward_context,
+                      [kv_caches])
         self.model_runner._dummy_run(
             batch_size=1,
             seq_len=self.scheduler_config.max_num_batched_tokens,
@@ -170,6 +172,8 @@ def initialize_cache(
                                       device="cpu")
             cpu_v_cache = torch.zeros_like(cpu_k_cache)
             self.cpu_cache.append((cpu_k_cache, cpu_v_cache))
+        bind_kv_cache(self.compilation_config.static_forward_context,
+                      [self.tpu_cache])
         self._warmup_model()
 
     def _warmup_model(self) -> None:
diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py
index 9cf25387560da..82b8f22a5af33 100644
--- a/vllm/worker/xpu_model_runner.py
+++ b/vllm/worker/xpu_model_runner.py
@@ -12,6 +12,7 @@
 from vllm.attention import get_attn_backend
 from vllm.config import VllmConfig
 from vllm.distributed import get_pp_group
+from vllm.forward_context import set_forward_context
 from vllm.inputs import INPUT_REGISTRY, InputRegistry
 from vllm.logger import init_logger
 from vllm.model_executor import SamplingMetadataCache
@@ -562,15 +563,17 @@ def execute_model(
         if (self.observability_config is not None
                 and self.observability_config.collect_model_forward_time):
             model_forward_start_time = time.time()
-
-        hidden_or_intermediate_states = model_executable(
-            input_ids=model_input.input_tokens,
-            positions=model_input.input_positions,
-            kv_caches=kv_caches,
-            attn_metadata=model_input.attn_metadata,
-            intermediate_tensors=intermediate_tensors,
-            **MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs or {},
-                                         device=self.device))
+        with set_forward_context(model_input.attn_metadata, self.vllm_config,
+                                 model_input.virtual_engine):
+            hidden_or_intermediate_states = model_executable(
+                input_ids=model_input.input_tokens,
+                positions=model_input.input_positions,
+                kv_caches=kv_caches,
+                attn_metadata=model_input.attn_metadata,
+                intermediate_tensors=intermediate_tensors,
+                **MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs
+                                             or {},
+                                             device=self.device))
         # Compute the logits in the last pipeline stage.
         if not get_pp_group().is_last_rank:
             return hidden_or_intermediate_states

From cd8249903f189c5f06424e67dbc6512ca494a046 Mon Sep 17 00:00:00 2001
From: Roger Wang <136131678+ywang96@users.noreply.github.com>
Date: Mon, 13 Jan 2025 03:58:54 -0800
Subject: [PATCH 198/462] [Doc][V1] Update model implementation guide for V1
 support (#11998)

Signed-off-by: Roger Wang <ywang@roblox.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
---
 docs/source/contributing/model/basic.md      | 12 ++-
 docs/source/contributing/model/multimodal.md | 87 ++++++++++++++++----
 2 files changed, 83 insertions(+), 16 deletions(-)

diff --git a/docs/source/contributing/model/basic.md b/docs/source/contributing/model/basic.md
index 5c2dc486c8bea..b9b92fd027f6e 100644
--- a/docs/source/contributing/model/basic.md
+++ b/docs/source/contributing/model/basic.md
@@ -57,7 +57,17 @@ class MyModelForCausalLM(nn.Module):
 
 ### Computation Code
 
-Rewrite the {meth}`~torch.nn.Module.forward` method of your model to remove any unnecessary code, such as training-specific code. Modify the input parameters to treat `input_ids` and `positions` as flattened tensors with a single batch size dimension, without a max-sequence length dimension.
+- Add a `get_input_embeddings` method inside `MyModel` module that returns the text embeddings given `input_ids`. This is equivalent to directly calling the text embedding layer, but provides a unified interface in case `MyModel` is used within a composite multimodal model.
+
+```python
+class MyModel(nn.Module):
+        ...
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        ... 
+```
+
+- Rewrite the {meth}`~torch.nn.Module.forward` method of your model to remove any unnecessary code, such as training-specific code. Modify the input parameters to treat `input_ids` and `positions` as flattened tensors with a single batch size dimension, without a max-sequence length dimension.
 
 ```python
 def forward(
diff --git a/docs/source/contributing/model/multimodal.md b/docs/source/contributing/model/multimodal.md
index 99f6a1d5462c4..e5fd9a2877ceb 100644
--- a/docs/source/contributing/model/multimodal.md
+++ b/docs/source/contributing/model/multimodal.md
@@ -9,7 +9,78 @@ This document walks you through the steps to extend a basic model so that it acc
 It is assumed that you have already implemented the model in vLLM according to [these steps](#new-model-basic).
 Further update the model as follows:
 
-- Implement the {class}`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface.
+- Reserve a keyword parameter in {meth}`~torch.nn.Module.forward` for each input tensor that corresponds to a multi-modal input, as shown in the following example:
+
+  ```diff
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[torch.Tensor],
+        attn_metadata: AttentionMetadata,
+  +     pixel_values: torch.Tensor,
+    ) -> SamplerOutput:
+  ```
+  
+  More conveniently, you can simply pass `**kwargs` to the {meth}`~torch.nn.Module.forward` method and retrieve the keyword parameters for multimodal inputs from it.
+
+- Implement {meth}`~vllm.model_executor.models.interfaces.SupportsMultiModal.get_multimodal_embeddings` that returns the embeddings from running the multimodal inputs through the multimodal tokenizer of the model. Below we provide a boilerplate of a typical implementation pattern, but feel free to adjust it to your own needs.
+
+    ```python
+    class YourModelForImage2Seq(nn.Module):
+        ...
+
+        def _process_image_input(self, image_input: YourModelImageInputs) -> torch.Tensor:
+
+            assert self.vision_encoder is not None
+            image_features = self.vision_encoder(image_input)
+            return self.multi_modal_projector(image_features)
+
+        def get_multimodal_embeddings(self, **kwargs: object) -> Optional[NestedTensors]:
+
+            # Validate the multimodal input keyword arguments
+            image_input = self._parse_and_validate_image_input(**kwargs)
+            if image_input is None:
+                return None
+
+            # Run multimodal inputs through encoder and projector
+            vision_embeddings = self._process_image_input(image_input)
+            return vision_embeddings
+    ```
+
+    ```{important}
+    The returned `multimodal_embeddings` must be either a **3D {class}`torch.Tensor`** of shape `(num_items, feature_size, hidden_size)`, or a **list / tuple of 2D {class}`torch.Tensor`'s** of shape `(feature_size, hidden_size)`, so that `multimodal_embeddings[i]` retrieves the embeddings generated from the `i`-th multimodal data item (e.g, image) of the request.
+    ```
+
+- Implement {meth}`~vllm.model_executor.models.interfaces.SupportsMultiModal.get_input_embeddings` to merge `multimodal_embeddings` with text embeddings from the `input_ids`. If input processing for the model is implemented correctly (see sections below), then you can leverage the utility function we provide to easily merge the embeddings.
+
+    ```python
+    from .utils import merge_multimodal_embeddings
+
+    class YourModelForImage2Seq(nn.Module):
+        ...
+
+        def get_input_embeddings(
+            self,
+            input_ids: torch.Tensor,
+            multimodal_embeddings: Optional[NestedTensors] = None,
+        ) -> torch.Tensor:
+
+            # `get_input_embeddings` should already be implemented for the language 
+            # model as one of the requirements of basic vLLM model implementation.
+            inputs_embeds = self.language_model.get_input_embeddings(input_ids)
+
+            if multimodal_embeddings is not None:
+                inputs_embeds = merge_multimodal_embeddings(
+                    input_ids=input_ids, 
+                    inputs_embeds=inputs_embeds, 
+                    multimodal_embeddings=multimodal_embeddings,
+                    placeholder_token_id=self.config.image_token_index)
+
+            return inputs_embeds
+    ```
+
+- Once the above steps are done, update the model class with the {class}`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface.
 
   ```diff
   + from vllm.model_executor.models.interfaces import SupportsMultiModal
@@ -23,20 +94,6 @@ Further update the model as follows:
   Check out [the HuggingFace Transformers documentation](https://huggingface.co/docs/transformers/model_doc/auto#multimodal) for some examples.
   ```
 
-- If you haven't already done so, reserve a keyword parameter in {meth}`~torch.nn.Module.forward`
-  for each input tensor that corresponds to a multi-modal input, as shown in the following example:
-
-  ```diff
-    def forward(
-        self,
-        input_ids: torch.Tensor,
-        positions: torch.Tensor,
-        kv_caches: List[torch.Tensor],
-        attn_metadata: AttentionMetadata,
-  +     pixel_values: torch.Tensor,
-    ) -> SamplerOutput:
-  ```
-
 ## 2. Specify processing information
 
 Next, create a subclass of {class}`~vllm.multimodal.processing.BaseProcessingInfo`

From e8c23ff989d4b061726315bbf74d0bca7136fdc4 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Mon, 13 Jan 2025 12:27:36 +0000
Subject: [PATCH 199/462] [Doc] Organise installation documentation into
 categories and tabs (#11935)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 docs/source/conf.py                           |   2 +-
 docs/source/deployment/docker.md              |   4 +
 docs/source/features/compatibility_matrix.md  |   4 +-
 .../hpu-gaudi.inc.md}                         |  72 ++--
 .../installation/ai_accelerator/index.md      | 375 ++++++++++++++++++
 .../neuron.inc.md}                            |  67 ++--
 .../openvino.inc.md}                          |  78 ++--
 .../{tpu.md => ai_accelerator/tpu.inc.md}     |  38 +-
 .../getting_started/installation/cpu-arm.md   |  46 ---
 .../{cpu-apple.md => cpu/apple.inc.md}        |  28 +-
 .../installation/cpu/arm.inc.md               |  30 ++
 .../installation/cpu/build.inc.md             |  21 +
 .../installation/{cpu-x86.md => cpu/index.md} | 190 ++++++---
 .../installation/cpu/x86.inc.md               |  35 ++
 .../installation/device.template.md           |  17 +
 .../{gpu-cuda.md => gpu/cuda.inc.md}          |  96 ++---
 .../getting_started/installation/gpu/index.md | 300 ++++++++++++++
 .../{gpu-rocm.md => gpu/rocm.inc.md}          | 147 +++----
 .../installation/{xpu.md => gpu/xpu.inc.md}   |  51 ++-
 .../getting_started/installation/index.md     |  13 +-
 .../installation/python_env_setup.inc.md      |  19 +
 21 files changed, 1241 insertions(+), 392 deletions(-)
 rename docs/source/getting_started/installation/{hpu-gaudi.md => ai_accelerator/hpu-gaudi.inc.md} (97%)
 create mode 100644 docs/source/getting_started/installation/ai_accelerator/index.md
 rename docs/source/getting_started/installation/{neuron.md => ai_accelerator/neuron.inc.md} (86%)
 rename docs/source/getting_started/installation/{openvino.md => ai_accelerator/openvino.inc.md} (69%)
 rename docs/source/getting_started/installation/{tpu.md => ai_accelerator/tpu.inc.md} (88%)
 delete mode 100644 docs/source/getting_started/installation/cpu-arm.md
 rename docs/source/getting_started/installation/{cpu-apple.md => cpu/apple.inc.md} (70%)
 create mode 100644 docs/source/getting_started/installation/cpu/arm.inc.md
 create mode 100644 docs/source/getting_started/installation/cpu/build.inc.md
 rename docs/source/getting_started/installation/{cpu-x86.md => cpu/index.md} (67%)
 create mode 100644 docs/source/getting_started/installation/cpu/x86.inc.md
 create mode 100644 docs/source/getting_started/installation/device.template.md
 rename docs/source/getting_started/installation/{gpu-cuda.md => gpu/cuda.inc.md} (84%)
 create mode 100644 docs/source/getting_started/installation/gpu/index.md
 rename docs/source/getting_started/installation/{gpu-rocm.md => gpu/rocm.inc.md} (87%)
 rename docs/source/getting_started/installation/{xpu.md => gpu/xpu.inc.md} (80%)
 create mode 100644 docs/source/getting_started/installation/python_env_setup.inc.md

diff --git a/docs/source/conf.py b/docs/source/conf.py
index bff0141ffbce8..7aa52db092e36 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -56,7 +56,7 @@
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
 # This pattern also affects html_static_path and html_extra_path.
-exclude_patterns: List[str] = ["**/*.template.md"]
+exclude_patterns: List[str] = ["**/*.template.md", "**/*.inc.md"]
 
 # Exclude the prompt "$" when copying code
 copybutton_prompt_text = r"\$ "
diff --git a/docs/source/deployment/docker.md b/docs/source/deployment/docker.md
index c735bfd0e87a7..9e301483ef7f9 100644
--- a/docs/source/deployment/docker.md
+++ b/docs/source/deployment/docker.md
@@ -2,6 +2,8 @@
 
 # Using Docker
 
+(deployment-docker-pre-built-image)=
+
 ## Use vLLM's Official Docker Image
 
 vLLM offers an official Docker image for deployment.
@@ -23,6 +25,8 @@ container to access the host's shared memory. vLLM uses PyTorch, which uses shar
 memory to share data between processes under the hood, particularly for tensor parallel inference.
 ```
 
+(deployment-docker-build-image-from-source)=
+
 ## Building vLLM's Docker Image from Source
 
 You can build and run vLLM from source via the provided <gh-file:Dockerfile>. To build vLLM:
diff --git a/docs/source/features/compatibility_matrix.md b/docs/source/features/compatibility_matrix.md
index ea1d545ff3d73..86a82eb36df33 100644
--- a/docs/source/features/compatibility_matrix.md
+++ b/docs/source/features/compatibility_matrix.md
@@ -322,7 +322,9 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar
 
 ```
 
-### Feature x Hardware
+(feature-x-hardware)=
+
+## Feature x Hardware
 
 ```{list-table}
    :header-rows: 1
diff --git a/docs/source/getting_started/installation/hpu-gaudi.md b/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md
similarity index 97%
rename from docs/source/getting_started/installation/hpu-gaudi.md
rename to docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md
index a829b1c9ff996..b4695d504b601 100644
--- a/docs/source/getting_started/installation/hpu-gaudi.md
+++ b/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md
@@ -1,10 +1,13 @@
-(installation-gaudi)=
+# Installation
 
-# Installation for Intel® Gaudi®
+This tab provides instructions on running vLLM with Intel Gaudi devices.
 
-This README provides instructions on running vLLM with Intel Gaudi devices.
+## Requirements
 
-## Requirements and Installation
+- OS: Ubuntu 22.04 LTS
+- Python: 3.10
+- Intel Gaudi accelerator
+- Intel Gaudi software version 1.18.0
 
 Please follow the instructions provided in the [Gaudi Installation
 Guide](https://docs.habana.ai/en/latest/Installation_Guide/index.html)
@@ -12,27 +15,9 @@ to set up the execution environment. To achieve the best performance,
 please follow the methods outlined in the [Optimizing Training Platform
 Guide](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_Training_Platform.html).
 
-### Requirements
-
-- OS: Ubuntu 22.04 LTS
-- Python: 3.10
-- Intel Gaudi accelerator
-- Intel Gaudi software version 1.18.0
-
-### Quick start using Dockerfile
-
-```console
-docker build -f Dockerfile.hpu -t vllm-hpu-env  .
-docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --rm vllm-hpu-env
-```
-
-```{tip}
-If you're observing the following error: `docker: Error response from daemon: Unknown runtime specified habana.`, please refer to "Install Using Containers" section of [Intel Gaudi Software Stack and Driver Installation](https://docs.habana.ai/en/v1.18.0/Installation_Guide/Bare_Metal_Fresh_OS.html). Make sure you have `habana-container-runtime` package installed and that `habana` container runtime is registered.
-```
+## Configure a new environment
 
-### Build from source
-
-#### Environment verification
+### Environment verification
 
 To verify that the Intel Gaudi software was correctly installed, run:
 
@@ -47,7 +32,7 @@ Refer to [Intel Gaudi Software Stack
 Verification](https://docs.habana.ai/en/latest/Installation_Guide/SW_Verification.html#platform-upgrade)
 for more details.
 
-#### Run Docker Image
+### Run Docker Image
 
 It is highly recommended to use the latest Docker image from Intel Gaudi
 vault. Refer to the [Intel Gaudi
@@ -61,7 +46,13 @@ docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-i
 docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
 ```
 
-#### Build and Install vLLM
+## Set up using Python
+
+### Pre-built wheels
+
+Currently, there are no pre-built Intel Gaudi wheels.
+
+### Build wheel from source
 
 To build and install vLLM from source, run:
 
@@ -80,7 +71,26 @@ git checkout habana_main
 python setup.py develop
 ```
 
-## Supported Features
+## Set up using Docker
+
+### Pre-built images
+
+Currently, there are no pre-built Intel Gaudi images.
+
+### Build image from source
+
+```console
+docker build -f Dockerfile.hpu -t vllm-hpu-env  .
+docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --rm vllm-hpu-env
+```
+
+```{tip}
+If you're observing the following error: `docker: Error response from daemon: Unknown runtime specified habana.`, please refer to "Install Using Containers" section of [Intel Gaudi Software Stack and Driver Installation](https://docs.habana.ai/en/v1.18.0/Installation_Guide/Bare_Metal_Fresh_OS.html). Make sure you have `habana-container-runtime` package installed and that `habana` container runtime is registered.
+```
+
+## Extra information
+
+## Supported features
 
 - [Offline inference](#offline-inference)
 - Online serving via [OpenAI-Compatible Server](#openai-compatible-server)
@@ -94,14 +104,14 @@ python setup.py develop
   for accelerating low-batch latency and throughput
 - Attention with Linear Biases (ALiBi)
 
-## Unsupported Features
+## Unsupported features
 
 - Beam search
 - LoRA adapters
 - Quantization
 - Prefill chunking (mixed-batch inferencing)
 
-## Supported Configurations
+## Supported configurations
 
 The following configurations have been validated to be function with
 Gaudi2 devices. Configurations that are not listed may or may not work.
@@ -137,7 +147,7 @@ Gaudi2 devices. Configurations that are not listed may or may not work.
 - [meta-llama/Meta-Llama-3.1-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct)
   with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling
 
-## Performance Tuning
+## Performance tuning
 
 ### Execution modes
 
@@ -368,7 +378,7 @@ Additionally, there are HPU PyTorch Bridge environment variables impacting vLLM
 - `PT_HPU_LAZY_MODE`: if `0`, PyTorch Eager backend for Gaudi will be used, if `1` PyTorch Lazy backend for Gaudi will be used, `1` is default
 - `PT_HPU_ENABLE_LAZY_COLLECTIVES`: required to be `true` for tensor parallel inference with HPU Graphs
 
-## Troubleshooting: Tweaking HPU Graphs
+## Troubleshooting: tweaking HPU graphs
 
 If you experience device out-of-memory issues or want to attempt
 inference at higher batch sizes, try tweaking HPU Graphs by following
diff --git a/docs/source/getting_started/installation/ai_accelerator/index.md b/docs/source/getting_started/installation/ai_accelerator/index.md
new file mode 100644
index 0000000000000..a6c4c44305a4c
--- /dev/null
+++ b/docs/source/getting_started/installation/ai_accelerator/index.md
@@ -0,0 +1,375 @@
+# Other AI accelerators
+
+vLLM is a Python library that supports the following AI accelerators. Select your AI accelerator type to see vendor specific instructions:
+
+::::{tab-set}
+:sync-group: device
+
+:::{tab-item} TPU
+:sync: tpu
+
+```{include} tpu.inc.md
+:start-after: "# Installation"
+:end-before: "## Requirements"
+```
+
+:::
+
+:::{tab-item} Intel Gaudi
+:sync: hpu-gaudi
+
+```{include} hpu-gaudi.inc.md
+:start-after: "# Installation"
+:end-before: "## Requirements"
+```
+
+:::
+
+:::{tab-item} Neuron
+:sync: neuron
+
+```{include} neuron.inc.md
+:start-after: "# Installation"
+:end-before: "## Requirements"
+```
+
+:::
+
+:::{tab-item} OpenVINO
+:sync: openvino
+
+```{include} openvino.inc.md
+:start-after: "# Installation"
+:end-before: "## Requirements"
+```
+
+:::
+
+::::
+
+## Requirements
+
+::::{tab-set}
+:sync-group: device
+
+:::{tab-item} TPU
+:sync: tpu
+
+```{include} tpu.inc.md
+:start-after: "## Requirements"
+:end-before: "## Configure a new environment"
+```
+
+:::
+
+:::{tab-item} Intel Gaudi
+:sync: hpu-gaudi
+
+```{include} hpu-gaudi.inc.md
+:start-after: "## Requirements"
+:end-before: "## Configure a new environment"
+```
+
+:::
+
+:::{tab-item} Neuron
+:sync: neuron
+
+```{include} neuron.inc.md
+:start-after: "## Requirements"
+:end-before: "## Configure a new environment"
+```
+
+:::
+
+:::{tab-item} OpenVINO
+:sync: openvino
+
+```{include} openvino.inc.md
+:start-after: "## Requirements"
+:end-before: "## Set up using Python"
+```
+
+:::
+
+::::
+
+## Configure a new environment
+
+::::{tab-set}
+:sync-group: device
+
+:::{tab-item} TPU
+:sync: tpu
+
+```{include} tpu.inc.md
+:start-after: "## Configure a new environment"
+:end-before: "## Set up using Python"
+```
+
+:::
+
+:::{tab-item} Intel Gaudi
+:sync: hpu-gaudi
+
+```{include} hpu-gaudi.inc.md
+:start-after: "## Configure a new environment"
+:end-before: "## Set up using Python"
+```
+
+:::
+
+:::{tab-item} Neuron
+:sync: neuron
+
+```{include} neuron.inc.md
+:start-after: "## Configure a new environment"
+:end-before: "## Set up using Python"
+```
+
+:::
+
+:::{tab-item} OpenVINO
+:sync: openvino
+
+```{include} ../python_env_setup.inc.md
+```
+
+:::
+
+::::
+
+## Set up using Python
+
+### Pre-built wheels
+
+::::{tab-set}
+:sync-group: device
+
+:::{tab-item} TPU
+:sync: tpu
+
+```{include} tpu.inc.md
+:start-after: "### Pre-built wheels"
+:end-before: "### Build wheel from source"
+```
+
+:::
+
+:::{tab-item} Intel Gaudi
+:sync: hpu-gaudi
+
+```{include} hpu-gaudi.inc.md
+:start-after: "### Pre-built wheels"
+:end-before: "### Build wheel from source"
+```
+
+:::
+
+:::{tab-item} Neuron
+:sync: neuron
+
+```{include} neuron.inc.md
+:start-after: "### Pre-built wheels"
+:end-before: "### Build wheel from source"
+```
+
+:::
+
+:::{tab-item} OpenVINO
+:sync: openvino
+
+```{include} openvino.inc.md
+:start-after: "### Pre-built wheels"
+:end-before: "### Build wheel from source"
+```
+
+:::
+
+::::
+
+### Build wheel from source
+
+::::{tab-set}
+:sync-group: device
+
+:::{tab-item} TPU
+:sync: tpu
+
+```{include} tpu.inc.md
+:start-after: "### Build wheel from source"
+:end-before: "## Set up using Docker"
+```
+
+:::
+
+:::{tab-item} Intel Gaudi
+:sync: hpu-gaudi
+
+```{include} hpu-gaudi.inc.md
+:start-after: "### Build wheel from source"
+:end-before: "## Set up using Docker"
+```
+
+:::
+
+:::{tab-item} Neuron
+:sync: neuron
+
+```{include} neuron.inc.md
+:start-after: "### Build wheel from source"
+:end-before: "## Set up using Docker"
+```
+
+:::
+
+:::{tab-item} OpenVINO
+:sync: openvino
+
+```{include} openvino.inc.md
+:start-after: "### Build wheel from source"
+:end-before: "## Set up using Docker"
+```
+
+:::
+
+::::
+
+## Set up using Docker
+
+### Pre-built images
+
+::::{tab-set}
+:sync-group: device
+
+:::{tab-item} TPU
+:sync: tpu
+
+```{include} tpu.inc.md
+:start-after: "### Pre-built images"
+:end-before: "### Build image from source"
+```
+
+:::
+
+:::{tab-item} Intel Gaudi
+:sync: hpu-gaudi
+
+```{include} hpu-gaudi.inc.md
+:start-after: "### Pre-built images"
+:end-before: "### Build image from source"
+```
+
+:::
+
+:::{tab-item} Neuron
+:sync: neuron
+
+```{include} neuron.inc.md
+:start-after: "### Pre-built images"
+:end-before: "### Build image from source"
+```
+
+:::
+
+:::{tab-item} OpenVINO
+:sync: openvino
+
+```{include} openvino.inc.md
+:start-after: "### Pre-built images"
+:end-before: "### Build image from source"
+```
+
+:::
+
+::::
+
+### Build image from source
+
+::::{tab-set}
+:sync-group: device
+
+:::{tab-item} TPU
+:sync: tpu
+
+```{include} tpu.inc.md
+:start-after: "### Build image from source"
+:end-before: "## Extra information"
+```
+
+:::
+
+:::{tab-item} Intel Gaudi
+:sync: hpu-gaudi
+
+```{include} hpu-gaudi.inc.md
+:start-after: "### Build image from source"
+:end-before: "## Extra information"
+```
+
+:::
+
+:::{tab-item} Neuron
+:sync: neuron
+
+```{include} neuron.inc.md
+:start-after: "### Build image from source"
+:end-before: "## Extra information"
+```
+
+:::
+
+:::{tab-item} OpenVINO
+:sync: openvino
+
+```{include} openvino.inc.md
+:start-after: "### Build image from source"
+:end-before: "## Extra information"
+```
+
+:::
+
+::::
+
+## Extra information
+
+::::{tab-set}
+:sync-group: device
+
+:::{tab-item} TPU
+:sync: tpu
+
+```{include} tpu.inc.md
+:start-after: "## Extra information"
+```
+
+:::
+
+:::{tab-item} Intel Gaudi
+:sync: hpu-gaudi
+
+```{include} hpu-gaudi.inc.md
+:start-after: "## Extra information"
+```
+
+:::
+
+:::{tab-item} Neuron
+:sync: neuron
+
+```{include} neuron.inc.md
+:start-after: "## Extra information"
+```
+
+:::
+
+:::{tab-item} OpenVINO
+:sync: openvino
+
+```{include} openvino.inc.md
+:start-after: "## Extra information"
+```
+
+:::
+
+::::
diff --git a/docs/source/getting_started/installation/neuron.md b/docs/source/getting_started/installation/ai_accelerator/neuron.inc.md
similarity index 86%
rename from docs/source/getting_started/installation/neuron.md
rename to docs/source/getting_started/installation/ai_accelerator/neuron.inc.md
index 5581b1940ca46..575a9f9c2e2f0 100644
--- a/docs/source/getting_started/installation/neuron.md
+++ b/docs/source/getting_started/installation/ai_accelerator/neuron.inc.md
@@ -1,6 +1,4 @@
-(installation-neuron)=
-
-# Installation for Neuron
+# Installation
 
 vLLM 0.3.3 onwards supports model inferencing and serving on AWS Trainium/Inferentia with Neuron SDK with continuous batching.
 Paged Attention and Chunked Prefill are currently in development and will be available soon.
@@ -14,28 +12,9 @@ Data types currently supported in Neuron SDK are FP16 and BF16.
 - Pytorch 2.0.1/2.1.1
 - AWS Neuron SDK 2.16/2.17 (Verified on python 3.8)
 
-Installation steps:
-
-- [Build from source](#build-from-source-neuron)
-
-  - [Step 0. Launch Trn1/Inf2 instances](#launch-instances)
-  - [Step 1. Install drivers and tools](#install-drivers)
-  - [Step 2. Install transformers-neuronx and its dependencies](#install-tnx)
-  - [Step 3. Install vLLM from source](#install-vllm)
-
-(build-from-source-neuron)=
-
-```{note}
-The currently supported version of Pytorch for Neuron installs `triton` version `2.1.0`. This is incompatible with `vllm >= 0.5.3`. You may see an error `cannot import name 'default_dump_dir...`. To work around this, run a `pip install --upgrade triton==3.0.0` after installing the vLLM wheel.
-```
-
-## Build from source
-
-Following instructions are applicable to Neuron SDK 2.16 and beyond.
-
-(launch-instances)=
+## Configure a new environment
 
-### Step 0. Launch Trn1/Inf2 instances
+### Launch Trn1/Inf2 instances
 
 Here are the steps to launch trn1/inf2 instances, in order to install [PyTorch Neuron ("torch-neuronx") Setup on Ubuntu 22.04 LTS](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/setup/neuron-setup/pytorch/neuronx/ubuntu/torch-neuronx-ubuntu22.html).
 
@@ -45,9 +24,7 @@ Here are the steps to launch trn1/inf2 instances, in order to install [PyTorch N
 - When launching a Trn1/Inf2, please adjust your primary EBS volume size to a minimum of 512GB.
 - After launching the instance, follow the instructions in [Connect to your instance](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/AccessingInstancesLinux.html) to connect to the instance
 
-(install-drivers)=
-
-### Step 1. Install drivers and tools
+### Install drivers and tools
 
 The installation of drivers and tools wouldn't be necessary, if [Deep Learning AMI Neuron](https://docs.aws.amazon.com/dlami/latest/devguide/appendix-ami-release-notes.html) is installed. In case the drivers and tools are not installed on the operating system, follow the steps below:
 
@@ -82,9 +59,21 @@ sudo apt-get install aws-neuronx-tools=2.* -y
 export PATH=/opt/aws/neuron/bin:$PATH
 ```
 
-(install-tnx)=
+## Set up using Python
+
+### Pre-built wheels
 
-### Step 2. Install transformers-neuronx and its dependencies
+Currently, there are no pre-built Neuron wheels.
+
+### Build wheel from source
+
+```{note}
+The currently supported version of Pytorch for Neuron installs `triton` version `2.1.0`. This is incompatible with `vllm >= 0.5.3`. You may see an error `cannot import name 'default_dump_dir...`. To work around this, run a `pip install --upgrade triton==3.0.0` after installing the vLLM wheel.
+```
+
+Following instructions are applicable to Neuron SDK 2.16 and beyond.
+
+#### Install transformers-neuronx and its dependencies
 
 [transformers-neuronx](https://github.com/aws-neuron/transformers-neuronx) will be the backend to support inference on trn1/inf2 instances.
 Follow the steps below to install transformer-neuronx package and its dependencies.
@@ -116,9 +105,7 @@ python -m pip install awscli
 python -m pip install --upgrade neuronx-cc==2.* --pre torch-neuronx==2.1.* torchvision transformers-neuronx
 ```
 
-(install-vllm)=
-
-### Step 3. Install vLLM from source
+#### Install vLLM from source
 
 Once neuronx-cc and transformers-neuronx packages are installed, we will be able to install vllm as follows:
 
@@ -130,3 +117,19 @@ VLLM_TARGET_DEVICE="neuron" pip install .
 ```
 
 If neuron packages are detected correctly in the installation process, `vllm-0.3.0+neuron212` will be installed.
+
+## Set up using Docker
+
+### Pre-built images
+
+Currently, there are no pre-built Neuron images.
+
+### Build image from source
+
+See <project:#deployment-docker-build-image-from-source> for instructions on building the Docker image.
+
+Make sure to use <gh-file:Dockerfile.neuron> in place of the default Dockerfile.
+
+## Extra information
+
+There is no extra information for this device.
diff --git a/docs/source/getting_started/installation/openvino.md b/docs/source/getting_started/installation/ai_accelerator/openvino.inc.md
similarity index 69%
rename from docs/source/getting_started/installation/openvino.md
rename to docs/source/getting_started/installation/ai_accelerator/openvino.inc.md
index d97d4173bf36b..a7867472583d6 100644
--- a/docs/source/getting_started/installation/openvino.md
+++ b/docs/source/getting_started/installation/ai_accelerator/openvino.inc.md
@@ -1,63 +1,65 @@
-(installation-openvino)=
+# Installation
 
-# Installation for OpenVINO
+vLLM powered by OpenVINO supports all LLM models from [vLLM supported models list](#supported-models) and can perform optimal model serving on all x86-64 CPUs with, at least, AVX2 support, as well as on both integrated and discrete Intel® GPUs ([the list of supported GPUs](https://docs.openvino.ai/2024/about-openvino/release-notes-openvino/system-requirements.html#gpu)).
 
-vLLM powered by OpenVINO supports all LLM models from [vLLM supported models list](#supported-models) and can perform optimal model serving on all x86-64 CPUs with, at least, AVX2 support, as well as on both integrated and discrete Intel® GPUs ([the list of supported GPUs](https://docs.openvino.ai/2024/about-openvino/release-notes-openvino/system-requirements.html#gpu)). OpenVINO vLLM backend supports the following advanced vLLM features:
+## Requirements
 
-- Prefix caching (`--enable-prefix-caching`)
-- Chunked prefill (`--enable-chunked-prefill`)
+- OS: Linux
+- Instruction set architecture (ISA) requirement: at least AVX2.
 
-**Table of contents**:
+## Set up using Python
 
-- [Requirements](#openvino-backend-requirements)
-- [Quick start using Dockerfile](#openvino-backend-quick-start-dockerfile)
-- [Build from source](#install-openvino-backend-from-source)
-- [Performance tips](#openvino-backend-performance-tips)
-- [Limitations](#openvino-backend-limitations)
+### Pre-built wheels
 
-(openvino-backend-requirements)=
+Currently, there are no pre-built OpenVINO wheels.
 
-## Requirements
+### Build wheel from source
 
-- OS: Linux
-- Instruction set architecture (ISA) requirement: at least AVX2.
+First, install Python. For example, on Ubuntu 22.04, you can run:
 
-(openvino-backend-quick-start-dockerfile)=
+```console
+sudo apt-get update  -y
+sudo apt-get install python3
+```
 
-## Quick start using Dockerfile
+Second, install prerequisites vLLM OpenVINO backend installation:
 
 ```console
-docker build -f Dockerfile.openvino -t vllm-openvino-env .
-docker run -it --rm vllm-openvino-env
+pip install --upgrade pip
+pip install -r requirements-build.txt --extra-index-url https://download.pytorch.org/whl/cpu
 ```
 
-(install-openvino-backend-from-source)=
+Finally, install vLLM with OpenVINO backend:
 
-## Install from source
+```console
+PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE=openvino python -m pip install -v .
+```
 
-- First, install Python. For example, on Ubuntu 22.04, you can run:
+:::{tip}
+To use vLLM OpenVINO backend with a GPU device, ensure your system is properly set up. Follow the instructions provided here: [https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html](https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html).
+:::
 
-  ```console
-  sudo apt-get update  -y
-  sudo apt-get install python3
-  ```
+## Set up using Docker
 
-- Second, install prerequisites vLLM OpenVINO backend installation:
+### Pre-built images
 
-  ```console
-  pip install --upgrade pip
-  pip install -r requirements-build.txt --extra-index-url https://download.pytorch.org/whl/cpu
-  ```
+Currently, there are no pre-built OpenVINO images.
 
-- Finally, install vLLM with OpenVINO backend:
+### Build image from source
 
-  ```console
-  PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE=openvino python -m pip install -v .
-  ```
+```console
+docker build -f Dockerfile.openvino -t vllm-openvino-env .
+docker run -it --rm vllm-openvino-env
+```
 
-- [Optional] To use vLLM OpenVINO backend with a GPU device, ensure your system is properly set up. Follow the instructions provided here: [https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html](https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html).
+## Extra information
 
-(openvino-backend-performance-tips)=
+## Supported features
+
+OpenVINO vLLM backend supports the following advanced vLLM features:
+
+- Prefix caching (`--enable-prefix-caching`)
+- Chunked prefill (`--enable-chunked-prefill`)
 
 ## Performance tips
 
@@ -95,8 +97,6 @@ $ VLLM_OPENVINO_DEVICE=GPU VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON \
     python3 vllm/benchmarks/benchmark_throughput.py --model meta-llama/Llama-2-7b-chat-hf --dataset vllm/benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json
 ```
 
-(openvino-backend-limitations)=
-
 ## Limitations
 
 - LoRA serving is not supported.
diff --git a/docs/source/getting_started/installation/tpu.md b/docs/source/getting_started/installation/ai_accelerator/tpu.inc.md
similarity index 88%
rename from docs/source/getting_started/installation/tpu.md
rename to docs/source/getting_started/installation/ai_accelerator/tpu.inc.md
index 1938785ade46a..6a911cc6b9eba 100644
--- a/docs/source/getting_started/installation/tpu.md
+++ b/docs/source/getting_started/installation/ai_accelerator/tpu.inc.md
@@ -1,6 +1,4 @@
-(installation-tpu)=
-
-# Installation for TPUs
+# Installation
 
 Tensor Processing Units (TPUs) are Google's custom-developed application-specific
 integrated circuits (ASICs) used to accelerate machine learning workloads. TPUs
@@ -54,7 +52,16 @@ In all of the following commands, replace the ALL CAPS parameter names with
 appropriate values. See the parameter descriptions table for more information.
 ```
 
-## Provision a Cloud TPU with the queued resource API
+### Provision Cloud TPUs with GKE
+
+For more information about using TPUs with GKE, see:
+- <https://cloud.google.com/kubernetes-engine/docs/how-to/tpus>
+- <https://cloud.google.com/kubernetes-engine/docs/concepts/tpus>
+- <https://cloud.google.com/kubernetes-engine/docs/concepts/plan-tpus>
+
+## Configure a new environment
+
+### Provision a Cloud TPU with the queued resource API
 
 Create a TPU v5e with 4 TPU chips:
 
@@ -102,6 +109,14 @@ Connect to your TPU using SSH:
 gcloud compute tpus tpu-vm ssh TPU_NAME --zone ZONE
 ```
 
+## Set up using Python
+
+### Pre-built wheels
+
+Currently, there are no pre-built TPU wheels.
+
+### Build wheel from source
+
 Install Miniconda:
 
 ```bash
@@ -142,16 +157,13 @@ Run the setup script:
 VLLM_TARGET_DEVICE="tpu" python setup.py develop
 ```
 
-## Provision Cloud TPUs with GKE
+## Set up using Docker
 
-For more information about using TPUs with GKE, see
-<https://cloud.google.com/kubernetes-engine/docs/how-to/tpus>
-<https://cloud.google.com/kubernetes-engine/docs/concepts/tpus>
-<https://cloud.google.com/kubernetes-engine/docs/concepts/plan-tpus>
+### Pre-built images
 
-(build-docker-tpu)=
+See <project:#deployment-docker-pre-built-image> for instructions on using the official Docker image, making sure to substitute the image name `vllm/vllm-openai` with `vllm/vllm-tpu`.
 
-## Build a docker image with {code}`Dockerfile.tpu`
+### Build image from source
 
 You can use <gh-file:Dockerfile.tpu> to build a Docker image with TPU support.
 
@@ -189,3 +201,7 @@ Install OpenBLAS with the following command:
 $ sudo apt-get install libopenblas-base libopenmpi-dev libomp-dev
 ```
 ````
+
+## Extra information
+
+There is no extra information for this device.
diff --git a/docs/source/getting_started/installation/cpu-arm.md b/docs/source/getting_started/installation/cpu-arm.md
deleted file mode 100644
index e199073ed721f..0000000000000
--- a/docs/source/getting_started/installation/cpu-arm.md
+++ /dev/null
@@ -1,46 +0,0 @@
-(installation-arm)=
-
-# Installation for ARM CPUs
-
-vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CPU backend initially developed for the x86 platform. This guide provides installation instructions specific to ARM (which also apply to Apple Silicon, see [Installation for macOS](#installation-apple) for more). For additional details on supported features, refer to the [x86 CPU documentation](#installation-x86) covering:
-
-- CPU backend inference capabilities
-- Relevant runtime environment variables
-- Performance optimization tips
-
-ARM CPU backend currently supports Float32, FP16 and BFloat16 datatypes.
-Contents:
-
-1. [Requirements](#arm-backend-requirements)
-2. [Quick Start with Dockerfile](#arm-backend-quick-start-dockerfile)
-3. [Building from Source](#build-arm-backend-from-source)
-
-(arm-backend-requirements)=
-
-## Requirements
-
-- **Operating System**: Linux or macOS
-- **Compilers**: `gcc/g++ >= 12.3.0` (optional, but recommended) or `Apple Clang >= 15.0.0` for macOS
-- **Instruction Set Architecture (ISA)**: NEON support is required
-
-(arm-backend-quick-start-dockerfile)=
-
-## Quick Start with Dockerfile
-
-You can quickly set up vLLM on ARM using Docker:
-
-```console
-$ docker build -f Dockerfile.arm -t vllm-cpu-env --shm-size=4g .
-$ docker run -it \
-             --rm \
-             --network=host \
-             --cpuset-cpus=<cpu-id-list, optional> \
-             --cpuset-mems=<memory-node, optional> \
-             vllm-cpu-env
-```
-
-(build-arm-backend-from-source)=
-
-## Building from Source
-
-To build vLLM from source on Ubuntu 22.04 or other Linux distributions, follow a similar process as with x86. Testing has been conducted on AWS Graviton3 instances for compatibility.
diff --git a/docs/source/getting_started/installation/cpu-apple.md b/docs/source/getting_started/installation/cpu/apple.inc.md
similarity index 70%
rename from docs/source/getting_started/installation/cpu-apple.md
rename to docs/source/getting_started/installation/cpu/apple.inc.md
index 1068893f5bafa..56545253b1ef7 100644
--- a/docs/source/getting_started/installation/cpu-apple.md
+++ b/docs/source/getting_started/installation/cpu/apple.inc.md
@@ -1,20 +1,20 @@
-(installation-apple)=
+# Installation
 
-# Installation for macOS
-
-vLLM has experimental support for macOS with Apple Silicon. For now, users shall build from the source vLLM to natively run on macOS. For more details, like running on vLLM in a docker container, see [ARM CPU Documentation](installation-arm)
+vLLM has experimental support for macOS with Apple silicon. For now, users shall build from the source vLLM to natively run on macOS.
 
 Currently the CPU implementation for macOS supports FP32 and FP16 datatypes.
 
 ## Requirements
 
-- **Operating System**: `macOS Sonoma` or later
-- **SDK** `XCode 15.4` or later with Command Line Tools
-- **Compilers**: `Apple Clang >= 15.0.0`
+- OS: `macOS Sonoma` or later
+- SDK: `XCode 15.4` or later with Command Line Tools
+- Compiler: `Apple Clang >= 15.0.0`
+
+## Set up using Python
 
-<!-- (arm-backend-quick-start-dockerfile)= -->
+### Pre-built wheels
 
-## Build and installation
+### Build wheel from source
 
 After installation of XCode and the Command Line Tools, which include Apple Clang, execute the following commands to build and install vLLM from the source.
 
@@ -29,7 +29,7 @@ pip install -e .
 On macOS the `VLLM_TARGET_DEVICE` is automatically set to `cpu`, which currently is the only supported device.
 ```
 
-## Troubleshooting
+#### Troubleshooting
 
 If the build has error like the following snippet where standard C++ headers cannot be found, try to remove and reinstall your
 [Command Line Tools for Xcode](https://developer.apple.com/download/all/).
@@ -46,3 +46,11 @@ If the build has error like the following snippet where standard C++ headers can
             |          ^~~~~~~~~
       1 error generated.
 ```
+
+## Set up using Docker
+
+### Pre-built images
+
+### Build image from source
+
+## Extra information
diff --git a/docs/source/getting_started/installation/cpu/arm.inc.md b/docs/source/getting_started/installation/cpu/arm.inc.md
new file mode 100644
index 0000000000000..08a764e1a25f4
--- /dev/null
+++ b/docs/source/getting_started/installation/cpu/arm.inc.md
@@ -0,0 +1,30 @@
+# Installation
+
+vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CPU backend initially developed for the x86 platform.
+
+ARM CPU backend currently supports Float32, FP16 and BFloat16 datatypes.
+
+## Requirements
+
+- OS: Linux
+- Compiler: `gcc/g++ >= 12.3.0` (optional, recommended)
+- Instruction Set Architecture (ISA): NEON support is required
+
+## Set up using Python
+
+### Pre-built wheels
+
+### Build wheel from source
+
+:::{include} build.inc.md
+:::
+
+Testing has been conducted on AWS Graviton3 instances for compatibility.
+
+## Set up using Docker
+
+### Pre-built images
+
+### Build image from source
+
+## Extra information
diff --git a/docs/source/getting_started/installation/cpu/build.inc.md b/docs/source/getting_started/installation/cpu/build.inc.md
new file mode 100644
index 0000000000000..f8d1044a0d198
--- /dev/null
+++ b/docs/source/getting_started/installation/cpu/build.inc.md
@@ -0,0 +1,21 @@
+First, install recommended compiler. We recommend to use `gcc/g++ >= 12.3.0` as the default compiler to avoid potential problems. For example, on Ubuntu 22.4, you can run:
+
+```console
+sudo apt-get update  -y
+sudo apt-get install -y gcc-12 g++-12 libnuma-dev
+sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
+```
+
+Second, install Python packages for vLLM CPU backend building:
+
+```console
+pip install --upgrade pip
+pip install cmake>=3.26 wheel packaging ninja "setuptools-scm>=8" numpy
+pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
+```
+
+Finally, build and install vLLM CPU backend:
+
+```console
+VLLM_TARGET_DEVICE=cpu python setup.py install
+```
diff --git a/docs/source/getting_started/installation/cpu-x86.md b/docs/source/getting_started/installation/cpu/index.md
similarity index 67%
rename from docs/source/getting_started/installation/cpu-x86.md
rename to docs/source/getting_started/installation/cpu/index.md
index c49c8e0f2a18c..4ec907c0e9fda 100644
--- a/docs/source/getting_started/installation/cpu-x86.md
+++ b/docs/source/getting_started/installation/cpu/index.md
@@ -1,91 +1,165 @@
-(installation-x86)=
+# CPU
 
-# Installation for x86 CPUs
+vLLM is a Python library that supports the following CPU variants. Select your CPU type to see vendor specific instructions:
 
-vLLM initially supports basic model inferencing and serving on x86 CPU platform, with data types FP32, FP16 and BF16. vLLM CPU backend supports the following vLLM features:
+::::{tab-set}
+:sync-group: device
 
-- Tensor Parallel
-- Model Quantization (`INT8 W8A8, AWQ, GPTQ`)
-- Chunked-prefill
-- Prefix-caching
-- FP8-E5M2 KV-Caching (TODO)
+:::{tab-item} x86
+:sync: x86
+
+```{include} x86.inc.md
+:start-after: "# Installation"
+:end-before: "## Requirements"
+```
 
-Table of contents:
+:::
+
+:::{tab-item} ARM
+:sync: arm
+
+```{include} arm.inc.md
+:start-after: "# Installation"
+:end-before: "## Requirements"
+```
+
+:::
+
+:::{tab-item} Apple silicon
+:sync: apple
+
+```{include} apple.inc.md
+:start-after: "# Installation"
+:end-before: "## Requirements"
+```
 
-1. [Requirements](#cpu-backend-requirements)
-2. [Quick start using Dockerfile](#cpu-backend-quick-start-dockerfile)
-3. [Build from source](#build-cpu-backend-from-source)
-4. [Related runtime environment variables](#env-intro)
-5. [Intel Extension for PyTorch](#ipex-guidance)
-6. [Performance tips](#cpu-backend-performance-tips)
+:::
 
-(cpu-backend-requirements)=
+::::
 
 ## Requirements
 
-- OS: Linux
-- Compiler: `gcc/g++>=12.3.0` (optional, recommended)
-- Instruction set architecture (ISA) requirement: AVX512 (optional, recommended)
+- Python: 3.9 -- 3.12
 
-(cpu-backend-quick-start-dockerfile)=
+::::{tab-set}
+:sync-group: device
 
-## Quick start using Dockerfile
+:::{tab-item} x86
+:sync: x86
 
-```console
-docker build -f Dockerfile.cpu -t vllm-cpu-env --shm-size=4g .
-docker run -it \
-           --rm \
-           --network=host \
-           --cpuset-cpus=<cpu-id-list, optional> \
-           --cpuset-mems=<memory-node, optional> \
-           vllm-cpu-env
+```{include} x86.inc.md
+:start-after: "## Requirements"
+:end-before: "## Set up using Python"
 ```
 
-(build-cpu-backend-from-source)=
+:::
+
+:::{tab-item} ARM
+:sync: arm
 
-## Build from source
+```{include} arm.inc.md
+:start-after: "## Requirements"
+:end-before: "## Set up using Python"
+```
 
-- First, install recommended compiler. We recommend to use `gcc/g++ >= 12.3.0` as the default compiler to avoid potential problems. For example, on Ubuntu 22.4, you can run:
+:::
 
-```console
-sudo apt-get update  -y
-sudo apt-get install -y gcc-12 g++-12 libnuma-dev
-sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
+:::{tab-item} Apple silicon
+:sync: apple
+
+```{include} apple.inc.md
+:start-after: "## Requirements"
+:end-before: "## Set up using Python"
 ```
 
-- Second, install Python packages for vLLM CPU backend building:
+:::
 
-```console
-pip install --upgrade pip
-pip install cmake>=3.26 wheel packaging ninja "setuptools-scm>=8" numpy
-pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
+::::
+
+## Set up using Python
+
+### Create a new Python environment
+
+```{include} ../python_env_setup.inc.md
 ```
 
-- Finally, build and install vLLM CPU backend:
+### Pre-built wheels
 
-```console
-VLLM_TARGET_DEVICE=cpu python setup.py install
+Currently, there are no pre-built CPU wheels.
+
+### Build wheel from source
+
+::::{tab-set}
+:sync-group: device
+
+:::{tab-item} x86
+:sync: x86
+
+```{include} x86.inc.md
+:start-after: "### Build wheel from source"
+:end-before: "## Set up using Docker"
 ```
 
-```{note}
-- AVX512_BF16 is an extension ISA provides native BF16 data type conversion and vector product instructions, will brings some performance improvement compared with pure AVX512. The CPU backend build script will check the host CPU flags to determine whether to enable AVX512_BF16.
-- If you want to force enable AVX512_BF16 for the cross-compilation, please set environment variable `VLLM_CPU_AVX512BF16=1` before the building.
+:::
+
+:::{tab-item} ARM
+:sync: arm
+
+```{include} arm.inc.md
+:start-after: "### Build wheel from source"
+:end-before: "## Set up using Docker"
 ```
 
-(env-intro)=
+:::
 
-## Related runtime environment variables
+:::{tab-item} Apple silicon
+:sync: apple
 
-- `VLLM_CPU_KVCACHE_SPACE`: specify the KV Cache size (e.g, `VLLM_CPU_KVCACHE_SPACE=40` means 40 GB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users.
-- `VLLM_CPU_OMP_THREADS_BIND`: specify the CPU cores dedicated to the OpenMP threads. For example, `VLLM_CPU_OMP_THREADS_BIND=0-31` means there will be 32 OpenMP threads bound on 0-31 CPU cores. `VLLM_CPU_OMP_THREADS_BIND=0-31|32-63` means there will be 2 tensor parallel processes, 32 OpenMP threads of rank0 are bound on 0-31 CPU cores, and the OpenMP threads of rank1 are bound on 32-63 CPU cores.
+```{include} apple.inc.md
+:start-after: "### Build wheel from source"
+:end-before: "## Set up using Docker"
+```
+
+:::
+
+::::
+
+## Set up using Docker
+
+### Pre-built images
+
+Currently, there are no pre-build CPU images.
+
+### Build image from source
 
-(ipex-guidance)=
+```console
+$ docker build -f Dockerfile.cpu -t vllm-cpu-env --shm-size=4g .
+$ docker run -it \
+             --rm \
+             --network=host \
+             --cpuset-cpus=<cpu-id-list, optional> \
+             --cpuset-mems=<memory-node, optional> \
+             vllm-cpu-env
+```
 
-## Intel Extension for PyTorch
+:::{tip}
+For ARM or Apple silicon, use `Dockerfile.arm`
+:::
 
-- [Intel Extension for PyTorch (IPEX)](https://github.com/intel/intel-extension-for-pytorch) extends PyTorch with up-to-date features optimizations for an extra performance boost on Intel hardware.
+## Supported features
 
-(cpu-backend-performance-tips)=
+vLLM CPU backend supports the following vLLM features:
+
+- Tensor Parallel
+- Model Quantization (`INT8 W8A8, AWQ, GPTQ`)
+- Chunked-prefill
+- Prefix-caching
+- FP8-E5M2 KV-Caching (TODO)
+
+## Related runtime environment variables
+
+- `VLLM_CPU_KVCACHE_SPACE`: specify the KV Cache size (e.g, `VLLM_CPU_KVCACHE_SPACE=40` means 40 GB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users.
+- `VLLM_CPU_OMP_THREADS_BIND`: specify the CPU cores dedicated to the OpenMP threads. For example, `VLLM_CPU_OMP_THREADS_BIND=0-31` means there will be 32 OpenMP threads bound on 0-31 CPU cores. `VLLM_CPU_OMP_THREADS_BIND=0-31|32-63` means there will be 2 tensor parallel processes, 32 OpenMP threads of rank0 are bound on 0-31 CPU cores, and the OpenMP threads of rank1 are bound on 32-63 CPU cores.
 
 ## Performance tips
 
@@ -137,13 +211,13 @@ $ python examples/offline_inference/basic.py
 
 - If using vLLM CPU backend on a multi-socket machine with NUMA, be aware to set CPU cores using `VLLM_CPU_OMP_THREADS_BIND` to avoid cross NUMA node memory access.
 
-## CPU Backend Considerations
+## Other considerations
 
 - The CPU backend significantly differs from the GPU backend since the vLLM architecture was originally optimized for GPU use. A number of optimizations are needed to enhance its performance.
 
 - Decouple the HTTP serving components from the inference components. In a GPU backend configuration, the HTTP serving and tokenization tasks operate on the CPU, while inference runs on the GPU, which typically does not pose a problem. However, in a CPU-based setup, the HTTP serving and tokenization can cause significant context switching and reduced cache efficiency. Therefore, it is strongly recommended to segregate these two components for improved performance.
 
-- On CPU based setup with NUMA enabled, the memory access performance may be largely impacted by the [topology](https://github.com/intel/intel-extension-for-pytorch/blob/main/docs/tutorials/performance_tuning/tuning_guide.md#non-uniform-memory-access-numa). For NUMA architecture, two optimizations are to recommended: Tensor Parallel or Data Parallel.
+- On CPU based setup with NUMA enabled, the memory access performance may be largely impacted by the [topology](https://github.com/intel/intel-extension-for-pytorch/blob/main/docs/tutorials/performance_tuning/tuning_guide.inc.md#non-uniform-memory-access-numa). For NUMA architecture, two optimizations are to recommended: Tensor Parallel or Data Parallel.
 
   - Using Tensor Parallel for a latency constraints deployment: following GPU backend design, a Megatron-LM's parallel algorithm will be used to shard the model, based on the number of NUMA nodes (e.g. TP = 2 for a two NUMA node system). With [TP feature on CPU](gh-pr:6125) merged, Tensor Parallel is supported for serving and offline inferencing. In general each NUMA node is treated as one GPU card. Below is the example script to enable Tensor Parallel = 2 for serving:
 
@@ -151,4 +225,4 @@ $ python examples/offline_inference/basic.py
     VLLM_CPU_KVCACHE_SPACE=40 VLLM_CPU_OMP_THREADS_BIND="0-31|32-63" vllm serve meta-llama/Llama-2-7b-chat-hf -tp=2 --distributed-executor-backend mp
     ```
 
-  - Using Data Parallel for maximum throughput: to launch an LLM serving endpoint on each NUMA node along with one additional load balancer to dispatch the requests to those endpoints. Common solutions like [Nginx](#nginxloadbalancer) or HAProxy are recommended. Anyscale Ray project provides the feature on LLM [serving](https://docs.ray.io/en/latest/serve/index.html). Here is the example to setup a scalable LLM serving with [Ray Serve](https://github.com/intel/llm-on-ray/blob/main/docs/setup.md).
+  - Using Data Parallel for maximum throughput: to launch an LLM serving endpoint on each NUMA node along with one additional load balancer to dispatch the requests to those endpoints. Common solutions like [Nginx](#nginxloadbalancer) or HAProxy are recommended. Anyscale Ray project provides the feature on LLM [serving](https://docs.ray.io/en/latest/serve/index.html). Here is the example to setup a scalable LLM serving with [Ray Serve](https://github.com/intel/llm-on-ray/blob/main/docs/setup.inc.md).
diff --git a/docs/source/getting_started/installation/cpu/x86.inc.md b/docs/source/getting_started/installation/cpu/x86.inc.md
new file mode 100644
index 0000000000000..e4f99d3cebdf2
--- /dev/null
+++ b/docs/source/getting_started/installation/cpu/x86.inc.md
@@ -0,0 +1,35 @@
+# Installation
+
+vLLM initially supports basic model inferencing and serving on x86 CPU platform, with data types FP32, FP16 and BF16.
+
+## Requirements
+
+- OS: Linux
+- Compiler: `gcc/g++ >= 12.3.0` (optional, recommended)
+- Instruction Set Architecture (ISA): AVX512 (optional, recommended)
+
+## Set up using Python
+
+### Pre-built wheels
+
+### Build wheel from source
+
+:::{include} build.inc.md
+:::
+
+```{note}
+- AVX512_BF16 is an extension ISA provides native BF16 data type conversion and vector product instructions, will brings some performance improvement compared with pure AVX512. The CPU backend build script will check the host CPU flags to determine whether to enable AVX512_BF16.
+- If you want to force enable AVX512_BF16 for the cross-compilation, please set environment variable `VLLM_CPU_AVX512BF16=1` before the building.
+```
+
+## Set up using Docker
+
+### Pre-built images
+
+### Build image from source
+
+## Extra information
+
+## Intel Extension for PyTorch
+
+- [Intel Extension for PyTorch (IPEX)](https://github.com/intel/intel-extension-for-pytorch) extends PyTorch with up-to-date features optimizations for an extra performance boost on Intel hardware.
diff --git a/docs/source/getting_started/installation/device.template.md b/docs/source/getting_started/installation/device.template.md
new file mode 100644
index 0000000000000..44f538da93659
--- /dev/null
+++ b/docs/source/getting_started/installation/device.template.md
@@ -0,0 +1,17 @@
+# Installation
+
+## Requirements
+
+## Set up using Python
+
+### Pre-built wheels
+
+### Build wheel from source
+
+## Set up using Docker
+
+### Pre-built images
+
+### Build image from source
+
+## Extra information
diff --git a/docs/source/getting_started/installation/gpu-cuda.md b/docs/source/getting_started/installation/gpu/cuda.inc.md
similarity index 84%
rename from docs/source/getting_started/installation/gpu-cuda.md
rename to docs/source/getting_started/installation/gpu/cuda.inc.md
index 727486abbd10f..4cce65278c069 100644
--- a/docs/source/getting_started/installation/gpu-cuda.md
+++ b/docs/source/getting_started/installation/gpu/cuda.inc.md
@@ -1,44 +1,24 @@
-(installation-cuda)=
+# Installation
 
-# Installation for CUDA
-
-vLLM is a Python library that also contains pre-compiled C++ and CUDA (12.1) binaries.
+vLLM contains pre-compiled C++ and CUDA (12.1) binaries.
 
 ## Requirements
 
-- OS: Linux
-- Python: 3.9 -- 3.12
 - GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.)
 
-## Install released versions
+## Set up using Python
 
 ### Create a new Python environment
 
-You can create a new Python environment using `conda`:
-
-```console
-# (Recommended) Create a new conda environment.
-conda create -n myenv python=3.12 -y
-conda activate myenv
-```
-
 ```{note}
-[PyTorch has deprecated the conda release channel](https://github.com/pytorch/pytorch/issues/138506). If you use `conda`, please only use it to create Python environment rather than installing packages. In particular, the PyTorch installed via `conda` will statically link `NCCL` library, which can cause issues when vLLM tries to use `NCCL`. See <gh-issue:8420> for more details.
-```
-
-Or you can create a new Python environment using [uv](https://docs.astral.sh/uv/), a very fast Python environment manager. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment using the following command:
-
-```console
-# (Recommended) Create a new uv environment. Use `--seed` to install `pip` and `setuptools` in the environment.
-uv venv myenv --python 3.12 --seed
-source myenv/bin/activate
+PyTorch installed via `conda` will statically link `NCCL` library, which can cause issues when vLLM tries to use `NCCL`. See <gh-issue:8420> for more details.
 ```
 
 In order to be performant, vLLM has to compile many cuda kernels. The compilation unfortunately introduces binary incompatibility with other CUDA versions and PyTorch versions, even for the same PyTorch version with different building configurations.
 
 Therefore, it is recommended to install vLLM with a **fresh new** environment. If either you have a different CUDA version or you want to use an existing PyTorch installation, you need to build vLLM from source. See [below](#build-from-source) for more details.
 
-### Install vLLM
+### Pre-built wheels
 
 You can install vLLM using either `pip` or `uv pip`:
 
@@ -59,11 +39,11 @@ pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSI
 
 (install-the-latest-code)=
 
-## Install the latest code
+#### Install the latest code
 
 LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides wheels for Linux running on a x86 platform with CUDA 12 for every commit since `v0.5.3`.
 
-### Install the latest code using `pip`
+##### Install the latest code using `pip`
 
 ```console
 pip install vllm --pre --extra-index-url https://wheels.vllm.ai/nightly
@@ -80,7 +60,7 @@ pip install https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manyl
 
 Note that the wheels are built with Python 3.8 ABI (see [PEP 425](https://peps.python.org/pep-0425/) for more details about ABI), so **they are compatible with Python 3.8 and later**. The version string in the wheel file name (`1.0.0.dev`) is just a placeholder to have a unified URL for the wheels, the actual versions of wheels are contained in the wheel metadata (the wheels listed in the extra index url have correct versions). Although we don't support Python 3.8 any more (because PyTorch 2.5 dropped support for Python 3.8), the wheels are still built with Python 3.8 ABI to keep the same wheel name as before.
 
-### Install the latest code using `uv`
+##### Install the latest code using `uv`
 
 Another way to install the latest code is to use `uv`:
 
@@ -97,26 +77,9 @@ uv pip install vllm --extra-index-url https://wheels.vllm.ai/${VLLM_COMMIT}
 
 The `uv` approach works for vLLM `v0.6.6` and later and offers an easy-to-remember command. A unique feature of `uv` is that packages in `--extra-index-url` have [higher priority than the default index](https://docs.astral.sh/uv/pip/compatibility/#packages-that-exist-on-multiple-indexes). If the latest public release is `v0.6.6.post1`, `uv`'s behavior allows installing a commit before `v0.6.6.post1` by specifying the `--extra-index-url`. In contrast, `pip` combines packages from `--extra-index-url` and the default index, choosing only the latest version, which makes it difficult to install a development version prior to the released version.
 
-### Install the latest code using `docker`
-
-Another way to access the latest code is to use the docker images:
-
-```console
-export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch
-docker pull public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:${VLLM_COMMIT}
-```
-
-These docker images are used for CI and testing only, and they are not intended for production use. They will be expired after several days.
-
-The latest code can contain bugs and may not be stable. Please use it with caution.
-
-(build-from-source)=
+### Build wheel from source
 
-## Build from source
-
-(python-only-build)=
-
-### Python-only build (without compilation)
+#### Set up using Python-only build (without compilation)
 
 If you only need to change Python code, you can build and install vLLM without compilation. Using `pip`'s [`--editable` flag](https://pip.pypa.io/en/stable/topics/local-project-installs/#editable-installs), changes you make to the code will be reflected when you run vLLM:
 
@@ -135,14 +98,14 @@ export VLLM_PRECOMPILED_WHEEL_LOCATION=https://files.pythonhosted.org/packages/4
 pip install --editable .
 ```
 
-You can find more information about vLLM's wheels [above](#install-the-latest-code).
+You can find more information about vLLM's wheels in <project:#install-the-latest-code>.
 
 ```{note}
 There is a possibility that your source code may have a different commit ID compared to the latest vLLM wheel, which could potentially lead to unknown errors.
-It is recommended to use the same commit ID for the source code as the vLLM wheel you have installed. Please refer to [the section above](#install-the-latest-code) for instructions on how to install a specified wheel.
+It is recommended to use the same commit ID for the source code as the vLLM wheel you have installed. Please refer to <project:#install-the-latest-code> for instructions on how to install a specified wheel.
 ```
 
-### Full build (with compilation)
+#### Full build (with compilation)
 
 If you want to modify C++ or CUDA code, you'll need to build vLLM from source. This can take several minutes:
 
@@ -162,7 +125,7 @@ As long as `which ccache` command can find the `ccache` binary, it will be used
 The following environment variables can be set to configure the vLLM `sccache` remote: `SCCACHE_BUCKET=vllm-build-sccache SCCACHE_REGION=us-west-2 SCCACHE_S3_NO_CREDENTIALS=1`. We also recommend setting `SCCACHE_IDLE_TIMEOUT=0`.
 ```
 
-#### Use an existing PyTorch installation
+##### Use an existing PyTorch installation
 
 There are scenarios where the PyTorch dependency cannot be easily installed via pip, e.g.:
 
@@ -179,7 +142,7 @@ pip install -r requirements-build.txt
 pip install -e . --no-build-isolation
 ```
 
-#### Use the local cutlass for compilation
+##### Use the local cutlass for compilation
 
 Currently, before starting the build process, vLLM fetches cutlass code from GitHub. However, there may be scenarios where you want to use a local version of cutlass instead.
 To achieve this, you can set the environment variable VLLM_CUTLASS_SRC_DIR to point to your local cutlass directory.
@@ -190,7 +153,7 @@ cd vllm
 VLLM_CUTLASS_SRC_DIR=/path/to/cutlass pip install -e .
 ```
 
-#### Troubleshooting
+##### Troubleshooting
 
 To avoid your system being overloaded, you can limit the number of compilation jobs
 to be run simultaneously, via the environment variable `MAX_JOBS`. For example:
@@ -224,7 +187,7 @@ nvcc --version # verify that nvcc is in your PATH
 ${CUDA_HOME}/bin/nvcc --version # verify that nvcc is in your CUDA_HOME
 ```
 
-### Unsupported OS build
+#### Unsupported OS build
 
 vLLM can fully run only on Linux but for development purposes, you can still build it on other systems (for example, macOS), allowing for imports and a more convenient development environment. The binaries will not be compiled and won't work on non-Linux systems.
 
@@ -234,3 +197,28 @@ Simply disable the `VLLM_TARGET_DEVICE` environment variable before installing:
 export VLLM_TARGET_DEVICE=empty
 pip install -e .
 ```
+
+## Set up using Docker
+
+### Pre-built images
+
+See <project:#deployment-docker-pre-built-image> for instructions on using the official Docker image.
+
+Another way to access the latest code is to use the docker images:
+
+```console
+export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch
+docker pull public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:${VLLM_COMMIT}
+```
+
+These docker images are used for CI and testing only, and they are not intended for production use. They will be expired after several days.
+
+The latest code can contain bugs and may not be stable. Please use it with caution.
+
+### Build image from source
+
+See <project:#deployment-docker-build-image-from-source> for instructions on building the Docker image.
+
+## Supported features
+
+See <project:#feature-x-hardware> compatibility matrix for feature support information.
diff --git a/docs/source/getting_started/installation/gpu/index.md b/docs/source/getting_started/installation/gpu/index.md
new file mode 100644
index 0000000000000..6c007382b2c3d
--- /dev/null
+++ b/docs/source/getting_started/installation/gpu/index.md
@@ -0,0 +1,300 @@
+# GPU
+
+vLLM is a Python library that supports the following GPU variants. Select your GPU type to see vendor specific instructions:
+
+::::{tab-set}
+:sync-group: device
+
+:::{tab-item} CUDA
+:sync: cuda
+
+```{include} cuda.inc.md
+:start-after: "# Installation"
+:end-before: "## Requirements"
+```
+
+:::
+
+:::{tab-item} ROCm
+:sync: rocm
+
+```{include} rocm.inc.md
+:start-after: "# Installation"
+:end-before: "## Requirements"
+```
+
+:::
+
+:::{tab-item} XPU
+:sync: xpu
+
+```{include} xpu.inc.md
+:start-after: "# Installation"
+:end-before: "## Requirements"
+```
+
+:::
+
+::::
+
+## Requirements
+
+- OS: Linux
+- Python: 3.9 -- 3.12
+
+::::{tab-set}
+:sync-group: device
+
+:::{tab-item} CUDA
+:sync: cuda
+
+```{include} cuda.inc.md
+:start-after: "## Requirements"
+:end-before: "## Set up using Python"
+```
+
+:::
+
+:::{tab-item} ROCm
+:sync: rocm
+
+```{include} rocm.inc.md
+:start-after: "## Requirements"
+:end-before: "## Set up using Python"
+```
+
+:::
+
+:::{tab-item} XPU
+:sync: xpu
+
+```{include} xpu.inc.md
+:start-after: "## Requirements"
+:end-before: "## Set up using Python"
+```
+
+:::
+
+::::
+
+## Set up using Python
+
+### Create a new Python environment
+
+```{include} ../python_env_setup.inc.md
+```
+
+::::{tab-set}
+:sync-group: device
+
+:::{tab-item} CUDA
+:sync: cuda
+
+```{include} cuda.inc.md
+:start-after: "## Create a new Python environment"
+:end-before: "### Pre-built wheels"
+```
+
+:::
+
+:::{tab-item} ROCm
+:sync: rocm
+
+There is no extra information on creating a new Python environment for this device.
+
+:::
+
+:::{tab-item} XPU
+:sync: xpu
+
+There is no extra information on creating a new Python environment for this device.
+
+:::
+
+::::
+
+### Pre-built wheels
+
+::::{tab-set}
+:sync-group: device
+
+:::{tab-item} CUDA
+:sync: cuda
+
+```{include} cuda.inc.md
+:start-after: "### Pre-built wheels"
+:end-before: "### Build wheel from source"
+```
+
+:::
+
+:::{tab-item} ROCm
+:sync: rocm
+
+```{include} rocm.inc.md
+:start-after: "### Pre-built wheels"
+:end-before: "### Build wheel from source"
+```
+
+:::
+
+:::{tab-item} XPU
+:sync: xpu
+
+```{include} xpu.inc.md
+:start-after: "### Pre-built wheels"
+:end-before: "### Build wheel from source"
+```
+
+:::
+
+::::
+
+(build-from-source)=
+
+### Build wheel from source
+
+::::{tab-set}
+:sync-group: device
+
+:::{tab-item} CUDA
+:sync: cuda
+
+```{include} cuda.inc.md
+:start-after: "### Build wheel from source"
+:end-before: "## Set up using Docker"
+```
+
+:::
+
+:::{tab-item} ROCm
+:sync: rocm
+
+```{include} rocm.inc.md
+:start-after: "### Build wheel from source"
+:end-before: "## Set up using Docker"
+```
+
+:::
+
+:::{tab-item} XPU
+:sync: xpu
+
+```{include} xpu.inc.md
+:start-after: "### Build wheel from source"
+:end-before: "## Set up using Docker"
+```
+
+:::
+
+::::
+
+## Set up using Docker
+
+### Pre-built images
+
+::::{tab-set}
+:sync-group: device
+
+:::{tab-item} CUDA
+:sync: cuda
+
+```{include} cuda.inc.md
+:start-after: "### Pre-built images"
+:end-before: "### Build image from source"
+```
+
+:::
+
+:::{tab-item} ROCm
+:sync: rocm
+
+```{include} rocm.inc.md
+:start-after: "### Pre-built images"
+:end-before: "### Build image from source"
+```
+
+:::
+
+:::{tab-item} XPU
+:sync: xpu
+
+```{include} xpu.inc.md
+:start-after: "### Pre-built images"
+:end-before: "### Build image from source"
+```
+
+:::
+
+::::
+
+### Build image from source
+
+::::{tab-set}
+:sync-group: device
+
+:::{tab-item} CUDA
+:sync: cuda
+
+```{include} cuda.inc.md
+:start-after: "### Build image from source"
+:end-before: "## Supported features"
+```
+
+:::
+
+:::{tab-item} ROCm
+:sync: rocm
+
+```{include} rocm.inc.md
+:start-after: "### Build image from source"
+:end-before: "## Supported features"
+```
+
+:::
+
+:::{tab-item} XPU
+:sync: xpu
+
+```{include} xpu.inc.md
+:start-after: "### Build image from source"
+:end-before: "## Supported features"
+```
+
+:::
+
+::::
+
+## Supported features
+
+::::{tab-set}
+:sync-group: device
+
+:::{tab-item} CUDA
+:sync: cuda
+
+```{include} cuda.inc.md
+:start-after: "## Supported features"
+```
+
+:::
+
+:::{tab-item} ROCm
+:sync: rocm
+
+```{include} rocm.inc.md
+:start-after: "## Supported features"
+```
+
+:::
+
+:::{tab-item} XPU
+:sync: xpu
+
+```{include} xpu.inc.md
+:start-after: "## Supported features"
+```
+
+:::
+
+::::
diff --git a/docs/source/getting_started/installation/gpu-rocm.md b/docs/source/getting_started/installation/gpu/rocm.inc.md
similarity index 87%
rename from docs/source/getting_started/installation/gpu-rocm.md
rename to docs/source/getting_started/installation/gpu/rocm.inc.md
index a8971bb96248c..f6f9d3c303f89 100644
--- a/docs/source/getting_started/installation/gpu-rocm.md
+++ b/docs/source/getting_started/installation/gpu/rocm.inc.md
@@ -1,82 +1,19 @@
-(installation-rocm)=
-
-# Installation for ROCm
+# Installation
 
 vLLM supports AMD GPUs with ROCm 6.2.
 
 ## Requirements
 
-- OS: Linux
-- Python: 3.9 -- 3.12
 - GPU: MI200s (gfx90a), MI300 (gfx942), Radeon RX 7900 series (gfx1100)
 - ROCm 6.2
 
-Installation options:
-
-1. [Build from source with docker](#build-from-source-docker-rocm)
-2. [Build from source](#build-from-source-rocm)
-
-(build-from-source-docker-rocm)=
-
-## Option 1: Build from source with docker (recommended)
-
-You can build and install vLLM from source.
-
-First, build a docker image from <gh-file:Dockerfile.rocm> and launch a docker container from the image.
-It is important that the user kicks off the docker build using buildkit. Either the user put DOCKER_BUILDKIT=1 as environment variable when calling docker build command, or the user needs to setup buildkit in the docker daemon configuration /etc/docker/daemon.json as follows and restart the daemon:
-
-```console
-{
-    "features": {
-        "buildkit": true
-    }
-}
-```
-
-<gh-file:Dockerfile.rocm> uses ROCm 6.2 by default, but also supports ROCm 5.7, 6.0 and 6.1 in older vLLM branches.
-It provides flexibility to customize the build of docker image using the following arguments:
-
-- `BASE_IMAGE`: specifies the base image used when running `docker build`, specifically the PyTorch on ROCm base image.
-- `BUILD_FA`: specifies whether to build CK flash-attention. The default is 1. For [Radeon RX 7900 series (gfx1100)](https://rocm.docs.amd.com/projects/radeon/en/latest/index.html), this should be set to 0 before flash-attention supports this target.
-- `FX_GFX_ARCHS`: specifies the GFX architecture that is used to build CK flash-attention, for example, `gfx90a;gfx942` for MI200 and MI300. The default is `gfx90a;gfx942`
-- `FA_BRANCH`: specifies the branch used to build the CK flash-attention in [ROCm's flash-attention repo](https://github.com/ROCmSoftwarePlatform/flash-attention). The default is `ae7928c`
-- `BUILD_TRITON`: specifies whether to build triton flash-attention. The default value is 1.
-
-Their values can be passed in when running `docker build` with `--build-arg` options.
-
-To build vllm on ROCm 6.2 for MI200 and MI300 series, you can use the default:
-
-```console
-DOCKER_BUILDKIT=1 docker build -f Dockerfile.rocm -t vllm-rocm .
-```
+## Set up using Python
 
-To build vllm on ROCm 6.2 for Radeon RX7900 series (gfx1100), you should specify `BUILD_FA` as below:
+### Pre-built wheels
 
-```console
-DOCKER_BUILDKIT=1 docker build --build-arg BUILD_FA="0" -f Dockerfile.rocm -t vllm-rocm .
-```
+Currently, there are no pre-built ROCm wheels.
 
-To run the above docker image `vllm-rocm`, use the below command:
-
-```console
-$ docker run -it \
-   --network=host \
-   --group-add=video \
-   --ipc=host \
-   --cap-add=SYS_PTRACE \
-   --security-opt seccomp=unconfined \
-   --device /dev/kfd \
-   --device /dev/dri \
-   -v <path/to/model>:/app/model \
-   vllm-rocm \
-   bash
-```
-
-Where the `<path/to/model>` is the location where the model is stored, for example, the weights for llama2 or llama3 models.
-
-(build-from-source-rocm)=
-
-## Option 2: Build from source
+### Build wheel from source
 
 0. Install prerequisites (skip if you are already in an environment/docker with the following installed):
 
@@ -157,7 +94,73 @@ Where the `<path/to/model>` is the location where the model is stored, for examp
     - The ROCm version of PyTorch, ideally, should match the ROCm driver version.
     ```
 
-    ```{tip}
-    - For MI300x (gfx942) users, to achieve optimal performance, please refer to [MI300x tuning guide](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/index.html) for performance optimization and tuning tips on system and workflow level.
-      For vLLM, please refer to [vLLM performance optimization](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/workload.html#vllm-performance-optimization).
-    ```
+```{tip}
+- For MI300x (gfx942) users, to achieve optimal performance, please refer to [MI300x tuning guide](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/index.html) for performance optimization and tuning tips on system and workflow level.
+  For vLLM, please refer to [vLLM performance optimization](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/workload.html#vllm-performance-optimization).
+```
+
+## Set up using Docker
+
+### Pre-built images
+
+Currently, there are no pre-built ROCm images.
+
+### Build image from source
+
+Building the Docker image from source is the recommended way to use vLLM with ROCm.
+
+First, build a docker image from <gh-file:Dockerfile.rocm> and launch a docker container from the image.
+It is important that the user kicks off the docker build using buildkit. Either the user put DOCKER_BUILDKIT=1 as environment variable when calling docker build command, or the user needs to setup buildkit in the docker daemon configuration /etc/docker/daemon.json as follows and restart the daemon:
+
+```console
+{
+    "features": {
+        "buildkit": true
+    }
+}
+```
+
+<gh-file:Dockerfile.rocm> uses ROCm 6.2 by default, but also supports ROCm 5.7, 6.0 and 6.1 in older vLLM branches.
+It provides flexibility to customize the build of docker image using the following arguments:
+
+- `BASE_IMAGE`: specifies the base image used when running `docker build`, specifically the PyTorch on ROCm base image.
+- `BUILD_FA`: specifies whether to build CK flash-attention. The default is 1. For [Radeon RX 7900 series (gfx1100)](https://rocm.docs.amd.com/projects/radeon/en/latest/index.html), this should be set to 0 before flash-attention supports this target.
+- `FX_GFX_ARCHS`: specifies the GFX architecture that is used to build CK flash-attention, for example, `gfx90a;gfx942` for MI200 and MI300. The default is `gfx90a;gfx942`
+- `FA_BRANCH`: specifies the branch used to build the CK flash-attention in [ROCm's flash-attention repo](https://github.com/ROCmSoftwarePlatform/flash-attention). The default is `ae7928c`
+- `BUILD_TRITON`: specifies whether to build triton flash-attention. The default value is 1.
+
+Their values can be passed in when running `docker build` with `--build-arg` options.
+
+To build vllm on ROCm 6.2 for MI200 and MI300 series, you can use the default:
+
+```console
+DOCKER_BUILDKIT=1 docker build -f Dockerfile.rocm -t vllm-rocm .
+```
+
+To build vllm on ROCm 6.2 for Radeon RX7900 series (gfx1100), you should specify `BUILD_FA` as below:
+
+```console
+DOCKER_BUILDKIT=1 docker build --build-arg BUILD_FA="0" -f Dockerfile.rocm -t vllm-rocm .
+```
+
+To run the above docker image `vllm-rocm`, use the below command:
+
+```console
+docker run -it \
+   --network=host \
+   --group-add=video \
+   --ipc=host \
+   --cap-add=SYS_PTRACE \
+   --security-opt seccomp=unconfined \
+   --device /dev/kfd \
+   --device /dev/dri \
+   -v <path/to/model>:/app/model \
+   vllm-rocm \
+   bash
+```
+
+Where the `<path/to/model>` is the location where the model is stored, for example, the weights for llama2 or llama3 models.
+
+## Supported features
+
+See <project:#feature-x-hardware> compatibility matrix for feature support information.
diff --git a/docs/source/getting_started/installation/xpu.md b/docs/source/getting_started/installation/gpu/xpu.inc.md
similarity index 80%
rename from docs/source/getting_started/installation/xpu.md
rename to docs/source/getting_started/installation/gpu/xpu.inc.md
index 73758f37cf0f6..577986eba74fd 100644
--- a/docs/source/getting_started/installation/xpu.md
+++ b/docs/source/getting_started/installation/gpu/xpu.inc.md
@@ -1,40 +1,19 @@
-(installation-xpu)=
-
-# Installation for XPUs
+# Installation
 
 vLLM initially supports basic model inferencing and serving on Intel GPU platform.
 
-Table of contents:
-
-1. [Requirements](#xpu-backend-requirements)
-2. [Quick start using Dockerfile](#xpu-backend-quick-start-dockerfile)
-3. [Build from source](#build-xpu-backend-from-source)
-
-(xpu-backend-requirements)=
-
 ## Requirements
 
-- OS: Linux
 - Supported Hardware: Intel Data Center GPU, Intel ARC GPU
 - OneAPI requirements: oneAPI 2024.2
 
-(xpu-backend-quick-start-dockerfile)=
+## Set up using Python
 
-## Quick start using Dockerfile
-
-```console
-$ docker build -f Dockerfile.xpu -t vllm-xpu-env --shm-size=4g .
-$ docker run -it \
-             --rm \
-             --network=host \
-             --device /dev/dri \
-             -v /dev/dri/by-path:/dev/dri/by-path \
-             vllm-xpu-env
-```
+### Pre-built wheels
 
-(build-xpu-backend-from-source)=
+Currently, there are no pre-built XPU wheels.
 
-## Build from source
+### Build wheel from source
 
 - First, install required driver and intel OneAPI 2024.2 or later.
 - Second, install Python packages for vLLM XPU backend building:
@@ -56,7 +35,25 @@ VLLM_TARGET_DEVICE=xpu python setup.py install
   type will be supported in the future.
 ```
 
-## Distributed inference and serving
+## Set up using Docker
+
+### Pre-built images
+
+Currently, there are no pre-built XPU images.
+
+### Build image from source
+
+```console
+$ docker build -f Dockerfile.xpu -t vllm-xpu-env --shm-size=4g .
+$ docker run -it \
+             --rm \
+             --network=host \
+             --device /dev/dri \
+             -v /dev/dri/by-path:/dev/dri/by-path \
+             vllm-xpu-env
+```
+
+## Supported features
 
 XPU platform supports tensor-parallel inference/serving and also supports pipeline parallel as a beta feature for online serving. We requires Ray as the distributed runtime backend. For example, a reference execution likes following:
 
diff --git a/docs/source/getting_started/installation/index.md b/docs/source/getting_started/installation/index.md
index 0ebadca2ccec9..bc1d268bf0c7e 100644
--- a/docs/source/getting_started/installation/index.md
+++ b/docs/source/getting_started/installation/index.md
@@ -7,14 +7,7 @@ vLLM supports the following hardware platforms:
 ```{toctree}
 :maxdepth: 1
 
-gpu-cuda
-gpu-rocm
-cpu-x86
-cpu-arm
-cpu-apple
-hpu-gaudi
-tpu
-xpu
-openvino
-neuron
+gpu/index
+cpu/index
+ai_accelerator/index
 ```
diff --git a/docs/source/getting_started/installation/python_env_setup.inc.md b/docs/source/getting_started/installation/python_env_setup.inc.md
new file mode 100644
index 0000000000000..25cfac5f58aa7
--- /dev/null
+++ b/docs/source/getting_started/installation/python_env_setup.inc.md
@@ -0,0 +1,19 @@
+You can create a new Python environment using `conda`:
+
+```console
+# (Recommended) Create a new conda environment.
+conda create -n myenv python=3.12 -y
+conda activate myenv
+```
+
+```{note}
+[PyTorch has deprecated the conda release channel](https://github.com/pytorch/pytorch/issues/138506). If you use `conda`, please only use it to create Python environment rather than installing packages.
+```
+
+Or you can create a new Python environment using [uv](https://docs.astral.sh/uv/), a very fast Python environment manager. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment using the following command:
+
+```console
+# (Recommended) Create a new uv environment. Use `--seed` to install `pip` and `setuptools` in the environment.
+uv venv myenv --python 3.12 --seed
+source myenv/bin/activate
+```

From 458e63a2c6b18e7febfa30cecb59461f96fbe324 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Mon, 13 Jan 2025 20:59:09 +0800
Subject: [PATCH 200/462] [platform] add device_control env var (#12009)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 vllm/platforms/cuda.py      |  1 +
 vllm/platforms/hpu.py       |  1 +
 vllm/platforms/interface.py | 10 ++++++++++
 vllm/platforms/neuron.py    |  1 +
 vllm/platforms/rocm.py      |  2 ++
 vllm/platforms/tpu.py       |  1 +
 vllm/platforms/xpu.py       |  1 +
 7 files changed, 17 insertions(+)

diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index 3f77ec50ed31f..b6a6c461369f9 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -78,6 +78,7 @@ class CudaPlatformBase(Platform):
     device_type: str = "cuda"
     dispatch_key: str = "CUDA"
     ray_device_key: str = "GPU"
+    device_control_env_var: str = "CUDA_VISIBLE_DEVICES"
 
     @classmethod
     def get_device_capability(cls,
diff --git a/vllm/platforms/hpu.py b/vllm/platforms/hpu.py
index 0acb2804a5f66..b579ebf494bdc 100644
--- a/vllm/platforms/hpu.py
+++ b/vllm/platforms/hpu.py
@@ -20,6 +20,7 @@ class HpuPlatform(Platform):
     device_type: str = "hpu"
     dispatch_key: str = "HPU"
     ray_device_key: str = "HPU"
+    device_control_env_var: str = "HABANA_VISIBLE_MODULES"
 
     @classmethod
     def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
index ec917f75689dd..afa9daa9c98a7 100644
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -78,20 +78,30 @@ class Platform:
     _enum: PlatformEnum
     device_name: str
     device_type: str
+
     # available dispatch keys:
     # check https://github.com/pytorch/pytorch/blob/313dac6c1ca0fa0cde32477509cce32089f8532a/torchgen/model.py#L134 # noqa
     # use "CPU" as a fallback for platforms not registered in PyTorch
     dispatch_key: str = "CPU"
+
     # available ray device keys:
     # https://github.com/ray-project/ray/blob/10ba5adadcc49c60af2c358a33bb943fb491a171/python/ray/_private/ray_constants.py#L438 # noqa
     # empty string means the device does not support ray
     ray_device_key: str = ""
+
+    # platform-agnostic way to specify the device control environment variable,
+    # .e.g. CUDA_VISIBLE_DEVICES for CUDA.
+    # hint: search for "get_visible_accelerator_ids_env_var" in
+    # https://github.com/ray-project/ray/tree/master/python/ray/_private/accelerators # noqa
+    device_control_env_var: str = "VLLM_DEVICE_CONTROL_ENV_VAR_PLACEHOLDER"
+
     # The torch.compile backend for compiling simple and
     # standalone functions. The default value is "inductor" to keep
     # the same behavior as PyTorch.
     # NOTE: for the forward part of the model, vLLM has another separate
     # compilation strategy.
     simple_compile_backend: str = "inductor"
+
     supported_quantization: list[str] = []
 
     def is_cuda(self) -> bool:
diff --git a/vllm/platforms/neuron.py b/vllm/platforms/neuron.py
index 7f4a867b32ba1..0696f73cc17b4 100644
--- a/vllm/platforms/neuron.py
+++ b/vllm/platforms/neuron.py
@@ -18,6 +18,7 @@ class NeuronPlatform(Platform):
     device_type: str = "neuron"
     ray_device_key: str = "neuron_cores"
     supported_quantization: list[str] = ["neuron_quant"]
+    device_control_env_var: str = "NEURON_RT_VISIBLE_CORES"
 
     @classmethod
     def get_device_name(cls, device_id: int = 0) -> str:
diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index f12e948113723..7f1e8aef528a6 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -65,6 +65,8 @@ class RocmPlatform(Platform):
     device_type: str = "cuda"
     dispatch_key: str = "CUDA"
     ray_device_key: str = "GPU"
+    # rocm shares the same device control env var as CUDA
+    device_control_env_var: str = "CUDA_VISIBLE_DEVICES"
 
     supported_quantization: list[str] = [
         "awq", "gptq", "fp8", "compressed_tensors", "compressed-tensors",
diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py
index 460eb170bba34..ff9487daac7a7 100644
--- a/vllm/platforms/tpu.py
+++ b/vllm/platforms/tpu.py
@@ -20,6 +20,7 @@ class TpuPlatform(Platform):
     device_type: str = "tpu"
     dispatch_key: str = "XLA"
     ray_device_key: str = "TPU"
+    device_control_env_var: str = "TPU_VISIBLE_CHIPS"
 
     supported_quantization: list[str] = [
         "tpu_int8", "compressed-tensors", "compressed_tensors"
diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py
index cb74f79b31794..f34376b44e689 100644
--- a/vllm/platforms/xpu.py
+++ b/vllm/platforms/xpu.py
@@ -22,6 +22,7 @@ class XPUPlatform(Platform):
     # Intel XPU's device key is "GPU" for Ray.
     # see https://github.com/ray-project/ray/blob/6a5eb5865eeb9ccf058a79b44f107e327e360673/python/ray/_private/accelerators/intel_gpu.py#L20 # noqa: E501
     ray_device_key: str = "GPU"
+    device_control_env_var: str = "ONEAPI_DEVICE_SELECTOR"
 
     @classmethod
     def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,

From a7d59688fb75827db4316c24a057ac6097114bd3 Mon Sep 17 00:00:00 2001
From: Shanshan Shen <467638484@qq.com>
Date: Mon, 13 Jan 2025 21:12:10 +0800
Subject: [PATCH 201/462] [Platform] Move get_punica_wrapper() function to
 Platform (#11516)

Signed-off-by: Shanshan Shen <467638484@qq.com>
---
 vllm/lora/punica_wrapper/punica_selector.py | 26 +++++++--------------
 vllm/platforms/cpu.py                       |  4 ++++
 vllm/platforms/cuda.py                      |  4 ++++
 vllm/platforms/hpu.py                       |  4 ++++
 vllm/platforms/interface.py                 |  7 ++++++
 vllm/platforms/rocm.py                      |  4 ++++
 6 files changed, 32 insertions(+), 17 deletions(-)

diff --git a/vllm/lora/punica_wrapper/punica_selector.py b/vllm/lora/punica_wrapper/punica_selector.py
index 9f1606e672dea..a293224651992 100644
--- a/vllm/lora/punica_wrapper/punica_selector.py
+++ b/vllm/lora/punica_wrapper/punica_selector.py
@@ -1,5 +1,6 @@
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
+from vllm.utils import resolve_obj_by_qualname
 
 from .punica_base import PunicaWrapperBase
 
@@ -7,20 +8,11 @@
 
 
 def get_punica_wrapper(*args, **kwargs) -> PunicaWrapperBase:
-    if current_platform.is_cuda_alike():
-        # Lazy import to avoid ImportError
-        from vllm.lora.punica_wrapper.punica_gpu import PunicaWrapperGPU
-        logger.info_once("Using PunicaWrapperGPU.")
-        return PunicaWrapperGPU(*args, **kwargs)
-    elif current_platform.is_cpu():
-        # Lazy import to avoid ImportError
-        from vllm.lora.punica_wrapper.punica_cpu import PunicaWrapperCPU
-        logger.info_once("Using PunicaWrapperCPU.")
-        return PunicaWrapperCPU(*args, **kwargs)
-    elif current_platform.is_hpu():
-        # Lazy import to avoid ImportError
-        from vllm.lora.punica_wrapper.punica_hpu import PunicaWrapperHPU
-        logger.info_once("Using PunicaWrapperHPU.")
-        return PunicaWrapperHPU(*args, **kwargs)
-    else:
-        raise NotImplementedError
+    punica_wrapper_qualname = current_platform.get_punica_wrapper()
+    punica_wrapper_cls = resolve_obj_by_qualname(punica_wrapper_qualname)
+    punica_wrapper = punica_wrapper_cls(*args, **kwargs)
+    assert punica_wrapper is not None, \
+        "the punica_wrapper_qualname(" + punica_wrapper_qualname + ") is wrong."
+    logger.info_once("Using " + punica_wrapper_qualname.rsplit(".", 1)[1] +
+                     ".")
+    return punica_wrapper
diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py
index eb3e269cac285..4d3b84fea887f 100644
--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
@@ -109,3 +109,7 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
     def is_pin_memory_available(cls) -> bool:
         logger.warning("Pin memory is not supported on CPU.")
         return False
+
+    @classmethod
+    def get_punica_wrapper(cls) -> str:
+        return "vllm.lora.punica_wrapper.punica_cpu.PunicaWrapperCPU"
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index b6a6c461369f9..80cefcb492531 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -218,6 +218,10 @@ def get_attn_backend_cls(cls, selected_backend, head_size, dtype,
         logger.info("Using Flash Attention backend.")
         return "vllm.attention.backends.flash_attn.FlashAttentionBackend"
 
+    @classmethod
+    def get_punica_wrapper(cls) -> str:
+        return "vllm.lora.punica_wrapper.punica_gpu.PunicaWrapperGPU"
+
 
 # NVML utils
 # Note that NVML is not affected by `CUDA_VISIBLE_DEVICES`,
diff --git a/vllm/platforms/hpu.py b/vllm/platforms/hpu.py
index b579ebf494bdc..242c2c127979a 100644
--- a/vllm/platforms/hpu.py
+++ b/vllm/platforms/hpu.py
@@ -63,3 +63,7 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
     def is_pin_memory_available(cls):
         logger.warning("Pin memory is not supported on HPU.")
         return False
+
+    @classmethod
+    def get_punica_wrapper(cls) -> str:
+        return "vllm.lora.punica_wrapper.punica_hpu.PunicaWrapperHPU"
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
index afa9daa9c98a7..3c2ec9636df91 100644
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -276,6 +276,13 @@ def is_pin_memory_available(cls) -> bool:
             return False
         return True
 
+    @classmethod
+    def get_punica_wrapper(cls) -> str:
+        """
+        Return the punica wrapper for current platform.
+        """
+        raise NotImplementedError
+
 
 class UnspecifiedPlatform(Platform):
     _enum = PlatformEnum.UNSPECIFIED
diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index 7f1e8aef528a6..43105d7855e79 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -153,3 +153,7 @@ def verify_quantization(cls, quant: str) -> None:
                 "Using AWQ quantization with ROCm, but VLLM_USE_TRITON_AWQ"
                 " is not set, enabling VLLM_USE_TRITON_AWQ.")
         envs.VLLM_USE_TRITON_AWQ = True
+
+    @classmethod
+    def get_punica_wrapper(cls) -> str:
+        return "vllm.lora.punica_wrapper.punica_gpu.PunicaWrapperGPU"

From c6db21313cef3f1f9aa5714efba0d60b5f1a8dad Mon Sep 17 00:00:00 2001
From: elijah <30852919+e1ijah1@users.noreply.github.com>
Date: Mon, 13 Jan 2025 23:22:07 +0800
Subject: [PATCH 202/462] bugfix: Fix signature mismatch in benchmark's
 `get_tokenizer` function (#11982)

Signed-off-by: elijah <f1renze.142857@gmail.com>
---
 benchmarks/backend_request_func.py | 27 ++++++++++++++++++++++++---
 1 file changed, 24 insertions(+), 3 deletions(-)

diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py
index b67849038cf0d..9d71e4ecc4a37 100644
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -417,14 +417,35 @@ def get_model(pretrained_model_name_or_path: str) -> str:
 
 
 def get_tokenizer(
-    pretrained_model_name_or_path: str, trust_remote_code: bool
+    pretrained_model_name_or_path: str,
+    tokenizer_mode: str = "auto",
+    trust_remote_code: bool = False,
+    **kwargs,
 ) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
     if pretrained_model_name_or_path is not None and not os.path.exists(
             pretrained_model_name_or_path):
         pretrained_model_name_or_path = get_model(
             pretrained_model_name_or_path)
-    return AutoTokenizer.from_pretrained(pretrained_model_name_or_path,
-                                         trust_remote_code=trust_remote_code)
+    if tokenizer_mode == "slow":
+        if kwargs.get("use_fast", False):
+            raise ValueError(
+                "Cannot use the fast tokenizer in slow tokenizer mode.")
+        kwargs["use_fast"] = False
+    if tokenizer_mode == "mistral":
+        try:
+            from vllm.transformers_utils.tokenizer import MistralTokenizer
+        except ImportError as e:
+            raise ImportError("MistralTokenizer requires vllm package.\n"
+                              "Please install it with `pip install vllm` "
+                              "to use mistral tokenizer mode.") from e
+        return MistralTokenizer.from_pretrained(
+            str(pretrained_model_name_or_path))
+    else:
+        return AutoTokenizer.from_pretrained(
+            pretrained_model_name_or_path,
+            trust_remote_code=trust_remote_code,
+            **kwargs,
+        )
 
 
 ASYNC_REQUEST_FUNCS = {

From 289b5191d5bd16c7b2e08cb19434f9f188393ac5 Mon Sep 17 00:00:00 2001
From: Yikun Jiang <yikunkero@gmail.com>
Date: Tue, 14 Jan 2025 01:23:59 +0800
Subject: [PATCH 203/462] [Doc] Fix build from source and installation link in
 README.md (#12013)

Signed-off-by: Yikun <yikunkero@gmail.com>
---
 README.md | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 67c557bfe13a9..e24e1a227cf40 100644
--- a/README.md
+++ b/README.md
@@ -72,16 +72,16 @@ Find the full list of supported models [here](https://docs.vllm.ai/en/latest/mod
 
 ## Getting Started
 
-Install vLLM with `pip` or [from source](https://vllm.readthedocs.io/en/latest/getting_started/installation.html#build-from-source):
+Install vLLM with `pip` or [from source](https://docs.vllm.ai/en/latest/getting_started/installation/gpu/index.html#build-wheel-from-source):
 
 ```bash
 pip install vllm
 ```
 
-Visit our [documentation](https://vllm.readthedocs.io/en/latest/) to learn more.
-- [Installation](https://vllm.readthedocs.io/en/latest/getting_started/installation.html)
-- [Quickstart](https://vllm.readthedocs.io/en/latest/getting_started/quickstart.html)
-- [List of Supported Models](https://vllm.readthedocs.io/en/latest/models/supported_models.html)
+Visit our [documentation](https://docs.vllm.ai/en/latest/) to learn more.
+- [Installation](https://docs.vllm.ai/en/latest/getting_started/installation/index.html)
+- [Quickstart](https://docs.vllm.ai/en/latest/getting_started/quickstart.html)
+- [List of Supported Models](https://docs.vllm.ai/en/latest/models/supported_models.html)
 
 ## Contributing
 

From f35ec461fc655a50abc5146fa27a79fdf42f55a1 Mon Sep 17 00:00:00 2001
From: Steve Luo <36296769+SunflowerAries@users.noreply.github.com>
Date: Tue, 14 Jan 2025 04:43:51 +0800
Subject: [PATCH 204/462] [Bugfix] Fix deepseekv3 gate bias error (#12002)

Signed-off-by: mgoin <michael@neuralmagic.com>
Co-authored-by: mgoin <michael@neuralmagic.com>
---
 .../layers/fused_moe/fused_moe.py             | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index 3ea6217d7c0ef..308c1d6ac6db1 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -497,7 +497,10 @@ def grouped_topk(hidden_states: torch.Tensor,
         raise ValueError(f"Unsupported scoring function: {scoring_func}")
 
     if e_score_correction_bias is not None:
-        scores.add_(e_score_correction_bias.unsqueeze(0))
+        # Store original scores before applying correction bias. We use biased
+        # scores for expert selection but original scores for routing weights
+        original_scores = scores
+        scores = scores + e_score_correction_bias.unsqueeze(0)
 
     num_token = scores.shape[0]
     group_scores = scores.view(num_token, num_expert_group,
@@ -510,10 +513,16 @@ def grouped_topk(hidden_states: torch.Tensor,
         num_token, num_expert_group,
         scores.shape[-1] // num_expert_group).reshape(num_token, -1)  # [n, e]
     tmp_scores = scores.masked_fill(~score_mask.bool(), 0.0)  # [n, e]
-    topk_weights, topk_ids = torch.topk(tmp_scores,
-                                        k=topk,
-                                        dim=-1,
-                                        sorted=False)
+
+    if e_score_correction_bias is not None:
+        topk_ids = torch.topk(tmp_scores, k=topk, dim=-1, sorted=False)[1]
+        # Use original unbiased scores for the routing weights
+        topk_weights = original_scores.gather(1, topk_ids)
+    else:
+        topk_weights, topk_ids = torch.topk(tmp_scores,
+                                            k=topk,
+                                            dim=-1,
+                                            sorted=False)
 
     if renormalize:
         topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)

From 1a401252b5c1cb6cab348531281c5bd340257733 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Mon, 13 Jan 2025 17:24:36 -0800
Subject: [PATCH 205/462] [Docs] Add Sky Computing Lab to project intro
 (#12019)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
---
 README.md            | 2 ++
 docs/source/index.md | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/README.md b/README.md
index e24e1a227cf40..658b9fb6edd8c 100644
--- a/README.md
+++ b/README.md
@@ -38,6 +38,8 @@ The first vLLM meetup in 2025 is happening on January 22nd, Wednesday, with Goog
 ## About
 vLLM is a fast and easy-to-use library for LLM inference and serving.
 
+Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at UC Berkeley, vLLM has evloved into a community-driven project with contributions from both academia and industry.
+
 vLLM is fast with:
 
 - State-of-the-art serving throughput
diff --git a/docs/source/index.md b/docs/source/index.md
index 8f9493d77186e..d7a1117df9c27 100644
--- a/docs/source/index.md
+++ b/docs/source/index.md
@@ -23,6 +23,8 @@
 
 vLLM is a fast and easy-to-use library for LLM inference and serving.
 
+Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at UC Berkeley, vLLM has evloved into a community-driven project with contributions from both academia and industry.
+
 vLLM is fast with:
 
 - State-of-the-art serving throughput

From 078da319033a32304bb3297092c9ec34c52d598b Mon Sep 17 00:00:00 2001
From: Konrad Zawora <kzawora@habana.ai>
Date: Tue, 14 Jan 2025 04:04:18 +0100
Subject: [PATCH 206/462] [HPU][Bugfix] set_forward_context and CI test
 execution (#12014)

Signed-off-by: Konrad Zawora <kzawora@habana.ai>
---
 .buildkite/run-hpu-test.sh      |  7 +++++--
 Dockerfile.hpu                  |  2 +-
 vllm/worker/hpu_model_runner.py | 32 +++++++++++++++++---------------
 3 files changed, 23 insertions(+), 18 deletions(-)

diff --git a/.buildkite/run-hpu-test.sh b/.buildkite/run-hpu-test.sh
index 8f3b08212fd6a..bdbb5195ce95f 100644
--- a/.buildkite/run-hpu-test.sh
+++ b/.buildkite/run-hpu-test.sh
@@ -8,9 +8,12 @@ set -ex
 docker build -t hpu-test-env -f Dockerfile.hpu .
 
 # Setup cleanup
+EXITCODE=1
 remove_docker_container() { docker rm -f hpu-test || true; }
-trap remove_docker_container EXIT
+remove_docker_container_and_exit() { remove_docker_container; exit $EXITCODE; }
+trap remove_docker_container_and_exit EXIT
 remove_docker_container
 
 # Run the image and launch offline inference
-docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic.py
\ No newline at end of file
+docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic.py
+EXITCODE=$?
diff --git a/Dockerfile.hpu b/Dockerfile.hpu
index 87e0c1a6a934e..66cf68c32f2ca 100644
--- a/Dockerfile.hpu
+++ b/Dockerfile.hpu
@@ -1,4 +1,4 @@
-FROM vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
+FROM vault.habana.ai/gaudi-docker/1.19.1/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest
 
 COPY ./ /workspace/vllm
 
diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index 3e5105f3b62e3..260ffaf27f9a1 100644
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -289,12 +289,14 @@ def forward_hook(module, args, output):
 
 class HpuModelAdapter:
 
-    def __init__(self, model, block_size, dtype, enforce_eager):
+    def __init__(self, model, vllm_config):
         self.model = model
         self.prefill_use_fusedsdpa = os.getenv('VLLM_PROMPT_USE_FUSEDSDPA',
                                                '0').lower() in ['1', 'true']
-        self.block_size = block_size
-        self.dtype = dtype
+        self.vllm_config = vllm_config
+        self.block_size = vllm_config.cache_config.block_size
+        self.dtype = vllm_config.model_config.dtype
+        enforce_eager = vllm_config.model_config.enforce_eager
         if not htorch.utils.internal.is_lazy() and not enforce_eager:
             self.model = torch.compile(self.model,
                                        backend='hpu_backend',
@@ -353,14 +355,20 @@ def forward(self, *args, **kwargs):
         selected_token_indices = kwargs.pop('selected_token_indices')
         if 'warmup_mode' in kwargs:
             kwargs.pop('warmup_mode')
+        virtual_engine = 0
+        if 'virtual_engine' in kwargs:
+            virtual_engine = kwargs.pop('virtual_engine')
         input_ids = kwargs['input_ids']
         kwargs['attn_metadata'] = self._update_metadata(
             kwargs['attn_metadata'], input_ids.size(0), input_ids.size(1),
             input_ids.device, self.dtype)
         LoraMask.setLoraMask(kwargs.pop('lora_mask'))
-        hidden_states = self.model(*args, **kwargs)
-        hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
-        hidden_states = hidden_states.index_select(0, selected_token_indices)
+        with set_forward_context(kwargs['attn_metadata'], self.vllm_config,
+                                 virtual_engine):
+            hidden_states = self.model(*args, **kwargs)
+            hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
+            hidden_states = hidden_states.index_select(0,
+                                                       selected_token_indices)
         return hidden_states
 
     def compute_logits(self, *args, **kwargs):
@@ -660,10 +668,7 @@ def load_model(self) -> None:
 
             with HabanaMemoryProfiler() as m_wrap:
                 self.model = _maybe_wrap_in_hpu_graph(
-                    self.model,
-                    self.block_size,
-                    dtype=self.model_config.dtype,
-                    enforce_eager=self.enforce_eager)
+                    self.model, vllm_config=self.vllm_config)
             msg = f"Wrapping in HPU Graph took {m_wrap.get_summary_string()}"
             logger.info(msg)
 
@@ -1934,6 +1939,7 @@ def execute_model(
             "attn_metadata": self.trim_attn_metadata(attn_metadata),
             "intermediate_tensors": intermediate_tensors,
             "lora_mask": lora_mask,
+            "virtual_engine": model_input.virtual_engine,
             **(model_input.multi_modal_kwargs or {}),
         }
         if htorch.utils.internal.is_lazy():
@@ -1948,11 +1954,7 @@ def execute_model(
                                 f"graphs{'T' if use_graphs else 'F'}")
         else:
             model_event_name = 'model_executable'
-        with set_forward_context(
-                model_input.attn_metadata, self.vllm_config,
-                model_input.virtual_engine), \
-            self.profiler.record_event(
-                    'internal', model_event_name):
+        with self.profiler.record_event('internal', model_event_name):
             hidden_states = self.model.forward(
                 **execute_model_kwargs,
                 selected_token_indices=sampling_metadata.selected_token_indices

From 8a1f938e6f02052df0f4953c149410605a2d56d8 Mon Sep 17 00:00:00 2001
From: TJian <tunjian1996@gmail.com>
Date: Tue, 14 Jan 2025 12:37:52 +0800
Subject: [PATCH 207/462] [Doc] Update Quantization Hardware Support
 Documentation (#12025)

Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
Co-authored-by: tjtanaa <tunjian.tan@embeddedllm.com>
---
 docs/source/features/quantization/supported_hardware.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/features/quantization/supported_hardware.md b/docs/source/features/quantization/supported_hardware.md
index 988288a82d9bc..f5c0a95ea426e 100644
--- a/docs/source/features/quantization/supported_hardware.md
+++ b/docs/source/features/quantization/supported_hardware.md
@@ -113,7 +113,7 @@ The table below shows the compatibility of various quantization implementations
   - ✅︎
   - ✅︎
   - ✅︎
-  - ✗
+  - ✅︎
   - ✗
   - ✗
   - ✗

From ff39141a49928540c3975cc8e2a6e9bfa51f0bef Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Tue, 14 Jan 2025 19:24:06 +0800
Subject: [PATCH 208/462] [HPU][misc] add comments for explanation (#12034)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 .buildkite/run-hpu-test.sh | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/.buildkite/run-hpu-test.sh b/.buildkite/run-hpu-test.sh
index bdbb5195ce95f..1edcb1d2669e9 100644
--- a/.buildkite/run-hpu-test.sh
+++ b/.buildkite/run-hpu-test.sh
@@ -8,6 +8,11 @@ set -ex
 docker build -t hpu-test-env -f Dockerfile.hpu .
 
 # Setup cleanup
+# certain versions of HPU software stack have a bug that can
+# override the exit code of the script, so we need to use
+# separate remove_docker_container and remove_docker_container_and_exit
+# functions, while other platforms only need one remove_docker_container
+# function.
 EXITCODE=1
 remove_docker_container() { docker rm -f hpu-test || true; }
 remove_docker_container_and_exit() { remove_docker_container; exit $EXITCODE; }

From bb354e6b2dd4f8154c39c33b7eee77fa452b7703 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Tue, 14 Jan 2025 20:16:11 +0800
Subject: [PATCH 209/462] [Bugfix] Fix various bugs in multi-modal processor
 (#12031)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 tests/multimodal/test_processing.py | 19 ++++++
 vllm/multimodal/processing.py       | 89 +++++++++++++----------------
 vllm/multimodal/registry.py         |  5 +-
 3 files changed, 62 insertions(+), 51 deletions(-)

diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py
index 54269c3ef7ce0..9e58ed4cfde93 100644
--- a/tests/multimodal/test_processing.py
+++ b/tests/multimodal/test_processing.py
@@ -421,6 +421,8 @@ def test_find_replace_tokens(
             "pattern_1": [32000, 32000],
             "pattern_2": [],
             "pattern_3": [1550, 918, 1550],
+            # Test different modalities having the same tokens (32000)
+            "pattern_4": [32000],
         },
     ],
 )
@@ -438,6 +440,14 @@ def test_find_replace_tokens(
                         replacement=[32000, 32000],
                     ),
                 ],
+                "pattern_4": [
+                    PlaceholderInfo(
+                        modality="pattern_4",
+                        item_idx=0,
+                        start_idx=3,
+                        replacement=[32000],
+                    ),
+                ],
             }
 
         ),
@@ -466,6 +476,7 @@ def test_find_replace_tokens(
                         replacement=[1550, 918, 1550],
                     ),
                 ],
+                # No match for pattern_4 as it has lower priority than pattern_1
             }
         ),
         (
@@ -485,6 +496,14 @@ def test_find_replace_tokens(
                         replacement=[32000, 32000],
                     ),
                 ],
+                "pattern_4": [
+                    PlaceholderInfo(
+                        modality="pattern_4",
+                        item_idx=0,
+                        start_idx=5,
+                        replacement=[32000],
+                    ),
+                ],
                 "pattern_3": [
                     PlaceholderInfo(
                         modality="pattern_3",
diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index 8b47dfb07387f..fa199a07b4cf8 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -404,73 +404,62 @@ def replace_text_matches(
     return "".join(texts)
 
 
-def _iter_modality_placeholders(
+def _iter_placeholders(
+    mm_prompt_repls: Mapping[str, Sequence[BoundPromptReplacement]],
     prompt: list[int],
-    modality: str,
-    modality_repls: Sequence[BoundPromptReplacement],
-    modal_item_count: int,
+    mm_item_counts: Mapping[str, int],
 ) -> Iterable[PlaceholderInfo]:
-    if modal_item_count == 0:
-        return
+    """
+    Yield each set of placeholder tokens found in :code:`prompt`.
+
+    Matches are exclusive even when multiple modalities share
+    the same placeholder tokens. In that case, the modality that
+    appears earlier in `mm_prompt_repls` takes priority.
 
+    Note that empty matches are ignored.
+    """
     prompt_len = len(prompt)
-    item_idx = 0
+    item_idx_by_modality = defaultdict[str, int](lambda: 0)
 
     start_idx = 0
     while start_idx < prompt_len:
         found = False
 
-        for repl_info in modality_repls:
-            replacement = repl_info.get_replacement(item_idx)
-            repl_tokens = replacement.token_ids
-            repl_len = len(repl_tokens)
-            end_idx = start_idx + repl_len
-
-            if repl_len == 0 or end_idx > prompt_len:
+        for modality, modality_repls in mm_prompt_repls.items():
+            item_idx = item_idx_by_modality[modality]
+            if item_idx >= mm_item_counts.get(modality, 0):
                 continue
 
-            if prompt[start_idx:end_idx] == repl_tokens:
-                yield PlaceholderInfo(
-                    modality=modality,
-                    item_idx=item_idx,
-                    start_idx=start_idx,
-                    replacement=repl_tokens,
-                )
+            for repl_info in modality_repls:
+                replacement = repl_info.get_replacement(item_idx)
+                repl_tokens = replacement.token_ids
+                repl_len = len(repl_tokens)
+                end_idx = start_idx + repl_len
+
+                if repl_len == 0 or end_idx > prompt_len:
+                    continue
+
+                if prompt[start_idx:end_idx] == repl_tokens:
+                    yield PlaceholderInfo(
+                        modality=modality,
+                        item_idx=item_idx,
+                        start_idx=start_idx,
+                        replacement=repl_tokens,
+                    )
 
-                item_idx += 1
-                if item_idx >= modal_item_count:
-                    return
+                    # Exclude overlapping matches
+                    start_idx = end_idx
+                    item_idx_by_modality[modality] += 1
+                    found = True
+                    break
 
-                # Exclude overlapping matches
-                start_idx = end_idx
-                found = True
-                break
+            if found:
+                break  # Go back to the outer while loop
 
         if not found:
             start_idx += 1
 
 
-def _iter_placeholders(
-    mm_prompt_repls: Mapping[str, Sequence[BoundPromptReplacement]],
-    prompt: list[int],
-    mm_item_counts: Mapping[str, int],
-) -> Iterable[PlaceholderInfo]:
-    """
-    For each modality, yield each set of placeholder tokens found in
-    :code:`prompt`.
-
-    Note that empty matches are ignored.
-    """
-    for modality, modal_item_count in mm_item_counts.items():
-        if modality in mm_prompt_repls:
-            yield from _iter_modality_placeholders(
-                prompt,
-                modality,
-                mm_prompt_repls[modality],
-                modal_item_count,
-            )
-
-
 def find_mm_placeholders(
     mm_prompt_repls: Mapping[str, Sequence[BoundPromptReplacement]],
     prompt: list[int],
@@ -1156,7 +1145,7 @@ def apply(
 
         # If HF processor already inserts placeholder tokens,
         # there is no need for us to insert them
-        if all(len(repls) == 0 for repls in mm_missing_repls.items()):
+        if all(len(repls) == 0 for repls in mm_missing_repls.values()):
             tokenizer = self.info.get_tokenizer()
             prompt = decode_tokens(tokenizer, prompt_ids)
             mm_placeholders = hf_mm_placeholders
diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index 804a91da8c889..2961f7c76ca12 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -259,7 +259,10 @@ def get_max_tokens_per_item_by_modality(
             This is currently directly used only in V1.
         """
         if self.has_processor(model_config):
-            tokenizer = cached_get_tokenizer(model_config.tokenizer)
+            tokenizer = cached_get_tokenizer(
+                model_config.tokenizer,
+                trust_remote_code=model_config.trust_remote_code,
+            )
             processor = self.create_processor(model_config, tokenizer)
             seq_len = model_config.max_model_len
             return processor.info.get_mm_max_tokens_per_item(seq_len)

From 1f18adb2451e9b45048d17023169bd8cbb39747e Mon Sep 17 00:00:00 2001
From: Chen Zhang <zhangch99@outlook.com>
Date: Tue, 14 Jan 2025 20:59:32 +0800
Subject: [PATCH 210/462] [Kernel] Revert the API change of Attention.forward
 (#12038)

Signed-off-by: Chen Zhang <zhangch99@outlook.com>
---
 vllm/attention/layer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index c7e7a4d52e5a7..a06db075f334d 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -134,8 +134,8 @@ def forward(
         query: torch.Tensor,
         key: torch.Tensor,
         value: torch.Tensor,
-        _kv_cache: torch.Tensor,
-        _attn_metadata: AttentionMetadata,
+        kv_cache: torch.Tensor,
+        attn_metadata: AttentionMetadata,
     ) -> torch.Tensor:
         if self.use_output:
             output = torch.empty_like(query)

From 2e0e0176104965c9c8c090609f331f9b70e492f3 Mon Sep 17 00:00:00 2001
From: wangxiyuan <wangxiyuan1007@gmail.com>
Date: Tue, 14 Jan 2025 21:27:04 +0800
Subject: [PATCH 211/462] [Platform] Add output for Attention Backend (#11981)

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
---
 vllm/attention/backends/abstract.py      | 4 ++++
 vllm/attention/backends/flash_attn.py    | 2 ++
 vllm/attention/layer.py                  | 6 +-----
 vllm/v1/attention/backends/flash_attn.py | 2 ++
 4 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py
index f5dcaea79af93..737559bfe70ca 100644
--- a/vllm/attention/backends/abstract.py
+++ b/vllm/attention/backends/abstract.py
@@ -31,6 +31,10 @@ class AttentionType:
 
 class AttentionBackend(ABC):
     """Abstract class for attention backends."""
+    # For some attention backends, we allocate an output tensor before
+    # calling the custom op. When piecewise cudagraph is enabled, this
+    # makes sure the output tensor is allocated inside the cudagraph.
+    accept_output_buffer: bool = False
 
     @staticmethod
     @abstractmethod
diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py
index 23ea244f07dfe..48b3e8d177ec9 100644
--- a/vllm/attention/backends/flash_attn.py
+++ b/vllm/attention/backends/flash_attn.py
@@ -29,6 +29,8 @@
 
 class FlashAttentionBackend(AttentionBackend):
 
+    accept_output_buffer: bool = True
+
     @staticmethod
     def get_supported_head_sizes() -> List[int]:
         return [32, 64, 96, 128, 160, 192, 224, 256]
diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index a06db075f334d..a283e87d84070 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -110,11 +110,7 @@ def __init__(
         self.use_direct_call = not current_platform.is_cuda_alike(
         ) and not current_platform.is_cpu()
 
-        # For some attention backends, we allocate an output tensor before
-        # calling the custom op. When piecewise cudagraph is enabled, this
-        # makes sure the output tensor is allocated inside the cudagraph.
-        self.use_output = self.backend == _Backend.FLASH_ATTN or \
-            self.backend == _Backend.FLASH_ATTN_VLLM_V1
+        self.use_output = attn_backend.accept_output_buffer
         compilation_config = get_current_vllm_config().compilation_config
         if prefix in compilation_config.static_forward_context:
             raise ValueError(f"Duplicate layer name: {prefix}")
diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py
index b02bc9ffde538..7b0786261a6a6 100644
--- a/vllm/v1/attention/backends/flash_attn.py
+++ b/vllm/v1/attention/backends/flash_attn.py
@@ -15,6 +15,8 @@
 
 class FlashAttentionBackend(AttentionBackend):
 
+    accept_output_buffer: bool = True
+
     @staticmethod
     def get_supported_head_sizes() -> List[int]:
         return [32, 64, 96, 128, 160, 192, 224, 256]

From a2d2acb4c8d240b1e5946afe2736e497ce5b71a2 Mon Sep 17 00:00:00 2001
From: Chen Zhang <zhangch99@outlook.com>
Date: Tue, 14 Jan 2025 23:45:05 +0800
Subject: [PATCH 212/462] [Bugfix][Kernel] Give unique name to
 BlockSparseFlashAttention (#12040)

Signed-off-by: Chen Zhang <zhangch99@outlook.com>
---
 vllm/attention/backends/blocksparse_attn.py | 3 +--
 vllm/platforms/interface.py                 | 1 +
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/attention/backends/blocksparse_attn.py b/vllm/attention/backends/blocksparse_attn.py
index 7089d59392c36..77cfa8490172b 100644
--- a/vllm/attention/backends/blocksparse_attn.py
+++ b/vllm/attention/backends/blocksparse_attn.py
@@ -89,8 +89,7 @@ class BlocksparseFlashAttentionBackend(AttentionBackend):
 
     @staticmethod
     def get_name() -> str:
-        # For attention layer compatibility
-        return "FLASH_ATTN"
+        return "BLOCK_SPARSE_FLASH_ATTN"
 
     @staticmethod
     def get_impl_cls() -> Type["BlocksparseFlashAttentionImpl"]:
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
index 3c2ec9636df91..85fde76796901 100644
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -33,6 +33,7 @@ class _Backend(enum.Enum):
     HPU_ATTN = enum.auto()
     PALLAS = enum.auto()
     IPEX = enum.auto()
+    BLOCK_SPARSE_FLASH_ATTN = enum.auto()
     NO_ATTENTION = enum.auto()
 
 
From c9d6ff530b32c526bedda3105dcbab3d2f6ce992 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Tue, 14 Jan 2025 16:05:50 +0000
Subject: [PATCH 213/462] Explain where the engine args go when using Docker
 (#12041)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 docs/source/deployment/docker.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/source/deployment/docker.md b/docs/source/deployment/docker.md
index 9e301483ef7f9..2606e2765c1ae 100644
--- a/docs/source/deployment/docker.md
+++ b/docs/source/deployment/docker.md
@@ -19,6 +19,8 @@ $ docker run --runtime nvidia --gpus all \
     --model mistralai/Mistral-7B-v0.1
 ```
 
+You can add any other <project:#engine-args> you need after the image tag (`vllm/vllm-openai:latest`).
+
 ```{note}
 You can either use the `ipc=host` flag or `--shm-size` flag to allow the
 container to access the host's shared memory. vLLM uses PyTorch, which uses shared

From 87054a57ab39bad6c7fe8999e7d93566ded713e3 Mon Sep 17 00:00:00 2001
From: maang-h <55082429+maang-h@users.noreply.github.com>
Date: Wed, 15 Jan 2025 01:03:04 +0800
Subject: [PATCH 214/462] [Doc]: Update the Json Example of the `Engine
 Arguments` document (#12045)

---
 vllm/engine/arg_utils.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 0850bab6bb7e1..c31b206d6f60e 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -538,7 +538,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             default=None,
             type=json.loads,
             help='RoPE scaling configuration in JSON format. '
-            'For example, {"rope_type":"dynamic","factor":2.0}')
+            'For example, ``{"rope_type":"dynamic","factor":2.0}``')
         parser.add_argument('--rope-theta',
                             default=None,
                             type=float,
@@ -607,7 +607,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             default=None,
             type=json.loads,
             help=('Overrides for the multimodal input mapping/processing, '
-                  'e.g., image processor. For example: {"num_crops": 4}.'))
+                  'e.g., image processor. For example: ``{"num_crops": 4}``.'))
         parser.add_argument(
             '--disable-mm-preprocessor-cache',
             action='store_true',
@@ -908,13 +908,13 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             type=json.loads,
             default=None,
             help="Override or set neuron device configuration. "
-            "e.g. {\"cast_logits_dtype\": \"bloat16\"}.'")
+            "e.g. ``{\"cast_logits_dtype\": \"bloat16\"}``.")
         parser.add_argument(
             '--override-pooler-config',
             type=PoolerConfig.from_json,
             default=None,
             help="Override or set the pooling method for pooling models. "
-            "e.g. {\"pooling_type\": \"mean\", \"normalize\": false}.'")
+            "e.g. ``{\"pooling_type\": \"mean\", \"normalize\": false}``.")
 
         parser.add_argument('--compilation-config',
                             '-O',

From a3a3ee4e6febe8c270fdec0765c844186a728079 Mon Sep 17 00:00:00 2001
From: Jee Jee Li <pandaleefree@gmail.com>
Date: Wed, 15 Jan 2025 07:49:49 +0800
Subject: [PATCH 215/462] [Misc]  Merge bitsandbytes_stacked_params_mapping and
 packed_modules_mapping (#11924)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
---
 vllm/model_executor/model_loader/loader.py | 27 +++++++++-------------
 vllm/model_executor/model_loader/utils.py  | 26 ++++++++++++++++++++-
 vllm/model_executor/models/baichuan.py     |  7 ------
 vllm/model_executor/models/exaone.py       |  8 -------
 vllm/model_executor/models/falcon.py       |  6 ++---
 vllm/model_executor/models/gemma.py        |  9 --------
 vllm/model_executor/models/gemma2.py       | 10 --------
 vllm/model_executor/models/granite.py      |  8 -------
 vllm/model_executor/models/idefics3.py     | 10 --------
 vllm/model_executor/models/llama.py        | 10 --------
 vllm/model_executor/models/llava.py        | 12 ++++------
 vllm/model_executor/models/minicpm.py      | 10 --------
 vllm/model_executor/models/minicpm3.py     |  6 -----
 vllm/model_executor/models/minicpmv.py     | 20 ----------------
 vllm/model_executor/models/mllama.py       | 11 +++------
 vllm/model_executor/models/molmo.py        |  6 -----
 vllm/model_executor/models/nemotron.py     |  6 -----
 vllm/model_executor/models/opt.py          | 10 +++-----
 vllm/model_executor/models/phi.py          |  8 -------
 vllm/model_executor/models/phi3.py         |  4 ----
 vllm/model_executor/models/qwen.py         |  7 ------
 vllm/model_executor/models/qwen2.py        | 10 --------
 vllm/model_executor/models/qwen2_vl.py     | 10 --------
 vllm/model_executor/models/solar.py        |  8 -------
 24 files changed, 49 insertions(+), 200 deletions(-)

diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index 0033fbff0e9ac..9fe0db62435a0 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -39,7 +39,8 @@
 from vllm.model_executor.model_loader.tensorizer import (
     TensorizerConfig, is_vllm_tensorized, load_with_tensorizer,
     serialize_vllm_model, tensorizer_weights_iterator)
-from vllm.model_executor.model_loader.utils import (get_model_architecture,
+from vllm.model_executor.model_loader.utils import (ParamMapping,
+                                                    get_model_architecture,
                                                     set_default_torch_dtype)
 from vllm.model_executor.model_loader.weight_utils import (
     download_safetensors_index_file_from_hf, download_weights_from_hf,
@@ -983,21 +984,11 @@ def _unquantized_generator(self, hf_weights_files, use_safetensors,
 
     def _get_bnb_target_modules(self, model: nn.Module) -> None:
 
-        # TODO: Maybe we can replace bitsandbytes_stacked_params_mapping with
-        # packed_modules_mapping.
-        inverse_stacked_mapping: Dict[str, List[str]] = {}
-        for orig, (
-                packed,
-                idx,
-        ) in model.bitsandbytes_stacked_params_mapping.items():
-            if packed not in inverse_stacked_mapping:
-                inverse_stacked_mapping[packed] = []
-            inverse_stacked_mapping[packed].insert(idx, orig)
-
         for name, module in model.named_modules():
             if isinstance(module, (LinearBase, )):
                 last_name = name.split(".")[-1]
-                if sub_modules := inverse_stacked_mapping.get(last_name, []):
+                if sub_modules := self.modules_mapping.packed_mapping.get(
+                        last_name, []):
                     # Map vllm's names to transformers's names.
                     for sub_name in sub_modules:
                         self.target_modules.append(
@@ -1018,15 +1009,19 @@ def _load_weights(self, model_config: ModelConfig,
                 "The required method 'load_weights' is not defined in class"
                 f" {type(model).__name__}.")
 
-        if not hasattr(model, "bitsandbytes_stacked_params_mapping"):
+        if not hasattr(model, "packed_modules_mapping"):
             raise AttributeError(
                 f"Model {type(model).__name__} does not support BitsAndBytes "
-                "quantization yet.")
+                "quantization yet. No 'packed_modules_mapping' found.")
+
+        self.modules_mapping = ParamMapping(
+            copy.deepcopy(model.packed_modules_mapping))
 
         # For some models like Molmo, we need to use hf_to_vllm_mapper
         # to ensure correct loading of weights.
         if hf_to_vllm_mapper := getattr(model, "hf_to_vllm_mapper", None):
             self.weight_mapper = lambda name: hf_to_vllm_mapper._map_name(name)
+
         # Modules whose weights might have fused on disk
         # we need their output_sizes to make shard in flight correctly with TP
         self.maybe_fused_weights_modules: Dict[str, List[int]] = {}
@@ -1109,7 +1104,7 @@ def _load_weights(self, model_config: ModelConfig,
             for shard_name, (
                     weight_name,
                     index,
-            ) in model.bitsandbytes_stacked_params_mapping.items():
+            ) in self.modules_mapping.inverse_packed_mapping.items():
                 shard_pos = quant_param_name.find(shard_name)
                 # Some models, such as MiniCPM V2.5/2.6, contain both
                 # module names 'kv_proj' and 'qkv_proj'. To prevent 'kv_proj'
diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py
index 44978a55e072d..3f923d2f6632a 100644
--- a/vllm/model_executor/model_loader/utils.py
+++ b/vllm/model_executor/model_loader/utils.py
@@ -1,6 +1,7 @@
 """Utilities for selecting and loading models."""
 import contextlib
-from typing import Tuple, Type
+from dataclasses import dataclass, field
+from typing import Dict, List, Tuple, Type
 
 import torch
 from torch import nn
@@ -49,3 +50,26 @@ def get_model_architecture(
 
 def get_architecture_class_name(model_config: ModelConfig) -> str:
     return get_model_architecture(model_config)[1]
+
+
+@dataclass
+class ParamMapping:
+    """
+    A class to handle parameter mapping for model weight loading.
+    It creates a bidirectional mapping between packed parameters and their 
+    constituent parts.
+    """
+    packed_mapping: Dict[str, List[str]]
+    inverse_packed_mapping: Dict[str, Tuple[str,
+                                            int]] = field(default_factory=dict)
+
+    def __post_init__(self):
+        for packed_name, sub_params in self.packed_mapping.items():
+            # Skip self-contained cases (e.g., {"W_pack": ["W_pack"]})
+            if len(sub_params) == 1 and sub_params[0] == packed_name:
+                continue
+            for index, param_name in enumerate(sub_params):
+                self.inverse_packed_mapping[param_name] = (
+                    packed_name,
+                    index,
+                )
diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py
index 5e68b7f165bf4..a923ed36a9db2 100644
--- a/vllm/model_executor/models/baichuan.py
+++ b/vllm/model_executor/models/baichuan.py
@@ -350,13 +350,6 @@ class BaiChuanBaseForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
     embedding_modules = {}
     embedding_padding_modules = []
 
-    # BitandBytes specific attributes
-    bitsandbytes_stacked_params_mapping = {
-        # shard_name, weight_name, index
-        "gate_proj": ("gate_up_proj", 0),
-        "up_proj": ("gate_up_proj", 1),
-    }
-
     def __init__(
         self,
         *,
diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py
index 8324a563edd64..ad15f835b1609 100644
--- a/vllm/model_executor/models/exaone.py
+++ b/vllm/model_executor/models/exaone.py
@@ -430,14 +430,6 @@ class ExaoneForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         "lm_head": "output_embeddings",
     }
     embedding_padding_modules = ["lm_head"]
-    bitsandbytes_stacked_params_mapping = {
-        # shard_name, weight_name, index
-        "q_proj": ("qkv_proj", 0),
-        "k_proj": ("qkv_proj", 1),
-        "v_proj": ("qkv_proj", 2),
-        "c_fc_0": ("gate_up_proj", 0),
-        "c_fc_1": ("gate_up_proj", 1),
-    }
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py
index 8660cf79b9cdb..c503a368e8244 100644
--- a/vllm/model_executor/models/falcon.py
+++ b/vllm/model_executor/models/falcon.py
@@ -409,9 +409,9 @@ def forward(
 
 
 class FalconForCausalLM(nn.Module, SupportsPP):
-
-    # BitandBytes specific attributes
-    bitsandbytes_stacked_params_mapping = {}
+    packed_modules_mapping = {
+        "query_key_value": ["query_key_value"],
+    }
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py
index b28715c48adfb..6de0c866bc2f0 100644
--- a/vllm/model_executor/models/gemma.py
+++ b/vllm/model_executor/models/gemma.py
@@ -349,15 +349,6 @@ class GemmaForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         "gate_up_proj",
         "down_proj",
     ]
-    # BitandBytes specific attributes
-    bitsandbytes_stacked_params_mapping = {
-        # shard_name, weight_name, index
-        "q_proj": ("qkv_proj", 0),
-        "k_proj": ("qkv_proj", 1),
-        "v_proj": ("qkv_proj", 2),
-        "gate_proj": ("gate_up_proj", 0),
-        "up_proj": ("gate_up_proj", 1),
-    }
 
     # Gemma does not apply LoRA to the embedding layer.
     embedding_modules = {}
diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py
index f4530e4771960..698b9a5b6b1d6 100644
--- a/vllm/model_executor/models/gemma2.py
+++ b/vllm/model_executor/models/gemma2.py
@@ -399,16 +399,6 @@ class Gemma2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
     embedding_modules = {}
     embedding_padding_modules = []
 
-    # BitandBytes specific attributes
-    bitsandbytes_stacked_params_mapping = {
-        # shard_name, weight_name, index
-        "q_proj": ("qkv_proj", 0),
-        "k_proj": ("qkv_proj", 1),
-        "v_proj": ("qkv_proj", 2),
-        "gate_proj": ("gate_up_proj", 0),
-        "up_proj": ("gate_up_proj", 1),
-    }
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py
index a91ed4158a73f..3e95926fd1e22 100644
--- a/vllm/model_executor/models/granite.py
+++ b/vllm/model_executor/models/granite.py
@@ -362,14 +362,6 @@ class GraniteForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         "lm_head": "output_embeddings",
     }
     embedding_padding_modules = ["lm_head"]
-    bitsandbytes_stacked_params_mapping = {
-        # shard_name, weight_name, index
-        "q_proj": ("qkv_proj", 0),
-        "k_proj": ("qkv_proj", 1),
-        "v_proj": ("qkv_proj", 2),
-        "gate_proj": ("gate_up_proj", 0),
-        "up_proj": ("gate_up_proj", 1),
-    }
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py
index 17e772e7faa32..d16a77f862d98 100644
--- a/vllm/model_executor/models/idefics3.py
+++ b/vllm/model_executor/models/idefics3.py
@@ -662,16 +662,6 @@ class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal,
         "down_proj",
     ]
 
-    # BitandBytes specific attributes
-    bitsandbytes_stacked_params_mapping = {
-        # shard_name, weight_name, index
-        "q_proj": ("qkv_proj", 0),
-        "k_proj": ("qkv_proj", 1),
-        "v_proj": ("qkv_proj", 2),
-        "gate_proj": ("gate_up_proj", 0),
-        "up_proj": ("gate_up_proj", 1),
-    }
-
     embedding_modules = {}
     embedding_padding_modules = []
 
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index 17b0fbb777e8e..16fa7acf54fdc 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -478,16 +478,6 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
     }
     embedding_padding_modules = ["lm_head"]
 
-    # BitandBytes specific attributes
-    bitsandbytes_stacked_params_mapping = {
-        # shard_name, weight_name, index
-        "q_proj": ("qkv_proj", 0),
-        "k_proj": ("qkv_proj", 1),
-        "v_proj": ("qkv_proj", 2),
-        "gate_proj": ("gate_up_proj", 0),
-        "up_proj": ("gate_up_proj", 1),
-    }
-
     # Mistral/Llama models can also be loaded with --load-format mistral
     # from consolidated.safetensors checkpoints
     mistral_mapping = {
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index bb3db60c7d8ed..722fff98d5c19 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -463,14 +463,10 @@ def init_vision_tower_for_llava(
                                         info=_build_llava_or_pixtral_hf_info,
                                         dummy_inputs=LlavaDummyInputsBuilder)
 class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
-    # BitandBytes specific attributes
-    bitsandbytes_stacked_params_mapping = {
-        # shard_name, weight_name, index
-        "q_proj": ("qkv_proj", 0),
-        "k_proj": ("qkv_proj", 1),
-        "v_proj": ("qkv_proj", 2),
-        "gate_proj": ("gate_up_proj", 0),
-        "up_proj": ("gate_up_proj", 1),
+
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+        "gate_up_proj": ["gate_proj", "up_proj"]
     }
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py
index 5a0f202364f26..6254d26c7060d 100644
--- a/vllm/model_executor/models/minicpm.py
+++ b/vllm/model_executor/models/minicpm.py
@@ -534,16 +534,6 @@ class MiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
     }
     embedding_padding_modules = ["lm_head"]
 
-    # BitandBytes specific attributes
-    bitsandbytes_stacked_params_mapping = {
-        # shard_name, weight_name, index
-        "q_proj": ("qkv_proj", 0),
-        "k_proj": ("qkv_proj", 1),
-        "v_proj": ("qkv_proj", 2),
-        "gate_proj": ("gate_up_proj", 0),
-        "up_proj": ("gate_up_proj", 1),
-    }
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
diff --git a/vllm/model_executor/models/minicpm3.py b/vllm/model_executor/models/minicpm3.py
index e9d7eada1d16c..5e1e6c6fa6141 100644
--- a/vllm/model_executor/models/minicpm3.py
+++ b/vllm/model_executor/models/minicpm3.py
@@ -241,11 +241,5 @@ class MiniCPM3ForCausalLM(MiniCPMForCausalLM):
     # `embedding_modules` and `embedding_padding_modules`
     # are inherited from MiniCPMForCausalLM
 
-    bitsandbytes_stacked_params_mapping = {
-        # shard_name, weight_name, index
-        "gate_proj": ("gate_up_proj", 0),
-        "up_proj": ("gate_up_proj", 1),
-    }
-
     def _init_model(self, *, vllm_config: VllmConfig, prefix: str = ""):
         return MiniCPM3Model(vllm_config=vllm_config, prefix=prefix)
diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py
index ff7dab89e4da8..1aa529056893b 100644
--- a/vllm/model_executor/models/minicpmv.py
+++ b/vllm/model_executor/models/minicpmv.py
@@ -761,16 +761,6 @@ class MiniCPMV2_5(MiniCPMVBaseModel, SupportsLoRA):
         "kv_proj",
     ]
 
-    # BitandBytes specific attributes
-    bitsandbytes_stacked_params_mapping = {
-        # shard_name, weight_name, index
-        "q_proj": ("qkv_proj", 0),
-        "k_proj": ("qkv_proj", 1),
-        "v_proj": ("qkv_proj", 2),
-        "gate_proj": ("gate_up_proj", 0),
-        "up_proj": ("gate_up_proj", 1),
-    }
-
     embedding_modules = {}
     embedding_padding_modules = []
 
@@ -881,16 +871,6 @@ class MiniCPMV2_6(MiniCPMVBaseModel, SupportsLoRA):
         "kv_proj",
     ]
 
-    # BitandBytes specific attributes
-    bitsandbytes_stacked_params_mapping = {
-        # shard_name, weight_name, index
-        "q_proj": ("qkv_proj", 0),
-        "k_proj": ("qkv_proj", 1),
-        "v_proj": ("qkv_proj", 2),
-        "gate_proj": ("gate_up_proj", 0),
-        "up_proj": ("gate_up_proj", 1),
-    }
-
     embedding_modules = {}
     embedding_padding_modules = []
 
diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py
index 593a4d3fb6940..b2368ffff5412 100644
--- a/vllm/model_executor/models/mllama.py
+++ b/vllm/model_executor/models/mllama.py
@@ -1107,14 +1107,9 @@ def forward(
 @INPUT_REGISTRY.register_dummy_encoder_data(dummy_encoder_data_for_mllama)
 @INPUT_REGISTRY.register_input_processor(input_processor_for_mllama)
 class MllamaForConditionalGeneration(nn.Module, SupportsMultiModal):
-    # BitandBytes specific attributes
-    bitsandbytes_stacked_params_mapping = {
-        # shard_name, weight_name, index
-        "q_proj": ("qkv_proj", 0),
-        "k_proj": ("qkv_proj", 1),
-        "v_proj": ("qkv_proj", 2),
-        "gate_proj": ("gate_up_proj", 0),
-        "up_proj": ("gate_up_proj", 1),
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+        "gate_up_proj": ["gate_proj", "up_proj"]
     }
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py
index c45ee9b921c9e..a2fd1701316f2 100644
--- a/vllm/model_executor/models/molmo.py
+++ b/vllm/model_executor/models/molmo.py
@@ -1193,12 +1193,6 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP,
     embedding_modules = {}
     embedding_padding_modules = []
 
-    # BitandBytes specific attributes
-    bitsandbytes_stacked_params_mapping = {
-        "gate_proj": ("merged_linear", 0),
-        "up_proj": ("merged_linear", 1),
-    }
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py
index 34cb9981c167b..8cc62d5c803cc 100644
--- a/vllm/model_executor/models/nemotron.py
+++ b/vllm/model_executor/models/nemotron.py
@@ -395,12 +395,6 @@ class NemotronForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         "lm_head": "output_embeddings",
     }
     embedding_padding_modules = ["lm_head"]
-    bitsandbytes_stacked_params_mapping = {
-        # shard_name, weight_name, index
-        "q_proj": ("qkv_proj", 0),
-        "k_proj": ("qkv_proj", 1),
-        "v_proj": ("qkv_proj", 2),
-    }
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py
index 7edafcd20b5db..ea1185aa80dc6 100644
--- a/vllm/model_executor/models/opt.py
+++ b/vllm/model_executor/models/opt.py
@@ -329,13 +329,9 @@ def forward(
 
 
 class OPTForCausalLM(nn.Module, SupportsPP):
-
-    # BitandBytes specific attributes
-    bitsandbytes_stacked_params_mapping = {
-        # shard_name, weight_name, index
-        "q_proj": ("qkv_proj", 0),
-        "k_proj": ("qkv_proj", 1),
-        "v_proj": ("qkv_proj", 2),
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+        "gate_up_proj": ["gate_proj", "up_proj"]
     }
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py
index f9e972688ddd1..59b7508a370f8 100644
--- a/vllm/model_executor/models/phi.py
+++ b/vllm/model_executor/models/phi.py
@@ -279,14 +279,6 @@ class PhiForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         "fc2",
     ]
 
-    # BitandBytes specific attributes
-    bitsandbytes_stacked_params_mapping = {
-        # shard_name, weight_name, index
-        "q_proj": ("qkv_proj", 0),
-        "k_proj": ("qkv_proj", 1),
-        "v_proj": ("qkv_proj", 2),
-    }
-
     embedding_modules = {}
     embedding_padding_modules = []
 
diff --git a/vllm/model_executor/models/phi3.py b/vllm/model_executor/models/phi3.py
index 937858ee3b8c2..34141511ea791 100644
--- a/vllm/model_executor/models/phi3.py
+++ b/vllm/model_executor/models/phi3.py
@@ -14,7 +14,3 @@ class Phi3ForCausalLM(LlamaForCausalLM):
             "gate_up_proj",
         ],
     }
-
-    # BitandBytes specific attributes
-    # Initialize an empty dict when there is no stacked parameter mapping.
-    bitsandbytes_stacked_params_mapping = {}
diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py
index baf955f6b515d..1345b381f0a99 100644
--- a/vllm/model_executor/models/qwen.py
+++ b/vllm/model_executor/models/qwen.py
@@ -1028,13 +1028,6 @@ class QWenLLM(QWenBaseModel):
     embedding_modules = {}
     embedding_padding_modules = []
 
-    # BitandBytes specific attributes
-    bitsandbytes_stacked_params_mapping = {
-        # shard_name, weight_name, index
-        "w2": ("gate_up_proj", 0),
-        "w1": ("gate_up_proj", 1),
-    }
-
 
 class QWenVL(QWenBaseModel, SupportsMultiModal):
     packed_modules_mapping = {
diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py
index d20fb150f7e39..0a99c87470850 100644
--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
@@ -418,16 +418,6 @@ class Qwen2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
     embedding_modules = {}
     embedding_padding_modules = []
 
-    # BitandBytes specific attributes
-    bitsandbytes_stacked_params_mapping = {
-        # shard_name, weight_name, index
-        "q_proj": ("qkv_proj", 0),
-        "k_proj": ("qkv_proj", 1),
-        "v_proj": ("qkv_proj", 2),
-        "gate_proj": ("gate_up_proj", 0),
-        "up_proj": ("gate_up_proj", 1),
-    }
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index 76a810e8f0c20..d00e5d362c8bc 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -1038,16 +1038,6 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
     embedding_modules = {}
     embedding_padding_modules = []
 
-    # BitandBytes specific attributes
-    bitsandbytes_stacked_params_mapping = {
-        # shard_name, weight_name, index
-        "q_proj": ("qkv_proj", 0),
-        "k_proj": ("qkv_proj", 1),
-        "v_proj": ("qkv_proj", 2),
-        "gate_proj": ("gate_up_proj", 0),
-        "up_proj": ("gate_up_proj", 1),
-    }
-
     # To ensure correct weight loading and mapping.
     hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={
         "lm_head.": "language_model.lm_head.",
diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py
index a7cf65a0e36e4..e83d316f74de2 100644
--- a/vllm/model_executor/models/solar.py
+++ b/vllm/model_executor/models/solar.py
@@ -401,14 +401,6 @@ class SolarForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         "lm_head": "output_embeddings",
     }
     embedding_padding_modules = ["lm_head"]
-    bitsandbytes_stacked_params_mapping = {
-        # shard_name, weight_name, index
-        "q_proj": ("qkv_proj", 0),
-        "k_proj": ("qkv_proj", 1),
-        "v_proj": ("qkv_proj", 2),
-        "gate_proj": ("gate_up_proj", 0),
-        "up_proj": ("gate_up_proj", 1),
-    }
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()

From 42f5e7c52a5852e20937001332572c8cb8115af0 Mon Sep 17 00:00:00 2001
From: Jee Jee Li <pandaleefree@gmail.com>
Date: Wed, 15 Jan 2025 10:29:53 +0800
Subject: [PATCH 216/462] [Kernel] Support MulAndSilu (#11624)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
---
 csrc/activation_kernels.cu               | 32 +++++++++++++++++-----
 csrc/ops.h                               |  2 ++
 csrc/torch_bindings.cpp                  |  3 ++
 tests/kernels/test_activation.py         | 20 +++++++++-----
 vllm/model_executor/layers/activation.py | 35 ++++++++++++++++++++++++
 vllm/model_executor/models/molmo.py      | 14 ++--------
 vllm/model_executor/models/ultravox.py   | 13 ++-------
 7 files changed, 83 insertions(+), 36 deletions(-)

diff --git a/csrc/activation_kernels.cu b/csrc/activation_kernels.cu
index 839dc36ba4e29..88275dbdd83a1 100644
--- a/csrc/activation_kernels.cu
+++ b/csrc/activation_kernels.cu
@@ -9,8 +9,16 @@
 
 namespace vllm {
 
+template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&),
+          bool act_first>
+__device__ __forceinline__ scalar_t compute(const scalar_t& x,
+                                            const scalar_t& y) {
+  return act_first ? ACT_FN(x) * y : x * ACT_FN(y);
+}
 // Activation and gating kernel template.
-template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&)>
+
+template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&),
+          bool act_first>
 __global__ void act_and_mul_kernel(
     scalar_t* __restrict__ out,          // [..., d]
     const scalar_t* __restrict__ input,  // [..., 2, d]
@@ -19,7 +27,7 @@ __global__ void act_and_mul_kernel(
   for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
     const scalar_t x = VLLM_LDG(&input[token_idx * 2 * d + idx]);
     const scalar_t y = VLLM_LDG(&input[token_idx * 2 * d + d + idx]);
-    out[token_idx * d + idx] = ACT_FN(x) * y;
+    out[token_idx * d + idx] = compute<scalar_t, ACT_FN, act_first>(x, y);
   }
 }
 
@@ -55,7 +63,9 @@ __device__ __forceinline__ T gelu_tanh_kernel(const T& x) {
 }  // namespace vllm
 
 // Launch activation and gating kernel.
-#define LAUNCH_ACTIVATION_GATE_KERNEL(KERNEL)                            \
+// Use ACT_FIRST (bool) indicating whether to apply the activation function
+// first.
+#define LAUNCH_ACTIVATION_GATE_KERNEL(KERNEL, ACT_FIRST)                 \
   int d = input.size(-1) / 2;                                            \
   int64_t num_tokens = input.numel() / input.size(-1);                   \
   dim3 grid(num_tokens);                                                 \
@@ -64,7 +74,7 @@ __device__ __forceinline__ T gelu_tanh_kernel(const T& x) {
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();          \
   VLLM_DISPATCH_FLOATING_TYPES(                                          \
       input.scalar_type(), "act_and_mul_kernel", [&] {                   \
-        vllm::act_and_mul_kernel<scalar_t, KERNEL<scalar_t>>             \
+        vllm::act_and_mul_kernel<scalar_t, KERNEL<scalar_t>, ACT_FIRST>  \
             <<<grid, block, 0, stream>>>(out.data_ptr<scalar_t>(),       \
                                          input.data_ptr<scalar_t>(), d); \
       });
@@ -72,19 +82,27 @@ __device__ __forceinline__ T gelu_tanh_kernel(const T& x) {
 void silu_and_mul(torch::Tensor& out,    // [..., d]
                   torch::Tensor& input)  // [..., 2 * d]
 {
-  LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel);
+  LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel, true);
+}
+
+void mul_and_silu(torch::Tensor& out,    // [..., d]
+                  torch::Tensor& input)  // [..., 2 * d]
+{
+  // The difference between mul_and_silu and silu_and_mul is that mul_and_silu
+  // applies the silu to the latter half of the input.
+  LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel, false);
 }
 
 void gelu_and_mul(torch::Tensor& out,    // [..., d]
                   torch::Tensor& input)  // [..., 2 * d]
 {
-  LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_kernel);
+  LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_kernel, true);
 }
 
 void gelu_tanh_and_mul(torch::Tensor& out,    // [..., d]
                        torch::Tensor& input)  // [..., 2 * d]
 {
-  LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_tanh_kernel);
+  LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_tanh_kernel, true);
 }
 
 namespace vllm {
diff --git a/csrc/ops.h b/csrc/ops.h
index 9efd9b0c24700..5a194a0dd3654 100644
--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -86,6 +86,8 @@ void batched_rotary_embedding(torch::Tensor& positions, torch::Tensor& query,
 
 void silu_and_mul(torch::Tensor& out, torch::Tensor& input);
 
+void mul_and_silu(torch::Tensor& out, torch::Tensor& input);
+
 void gelu_and_mul(torch::Tensor& out, torch::Tensor& input);
 
 void gelu_tanh_and_mul(torch::Tensor& out, torch::Tensor& input);
diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp
index 956258c1001d3..fb53d122487d3 100644
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@@ -55,6 +55,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
   ops.def("silu_and_mul(Tensor! out, Tensor input) -> ()");
   ops.impl("silu_and_mul", torch::kCUDA, &silu_and_mul);
 
+  ops.def("mul_and_silu(Tensor! out, Tensor input) -> ()");
+  ops.impl("mul_and_silu", torch::kCUDA, &mul_and_silu);
+
   // Activation function used in GeGLU with `none` approximation.
   ops.def("gelu_and_mul(Tensor! out, Tensor input) -> ()");
   ops.impl("gelu_and_mul", torch::kCUDA, &gelu_and_mul);
diff --git a/tests/kernels/test_activation.py b/tests/kernels/test_activation.py
index a84501f9c303f..dac26efe866b8 100644
--- a/tests/kernels/test_activation.py
+++ b/tests/kernels/test_activation.py
@@ -6,8 +6,9 @@
 
 from tests.kernels.utils import opcheck
 from vllm.model_executor.layers.activation import (FastGELU, FatreluAndMul,
-                                                   GeluAndMul, NewGELU,
-                                                   QuickGELU, SiluAndMul)
+                                                   GeluAndMul, MulAndSilu,
+                                                   NewGELU, QuickGELU,
+                                                   SiluAndMul)
 from vllm.platforms import current_platform
 
 from .allclose_default import get_default_atol, get_default_rtol
@@ -21,8 +22,9 @@
 ]
 
 
-@pytest.mark.parametrize("activation",
-                         ["silu", "gelu", "gelu_tanh", "fatrelu"])
+@pytest.mark.parametrize(
+    "activation",
+    ["silu_and_mul", "mul_and_silu", "gelu", "gelu_tanh", "fatrelu"])
 @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
 @pytest.mark.parametrize("d", D)
 @pytest.mark.parametrize("dtype", DTYPES)
@@ -40,9 +42,12 @@ def test_act_and_mul(
     current_platform.seed_everything(seed)
     torch.set_default_device(device)
     x = torch.randn(num_tokens, 2 * d, dtype=dtype)
-    if activation == "silu":
+    if activation == "silu_and_mul":
         layer = SiluAndMul()
         fn = torch.ops._C.silu_and_mul
+    if activation == "mul_and_silu":
+        layer = MulAndSilu()
+        fn = torch.ops._C.mul_and_silu
     elif activation == "gelu":
         layer = GeluAndMul(approximate="none")
         fn = torch.ops._C.gelu_and_mul
@@ -55,8 +60,9 @@ def test_act_and_mul(
         fn = torch.ops._C.fatrelu_and_mul
     out = layer(x)
     ref_out = layer.forward_native(x)
-    # The SiLU, GELU and FatReLU implementations are equivalent to the native
-    # PyTorch implementations, so we can do exact comparison.
+    # The SiluAndMul, MulAndSilu, GELU and FatReLU implementations are
+    # equivalent to the native PyTorch implementations, so we can do exact
+    # comparison.
     torch.testing.assert_close(out, ref_out, atol=0.0, rtol=0.0)
 
     d = x.shape[-1] // 2
diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py
index 2475190d197d3..af7894b42c560 100644
--- a/vllm/model_executor/layers/activation.py
+++ b/vllm/model_executor/layers/activation.py
@@ -87,6 +87,41 @@ def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
         return out
 
 
+@CustomOp.register("mul_and_silu")
+class MulAndSilu(CustomOp):
+    """An activation function for SwiGLU.
+
+    The function computes x -> x[:d] * silu(x[d:]) where d = x.shape[-1] // 2.
+
+    Shapes:
+        x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d)
+        return: (num_tokens, d) or (batch_size, seq_len, d)
+    """
+
+    def __init__(self):
+        super().__init__()
+        if current_platform.is_cuda_alike() or current_platform.is_cpu():
+            self.op = torch.ops._C.mul_and_silu
+        elif current_platform.is_xpu():
+            from vllm._ipex_ops import ipex_ops
+            self.op = ipex_ops.silu_and_mul
+
+    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
+        """PyTorch-native implementation equivalent to forward()."""
+        d = x.shape[-1] // 2
+        return x[..., :d] * F.silu(x[..., d:])
+
+    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
+        d = x.shape[-1] // 2
+        output_shape = (x.shape[:-1] + (d, ))
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        self.op(out, x)
+        return out
+
+    # TODO implement forward_xpu for MulAndSilu
+    # def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
+
+
 @CustomOp.register("gelu_and_mul")
 class GeluAndMul(CustomOp):
     """An activation function for GeGLU.
diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py
index a2fd1701316f2..5c7ae0deefcd8 100644
--- a/vllm/model_executor/models/molmo.py
+++ b/vllm/model_executor/models/molmo.py
@@ -23,7 +23,8 @@
 from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
                          InputContext, token_inputs)
 from vllm.model_executor import SamplingMetadata
-from vllm.model_executor.layers.activation import QuickGELU, SiluAndMul
+from vllm.model_executor.layers.activation import (MulAndSilu, QuickGELU,
+                                                   SiluAndMul)
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                                MergedColumnParallelLinear,
@@ -462,15 +463,6 @@ def forward(
         return output
 
 
-class SwiGLU(nn.Module):
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        x, gate = x.chunk(2, dim=-1)
-        # Note that the order is reversed compared to
-        # SiluAndMul.
-        return x * F.silu(gate)
-
-
 class LanuageModelMLP(nn.Module):
     """Molmo's LLM mlp."""
 
@@ -489,7 +481,7 @@ def __init__(self,
             quant_config=quant_config,
         )
         # Activation function.
-        self.act_fn = SwiGLU()
+        self.act_fn = MulAndSilu()
         # Feed-forward output projection.
         self.down_proj = RowParallelLinear(
             self.intermediate_size,
diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py
index 3edfb5107683a..587f18ccaf98f 100644
--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -16,7 +16,7 @@
 from vllm import envs
 from vllm.attention import AttentionMetadata
 from vllm.config import VllmConfig
-from vllm.model_executor.layers.activation import SiluAndMul, get_act_fn
+from vllm.model_executor.layers.activation import MulAndSilu, get_act_fn
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.model_loader.loader import DefaultModelLoader
@@ -248,15 +248,6 @@ def forward(self, audio_embeds: torch.Tensor) -> torch.Tensor:
         return audio_embeds
 
 
-class FlippedSiluAndMul(SiluAndMul):
-    """Ultravox is trained with SwiGLU with flipped halves."""
-
-    def forward(self, x: torch.Tensor):
-        a, b = x.chunk(2, dim=-1)
-        flipped = torch.cat((b, a), dim=-1)
-        return super().forward(flipped)
-
-
 class UltravoxProjector(nn.Module):
 
     def __init__(self, config: UltravoxConfig):
@@ -269,7 +260,7 @@ def __init__(self, config: UltravoxConfig):
         dim = self.hidden_dim
 
         if config.projector_act == "swiglu":
-            self.act = FlippedSiluAndMul()
+            self.act = MulAndSilu()
             dim = dim // 2
         else:
             self.act = get_act_fn(config.projector_act)

From 1a51b9f87226b2290c78c65c1de0f585d31f17ce Mon Sep 17 00:00:00 2001
From: Konrad Zawora <kzawora@habana.ai>
Date: Wed, 15 Jan 2025 03:59:18 +0100
Subject: [PATCH 217/462] [HPU][Bugfix] Don't use /dev/accel/accel0 for HPU
 autodetection in setup.py (#12046)

Signed-off-by: Konrad Zawora <kzawora@habana.ai>
---
 setup.py | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/setup.py b/setup.py
index b6c1f5bc8ac3f..7dfcec7f9f0c5 100644
--- a/setup.py
+++ b/setup.py
@@ -324,21 +324,26 @@ def run(self) -> None:
 
 
 def _is_hpu() -> bool:
-    is_hpu_available = True
+    # if VLLM_TARGET_DEVICE env var was set explicitly, skip HPU autodetection
+    if os.getenv("VLLM_TARGET_DEVICE", None) == VLLM_TARGET_DEVICE:
+        return VLLM_TARGET_DEVICE == "hpu"
+
+    # if VLLM_TARGET_DEVICE was not set explicitly, check if hl-smi succeeds,
+    # and if it doesn't, check if habanalabs driver is loaded
+    is_hpu_available = False
     try:
-        subprocess.run(["hl-smi"], capture_output=True, check=True)
+        out = subprocess.run(["hl-smi"], capture_output=True, check=True)
+        is_hpu_available = out.returncode == 0
     except (FileNotFoundError, PermissionError, subprocess.CalledProcessError):
-        if not os.path.exists('/dev/accel/accel0') and not os.path.exists(
-                '/dev/accel/accel_controlD0'):
-            # last resort...
+        if sys.platform.startswith("linux"):
             try:
                 output = subprocess.check_output(
                     'lsmod | grep habanalabs | wc -l', shell=True)
                 is_hpu_available = int(output) > 0
             except (ValueError, FileNotFoundError, PermissionError,
                     subprocess.CalledProcessError):
-                is_hpu_available = False
-    return is_hpu_available or VLLM_TARGET_DEVICE == "hpu"
+                pass
+    return is_hpu_available
 
 
 def _no_device() -> bool:

From 9ddac56311b28f08e40a941296eb66fbb1be0a7a Mon Sep 17 00:00:00 2001
From: Shanshan Shen <467638484@qq.com>
Date: Wed, 15 Jan 2025 11:38:25 +0800
Subject: [PATCH 218/462] [Platform] move current_memory_usage() into platform
 (#11369)

Signed-off-by: Shanshan Shen <467638484@qq.com>
---
 vllm/platforms/cuda.py      | 7 +++++++
 vllm/platforms/interface.py | 9 +++++++++
 vllm/platforms/rocm.py      | 7 +++++++
 vllm/platforms/xpu.py       | 7 +++++++
 vllm/utils.py               | 8 +-------
 5 files changed, 31 insertions(+), 7 deletions(-)

diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index 80cefcb492531..2587e3a11dde3 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -143,6 +143,13 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         if cache_config and cache_config.block_size is None:
             cache_config.block_size = 16
 
+    @classmethod
+    def get_current_memory_usage(cls,
+                                 device: Optional[torch.types.Device] = None
+                                 ) -> float:
+        torch.cuda.reset_peak_memory_stats(device)
+        return torch.cuda.max_memory_allocated(device)
+
     @classmethod
     def get_attn_backend_cls(cls, selected_backend, head_size, dtype,
                              kv_cache_dtype, block_size, use_v1) -> str:
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
index 85fde76796901..f2ecec3203fb7 100644
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -277,6 +277,15 @@ def is_pin_memory_available(cls) -> bool:
             return False
         return True
 
+    @classmethod
+    def get_current_memory_usage(cls,
+                                 device: Optional[torch.types.Device] = None
+                                 ) -> float:
+        """
+        Return the memory usage in bytes.
+        """
+        raise NotImplementedError
+
     @classmethod
     def get_punica_wrapper(cls) -> str:
         """
diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index 43105d7855e79..67a9e816cb658 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -157,3 +157,10 @@ def verify_quantization(cls, quant: str) -> None:
     @classmethod
     def get_punica_wrapper(cls) -> str:
         return "vllm.lora.punica_wrapper.punica_gpu.PunicaWrapperGPU"
+
+    @classmethod
+    def get_current_memory_usage(cls,
+                                 device: Optional[torch.types.Device] = None
+                                 ) -> float:
+        torch.cuda.reset_peak_memory_stats(device)
+        return torch.cuda.max_memory_allocated(device)
diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py
index f34376b44e689..031abdc05d517 100644
--- a/vllm/platforms/xpu.py
+++ b/vllm/platforms/xpu.py
@@ -94,3 +94,10 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
     def is_pin_memory_available(cls):
         logger.warning("Pin memory is not supported on XPU.")
         return False
+
+    @classmethod
+    def get_current_memory_usage(cls,
+                                 device: Optional[torch.types.Device] = None
+                                 ) -> float:
+        torch.xpu.reset_peak_memory_stats(device)
+        return torch.xpu.max_memory_allocated(device)
diff --git a/vllm/utils.py b/vllm/utils.py
index 9a509da3c1ef1..7477e7028f5ef 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -710,13 +710,7 @@ def __init__(self, device: Optional[torch.types.Device] = None):
     def current_memory_usage(self) -> float:
         # Return the memory usage in bytes.
         from vllm.platforms import current_platform
-        if current_platform.is_cuda_alike():
-            torch.cuda.reset_peak_memory_stats(self.device)
-            mem = torch.cuda.max_memory_allocated(self.device)
-        elif current_platform.is_xpu():
-            torch.xpu.reset_peak_memory_stats(self.device)  # type: ignore
-            mem = torch.xpu.max_memory_allocated(self.device)  # type: ignore
-        return mem
+        return current_platform.get_current_memory_usage(self.device)
 
     def __enter__(self):
         self.initial_memory = self.current_memory_usage()

From b7ee940a828de9d339345e28eee8b13d60d97f26 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Tue, 14 Jan 2025 20:21:28 -0800
Subject: [PATCH 219/462] [V1][BugFix] Fix edge case in VLM scheduling (#12065)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
---
 vllm/v1/core/scheduler.py | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py
index f04e529891287..2503d136aea7e 100644
--- a/vllm/v1/core/scheduler.py
+++ b/vllm/v1/core/scheduler.py
@@ -373,18 +373,22 @@ def _try_schedule_encoder_inputs(
             if self.encoder_cache_manager.has_cache(request, i):
                 # The encoder input is already computed and cached.
                 continue
-            if not self.encoder_cache_manager.can_allocate(request, i):
-                # The encoder cache is full. We can only schedule the decoder
-                # tokens just before the encoder input.
-                num_new_tokens = start_pos - num_computed_tokens
-                break
-            if num_encoder_tokens > encoder_budget:
-                # The encoder budget is exhausted. We can only schedule the
-                # decoder tokens up until the encoder input.
-                # NOTE(woosuk): We assume that the encoder tokens should be
-                # processed altogether, as the encoder usually uses
+            if (not self.encoder_cache_manager.can_allocate(request, i)
+                    or num_encoder_tokens > encoder_budget):
+                # The encoder cache is full or the encoder budget is exhausted.
+                # NOTE(woosuk): We assume that the encoder input tokens should
+                # be processed altogether, as the encoder usually uses
                 # bidirectional attention.
-                num_new_tokens = start_pos - num_computed_tokens
+                if num_computed_tokens < start_pos:
+                    # We only schedule the decoder tokens just before the
+                    # encoder input.
+                    num_new_tokens = start_pos - num_computed_tokens
+                else:
+                    # Because of prefix caching, num_computed_tokens is greater
+                    # than start_pos even though its encoder input is not
+                    # available. In this case, we can't schedule any token for
+                    # the request in this step.
+                    num_new_tokens = 0
                 break
 
             encoder_budget -= num_encoder_tokens

From 0794e7446efca1fd7b8ea1cde96777897660cdea Mon Sep 17 00:00:00 2001
From: Elfie Guo <164945471+elfiegg@users.noreply.github.com>
Date: Tue, 14 Jan 2025 20:47:49 -0800
Subject: [PATCH 220/462] [Misc] Add multipstep chunked-prefill support for
 FlashInfer (#10467)

---
 csrc/prepare_inputs/advance_step.cu      |  10 ++
 tests/multi_step/test_correctness_llm.py |  17 +-
 vllm/attention/backends/flashinfer.py    |  29 ++-
 vllm/worker/model_runner.py              | 220 ++++++++++++-----------
 vllm/worker/multi_step_model_runner.py   |   2 +-
 5 files changed, 169 insertions(+), 109 deletions(-)

diff --git a/csrc/prepare_inputs/advance_step.cu b/csrc/prepare_inputs/advance_step.cu
index bd184ee22682e..c3902f4c2a163 100644
--- a/csrc/prepare_inputs/advance_step.cu
+++ b/csrc/prepare_inputs/advance_step.cu
@@ -95,6 +95,16 @@ __global__ void advance_step_flashinfer_kernel(
     long* input_positions_ptr, int* seq_lens_ptr, long* slot_mapping_ptr,
     int const* block_tables_ptr, int64_t const block_tables_stride,
     int* paged_kv_last_page_len_ptr, int* block_table_bound_ptr) {
+  int const n_pad = num_seqs - num_queries;
+  if (n_pad && blockIdx.x == 0) {
+    // Handle cuda graph padding
+    int const offset = num_queries;
+    for (int i = threadIdx.x; i < n_pad; i += blockDim.x) {
+      input_tokens_ptr[offset + i] = 0;
+      input_positions_ptr[offset + i] = 0;
+      slot_mapping_ptr[offset + i] = -1;
+    }
+  }
   int num_query_blocks = div_ceil(num_queries, num_threads);
 
   if (blockIdx.x < num_query_blocks) {
diff --git a/tests/multi_step/test_correctness_llm.py b/tests/multi_step/test_correctness_llm.py
index cc1fd19252019..34030d9d6ac60 100644
--- a/tests/multi_step/test_correctness_llm.py
+++ b/tests/multi_step/test_correctness_llm.py
@@ -5,6 +5,8 @@
 
 import pytest
 
+from tests.kernels.utils import override_backend_env_variable
+
 from ..models.utils import check_logprobs_close, check_outputs_equal
 
 MODELS = [
@@ -19,10 +21,11 @@
 @pytest.mark.parametrize("tp_size", [1])
 @pytest.mark.parametrize("enable_chunked_prefill", [False, True])
 @pytest.mark.parametrize("max_tokens", [5])
-@pytest.mark.parametrize("enforce_eager", [True])
+@pytest.mark.parametrize("enforce_eager", [True, False])
 @pytest.mark.parametrize("num_scheduler_steps", NUM_SCHEDULER_STEPS)
 @pytest.mark.parametrize("num_prompts", NUM_PROMPTS)
 @pytest.mark.parametrize("num_logprobs", [None, 5])
+@pytest.mark.parametrize("attention_backend", ["FLASH_ATTN", "FLASHINFER"])
 def test_multi_step_llm(
     hf_runner,
     vllm_runner,
@@ -36,6 +39,8 @@ def test_multi_step_llm(
     num_scheduler_steps: int,
     num_prompts: int,
     num_logprobs: Optional[int],
+    attention_backend: str,
+    monkeypatch,
 ) -> None:
     """Test vLLM engine with multi-step scheduling via sync LLM Engine.
 
@@ -63,6 +68,7 @@ def test_multi_step_llm(
       num_logprobs: corresponds to the `logprobs` argument to the OpenAI
                     completions endpoint; `None` -> 1 logprob returned.
     """
+    override_backend_env_variable(monkeypatch, attention_backend)
 
     prompts = example_prompts
     if len(prompts) < num_prompts:
@@ -114,6 +120,7 @@ def test_multi_step_llm(
 @pytest.mark.parametrize("num_scheduler_steps", NUM_SCHEDULER_STEPS)
 @pytest.mark.parametrize("num_prompts", NUM_PROMPTS)
 @pytest.mark.parametrize("num_logprobs,num_prompt_logprobs", [(5, 5)])
+@pytest.mark.parametrize("attention_backend", ["FLASH_ATTN"])
 def test_multi_step_llm_w_prompt_logprobs(
     vllm_runner,
     example_prompts,
@@ -126,6 +133,8 @@ def test_multi_step_llm_w_prompt_logprobs(
     num_prompts: int,
     num_logprobs: Optional[int],
     num_prompt_logprobs: Optional[int],
+    attention_backend: str,
+    monkeypatch,
 ) -> None:
     """Test prompt logprobs with multi-step scheduling via sync LLM Engine.
 
@@ -155,6 +164,7 @@ def test_multi_step_llm_w_prompt_logprobs(
                            note that this argument is not supported by the
                            OpenAI completions endpoint.
     """
+    override_backend_env_variable(monkeypatch, attention_backend)
 
     prompts = example_prompts
     if len(prompts) < num_prompts:
@@ -205,6 +215,7 @@ def test_multi_step_llm_w_prompt_logprobs(
 @pytest.mark.parametrize("num_scheduler_steps", NUM_SCHEDULER_STEPS)
 @pytest.mark.parametrize("num_prompts", NUM_PROMPTS)
 @pytest.mark.parametrize("num_logprobs", [None, 5])
+@pytest.mark.parametrize("attention_backend", ["FLASH_ATTN"])
 def test_multi_step_llm_chunked_prefill_prefix_cache(
     vllm_runner,
     example_prompts,
@@ -216,6 +227,8 @@ def test_multi_step_llm_chunked_prefill_prefix_cache(
     num_scheduler_steps: int,
     num_prompts: int,
     num_logprobs: Optional[int],
+    attention_backend: str,
+    monkeypatch,
 ) -> None:
     """Test vLLM engine with multi-step+"single-step chunked prefill"+APC.
 
@@ -278,6 +291,8 @@ def test_multi_step_llm_chunked_prefill_prefix_cache(
     #
     # The Incorrect scheduling behavior - if it occurs - will cause an exception
     # in the model runner resulting from `do_sample=False`.
+    override_backend_env_variable(monkeypatch, attention_backend)
+
     assert len(example_prompts) >= 2
     challenge_prompts = copy.deepcopy(example_prompts)
     challenge_prompts[0] = ('vLLM is a high-throughput and memory-efficient '
diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py
index a11462b2068a5..6ca75fabdfc38 100644
--- a/vllm/attention/backends/flashinfer.py
+++ b/vllm/attention/backends/flashinfer.py
@@ -256,7 +256,12 @@ def prepare_graph_input_buffers(self,
     def begin_forward(self, model_input):
         assert not self._is_graph_capturing
         state = self
-        if model_input.attn_metadata.use_cuda_graph:
+        use_cuda_graph = model_input.attn_metadata.use_cuda_graph
+        is_decode = model_input.attn_metadata.num_prefills == 0
+        # In case of multistep chunked-prefill, there might be prefill requests
+        # scheduled while CUDA graph mode is enabled. We don't run graph in that
+        # case.
+        if use_cuda_graph and is_decode:
             batch_size = model_input.input_tokens.shape[0]
             state = (self.runner.graph_runners[model_input.virtual_engine]
                      [batch_size].attn_state)
@@ -429,10 +434,24 @@ def advance_step(self,
         Update metadata in-place to advance one decode step.
         """
 
-        assert not turn_prefills_into_decodes, \
-            ("Chunked prefill is not supported with flashinfer yet."
-             "turn_prefills_into_decodes is a Multi-Step + Chunked-Prefill "
-             "specific parameter.")
+        if turn_prefills_into_decodes:
+            # When Multi-Step is enabled with Chunked-Prefill, prefills and
+            # decodes are scheduled together. In the first step, all the
+            # prefills turn into decodes. This update reflects that
+            # conversion.
+            assert self.num_decode_tokens + self.num_prefills == num_seqs
+            # Flashinfer doesn't support speculative decoding + chunked-prefill
+            # + multi-step scheduling yet.
+            assert self.decode_query_len == 1
+            self.num_decode_tokens += self.num_prefills
+            self.num_prefills = 0
+            self.num_prefill_tokens = 0
+            self.max_prefill_seq_len = 0
+            self.max_query_len = 1
+
+            self.slot_mapping = self.slot_mapping[:num_seqs]
+        else:
+            assert self.seq_lens_tensor is not None
 
         assert num_seqs > 0
         assert num_queries > 0
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 2b918483d3675..ae8b7f97c827d 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -5,6 +5,7 @@
 import time
 import warnings
 import weakref
+from contextlib import contextmanager
 from dataclasses import dataclass
 from typing import (TYPE_CHECKING, Any, Callable, Dict, List, Optional, Set,
                     Tuple, Type, TypeVar, Union)
@@ -1028,6 +1029,8 @@ def __init__(
 
         self.has_inner_state = model_config.has_inner_state
 
+        self.in_profile_run = False
+
         # When using CUDA graph, the input block tables must be padded to
         # max_seq_len_to_capture. However, creating the block table in
         # Python can be expensive. To optimize this, we cache the block table
@@ -1228,110 +1231,123 @@ def _prepare_model_input_tensors(
 
         return builder.build()  # type: ignore
 
+    @contextmanager
+    def set_in_profile_run(self):
+        self.in_profile_run = True
+        try:
+            yield
+        finally:
+            self.in_profile_run = False
+
     @torch.inference_mode()
     def profile_run(self) -> None:
-        # Enable top-k sampling to reflect the accurate memory usage.
-        sampling_params = SamplingParams(top_p=0.99, top_k=self.vocab_size - 1)
-        max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens
-        max_num_seqs = self.scheduler_config.max_num_seqs
-        # This represents the maximum number of different requests
-        # that will have unique loras, an therefore the max amount of memory
-        # consumption create dummy lora request copies from the lora request
-        # passed in, which contains a lora from the lora warmup path.
-        dummy_lora_requests: List[LoRARequest] = []
-        dummy_lora_requests_per_seq: List[LoRARequest] = []
-        if self.lora_config:
-            assert self.lora_manager is not None
-            with self.lora_manager.dummy_lora_cache():
-                for idx in range(self.lora_config.max_loras):
-                    lora_id = idx + 1
-                    dummy_lora_request = LoRARequest(
-                        lora_name=f"warmup_{lora_id}",
-                        lora_int_id=lora_id,
-                        lora_path="/not/a/real/path",
-                    )
-                    self.lora_manager.add_dummy_lora(dummy_lora_request,
-                                                     rank=LORA_WARMUP_RANK)
-                    dummy_lora_requests.append(dummy_lora_request)
-                dummy_lora_requests_per_seq = [
-                    dummy_lora_requests[idx % len(dummy_lora_requests)]
-                    for idx in range(max_num_seqs)
-                ]
-
-        # Profile memory usage with max_num_sequences sequences and the total
-        # number of tokens equal to max_num_batched_tokens.
-        seqs: List[SequenceGroupMetadata] = []
-        # Additional GPU memory may be needed for multi-modal encoding, which
-        # needs to be accounted for when calculating the GPU blocks for
-        # vLLM blocker manager.
-        # To exercise the worst scenario for GPU memory consumption,
-        # the number of seqs (batch_size) is chosen to maximize the number
-        # of images processed.
-
-        max_mm_tokens = self.mm_registry.get_max_multimodal_tokens(
-            self.model_config)
-        if max_mm_tokens > 0:
-            max_num_seqs_orig = max_num_seqs
-            max_num_seqs = min(max_num_seqs,
-                               max_num_batched_tokens // max_mm_tokens)
-            if max_num_seqs < 1:
-                expr = (f"min({max_num_seqs_orig}, "
-                        f"{max_num_batched_tokens} // {max_mm_tokens})")
-                logger.warning(
-                    "Computed max_num_seqs (%s) to be less than 1. "
-                    "Setting it to the minimum value of 1.", expr)
-                max_num_seqs = 1
-
-        batch_size = 0
-        for group_id in range(max_num_seqs):
-            seq_len = (max_num_batched_tokens // max_num_seqs +
-                       (group_id < max_num_batched_tokens % max_num_seqs))
-            batch_size += seq_len
-
-            dummy_data = self.input_registry \
-                .dummy_data_for_profiling(self.model_config,
-                                          seq_len,
-                                          self.mm_registry)
-
-            seq = SequenceGroupMetadata(
-                request_id=str(group_id),
-                is_prompt=True,
-                seq_data={group_id: dummy_data.seq_data},
-                sampling_params=sampling_params,
-                block_tables=None,
-                lora_request=dummy_lora_requests_per_seq[group_id]
-                if dummy_lora_requests_per_seq else None,
-                multi_modal_data=dummy_data.multi_modal_data,
-                multi_modal_placeholders=dummy_data.multi_modal_placeholders,
-            )
-            seqs.append(seq)
-
-        # Run the model with the dummy inputs.
-        num_layers = self.model_config.get_num_layers(self.parallel_config)
-        # use an empty tensor instead of `None`` to force Dynamo to pass
-        # it by reference, rather by specializing on the value ``None``.
-        # the `dtype` argument does not matter, and we use `float32` as
-        # a placeholder (it has wide hardware support).
-        # it is important to create tensors inside the loop, rather than
-        # multiplying the list, to avoid Dynamo from treating them as
-        # tensor aliasing.
-        kv_caches = [
-            torch.tensor([], dtype=torch.float32, device=self.device)
-            for _ in range(num_layers)
-        ]
-        finished_requests_ids = [seq.request_id for seq in seqs]
-        model_input = self.prepare_model_input(
-            seqs, finished_requests_ids=finished_requests_ids)
-        intermediate_tensors = None
-        if not get_pp_group().is_first_rank:
-            intermediate_tensors = self.model.make_empty_intermediate_tensors(
-                batch_size=batch_size,
-                dtype=self.model_config.dtype,
-                device=self.device)
-
-        self.execute_model(model_input, kv_caches, intermediate_tensors)
-        torch.cuda.synchronize()
-        return
+        with self.set_in_profile_run():
+            # Enable top-k sampling to reflect the accurate memory usage.
+            sampling_params = \
+                SamplingParams(top_p=0.99, top_k=self.vocab_size - 1)
+            max_num_batched_tokens = \
+                self.scheduler_config.max_num_batched_tokens
+            max_num_seqs = self.scheduler_config.max_num_seqs
+            # This represents the maximum number of different requests
+            # that will have unique loras, an therefore the max amount of memory
+            # consumption create dummy lora request copies from the lora request
+            # passed in, which contains a lora from the lora warmup path.
+            dummy_lora_requests: List[LoRARequest] = []
+            dummy_lora_requests_per_seq: List[LoRARequest] = []
+            if self.lora_config:
+                assert self.lora_manager is not None
+                with self.lora_manager.dummy_lora_cache():
+                    for idx in range(self.lora_config.max_loras):
+                        lora_id = idx + 1
+                        dummy_lora_request = LoRARequest(
+                            lora_name=f"warmup_{lora_id}",
+                            lora_int_id=lora_id,
+                            lora_path="/not/a/real/path",
+                        )
+                        self.lora_manager.add_dummy_lora(dummy_lora_request,
+                                                         rank=LORA_WARMUP_RANK)
+                        dummy_lora_requests.append(dummy_lora_request)
+                    dummy_lora_requests_per_seq = [
+                        dummy_lora_requests[idx % len(dummy_lora_requests)]
+                        for idx in range(max_num_seqs)
+                    ]
+
+            # Profile memory usage with max_num_sequences sequences and the
+            # total number of tokens equal to max_num_batched_tokens.
+            seqs: List[SequenceGroupMetadata] = []
+            # Additional GPU memory may be needed for multi-modal encoding,
+            # which needs to be accounted for when calculating the GPU blocks
+            # for vLLM blocker manager.
+            # To exercise the worst scenario for GPU memory consumption,
+            # the number of seqs (batch_size) is chosen to maximize the number
+            # of images processed.
+
+            max_mm_tokens = self.mm_registry.get_max_multimodal_tokens(
+                self.model_config)
+            if max_mm_tokens > 0:
+                max_num_seqs_orig = max_num_seqs
+                max_num_seqs = min(max_num_seqs,
+                                   max_num_batched_tokens // max_mm_tokens)
+                if max_num_seqs < 1:
+                    expr = (f"min({max_num_seqs_orig}, "
+                            f"{max_num_batched_tokens} // {max_mm_tokens})")
+                    logger.warning(
+                        "Computed max_num_seqs (%s) to be less than 1. "
+                        "Setting it to the minimum value of 1.", expr)
+                    max_num_seqs = 1
+
+            batch_size = 0
+            for group_id in range(max_num_seqs):
+                seq_len = (max_num_batched_tokens // max_num_seqs +
+                           (group_id < max_num_batched_tokens % max_num_seqs))
+                batch_size += seq_len
+
+                dummy_data = self.input_registry \
+                    .dummy_data_for_profiling(self.model_config,
+                                            seq_len,
+                                            self.mm_registry)
+
+                seq = SequenceGroupMetadata(
+                    request_id=str(group_id),
+                    is_prompt=True,
+                    seq_data={group_id: dummy_data.seq_data},
+                    sampling_params=sampling_params,
+                    block_tables=None,
+                    lora_request=dummy_lora_requests_per_seq[group_id]
+                    if dummy_lora_requests_per_seq else None,
+                    multi_modal_data=dummy_data.multi_modal_data,
+                    multi_modal_placeholders=dummy_data.
+                    multi_modal_placeholders,
+                )
+                seqs.append(seq)
+
+            # Run the model with the dummy inputs.
+            num_layers = self.model_config.get_num_layers(self.parallel_config)
+            # use an empty tensor instead of `None`` to force Dynamo to pass
+            # it by reference, rather by specializing on the value ``None``.
+            # the `dtype` argument does not matter, and we use `float32` as
+            # a placeholder (it has wide hardware support).
+            # it is important to create tensors inside the loop, rather than
+            # multiplying the list, to avoid Dynamo from treating them as
+            # tensor aliasing.
+            kv_caches = [
+                torch.tensor([], dtype=torch.float32, device=self.device)
+                for _ in range(num_layers)
+            ]
+            finished_requests_ids = [seq.request_id for seq in seqs]
+            model_input = self.prepare_model_input(
+                seqs, finished_requests_ids=finished_requests_ids)
+            intermediate_tensors = None
+            if not get_pp_group().is_first_rank:
+                intermediate_tensors = \
+                    self.model.make_empty_intermediate_tensors(
+                    batch_size=batch_size,
+                    dtype=self.model_config.dtype,
+                    device=self.device)
+
+            self.execute_model(model_input, kv_caches, intermediate_tensors)
+            torch.cuda.synchronize()
+            return
 
     def remove_all_loras(self):
         if not self.lora_manager:
diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py
index acce923498d7e..4aab09c80826b 100644
--- a/vllm/worker/multi_step_model_runner.py
+++ b/vllm/worker/multi_step_model_runner.py
@@ -32,7 +32,7 @@
 MULTI_STEP_ATTENTION_BACKENDS = [
     "FLASH_ATTN", "ROCM_FLASH", "FLASHINFER", "NO_ATTENTION"
 ]
-MULTI_STEP_CHUNKED_PREFILL_ATTENTION_BACKENDS = ["FLASH_ATTN"]
+MULTI_STEP_CHUNKED_PREFILL_ATTENTION_BACKENDS = ["FLASH_ATTN", "FLASHINFER"]
 
 def _get_supported_attention_backends(chunked_prefill_enabled: bool) \
     -> List[str]:

From f218f9c24d224800e0ea4488aa71bd8215c8bdcd Mon Sep 17 00:00:00 2001
From: Rui Qiao <161574667+ruisearch42@users.noreply.github.com>
Date: Tue, 14 Jan 2025 21:19:55 -0800
Subject: [PATCH 221/462] [core] Turn off GPU communication overlap for Ray
 executor (#12051)

Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
---
 vllm/envs.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/vllm/envs.py b/vllm/envs.py
index c4a568c680db0..b7b597ea15af3 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -45,7 +45,7 @@
     VLLM_USE_RAY_SPMD_WORKER: bool = False
     VLLM_USE_RAY_COMPILED_DAG: bool = False
     VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL: bool = True
-    VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM: bool = True
+    VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM: bool = False
     VLLM_WORKER_MULTIPROC_METHOD: str = "fork"
     VLLM_ASSETS_CACHE: str = os.path.join(VLLM_CACHE_ROOT, "assets")
     VLLM_IMAGE_FETCH_TIMEOUT: int = 5
@@ -340,11 +340,11 @@ def get_default_config_root():
     lambda: bool(int(os.getenv("VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL", "1"))
                  ),
 
-    # If the env var is set, it enables GPU communication overlap in
-    # Ray's compiled DAG. This flag is ignored if
+    # If the env var is set, it enables GPU communication overlap
+    # (experimental feature) in Ray's compiled DAG. This flag is ignored if
     # VLLM_USE_RAY_COMPILED_DAG is not set.
     "VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM":
-    lambda: bool(int(os.getenv("VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM", "1"))
+    lambda: bool(int(os.getenv("VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM", "0"))
                  ),
 
     # Use dedicated multiprocess context for workers.

From ad34c0df0f1b26b303a590133685b29e3daad20e Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Wed, 15 Jan 2025 13:45:21 +0800
Subject: [PATCH 222/462] [core] platform agnostic executor via collective_rpc
 (#11256)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 tests/engine/test_custom_executor.py          |  28 +-
 tests/engine/test_multiproc_workers.py        |  12 +-
 tests/test_utils.py                           |   6 +-
 vllm/config.py                                |  12 +-
 vllm/distributed/parallel_state.py            |   6 +-
 vllm/engine/async_llm_engine.py               |  88 +--
 vllm/engine/llm_engine.py                     |  94 +---
 vllm/engine/multiprocessing/engine.py         |  11 +-
 vllm/executor/cpu_executor.py                 | 299 ----------
 vllm/executor/distributed_gpu_executor.py     | 212 -------
 vllm/executor/executor_base.py                | 267 +++++++--
 vllm/executor/gpu_executor.py                 | 145 -----
 vllm/executor/hpu_executor.py                 | 202 -------
 ...executor.py => mp_distributed_executor.py} |  81 +--
 vllm/executor/multiproc_worker_utils.py       |  12 +-
 vllm/executor/multiproc_xpu_executor.py       |  26 -
 vllm/executor/neuron_executor.py              | 114 ----
 vllm/executor/openvino_executor.py            | 125 -----
 ...xecutor.py => ray_distributed_executor.py} | 240 ++++----
 vllm/executor/ray_hpu_executor.py             | 515 ------------------
 vllm/executor/ray_tpu_executor.py             | 343 ------------
 vllm/executor/ray_utils.py                    |  26 +-
 vllm/executor/ray_xpu_executor.py             |  40 --
 vllm/executor/tpu_executor.py                 | 142 -----
 vllm/executor/uniproc_executor.py             |  57 ++
 vllm/executor/xpu_executor.py                 |  39 --
 vllm/platforms/cpu.py                         |  27 +
 vllm/platforms/cuda.py                        |  22 +
 vllm/platforms/neuron.py                      |   8 +
 vllm/platforms/openvino.py                    |  12 +-
 vllm/platforms/tpu.py                         |  10 +
 vllm/platforms/xpu.py                         |  22 +-
 vllm/spec_decode/medusa_worker.py             |   8 +-
 vllm/spec_decode/multi_step_worker.py         |  21 +-
 vllm/spec_decode/spec_decode_worker.py        |   9 +-
 vllm/v1/executor/abstract.py                  |   5 +-
 vllm/v1/executor/multiproc_executor.py        |  17 +-
 vllm/v1/executor/uniproc_executor.py          |   2 +-
 vllm/v1/worker/gpu_worker.py                  |   3 +-
 vllm/worker/hpu_worker.py                     |  66 +++
 vllm/worker/neuron_worker.py                  |  28 +-
 vllm/worker/openvino_worker.py                |   6 +-
 vllm/worker/worker_base.py                    |  86 ++-
 43 files changed, 852 insertions(+), 2642 deletions(-)
 delete mode 100644 vllm/executor/cpu_executor.py
 delete mode 100644 vllm/executor/distributed_gpu_executor.py
 delete mode 100644 vllm/executor/gpu_executor.py
 delete mode 100644 vllm/executor/hpu_executor.py
 rename vllm/executor/{multiproc_gpu_executor.py => mp_distributed_executor.py} (75%)
 delete mode 100644 vllm/executor/multiproc_xpu_executor.py
 delete mode 100644 vllm/executor/neuron_executor.py
 delete mode 100644 vllm/executor/openvino_executor.py
 rename vllm/executor/{ray_gpu_executor.py => ray_distributed_executor.py} (78%)
 delete mode 100644 vllm/executor/ray_hpu_executor.py
 delete mode 100644 vllm/executor/ray_tpu_executor.py
 delete mode 100644 vllm/executor/ray_xpu_executor.py
 delete mode 100644 vllm/executor/tpu_executor.py
 create mode 100644 vllm/executor/uniproc_executor.py
 delete mode 100644 vllm/executor/xpu_executor.py

diff --git a/tests/engine/test_custom_executor.py b/tests/engine/test_custom_executor.py
index bbabb936e92ba..2a057ca488a50 100644
--- a/tests/engine/test_custom_executor.py
+++ b/tests/engine/test_custom_executor.py
@@ -1,12 +1,13 @@
 import asyncio
 import os
+from typing import Any, Dict, List, Optional, Tuple
 
 import pytest
 
 from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.engine.llm_engine import LLMEngine
-from vllm.executor.gpu_executor import GPUExecutor, GPUExecutorAsync
+from vllm.executor.uniproc_executor import UniProcExecutor
 from vllm.sampling_params import SamplingParams
 
 
@@ -14,21 +15,20 @@ class Mock:
     ...
 
 
-class CustomGPUExecutor(GPUExecutor):
+class CustomUniExecutor(UniProcExecutor):
 
-    def execute_model(self, *args, **kwargs):
+    def collective_rpc(self,
+                       method: str,
+                       timeout: Optional[float] = None,
+                       args: Tuple = (),
+                       kwargs: Optional[Dict] = None) -> List[Any]:
         # Drop marker to show that this was ran
         with open(".marker", "w"):
             ...
-        return super().execute_model(*args, **kwargs)
+        return super().collective_rpc(method, timeout, args, kwargs)
 
 
-class CustomGPUExecutorAsync(GPUExecutorAsync):
-
-    async def execute_model_async(self, *args, **kwargs):
-        with open(".marker", "w"):
-            ...
-        return await super().execute_model_async(*args, **kwargs)
+CustomUniExecutorAsync = CustomUniExecutor
 
 
 @pytest.mark.parametrize("model", ["facebook/opt-125m"])
@@ -41,10 +41,6 @@ def test_custom_executor_type_checking(model):
         engine_args = AsyncEngineArgs(model=model,
                                       distributed_executor_backend=Mock)
         AsyncLLMEngine.from_engine_args(engine_args)
-    with pytest.raises(TypeError):
-        engine_args = AsyncEngineArgs(
-            model=model, distributed_executor_backend=CustomGPUExecutor)
-        AsyncLLMEngine.from_engine_args(engine_args)
 
 
 @pytest.mark.parametrize("model", ["facebook/opt-125m"])
@@ -55,7 +51,7 @@ def test_custom_executor(model, tmp_path):
         assert not os.path.exists(".marker")
 
         engine_args = EngineArgs(
-            model=model, distributed_executor_backend=CustomGPUExecutor)
+            model=model, distributed_executor_backend=CustomUniExecutor)
         engine = LLMEngine.from_engine_args(engine_args)
         sampling_params = SamplingParams(max_tokens=1)
 
@@ -75,7 +71,7 @@ def test_custom_executor_async(model, tmp_path):
         assert not os.path.exists(".marker")
 
         engine_args = AsyncEngineArgs(
-            model=model, distributed_executor_backend=CustomGPUExecutorAsync)
+            model=model, distributed_executor_backend=CustomUniExecutorAsync)
         engine = AsyncLLMEngine.from_engine_args(engine_args)
         sampling_params = SamplingParams(max_tokens=1)
 
diff --git a/tests/engine/test_multiproc_workers.py b/tests/engine/test_multiproc_workers.py
index e07dd6deef5bf..db70a808c008b 100644
--- a/tests/engine/test_multiproc_workers.py
+++ b/tests/engine/test_multiproc_workers.py
@@ -6,16 +6,15 @@
 
 import pytest
 
+from vllm.config import VllmConfig
 from vllm.executor.multiproc_worker_utils import (ProcessWorkerWrapper,
                                                   ResultHandler, WorkerMonitor)
+from vllm.worker.worker_base import WorkerWrapperBase
 
 
-class DummyWorker:
+class DummyWorkerWrapper(WorkerWrapperBase):
     """Dummy version of vllm.worker.worker.Worker"""
 
-    def __init__(self, rank: int):
-        self.rank = rank
-
     def worker_method(self, worker_input: Any) -> Tuple[int, Any]:
         sleep(0.05)
 
@@ -28,9 +27,10 @@ def worker_method(self, worker_input: Any) -> Tuple[int, Any]:
 
 def _start_workers() -> Tuple[List[ProcessWorkerWrapper], WorkerMonitor]:
     result_handler = ResultHandler()
+    vllm_config = VllmConfig()
     workers = [
-        ProcessWorkerWrapper(result_handler, partial(DummyWorker, rank=rank))
-        for rank in range(8)
+        ProcessWorkerWrapper(result_handler, DummyWorkerWrapper, vllm_config,
+                             rank) for rank in range(8)
     ]
 
     worker_monitor = WorkerMonitor(workers, result_handler)
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 6810e0302f897..c68d730af7f8a 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -2,6 +2,7 @@
 import os
 import socket
 from typing import AsyncIterator, Tuple
+from unittest.mock import patch
 
 import pytest
 import torch
@@ -390,7 +391,10 @@ def test_bind_kv_cache_encoder_decoder():
 
 
 def test_bind_kv_cache_pp():
-    cfg = VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=2))
+    with patch("vllm.utils.cuda_device_count_stateless", lambda: 2):
+        # this test runs with 1 GPU, but we simulate 2 GPUs
+        cfg = VllmConfig(
+            parallel_config=ParallelConfig(pipeline_parallel_size=2))
     with set_current_vllm_config(cfg):
         from vllm.attention import Attention
 
diff --git a/vllm/config.py b/vllm/config.py
index 59b509d5a961e..4a42aefb75026 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1294,8 +1294,11 @@ def __post_init__(self) -> None:
             from vllm.executor import ray_utils
             backend = "mp"
             ray_found = ray_utils.ray_is_available()
-            if (current_platform.is_cuda()
-                    and cuda_device_count_stateless() < self.world_size):
+            if current_platform.is_neuron():
+                # neuron uses single process to control multiple devices
+                backend = "uni"
+            elif (current_platform.is_cuda()
+                  and cuda_device_count_stateless() < self.world_size):
                 if not ray_found:
                     raise ValueError("Unable to load Ray which is "
                                      "required for multi-node inference, "
@@ -1328,13 +1331,14 @@ def _verify_args(self) -> None:
         from vllm.executor.executor_base import ExecutorBase
         from vllm.platforms import current_platform
         if self.distributed_executor_backend not in (
-                "ray", "mp", None) and not (isinstance(
+                "ray", "mp", "uni", None) and not (isinstance(
                     self.distributed_executor_backend, type) and issubclass(
                         self.distributed_executor_backend, ExecutorBase)):
             raise ValueError(
                 "Unrecognized distributed executor backend "
                 f"{self.distributed_executor_backend}. Supported "
-                "values are 'ray', 'mp' or custom ExecutorBase subclass.")
+                "values are 'ray', 'mp' 'uni', or custom ExecutorBase"
+                " subclass.")
         if self.use_ray:
             from vllm.executor import ray_utils
             ray_utils.assert_ray_available()
diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index be7f16ef52a47..bf8b30cccd5f6 100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -862,12 +862,14 @@ def init_model_parallel_group(
 ) -> GroupCoordinator:
     if use_custom_allreduce is None:
         use_custom_allreduce = _ENABLE_CUSTOM_ALL_REDUCE
+    from vllm.platforms import current_platform
     return GroupCoordinator(
         group_ranks=group_ranks,
         local_rank=local_rank,
         torch_distributed_backend=backend,
-        use_pynccl=True,
-        use_custom_allreduce=use_custom_allreduce,
+        use_pynccl=current_platform.is_cuda_alike(),
+        use_custom_allreduce=current_platform.is_cuda_alike()
+        and use_custom_allreduce,
         use_tpu_communicator=True,
         use_hpu_communicator=True,
         use_xpu_communicator=True,
diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index da23ed19ef7be..08fef8250d483 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -18,9 +18,7 @@
 from vllm.engine.llm_engine import LLMEngine, SchedulerOutputState
 from vllm.engine.metrics_types import StatLoggerBase
 from vllm.engine.protocol import EngineClient
-from vllm.executor.executor_base import ExecutorAsyncBase
-from vllm.executor.gpu_executor import GPUExecutorAsync
-from vllm.executor.ray_utils import initialize_ray_cluster
+from vllm.executor.executor_base import ExecutorBase
 from vllm.inputs import PromptType
 from vllm.inputs.preprocess import InputPreprocessor
 from vllm.logger import init_logger
@@ -620,69 +618,9 @@ def __del__(self):
             rt.new_requests_event.set()
 
     @classmethod
-    def _get_executor_cls(
-            cls, engine_config: VllmConfig) -> Type[ExecutorAsyncBase]:
-        distributed_executor_backend = (
-            engine_config.parallel_config.distributed_executor_backend)
-        if isinstance(distributed_executor_backend, type):
-            if not issubclass(distributed_executor_backend, ExecutorAsyncBase):
-                raise TypeError(
-                    "distributed_executor_backend must be a subclass of "
-                    f"ExecutorAsyncBase. Got {distributed_executor_backend}.")
-            executor_class = distributed_executor_backend
-        elif engine_config.device_config.device_type == "neuron":
-            from vllm.executor.neuron_executor import NeuronExecutorAsync
-            executor_class = NeuronExecutorAsync
-        elif engine_config.device_config.device_type == "tpu":
-            if distributed_executor_backend == "ray":
-                from vllm.executor.ray_tpu_executor import RayTPUExecutorAsync
-                executor_class = RayTPUExecutorAsync
-            else:
-                assert distributed_executor_backend is None
-                from vllm.executor.tpu_executor import TPUExecutorAsync
-                executor_class = TPUExecutorAsync
-        elif engine_config.device_config.device_type == "cpu":
-            from vllm.executor.cpu_executor import CPUExecutorAsync
-            executor_class = CPUExecutorAsync
-        elif engine_config.device_config.device_type == "hpu":
-            if distributed_executor_backend == "ray":
-                initialize_ray_cluster(engine_config.parallel_config)
-                from vllm.executor.ray_hpu_executor import RayHPUExecutorAsync
-                executor_class = RayHPUExecutorAsync
-            else:
-                from vllm.executor.hpu_executor import HPUExecutorAsync
-                executor_class = HPUExecutorAsync
-        elif engine_config.device_config.device_type == "openvino":
-            assert distributed_executor_backend is None, (
-                "Distributed execution is not supported with "
-                "the OpenVINO backend.")
-            from vllm.executor.openvino_executor import OpenVINOExecutorAsync
-            executor_class = OpenVINOExecutorAsync
-        elif engine_config.device_config.device_type == "xpu":
-            if distributed_executor_backend is None:
-                from vllm.executor.xpu_executor import XPUExecutorAsync
-                executor_class = XPUExecutorAsync
-            elif distributed_executor_backend == "ray":
-                from vllm.executor.ray_xpu_executor import RayXPUExecutorAsync
-                executor_class = RayXPUExecutorAsync
-            elif distributed_executor_backend == "mp":
-                from vllm.executor.multiproc_xpu_executor import (
-                    MultiprocessingXPUExecutorAsync)
-                executor_class = MultiprocessingXPUExecutorAsync
-            else:
-                raise RuntimeError(
-                    "Not supported distributed execution model on XPU device.")
-        elif distributed_executor_backend == "ray":
-            from vllm.executor.ray_gpu_executor import RayGPUExecutorAsync
-            executor_class = RayGPUExecutorAsync
-        elif distributed_executor_backend == "mp":
-            from vllm.executor.multiproc_gpu_executor import (
-                MultiprocessingGPUExecutorAsync)
-            executor_class = MultiprocessingGPUExecutorAsync
-        else:
-            from vllm.executor.gpu_executor import GPUExecutorAsync
-            executor_class = GPUExecutorAsync
-        return executor_class
+    def _get_executor_cls(cls,
+                          engine_config: VllmConfig) -> Type[ExecutorBase]:
+        return LLMEngine._get_executor_cls(engine_config)
 
     @classmethod
     def from_engine_args(
@@ -700,9 +638,6 @@ def from_engine_args(
 
         executor_class = cls._get_executor_cls(engine_config)
 
-        if executor_class.uses_ray:
-            initialize_ray_cluster(engine_config.parallel_config)
-
         # Create the async LLM engine.
         engine = cls(
             vllm_config=engine_config,
@@ -1242,23 +1177,12 @@ def remove_logger(self, logger_name: str) -> None:
         self.engine.remove_logger(logger_name=logger_name)
 
     async def start_profile(self) -> None:
-        # using type instead of isinstance to check to avoid capturing
-        # inherited classes
-        if type(self.engine.model_executor) == GPUExecutorAsync:  # noqa: E721
-            self.engine.model_executor.start_profile()
-        else:
-            self.engine.model_executor._run_workers("start_profile")
+        self.engine.start_profile()
 
     async def stop_profile(self) -> None:
-        # using type instead of isinstance to check to avoid capturing
-        # inherited classes
-        if type(self.engine.model_executor) == GPUExecutorAsync:  # noqa: E721
-            self.engine.model_executor.stop_profile()
-        else:
-            self.engine.model_executor._run_workers("stop_profile")
+        self.engine.stop_profile()
 
     async def add_lora(self, lora_request: LoRARequest) -> None:
-        """Load a new LoRA adapter into the engine for future requests."""
         self.engine.add_lora(lora_request)
 
 
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 1db3e59ff3bae..49a1e9f505d9f 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -28,8 +28,6 @@
 from vllm.entrypoints.openai.logits_processors import (
     get_logits_processors as get_openai_logits_processors)
 from vllm.executor.executor_base import ExecutorBase
-from vllm.executor.gpu_executor import GPUExecutor
-from vllm.executor.ray_utils import initialize_ray_cluster
 from vllm.inputs import (INPUT_REGISTRY, InputRegistry, ProcessorInputs,
                          PromptType, SingletonInputsAdapter)
 from vllm.inputs.parse import is_encoder_decoder_inputs, is_token_prompt
@@ -442,64 +440,26 @@ def _get_executor_cls(cls,
                 raise TypeError(
                     "distributed_executor_backend must be a subclass of "
                     f"ExecutorBase. Got {distributed_executor_backend}.")
-            if distributed_executor_backend.uses_ray:  # type: ignore
-                initialize_ray_cluster(engine_config.parallel_config)
             executor_class = distributed_executor_backend
-        elif engine_config.device_config.device_type == "neuron":
-            from vllm.executor.neuron_executor import NeuronExecutor
-            executor_class = NeuronExecutor
-        elif engine_config.device_config.device_type == "tpu":
+        elif engine_config.parallel_config.world_size > 1:
             if distributed_executor_backend == "ray":
-                initialize_ray_cluster(engine_config.parallel_config)
-                from vllm.executor.ray_tpu_executor import RayTPUExecutor
-                executor_class = RayTPUExecutor
-            else:
-                assert distributed_executor_backend is None
-                from vllm.executor.tpu_executor import TPUExecutor
-                executor_class = TPUExecutor
-        elif engine_config.device_config.device_type == "cpu":
-            from vllm.executor.cpu_executor import CPUExecutor
-            executor_class = CPUExecutor
-        elif engine_config.device_config.device_type == "hpu":
-            if distributed_executor_backend == "ray":
-                initialize_ray_cluster(engine_config.parallel_config)
-                from vllm.executor.ray_hpu_executor import RayHPUExecutor
-                executor_class = RayHPUExecutor
-            else:
-                from vllm.executor.hpu_executor import HPUExecutor
-                executor_class = HPUExecutor
-        elif engine_config.device_config.device_type == "openvino":
-            from vllm.executor.openvino_executor import OpenVINOExecutor
-            executor_class = OpenVINOExecutor
-        elif engine_config.device_config.device_type == "xpu":
-            if distributed_executor_backend == "ray":
-                initialize_ray_cluster(engine_config.parallel_config)
-                from vllm.executor.ray_xpu_executor import RayXPUExecutor
-                executor_class = RayXPUExecutor
+                from vllm.executor.ray_distributed_executor import (
+                    RayDistributedExecutor)
+                executor_class = RayDistributedExecutor
             elif distributed_executor_backend == "mp":
-                # FIXME(kunshang):
-                # spawn needs calling `if __name__ == '__main__':``
-                # fork is not supported for xpu start new process.
-                logger.error(
-                    "Both start methods (spawn and fork) have issue "
-                    "on XPU if you use mp backend, Please try ray instead.")
-            else:
-                from vllm.executor.xpu_executor import XPUExecutor
-                executor_class = XPUExecutor
-        elif distributed_executor_backend == "ray":
-            initialize_ray_cluster(engine_config.parallel_config)
-            from vllm.executor.ray_gpu_executor import RayGPUExecutor
-            executor_class = RayGPUExecutor
-        elif distributed_executor_backend == "mp":
-            from vllm.executor.multiproc_gpu_executor import (
-                MultiprocessingGPUExecutor)
-            assert not envs.VLLM_USE_RAY_SPMD_WORKER, (
-                "multiprocessing distributed executor backend does not "
-                "support VLLM_USE_RAY_SPMD_WORKER=1")
-            executor_class = MultiprocessingGPUExecutor
+                from vllm.executor.mp_distributed_executor import (
+                    MultiprocessingDistributedExecutor)
+                assert not envs.VLLM_USE_RAY_SPMD_WORKER, (
+                    "multiprocessing distributed executor backend does not "
+                    "support VLLM_USE_RAY_SPMD_WORKER=1")
+                executor_class = MultiprocessingDistributedExecutor
+            elif distributed_executor_backend == "uni":
+                # JAX-style, single-process, multi-device executor.
+                from vllm.executor.uniproc_executor import UniProcExecutor
+                executor_class = UniProcExecutor
         else:
-            from vllm.executor.gpu_executor import GPUExecutor
-            executor_class = GPUExecutor
+            from vllm.executor.uniproc_executor import UniProcExecutor
+            executor_class = UniProcExecutor
         return executor_class
 
     @classmethod
@@ -1845,27 +1805,17 @@ def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
     def list_prompt_adapters(self) -> List[int]:
         return self.model_executor.list_prompt_adapters()
 
+    def start_profile(self) -> None:
+        self.model_executor.start_profile()
+
+    def stop_profile(self) -> None:
+        self.model_executor.stop_profile()
+
     def check_health(self) -> None:
         if self.tokenizer:
             self.tokenizer.check_health()
         self.model_executor.check_health()
 
-    def start_profile(self) -> None:
-        # using type instead of isinstance to check to avoid capturing
-        # inherited classes (MultiprocessingGPUExecutor)
-        if type(self.model_executor) == GPUExecutor:  # noqa: E721
-            self.model_executor.start_profile()
-        else:
-            self.model_executor._run_workers("start_profile")
-
-    def stop_profile(self) -> None:
-        # using type instead of isinstance to check to avoid capturing
-        # inherited classes (MultiprocessingGPUExecutor)
-        if type(self.model_executor) == GPUExecutor:  # noqa: E721
-            self.model_executor.stop_profile()
-        else:
-            self.model_executor._run_workers("stop_profile")
-
     def is_tracing_enabled(self) -> bool:
         return self.tracer is not None
 
diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py
index 36f4df4b02731..8f231de912c95 100644
--- a/vllm/engine/multiprocessing/engine.py
+++ b/vllm/engine/multiprocessing/engine.py
@@ -20,7 +20,6 @@
                                          RPCStartupResponse,
                                          RPCUProfileRequest)
 # yapf: enable
-from vllm.executor.gpu_executor import GPUExecutor
 from vllm.logger import init_logger
 from vllm.outputs import RequestOutput
 from vllm.usage.usage_lib import UsageContext
@@ -356,16 +355,10 @@ def _set_errored(self, e: BaseException):
             self._errored_with = e
 
     def start_profile(self) -> None:
-        if type(self.engine.model_executor) is GPUExecutor:
-            self.engine.model_executor.start_profile()
-        else:
-            self.engine.model_executor._run_workers("start_profile")
+        self.engine.start_profile()
 
     def stop_profile(self) -> None:
-        if type(self.engine.model_executor) is GPUExecutor:
-            self.engine.model_executor.stop_profile()
-        else:
-            self.engine.model_executor._run_workers("stop_profile")
+        self.engine.stop_profile()
 
 
 def signal_handler(*_) -> None:
diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py
deleted file mode 100644
index b9a6bee5720fd..0000000000000
--- a/vllm/executor/cpu_executor.py
+++ /dev/null
@@ -1,299 +0,0 @@
-import os
-from functools import partial
-from typing import Any, Awaitable, List, Optional, Set, Tuple, Union
-
-from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
-from vllm.executor.multiproc_worker_utils import (ProcessWorkerWrapper,
-                                                  ResultHandler, WorkerMonitor)
-from vllm.logger import init_logger
-from vllm.lora.request import LoRARequest
-from vllm.model_executor.layers.sampler import SamplerOutput
-from vllm.prompt_adapter.request import PromptAdapterRequest
-from vllm.sequence import ExecuteModelRequest
-from vllm.utils import get_distributed_init_method, get_open_port, make_async
-from vllm.worker.worker_base import WorkerWrapperBase
-
-logger = init_logger(__name__)
-
-
-class CPUExecutor(ExecutorBase):
-
-    uses_ray: bool = False
-
-    def _init_executor(self) -> None:
-        assert self.device_config.device_type == "cpu"
-
-        #
-        # Environment variables for CPU executor
-        #
-
-        # Disable torch async compiling which won't work with daemonic processes
-        os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1"
-
-        # Intel OpenMP setting
-        ld_prealod_str = os.getenv("LD_PRELOAD", "")
-        if "libiomp5.so" in ld_prealod_str:
-            # The time(milliseconds) that a thread should wait after
-            # completing the execution of a parallel region, before sleeping.
-            os.environ['KMP_BLOCKTIME'] = "1"
-            # Prevents the CPU to run into low performance state
-            os.environ['KMP_TPAUSE'] = "0"
-            # Provides fine granularity parallelism
-            os.environ['KMP_FORKJOIN_BARRIER_PATTERN'] = "dist,dist"
-            os.environ['KMP_PLAIN_BARRIER_PATTERN'] = "dist,dist"
-            os.environ['KMP_REDUCTION_BARRIER_PATTERN'] = "dist,dist"
-
-        # To hint IPEX uses shared memory based AllReduce
-        os.environ["LOCAL_WORLD_SIZE"] = str(
-            self.parallel_config.tensor_parallel_size)
-
-        # Multiprocessing-based executor does not support multi-node setting.
-        # Since it only works for single node, we can use the loopback address
-        # 127.0.0.1 for communication.
-        ip = "127.0.0.1"
-        port = get_open_port()
-        self.distributed_init_method = get_distributed_init_method(ip, port)
-
-        is_async = isinstance(self, CPUExecutorAsync)
-
-        world_size = self.parallel_config.tensor_parallel_size
-        result_handler = ResultHandler()
-        self.parallel_worker_tasks: Optional[Union[Any, Awaitable[Any]]] = None
-        self.workers = []
-
-        if is_async:
-            self.workers = [
-                ProcessWorkerWrapper(
-                    result_handler,
-                    partial(
-                        self._create_worker,
-                        rank=rank,
-                        local_rank=rank,
-                    )) for rank in range(0, world_size)
-            ]
-            self.driver_worker = self.workers[0]
-            self.workers = self.workers[1:]
-            self.driver_method_invoker = _async_driver_method_invoker
-        else:
-            self.driver_worker = self._create_worker()
-            self.driver_method_invoker = _driver_method_invoker
-
-            if world_size != 1:
-                self.workers = [
-                    ProcessWorkerWrapper(
-                        result_handler,
-                        partial(
-                            self._create_worker,
-                            rank=rank,
-                            local_rank=rank,
-                        )) for rank in range(1, world_size)
-                ]
-
-        self.worker_monitor = None
-        if world_size != 1 or is_async:
-            if is_async:
-                async_worker_list = self.workers + [self.driver_worker]
-            else:
-                async_worker_list = self.workers
-            self.worker_monitor = WorkerMonitor(async_worker_list,
-                                                result_handler)
-            result_handler.start()
-            self.worker_monitor.start()
-
-        self._run_workers("init_device")
-        self._run_workers("load_model")
-
-    def _create_worker(
-        self,
-        local_rank: int = 0,
-        rank: int = 0,
-    ):
-
-        wrapper = WorkerWrapperBase(vllm_config=self.vllm_config)
-
-        assert self.distributed_init_method is not None
-
-        kwargs = dict(
-            vllm_config=self.vllm_config,
-            local_rank=local_rank,
-            rank=rank,
-            distributed_init_method=self.distributed_init_method,
-            kv_cache_dtype=self.cache_config.cache_dtype,
-            is_driver_worker=rank == 0,
-        )
-        wrapper.init_worker(**kwargs)
-
-        return wrapper.worker
-
-    def _run_workers(
-        self,
-        method: str,
-        *args,
-        async_run_remote_workers_only: bool = False,
-        max_concurrent_workers: Optional[int] = None,
-        **kwargs,
-    ) -> Any:
-        """Runs the given method on all workers.
-
-        Args:
-            async_run_remote_workers_only: If True the method will be run only
-                in the remote workers, not the driver worker. It will also be
-                run asynchronously and return a list of futures rather than
-                blocking on the results.
-        """
-
-        if max_concurrent_workers:
-            raise NotImplementedError(
-                "max_concurrent_workers is not supported yet.")
-
-        # Start the workers first.
-        worker_outputs = [
-            worker.execute_method(method, *args, **kwargs)
-            for worker in self.workers
-        ]
-
-        if async_run_remote_workers_only:
-            # Just return futures
-            return worker_outputs
-
-        driver_worker_output = self.driver_method_invoker(
-            self.driver_worker, method, *args, **kwargs)
-
-        # Get the results of the workers.
-        return [driver_worker_output
-                ] + [output.get() for output in worker_outputs]
-
-    def determine_num_available_blocks(self) -> Tuple[int, int]:
-        """Determine the number of available KV blocks by invoking the
-        underlying worker.
-        """
-        return self.driver_method_invoker(self.driver_worker,
-                                          "determine_num_available_blocks")
-
-    def initialize_cache(self, num_gpu_blocks: int,
-                         num_cpu_blocks: int) -> None:
-        """Initialize the KV cache by invoking the underlying worker.
-        """
-        # NOTE: We log here to avoid multiple logs when number of workers is
-        # greater than one. We could log in the engine, but not all executors
-        # have GPUs.
-        # NOTE: `cpu block` for CPU backend is located on CPU memory but is
-        # referred as `gpu block`. Because we want to reuse the existing block
-        # management procedure.
-        logger.info("# CPU blocks: %d", num_gpu_blocks)
-
-        self._run_workers("initialize_cache",
-                          num_gpu_blocks=num_gpu_blocks,
-                          num_cpu_blocks=num_cpu_blocks)
-
-    def execute_model(
-            self,
-            execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
-        if (self.parallel_config.tensor_parallel_size > 1
-                and self.parallel_worker_tasks is None):
-            self.parallel_worker_tasks = self._run_workers(
-                "start_worker_execution_loop",
-                async_run_remote_workers_only=True,
-            )
-        output = self.driver_method_invoker(self.driver_worker,
-                                            "execute_model", execute_model_req)
-        return output
-
-    def stop_remote_worker_execution_loop(self) -> None:
-        if self.parallel_worker_tasks is None:
-            return
-        """
-        Passing None will cause the driver to stop the model execution
-        loop running in each of the remote workers.
-        """
-        self.driver_method_invoker(self.driver_worker, "execute_model", None)
-        parallel_worker_tasks = self.parallel_worker_tasks
-        self.parallel_worker_tasks = None
-        # Ensure that workers exit model loop cleanly
-        # (this will raise otherwise)
-        self._wait_for_tasks_completion(parallel_worker_tasks)
-
-    def add_lora(self, lora_request: LoRARequest) -> bool:
-        return all(self._run_workers("add_lora", lora_request))
-
-    def remove_lora(self, lora_id: int) -> bool:
-        return all(self._run_workers("remove_lora", lora_id))
-
-    def pin_lora(self, lora_id: int) -> bool:
-        assert lora_id > 0, "lora_id must be greater than 0."
-        return all(self._run_workers(
-            "pin_lora",
-            lora_id=lora_id,
-        ))
-
-    def list_loras(self) -> Set[int]:
-        return self.driver_method_invoker(self.driver_worker, "list_loras")
-
-    def add_prompt_adapter(
-            self, prompt_adapter_request: PromptAdapterRequest) -> bool:
-        return all(
-            self._run_workers(
-                "add_prompt_adapter",
-                prompt_adapter_request,
-            ))
-
-    def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
-        return all(
-            self._run_workers(
-                "remove_prompt_adapter",
-                prompt_adapter_id,
-            ))
-
-    def list_prompt_adapters(self) -> Set[int]:
-        return self.driver_method_invoker(self.driver_worker,
-                                          "list_prompt_adapters")
-
-    def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool:
-        return all(self._run_workers(
-            "pin_prompt_adapter",
-            prompt_adapter_id,
-        ))
-
-    def check_health(self) -> None:
-        """Raises an error if engine is unhealthy."""
-        if self.worker_monitor is not None and not self.worker_monitor.is_alive(
-        ):
-            raise RuntimeError("Worker processes are not running")
-
-    def shutdown(self):
-        if (worker_monitor := getattr(self, "worker_monitor",
-                                      None)) is not None:
-            worker_monitor.close()
-
-    def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None:
-        """Wait for futures returned from _run_workers() with
-        async_run_remote_workers_only to complete."""
-        for result in parallel_worker_tasks:
-            result.get()
-
-    def start_profile(self) -> None:
-        self.driver_method_invoker(self.driver_worker, "start_profile")
-
-    def stop_profile(self) -> None:
-        self.driver_method_invoker(self.driver_worker, "stop_profile")
-
-
-class CPUExecutorAsync(CPUExecutor, ExecutorAsyncBase):
-
-    async def execute_model_async(
-            self,
-            execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
-        output = await make_async(self.execute_model
-                                  )(execute_model_req=execute_model_req, )
-        return output
-
-    async def check_health_async(self) -> None:
-        self.check_health()
-
-
-def _driver_method_invoker(driver, method: str, *args, **kwargs):
-    return getattr(driver, method)(*args, **kwargs)
-
-
-def _async_driver_method_invoker(driver, method: str, *args, **kwargs):
-    return driver.execute_method(method, *args, **kwargs).get()
diff --git a/vllm/executor/distributed_gpu_executor.py b/vllm/executor/distributed_gpu_executor.py
deleted file mode 100644
index deb7cb1c97ef5..0000000000000
--- a/vllm/executor/distributed_gpu_executor.py
+++ /dev/null
@@ -1,212 +0,0 @@
-import asyncio
-from abc import abstractmethod
-from typing import Any, Awaitable, Dict, List, Optional, Set, Tuple, Union
-
-from vllm.executor.executor_base import ExecutorAsyncBase
-from vllm.executor.gpu_executor import GPUExecutor
-from vllm.logger import init_logger
-from vllm.lora.request import LoRARequest
-from vllm.model_executor.layers.sampler import SamplerOutput
-from vllm.sequence import ExecuteModelRequest
-
-logger = init_logger(__name__)
-
-
-class DistributedGPUExecutor(GPUExecutor):
-    """Abstract superclass of multi-GPU executor implementations."""
-
-    def __init__(self, *args, **kwargs):
-        # This is non-None when the execute model loop is running
-        # in the parallel workers. It's a coroutine in the AsyncLLMEngine case.
-        self.parallel_worker_tasks: Optional[Union[Any, Awaitable[Any]]] = None
-        # Updated by implementations that require additional args to be passed
-        # to the _run_workers execute_model call
-        self.extra_execute_model_run_workers_kwargs: Dict[str, Any] = {}
-
-        super().__init__(*args, **kwargs)
-
-    def determine_num_available_blocks(self) -> Tuple[int, int]:
-        """Determine the number of available KV blocks.
-
-        This invokes `determine_num_available_blocks` on each worker and takes
-        the min of the results, guaranteeing that the selected cache sizes are
-        compatible with all workers.
-
-        Returns:
-            - tuple[num_gpu_blocks, num_cpu_blocks]
-        """
-        # Get the maximum number of blocks that can be allocated on GPU and CPU.
-        num_blocks = self._run_workers("determine_num_available_blocks", )
-
-        # Since we use a shared centralized controller, we take the minimum
-        # number of blocks across all workers to make sure all the memory
-        # operators can be applied to all workers.
-        num_gpu_blocks = min(b[0] for b in num_blocks)
-        num_cpu_blocks = min(b[1] for b in num_blocks)
-
-        return num_gpu_blocks, num_cpu_blocks
-
-    def initialize_cache(self, num_gpu_blocks: int,
-                         num_cpu_blocks: int) -> None:
-        """Initialize the KV cache in all workers.
-        """
-
-        # NOTE: We log here to avoid multiple logs when number of workers is
-        # greater than one. We could log in the engine, but not all executors
-        # have GPUs.
-        logger.info("# GPU blocks: %d, # CPU blocks: %d", num_gpu_blocks,
-                    num_cpu_blocks)
-        max_concurrency = (num_gpu_blocks * self.cache_config.block_size /
-                           self.model_config.max_model_len)
-        logger.info("Maximum concurrency for %s tokens per request: %.2fx",
-                    self.model_config.max_model_len, max_concurrency)
-
-        self.cache_config.num_gpu_blocks = num_gpu_blocks
-        self.cache_config.num_cpu_blocks = num_cpu_blocks
-
-        self._run_workers("initialize_cache",
-                          num_gpu_blocks=num_gpu_blocks,
-                          num_cpu_blocks=num_cpu_blocks)
-
-    def execute_model(
-        self,
-        execute_model_req: ExecuteModelRequest,
-    ) -> List[SamplerOutput]:
-        if self.parallel_worker_tasks is None:
-            self.parallel_worker_tasks = self._run_workers(
-                "start_worker_execution_loop",
-                async_run_tensor_parallel_workers_only=True,
-                **self.extra_execute_model_run_workers_kwargs)
-
-        # Only the driver worker returns the sampling results.
-        driver_outputs = self._driver_execute_model(execute_model_req)
-        assert driver_outputs is not None
-        return driver_outputs
-
-    def stop_remote_worker_execution_loop(self) -> None:
-        if self.parallel_worker_tasks is None:
-            return
-
-        self._driver_execute_model(execute_model_req=None)
-        parallel_worker_tasks = self.parallel_worker_tasks
-        self.parallel_worker_tasks = None
-        # Ensure that workers exit model loop cleanly
-        # (this will raise otherwise)
-        self._wait_for_tasks_completion(parallel_worker_tasks)
-
-    def add_lora(self, lora_request: LoRARequest) -> bool:
-        assert lora_request.lora_int_id > 0, "lora_id must be greater than 0."
-        return self._run_workers(
-            "add_lora",
-            lora_request=lora_request,
-        )
-
-    def remove_lora(self, lora_id: int) -> bool:
-        assert lora_id > 0, "lora_id must be greater than 0."
-        return self._run_workers(
-            "remove_lora",
-            lora_id=lora_id,
-        )
-
-    def pin_lora(self, lora_id: int) -> bool:
-        assert lora_id > 0, "lora_id must be greater than 0."
-        return self._run_workers(
-            "pin_lora",
-            lora_id=lora_id,
-        )
-
-    def list_loras(self) -> Set[int]:
-        return self._run_workers("list_loras")
-
-    def save_sharded_state(
-        self,
-        path: str,
-        pattern: Optional[str] = None,
-        max_size: Optional[int] = None,
-    ) -> None:
-        self._run_workers("save_sharded_state",
-                          path=path,
-                          pattern=pattern,
-                          max_size=max_size)
-
-    @abstractmethod
-    def _driver_execute_model(
-        self, execute_model_req: Optional[ExecuteModelRequest]
-    ) -> Optional[List[SamplerOutput]]:
-        """Run execute_model in the driver worker.
-
-        Passing None will cause the driver to stop the model execution loop
-        running in each of the remote workers. In this case, this method
-        returns None. Otherwise, this method returns the model output.
-        """
-        raise NotImplementedError
-
-    @abstractmethod
-    def _run_workers(
-        self,
-        method: str,
-        *args,
-        async_run_tensor_parallel_workers_only: bool = False,
-        max_concurrent_workers: Optional[int] = None,
-        **kwargs,
-    ) -> Any:
-        """Runs the given method on all workers.
-
-        Args:
-            async_run_tensor_parallel_workers_only: If True the method will be
-                run only in the remote TP workers, not the driver worker.
-                It will also be run asynchronously and return a list of futures
-                rather than blocking on the results.
-        """
-        raise NotImplementedError
-
-    @abstractmethod
-    def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None:
-        """Wait for futures returned from _run_workers() with
-        async_run_remote_workers_only to complete."""
-        raise NotImplementedError
-
-
-class DistributedGPUExecutorAsync(DistributedGPUExecutor, ExecutorAsyncBase):
-
-    async def execute_model_async(
-            self,
-            execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
-        if self.parallel_worker_tasks is None:
-            # Start model execution loop running in the parallel workers
-            self.parallel_worker_tasks = asyncio.create_task(
-                self._start_worker_execution_loop())
-
-        # Only the driver worker returns the sampling results.
-        return await self._driver_execute_model_async(execute_model_req)
-
-    async def stop_remote_worker_execution_loop_async(self) -> None:
-        if self.parallel_worker_tasks is None:
-            return
-
-        await self._driver_execute_model_async()
-        parallel_worker_tasks = self.parallel_worker_tasks
-        self.parallel_worker_tasks = None
-        # Ensure that workers exit model loop cleanly
-        # (this will raise otherwise)
-        await parallel_worker_tasks
-
-    @abstractmethod
-    async def _driver_execute_model_async(
-        self,
-        execute_model_req: Optional[ExecuteModelRequest] = None,
-    ) -> List[SamplerOutput]:
-        """Execute the model asynchronously in the driver worker.
-
-        Passing None will cause the driver to stop the model execution
-        loop running in each of the remote workers.
-        """
-        raise NotImplementedError
-
-    @abstractmethod
-    async def _start_worker_execution_loop(self):
-        """Run execution loop on all workers. It guarantees all workers run
-        the loop or None of them is running the loop. Loop can be stopped by
-        `stop_remote_worker_execution_loop`.
-        The API is idempotent (guarantee only 1 loop run at any moment)."""
-        raise NotImplementedError
diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py
index 9cba189dd57f9..00ecadcf92667 100644
--- a/vllm/executor/executor_base.py
+++ b/vllm/executor/executor_base.py
@@ -1,18 +1,24 @@
+import asyncio
 from abc import ABC, abstractmethod
-from typing import List, Optional, Set, Tuple
+from typing import Any, Awaitable, Dict, List, Optional, Set, Tuple, Union
 
 from vllm.config import VllmConfig
+from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.model_executor.layers.sampler import SamplerOutput
+from vllm.platforms import current_platform
 from vllm.prompt_adapter.request import PromptAdapterRequest
-from vllm.sequence import ExecuteModelRequest
+from vllm.sequence import ExecuteModelRequest, PoolerOutput
+from vllm.utils import make_async
+
+logger = init_logger(__name__)
 
 
 class ExecutorBase(ABC):
     """Base class for all executors.
 
-    An executor is responsible for executing the model on a specific device
-    type (e.g., CPU, GPU, Neuron, etc.). Or it can be a distributed executor
+    An executor is responsible for executing the model on one device,
+    or it can be a distributed executor 
     that can execute the model on multiple devices.
     """
 
@@ -40,6 +46,20 @@ def _init_executor(self) -> None:
         pass
 
     @abstractmethod
+    def collective_rpc(self,
+                       method: str,
+                       timeout: Optional[float] = None,
+                       args: Tuple = (),
+                       kwargs: Optional[Dict] = None) -> List[Any]:
+        """
+        The main interface of the executor to run a method on all workers,
+        with homogeneous arguments.
+        If the args are heterogeneous, then we can pack them into a list,
+        and unpack them in the method of every worker, because every worker
+        knows their own rank.
+        """
+        pass
+
     def determine_num_available_blocks(self) -> Tuple[int, int]:
         """Determine the number of available blocks for the GPU KV cache and
         swappable CPU KV cache.
@@ -53,58 +73,113 @@ def determine_num_available_blocks(self) -> Tuple[int, int]:
         num_cpu_blocks refers to "swapped" blocks in CPU memory and cannot be
         appended to.
         """
-        raise NotImplementedError
+        results = self.collective_rpc("determine_num_available_blocks")
+        a = min([r[0] for r in results])
+        b = min([r[1] for r in results])
+        return a, b
 
-    @abstractmethod
-    def initialize_cache(self, num_gpu_blocks: int,
-                         num_cpu_blocks: int) -> None:
-        """Initialize the KV cache with the given size in blocks.
+    def initialize(self, num_gpu_blocks: int) -> None:
         """
-        raise NotImplementedError
+        Initialize the KV caches and begin the model execution loop of the
+        underlying workers.
+        For V1 compatibility.
+        """
+        logger.info("# GPU blocks: %d", num_gpu_blocks)
+        self.collective_rpc("initialize_cache", args=(num_gpu_blocks, ))
+        self.collective_rpc("compile_or_warm_up_model")
+
+    def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None:
+        """Initialize the KV cache by invoking the underlying worker.
+        """
+        # NOTE: This is logged in the executor because there can be >1 workers.
+        logger.info("# %s blocks: %d, # CPU blocks: %d",
+                    current_platform.dispatch_key, num_gpu_blocks,
+                    num_cpu_blocks)
+        max_concurrency = (num_gpu_blocks * self.cache_config.block_size /
+                           self.model_config.max_model_len)
+        logger.info("Maximum concurrency for %s tokens per request: %.2fx",
+                    self.model_config.max_model_len, max_concurrency)
+
+        self.cache_config.num_gpu_blocks = num_gpu_blocks
+        self.cache_config.num_cpu_blocks = num_cpu_blocks
+
+        self.collective_rpc("initialize_cache",
+                            args=(num_gpu_blocks, num_cpu_blocks))
 
-    @abstractmethod
     def execute_model(
         self, execute_model_req: ExecuteModelRequest
-    ) -> Optional[List[SamplerOutput]]:
-        """Executes at least one model step on the given sequences."""
-        raise NotImplementedError
+    ) -> Optional[List[Union[SamplerOutput, PoolerOutput]]]:
+        output = self.collective_rpc("execute_model",
+                                     args=(execute_model_req, ))
+        return output[0]
 
     def stop_remote_worker_execution_loop(self) -> None:
         """Releases parallel workers from model loop."""
         return
 
-    @abstractmethod
     def add_lora(self, lora_request: LoRARequest) -> bool:
-        raise NotImplementedError
+        assert lora_request.lora_int_id > 0, "lora_id must be greater than 0."
+        return all(self.collective_rpc("add_lora", args=(lora_request, )))
 
-    @abstractmethod
     def remove_lora(self, lora_id: int) -> bool:
-        raise NotImplementedError
+        assert lora_id > 0, "lora_id must be greater than 0."
+        return all(self.collective_rpc("remove_lora", args=(lora_id, )))
 
-    @abstractmethod
     def pin_lora(self, lora_id: int) -> bool:
-        raise NotImplementedError  # type: ignore
+        assert lora_id > 0, "lora_id must be greater than 0."
+        return all(self.collective_rpc("pin_lora", args=(lora_id, )))
 
-    @abstractmethod
     def list_loras(self) -> Set[int]:
-        raise NotImplementedError
+        sets = self.collective_rpc("list_loras")
+        for s in sets:
+            assert s == sets[0], "All workers should have the same LORAs."
+        return sets[0]
 
-    @abstractmethod
     def add_prompt_adapter(
             self, prompt_adapter_request: PromptAdapterRequest) -> bool:
-        raise NotImplementedError
+        assert prompt_adapter_request.prompt_adapter_id > 0, \
+            "prompt_adapter_id must be greater than 0."
+        return all(
+            self.collective_rpc("add_prompt_adapter",
+                                args=(prompt_adapter_request, )))
 
-    @abstractmethod
     def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
-        raise NotImplementedError
+        assert prompt_adapter_id > 0, \
+            "prompt_adapter_id must be greater than 0."
+        return all(
+            self.collective_rpc("remove_prompt_adapter",
+                                args=(prompt_adapter_id, )))
 
-    @abstractmethod
     def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool:
-        raise NotImplementedError  # type: ignore
+        assert prompt_adapter_id > 0, \
+            "prompt_adapter_id must be greater than 0."
+        return all(
+            self.collective_rpc("pin_prompt_adapter",
+                                args=(prompt_adapter_id, )))
 
-    @abstractmethod
     def list_prompt_adapters(self) -> Set[int]:
-        raise NotImplementedError
+        sets = self.collective_rpc("list_prompt_adapters")
+        for s in sets:
+            assert (s == sets[0]
+                    ), "All workers should have the same prompt adapters."
+        return sets[0]
+
+    def start_profile(self) -> None:
+        self.collective_rpc("start_profile")
+
+    def stop_profile(self) -> None:
+        self.collective_rpc("stop_profile")
+
+    def save_sharded_state(
+        self,
+        path: str,
+        pattern: Optional[str] = None,
+        max_size: Optional[int] = None,
+    ) -> None:
+        self.collective_rpc("save_sharded_state",
+                            kwargs=dict(path=path,
+                                        pattern=pattern,
+                                        max_size=max_size))
 
     @abstractmethod
     def check_health(self) -> None:
@@ -119,15 +194,12 @@ def shutdown(self) -> None:
     def __del__(self):
         self.shutdown()
 
-
-class ExecutorAsyncBase(ExecutorBase):
-
-    @abstractmethod
     async def execute_model_async(
             self,
             execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
         """Executes one model step on the given sequences."""
-        raise NotImplementedError
+        output = await make_async(self.execute_model)(execute_model_req)
+        return output
 
     async def stop_remote_worker_execution_loop_async(self) -> None:
         """Releases parallel workers from model loop."""
@@ -137,3 +209,128 @@ async def check_health_async(self) -> None:
         """Checks if the executor is healthy. If not, it should raise an
         exception."""
         self.check_health()
+
+
+class DistributedExecutorBase(ExecutorBase):
+    """Abstract superclass of distributed executor implementations."""
+
+    def __init__(self, *args, **kwargs):
+        # This is non-None when the execute model loop is running
+        # in the parallel workers. It's a coroutine in the AsyncLLMEngine case.
+        self.parallel_worker_tasks: Optional[Union[Any, Awaitable[Any]]] = None
+
+        super().__init__(*args, **kwargs)
+
+    def execute_model(
+        self,
+        execute_model_req: ExecuteModelRequest,
+    ) -> List[SamplerOutput]:
+        # TODO: unify into collective_rpc
+        if self.parallel_worker_tasks is None:
+            self.parallel_worker_tasks = self._run_workers(
+                "start_worker_execution_loop",
+                async_run_tensor_parallel_workers_only=True)
+
+        # Only the driver worker returns the sampling results.
+        driver_outputs = self._driver_execute_model(execute_model_req)
+        assert driver_outputs is not None
+        return driver_outputs
+
+    def stop_remote_worker_execution_loop(self) -> None:
+        if self.parallel_worker_tasks is None:
+            return
+
+        self._driver_execute_model(execute_model_req=None)
+        parallel_worker_tasks = self.parallel_worker_tasks
+        self.parallel_worker_tasks = None
+        # Ensure that workers exit model loop cleanly
+        # (this will raise otherwise)
+        self._wait_for_tasks_completion(parallel_worker_tasks)
+
+    @abstractmethod
+    def _driver_execute_model(
+        self, execute_model_req: Optional[ExecuteModelRequest]
+    ) -> Optional[List[SamplerOutput]]:
+        """Run execute_model in the driver worker.
+
+        Passing None will cause the driver to stop the model execution loop
+        running in each of the remote workers. In this case, this method
+        returns None. Otherwise, this method returns the model output.
+        """
+        raise NotImplementedError
+
+    def collective_rpc(self,
+                       method: str,
+                       timeout: Optional[float] = None,
+                       args: Tuple = (),
+                       kwargs: Optional[Dict] = None) -> List[Any]:
+        return self._run_workers(method, *args, **(kwargs or {}))
+
+    @abstractmethod
+    def _run_workers(
+        self,
+        method: str,
+        *args,
+        async_run_tensor_parallel_workers_only: bool = False,
+        max_concurrent_workers: Optional[int] = None,
+        **kwargs,
+    ) -> Any:
+        """Runs the given method on all workers.
+
+        Args:
+            async_run_tensor_parallel_workers_only: If True the method will be
+                run only in the remote TP workers, not the driver worker.
+                It will also be run asynchronously and return a list of futures
+                rather than blocking on the results.
+        
+        # TODO: simplify and merge with collective_rpc
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None:
+        """Wait for futures returned from _run_workers() with
+        async_run_remote_workers_only to complete."""
+        raise NotImplementedError
+
+    async def execute_model_async(
+            self,
+            execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
+        if self.parallel_worker_tasks is None:
+            # Start model execution loop running in the parallel workers
+            self.parallel_worker_tasks = asyncio.create_task(
+                self._start_worker_execution_loop())
+
+        # Only the driver worker returns the sampling results.
+        return await self._driver_execute_model_async(execute_model_req)
+
+    async def stop_remote_worker_execution_loop_async(self) -> None:
+        if self.parallel_worker_tasks is None:
+            return
+
+        await self._driver_execute_model_async()
+        parallel_worker_tasks = self.parallel_worker_tasks
+        self.parallel_worker_tasks = None
+        # Ensure that workers exit model loop cleanly
+        # (this will raise otherwise)
+        await parallel_worker_tasks
+
+    @abstractmethod
+    async def _driver_execute_model_async(
+        self,
+        execute_model_req: Optional[ExecuteModelRequest] = None,
+    ) -> List[SamplerOutput]:
+        """Execute the model asynchronously in the driver worker.
+
+        Passing None will cause the driver to stop the model execution
+        loop running in each of the remote workers.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    async def _start_worker_execution_loop(self):
+        """Run execution loop on all workers. It guarantees all workers run
+        the loop or None of them is running the loop. Loop can be stopped by
+        `stop_remote_worker_execution_loop`.
+        The API is idempotent (guarantee only 1 loop run at any moment)."""
+        raise NotImplementedError
diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py
deleted file mode 100644
index 7fa34456028dd..0000000000000
--- a/vllm/executor/gpu_executor.py
+++ /dev/null
@@ -1,145 +0,0 @@
-from typing import Any, Dict, List, Optional, Set, Tuple, Union
-
-from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
-from vllm.logger import init_logger
-from vllm.lora.request import LoRARequest
-from vllm.model_executor.layers.sampler import SamplerOutput
-from vllm.prompt_adapter.request import PromptAdapterRequest
-from vllm.sequence import ExecuteModelRequest, PoolerOutput
-from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
-                        make_async)
-from vllm.worker.worker_base import WorkerWrapperBase
-
-logger = init_logger(__name__)
-
-
-def create_worker(**kwargs):
-    vllm_config = kwargs.get("vllm_config")
-    wrapper = WorkerWrapperBase(vllm_config=vllm_config)
-    wrapper.init_worker(**kwargs)
-    return wrapper.worker
-
-
-class GPUExecutor(ExecutorBase):
-
-    uses_ray: bool = False
-
-    def _init_executor(self) -> None:
-        """Initialize the worker and load the model.
-        """
-        assert self.parallel_config.world_size == 1, (
-            "GPUExecutor only supports single GPU.")
-
-        self.driver_worker = self._create_worker()
-        self.driver_worker.init_device()
-        self.driver_worker.load_model()
-
-    def _get_worker_kwargs(
-            self,
-            local_rank: int = 0,
-            rank: int = 0,
-            distributed_init_method: Optional[str] = None) -> Dict[str, Any]:
-        """Return worker init args for a given rank."""
-        if distributed_init_method is None:
-            distributed_init_method = get_distributed_init_method(
-                get_ip(), get_open_port())
-        return dict(
-            vllm_config=self.vllm_config,
-            local_rank=local_rank,
-            rank=rank,
-            distributed_init_method=distributed_init_method,
-            is_driver_worker=(not self.parallel_config)
-            or (rank % self.parallel_config.tensor_parallel_size == 0),
-        )
-
-    def _create_worker(self,
-                       local_rank: int = 0,
-                       rank: int = 0,
-                       distributed_init_method: Optional[str] = None):
-        return create_worker(**self._get_worker_kwargs(
-            local_rank=local_rank,
-            rank=rank,
-            distributed_init_method=distributed_init_method))
-
-    def determine_num_available_blocks(self) -> Tuple[int, int]:
-        """Determine the number of available KV blocks by invoking the
-        underlying worker.
-        """
-        return self.driver_worker.determine_num_available_blocks()
-
-    def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None:
-        """Initialize the KV cache by invoking the underlying worker.
-        """
-        # NOTE: This is logged in the executor because there can be >1 worker
-        # with other executors. We could log in the engine level, but work
-        # remains to abstract away the device for non-GPU configurations.
-        logger.info("# GPU blocks: %d, # CPU blocks: %d", num_gpu_blocks,
-                    num_cpu_blocks)
-        max_concurrency = (num_gpu_blocks * self.cache_config.block_size /
-                           self.model_config.max_model_len)
-        logger.info("Maximum concurrency for %s tokens per request: %.2fx",
-                    self.model_config.max_model_len, max_concurrency)
-
-        self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks)
-
-    def execute_model(
-        self, execute_model_req: ExecuteModelRequest
-    ) -> Optional[List[Union[SamplerOutput, PoolerOutput]]]:
-        output = self.driver_worker.execute_model(execute_model_req)
-        return output
-
-    def add_lora(self, lora_request: LoRARequest) -> bool:
-        assert lora_request.lora_int_id > 0, "lora_id must be greater than 0."
-        return self.driver_worker.add_lora(lora_request)
-
-    def remove_lora(self, lora_id: int) -> bool:
-        assert lora_id > 0, "lora_id must be greater than 0."
-        return self.driver_worker.remove_lora(lora_id)
-
-    def pin_lora(self, lora_id: int) -> bool:
-        assert lora_id > 0, "lora_id must be greater than 0."
-        return self.driver_worker.pin_lora(lora_id)
-
-    def list_loras(self) -> Set[int]:
-        return self.driver_worker.list_loras()
-
-    def add_prompt_adapter(
-            self, prompt_adapter_request: PromptAdapterRequest) -> bool:
-        assert prompt_adapter_request.prompt_adapter_id > 0, \
-            "prompt_adapter_id must be greater than 0."
-        return self.driver_worker.add_prompt_adapter(prompt_adapter_request)
-
-    def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
-        assert prompt_adapter_id > 0, \
-            "prompt_adapter_id must be greater than 0."
-        return self.driver_worker.remove_prompt_adapter(prompt_adapter_id)
-
-    def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool:
-        assert prompt_adapter_id > 0, \
-                "prompt_adapter_id must be greater than 0."
-        return self.driver_worker.pin_prompt_adapter(prompt_adapter_id)
-
-    def list_prompt_adapters(self) -> Set[int]:
-        return self.driver_worker.list_prompt_adapters()
-
-    def check_health(self) -> None:
-        # GPUExecutor will always be healthy as long as
-        # it's running.
-        return
-
-    def start_profile(self) -> None:
-        self.driver_worker.start_profile()
-
-    def stop_profile(self) -> None:
-        self.driver_worker.stop_profile()
-
-
-class GPUExecutorAsync(GPUExecutor, ExecutorAsyncBase):
-
-    async def execute_model_async(
-        self,
-        execute_model_req: ExecuteModelRequest,
-    ) -> List[Union[SamplerOutput, PoolerOutput]]:
-        output = await make_async(self.driver_worker.execute_model
-                                  )(execute_model_req=execute_model_req)
-        return output
diff --git a/vllm/executor/hpu_executor.py b/vllm/executor/hpu_executor.py
deleted file mode 100644
index c9b7bfa71edfa..0000000000000
--- a/vllm/executor/hpu_executor.py
+++ /dev/null
@@ -1,202 +0,0 @@
-###############################################################################
-# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company
-###############################################################################
-
-import contextlib
-import os
-from typing import Any, Dict, List, Optional, Set, Tuple
-
-from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
-from vllm.logger import init_logger
-from vllm.lora.request import LoRARequest
-from vllm.model_executor.layers.sampler import SamplerOutput
-from vllm.prompt_adapter.request import PromptAdapterRequest
-from vllm.sequence import ExecuteModelRequest
-from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
-                        make_async)
-from vllm.worker.worker_base import WorkerWrapperBase
-
-logger = init_logger(__name__)
-
-
-class HPUExecutor(ExecutorBase):
-
-    uses_ray: bool = False
-
-    def _init_executor(self) -> None:
-        """Initialize the worker and load the model."""
-        self._init_worker()
-
-    def _get_worker_kwargs(
-            self,
-            local_rank: int = 0,
-            rank: int = 0,
-            distributed_init_method: Optional[str] = None) -> Dict[str, Any]:
-        """Return worker init args for a given rank."""
-        if distributed_init_method is None:
-            distributed_init_method = get_distributed_init_method(
-                get_ip(), get_open_port())
-        return dict(
-            vllm_config=self.vllm_config,
-            local_rank=local_rank,
-            rank=rank,
-            distributed_init_method=distributed_init_method,
-            is_driver_worker=rank == 0,
-        )
-
-    def _create_worker(self,
-                       local_rank: int = 0,
-                       rank: int = 0,
-                       distributed_init_method: Optional[str] = None):
-        wrapper = WorkerWrapperBase(vllm_config=self.vllm_config)
-        wrapper.init_worker(**self._get_worker_kwargs(local_rank, rank,
-                                                      distributed_init_method))
-        return wrapper.worker
-
-    def _init_worker(self):
-        assert self.parallel_config.world_size == 1, (
-            "GPUExecutor only supports single GPU.")
-
-        self.driver_worker = self._create_worker()
-        self.driver_worker.init_device()
-        self.driver_worker.load_model()
-
-    def determine_num_available_blocks(self) -> Tuple[int, int]:
-        """Determine the number of available KV blocks by invoking the
-        underlying worker.
-        """
-        return self.driver_worker.determine_num_available_blocks()
-
-    def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None:
-        """Initialize the KV cache by invoking the underlying worker.
-        """
-        # NOTE: This is logged in the executor because there can be >1 worker
-        # with other executors. We could log in the engine level, but work
-        # remains to abstract away the device for non-GPU configurations.
-        logger.info("# HPU blocks: %d, # CPU blocks: %d", num_gpu_blocks,
-                    num_cpu_blocks)
-        from vllm_hpu_extension.profiler import HabanaMemoryProfiler
-        with HabanaMemoryProfiler() as cache_init_m:
-            self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks)
-        msg = f"init_cache_engine took {cache_init_m.get_summary_string()}"
-        logger.info(msg)
-
-    def finish_measurements(self):
-        self.driver_worker.finish_measurements()
-
-    def execute_model(
-            self,
-            execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
-        # VLLM_HPU_LOG_STEP_GRAPH_COMPILATION     - will log graph compilations per engine step, only when there was any - highly recommended to use alongside PT_HPU_METRICS_GC_DETAILS! # noqa:E501
-        # VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL - will log graph compilations per engine step, always, even if there were none # noqa:E501
-        # VLLM_HPU_LOG_STEP_CPU_FALLBACKS         - will log cpu fallbacks per engine step, only when there was any # noqa:E501
-        # VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL     - will log cpu fallbacks per engine step, always, even if there were none # noqa:E501
-        log_graph_compilation_all = os.environ.get(
-            'VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL', '0') != '0'
-        log_graph_compilation = os.environ.get(
-            'VLLM_HPU_LOG_STEP_GRAPH_COMPILATION',
-            '0') != '0' or log_graph_compilation_all
-        log_cpu_fallbacks_all = os.environ.get(
-            'VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL', '0') != '0'
-        log_cpu_fallbacks = os.environ.get('VLLM_HPU_LOG_STEP_CPU_FALLBACKS',
-                                           '0') != '0' or log_cpu_fallbacks_all
-        if log_graph_compilation or log_cpu_fallbacks:
-            from habana_frameworks.torch.hpu.metrics import metric_localcontext
-            seq_group_metadata_list = execute_model_req.seq_group_metadata_list
-            is_prompt = any([
-                seq_group_metadata.is_prompt
-                for seq_group_metadata in seq_group_metadata_list
-            ])
-            max_context_len = max([
-                max([
-                    len(v.prompt_token_ids) + len(v.output_token_ids)
-                    for v in seq_group_metadata.seq_data.values()
-                ]) for seq_group_metadata in seq_group_metadata_list
-            ])  # whoa, that's some spicy stuff right here
-            max_num_blocks = (
-                (max_context_len - 1) // self.cache_config.block_size) + 1
-            input_stats = (f'is_prompt: {is_prompt}, '
-                           f'num_seqs: {len(seq_group_metadata_list)}, '
-                           f'max_context_len: {max_context_len}, '
-                           f'max_num_blocks {max_num_blocks}')
-            gc_ctx = metric_localcontext(
-                "graph_compilation"
-            ) if log_graph_compilation else contextlib.nullcontext()
-            cpu_fallback_ctx = metric_localcontext(
-                "cpu_fallback"
-            ) if log_cpu_fallbacks else contextlib.nullcontext()
-            with gc_ctx as gc_local_metric, \
-                cpu_fallback_ctx as cpu_fallback_local_metric:
-                output = self.driver_worker.execute_model(execute_model_req)
-            if (log_graph_compilation and gc_local_metric.stats()[0][1] > 0
-                ) or log_graph_compilation_all:
-                msg = ("VLLM_HPU_STEP_GRAPH_COMPILATION: "
-                       f"{gc_local_metric.stats()}, {input_stats}")
-                logger.warning(msg)
-            if (log_cpu_fallbacks and cpu_fallback_local_metric.stats()[0][1] >
-                    0) or log_cpu_fallbacks_all:
-                msg = ("VLLM_HPU_STEP_CPU_FALLBACK: "
-                       f"{cpu_fallback_local_metric.stats()}, {input_stats}")
-                logger.warning(msg)
-
-            return output
-
-        output = self.driver_worker.execute_model(execute_model_req)
-        return output
-
-    def add_lora(self, lora_request: LoRARequest) -> bool:
-        assert lora_request.lora_int_id > 0, "lora_id must be greater than 0."
-        return self.driver_worker.add_lora(lora_request)
-
-    def remove_lora(self, lora_id: int) -> bool:
-        assert lora_id > 0, "lora_id must be greater than 0."
-        return self.driver_worker.remove_lora(lora_id)
-
-    def pin_lora(self, lora_id: int) -> bool:
-        assert lora_id > 0, "lora_id must be greater than 0."
-        return self.driver_worker.pin_lora(lora_id)
-
-    def list_loras(self) -> Set[int]:
-        return self.driver_worker.list_loras()
-
-    def add_prompt_adapter(
-            self, prompt_adapter_request: PromptAdapterRequest) -> bool:
-        raise NotImplementedError(
-            "Prompt Adapter is not implemented for HPU backend.")
-
-    def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
-        raise NotImplementedError(
-            "Prompt Adapter is not implemented for HPU backend.")
-
-    def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool:
-        raise NotImplementedError(
-            "Prompt Adapter is not implemented for HPU backend.")
-
-    def list_prompt_adapters(self) -> Set[int]:
-        raise NotImplementedError(
-            "Prompt Adapter is not implemented for HPU backend.")
-
-    def check_health(self) -> None:
-        # GPUExecutor will always be healthy as long as
-        # it's running.
-        return
-
-    def start_profile(self) -> None:
-        self.driver_worker.start_profile()
-
-    def stop_profile(self) -> None:
-        self.driver_worker.stop_profile()
-
-    def shutdown(self) -> None:
-        self.driver_worker.shutdown_inc()
-
-
-class HPUExecutorAsync(HPUExecutor, ExecutorAsyncBase):
-
-    async def execute_model_async(
-        self,
-        execute_model_req: ExecuteModelRequest,
-    ) -> List[SamplerOutput]:
-        output = await make_async(self.driver_worker.execute_model
-                                  )(execute_model_req=execute_model_req, )
-        return output
diff --git a/vllm/executor/multiproc_gpu_executor.py b/vllm/executor/mp_distributed_executor.py
similarity index 75%
rename from vllm/executor/multiproc_gpu_executor.py
rename to vllm/executor/mp_distributed_executor.py
index fc58163cade64..d9dde949b844a 100644
--- a/vllm/executor/multiproc_gpu_executor.py
+++ b/vllm/executor/mp_distributed_executor.py
@@ -1,32 +1,26 @@
 import asyncio
-import os
-from functools import partial
 from typing import Any, List, Optional
 
-from vllm.executor.distributed_gpu_executor import (  # yapf: disable
-    DistributedGPUExecutor, DistributedGPUExecutorAsync)
-from vllm.executor.gpu_executor import create_worker
+from vllm.executor.executor_base import DistributedExecutorBase
 from vllm.executor.multiproc_worker_utils import (
     ProcessWorkerWrapper, ResultHandler, WorkerMonitor,
     set_multiprocessing_worker_envs)
 from vllm.logger import init_logger
 from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.sequence import ExecuteModelRequest
-from vllm.utils import (_run_task_with_lock, cuda_device_count_stateless,
-                        get_distributed_init_method, get_open_port, make_async,
-                        update_environment_variables)
+from vllm.utils import (_run_task_with_lock, get_distributed_init_method,
+                        get_ip, get_open_port, make_async)
+from vllm.worker.worker_base import WorkerWrapperBase
 
 logger = init_logger(__name__)
 
 
-class MultiprocessingGPUExecutor(DistributedGPUExecutor):
-    """Python multiprocessing-based multi-GPU executor"""
+class MultiprocessingDistributedExecutor(DistributedExecutorBase):
+    """Python multiprocessing-based distributed executor"""
 
     uses_ray: bool = False
 
     def _init_executor(self) -> None:
-        self._check_executor_parameters()
-
         # Create the parallel GPU workers.
         world_size = self.parallel_config.world_size
         tensor_parallel_size = self.parallel_config.tensor_parallel_size
@@ -55,15 +49,9 @@ def _init_executor(self) -> None:
         else:
             result_handler = ResultHandler()
             for rank in range(1, world_size):
-                worker = ProcessWorkerWrapper(
-                    result_handler,
-                    partial(
-                        create_worker,
-                        **self._get_worker_kwargs(
-                            rank=rank,
-                            local_rank=rank,
-                            distributed_init_method=distributed_init_method,
-                        )))
+                worker = ProcessWorkerWrapper(result_handler,
+                                              WorkerWrapperBase,
+                                              self.vllm_config, rank)
                 self.workers.append(worker)
                 if rank % tensor_parallel_size == 0:
                     self.tp_driver_workers.append(worker)
@@ -77,32 +65,30 @@ def _init_executor(self) -> None:
         # Set up signal handlers to shutdown the executor cleanly
         # sometimes gc does not work well
 
-        self.driver_worker = self._create_worker(
-            distributed_init_method=distributed_init_method)
+        self.driver_worker = WorkerWrapperBase(self.vllm_config, 0)
+
+        all_kwargs = []
+        distributed_init_method = get_distributed_init_method(
+            get_ip(), get_open_port())
+        for i in range(world_size):
+            local_rank = i
+            rank = i
+            kwargs = dict(
+                vllm_config=self.vllm_config,
+                local_rank=local_rank,
+                rank=rank,
+                distributed_init_method=distributed_init_method,
+                is_driver_worker=(not self.parallel_config)
+                or (rank % self.parallel_config.tensor_parallel_size == 0),
+            )
+            all_kwargs.append(kwargs)
+        self._run_workers("init_worker", all_kwargs)
         self._run_workers("init_device")
         self._run_workers("load_model",
                           max_concurrent_workers=self.parallel_config.
                           max_parallel_loading_workers)
-
-    def _check_executor_parameters(self):
-        world_size = self.parallel_config.world_size
-        tensor_parallel_size = self.parallel_config.tensor_parallel_size
-
-        # Set CUDA_VISIBLE_DEVICES for the driver, inherited by workers
-        if "CUDA_VISIBLE_DEVICES" not in os.environ:
-            update_environment_variables({
-                "CUDA_VISIBLE_DEVICES": (",".join(map(str, range(world_size))))
-            })
-
-        cuda_device_count = cuda_device_count_stateless()
-        # Use confusing message for more common TP-only case.
-        assert tensor_parallel_size <= cuda_device_count, (
-            f"please set tensor_parallel_size ({tensor_parallel_size}) "
-            f"to less than max local gpu count ({cuda_device_count})")
-
-        assert world_size <= cuda_device_count, (
-            f"please ensure that world_size ({world_size}) "
-            f"is less than than max local gpu count ({cuda_device_count})")
+        self.driver_exec_model = make_async(self.driver_worker.execute_model)
+        self.pp_locks: Optional[List[asyncio.Lock]] = None
 
     def shutdown(self):
         if (worker_monitor := getattr(self, "worker_monitor",
@@ -172,15 +158,6 @@ def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None:
         for result in parallel_worker_tasks:
             result.get()
 
-
-class MultiprocessingGPUExecutorAsync(MultiprocessingGPUExecutor,
-                                      DistributedGPUExecutorAsync):
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.driver_exec_model = make_async(self.driver_worker.execute_model)
-        self.pp_locks: Optional[List[asyncio.Lock]] = None
-
     async def _driver_execute_model_async(
         self,
         execute_model_req: Optional[ExecuteModelRequest] = None
diff --git a/vllm/executor/multiproc_worker_utils.py b/vllm/executor/multiproc_worker_utils.py
index bc32826529eef..c9fb3c664c575 100644
--- a/vllm/executor/multiproc_worker_utils.py
+++ b/vllm/executor/multiproc_worker_utils.py
@@ -12,6 +12,7 @@
 
 import torch
 
+from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.triton_utils.importing import HAS_TRITON
 from vllm.utils import _check_multiproc_method, get_mp_context
@@ -147,7 +148,8 @@ class ProcessWorkerWrapper:
     for handling single-node multi-GPU tensor parallel."""
 
     def __init__(self, result_handler: ResultHandler,
-                 worker_factory: Callable[[], Any]) -> None:
+                 worker_factory: Callable[[VllmConfig, int], Any],
+                 vllm_config: VllmConfig, rank: int) -> None:
         self.mp = get_mp_context()
         self._task_queue = self.mp.Queue()
         self.result_queue = result_handler.result_queue
@@ -159,6 +161,8 @@ def __init__(self, result_handler: ResultHandler,
                 worker_factory=worker_factory,
                 task_queue=self._task_queue,
                 result_queue=self.result_queue,
+                vllm_config=vllm_config,
+                rank=rank,
             ),
             daemon=True)
 
@@ -199,9 +203,11 @@ def kill_worker(self):
 
 
 def _run_worker_process(
-    worker_factory: Callable[[], Any],
+    worker_factory: Callable[[VllmConfig, int], Any],
     task_queue: Queue,
     result_queue: Queue,
+    vllm_config: VllmConfig,
+    rank: int,
 ) -> None:
     """Worker process event loop"""
 
@@ -212,7 +218,7 @@ def _run_worker_process(
     _add_prefix(sys.stderr, process_name, pid)
 
     # Initialize worker
-    worker = worker_factory()
+    worker = worker_factory(vllm_config, rank)
     del worker_factory
 
     # Accept tasks from the engine in task_queue
diff --git a/vllm/executor/multiproc_xpu_executor.py b/vllm/executor/multiproc_xpu_executor.py
deleted file mode 100644
index a66afbf939ef0..0000000000000
--- a/vllm/executor/multiproc_xpu_executor.py
+++ /dev/null
@@ -1,26 +0,0 @@
-import vllm.envs as envs
-from vllm.executor.multiproc_gpu_executor import (
-    MultiprocessingGPUExecutor, MultiprocessingGPUExecutorAsync)
-from vllm.executor.xpu_executor import XPUExecutor
-from vllm.logger import init_logger
-from vllm.utils import make_async
-
-logger = init_logger(__name__)
-
-
-class MultiprocessingXPUExecutor(MultiprocessingGPUExecutor, XPUExecutor):
-    """Python multiprocessing-based multi-XPU executor"""
-
-    def _check_executor_parameters(self):
-        mp_method = envs.VLLM_WORKER_MULTIPROC_METHOD
-        if mp_method != "spawn":
-            raise RuntimeError(
-                "XPU multiprocess executor only support spawn as mp method")
-
-
-class MultiprocessingXPUExecutorAsync(MultiprocessingXPUExecutor,
-                                      MultiprocessingGPUExecutorAsync):
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.driver_exec_model = make_async(self.driver_worker.execute_model)
diff --git a/vllm/executor/neuron_executor.py b/vllm/executor/neuron_executor.py
deleted file mode 100644
index a9efc4f9a801c..0000000000000
--- a/vllm/executor/neuron_executor.py
+++ /dev/null
@@ -1,114 +0,0 @@
-from typing import List, Set, Tuple
-
-from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
-from vllm.logger import init_logger
-from vllm.lora.request import LoRARequest
-from vllm.model_executor.layers.sampler import SamplerOutput
-from vllm.sequence import ExecuteModelRequest
-from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
-                        make_async)
-from vllm.worker.worker_base import WorkerWrapperBase
-
-logger = init_logger(__name__)
-
-
-class NeuronExecutor(ExecutorBase):
-
-    uses_ray: bool = False
-
-    def _init_executor(self) -> None:
-        assert (self.lora_config is
-                None), "LoRA is not supported for Neuron backend."
-        assert (not self.speculative_config
-                ), "Speculative decoding not yet supported for Neuron backend."
-
-        # Instantiate the worker and load the model to the device.
-        self._init_worker()
-
-    def _init_worker(self):
-        wrapper = WorkerWrapperBase(vllm_config=self.vllm_config)
-        distributed_init_method = get_distributed_init_method(
-            get_ip(), get_open_port())
-        wrapper.init_worker(
-            vllm_config=self.vllm_config,
-            local_rank=0,
-            rank=0,
-            distributed_init_method=distributed_init_method,
-        )
-        self.driver_worker = wrapper.worker
-        self.driver_worker.init_device()
-        self.driver_worker.load_model()
-
-    def determine_num_available_blocks(self) -> Tuple[int, int]:
-        """Determine the number of available KV blocks by invoking the
-        underlying worker.
-        """
-        return self.driver_worker.determine_num_available_blocks()
-
-    def initialize_cache(self, num_gpu_blocks: int,
-                         num_cpu_blocks: int) -> None:
-        """Initialize the KV cache by invoking the underlying worker.
-        """
-        self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks)
-
-    def execute_model(
-            self,
-            execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
-        assert (not execute_model_req.blocks_to_swap_in
-                and not execute_model_req.blocks_to_swap_out
-                and not execute_model_req.blocks_to_copy), (
-                    "Cache operations are not supported for Neuron backend.")
-        assert execute_model_req.num_lookahead_slots == 0, (
-            "lookahead not supported for Neuron backend.")
-
-        output = self.driver_worker.execute_model(execute_model_req)
-        return output
-
-    def add_lora(self, lora_request: LoRARequest) -> bool:
-        return self.driver_worker.add_lora(lora_request)
-
-    def remove_lora(self, lora_id: int) -> bool:
-        return self.driver_worker.remove_lora(lora_id)
-
-    def pin_lora(self, lora_id: int) -> bool:
-        return self.driver_worker.pin_lora(lora_id)
-
-    def list_loras(self) -> Set[int]:
-        return self.driver_worker.list_loras()
-
-    def add_prompt_adapter(self, prompt_adapter_request) -> bool:
-        raise NotImplementedError(
-            "Soft prompt is currently not supported by the Neuron backend.")
-
-    def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
-        raise NotImplementedError(
-            "Soft prompt is currently not supported by the Neuron backend.")
-
-    def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool:
-        raise NotImplementedError(
-            "Soft prompt is currently not supported by the Neuron backend.")
-
-    def list_prompt_adapters(self) -> Set[int]:
-        raise NotImplementedError(
-            "Soft prompt is currently not supported by the Neuron backend.")
-
-    def check_health(self) -> None:
-        # NeuronExecutor will always be healthy as long as
-        # it's running.
-        return
-
-
-class NeuronExecutorAsync(NeuronExecutor, ExecutorAsyncBase):
-
-    async def execute_model_async(
-        self,
-        execute_model_req: ExecuteModelRequest,
-    ) -> List[SamplerOutput]:
-        output = await make_async(self.driver_worker.execute_model
-                                  )(execute_model_req=execute_model_req, )
-        return output
-
-    async def check_health_async(self) -> None:
-        # NeuronExecutor will always be healthy as long as
-        # it's running.
-        return
diff --git a/vllm/executor/openvino_executor.py b/vllm/executor/openvino_executor.py
deleted file mode 100644
index 057a32364e512..0000000000000
--- a/vllm/executor/openvino_executor.py
+++ /dev/null
@@ -1,125 +0,0 @@
-from typing import List, Set, Tuple
-
-import openvino as ov
-
-import vllm.envs as envs
-from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
-from vllm.logger import init_logger
-from vllm.lora.request import LoRARequest
-from vllm.model_executor.layers.sampler import SamplerOutput
-from vllm.platforms import current_platform
-from vllm.sequence import ExecuteModelRequest
-from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
-                        make_async)
-from vllm.worker.worker_base import WorkerWrapperBase
-
-logger = init_logger(__name__)
-
-
-class OpenVINOExecutor(ExecutorBase):
-
-    uses_ray: bool = False
-
-    def _init_executor(self) -> None:
-        assert self.device_config.device_type == "openvino"
-        assert self.lora_config is None, "OpenVINO backend doesn't support LoRA"
-        assert current_platform.is_openvino_cpu() or \
-            current_platform.is_openvino_gpu(), \
-            "OpenVINO backend supports only CPU and GPU devices"
-
-        # Instantiate the worker and load the model to CPU.
-        self._init_worker()
-
-    def _init_worker(self):
-
-        wrapper = WorkerWrapperBase(vllm_config=self.vllm_config)
-
-        distributed_init_method = get_distributed_init_method(
-            get_ip(), get_open_port())
-        wrapper.init_worker(
-            ov_core=ov.Core(),
-            vllm_config=self.vllm_config,
-            local_rank=0,
-            rank=0,
-            distributed_init_method=distributed_init_method,
-            kv_cache_dtype=self.cache_config.cache_dtype,
-            is_driver_worker=True,
-        )
-        self.driver_worker = wrapper.worker
-        self.driver_worker.init_device()
-        self.driver_worker.load_model()
-
-    def determine_num_available_blocks(self) -> Tuple[int, int]:
-        """Determine the number of available KV blocks by invoking the
-        underlying worker.
-        """
-        return self.driver_worker.determine_num_available_blocks()
-
-    def initialize_cache(self, num_gpu_blocks: int,
-                         num_cpu_blocks: int) -> None:
-        """Initialize the KV cache by invoking the underlying worker."""
-        # NOTE: We log here to avoid multiple logs when number of workers is
-        # greater than one. We could log in the engine, but not all executors
-        # have GPUs.
-        # NOTE: In case of a CPU device, `cpu block` for OpenVINO backend
-        # is located on CPU memory but is referred as `gpu block`.
-        # Because we want to reuse the existing block management procedure.
-        device_blocks = num_gpu_blocks
-        swap_blocks = num_cpu_blocks
-        logger.info("OpenVINO %s: # device blocks: %d; # swap blocks: %d",
-                    envs.VLLM_OPENVINO_DEVICE, device_blocks, swap_blocks)
-        self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks)
-
-    def execute_model(
-            self,
-            execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
-        output = self.driver_worker.execute_model(execute_model_req)
-        return output
-
-    def add_lora(self, lora_request: LoRARequest) -> bool:
-        return self.driver_worker.add_lora(lora_request)
-
-    def remove_lora(self, lora_id: int) -> bool:
-        return self.driver_worker.remove_lora(lora_id)
-
-    def pin_lora(self, lora_id: int) -> bool:
-        return self.driver_worker.pin_lora(lora_id)
-
-    def list_loras(self) -> Set[int]:
-        return self.driver_worker.list_loras()
-
-    def add_prompt_adapter(self, prompt_adapter_request) -> bool:
-        raise NotImplementedError(
-            "Soft prompt is currently not supported by the OPENVINO backend.")
-
-    def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
-        raise NotImplementedError(
-            "Soft prompt is currently not supported by the OPENVINO backend.")
-
-    def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool:
-        raise NotImplementedError(
-            "Soft prompt is currently not supported by the OPENVINO backend.")
-
-    def list_prompt_adapters(self) -> Set[int]:
-        raise NotImplementedError(
-            "Soft prompt is currently not supported by the OPENVINO backend.")
-
-    def check_health(self) -> None:
-        # OpenVINOExecutor will always be healthy as long as
-        # it's running.
-        return
-
-
-class OpenVINOExecutorAsync(OpenVINOExecutor, ExecutorAsyncBase):
-
-    async def execute_model_async(
-            self,
-            execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
-        output = await make_async(self.driver_worker.execute_model
-                                  )(execute_model_req=execute_model_req, )
-        return output
-
-    async def check_health_async(self) -> None:
-        # OpenVINOExecutor will always be healthy as long as
-        # it's running.
-        return
diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_distributed_executor.py
similarity index 78%
rename from vllm/executor/ray_gpu_executor.py
rename to vllm/executor/ray_distributed_executor.py
index e2c549cbd5331..edceece4b68dc 100644
--- a/vllm/executor/ray_gpu_executor.py
+++ b/vllm/executor/ray_distributed_executor.py
@@ -1,24 +1,29 @@
 import asyncio
 import os
 from collections import defaultdict
-from itertools import islice, repeat
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any, Dict, List, Optional
 
 import msgspec
 
 import vllm.envs as envs
-from vllm.executor.distributed_gpu_executor import (  # yapf: disable
-    DistributedGPUExecutor, DistributedGPUExecutorAsync)
+from vllm.executor.executor_base import (
+    DistributedExecutorBase)  # yapf: disable
 from vllm.executor.msgspec_utils import encode_hook
-from vllm.executor.ray_utils import RayWorkerWrapper, ray
+from vllm.executor.ray_utils import (RayWorkerWrapper, initialize_ray_cluster,
+                                     ray)
 from vllm.logger import init_logger
 from vllm.model_executor.layers.sampler import SamplerOutput
+from vllm.platforms import current_platform
 from vllm.sequence import ExecuteModelRequest
 from vllm.utils import (_run_task_with_lock, get_distributed_init_method,
                         get_ip, get_open_port, make_async)
 
 if ray is not None:
+    from ray.actor import ActorHandle
     from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
+else:
+    ActorHandle = None
 
 if TYPE_CHECKING:
     from ray.util.placement_group import PlacementGroup
@@ -26,12 +31,29 @@
 logger = init_logger(__name__)
 
 
-class RayGPUExecutor(DistributedGPUExecutor):
+@dataclass
+class RayWorkerMetaData:
+    """
+    Metadata for a Ray worker.
+    The order of ray worker creation can be random,
+    and we need to reset the rank after creating all workers.
+    """
+    worker: ActorHandle
+    created_rank: int
+    adjusted_rank: int = -1
+    ip: str = ""
+
+
+class RayDistributedExecutor(DistributedExecutorBase):
 
     uses_ray: bool = True
 
     def _init_executor(self) -> None:
         self.forward_dag: Optional[ray.dag.CompiledDAG] = None
+        if envs.VLLM_USE_V1:
+            # v1 always uses the compiled DAG and SPMD worker.
+            os.environ["VLLM_USE_RAY_SPMD_WORKER"] = "1"
+            os.environ["VLLM_USE_RAY_COMPILED_DAG"] = "1"
         # If the env var is set, it uses the Ray's compiled DAG API
         # which optimizes the control plane overhead.
         # Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it.
@@ -53,6 +75,7 @@ def _init_executor(self) -> None:
                 "VLLM_USE_RAY_COMPILED_DAG=1")
 
         assert self.uses_ray
+        initialize_ray_cluster(self.parallel_config)
         placement_group = self.parallel_config.placement_group
 
         # Disable Ray usage stats collection.
@@ -66,6 +89,13 @@ def _init_executor(self) -> None:
         self.input_encoder = msgspec.msgpack.Encoder(enc_hook=encode_hook)
         self.output_decoder = msgspec.msgpack.Decoder(
             Optional[List[SamplerOutput]])
+        self.use_v1 = envs.VLLM_USE_V1
+
+        self.pp_locks: Optional[List[asyncio.Lock]] = None
+        self.use_ray_spmd_worker = envs.VLLM_USE_RAY_SPMD_WORKER
+        if not self.use_ray_compiled_dag:
+            self.driver_exec_method = make_async(
+                self.driver_worker.execute_method)
 
     def shutdown(self) -> None:
         if hasattr(self, "forward_dag") and self.forward_dag is not None:
@@ -123,9 +153,10 @@ def _init_workers_ray(self, placement_group: "PlacementGroup",
 
         # Create the workers.
         driver_ip = get_ip()
-        workers = []
+        rank = 0
+        worker_metadata: List[RayWorkerMetaData] = []
         for bundle_id, bundle in enumerate(placement_group.bundle_specs):
-            if not bundle.get("GPU", 0):
+            if not bundle.get(current_platform.ray_device_key, 0):
                 continue
             scheduling_strategy = PlacementGroupSchedulingStrategy(
                 placement_group=placement_group,
@@ -133,38 +164,51 @@ def _init_workers_ray(self, placement_group: "PlacementGroup",
                 placement_group_bundle_index=bundle_id,
             )
 
-            worker = ray.remote(
-                num_cpus=0,
-                num_gpus=num_gpus,
-                scheduling_strategy=scheduling_strategy,
-                **ray_remote_kwargs,
-            )(RayWorkerWrapper).remote(vllm_config=self.vllm_config)
-            workers.append(worker)
-
-        worker_ip_refs = [
-            worker.get_node_ip.remote()  # type: ignore[attr-defined]
-            for worker in workers
-        ]
-        worker_ips = ray.get(worker_ip_refs)
+            if current_platform.ray_device_key == "GPU":
+                # NV+AMD GPUs, and Intel XPUs
+                worker = ray.remote(
+                    num_cpus=0,
+                    num_gpus=num_gpus,
+                    scheduling_strategy=scheduling_strategy,
+                    **ray_remote_kwargs,
+                )(RayWorkerWrapper).remote(vllm_config=self.vllm_config,
+                                           rank=rank)
+            else:
+                worker = ray.remote(
+                    num_cpus=0,
+                    num_gpus=0,
+                    resources={current_platform.ray_device_key: num_gpus},
+                    scheduling_strategy=scheduling_strategy,
+                    **ray_remote_kwargs,
+                )(RayWorkerWrapper).remote(vllm_config=self.vllm_config,
+                                           rank=rank)
+            worker_metadata.append(
+                RayWorkerMetaData(worker=worker, created_rank=rank))
+            rank += 1
+
+        worker_ips = ray.get([
+            each.worker.get_node_ip.remote()  # type: ignore[attr-defined]
+            for each in worker_metadata
+        ])
+
+        for each, ip in zip(worker_metadata, worker_ips):
+            each.ip = ip
 
         if not self.use_ray_spmd_worker:
-            for i in range(len(workers)):
-                worker = workers[i]
-                worker_ip = worker_ips[i]
+            for i, each in enumerate(worker_metadata):
+                # find and remove the dummy worker from the list
+                worker = each.worker
+                worker_ip = each.ip
                 if self.driver_dummy_worker is None and worker_ip == driver_ip:
                     # If the worker is on the same node as the driver, we use it
                     # as the resource holder for the driver process.
                     self.driver_dummy_worker = worker
                     self.driver_worker = RayWorkerWrapper(
-                        vllm_config=self.vllm_config)
-                    workers.pop(i)
-                    worker_ips.pop(i)
-                    self.workers = workers
+                        vllm_config=self.vllm_config, rank=0)
+                    worker_metadata.pop(i)
                     break
-        else:
-            self.workers = workers
 
-        logger.debug("workers: %s", self.workers)
+        logger.debug("workers: %s", worker_metadata)
         logger.debug("driver_dummy_worker: %s", self.driver_dummy_worker)
         if not self.use_ray_spmd_worker and self.driver_dummy_worker is None:
             raise ValueError(
@@ -176,9 +220,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup",
         for ip in worker_ips:
             ip_counts[ip] = ip_counts.get(ip, 0) + 1
 
-        worker_to_ip = dict(zip(self.workers, worker_ips))
-
-        def sort_by_driver_then_worker_ip(worker):
+        def sort_by_driver_then_worker_ip(item: RayWorkerMetaData):
             """
             Sort the workers based on 3 properties:
             1. If the worker is on the same node as the driver (vllm engine),
@@ -188,13 +230,23 @@ def sort_by_driver_then_worker_ip(worker):
             3. Finally, if the work is on a node with smaller IP address, it
                 should be placed first.
             """
-            ip = worker_to_ip[worker]
-            return (ip != driver_ip, ip_counts[ip], ip)
+            ip = item.ip
+            return (0 if ip == driver_ip else 1, ip_counts[ip], ip)
 
         # After sorting, the workers on the same node will be
         # close to each other, and the workers on the driver
         # node will be placed first.
-        self.workers = sorted(self.workers, key=sort_by_driver_then_worker_ip)
+        sorted_worker_metadata = sorted(worker_metadata,
+                                        key=sort_by_driver_then_worker_ip)
+        start_rank = 0 if self.use_ray_spmd_worker else 1
+        for i, item in enumerate(sorted_worker_metadata):
+            item.adjusted_rank = i + start_rank
+        self.workers = [item.worker for item in sorted_worker_metadata]
+        rerank_mapping = {
+            item.created_rank: item.adjusted_rank
+            for item in sorted_worker_metadata
+        }
+        self._run_workers("adjust_rank", rerank_mapping)
 
         # Get the set of GPU IDs used on each node.
         worker_node_and_gpu_ids = []
@@ -235,21 +287,29 @@ def sort_by_driver_then_worker_ip(worker):
                 " each node.")
 
         # Set environment variables for the driver and workers.
-        all_args_to_update_environment_variables = [({
-            "CUDA_VISIBLE_DEVICES":
+        all_args_to_update_environment_variables = [{
+            current_platform.device_control_env_var:
             ",".join(map(str, node_gpus[node_id])),
-            "VLLM_TRACE_FUNCTION":
-            str(envs.VLLM_TRACE_FUNCTION),
-            **({
-                "VLLM_ATTENTION_BACKEND": envs.VLLM_ATTENTION_BACKEND
-            } if envs.VLLM_ATTENTION_BACKEND is not None else {})
-        }, ) for (node_id, _) in worker_node_and_gpu_ids]
+        } for (node_id, _) in worker_node_and_gpu_ids]
+
+        for args in all_args_to_update_environment_variables:
+            # some carry-over env vars from the driver
+            # TODO: refactor platform-specific env vars
+            for name in [
+                    "VLLM_ATTENTION_BACKEND",
+                    "TPU_CHIPS_PER_HOST_BOUNDS",
+                    "TPU_HOST_BOUNDS",
+                    "VLLM_USE_V1",
+                    "VLLM_TRACE_FUNCTION",
+            ]:
+                if name in os.environ:
+                    args[name] = os.environ[name]
 
         self._env_vars_for_all_workers = (
             all_args_to_update_environment_variables)
 
         self._run_workers("update_environment_variables",
-                          all_args=self._get_env_vars_to_be_updated())
+                          self._get_env_vars_to_be_updated())
 
         if len(node_gpus) == 1:
             # in single node case, we don't need to get the IP address.
@@ -265,14 +325,19 @@ def sort_by_driver_then_worker_ip(worker):
             driver_ip, get_open_port())
 
         # Initialize the actual workers inside worker wrapper.
-        init_worker_all_kwargs = [
-            self._get_worker_kwargs(
-                local_rank=node_workers[node_id].index(rank),
+        all_kwargs = []
+        for rank, (node_id, _) in enumerate(worker_node_and_gpu_ids):
+            local_rank = node_workers[node_id].index(rank)
+            kwargs = dict(
+                vllm_config=self.vllm_config,
+                local_rank=local_rank,
                 rank=rank,
                 distributed_init_method=distributed_init_method,
-            ) for rank, (node_id, _) in enumerate(worker_node_and_gpu_ids)
-        ]
-        self._run_workers("init_worker", all_kwargs=init_worker_all_kwargs)
+                is_driver_worker=(not self.parallel_config)
+                or (rank % self.parallel_config.tensor_parallel_size == 0),
+            )
+            all_kwargs.append(kwargs)
+        self._run_workers("init_worker", all_kwargs)
 
         self._run_workers("init_device")
         self._run_workers("load_model",
@@ -332,9 +397,15 @@ def execute_model(
         if self.forward_dag is None:
             self.forward_dag = self._compiled_ray_dag(enable_asyncio=False)
 
-        serialized_data = self.input_encoder.encode(execute_model_req)
+        if self.use_v1:
+            serialized_data = execute_model_req
+        else:
+            serialized_data = self.input_encoder.encode(execute_model_req)
         outputs = ray.get(self.forward_dag.execute(serialized_data))
-        output = self.output_decoder.decode(outputs[0])
+        if self.use_v1:
+            output = outputs[0]
+        else:
+            output = self.output_decoder.decode(outputs[0])
         return output
 
     def _run_workers(
@@ -342,8 +413,6 @@ def _run_workers(
         method: str,
         *args,
         async_run_tensor_parallel_workers_only: bool = False,
-        all_args: Optional[List[Tuple[Any, ...]]] = None,
-        all_kwargs: Optional[List[Dict[str, Any]]] = None,
         max_concurrent_workers: Optional[int] = None,
         **kwargs,
     ) -> Any:
@@ -356,8 +425,6 @@ def _run_workers(
           It will also be run asynchronously and return a list of futures
           rather than blocking on the results.
         - args/kwargs: All workers share the same args/kwargs
-        - all_args/all_kwargs: args/kwargs for each worker are specified
-          individually
         """
         if self.use_ray_spmd_worker:
             assert not async_run_tensor_parallel_workers_only, (
@@ -368,26 +435,13 @@ def _run_workers(
             raise NotImplementedError(
                 "max_concurrent_workers is not supported yet.")
 
-        count = len(self.workers) if not \
-            async_run_tensor_parallel_workers_only \
-            else len(self.non_driver_workers)
-        # If using SPMD worker, all workers are the same, so we should execute
-        # the args on all workers. Otherwise, we skip the first worker's args
-        # because those args will go to the driver worker.
-        first_worker_args_index: int = 0 if self.use_ray_spmd_worker else 1
-        all_worker_args = repeat(args, count) if all_args is None \
-            else islice(all_args, first_worker_args_index, None)
-        all_worker_kwargs = repeat(kwargs, count) if all_kwargs is None \
-            else islice(all_kwargs, first_worker_args_index, None)
-
         # Start the ray workers first.
         ray_workers = self.workers
         if async_run_tensor_parallel_workers_only:
             ray_workers = self.non_driver_workers
         ray_worker_outputs = [
-            worker.execute_method.remote(method, *worker_args, **worker_kwargs)
-            for (worker, worker_args, worker_kwargs
-                 ) in zip(ray_workers, all_worker_args, all_worker_kwargs)
+            worker.execute_method.remote(method, *args, **kwargs)
+            for worker in ray_workers
         ]
 
         if async_run_tensor_parallel_workers_only:
@@ -399,13 +453,9 @@ def _run_workers(
         # so we only explicitly execute on the driver worker if using a
         # non-SPMD worker class.
         if not self.use_ray_spmd_worker:
-            driver_args = args if all_args is None else all_args[0]
-            driver_kwargs = kwargs if all_kwargs is None else all_kwargs[0]
-
             # Start the driver worker after all the ray workers.
             driver_worker_output = [
-                self.driver_worker.execute_method(method, *driver_args,
-                                                  **driver_kwargs)
+                self.driver_worker.execute_method(method, *args, **kwargs)
             ]
 
         # Get the results of the ray workers.
@@ -467,11 +517,18 @@ def _compiled_ray_dag(self, enable_asyncio: bool):
             for pp_rank, tp_group in enumerate(self.pp_tp_workers):
                 # Each PP worker takes in the output of the previous PP worker,
                 # and the TP group executes in SPMD fashion.
-                outputs = [
-                    worker.execute_model_spmd.
-                    bind(  # type: ignore[attr-defined]
-                        outputs[i]) for i, worker in enumerate(tp_group)
-                ]
+                if self.use_v1:
+                    outputs = [
+                        worker.execute_model.
+                        bind(  # type: ignore[attr-defined]
+                            outputs[i]) for i, worker in enumerate(tp_group)
+                    ]
+                else:
+                    outputs = [
+                        worker.execute_model_spmd.
+                        bind(  # type: ignore[attr-defined]
+                            outputs[i]) for i, worker in enumerate(tp_group)
+                    ]
 
                 last_pp_rank = len(self.pp_tp_workers) - 1
                 if pp_rank < last_pp_rank:
@@ -497,17 +554,6 @@ def _compiled_ray_dag(self, enable_asyncio: bool):
     def __del__(self):
         self.shutdown()
 
-
-class RayGPUExecutorAsync(RayGPUExecutor, DistributedGPUExecutorAsync):
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.pp_locks: Optional[List[asyncio.Lock]] = None
-        self.use_ray_spmd_worker = envs.VLLM_USE_RAY_SPMD_WORKER
-        if not self.use_ray_compiled_dag:
-            self.driver_exec_method = make_async(
-                self.driver_worker.execute_method)
-
     async def execute_model_async(
             self,
             execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
@@ -568,5 +614,7 @@ async def _start_worker_execution_loop(self):
         ]
         return await asyncio.gather(*coros)
 
-    def __del__(self):
-        self.shutdown()
+    def check_health(self) -> None:
+        # Assume that the Ray workers are healthy.
+        # TODO: check the health of the Ray workers
+        return
diff --git a/vllm/executor/ray_hpu_executor.py b/vllm/executor/ray_hpu_executor.py
deleted file mode 100644
index f3025cb537ab8..0000000000000
--- a/vllm/executor/ray_hpu_executor.py
+++ /dev/null
@@ -1,515 +0,0 @@
-import asyncio
-import os
-from collections import defaultdict
-from itertools import islice, repeat
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
-
-import msgspec
-
-import vllm.envs as envs
-from vllm.executor.distributed_gpu_executor import (  # yapf: disable
-    DistributedGPUExecutor, DistributedGPUExecutorAsync)
-from vllm.executor.msgspec_utils import encode_hook
-from vllm.executor.ray_utils import RayWorkerWrapper, ray
-from vllm.logger import init_logger
-from vllm.model_executor.layers.sampler import SamplerOutput
-from vllm.sequence import ExecuteModelRequest
-from vllm.utils import (_run_task_with_lock, get_distributed_init_method,
-                        get_ip, get_open_port, make_async)
-
-if ray is not None:
-    from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
-
-if TYPE_CHECKING:
-    from ray.util.placement_group import PlacementGroup
-
-logger = init_logger(__name__)
-
-
-class RayHPUExecutor(DistributedGPUExecutor):
-
-    uses_ray: bool = True
-
-    def _init_executor(self) -> None:
-        self.forward_dag: Optional[ray.dag.CompiledDAG] = None
-        # If the env var is set, it uses the Ray's compiled DAG API
-        # which optimizes the control plane overhead.
-        # Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it.
-        # Currently, this requires USE_RAY_SPMD_WORKER=True.
-        self.use_ray_compiled_dag = envs.VLLM_USE_RAY_COMPILED_DAG
-        # If the env var is set, then we do not distinguish between the
-        # "driver worker" vs other workers. Also, the rank 0 worker will
-        # be executed in a remote Ray worker. Currently this requires
-        # USE_RAY_COMPILED_DAG=True.
-        self.use_ray_spmd_worker = envs.VLLM_USE_RAY_SPMD_WORKER
-        if self.use_ray_compiled_dag:
-            assert self.use_ray_spmd_worker, (
-                "VLLM_USE_RAY_COMPILED_DAG=1 requires "
-                "VLLM_USE_RAY_SPMD_WORKER=1")
-        if self.use_ray_spmd_worker:
-            # TODO: Support SPMD worker for non-DAG Ray executor.
-            assert self.use_ray_compiled_dag, (
-                "VLLM_USE_RAY_SPMD_WORKER=1 requires "
-                "VLLM_USE_RAY_COMPILED_DAG=1")
-
-        assert self.uses_ray
-        placement_group = self.parallel_config.placement_group
-
-        # Disable Ray usage stats collection.
-        ray_usage = os.environ.get("RAY_USAGE_STATS_ENABLED", "0")
-        if ray_usage != "1":
-            os.environ["RAY_USAGE_STATS_ENABLED"] = "0"
-
-        # Create the parallel GPU workers.
-        self._init_workers_ray(placement_group)
-
-        self.input_encoder = msgspec.msgpack.Encoder(enc_hook=encode_hook)
-        self.output_decoder = msgspec.msgpack.Decoder(
-            Optional[List[SamplerOutput]])
-
-    def shutdown(self) -> None:
-        if hasattr(self, "forward_dag") and self.forward_dag is not None:
-            self.forward_dag.teardown()
-            import ray
-            for worker in self.workers:
-                ray.kill(worker)
-            self.forward_dag = None
-
-    def finish_measurements(self):
-        self._run_workers("finish_measurements")
-
-    def _init_workers_ray(self, placement_group: "PlacementGroup",
-                          **ray_remote_kwargs):
-        # Otherwise, the ray workers are allocated with a full GPU.
-        num_gpus = 1
-
-        # The driver dummy worker does not actually use any resources.
-        # It holds the resource for the driver worker.
-        self.driver_dummy_worker: Optional[RayWorkerWrapper] = None
-        # The remaining workers are the actual ray actors.
-        self.workers: List[RayWorkerWrapper] = []
-
-        # Used in ray compiled DAG: indexed first by PP rank,
-        # and then TP rank. In other words, the inner list is
-        # the TP group of workers for a PP rank.
-        self.pp_tp_workers: List[List[RayWorkerWrapper]] = []
-
-        logger.info("use_ray_spmd_worker: %s", self.use_ray_spmd_worker)
-
-        # Create the workers.
-        driver_ip = get_ip()
-        for bundle_id, bundle in enumerate(placement_group.bundle_specs):
-            if not bundle.get("HPU", 0):
-                continue
-            scheduling_strategy = PlacementGroupSchedulingStrategy(
-                placement_group=placement_group,
-                placement_group_capture_child_tasks=True,
-                placement_group_bundle_index=bundle_id,
-            )
-
-            worker = ray.remote(
-                num_cpus=0,
-                num_gpus=0,
-                resources={'HPU': num_gpus},
-                scheduling_strategy=scheduling_strategy,
-                **ray_remote_kwargs,
-            )(RayWorkerWrapper).remote(vllm_config=self.vllm_config)
-
-            if self.use_ray_spmd_worker:
-                self.workers.append(worker)
-            else:
-                worker_ip = ray.get(worker.get_node_ip.remote())
-                if worker_ip == driver_ip and self.driver_dummy_worker is None:
-                    # If the worker is on the same node as the driver, we use it
-                    # as the resource holder for the driver process.
-                    self.driver_dummy_worker = worker
-                    self.driver_worker = RayWorkerWrapper(
-                        vllm_config=self.vllm_config)
-                else:
-                    # Else, added to the list of workers.
-                    self.workers.append(worker)
-
-        logger.debug("workers: %s", self.workers)
-        logger.debug("driver_dummy_worker: %s", self.driver_dummy_worker)
-        if not self.use_ray_spmd_worker and self.driver_dummy_worker is None:
-            raise ValueError(
-                "Ray does not allocate any GPUs on the driver node. Consider "
-                "adjusting the Ray placement group or running the driver on a "
-                "GPU node.")
-
-        worker_ips = [
-            ray.get(worker.get_node_ip.remote())  # type: ignore[attr-defined]
-            for worker in self.workers
-        ]
-        ip_counts: Dict[str, int] = {}
-        for ip in worker_ips:
-            ip_counts[ip] = ip_counts.get(ip, 0) + 1
-
-        def sort_by_driver_then_worker_ip(worker):
-            """
-            Sort the workers based on 3 properties:
-            1. If the worker is on the same node as the driver (vllm engine),
-                it should be placed first.
-            2. Then, if the worker is on a node with fewer workers, it should
-                be placed first.
-            3. Finally, if the work is on a node with smaller IP address, it
-                should be placed first.
-            """
-            ip = ray.get(worker.get_node_ip.remote())
-            return (ip != driver_ip, ip_counts[ip], ip)
-
-        # After sorting, the workers on the same node will be
-        # close to each other, and the workers on the driver
-        # node will be placed first.
-        self.workers = sorted(self.workers, key=sort_by_driver_then_worker_ip)
-
-        worker_node_and_gpu_ids = []
-        for worker in [self.driver_dummy_worker] + self.workers:
-            if worker is None:
-                # driver_dummy_worker can be None when using ray spmd worker.
-                continue
-            worker_node_and_gpu_ids.append(
-                ray.get(worker.get_node_and_gpu_ids.remote()) \
-            ) # type: ignore
-
-        node_workers = defaultdict(list)  # node id -> list of worker ranks
-        node_gpus = defaultdict(list)  # node id -> list of gpu ids
-
-        for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids):
-            node_workers[node_id].append(i)
-            # `gpu_ids` can be a list of strings or integers.
-            # convert them to integers for consistency.
-            # NOTE: gpu_ids can be larger than 9 (e.g. 16 GPUs),
-            # string sorting is not sufficient.
-            # see https://github.com/vllm-project/vllm/issues/5590
-            gpu_ids = [int(x) for x in gpu_ids]
-            node_gpus[node_id].extend(gpu_ids)
-        for node_id, gpu_ids in node_gpus.items():
-            node_gpus[node_id] = sorted(gpu_ids)
-
-        all_ips = set(worker_ips + [driver_ip])
-        n_ips = len(all_ips)
-        n_nodes = len(node_workers)
-
-        if n_nodes != n_ips:
-            raise RuntimeError(
-                f"Every node should have a unique IP address. Got {n_nodes}"
-                f" nodes with node ids {list(node_workers.keys())} and "
-                f"{n_ips} unique IP addresses {all_ips}. Please check your"
-                " network configuration. If you set `VLLM_HOST_IP` "
-                "environment variable, make sure it is unique for"
-                " each node.")
-
-        # Set environment variables for the driver and workers.
-        all_args_to_update_environment_variables = [({
-            "VLLM_TRACE_FUNCTION":
-            str(envs.VLLM_TRACE_FUNCTION),
-        }, ) for (node_id, _) in worker_node_and_gpu_ids]
-        self._run_workers("update_environment_variables",
-                          all_args=all_args_to_update_environment_variables)
-
-        if len(node_gpus) == 1:
-            # in single node case, we don't need to get the IP address.
-            # the loopback address is sufficient
-            # NOTE: a node may have several IP addresses, one for each
-            # network interface. `get_ip()` might return any of them,
-            # while they might not work for communication inside the node
-            # if the network setup is complicated. Using the loopback address
-            # solves this issue, as it always works for communication inside
-            # the node.
-            driver_ip = "127.0.0.1"
-        distributed_init_method = get_distributed_init_method(
-            driver_ip, get_open_port())
-
-        # Initialize the actual workers inside worker wrapper.
-        init_worker_all_kwargs = [
-            self._get_worker_kwargs(
-                local_rank=node_workers[node_id].index(rank),
-                rank=rank,
-                distributed_init_method=distributed_init_method,
-            ) for rank, (node_id, _) in enumerate(worker_node_and_gpu_ids)
-        ]
-        self._run_workers("init_worker", all_kwargs=init_worker_all_kwargs)
-
-        self._run_workers("init_device")
-        self._run_workers("load_model",
-                          max_concurrent_workers=self.parallel_config.
-                          max_parallel_loading_workers)
-
-        if self.use_ray_spmd_worker:
-            for pp_rank in range(self.parallel_config.pipeline_parallel_size):
-                self.pp_tp_workers.append([])
-                for tp_rank in range(
-                        self.parallel_config.tensor_parallel_size):
-                    # PP=2, TP=4
-                    # pp_tp_workers = [[0, 1, 2, 3], [4, 5, 6, 7]]
-                    rank = (pp_rank * self.parallel_config.tensor_parallel_size
-                            ) + tp_rank
-                    assert len(self.pp_tp_workers[pp_rank]) == tp_rank
-                    assert pp_rank < len(self.pp_tp_workers)
-                    self.pp_tp_workers[pp_rank].append(self.workers[rank])
-
-        # This is the list of workers that are rank 0 of each TP group EXCEPT
-        # global rank 0. These are the workers that will broadcast to the
-        # rest of the workers.
-        self.tp_driver_workers: List[RayWorkerWrapper] = []
-        # This is the list of workers that are not drivers and not the first
-        # worker in a TP group. These are the workers that will be
-        # broadcasted to.
-        self.non_driver_workers: List[RayWorkerWrapper] = []
-
-        # Enforce rank order for correct rank to return final output.
-        for index, worker in enumerate(self.workers):
-            # The driver worker is rank 0 and not in self.workers.
-            rank = index + 1
-            if rank % self.parallel_config.tensor_parallel_size == 0:
-                self.tp_driver_workers.append(worker)
-            else:
-                self.non_driver_workers.append(worker)
-
-    def _driver_execute_model(
-        self, execute_model_req: Optional[ExecuteModelRequest]
-    ) -> Optional[List[SamplerOutput]]:
-        """Run execute_model in the driver worker.
-
-        Passing None will cause the driver to stop the model execution
-        loop running in each of the remote workers.
-        """
-        assert not self.use_ray_spmd_worker, (
-            "driver_worker does not exist for VLLM_USE_RAY_SPMD_WORKER=1")
-        return self.driver_worker.execute_method("execute_model",
-                                                 execute_model_req)
-
-    def execute_model(
-            self,
-            execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
-        if not self.use_ray_spmd_worker:
-            return super().execute_model(execute_model_req)
-
-        if self.forward_dag is None:
-            self.forward_dag = self._compiled_ray_dag(enable_asyncio=False)
-
-        serialized_data = self.input_encoder.encode(execute_model_req)
-        outputs = ray.get(self.forward_dag.execute(serialized_data))
-        output = self.output_decoder.decode(outputs[0])
-        return output
-
-    def _run_workers(
-        self,
-        method: str,
-        *args,
-        async_run_tensor_parallel_workers_only: bool = False,
-        all_args: Optional[List[Tuple[Any, ...]]] = None,
-        all_kwargs: Optional[List[Dict[str, Any]]] = None,
-        max_concurrent_workers: Optional[int] = None,
-        **kwargs,
-    ) -> Any:
-        """Runs the given method on all workers. Can be used in the following
-        ways:
-
-        Args:
-        - async_run_tensor_parallel_workers_only: If True the method will be
-          run only in the remote TP workers, not the driver worker.
-          It will also be run asynchronously and return a list of futures
-          rather than blocking on the results.
-        - args/kwargs: All workers share the same args/kwargs
-        - all_args/all_kwargs: args/kwargs for each worker are specified
-          individually
-        """
-        if self.use_ray_spmd_worker:
-            assert not async_run_tensor_parallel_workers_only, (
-                "async_run_tensor_parallel_workers_only is not supported for "
-                "spmd mode.")
-
-        if max_concurrent_workers:
-            raise NotImplementedError(
-                "max_concurrent_workers is not supported yet.")
-
-        count = len(self.workers) if not \
-            async_run_tensor_parallel_workers_only \
-            else len(self.non_driver_workers)
-        # If using SPMD worker, all workers are the same, so we should execute
-        # the args on all workers. Otherwise, we skip the first worker's args
-        # because those args will go to the driver worker.
-        first_worker_args_index: int = 0 if self.use_ray_spmd_worker else 1
-        all_worker_args = repeat(args, count) if all_args is None \
-            else islice(all_args, first_worker_args_index, None)
-        all_worker_kwargs = repeat(kwargs, count) if all_kwargs is None \
-            else islice(all_kwargs, first_worker_args_index, None)
-
-        # Start the ray workers first.
-        ray_workers = self.workers
-        if async_run_tensor_parallel_workers_only:
-            ray_workers = self.non_driver_workers
-        ray_worker_outputs = [
-            worker.execute_method.remote(method, *worker_args, **worker_kwargs)
-            for (worker, worker_args, worker_kwargs
-                 ) in zip(ray_workers, all_worker_args, all_worker_kwargs)
-        ]
-
-        if async_run_tensor_parallel_workers_only:
-            # Just return futures
-            return ray_worker_outputs
-
-        driver_worker_output = []
-        # In SPMD mode, the driver worker is the same as any other worker,
-        # so we only explicitly execute on the driver worker if using a
-        # non-SPMD worker class.
-        if not self.use_ray_spmd_worker:
-            driver_args = args if all_args is None else all_args[0]
-            driver_kwargs = kwargs if all_kwargs is None else all_kwargs[0]
-
-            # Start the driver worker after all the ray workers.
-            driver_worker_output = [
-                self.driver_worker.execute_method(method, *driver_args,
-                                                  **driver_kwargs)
-            ]
-
-        # Get the results of the ray workers.
-        if self.workers:
-            ray_worker_outputs = ray.get(ray_worker_outputs)
-
-        return driver_worker_output + ray_worker_outputs
-
-    def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None:
-        """Wait for futures returned from _run_workers() with
-        async_run_remote_workers_only to complete."""
-        ray.get(parallel_worker_tasks)
-
-    def _check_ray_adag_installation(self):
-        import pkg_resources
-        from packaging import version
-
-        required_version = version.parse("2.35")
-        current_version = version.parse(
-            pkg_resources.get_distribution("ray").version)
-        # TODO: update the constraint once we adapt to the backward
-        # incompatible API change from ray 2.36
-        if current_version != required_version:
-            raise ValueError(f"Ray version {required_version} is "
-                             f"required, but found {current_version}")
-
-        import importlib.util
-        adag_spec = importlib.util.find_spec(
-            "ray.experimental.compiled_dag_ref")
-        if adag_spec is None:
-            raise ValueError("Ray accelerated DAG is not installed. "
-                             "Run `pip install ray[adag]` to install it.")
-
-    def _compiled_ray_dag(self, enable_asyncio: bool):
-        assert self.parallel_config.use_ray
-        self._check_ray_adag_installation()
-        from ray.dag import InputNode, MultiOutputNode
-        from ray.experimental.channel.torch_tensor_type import TorchTensorType
-
-        with InputNode() as input_data:
-            # Example DAG: PP=2, TP=4
-            # (ExecuteModelReq, None) -> 0 -> (ExecuteModelReq, IntermediateOutput) -> 4 -> SamplerOutput   # noqa: E501
-            #                         -> 1 -> (ExecuteModelReq, IntermediateOutput) -> 5 -> SamplerOutput   # noqa: E501
-            #                         -> 2 -> (ExecuteModelReq, IntermediateOutput) -> 6 -> SamplerOutput   # noqa: E501
-            #                         -> 3 -> (ExecuteModelReq, IntermediateOutput) -> 7 -> SamplerOutput   # noqa: E501
-
-            # All workers in the first TP group will take in the
-            # ExecuteModelRequest as input.
-            outputs = [input_data for _ in self.pp_tp_workers[0]]
-            for pp_rank, tp_group in enumerate(self.pp_tp_workers):
-                # Each PP worker takes in the output of the previous PP worker,
-                # and the TP group executes in SPMD fashion.
-                outputs = [
-                    worker.execute_model_spmd.
-                    bind(  # type: ignore[attr-defined]
-                        outputs[i]) for i, worker in enumerate(tp_group)
-                ]
-
-                last_pp_rank = len(self.pp_tp_workers) - 1
-                if pp_rank < last_pp_rank:
-                    # Specify how intermediate tensors should be passed
-                    # between pp stages, no need to specify for the last
-                    # pp stage.
-                    transport = "auto"
-                    outputs = [
-                        output.with_type_hint(
-                            TorchTensorType(transport=transport))
-                        for output in outputs
-                    ]
-
-            forward_dag = MultiOutputNode(outputs)
-
-        return forward_dag.experimental_compile(enable_asyncio=enable_asyncio)
-
-    def __del__(self):
-        self.shutdown()
-
-
-class RayHPUExecutorAsync(RayHPUExecutor, DistributedGPUExecutorAsync):
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.pp_locks: Optional[List[asyncio.Lock]] = None
-        self.use_ray_spmd_worker = envs.VLLM_USE_RAY_SPMD_WORKER
-        if not self.use_ray_compiled_dag:
-            self.driver_exec_method = make_async(
-                self.driver_worker.execute_method)
-
-    async def execute_model_async(
-            self,
-            execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
-        if not self.use_ray_spmd_worker:
-            return await super().execute_model_async(execute_model_req)
-
-        if self.forward_dag is None:
-            self.forward_dag = self._compiled_ray_dag(enable_asyncio=True)
-
-        serialized_data = self.input_encoder.encode(execute_model_req)
-        dag_future = await self.forward_dag.execute_async(serialized_data)
-        outputs = await dag_future
-        return self.output_decoder.decode(outputs[0])
-
-    async def _driver_execute_model_async(
-        self,
-        execute_model_req: Optional[ExecuteModelRequest] = None
-    ) -> List[SamplerOutput]:
-        assert not self.use_ray_spmd_worker, (
-            "driver_worker does not exist for VLLM_USE_RAY_SPMD_WORKER=1")
-        if not self.tp_driver_workers:
-            return await self.driver_exec_method("execute_model",
-                                                 execute_model_req)
-        if self.pp_locks is None:
-            # This locks each pipeline parallel stage so multiple virtual
-            # engines can't execute on the same stage at the same time
-            # We create the locks here to avoid creating them in the constructor
-            # which uses a different asyncio loop.
-            self.pp_locks = [
-                asyncio.Lock()
-                for _ in range(self.parallel_config.pipeline_parallel_size)
-            ]
-
-        tasks = [
-            asyncio.create_task(
-                _run_task_with_lock(self.driver_exec_method, self.pp_locks[0],
-                                    "execute_model", execute_model_req))
-        ]
-        for pp_rank, driver_worker in enumerate(self.tp_driver_workers,
-                                                start=1):
-            tasks.append(
-                asyncio.create_task(
-                    _run_task_with_lock(driver_worker.execute_method.remote,
-                                        self.pp_locks[pp_rank],
-                                        "execute_model", execute_model_req)))
-
-        results = await asyncio.gather(*tasks)
-
-        # Only the last PP stage has the final results.
-        return results[-1]
-
-    async def _start_worker_execution_loop(self):
-        assert not self.use_ray_spmd_worker, (
-            "worker loop is disabled for VLLM_USE_RAY_SPMD_WORKER=1")
-        coros = [
-            worker.execute_method.remote("start_worker_execution_loop")
-            for worker in self.non_driver_workers
-        ]
-        return await asyncio.gather(*coros)
-
-    def __del__(self):
-        self.shutdown()
diff --git a/vllm/executor/ray_tpu_executor.py b/vllm/executor/ray_tpu_executor.py
deleted file mode 100644
index 5118c13934f0d..0000000000000
--- a/vllm/executor/ray_tpu_executor.py
+++ /dev/null
@@ -1,343 +0,0 @@
-import asyncio
-import os
-from collections import defaultdict
-from itertools import islice, repeat
-from typing import (TYPE_CHECKING, Any, Awaitable, Dict, List, Optional, Tuple,
-                    Union)
-
-import vllm.envs as envs
-from vllm.executor.executor_base import ExecutorAsyncBase
-from vllm.executor.ray_utils import RayWorkerWrapper, ray
-from vllm.executor.tpu_executor import TPUExecutor
-from vllm.logger import init_logger
-from vllm.model_executor.layers.sampler import SamplerOutput
-from vllm.sequence import ExecuteModelRequest
-from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
-                        make_async)
-
-if ray is not None:
-    from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
-
-if TYPE_CHECKING:
-    from ray.util.placement_group import PlacementGroup
-
-logger = init_logger(__name__)
-
-
-class RayTPUExecutor(TPUExecutor):
-
-    uses_ray: bool = True
-
-    def __init__(self, *args, **kwargs):
-        # This is non-None when the execute model loop is running
-        # in the parallel workers. It's a coroutine in the AsyncLLMEngine case.
-        self.parallel_worker_tasks: Optional[Union[Any, Awaitable[Any]]] = None
-        # Updated by implementations that require additional args to be passed
-        # to the _run_workers execute_model call
-        self.extra_execute_model_run_workers_kwargs: Dict[str, Any] = {}
-
-        super().__init__(*args, **kwargs)
-
-    def _init_executor(self) -> None:
-        assert self.parallel_config.distributed_executor_backend == "ray"
-        placement_group = self.parallel_config.placement_group
-
-        # Disable Ray usage stats collection.
-        ray_usage = os.environ.get("RAY_USAGE_STATS_ENABLED", "0")
-        if ray_usage != "1":
-            os.environ["RAY_USAGE_STATS_ENABLED"] = "0"
-
-        # Create the parallel TPU workers.
-        self._init_workers_ray(placement_group)
-
-    def _init_workers_ray(self, placement_group: "PlacementGroup",
-                          **ray_remote_kwargs):
-        # The driver dummy worker does not actually use any resources.
-        # It holds the resource for the driver worker.
-        self.driver_dummy_worker: Optional[RayWorkerWrapper] = None
-        # The remaining workers are the actual ray actors.
-        self.workers: List[RayWorkerWrapper] = []
-
-        # Create the workers.
-        driver_ip = get_ip()
-        for bundle_id, bundle in enumerate(placement_group.bundle_specs):
-            if not bundle.get("TPU", 0):
-                continue
-            scheduling_strategy = PlacementGroupSchedulingStrategy(
-                placement_group=placement_group,
-                placement_group_capture_child_tasks=True,
-                placement_group_bundle_index=bundle_id,
-            )
-
-            # GKE does not fetch environment information from metadata server
-            # and instead sets these from within the Ray process. Therefore we
-            # need to override the Ray environment variables manually.
-            override_env = {}
-            if "TPU_CHIPS_PER_HOST_BOUNDS" in os.environ:
-                override_env.update({
-                    "TPU_CHIPS_PER_HOST_BOUNDS":
-                    os.environ["TPU_CHIPS_PER_HOST_BOUNDS"]
-                })
-            if "TPU_HOST_BOUNDS" in os.environ:
-                override_env.update(
-                    {"TPU_HOST_BOUNDS": os.environ["TPU_HOST_BOUNDS"]})
-
-            worker = ray.remote(
-                num_cpus=0,
-                resources={"TPU": 1},
-                scheduling_strategy=scheduling_strategy,
-                **ray_remote_kwargs,
-            )(RayWorkerWrapper).remote(vllm_config=self.vllm_config)
-            if override_env:
-                worker.override_env_vars.remote(override_env)
-
-            worker_ip = ray.get(worker.get_node_ip.remote())
-            if worker_ip == driver_ip and self.driver_dummy_worker is None:
-                # If the worker is on the same node as the driver, we use it
-                # as the resource holder for the driver process.
-                self.driver_dummy_worker = worker
-                self.driver_worker = RayWorkerWrapper(
-                    vllm_config=self.vllm_config)
-            else:
-                # Else, added to the list of workers.
-                self.workers.append(worker)
-
-        logger.debug("workers: %s", self.workers)
-        logger.debug("driver_dummy_worker: %s", self.driver_dummy_worker)
-        if self.driver_dummy_worker is None:
-            raise ValueError(
-                "Ray does not allocate any TPUs on the driver node. Consider "
-                "adjusting the Ray placement group or running the driver on a "
-                "TPU node.")
-
-        worker_ips = [
-            ray.get(worker.get_node_ip.remote())  # type: ignore[attr-defined]
-            for worker in self.workers
-        ]
-        ip_counts: Dict[str, int] = {}
-        for ip in worker_ips:
-            ip_counts[ip] = ip_counts.get(ip, 0) + 1
-
-        def sort_by_driver_then_worker_ip(worker):
-            """
-            Sort the workers based on 3 properties:
-            1. If the worker is on the same node as the driver (vllm engine),
-                it should be placed first.
-            2. Then, if the worker is on a node with fewer workers, it should
-                be placed first.
-            3. Finally, if the work is on a node with smaller IP address, it
-                should be placed first.
-            """
-            ip = ray.get(worker.get_node_ip.remote())
-            return (ip != driver_ip, ip_counts[ip], ip)
-
-        # After sorting, the workers on the same node will be
-        # close to each other, and the workers on the driver
-        # node will be placed first.
-        self.workers = sorted(self.workers, key=sort_by_driver_then_worker_ip)
-
-        # Get the set of TPU IDs used on each node.
-        worker_node_and_gpu_ids = []
-        for worker in [self.driver_dummy_worker] + self.workers:
-            if worker is None:
-                # driver_dummy_worker can be None when using ray spmd worker.
-                continue
-            worker_node_and_gpu_ids.append(
-                ray.get(worker.get_node_and_gpu_ids.remote()) \
-            ) # type: ignore
-
-        node_workers = defaultdict(list)
-        for i, (node_id, _) in enumerate(worker_node_and_gpu_ids):
-            node_workers[node_id].append(i)
-
-        # Set environment variables for the driver and workers.
-        all_args_to_update_environment_variables = [({
-            "VLLM_TRACE_FUNCTION":
-            str(envs.VLLM_TRACE_FUNCTION),
-        }, ) for _ in worker_node_and_gpu_ids]
-        self._run_workers("update_environment_variables",
-                          all_args=all_args_to_update_environment_variables)
-
-        if len(node_workers) == 1:
-            # in single node case, we don't need to get the IP address.
-            # the loopback address is sufficient
-            # NOTE: a node may have several IP addresses, one for each
-            # network interface. `get_ip()` might return any of them,
-            # while they might not work for communication inside the node
-            # if the network setup is complicated. Using the loopback address
-            # solves this issue, as it always works for communication inside
-            # the node.
-            driver_ip = "127.0.0.1"
-        distributed_init_method = get_distributed_init_method(
-            driver_ip, get_open_port())
-
-        # Initialize the actual workers inside worker wrapper.
-        init_worker_all_kwargs = [
-            self._get_worker_kwargs(
-                local_rank=node_workers[node_id].index(rank),
-                rank=rank,
-                distributed_init_method=distributed_init_method,
-            ) for rank, (node_id, _) in enumerate(worker_node_and_gpu_ids)
-        ]
-        self._run_workers("init_worker", all_kwargs=init_worker_all_kwargs)
-
-        self._run_workers("init_device")
-        self._run_workers("load_model",
-                          max_concurrent_workers=self.parallel_config.
-                          max_parallel_loading_workers)
-
-    def _driver_execute_model(
-        self,
-        execute_model_req: Optional[ExecuteModelRequest] = None
-    ) -> List[SamplerOutput]:
-        """Run execute_model in the driver worker.
-
-        Passing None will cause the driver to stop the model execution
-        loop running in each of the remote workers.
-        """
-        return self.driver_worker.execute_method("execute_model",
-                                                 execute_model_req)
-
-    def _run_workers(
-        self,
-        method: str,
-        *args,
-        async_run_remote_workers_only: bool = False,
-        all_args: Optional[List[Tuple[Any, ...]]] = None,
-        all_kwargs: Optional[List[Dict[str, Any]]] = None,
-        max_concurrent_workers: Optional[int] = None,
-        use_ray_compiled_dag: bool = False,
-        **kwargs,
-    ) -> Any:
-        """Runs the given method on all workers. Can be used in the following
-        ways:
-
-        - async_run_remote_workers_only: If True the method will be run only
-          in the remote workers, not the driver worker. It will also be
-          run asynchronously and return a list of futures rather than blocking
-          on the results.
-        - args/kwargs: All workers share the same args/kwargs
-        - all_args/all_kwargs: args/kwargs for each worker are specified
-          individually
-        """
-
-        if max_concurrent_workers:
-            raise NotImplementedError(
-                "max_concurrent_workers is not supported yet.")
-
-        count = len(self.workers)
-        all_worker_args = repeat(args, count) if all_args is None \
-            else islice(all_args, 1, None)
-        all_worker_kwargs = repeat(kwargs, count) if all_kwargs is None \
-            else islice(all_kwargs, 1, None)
-
-        # Start the ray workers first.
-        ray_worker_outputs = [
-            worker.execute_method.remote(method, *worker_args, **worker_kwargs)
-            for (worker, worker_args, worker_kwargs
-                 ) in zip(self.workers, all_worker_args, all_worker_kwargs)
-        ]
-
-        if async_run_remote_workers_only:
-            # Just return futures
-            return ray_worker_outputs
-
-        driver_args = args if all_args is None else all_args[0]
-        driver_kwargs = kwargs if all_kwargs is None else all_kwargs[0]
-
-        # Start the driver worker after all the ray workers.
-        driver_worker_output = self.driver_worker.execute_method(
-            method, *driver_args, **driver_kwargs)
-        # Get the results of the ray workers.
-        if self.workers:
-            ray_worker_outputs = ray.get(ray_worker_outputs)
-
-        return [driver_worker_output] + ray_worker_outputs
-
-    def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None:
-        """Wait for futures returned from _run_workers() with
-        async_run_remote_workers_only to complete."""
-        ray.get(parallel_worker_tasks)
-
-    def determine_num_available_blocks(self) -> Tuple[int, int]:
-        num_blocks = self._run_workers("determine_num_available_blocks", )
-        num_tpu_blocks = min(b[0] for b in num_blocks)
-        num_cpu_blocks = min(b[1] for b in num_blocks)
-        return num_tpu_blocks, num_cpu_blocks
-
-    def initialize_cache(self, num_gpu_blocks: int,
-                         num_cpu_blocks: int) -> None:
-        logger.info("# TPU blocks: %d, # CPU blocks: %d", num_gpu_blocks,
-                    num_cpu_blocks)
-        self.cache_config.num_gpu_blocks = num_gpu_blocks
-        self.cache_config.num_cpu_blocks = num_cpu_blocks
-        self._run_workers("initialize_cache",
-                          num_gpu_blocks=num_gpu_blocks,
-                          num_cpu_blocks=num_cpu_blocks)
-
-    def execute_model(
-        self,
-        execute_model_req: ExecuteModelRequest,
-    ) -> List[SamplerOutput]:
-        if self.parallel_worker_tasks is None:
-            self.parallel_worker_tasks = self._run_workers(
-                "start_worker_execution_loop",
-                async_run_remote_workers_only=True,
-                **self.extra_execute_model_run_workers_kwargs)
-
-        # Only the driver worker returns the sampling results.
-        return self._driver_execute_model(execute_model_req)
-
-    def stop_remote_worker_execution_loop(self) -> None:
-        if self.parallel_worker_tasks is None:
-            return
-
-        self._driver_execute_model()
-        parallel_worker_tasks = self.parallel_worker_tasks
-        self.parallel_worker_tasks = None
-        # Ensure that workers exit model loop cleanly
-        # (this will raise otherwise)
-        self._wait_for_tasks_completion(parallel_worker_tasks)
-
-
-class RayTPUExecutorAsync(RayTPUExecutor, ExecutorAsyncBase):
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.driver_exec_method = make_async(self.driver_worker.execute_method)
-
-    async def execute_model_async(
-            self,
-            execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
-        if self.parallel_worker_tasks is None:
-            # Start model execution loop running in the parallel workers
-            self.parallel_worker_tasks = asyncio.create_task(
-                self._start_worker_execution_loop())
-
-        # Only the driver worker returns the sampling results.
-        return await self._driver_execute_model_async(execute_model_req)
-
-    async def stop_remote_worker_execution_loop_async(self) -> None:
-        if self.parallel_worker_tasks is None:
-            return
-
-        await self._driver_execute_model_async()
-        parallel_worker_tasks = self.parallel_worker_tasks
-        self.parallel_worker_tasks = None
-        # Ensure that workers exit model loop cleanly
-        # (this will raise otherwise)
-        await parallel_worker_tasks
-
-    async def _driver_execute_model_async(
-        self,
-        execute_model_req: Optional[ExecuteModelRequest] = None
-    ) -> List[SamplerOutput]:
-        return await self.driver_exec_method("execute_model",
-                                             execute_model_req)
-
-    async def _start_worker_execution_loop(self):
-        coros = [
-            worker.execute_method.remote("start_worker_execution_loop")
-            for worker in self.workers
-        ]
-        return await asyncio.gather(*coros)
diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py
index 9f40f6a65dcd7..e55155ea06225 100644
--- a/vllm/executor/ray_utils.py
+++ b/vllm/executor/ray_utils.py
@@ -1,7 +1,7 @@
 import os
 import time
 from collections import defaultdict
-from typing import Dict, List, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
 
 import msgspec
 
@@ -13,6 +13,10 @@
 from vllm.utils import get_ip
 from vllm.worker.worker_base import WorkerWrapperBase
 
+if TYPE_CHECKING:
+    from vllm.v1.core.scheduler import SchedulerOutput
+    from vllm.v1.outputs import ModelRunnerOutput
+
 logger = init_logger(__name__)
 PG_WAIT_TIMEOUT = 1800
 
@@ -95,6 +99,26 @@ def execute_model_spmd(
 
             return output
 
+        def setup_device_if_necessary(self):
+            # TODO(swang): This is needed right now because Ray CG executes
+            # on a background thread, so we need to reset torch's current
+            # device.
+            # We can remove this API after it is fixed in compiled graph.
+            import torch
+            assert self.worker is not None, "Worker is not initialized"
+            if not self.compiled_dag_cuda_device_set:
+                torch.cuda.set_device(self.worker.device)
+                self.compiled_dag_cuda_device_set = True
+
+        def execute_model(
+            self,
+            scheduler_output: "SchedulerOutput",
+        ) -> "ModelRunnerOutput":
+            self.setup_device_if_necessary()
+            assert self.worker is not None, "Worker is not initialized"
+            output = self.worker.model_runner.execute_model(scheduler_output)
+            return output
+
         def override_env_vars(self, vars: Dict[str, str]):
             os.environ.update(vars)
 
diff --git a/vllm/executor/ray_xpu_executor.py b/vllm/executor/ray_xpu_executor.py
deleted file mode 100644
index d2086f5fef26c..0000000000000
--- a/vllm/executor/ray_xpu_executor.py
+++ /dev/null
@@ -1,40 +0,0 @@
-import asyncio
-from typing import List, Optional
-
-import ray
-
-import vllm.envs as envs
-from vllm.executor.ray_gpu_executor import RayGPUExecutor, RayGPUExecutorAsync
-from vllm.executor.xpu_executor import XPUExecutor
-from vllm.logger import init_logger
-from vllm.utils import make_async
-
-logger = init_logger(__name__)
-
-
-class RayXPUExecutor(RayGPUExecutor, XPUExecutor):
-
-    def _get_env_vars_to_be_updated(self):
-        # Get the set of GPU IDs used on each node.
-        worker_node_and_gpu_ids = []
-        for worker in [self.driver_dummy_worker] + self.workers:
-            if worker is None:
-                # driver_dummy_worker can be None when using ray spmd worker.
-                continue
-            worker_node_and_gpu_ids.append(
-                ray.get(worker.get_node_and_gpu_ids.remote()))  # type: ignore
-
-        # Set environment variables for the driver and workers.
-        all_args_to_update_environment_variables = [({
-            "VLLM_TRACE_FUNCTION":
-            str(envs.VLLM_TRACE_FUNCTION),
-        }, ) for (_, _) in worker_node_and_gpu_ids]
-        return all_args_to_update_environment_variables
-
-
-class RayXPUExecutorAsync(RayXPUExecutor, RayGPUExecutorAsync):
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.driver_exec_method = make_async(self.driver_worker.execute_method)
-        self.pp_locks: Optional[List[asyncio.Lock]] = None
diff --git a/vllm/executor/tpu_executor.py b/vllm/executor/tpu_executor.py
deleted file mode 100644
index e37e8973790db..0000000000000
--- a/vllm/executor/tpu_executor.py
+++ /dev/null
@@ -1,142 +0,0 @@
-from typing import Any, Dict, List, Optional, Set, Tuple
-
-import torch
-
-from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
-from vllm.logger import init_logger
-from vllm.lora.request import LoRARequest
-from vllm.model_executor.layers.sampler import SamplerOutput
-from vllm.sequence import ExecuteModelRequest
-from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
-                        make_async)
-
-logger = init_logger(__name__)
-
-
-class TPUExecutor(ExecutorBase):
-
-    uses_ray: bool = False
-
-    def _init_executor(self) -> None:
-        assert not self.scheduler_config.chunked_prefill_enabled, (
-            "Chunked prefill is not yet supported for TPU backend")
-        assert not self.speculative_config, (
-            "Speculative decoding is not yet supported for TPU backend")
-        if self.model_config.dtype in (torch.float16, torch.float32):
-            logger.warning(
-                "The TPU backend currently does not support %s. "
-                "Using bfloat16 instead.", self.model_config.dtype)
-            self.model_config.dtype = torch.bfloat16
-
-        # Instantiate the worker and load the model to the device.
-        self.driver_worker = self._create_worker()
-        self.driver_worker.init_device()
-        self.driver_worker.load_model()
-
-    def _get_worker_kwargs(
-        self,
-        local_rank: int = 0,
-        rank: int = 0,
-        distributed_init_method: Optional[str] = None,
-    ) -> Dict[str, Any]:
-        """Return worker init args for a given rank."""
-        if distributed_init_method is None:
-            distributed_init_method = get_distributed_init_method(
-                get_ip(), get_open_port())
-        return dict(
-            vllm_config=self.vllm_config,
-            local_rank=local_rank,
-            rank=rank,
-            distributed_init_method=distributed_init_method,
-            is_driver_worker=rank == 0,
-        )
-
-    def _create_worker(
-        self,
-        local_rank: int = 0,
-        rank: int = 0,
-        distributed_init_method: Optional[str] = None,
-    ):
-        if self.scheduler_config.is_multi_step:
-            from vllm.worker.multi_step_tpu_worker import MultiStepTPUWorker
-            worker = MultiStepTPUWorker(**self._get_worker_kwargs(
-                local_rank, rank, distributed_init_method))
-            return worker
-        else:
-            from vllm.worker.tpu_worker import TPUWorker
-
-            worker = TPUWorker(**self._get_worker_kwargs(
-                local_rank, rank, distributed_init_method))
-            return worker
-
-    def initialize_cache(
-        self,
-        num_gpu_blocks: int,
-        num_cpu_blocks: int,
-    ) -> None:
-        """Initialize the KV cache by invoking the underlying worker."""
-        # NOTE: This is logged in the executor because there can be >1 worker
-        # with other executors. We could log in the engine level, but work
-        # remains to abstract away the device for non-GPU configurations.
-        logger.info("# TPU blocks: %d, # CPU blocks: %d", num_gpu_blocks,
-                    num_cpu_blocks)
-        self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks)
-
-    def determine_num_available_blocks(self) -> Tuple[int, int]:
-        """Determine the number of available KV blocks by invoking the
-        underlying worker."""
-        return self.driver_worker.determine_num_available_blocks()
-
-    def execute_model(
-        self,
-        execute_model_req: ExecuteModelRequest,
-    ) -> List[SamplerOutput]:
-        output = self.driver_worker.execute_model(execute_model_req)
-        return output
-
-    def add_lora(self, lora_request: LoRARequest) -> bool:
-        raise NotImplementedError(
-            "LoRA is currently not supported by the TPU backend.")
-
-    def remove_lora(self, lora_id: int) -> bool:
-        raise NotImplementedError(
-            "LoRA is currently not supported by the TPU backend.")
-
-    def pin_lora(self, lora_id: int) -> bool:
-        raise NotImplementedError(
-            "LoRA is currently not supported by the TPU backend.")
-
-    def list_loras(self) -> Set[int]:
-        raise NotImplementedError(
-            "LoRA is currently not supported by the TPU backend.")
-
-    def add_prompt_adapter(self, prompt_adapter_request) -> bool:
-        raise NotImplementedError(
-            "Soft prompt is currently not supported by the TPU backend.")
-
-    def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
-        raise NotImplementedError(
-            "Soft prompt is currently not supported by the TPU backend.")
-
-    def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool:
-        raise NotImplementedError(
-            "Soft prompt is currently not supported by the TPU backend.")
-
-    def list_prompt_adapters(self) -> Set[int]:
-        raise NotImplementedError(
-            "Soft prompt is currently not supported by the TPU backend.")
-
-    def check_health(self) -> None:
-        # TPUExecutor will always be healthy as long as it's running.
-        return
-
-
-class TPUExecutorAsync(TPUExecutor, ExecutorAsyncBase):
-
-    async def execute_model_async(
-        self,
-        sexecute_model_req: ExecuteModelRequest,
-    ) -> SamplerOutput:
-        output = await make_async(self.driver_worker.execute_model
-                                  )(sexecute_model_req)
-        return output
diff --git a/vllm/executor/uniproc_executor.py b/vllm/executor/uniproc_executor.py
new file mode 100644
index 0000000000000..da1d77343cf3b
--- /dev/null
+++ b/vllm/executor/uniproc_executor.py
@@ -0,0 +1,57 @@
+from typing import Any, Dict, List, Optional, Tuple
+
+from vllm.executor.executor_base import ExecutorBase
+from vllm.logger import init_logger
+from vllm.utils import get_distributed_init_method, get_ip, get_open_port
+from vllm.worker.worker_base import WorkerWrapperBase
+
+logger = init_logger(__name__)
+
+
+class UniProcExecutor(ExecutorBase):
+
+    uses_ray: bool = False
+
+    def _init_executor(self) -> None:
+        """Initialize the worker and load the model.
+        """
+        self.driver_worker = WorkerWrapperBase(vllm_config=self.vllm_config,
+                                               rank=0)
+        distributed_init_method = get_distributed_init_method(
+            get_ip(), get_open_port())
+        local_rank = 0
+        rank = 0
+        kwargs = dict(
+            vllm_config=self.vllm_config,
+            local_rank=local_rank,
+            rank=rank,
+            distributed_init_method=distributed_init_method,
+            is_driver_worker=(not self.parallel_config)
+            or (rank % self.parallel_config.tensor_parallel_size == 0),
+        )
+        self.collective_rpc("init_worker", args=([kwargs], ))
+        self.collective_rpc("init_device")
+        self.collective_rpc("load_model")
+
+    def collective_rpc(self,
+                       method: str,
+                       timeout: Optional[float] = None,
+                       args: Tuple = (),
+                       kwargs: Optional[Dict] = None) -> List[Any]:
+        if kwargs is None:
+            kwargs = {}
+        try:
+            func = getattr(self.driver_worker, method)
+        except AttributeError:
+            raise NotImplementedError(f"Method {method} is not implemented.") \
+                from None
+        answer = func(*args, **kwargs)
+        return [answer]
+
+    def check_health(self) -> None:
+        # UniProcExecutor will always be healthy as long as
+        # it's running.
+        return
+
+
+UniProcExecutorAsync = UniProcExecutor
diff --git a/vllm/executor/xpu_executor.py b/vllm/executor/xpu_executor.py
deleted file mode 100644
index 722b86a95ff8a..0000000000000
--- a/vllm/executor/xpu_executor.py
+++ /dev/null
@@ -1,39 +0,0 @@
-from typing import List, Optional, Union
-
-from vllm.executor.executor_base import ExecutorAsyncBase
-from vllm.executor.gpu_executor import GPUExecutor
-from vllm.logger import init_logger
-from vllm.model_executor.layers.sampler import SamplerOutput
-from vllm.sequence import ExecuteModelRequest, PoolerOutput
-from vllm.utils import make_async
-
-logger = init_logger(__name__)
-
-
-class XPUExecutor(GPUExecutor):
-
-    uses_ray: bool = False
-
-    def _init_executor(self) -> None:
-        assert self.device_config.device_type == "xpu"
-        assert self.speculative_config is None, (
-            "Speculative decoding not yet supported for XPU backend")
-
-        GPUExecutor._init_executor(self)
-
-    def execute_model(
-        self, execute_model_req: ExecuteModelRequest
-    ) -> Optional[List[Union[SamplerOutput, PoolerOutput]]]:
-        output = self.driver_worker.execute_model(execute_model_req)
-        return output
-
-
-class XPUExecutorAsync(XPUExecutor, ExecutorAsyncBase):
-
-    async def execute_model_async(
-        self,
-        execute_model_req: ExecuteModelRequest,
-    ) -> List[SamplerOutput]:
-        output = await make_async(self.driver_worker.execute_model
-                                  )(execute_model_req=execute_model_req)
-        return output
diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py
index 4d3b84fea887f..74948202cbe48 100644
--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
@@ -1,3 +1,4 @@
+import os
 from typing import TYPE_CHECKING, Optional
 
 import psutil
@@ -105,6 +106,32 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
             else:
                 parallel_config.worker_cls = "vllm.worker.cpu_worker.CPUWorker"
 
+        assert vllm_config.device_config.device_type == "cpu"
+
+        #
+        # Environment variables for CPU executor
+        #
+
+        # Disable torch async compiling which won't work with daemonic processes
+        os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1"
+
+        # Intel OpenMP setting
+        ld_prealod_str = os.getenv("LD_PRELOAD", "")
+        if "libiomp5.so" in ld_prealod_str:
+            # The time(milliseconds) that a thread should wait after
+            # completing the execution of a parallel region, before sleeping.
+            os.environ['KMP_BLOCKTIME'] = "1"
+            # Prevents the CPU to run into low performance state
+            os.environ['KMP_TPAUSE'] = "0"
+            # Provides fine granularity parallelism
+            os.environ['KMP_FORKJOIN_BARRIER_PATTERN'] = "dist,dist"
+            os.environ['KMP_PLAIN_BARRIER_PATTERN'] = "dist,dist"
+            os.environ['KMP_REDUCTION_BARRIER_PATTERN'] = "dist,dist"
+
+        # To hint IPEX uses shared memory based AllReduce
+        os.environ["LOCAL_WORLD_SIZE"] = str(
+            vllm_config.parallel_config.tensor_parallel_size)
+
     @classmethod
     def is_pin_memory_available(cls) -> bool:
         logger.warning("Pin memory is not supported on CPU.")
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index 2587e3a11dde3..8350177b68ade 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -139,6 +139,28 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
                 else:
                     parallel_config.worker_cls = "vllm.worker.worker.Worker"
 
+        world_size = parallel_config.world_size
+        tensor_parallel_size = parallel_config.tensor_parallel_size
+
+        from vllm.utils import (cuda_device_count_stateless,
+                                update_environment_variables)
+
+        # Set CUDA_VISIBLE_DEVICES for the driver, inherited by workers
+        if "CUDA_VISIBLE_DEVICES" not in os.environ:
+            update_environment_variables({
+                "CUDA_VISIBLE_DEVICES": (",".join(map(str, range(world_size))))
+            })
+
+        cuda_device_count = cuda_device_count_stateless()
+        # Use confusing message for more common TP-only case.
+        assert tensor_parallel_size <= cuda_device_count, (
+            f"please set tensor_parallel_size ({tensor_parallel_size}) "
+            f"to less than max local gpu count ({cuda_device_count})")
+
+        assert world_size <= cuda_device_count, (
+            f"please ensure that world_size ({world_size}) "
+            f"is less than than max local gpu count ({cuda_device_count})")
+
         cache_config = vllm_config.cache_config
         if cache_config and cache_config.block_size is None:
             cache_config.block_size = 16
diff --git a/vllm/platforms/neuron.py b/vllm/platforms/neuron.py
index 0696f73cc17b4..ead3dab05a6b1 100644
--- a/vllm/platforms/neuron.py
+++ b/vllm/platforms/neuron.py
@@ -35,6 +35,14 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
             parallel_config.worker_cls = \
                 "vllm.worker.neuron_worker.NeuronWorker"
 
+        if parallel_config.world_size > 1:
+            parallel_config.distributed_executor_backend = "uni"
+
+        assert (vllm_config.lora_config is
+                None), "LoRA is not supported for Neuron backend."
+        assert (not vllm_config.speculative_config
+                ), "Speculative decoding not yet supported for Neuron backend."
+
         cache_config = vllm_config.cache_config
         if cache_config:
             # neuron needs block_size = max_model_len
diff --git a/vllm/platforms/openvino.py b/vllm/platforms/openvino.py
index 9390eda535c8f..7d414165a8188 100644
--- a/vllm/platforms/openvino.py
+++ b/vllm/platforms/openvino.py
@@ -66,9 +66,8 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         from vllm.utils import GiB_bytes
 
         parallel_config = vllm_config.parallel_config
-        assert (
-            parallel_config.world_size == 1
-        ), "OpenVINOExecutor only supports single CPU socket currently."
+        assert (parallel_config.world_size == 1
+                ), "OpenVINO only supports single CPU socket currently."
 
         if parallel_config.worker_cls == "auto":
             parallel_config.worker_cls = \
@@ -141,3 +140,10 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
             raise RuntimeError(
                 "Invalid environment variable VLLM_OPENVINO_KVCACHE_SPACE"
                 f" {kv_cache_space}, expect a positive integer value.")
+
+        assert vllm_config.device_config.device_type == "openvino"
+        assert vllm_config.lora_config is None, \
+            "OpenVINO backend doesn't support LoRA"
+        assert cls.is_openvino_cpu() or \
+            cls.is_openvino_gpu(), \
+            "OpenVINO backend supports only CPU and GPU devices"
diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py
index ff9487daac7a7..05a3aa4305cfa 100644
--- a/vllm/platforms/tpu.py
+++ b/vllm/platforms/tpu.py
@@ -72,6 +72,16 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         assert vllm_config.speculative_config is None, \
             "TPU does not support speculative decoding"
 
+        assert not vllm_config.scheduler_config.chunked_prefill_enabled, (
+            "Chunked prefill is not yet supported for TPU backend")
+        assert not vllm_config.speculative_config, (
+            "Speculative decoding is not yet supported for TPU backend")
+        if vllm_config.model_config.dtype in (torch.float16, torch.float32):
+            logger.warning(
+                "The TPU backend currently does not support %s. "
+                "Using bfloat16 instead.", vllm_config.model_config.dtype)
+            vllm_config.model_config.dtype = torch.bfloat16
+
         parallel_config = vllm_config.parallel_config
         scheduler_config = vllm_config.scheduler_config
         if parallel_config.worker_cls == "auto":
diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py
index 031abdc05d517..c34b5b58672e7 100644
--- a/vllm/platforms/xpu.py
+++ b/vllm/platforms/xpu.py
@@ -78,17 +78,31 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
             raise NotImplementedError(
                 "XPU does not support speculative decoding")
 
+        if vllm_config.device_config is not None:
+            assert vllm_config.device_config.device_type == "xpu"
+
         # check and update parallel config
         parallel_config = vllm_config.parallel_config
-        if (parallel_config.distributed_executor_backend is not None
-                and parallel_config.distributed_executor_backend != "ray"):
+        if parallel_config.worker_cls == "auto":
+            parallel_config.worker_cls = "vllm.worker.xpu_worker.XPUWorker"
+
+        if parallel_config.distributed_executor_backend is None:
+            parallel_config.distributed_executor_backend = "ray"
+        elif parallel_config.distributed_executor_backend == "mp":
+            # FIXME(kunshang):
+            # spawn needs calling `if __name__ == '__main__':``
+            # fork is not supported for xpu start new process.
+            logger.error(
+                "Both start methods (spawn and fork) have issue "
+                "on XPU if you use mp backend, setting it to ray instead.")
+            parallel_config.distributed_executor_backend = "ray"
+
+        elif parallel_config.distributed_executor_backend != "ray":
             logger.warning(
                 "%s is not supported on XPU, fallback to ray distributed"
                 " executor backend.",
                 parallel_config.distributed_executor_backend)
             parallel_config.distributed_executor_backend = "ray"
-        if parallel_config.worker_cls == "auto":
-            parallel_config.worker_cls = "vllm.worker.xpu_worker.XPUWorker"
 
     @classmethod
     def is_pin_memory_available(cls):
diff --git a/vllm/spec_decode/medusa_worker.py b/vllm/spec_decode/medusa_worker.py
index 1ab691a7ef047..21a58fc426275 100644
--- a/vllm/spec_decode/medusa_worker.py
+++ b/vllm/spec_decode/medusa_worker.py
@@ -9,17 +9,15 @@
 from vllm.spec_decode.interfaces import SpeculativeProposals
 from vllm.spec_decode.proposer_worker_base import NonLLMProposerWorkerBase
 from vllm.spec_decode.top1_proposer import Top1Proposer
-from vllm.worker.worker_base import WorkerWrapperBase
+from vllm.worker.worker_base import DelegateWorkerBase
 
 
-class MedusaWorker(NonLLMProposerWorkerBase, WorkerWrapperBase):
+class MedusaWorker(NonLLMProposerWorkerBase, DelegateWorkerBase):
     """Worker for Medusa.
     """
 
     def __init__(self, *args, **kwargs):
-        super().__init__(kwargs.get("vllm_config"))
-        self.init_worker(*args, **kwargs)
-
+        DelegateWorkerBase.__init__(self, *args, **kwargs)
         # Lazy initialization list.
         self._proposer: Top1Proposer
 
diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py
index 676ac5eb3609d..32197f8cc8f2f 100644
--- a/vllm/spec_decode/multi_step_worker.py
+++ b/vllm/spec_decode/multi_step_worker.py
@@ -16,10 +16,10 @@
                                          SpeculativeProposer)
 from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase
 from vllm.spec_decode.top1_proposer import Top1Proposer
-from vllm.worker.worker_base import WorkerWrapperBase
+from vllm.worker.worker_base import DelegateWorkerBase
 
 
-class MultiStepWorker(ProposerWorkerBase, WorkerWrapperBase):
+class MultiStepWorker(ProposerWorkerBase, DelegateWorkerBase):
     """The MultiStepWorker is equivalent to a Worker except that it allows
     multiple forward passes in a single call, assuming the scheduler has
     allocated enough space to store the additional KV. This reduces overhead
@@ -32,15 +32,12 @@ class MultiStepWorker(ProposerWorkerBase, WorkerWrapperBase):
     """
 
     def __init__(self, *args, **kwargs):
-        super().__init__(kwargs.get("vllm_config"))
-        self.init_worker(*args, **kwargs)
-
+        DelegateWorkerBase.__init__(self, *args, **kwargs)
         # Lazy initialization list.
         self._proposer: SpeculativeProposer
 
     def init_device(self) -> None:
         self.worker.init_device()
-
         self._proposer = Top1Proposer(
             weakref.proxy(self),  # type: ignore[arg-type]
             self.device,
@@ -56,18 +53,6 @@ def set_should_modify_greedy_probs_inplace(self) -> None:
         self.model_runner.model.sampler.should_modify_greedy_probs_inplace = (
             True)
 
-    def determine_num_available_blocks(self) -> Tuple[int, int]:
-        return self.worker.determine_num_available_blocks()
-
-    def get_cache_block_size_bytes(self) -> int:
-        return self.worker.get_cache_block_size_bytes()
-
-    def initialize_cache(self, *args, **kwargs) -> None:
-        self.worker.initialize_cache(*args, **kwargs)
-
-    def execute_model(self, *args, **kwargs) -> List[SamplerOutput]:
-        return self.worker.execute_model(*args, **kwargs)
-
     @torch.inference_mode()
     def sampler_output(
         self,
diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py
index e369da1a70c23..540d118d65ecb 100644
--- a/vllm/spec_decode/spec_decode_worker.py
+++ b/vllm/spec_decode/spec_decode_worker.py
@@ -40,8 +40,8 @@
                                    get_all_num_logprobs,
                                    get_sampled_token_logprobs, nvtx_range,
                                    split_batch_by_proposal_len)
-from vllm.worker.worker_base import (LoraNotSupportedWorkerBase, WorkerBase,
-                                     WorkerWrapperBase)
+from vllm.utils import resolve_obj_by_qualname
+from vllm.worker.worker_base import LoraNotSupportedWorkerBase, WorkerBase
 
 logger = init_logger(__name__)
 
@@ -64,8 +64,9 @@ def create_spec_worker(*args, **kwargs) -> "SpecDecodeWorker":
     target_worker_config = copy.deepcopy(vllm_config)
     target_worker_config.parallel_config.worker_cls =\
         target_worker_config.parallel_config.sd_worker_cls
-    target_worker = WorkerWrapperBase(vllm_config=target_worker_config)
-    target_worker.init_worker(*args, **kwargs)
+    cls = resolve_obj_by_qualname(
+        target_worker_config.parallel_config.worker_cls)
+    target_worker = cls(*args, **kwargs)
     # Set the disable_logprobs variable in the TargetModelRunner instance
     # as per its value specified in the SpeculativeConfig.
     target_worker.model_runner.disable_logprobs =\
diff --git a/vllm/v1/executor/abstract.py b/vllm/v1/executor/abstract.py
index 5d74d4b01f500..7c17f60510ae1 100644
--- a/vllm/v1/executor/abstract.py
+++ b/vllm/v1/executor/abstract.py
@@ -14,8 +14,9 @@ def get_class(vllm_config: VllmConfig) -> Type["Executor"]:
         distributed_executor_backend = (
             vllm_config.parallel_config.distributed_executor_backend)
         if distributed_executor_backend == "ray":
-            from vllm.v1.executor.ray_executor import RayExecutor
-            executor_class = RayExecutor
+            from vllm.executor.ray_distributed_executor import (  # noqa
+                RayDistributedExecutor)
+            executor_class = RayDistributedExecutor
         elif distributed_executor_backend == "mp":
             from vllm.v1.executor.multiproc_executor import MultiprocExecutor
             executor_class = MultiprocExecutor
diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py
index 41e6abbd67956..cee0fcc0bad68 100644
--- a/vllm/v1/executor/multiproc_executor.py
+++ b/vllm/v1/executor/multiproc_executor.py
@@ -246,9 +246,18 @@ def __init__(
         ready_path: str,
     ):
         self.rank = rank
-        wrapper = WorkerWrapperBase(vllm_config=vllm_config)
-        wrapper.init_worker(vllm_config, local_rank, rank,
-                            distributed_init_method)
+        wrapper = WorkerWrapperBase(vllm_config=vllm_config, rank=rank)
+        # TODO: move `init_worker` to executor level as a collective rpc call
+        all_kwargs: List[Dict] = [
+            {} for _ in range(vllm_config.parallel_config.world_size)
+        ]
+        all_kwargs[rank] = {
+            "vllm_config": vllm_config,
+            "local_rank": local_rank,
+            "rank": rank,
+            "distributed_init_method": distributed_init_method,
+        }
+        wrapper.init_worker(all_kwargs)
         self.worker = wrapper.worker
 
         pid = os.getpid()
@@ -270,7 +279,7 @@ def __init__(
             ready_socket.send_string(WorkerProc.READY_STR)
             ready_socket.send(payload)
 
-        self.worker.initialize()
+        self.worker.init_device()
         self.worker.load_model()
 
     @staticmethod
diff --git a/vllm/v1/executor/uniproc_executor.py b/vllm/v1/executor/uniproc_executor.py
index be058318de58b..a9adc0114f76d 100644
--- a/vllm/v1/executor/uniproc_executor.py
+++ b/vllm/v1/executor/uniproc_executor.py
@@ -27,7 +27,7 @@ def __init__(self, vllm_config: VllmConfig) -> None:
         self.observability_config = vllm_config.observability_config
 
         self.worker: Worker = self._create_worker()
-        self.worker.initialize()
+        self.worker.init_device()
         self.worker.load_model()
 
     def _create_worker(
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index e83bce4283555..e6feaee972a35 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -33,6 +33,7 @@ def __init__(
         local_rank: int,
         rank: int,
         distributed_init_method: str,
+        is_driver_worker: bool = False,
     ):
 
         # TODO: use WorkerBase.__init__(self, vllm_config=vllm_config)
@@ -75,7 +76,7 @@ def __init__(
         else:
             self.profiler = None
 
-    def initialize(self):
+    def init_device(self):
         if self.device_config.device.type == "cuda":
             # torch.distributed.all_reduce does not free the input tensor until
             # the synchronization point. This causes the memory usage to grow
diff --git a/vllm/worker/hpu_worker.py b/vllm/worker/hpu_worker.py
index 8b2d8aaed2803..9401241073c7d 100644
--- a/vllm/worker/hpu_worker.py
+++ b/vllm/worker/hpu_worker.py
@@ -2,6 +2,7 @@
 # Copyright (C) 2024 Habana Labs, Ltd. an Intel Company
 ###############################################################################
 
+import contextlib
 import gc
 import os
 from typing import List, Optional, Set, Tuple, Type
@@ -18,6 +19,7 @@
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.model_executor import set_random_seed
+from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sequence import ExecuteModelRequest
 from vllm.utils import bind_kv_cache
@@ -124,6 +126,70 @@ def init_device(self) -> None:
     def load_model(self):
         self.model_runner.load_model()
 
+    def execute_model(
+        self,
+        execute_model_req: Optional[ExecuteModelRequest] = None,
+    ) -> Optional[List[SamplerOutput]]:
+        assert execute_model_req is not None
+        # VLLM_HPU_LOG_STEP_GRAPH_COMPILATION     - will log graph compilations per engine step, only when there was any - highly recommended to use alongside PT_HPU_METRICS_GC_DETAILS! # noqa:E501
+        # VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL - will log graph compilations per engine step, always, even if there were none # noqa:E501
+        # VLLM_HPU_LOG_STEP_CPU_FALLBACKS         - will log cpu fallbacks per engine step, only when there was any # noqa:E501
+        # VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL     - will log cpu fallbacks per engine step, always, even if there were none # noqa:E501
+        log_graph_compilation_all = os.environ.get(
+            'VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL', '0') != '0'
+        log_graph_compilation = os.environ.get(
+            'VLLM_HPU_LOG_STEP_GRAPH_COMPILATION',
+            '0') != '0' or log_graph_compilation_all
+        log_cpu_fallbacks_all = os.environ.get(
+            'VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL', '0') != '0'
+        log_cpu_fallbacks = os.environ.get('VLLM_HPU_LOG_STEP_CPU_FALLBACKS',
+                                           '0') != '0' or log_cpu_fallbacks_all
+        if log_graph_compilation or log_cpu_fallbacks:
+            from habana_frameworks.torch.hpu.metrics import metric_localcontext
+            seq_group_metadata_list = execute_model_req.seq_group_metadata_list
+            is_prompt = any([
+                seq_group_metadata.is_prompt
+                for seq_group_metadata in seq_group_metadata_list
+            ])
+            max_context_len = max([
+                max([
+                    len(v.prompt_token_ids) + len(v.output_token_ids)
+                    for v in seq_group_metadata.seq_data.values()
+                ]) for seq_group_metadata in seq_group_metadata_list
+            ])  # whoa, that's some spicy stuff right here
+            max_num_blocks = (
+                (max_context_len - 1) // self.cache_config.block_size) + 1
+            input_stats = (f'is_prompt: {is_prompt}, '
+                           f'num_seqs: {len(seq_group_metadata_list)}, '
+                           f'max_context_len: {max_context_len}, '
+                           f'max_num_blocks {max_num_blocks}')
+            gc_ctx = metric_localcontext(
+                "graph_compilation"
+            ) if log_graph_compilation else contextlib.nullcontext()
+            cpu_fallback_ctx = metric_localcontext(
+                "cpu_fallback"
+            ) if log_cpu_fallbacks else contextlib.nullcontext()
+            with gc_ctx as gc_local_metric, \
+                cpu_fallback_ctx as cpu_fallback_local_metric:
+                output = LocalOrDistributedWorkerBase.execute_model(
+                    self, execute_model_req)
+            if (log_graph_compilation and gc_local_metric.stats()[0][1] > 0
+                ) or log_graph_compilation_all:
+                msg = ("VLLM_HPU_STEP_GRAPH_COMPILATION: "
+                       f"{gc_local_metric.stats()}, {input_stats}")
+                logger.warning(msg)
+            if (log_cpu_fallbacks and cpu_fallback_local_metric.stats()[0][1] >
+                    0) or log_cpu_fallbacks_all:
+                msg = ("VLLM_HPU_STEP_CPU_FALLBACK: "
+                       f"{cpu_fallback_local_metric.stats()}, {input_stats}")
+                logger.warning(msg)
+
+            return output
+
+        output = LocalOrDistributedWorkerBase.execute_model(
+            self, execute_model_req)
+        return output
+
     @torch.inference_mode()
     def determine_num_available_blocks(self) -> Tuple[int, int]:
         """Profiles the peak memory usage of the model to determine how many
diff --git a/vllm/worker/neuron_worker.py b/vllm/worker/neuron_worker.py
index 3f6269684ac93..e02c72faace70 100644
--- a/vllm/worker/neuron_worker.py
+++ b/vllm/worker/neuron_worker.py
@@ -8,6 +8,7 @@
 from vllm.distributed import (ensure_model_parallel_initialized,
                               init_distributed_environment)
 from vllm.model_executor import set_random_seed
+from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.sequence import ExecuteModelRequest
 from vllm.worker.neuron_model_runner import NeuronModelRunner
 from vllm.worker.worker_base import (LocalOrDistributedWorkerBase,
@@ -25,6 +26,7 @@ def __init__(
         local_rank: int,
         rank: int,
         distributed_init_method: str,
+        is_driver_worker: bool = True,
     ) -> None:
         WorkerBase.__init__(self, vllm_config=vllm_config)
         self.local_rank = local_rank
@@ -37,7 +39,22 @@ def __init__(
 
         self.model_runner: NeuronModelRunner = NeuronModelRunner(
             vllm_config=vllm_config)
-        self.is_driver_worker = True
+        self.is_driver_worker = is_driver_worker
+
+    def execute_model(
+        self,
+        execute_model_req: Optional[ExecuteModelRequest] = None,
+    ) -> Optional[List[SamplerOutput]]:
+        assert execute_model_req is not None
+        assert (not execute_model_req.blocks_to_swap_in
+                and not execute_model_req.blocks_to_swap_out
+                and not execute_model_req.blocks_to_copy), (
+                    "Cache operations are not supported for Neuron backend.")
+        assert execute_model_req.num_lookahead_slots == 0, (
+            "lookahead not supported for Neuron backend.")
+        output = LocalOrDistributedWorkerBase.execute_model(
+            self, execute_model_req)
+        return output
 
     def init_device(self) -> None:
         self.init_distributed_environment()
@@ -103,13 +120,14 @@ def get_cache_block_size_bytes(self) -> int:
 
     def init_distributed_environment(self):
         """Neuron uses transformers-neuronx for tensor parallelism.
-
-        vLLM still needs the environment inited when TP/PP > 1
+        It has only one process to control multiple devices.
+        vLLM still needs the environment initialized when TP/PP > 1,
+        so we initialize a distributed environment with one process.
         """
         init_distributed_environment(
             world_size=1,
-            rank=self.rank,
-            local_rank=self.local_rank,
+            rank=0,
+            local_rank=0,
             distributed_init_method=self.distributed_init_method,
             backend="gloo",
         )
diff --git a/vllm/worker/openvino_worker.py b/vllm/worker/openvino_worker.py
index 3482073566215..50a155d22c666 100644
--- a/vllm/worker/openvino_worker.py
+++ b/vllm/worker/openvino_worker.py
@@ -211,16 +211,14 @@ class OpenVINOWorker(LoraNotSupportedWorkerBase):
 
     def __init__(
         self,
-        ov_core: ov.Core,
         vllm_config: VllmConfig,
         local_rank: int,
         rank: int,
         distributed_init_method: str,
-        kv_cache_dtype: Optional[ov.Type] = ov.Type.undefined,
         is_driver_worker: bool = False,
     ) -> None:
-        self.ov_core = ov_core
         WorkerBase.__init__(self, vllm_config)
+        self.ov_core = ov.Core()
         self.parallel_config.rank = rank
         self.local_rank = local_rank
         self.rank = rank
@@ -237,7 +235,7 @@ def __init__(
         self.model_runner = OpenVINOModelRunner(
             self.ov_core,
             vllm_config=self.vllm_config,
-            kv_cache_dtype=kv_cache_dtype,
+            kv_cache_dtype=self.vllm_config.cache_config.cache_dtype,
             is_driver_worker=is_driver_worker,
         )
         # Uninitialized cache engine. Will be initialized by
diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py
index a835718e1db19..7c14b8344b49e 100644
--- a/vllm/worker/worker_base.py
+++ b/vllm/worker/worker_base.py
@@ -88,7 +88,6 @@ def start_worker_execution_loop(self) -> None:
                 if output is None:
                     return None
 
-    @abstractmethod
     def execute_model(
         self,
         execute_model_req: Optional[ExecuteModelRequest] = None
@@ -119,6 +118,58 @@ def list_loras(self) -> Set[int]:
         raise NotImplementedError
 
 
+class DelegateWorkerBase(WorkerBase):
+    """
+    A class that delegates all methods to another WorkerBase instance. This is
+    useful for creating a WorkerBase that wraps another WorkerBase instance,
+    e.g. speculative decoding.
+    """
+    worker: WorkerBase
+
+    def __init__(
+        self,
+        *args,
+        **kwargs,
+    ) -> None:
+        vllm_config: VllmConfig = kwargs.get("vllm_config")
+        cls = resolve_obj_by_qualname(vllm_config.parallel_config.worker_cls)
+        self.worker = cls(*args, **kwargs)
+
+    def init_device(self) -> None:
+        self.worker.init_device()
+
+    def determine_num_available_blocks(self) -> Tuple[int, int]:
+        return self.worker.determine_num_available_blocks()
+
+    def initialize_cache(self, num_gpu_blocks: int,
+                         num_cpu_blocks: int) -> None:
+        self.worker.initialize_cache(num_gpu_blocks, num_cpu_blocks)
+
+    def execute_model(
+        self,
+        execute_model_req: Optional[ExecuteModelRequest] = None
+    ) -> Optional[List[SamplerOutput]]:
+        return self.worker.execute_model(execute_model_req)
+
+    def get_cache_block_size_bytes(self) -> int:
+        return self.worker.get_cache_block_size_bytes()
+
+    def add_lora(self, lora_request: LoRARequest) -> bool:
+        return self.worker.add_lora(lora_request)
+
+    def remove_lora(self, lora_id: int) -> bool:
+        return self.worker.remove_lora(lora_id)
+
+    def pin_lora(self, lora_id: int) -> bool:
+        return self.worker.pin_lora(lora_id)
+
+    def list_loras(self) -> Set[int]:
+        return self.worker.list_loras()
+
+    def __getattr__(self, attr):
+        return getattr(self.worker, attr)
+
+
 class LoraNotSupportedWorkerBase(WorkerBase):
     """Partial implementation of WorkerBase that raises exceptions when LoRA
     methods are invoked.
@@ -419,17 +470,31 @@ class WorkerWrapperBase:
     def __init__(
         self,
         vllm_config: VllmConfig,
+        rank: int = 0,
     ) -> None:
+        self.rank = rank
         self.vllm_config = vllm_config
-        trust_remote_code = vllm_config.model_config.trust_remote_code
         self.worker: Optional[WorkerBase] = None
-        if trust_remote_code:
-            # note: lazy import to avoid importing torch before initializing
-            from vllm.utils import init_cached_hf_modules
-            init_cached_hf_modules()
+        if vllm_config.model_config is not None:
+            # it can be None in tests
+            trust_remote_code = vllm_config.model_config.trust_remote_code
+            if trust_remote_code:
+                # note: lazy import to avoid importing torch before initializing
+                from vllm.utils import init_cached_hf_modules
+                init_cached_hf_modules()
+
+    def adjust_rank(self, rank_mapping: Dict[int, int]) -> None:
+        """
+        Adjust the rank based on the given mapping.
+        It is only used during the initialization of the executor,
+        to adjust the rank of workers after we create all workers.
+        """
+        if self.rank in rank_mapping:
+            self.rank = rank_mapping[self.rank]
 
-    @staticmethod
-    def update_environment_variables(envs: Dict[str, str]) -> None:
+    def update_environment_variables(self, envs_list: List[Dict[str,
+                                                                str]]) -> None:
+        envs = envs_list[self.rank]
         key = 'CUDA_VISIBLE_DEVICES'
         if key in envs and key in os.environ:
             # overwriting CUDA_VISIBLE_DEVICES is desired behavior
@@ -437,11 +502,12 @@ def update_environment_variables(envs: Dict[str, str]) -> None:
             del os.environ[key]
         update_environment_variables(envs)
 
-    def init_worker(self, *args, **kwargs):
+    def init_worker(self, all_kwargs: List[Dict[str, Any]]) -> None:
         """
         Here we inject some common logic before initializing the worker.
         Arguments are passed to the worker class constructor.
         """
+        kwargs = all_kwargs[self.rank]
         enable_trace_function_call_for_thread(self.vllm_config)
 
         # see https://github.com/NVIDIA/nccl/issues/1234
@@ -452,7 +518,7 @@ def init_worker(self, *args, **kwargs):
 
         worker_class = resolve_obj_by_qualname(
             self.vllm_config.parallel_config.worker_cls)
-        self.worker = worker_class(*args, **kwargs)
+        self.worker = worker_class(**kwargs)
         assert self.worker is not None
 
     def execute_method(self, method: str, *args, **kwargs):

From 3f9b7ab9f59f83ab0551a6a2f1894e30bc0cb41c Mon Sep 17 00:00:00 2001
From: Kyle Sayers <kylesayrs@gmail.com>
Date: Wed, 15 Jan 2025 01:36:01 -0500
Subject: [PATCH 223/462] [Doc] Update examples to remove
 SparseAutoModelForCausalLM (#12062)

Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
---
 docs/source/features/quantization/fp8.md  | 11 +++++------
 docs/source/features/quantization/int8.md |  7 +++----
 2 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/docs/source/features/quantization/fp8.md b/docs/source/features/quantization/fp8.md
index da49cd2747228..1398e8a324201 100644
--- a/docs/source/features/quantization/fp8.md
+++ b/docs/source/features/quantization/fp8.md
@@ -54,16 +54,15 @@ The quantization process involves three main steps:
 
 ### 1. Loading the Model
 
-Use `SparseAutoModelForCausalLM`, which wraps `AutoModelForCausalLM`, for saving and loading quantized models:
+Load your model and tokenizer using the standard `transformers` AutoModel classes:
 
 ```python
-from llmcompressor.transformers import SparseAutoModelForCausalLM
-from transformers import AutoTokenizer
+from transformers import AutoTokenizer, AutoModelForCausalLM
 
 MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
-
-model = SparseAutoModelForCausalLM.from_pretrained(
-  MODEL_ID, device_map="auto", torch_dtype="auto")
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL_ID, device_map="auto", torch_dtype="auto",
+)
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 ```
 
diff --git a/docs/source/features/quantization/int8.md b/docs/source/features/quantization/int8.md
index 82a15d76d352f..592a60d3988b2 100644
--- a/docs/source/features/quantization/int8.md
+++ b/docs/source/features/quantization/int8.md
@@ -30,14 +30,13 @@ The quantization process involves four main steps:
 
 ### 1. Loading the Model
 
-Use `SparseAutoModelForCausalLM`, which wraps `AutoModelForCausalLM`, for saving and loading quantized models:
+Load your model and tokenizer using the standard `transformers` AutoModel classes:
 
 ```python
-from llmcompressor.transformers import SparseAutoModelForCausalLM
-from transformers import AutoTokenizer
+from transformers import AutoTokenizer, AutoModelForCausalLM
 
 MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
-model = SparseAutoModelForCausalLM.from_pretrained(
+model = AutoModelForCausalLM.from_pretrained(
     MODEL_ID, device_map="auto", torch_dtype="auto",
 )
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

From 994fc655b71f59f61b82cc44e868091dae493a84 Mon Sep 17 00:00:00 2001
From: Chen Zhang <zhangch99@outlook.com>
Date: Wed, 15 Jan 2025 15:55:30 +0800
Subject: [PATCH 224/462] [V1][Prefix Cache] Move the logic of
 num_computed_tokens into KVCacheManager (#12003)

---
 tests/v1/core/test_prefix_caching.py | 71 ++++++++++++++++++----------
 vllm/v1/core/kv_cache_manager.py     | 17 +++++--
 vllm/v1/core/scheduler.py            |  8 +---
 3 files changed, 61 insertions(+), 35 deletions(-)

diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py
index b97f55b8c6535..fafd9d0ce4455 100644
--- a/tests/v1/core/test_prefix_caching.py
+++ b/tests/v1/core/test_prefix_caching.py
@@ -49,9 +49,10 @@ def test_prefill():
     unique_token_ids = [3] * 7
     all_token_ids = common_token_ids + unique_token_ids
     req0 = make_request("0", all_token_ids)
-    computed_blocks = manager.get_computed_blocks(req0)
+    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0)
     assert len(req0.kv_block_hashes) == 3
     assert not computed_blocks
+    assert num_computed_tokens == 0
     blocks = manager.allocate_slots(req0, 55, computed_blocks)
     assert [b.block_id for b in blocks] == [0, 1, 2, 3, 4]
 
@@ -73,9 +74,10 @@ def test_prefill():
     # Incomplete 1 block (5 tokens)
     unique_token_ids = [3] * 5
     req1 = make_request("1", common_token_ids + unique_token_ids)
-    computed_blocks = manager.get_computed_blocks(req1)
+    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
     assert len(req1.kv_block_hashes) == 3
     assert [b.block_id for b in computed_blocks] == [0, 1, 2]
+    assert num_computed_tokens == 3 * 16
     num_new_tokens = 53 - 3 * 16
     blocks = manager.allocate_slots(req1, num_new_tokens, computed_blocks)
     assert [b.block_id for b in blocks] == [5, 6]
@@ -91,7 +93,7 @@ def test_prefill():
     # All blocks should be available.
     assert manager.free_block_queue.num_free_blocks == 10
     # The order should be
-    # [unallocated (7, 8)]
+    # [unallocated (7, 8, 9)]
     # [unique_req0 (4, 3)]
     # [unique_req1 (6, 5)]
     # [common (2, 1, 0)]
@@ -103,9 +105,10 @@ def test_prefill():
     # Incomplete 1 block (6 tokens)
     unique_token_ids = [3] * 6
     req2 = make_request("2", common_token_ids + unique_token_ids)
-    computed_blocks = manager.get_computed_blocks(req2)
+    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2)
     assert len(req2.kv_block_hashes) == 3
     assert [b.block_id for b in computed_blocks] == [0, 1, 2]
+    assert num_computed_tokens == 3 * 16
     num_new_tokens = 53 - 3 * 16
     blocks = manager.allocate_slots(req2, num_new_tokens, computed_blocks)
     assert [b.block_id for b in blocks] == [7, 8]
@@ -123,8 +126,9 @@ def test_prefill():
 
     # Cache miss and eviction.
     req3 = make_request("3", [99] * (16 * 9))
-    computed_blocks = manager.get_computed_blocks(req3)
+    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req3)
     assert not computed_blocks
+    assert num_computed_tokens == 0
     blocks = manager.allocate_slots(req3, 16 * 9, computed_blocks)
     # This block ID order also checks the eviction order.
     assert [b.block_id for b in blocks] == [9, 4, 3, 6, 5, 8, 7, 2, 1, 0]
@@ -150,8 +154,9 @@ def test_decode():
     # Incomplete 1 block (7 tokens)
     unique_token_ids = [3] * 7
     req0 = make_request("0", common_token_ids + unique_token_ids)
-    computed_blocks = manager.get_computed_blocks(req0)
+    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0)
     assert not computed_blocks
+    assert num_computed_tokens == 0
     blocks = manager.allocate_slots(req0, 55, computed_blocks)
     assert [b.block_id for b in blocks] == [0, 1, 2, 3, 4]
 
@@ -197,16 +202,18 @@ def test_evict():
 
     last_token_id = 5 * 16 + 7
     req0 = make_request("0", list(range(last_token_id)))
-    computed_blocks = manager.get_computed_blocks(req0)
+    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0)
     assert not computed_blocks
+    assert num_computed_tokens == 0
     blocks = manager.allocate_slots(req0, 5 * 16 + 7, computed_blocks)
     assert len(blocks) == 7  # 5 full + 1 partial + 1 preallocated
 
     # 3 blocks.
     req1 = make_request("1", list(range(last_token_id,
                                         last_token_id + 3 * 16)))
-    computed_blocks = manager.get_computed_blocks(req1)
+    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
     assert not computed_blocks
+    assert num_computed_tokens == 0
     blocks = manager.allocate_slots(req1, 3 * 16, computed_blocks)
     assert len(blocks) == 3  # 3 full blocks
     last_token_id += 3 * 16
@@ -222,8 +229,9 @@ def test_evict():
 
     # Touch the first 2 blocks.
     req2 = make_request("2", list(range(2 * 16 + 3)))
-    computed_blocks = manager.get_computed_blocks(req2)
+    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2)
     assert [b.block_id for b in computed_blocks] == [0, 1]
+    assert num_computed_tokens == 2 * 16
     blocks = manager.allocate_slots(req2, 3, computed_blocks)
     assert [b.block_id for b in blocks] == [6, 5]
     assert manager.free_block_queue.num_free_blocks == 6
@@ -247,8 +255,9 @@ def test_hash_block_correct_reuse():
     # Allocate 1 block and cache it.
     num_tokens = block_size * 1
     req = make_request("0", list(range(num_tokens)))
-    computed_blocks = manager.get_computed_blocks(req)
+    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req)
     assert not computed_blocks
+    assert num_computed_tokens == 0
     blocks = manager.allocate_slots(req, num_tokens, computed_blocks)
     assert len(blocks) == 1
 
@@ -258,8 +267,9 @@ def test_hash_block_correct_reuse():
     # Allocate a new block that's not full, make sure hash info on the
     # block is cleared.
     req = make_request("1", list(range(num_tokens - 1)))
-    computed_blocks = manager.get_computed_blocks(req)
+    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req)
     assert not computed_blocks
+    assert num_computed_tokens == 0
     blocks = manager.allocate_slots(req, num_tokens - 1, computed_blocks)
     assert len(blocks) == 1
 
@@ -284,16 +294,18 @@ def test_computed_blocks_not_evicted():
     # Allocate a block and cache it.
     num_tokens = block_size * 1
     req0 = make_request("0", list(range(num_tokens)))
-    computed_blocks = manager.get_computed_blocks(req0)
+    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0)
     assert not computed_blocks
+    assert num_computed_tokens == 0
     blocks = manager.allocate_slots(req0, num_tokens, computed_blocks)
     assert len(blocks) == 1
     assert blocks[0].block_id == 0
 
     # Allocate another block.
     req1 = make_request("1", list(range(num_tokens, num_tokens * 2)))
-    computed_blocks = manager.get_computed_blocks(req1)
+    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
     assert not computed_blocks
+    assert num_computed_tokens == 0
     blocks = manager.allocate_slots(req1, num_tokens, computed_blocks)
     assert len(blocks) == 1
     assert blocks[0].block_id == 1
@@ -305,9 +317,10 @@ def test_computed_blocks_not_evicted():
     # Now if we have a cache hit on the first block, we should evict the second
     # cached block rather than the first one.
     req2 = make_request("2", list(range(num_tokens * 2)))
-    computed_blocks = manager.get_computed_blocks(req2)
+    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2)
     assert len(computed_blocks) == 1
     assert computed_blocks[0].block_id == 0
+    assert num_computed_tokens == block_size
 
     blocks = manager.allocate_slots(req2, num_tokens * 2 - num_tokens,
                                     computed_blocks)
@@ -331,8 +344,9 @@ def test_basic_prefix_caching_disabled():
 
     req1 = make_request("1", list(range(10)))  # 2 blocks and some more
 
-    computed_blocks = manager.get_computed_blocks(req1)
+    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
     assert not computed_blocks
+    assert num_computed_tokens == 0
     blocks = manager.allocate_slots(req1, 10, computed_blocks)
     assert len(blocks) == 3
 
@@ -341,15 +355,17 @@ def test_basic_prefix_caching_disabled():
 
     # No caching.
     req2 = make_request("2", list(range(16)))  # shared prefix
-    computed_blocks = manager.get_computed_blocks(req2)
+    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2)
     assert not computed_blocks
+    assert num_computed_tokens == 0
     blocks = manager.allocate_slots(req2, 16, computed_blocks)
     assert len(blocks) == 4
 
     # New requests should not have any blocks.
     req3 = make_request("3", list(range(4)))
-    computed_blocks = manager.get_computed_blocks(req3)
+    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req3)
     assert not computed_blocks
+    assert num_computed_tokens == 0
     blocks = manager.allocate_slots(req3, 4, computed_blocks)
     assert not blocks
 
@@ -371,8 +387,9 @@ def test_preallocate_blocks(num_preallocate_tokens: int, block_size: int):
     num_preallocated_blocks = cdiv(num_preallocate_tokens, block_size)
 
     req = make_request("0", list(range(block_size * 30)))
-    computed_blocks = manager.get_computed_blocks(req)
+    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req)
     assert not computed_blocks
+    assert num_computed_tokens == 0
     # Just ask for 1 block.
     blocks = manager.allocate_slots(req, block_size, computed_blocks)
     req.num_computed_tokens = block_size
@@ -469,10 +486,11 @@ def test_mm_prefix_caching():
                         all_token_ids,
                         mm_positions=mm_positions,
                         mm_hashes=mm_hashes)
-    computed_blocks = manager.get_computed_blocks(req0)
+    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0)
 
     # Completed block should have hashes with extra keys.
     assert not computed_blocks
+    assert num_computed_tokens == 0
     assert len(req0.kv_block_hashes) == 3
     assert req0.kv_block_hashes[0].extra_keys == ("aaa", )
     assert req0.kv_block_hashes[1].extra_keys == ("aaa", "bbb")
@@ -503,8 +521,9 @@ def test_mm_prefix_caching():
                         all_token_ids,
                         mm_positions=mm_positions,
                         mm_hashes=mm_hashes)
-    computed_blocks = manager.get_computed_blocks(req1)
+    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
     assert len(computed_blocks) == 3
+    assert num_computed_tokens == 3 * 16
 
 
 def test_prefill_not_enough_free_blocks_with_computed_blocks():
@@ -527,15 +546,17 @@ def test_prefill_not_enough_free_blocks_with_computed_blocks():
     # | Common-0 | Common-1 | Common-2 | ... |
     common_token_ids = [i for i in range(3) for _ in range(16)]
     req0 = make_request("0", common_token_ids)
-    computed_blocks = manager.get_computed_blocks(req0)
+    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0)
     assert not computed_blocks
+    assert num_computed_tokens == 0
     manager.allocate_slots(req0, 48, computed_blocks)
     block_part0 = manager.req_to_blocks[req0.request_id]
 
     # | Common-0 | Common-1 | Common-2 | Req1-3 | Req1-4 | Req1-5 | ... |
     req1 = make_request("1", common_token_ids * 2)
-    computed_blocks = manager.get_computed_blocks(req1)
+    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
     assert computed_blocks == block_part0
+    assert num_computed_tokens == 3 * 16
     manager.allocate_slots(req1, 48, computed_blocks)
     block_part1 = manager.req_to_blocks[req1.request_id]
     # | Common-0 | Common-1 | Common-2 | Req1-3 (F) | Req1-4 (F) |
@@ -547,8 +568,9 @@ def test_prefill_not_enough_free_blocks_with_computed_blocks():
     # | Common-0 | Common-1 | Common-2 | Req1-3 (F) | Req1-4 (F) |
     # | Req1-5(F)| Req2-0   | Req2-1   | ... |
     req2 = make_request("2", [7] * block_size * 2)
-    computed_blocks = manager.get_computed_blocks(req2)
+    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2)
     assert not computed_blocks
+    assert num_computed_tokens == 0
     manager.allocate_slots(req2, block_size * 2, computed_blocks)
 
     # Req3 is Req2 + 3 new blocks, so the first 6 blocks are computed,
@@ -556,8 +578,9 @@ def test_prefill_not_enough_free_blocks_with_computed_blocks():
     # In this case, the ref_cnt of the computed blocks should not be changed.
     assert manager.free_block_queue.num_free_blocks == 5
     req3 = make_request("3", common_token_ids * 3)
-    computed_blocks = manager.get_computed_blocks(req3)
+    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req3)
     assert computed_blocks == block_part1
+    assert num_computed_tokens == 6 * 16
     # Req3 cannot be allocated.
     assert manager.allocate_slots(req3, 48, computed_blocks) is None
     # Block 0-2 are used by Req 1.
diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py
index 1cbff1e2d767e..bac77443c8560 100644
--- a/vllm/v1/core/kv_cache_manager.py
+++ b/vllm/v1/core/kv_cache_manager.py
@@ -1,5 +1,5 @@
 from collections import defaultdict
-from typing import Dict, Iterable, List, Optional
+from typing import Dict, Iterable, List, Optional, Tuple
 
 from vllm.logger import init_logger
 from vllm.utils import cdiv
@@ -69,7 +69,8 @@ def __init__(
         # is finished.
         self.req_to_blocks: Dict[str, List[KVCacheBlock]] = {}
 
-    def get_computed_blocks(self, request: Request) -> List[KVCacheBlock]:
+    def get_computed_blocks(
+            self, request: Request) -> Tuple[List[KVCacheBlock], int]:
         """Get the computed (cached) blocks for the request.
         Note that the computed blocks must be full.
 
@@ -77,11 +78,13 @@ def get_computed_blocks(self, request: Request) -> List[KVCacheBlock]:
             request: The request to get the computed blocks.
 
         Returns:
-            A list of blocks that are computed for the request.
+            A tuple containing:
+                - A list of blocks that are computed for the request.
+                - The number of computed tokens.
         """
         if not self.enable_caching:
             # Prefix caching is disabled.
-            return []
+            return [], 0
 
         computed_blocks = []
 
@@ -101,7 +104,11 @@ def get_computed_blocks(self, request: Request) -> List[KVCacheBlock]:
             else:
                 break
 
-        return computed_blocks
+        # NOTE(woosuk): Since incomplete blocks are not eligible for
+        # sharing, `num_computed_tokens` is always a multiple of
+        # `block_size`.
+        num_computed_tokens = len(computed_blocks) * self.block_size
+        return computed_blocks, num_computed_tokens
 
     def append_slots(
         self,
diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py
index 2503d136aea7e..45e67c94f8f15 100644
--- a/vllm/v1/core/scheduler.py
+++ b/vllm/v1/core/scheduler.py
@@ -184,12 +184,8 @@ def schedule(self) -> "SchedulerOutput":
 
                 request = self.waiting[0]
                 # Get already-cached tokens.
-                computed_blocks = self.kv_cache_manager.get_computed_blocks(
-                    request)
-                # NOTE(woosuk): Since incomplete blocks are not eligible for
-                # sharing, `num_computed_tokens` is always a multiple of
-                # `block_size`.
-                num_computed_tokens = len(computed_blocks) * self.block_size
+                computed_blocks, num_computed_tokens = \
+                    self.kv_cache_manager.get_computed_blocks(request)
                 # Number of tokens to be scheduled.
                 # We use `request.num_tokens` instead of
                 # `request.num_prompt_tokens` to consider the resumed requests,

From cbe94391eb04aa9ae1be15711fec4eb453c1e053 Mon Sep 17 00:00:00 2001
From: Rahul Tuli <rahul@neuralmagic.com>
Date: Wed, 15 Jan 2025 04:41:24 -0500
Subject: [PATCH 225/462] Fix: cases with empty sparsity config (#12057)

Signed-off-by: Rahul Tuli <rahul@neuralmagic.com>
---
 .../quantization/compressed_tensors/compressed_tensors.py       | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index 0c1fc18228f5c..f0e4eda76734b 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -113,7 +113,7 @@ def _sparsity_scheme_map_from_config(
         :return: A dictionary mapping target layer names to their corresponding
             sparsity compression configurations
         """
-        if (sparsity_config := config.get(SPARSITY_CONFIG_NAME)) is None:
+        if not (sparsity_config := config.get(SPARSITY_CONFIG_NAME)):
             return dict()
 
         sparsity_config = SparsityCompressionConfig.model_validate(

From ad388d25a8e668545ef91c3634b67a241155e2ea Mon Sep 17 00:00:00 2001
From: Keyun Tong <tongkeyun@gmail.com>
Date: Wed, 15 Jan 2025 01:44:56 -0800
Subject: [PATCH 226/462] Type-fix: make execute_model output type optional
 (#12020)

---
 vllm/v1/executor/uniproc_executor.py | 1 +
 vllm/v1/worker/gpu_worker.py         | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/v1/executor/uniproc_executor.py b/vllm/v1/executor/uniproc_executor.py
index a9adc0114f76d..c63d7a4c47c15 100644
--- a/vllm/v1/executor/uniproc_executor.py
+++ b/vllm/v1/executor/uniproc_executor.py
@@ -70,6 +70,7 @@ def execute_model(
         scheduler_output,
     ) -> ModelRunnerOutput:
         output = self.worker.execute_model(scheduler_output)
+        assert output is not None
         return output
 
     def profile(self, is_start: bool = True):
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index e6feaee972a35..81b247e07ef4a 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -200,7 +200,7 @@ def compile_or_warm_up_model(self) -> None:
     def execute_model(
         self,
         scheduler_output: "SchedulerOutput",
-    ) -> ModelRunnerOutput:
+    ) -> Optional[ModelRunnerOutput]:
         output = self.model_runner.execute_model(scheduler_output)
         return output if self.rank == 0 else None
 

From 3adf0ffda8de31ff32f294324e53b6cfbf16f187 Mon Sep 17 00:00:00 2001
From: wangxiyuan <wangxiyuan1007@gmail.com>
Date: Wed, 15 Jan 2025 18:14:15 +0800
Subject: [PATCH 227/462] [Platform] Do not raise error if _Backend is not
 found (#12023)

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Signed-off-by: Mengqing Cao <cmq0113@163.com>
Co-authored-by: Mengqing Cao <cmq0113@163.com>
---
 tests/kernels/test_attention_selector.py      | 11 +++++++---
 .../dummy_attention_backend.py                |  8 ++++++++
 .../vllm_add_dummy_platform/dummy_platform.py |  4 ++++
 tests/plugins_tests/test_platform_plugins.py  | 14 +++++++++++++
 vllm/attention/layer.py                       |  8 ++++----
 vllm/attention/selector.py                    | 20 ++++++++++---------
 6 files changed, 49 insertions(+), 16 deletions(-)
 create mode 100644 tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_attention_backend.py

diff --git a/tests/kernels/test_attention_selector.py b/tests/kernels/test_attention_selector.py
index a08c874407e3f..492acb91e8ed9 100644
--- a/tests/kernels/test_attention_selector.py
+++ b/tests/kernels/test_attention_selector.py
@@ -94,7 +94,12 @@ def test_flash_attn(monkeypatch):
 
 
 def test_invalid_env(monkeypatch):
-    """Throw an exception if the backend name is invalid."""
+    """Ignore the invalid env variable if it is set."""
     override_backend_env_variable(monkeypatch, STR_INVALID_VAL)
-    with pytest.raises(ValueError):
-        get_attn_backend(16, torch.float16, None, 16, False)
+    with patch("vllm.attention.selector.current_platform", CudaPlatform()):
+        backend = get_attn_backend(32, torch.float16, None, 16, False)
+        assert backend.get_name() == "FLASH_ATTN"
+
+        # when block size == 16, backend will fall back to XFORMERS
+        backend = get_attn_backend(16, torch.float16, None, 16, False)
+        assert backend.get_name() == "XFORMERS"
diff --git a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_attention_backend.py b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_attention_backend.py
new file mode 100644
index 0000000000000..5634be3c8d882
--- /dev/null
+++ b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_attention_backend.py
@@ -0,0 +1,8 @@
+from vllm.attention.backends.flash_attn import FlashAttentionBackend
+
+
+class DummyAttentionBackend(FlashAttentionBackend):
+
+    @staticmethod
+    def get_name() -> str:
+        return "Dummy_Backend"
diff --git a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py
index fde93142f1103..84721d5971ccf 100644
--- a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py
+++ b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py
@@ -3,3 +3,7 @@
 
 class DummyPlatform(CudaPlatform):
     device_name = "DummyDevice"
+
+    def get_attn_backend_cls(self, backend_name, head_size, dtype,
+                             kv_cache_dtype, block_size, use_v1):
+        return "vllm_add_dummy_platform.dummy_attention_backend.DummyAttentionBackend"  # noqa E501
diff --git a/tests/plugins_tests/test_platform_plugins.py b/tests/plugins_tests/test_platform_plugins.py
index 69698b34c71a3..661aa5f649ab9 100644
--- a/tests/plugins_tests/test_platform_plugins.py
+++ b/tests/plugins_tests/test_platform_plugins.py
@@ -1,3 +1,10 @@
+import torch
+
+from tests.kernels.utils import override_backend_env_variable
+from vllm.attention.selector import get_attn_backend
+from vllm.utils import STR_INVALID_VAL
+
+
 def test_platform_plugins():
     # simulate workload by running an example
     import runpy
@@ -14,3 +21,10 @@ def test_platform_plugins():
         f"Expected DummyDevice, got {current_platform.device_name}, "
         "possibly because current_platform is imported before the plugin"
         f" is loaded. The first import:\n{_init_trace}")
+
+
+def test_oot_attention_backend(monkeypatch):
+    # ignore the backend env variable if it is set
+    override_backend_env_variable(monkeypatch, STR_INVALID_VAL)
+    backend = get_attn_backend(16, torch.float16, torch.float16, 16, False)
+    assert backend.get_name() == "Dummy_Backend"
diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index a283e87d84070..9b03fd73fe690 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -190,11 +190,11 @@ def __init__(
                                         kv_cache_dtype=None,
                                         block_size=16,
                                         is_attention_free=False)
-        attn_backend = backend_name_to_enum(attn_backend.get_name())
-        if attn_backend in {_Backend.FLASH_ATTN, _Backend.FLASH_ATTN_VLLM_V1}:
-            attn_backend = _Backend.XFORMERS
+        backend = backend_name_to_enum(attn_backend.get_name())
+        if backend in {_Backend.FLASH_ATTN, _Backend.FLASH_ATTN_VLLM_V1}:
+            backend = _Backend.XFORMERS
 
-        self.attn_backend = attn_backend if attn_backend in {
+        self.attn_backend = backend if backend in {
             _Backend.TORCH_SDPA, _Backend.XFORMERS
         } else _Backend.TORCH_SDPA
 
diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py
index 0ff007c87b1c9..81ea6eefb5410 100644
--- a/vllm/attention/selector.py
+++ b/vllm/attention/selector.py
@@ -14,16 +14,18 @@
 logger = init_logger(__name__)
 
 
-def backend_name_to_enum(backend_name: str) -> _Backend:
-    assert backend_name is not None
-
-    backend_members = _Backend.__members__
-    if backend_name not in backend_members:
-        raise ValueError(f"Invalid attention backend '{backend_name}'. "
-                         f"Available backends: {', '.join(backend_members)} "
-                         "(case-sensitive).")
+def backend_name_to_enum(backend_name: str) -> Optional[_Backend]:
+    """
+    Convert a string backend name to a _Backend enum value.
 
-    return _Backend[backend_name]
+    Returns:
+    * _Backend: enum value if backend_name is a valid in-tree type
+    * None: otherwise it's an invalid in-tree type or an out-of-tree platform is
+            loaded.
+    """
+    assert backend_name is not None
+    return _Backend[backend_name] if backend_name in _Backend.__members__ else \
+          None
 
 
 def get_env_variable_attn_backend() -> Optional[_Backend]:

From 97eb97b5a4fd64c3cbc97bb9d71a9bfd98348799 Mon Sep 17 00:00:00 2001
From: RunningLeon <maningsheng@sensetime.com>
Date: Wed, 15 Jan 2025 19:35:17 +0800
Subject: [PATCH 228/462] [Model]: Support internlm3 (#12037)

---
 docs/source/models/supported_models.md |  5 ++++
 tests/models/registry.py               |  2 ++
 vllm/model_executor/models/llama.py    | 35 +++++++++++++++-----------
 vllm/model_executor/models/registry.py |  1 +
 4 files changed, 28 insertions(+), 15 deletions(-)

diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md
index 642ef3c9655b8..85d844f3d3f55 100644
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -216,6 +216,11 @@ See [this page](#generative-models) for more information on how to use generativ
   - `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc.
   - ✅︎
   - ✅︎
+* - `InternLM3ForCausalLM`
+  - InternLM3
+  - `internlm/internlm3-8b-instruct`, etc.
+  - ✅︎
+  - ✅︎
 * - `JAISLMHeadModel`
   - Jais
   - `inceptionai/jais-13b`, `inceptionai/jais-13b-chat`, `inceptionai/jais-30b-v3`, `inceptionai/jais-30b-chat-v3`, etc.
diff --git a/tests/models/registry.py b/tests/models/registry.py
index d079725b2f78d..b0f0f9767a90f 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -85,6 +85,8 @@ class _HfExamplesInfo:
                                             trust_remote_code=True),
     "InternLM2VEForCausalLM": _HfExamplesInfo("OpenGVLab/Mono-InternVL-2B",
                                               trust_remote_code=True),
+    "InternLM3ForCausalLM": _HfExamplesInfo("internlm/internlm3-8b-instruct",
+                                            trust_remote_code=True),
     "JAISLMHeadModel": _HfExamplesInfo("inceptionai/jais-13b-chat"),
     "JambaForCausalLM": _HfExamplesInfo("ai21labs/AI21-Jamba-1.5-Mini"),
     "LlamaForCausalLM": _HfExamplesInfo("meta-llama/Meta-Llama-3-8B"),
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index 16fa7acf54fdc..e8732c57fad49 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -97,20 +97,19 @@ def forward(self, x):
 
 class LlamaAttention(nn.Module):
 
-    def __init__(
-        self,
-        config: LlamaConfig,
-        hidden_size: int,
-        num_heads: int,
-        num_kv_heads: int,
-        rope_theta: float = 10000,
-        rope_scaling: Optional[Dict[str, Any]] = None,
-        max_position_embeddings: int = 8192,
-        quant_config: Optional[QuantizationConfig] = None,
-        bias: bool = False,
-        cache_config: Optional[CacheConfig] = None,
-        prefix: str = "",
-    ) -> None:
+    def __init__(self,
+                 config: LlamaConfig,
+                 hidden_size: int,
+                 num_heads: int,
+                 num_kv_heads: int,
+                 rope_theta: float = 10000,
+                 rope_scaling: Optional[Dict[str, Any]] = None,
+                 max_position_embeddings: int = 8192,
+                 quant_config: Optional[QuantizationConfig] = None,
+                 bias: bool = False,
+                 cache_config: Optional[CacheConfig] = None,
+                 prefix: str = "",
+                 bias_o_proj: bool = False) -> None:
         super().__init__()
         layer_idx = extract_layer_index(prefix)
         self.hidden_size = hidden_size
@@ -150,7 +149,7 @@ def __init__(
         self.o_proj = RowParallelLinear(
             input_size=self.total_num_heads * self.head_dim,
             output_size=hidden_size,
-            bias=bias,
+            bias=bias_o_proj,
             quant_config=quant_config,
             prefix=f"{prefix}.o_proj",
         )
@@ -231,6 +230,11 @@ def __init__(
         # Support internlm/internlm-7b with bias
         attention_bias = getattr(config, "attention_bias", False) or getattr(
             config, "bias", False)
+        bias_o_proj = attention_bias
+        # support internlm/internlm3-8b with qkv_bias
+        if hasattr(config, 'qkv_bias'):
+            attention_bias = config.qkv_bias
+
         self.self_attn = LlamaAttention(
             config=config,
             hidden_size=self.hidden_size,
@@ -242,6 +246,7 @@ def __init__(
             max_position_embeddings=max_position_embeddings,
             quant_config=quant_config,
             bias=attention_bias,
+            bias_o_proj=bias_o_proj,
             cache_config=cache_config,
             prefix=f"{prefix}.self_attn",
         )
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index a7286a9203f67..a71f7f7029c7d 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -60,6 +60,7 @@
     "InternLMForCausalLM": ("llama", "LlamaForCausalLM"),
     "InternLM2ForCausalLM": ("internlm2", "InternLM2ForCausalLM"),
     "InternLM2VEForCausalLM": ("internlm2_ve", "InternLM2VEForCausalLM"),
+    "InternLM3ForCausalLM": ("llama", "LlamaForCausalLM"),
     "JAISLMHeadModel": ("jais", "JAISLMHeadModel"),
     "JambaForCausalLM": ("jamba", "JambaForCausalLM"),
     "LlamaForCausalLM": ("llama", "LlamaForCausalLM"),

From 5ecf3e0aafc3ae0e2923e0635adc6b26788429a3 Mon Sep 17 00:00:00 2001
From: Yuan <yuan.zhou@intel.com>
Date: Wed, 15 Jan 2025 21:16:40 +0800
Subject: [PATCH 229/462] Misc: allow to use proxy in `HTTPConnection` (#12042)

Signed-off-by: Yuan Zhou <yuan.zhou@intel.com>
---
 vllm/connections.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/connections.py b/vllm/connections.py
index e785a0b3ebd74..4c9f4f40cf640 100644
--- a/vllm/connections.py
+++ b/vllm/connections.py
@@ -29,7 +29,7 @@ def get_sync_client(self) -> requests.Session:
     # required, so that the client is only accessible inside async event loop
     async def get_async_client(self) -> aiohttp.ClientSession:
         if self._async_client is None or not self.reuse_client:
-            self._async_client = aiohttp.ClientSession()
+            self._async_client = aiohttp.ClientSession(trust_env=True)
 
         return self._async_client
 

From de0526f668d6918c1884fd3b201308e9049e6be9 Mon Sep 17 00:00:00 2001
From: kewang-xlnx <73578509+kewang-xlnx@users.noreply.github.com>
Date: Thu, 16 Jan 2025 00:05:15 +0800
Subject: [PATCH 230/462] [Misc][Quark] Upstream Quark format to VLLM (#10765)

Signed-off-by: kewang-xlnx <kewang@xilinx.com>
Signed-off-by: kewang2 <kewang2@amd.com>
Co-authored-by: kewang2 <kewang2@amd.com>
Co-authored-by: Michael Goin <michael@neuralmagic.com>
---
 tests/quantization/test_quark.py              |  30 ++
 vllm/config.py                                |   2 +-
 vllm/model_executor/layers/linear.py          |   2 +-
 .../layers/quantization/__init__.py           |   4 +
 .../layers/quantization/base_config.py        |   3 +
 .../compressed_tensors/compressed_tensors.py  |  16 +
 .../compressed_tensors/triton_scaled_mm.py    |   4 +
 .../quantization/compressed_tensors/utils.py  |  17 -
 .../layers/quantization/quark/__init__.py     |   0
 .../layers/quantization/quark/quark.py        | 387 ++++++++++++++++++
 .../layers/quantization/quark/quark_moe.py    | 225 ++++++++++
 .../quantization/quark/schemes/__init__.py    |   5 +
 .../quark/schemes/quark_scheme.py             |  52 +++
 .../quark/schemes/quark_w8a8_fp8.py           | 140 +++++++
 .../quark/schemes/quark_w8a8_int8.py          | 105 +++++
 .../layers/quantization/quark/utils.py        |  99 +++++
 vllm/model_executor/models/aria.py            |  11 +-
 vllm/model_executor/models/commandr.py        |  14 +
 vllm/model_executor/models/dbrx.py            |  73 +++-
 vllm/model_executor/models/exaone.py          |  12 +-
 vllm/model_executor/models/gemma2.py          |   6 +-
 vllm/model_executor/models/gpt_j.py           |  14 +
 vllm/model_executor/models/granite.py         |  12 +-
 vllm/model_executor/models/llama.py           |  12 +-
 vllm/model_executor/models/mixtral.py         |  14 +
 vllm/model_executor/models/mllama.py          |  13 +
 vllm/model_executor/models/nemotron.py        |  13 +
 vllm/model_executor/models/phimoe.py          |  14 +
 vllm/model_executor/models/qwen2.py           |  13 +
 vllm/model_executor/models/solar.py           |  12 +-
 vllm/model_executor/parameter.py              |   8 +-
 vllm/platforms/rocm.py                        |   2 +-
 32 files changed, 1264 insertions(+), 70 deletions(-)
 create mode 100644 tests/quantization/test_quark.py
 create mode 100644 vllm/model_executor/layers/quantization/quark/__init__.py
 create mode 100644 vllm/model_executor/layers/quantization/quark/quark.py
 create mode 100644 vllm/model_executor/layers/quantization/quark/quark_moe.py
 create mode 100644 vllm/model_executor/layers/quantization/quark/schemes/__init__.py
 create mode 100644 vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py
 create mode 100644 vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py
 create mode 100644 vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py
 create mode 100644 vllm/model_executor/layers/quantization/quark/utils.py

diff --git a/tests/quantization/test_quark.py b/tests/quantization/test_quark.py
new file mode 100644
index 0000000000000..27493a682b746
--- /dev/null
+++ b/tests/quantization/test_quark.py
@@ -0,0 +1,30 @@
+"""Test model set-up and weight loading for quark-quantized models.
+
+Run `pytest tests/quantization/test_quark.py`.
+"""
+
+import torch
+
+from vllm.model_executor.layers.quantization.quark.quark import (  # noqa: E501
+    QuarkLinearMethod, QuarkW8A8Fp8)
+
+
+def test_quark_fp8(vllm_runner):
+    model_path = "amd/Llama-3.1-8B-Instruct-FP8-KV-Quark-test"
+    with vllm_runner(model_path) as llm:
+        model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
+        layer = model.model.layers[0]
+
+        qkv_proj = layer.self_attn.qkv_proj
+
+        assert isinstance(qkv_proj.quant_method, QuarkLinearMethod)
+        assert isinstance(qkv_proj.scheme, QuarkW8A8Fp8)
+
+        if isinstance(qkv_proj.scheme, QuarkW8A8Fp8):
+            assert len(qkv_proj.input_scale.shape) == 0
+            assert qkv_proj.weight.dtype is torch.float8_e4m3fn
+            #assert qkv_proj.weight.dtype is torch.float8_e4m3fnuz
+            assert len(qkv_proj.weight_scale.shape) == 0
+
+        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        assert output
diff --git a/vllm/config.py b/vllm/config.py
index 4a42aefb75026..65cb0d85f172a 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -553,7 +553,7 @@ def _verify_quantization(self) -> None:
         optimized_quantization_methods = [
             "fp8", "marlin", "modelopt", "gptq_marlin_24", "gptq_marlin",
             "awq_marlin", "fbgemm_fp8", "compressed_tensors",
-            "compressed-tensors", "experts_int8"
+            "compressed-tensors", "experts_int8", "quark"
         ]
         if self.quantization is not None:
             self.quantization = self.quantization.lower()
diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index 8876ca72792cf..00ae64bbe6388 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -32,7 +32,7 @@
     "MarlinLinearMethod", "QQQLinearMethod", "GPTQMarlin24LinearMethod",
     "TPUInt8LinearMethod", "GPTQLinearMethod", "FBGEMMFp8LinearMethod",
     "ModelOptFp8LinearMethod", "IPEXAWQLinearMethod", "IPEXGPTQLinearMethod",
-    "HQQMarlinMethod"
+    "HQQMarlinMethod", "QuarkLinearMethod"
 ]
 
 
diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py
index dd10c434f0752..caeb8b95e02f2 100644
--- a/vllm/model_executor/layers/quantization/__init__.py
+++ b/vllm/model_executor/layers/quantization/__init__.py
@@ -26,6 +26,7 @@
     "experts_int8",
     "neuron_quant",
     "ipex",
+    "quark"
 ]
 
 
@@ -34,6 +35,8 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
         raise ValueError(f"Invalid quantization method: {quantization}")
 
     # lazy import to avoid triggering `torch.compile` too early
+    from vllm.model_executor.layers.quantization.quark.quark import QuarkConfig
+
     from .aqlm import AQLMConfig
     from .awq import AWQConfig
     from .awq_marlin import AWQMarlinConfig
@@ -79,6 +82,7 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
         "experts_int8": ExpertsInt8Config,
         "neuron_quant": NeuronQuantConfig,
         "ipex": IPEXConfig,
+        "quark": QuarkConfig
     }
 
     return method_to_config[quantization]
diff --git a/vllm/model_executor/layers/quantization/base_config.py b/vllm/model_executor/layers/quantization/base_config.py
index 6dfac8aad5358..2fb2642dd5156 100644
--- a/vllm/model_executor/layers/quantization/base_config.py
+++ b/vllm/model_executor/layers/quantization/base_config.py
@@ -133,3 +133,6 @@ def get_quant_method(self, layer: torch.nn.Module,
             method.
         """
         raise NotImplementedError
+
+    def get_cache_scale(self, name: str) -> Optional[str]:
+        return None
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index f0e4eda76734b..b2fc2360f47f1 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -412,6 +412,22 @@ def get_scheme(
         self._check_scheme_supported(scheme.get_min_capability())
         return scheme
 
+    def get_cache_scale(self, name: str) -> Optional[str]:
+        """
+        Check whether the param name matches the format for k/v cache scales
+        in compressed-tensors. If this is the case, return its equivalent
+        param name expected by vLLM
+
+        :param name: param name
+        :return: matching param name for KV cache scale in vLLM
+        """
+        if name.endswith(".output_scale") and ".k_proj" in name:
+            return name.replace(".k_proj.output_scale", ".attn.k_scale")
+        if name.endswith(".output_scale") and ".v_proj" in name:
+            return name.replace(".v_proj.output_scale", ".attn.v_scale")
+        # If no matches, return None
+        return None
+
     @staticmethod
     def supports_cutlass_24(
             weight_quant: Optional[QuantizationArgs],
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py b/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py
index 2659afcdc74a9..f4c1dbc0361c6 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py
@@ -136,6 +136,10 @@ def triton_scaled_mm(input: torch.Tensor,
     assert N > 0 and K > 0 and M > 0
     assert weight.shape[0] == K
     assert input.dtype == weight.dtype
+
+    scale_a = scale_a.reshape(-1, 1) if scale_a.dim() <= 1 else scale_a
+    scale_b = scale_b.reshape(-1, 1) if scale_b.dim() <= 1 else scale_b
+
     assert scale_a.dtype == scale_b.dtype and scale_a.is_floating_point()
     assert scale_a.shape == torch.Size([1, 1]) or scale_a.shape == torch.Size(
         [M, 1])
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
index dfae4db71e546..8fcbda377428e 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
@@ -133,23 +133,6 @@ def _find_first_match(value: str,
     return None
 
 
-def get_compressed_tensors_cache_scale(name: str) -> Optional[str]:
-    """
-    Check whether the param name matches the format for k/v cache scales
-    in compressed-tensors. If this is the case, return its equivalent
-    param name expected by vLLM
-
-    :param name: param name
-    :return: matching param name for KV cache scale in vLLM
-    """
-    if name.endswith(".output_scale") and ".k_proj" in name:
-        return name.replace(".k_proj.output_scale", ".attn.k_scale")
-    if name.endswith(".output_scale") and ".v_proj" in name:
-        return name.replace(".v_proj.output_scale", ".attn.v_scale")
-    # If no matches, return None
-    return None
-
-
 def _is_equal_or_regex_match(value: str,
                              target: str,
                              check_contains: bool = False) -> bool:
diff --git a/vllm/model_executor/layers/quantization/quark/__init__.py b/vllm/model_executor/layers/quantization/quark/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/vllm/model_executor/layers/quantization/quark/quark.py b/vllm/model_executor/layers/quantization/quark/quark.py
new file mode 100644
index 0000000000000..fc214255eca71
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/quark/quark.py
@@ -0,0 +1,387 @@
+import fnmatch
+import re
+from typing import Any, Dict, List, Optional, cast
+
+import torch
+
+from vllm.model_executor.layers.fused_moe import FusedMoE
+from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
+                                               UnquantizedLinearMethod)
+from vllm.model_executor.layers.quantization.base_config import (  # noqa: E501
+    QuantizationConfig, QuantizeMethodBase)
+from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
+from vllm.model_executor.layers.quantization.quark.quark_moe import (  # noqa: E501
+    QuarkMoEMethod)
+from vllm.model_executor.layers.quantization.quark.schemes import (
+    QuarkScheme, QuarkW8A8Fp8, QuarkW8A8Int8)
+from vllm.model_executor.layers.quantization.quark.utils import (
+    deep_compare, should_ignore_layer)
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+    FUSED_LAYER_NAME_MAPPING)
+from vllm.platforms import current_platform
+
+__all__ = ["QuarkLinearMethod"]
+
+
+class QuarkConfig(QuantizationConfig):
+
+    def __init__(self,
+                 quant_config: Dict[str, Any],
+                 kv_cache_group: Optional[List[str]] = None,
+                 kv_cache_config: Optional[Dict[str, Any]] = None,
+                 pack_method: str = "reorder"):
+        if kv_cache_group is None:
+            kv_cache_group = []
+        self.quant_config = quant_config
+        self.kv_cache_group = kv_cache_group
+        self.kv_cache_config = kv_cache_config
+        self.pack_method = pack_method
+
+    def get_linear_method(self) -> "QuarkLinearMethod":
+        return QuarkLinearMethod(self)
+
+    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
+        return [torch.float16, torch.bfloat16]
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        return 70
+
+    def get_name(self) -> str:
+        return "quark"
+
+    def get_quant_method(self, layer: torch.nn.Module,
+                         prefix: str) -> Optional["QuantizeMethodBase"]:
+        from vllm.attention.layer import Attention  # Avoid circular import
+
+        # Check if the layer is skipped for quantization.
+        exclude_layers = cast(List[str], self.quant_config.get("exclude"))
+        if should_ignore_layer(prefix, ignore=exclude_layers):
+            return UnquantizedLinearMethod()
+        if isinstance(layer, LinearBase):
+            scheme = self.get_scheme(layer=layer, layer_name=prefix)
+            layer.scheme = scheme
+            return QuarkLinearMethod(self)
+        if isinstance(layer, Attention):
+            return QuarkKVCacheMethod(self)
+        if isinstance(layer, FusedMoE):
+            return QuarkMoEMethod.get_moe_method(self,
+                                                 module=layer,
+                                                 layer_name=prefix)
+        return None
+
+    @classmethod
+    def from_config(cls, config: Dict[str, Any]) -> "QuarkConfig":
+        export_config = config.get("export")
+        if export_config is None:
+            raise ValueError("The export key should be included in "
+                             "the configurations of Quark quantized model")
+        kv_cache_group = cast(List[str], export_config.get("kv_cache_group"))
+        pack_method = cast(str, export_config.get("pack_method"))
+
+        # In the export model of quark, the quantization configuration
+        # of kv_cache is stored in layer_quant_config. First, it is
+        # judged whether kv_cache_group exists, and then it is judged
+        # whether layer_quant_config has a quantization configuration
+        # that matches kv_cache.
+        if len(kv_cache_group) == 0:
+            kv_cache_config = None
+        else:
+            kv_cache_set = set(kv_cache_group)
+            layer_quant_config = cast(Dict[str, Any],
+                                      config.get("layer_quant_config"))
+            layer_quant_names = list(layer_quant_config.keys())
+            layer_quant_set = set(layer_quant_names)
+
+            if not kv_cache_set.issubset(layer_quant_set):
+                raise ValueError("The Quark quantized model has the "
+                                 "kv_cache_group parameter setting, "
+                                 "but no kv_cache quantization settings "
+                                 "were found in the quantization "
+                                 "configuration.")
+
+            q_configs = [
+                cast(Dict[str, Any], layer_quant_config.get(name))
+                for name in kv_cache_group
+            ]
+            if not all(
+                    deep_compare(q_config, q_configs[0])
+                    for q_config in q_configs):
+                raise ValueError(
+                    "The quantization method used for kv_cache should "
+                    "be the same, but the quantization method for the "
+                    "kv_cache layer in the config is different.")
+            kv_cache_config = q_configs[0].get("output_tensors")
+            if kv_cache_config is None:
+                raise ValueError(
+                    "The kv_cache quantization configuration is empty.")
+
+            # Since we have already set kv_cache quantization configurations,
+            # we will remove the quantization configuration for the
+            # output_tensors corresponding to the kv_cache layer.
+            for q_config in q_configs:
+                q_config["output_tensors"] = None
+
+        return cls(quant_config=config,
+                   kv_cache_group=kv_cache_group,
+                   kv_cache_config=kv_cache_config,
+                   pack_method=pack_method)
+
+    @classmethod
+    def get_config_filenames(cls) -> List[str]:
+        return []
+
+    def _check_scheme_supported(self,
+                                min_capability: int,
+                                error: bool = True) -> bool:
+        capability_tuple = current_platform.get_device_capability()
+
+        if capability_tuple is not None:
+            capability = capability_tuple.to_int()
+            supported = capability >= min_capability
+            if error and not supported:
+                raise RuntimeError(
+                    "Quantization scheme is not supported for ",
+                    f"the current GPU. Min capability: {min_capability}. ",
+                    f"Current capability: {capability}.")
+            return supported
+        else:
+            return False
+
+    def _is_fp8_w8a8(self, weight_quant: Optional[Dict[str, Any]],
+                     input_quant: Optional[Dict[str, Any]]) -> bool:
+        # Confirm weights and input quantized.
+        if weight_quant is None or input_quant is None:
+            return False
+
+        # Confirm weight scheme is supported
+        is_fp8_dtype = (weight_quant.get("dtype") == "fp8_e4m3"
+                        and input_quant.get("dtype") == "fp8_e4m3")
+        is_static_weight = not weight_quant.get("is_dynamic")
+        is_per_tensor_or_channel_weight = (weight_quant.get("qscheme")
+                                           in ["per_tensor", "per_channel"])
+
+        if not (is_fp8_dtype and is_static_weight
+                and is_per_tensor_or_channel_weight):
+            return False
+
+        # Dynamic quantization is always supported if weights supported.
+        if input_quant.get("is_dynamic"):
+            return True
+
+        # Confirm activation scheme is supported.
+        is_per_tensor_activation = (input_quant.get("qscheme") == "per_tensor")
+        return is_per_tensor_activation
+
+    def _is_static_tensor_w8a8(self, weight_quant: Optional[Dict[str, Any]],
+                               input_quant: Optional[Dict[str, Any]]) -> bool:
+        # Confirm weights and input quantized.
+        if weight_quant is None or input_quant is None:
+            return False
+
+        is_int8_dtype = (weight_quant.get("dtype") == "int8"
+                         and input_quant.get("dtype") == "int8")
+
+        is_tensor = (weight_quant.get("qscheme")
+                     in ["per_tensor", "per_channel"]
+                     and input_quant.get("qscheme") == "per_tensor")
+
+        is_static = (not weight_quant.get("is_dynamic")
+                     and not input_quant.get("is_dynamic"))
+
+        is_weight_symmetric = (weight_quant.get("symmetric") is True)
+
+        # Both symmetric and asymmetric input quantization supported.
+        # Only symmetric weight quantization supported.
+        return is_int8_dtype and is_tensor and is_weight_symmetric and is_static
+
+    def _find_matched_config(self, layer_name: str,
+                             module: torch.nn.Module) -> Dict[str, Any]:
+
+        proj_name = layer_name.split(".")[-1]
+        if proj_name in FUSED_LAYER_NAME_MAPPING:
+            shard_proj_names = FUSED_LAYER_NAME_MAPPING[proj_name]
+
+            # Convert fused_name --> [shard_names]
+            shard_names = [
+                layer_name.replace(proj_name, shard_proj_name)
+                for shard_proj_name in shard_proj_names
+            ]
+            shard_configs = [
+                self._find_matched_config(shard_name, module)
+                for shard_name in shard_names
+            ]
+            if not all(
+                    deep_compare(q_config, shard_configs[0])
+                    for q_config in shard_configs):
+                raise ValueError(
+                    f"Found a different quantization configuration for "
+                    f"{shard_proj_names} in {layer_name}. vLLM "
+                    "requires all to use the same scheme.")
+            return shard_configs[0]
+        else:
+            layer_quant_config = cast(
+                Dict[str, Any], self.quant_config.get("layer_quant_config"))
+            for name_pattern in layer_quant_config:
+                if fnmatch.fnmatch(layer_name, name_pattern):
+                    return layer_quant_config[name_pattern]
+
+            layer_type = cast(str, type(module))
+            layer_type_quant_config = cast(
+                Dict[str, Any],
+                self.quant_config.get("layer_type_quant_config"))
+            if layer_type in layer_type_quant_config:
+                return layer_type_quant_config[layer_type]
+
+            global_quant_config = cast(
+                Dict[str, Any], self.quant_config.get("global_quant_config"))
+            return global_quant_config
+
+    def _get_scheme_from_config(self, config: Dict[str, Any]) -> "QuarkScheme":
+        if config.get("output_tensors") or config.get("bias"):
+            raise NotImplementedError(
+                "Currently, Quark models with output_tensors "
+                "and bias quantized are not supported")
+        weight_config = cast(Dict[str, Any], config.get("weight"))
+        input_config = cast(Dict[str, Any], config.get("input_tensors"))
+
+        if self._is_fp8_w8a8(weight_config, input_config):
+            is_fp8_w8a8_supported = self._check_scheme_supported(
+                QuarkW8A8Fp8.get_min_capability(), error=False)
+            if is_fp8_w8a8_supported:
+                weight_qscheme = cast(str, weight_config.get("qscheme"))
+                input_static = (input_config is not None and
+                                not cast(bool, input_config.get("is_dynamic")))
+                return QuarkW8A8Fp8(qscheme=weight_qscheme,
+                                    is_static_input_scheme=input_static)
+        elif self._is_static_tensor_w8a8(weight_config, input_config):
+            weight_qscheme = cast(str, weight_config.get("qscheme"))
+            return QuarkW8A8Int8(qscheme=weight_qscheme,
+                                 is_static_input_scheme=True,
+                                 input_symmetric=input_config.get("symmetric"))
+
+        raise NotImplementedError("No quark compatible scheme was found. "
+                                  f"Weight config: {weight_config}, "
+                                  f"Input config: {input_config}")
+
+    def get_scheme(self, layer: torch.nn.Module,
+                   layer_name: str) -> "QuarkScheme":
+
+        layer_quant_config = self._find_matched_config(layer_name, layer)
+
+        # Find the quant_scheme
+        scheme = self._get_scheme_from_config(layer_quant_config)
+        # Raise error if device does not support the scheme
+        # (e.g. fp8 needs ada lovelace)
+        self._check_scheme_supported(scheme.get_min_capability())
+
+        return scheme
+
+    def get_cache_scale(self, name: str) -> Optional[str]:
+        """
+        Check whether the param name matches the format for k/v cache scales
+        in quark. If this is the case, return its equivalent param name 
+        expected by vLLM
+
+        :param name: param name
+        :return: matching param name for KV cache scale in vLLM
+        """
+        if self.kv_cache_group is None or len(self.kv_cache_group) == 0:
+            return None
+
+        kv_proj_names = [
+            re.split(r"[*.]", kv_cache)[-1] for kv_cache in self.kv_cache_group
+        ]
+        if name.endswith(".output_scale"):
+            if len(kv_proj_names) == 1 and kv_proj_names[0] in name:
+                kv_output_scale_name = "." + kv_proj_names[0] + ".output_scale"
+                return name.replace(kv_output_scale_name, ".attn.k_scale")
+
+            elif len(kv_proj_names) == 2:
+                for kv_proj_name in kv_proj_names:
+                    if kv_proj_name in name and kv_proj_name == "k_proj":
+                        return name.replace(".k_proj.output_scale",
+                                            ".attn.k_scale")
+                    elif kv_proj_name in name and kv_proj_name == "v_proj":
+                        return name.replace(".v_proj.output_scale",
+                                            ".attn.v_scale")
+
+        # If no matches, return None
+        return None
+
+
+class QuarkLinearMethod(LinearMethodBase):
+
+    def __init__(self, quantization_config: QuarkConfig):
+        self.quantization_config = quantization_config
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        layer.scheme.process_weights_after_loading(layer)
+
+    def create_weights(self, layer: torch.nn.Module,
+                       input_size_per_partition: int,
+                       output_partition_sizes: List[int], input_size: int,
+                       output_size: int, params_dtype: torch.dtype,
+                       **extra_weight_attrs):
+        """
+        Use the CompressedTensorsScheme associated with each layer to create
+        the necessary parameters for the layer. See LinearMethodBase for param
+        details
+        """
+        weight_loader = extra_weight_attrs.get("weight_loader")
+        layer.scheme.create_weights(
+            layer=layer,
+            input_size=input_size,
+            input_size_per_partition=input_size_per_partition,
+            output_partition_sizes=output_partition_sizes,
+            output_size=output_size,
+            params_dtype=params_dtype,
+            weight_loader=weight_loader)
+
+    def apply(self,
+              layer: torch.nn.Module,
+              x: torch.Tensor,
+              bias: Optional[torch.Tensor] = None):
+        """
+        Use the output of create_weights and the CompressedTensorsScheme
+        associated with the layer to apply the forward pass with the
+        layer input.  See LinearMethodBase for param details
+
+        """
+        scheme = layer.scheme
+        if scheme is None:
+            raise ValueError("A scheme must be defined for each layer")
+        return scheme.apply_weights(layer, x, bias=bias)
+
+
+class QuarkKVCacheMethod(BaseKVCacheMethod):
+    """
+    Supports loading kv-cache scaling factors from quark checkpoints.
+    """
+
+    def __init__(self, quant_config: QuarkConfig):
+        self.validate_kv_cache_config(quant_config.kv_cache_config)
+        super().__init__(quant_config)
+
+    @staticmethod
+    def validate_kv_cache_config(kv_cache_config: Optional[Dict[str, Any]]):
+        """
+        Validator for the kv cache configuration. Useful for controlling the
+        kv cache quantization schemes, that are being supported in vLLM
+        :param kv_cache_config: the quark kv cache scheme
+        """
+        if kv_cache_config is None:
+            return
+
+        dtype = kv_cache_config.get("dtype")
+        if dtype != "fp8_e4m3":
+            raise NotImplementedError(
+                "Currently supported kv cache quantization is "
+                f"dtype=fp8_e4m3, however received {dtype}")
+
+        qscheme = kv_cache_config.get("qscheme")
+        if qscheme != "per_tensor":
+            raise NotImplementedError(
+                "Only support per-tensor scaling factor "
+                "for quark KV cache. "
+                f"Expected qscheme: per_tensor, found qscheme: {qscheme}")
diff --git a/vllm/model_executor/layers/quantization/quark/quark_moe.py b/vllm/model_executor/layers/quantization/quark/quark_moe.py
new file mode 100644
index 0000000000000..3e19247300808
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/quark/quark_moe.py
@@ -0,0 +1,225 @@
+from typing import Any, Callable, Dict, Optional
+
+import torch
+
+import vllm.model_executor.layers.fused_moe  # noqa
+from vllm import _custom_ops as ops
+from vllm.logger import init_logger
+from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEMethodBase,
+                                                  FusedMoeWeightScaleSupported)
+from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
+    all_close_1d, normalize_e4m3fn_to_e4m3fnuz, per_tensor_dequantize)
+from vllm.model_executor.utils import set_weight_attrs
+from vllm.platforms import current_platform
+
+logger = init_logger(__name__)
+
+__all__ = ["QuarkMoEMethod", "QuarkW8A8Fp8MoEMethod"]
+
+
+class QuarkMoEMethod(FusedMoEMethodBase):
+
+    @staticmethod
+    def get_moe_method(
+            quant_config: "QuarkConfig",  # type: ignore # noqa E501 # noqa F821
+            module: torch.nn.Module,
+            layer_name: str) -> "QuarkMoEMethod":
+        layer_quant_config = quant_config._find_matched_config(
+            layer_name, module)
+
+        if (layer_quant_config.get("output_tensors")
+                or layer_quant_config.get("bias")):
+            raise NotImplementedError("Currently, Quark models with "
+                                      "output_tensors and bias "
+                                      "quantized are not supported")
+        weight_config = layer_quant_config.get("weight")
+        input_config = layer_quant_config.get("input_tensors")
+
+        if quant_config._is_fp8_w8a8(weight_config, input_config):
+            return QuarkW8A8Fp8MoEMethod(weight_config, input_config)
+        else:
+            raise RuntimeError("Unsupported FusedMoe scheme")
+
+
+class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod):
+
+    def __init__(self, weight_config: Dict[str, Any], input_config: Dict[str,
+                                                                         Any]):
+        self.weight_quant = weight_config
+        self.input_quant = input_config
+
+        weight_qscheme = self.weight_quant.get("qscheme")
+        input_qscheme = self.input_quant.get("qscheme")
+        if not (weight_qscheme == "per_tensor"
+                and input_qscheme == "per_tensor"):
+            raise ValueError(
+                "For FP8 Fused MoE layers, only per-tensor scales"
+                "for weights and activations are supported. Found "
+                f"{weight_qscheme}, {input_qscheme}")  # noqa E501
+
+        self.static_input_scales = not self.input_quant.get("is_dynamic")
+
+    def create_weights(self, layer: torch.nn.Module, num_experts: int,
+                       hidden_size: int, intermediate_size: int,
+                       params_dtype: torch.dtype, **extra_weight_attrs):
+
+        params_dtype = torch.float8_e4m3fn
+
+        # WEIGHTS
+        w13_weight = torch.nn.Parameter(torch.empty(num_experts,
+                                                    2 * intermediate_size,
+                                                    hidden_size,
+                                                    dtype=params_dtype),
+                                        requires_grad=False)
+        layer.register_parameter("w13_weight", w13_weight)
+        set_weight_attrs(w13_weight, extra_weight_attrs)
+
+        w2_weight = torch.nn.Parameter(torch.empty(num_experts,
+                                                   hidden_size,
+                                                   intermediate_size,
+                                                   dtype=params_dtype),
+                                       requires_grad=False)
+        layer.register_parameter("w2_weight", w2_weight)
+        set_weight_attrs(w2_weight, extra_weight_attrs)
+
+        # WEIGHT_SCALES
+        # Allocate 2 scales for w1 and w3 respectively.
+        # They will be combined to a single scale after weight loading.
+        w13_weight_scale = torch.nn.Parameter(torch.ones(num_experts,
+                                                         2,
+                                                         dtype=torch.float32),
+                                              requires_grad=False)
+        layer.register_parameter("w13_weight_scale", w13_weight_scale)
+
+        w2_weight_scale = torch.nn.Parameter(torch.ones(num_experts,
+                                                        dtype=torch.float32),
+                                             requires_grad=False)
+        layer.register_parameter("w2_weight_scale", w2_weight_scale)
+        # Add the quantization method used (per tensor/grouped/channel)
+        # to ensure the weight scales are loaded in properly
+        extra_weight_attrs.update(
+            {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value})
+        set_weight_attrs(w13_weight_scale, extra_weight_attrs)
+        set_weight_attrs(w2_weight_scale, extra_weight_attrs)
+
+        # INPUT_SCALES
+        if self.static_input_scales:
+            w13_input_scale = torch.nn.Parameter(torch.ones(
+                num_experts, dtype=torch.float32),
+                                                 requires_grad=False)
+            layer.register_parameter("w13_input_scale", w13_input_scale)
+            set_weight_attrs(w13_input_scale, extra_weight_attrs)
+
+            w2_input_scale = torch.nn.Parameter(torch.ones(
+                num_experts, dtype=torch.float32),
+                                                requires_grad=False)
+            layer.register_parameter("w2_input_scale", w2_input_scale)
+            set_weight_attrs(w2_input_scale, extra_weight_attrs)
+        else:
+            layer.w13_input_scale = None
+            layer.w2_input_scale = None
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        # Fp8 moe kernels require a single activation scale.
+        # We take the max of all the scales in case they differ.
+        if self.static_input_scales:
+            if (layer.w13_input_scale is None or layer.w2_input_scale is None):
+                raise ValueError(
+                    "QuantConfig has static quantization, but found "
+                    "activation scales are None.")
+            if (not all_close_1d(layer.w13_input_scale)
+                    or not all_close_1d(layer.w2_input_scale)):
+                logger.warning_once(
+                    "Found input_scales that are not equal for "
+                    "fp8 MoE layer. Using the maximum across experts "
+                    "for each layer. ")
+            layer.w13_input_scale = torch.nn.Parameter(
+                layer.w13_input_scale.max(), requires_grad=False)
+            layer.w2_input_scale = torch.nn.Parameter(
+                layer.w2_input_scale.max(), requires_grad=False)
+
+        # If rocm, normalize the weights and scales to e4m3fnuz
+        if current_platform.is_rocm():
+            # Normalize the weights and scales
+            w13_weight, w13_weight_scale, w13_input_scale = \
+                normalize_e4m3fn_to_e4m3fnuz(
+                    layer.w13_weight, layer.w13_weight_scale,
+                    layer.w13_input_scale)
+            w2_weight, w2_weight_scale, w2_input_scale = \
+                normalize_e4m3fn_to_e4m3fnuz(
+                    layer.w2_weight, layer.w2_weight_scale,
+                    layer.w2_input_scale)
+            # Reset the parameter
+            layer.w13_weight = torch.nn.Parameter(w13_weight,
+                                                  requires_grad=False)
+            layer.w13_weight_scale = torch.nn.Parameter(w13_weight_scale,
+                                                        requires_grad=False)
+            if w13_input_scale is not None:
+                layer.w13_input_scale = torch.nn.Parameter(w13_input_scale,
+                                                           requires_grad=False)
+            layer.w2_weight = torch.nn.Parameter(w2_weight,
+                                                 requires_grad=False)
+            layer.w2_weight_scale = torch.nn.Parameter(w2_weight_scale,
+                                                       requires_grad=False)
+            if w2_input_scale is not None:
+                layer.w2_input_scale = torch.nn.Parameter(w2_input_scale,
+                                                          requires_grad=False)
+
+        # Fp8 moe kernel needs single weight scale for w13 per expert.
+        # We take the max then dequant and requant each expert.
+        assert layer.w13_weight_scale is not None
+        shard_size = layer.intermediate_size_per_partition
+        max_w13_scales = layer.w13_weight_scale.max(dim=1).values
+        for expert_id in range(layer.num_experts):
+            start = 0
+            for shard_id in range(2):
+                dq_weight = per_tensor_dequantize(
+                    layer.w13_weight[expert_id][start:start + shard_size, :],
+                    layer.w13_weight_scale[expert_id][shard_id])
+                layer.w13_weight[expert_id][
+                    start:start + shard_size, :], _ = ops.scaled_fp8_quant(
+                        dq_weight, max_w13_scales[expert_id])
+                start += shard_size
+
+        layer.w13_weight_scale = torch.nn.Parameter(max_w13_scales,
+                                                    requires_grad=False)
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        router_logits: torch.Tensor,
+        top_k: int,
+        renormalize: bool,
+        use_grouped_topk: bool = False,
+        topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
+        custom_routing_function: Optional[Callable] = None,
+        scoring_func: str = "softmax",
+        e_score_correction_bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        from vllm.model_executor.layers.fused_moe import fused_experts
+
+        topk_weights, topk_ids = FusedMoE.select_experts(
+            hidden_states=x,
+            router_logits=router_logits,
+            use_grouped_topk=use_grouped_topk,
+            top_k=top_k,
+            renormalize=renormalize,
+            topk_group=topk_group,
+            num_expert_group=num_expert_group,
+            custom_routing_function=custom_routing_function,
+            scoring_func=scoring_func,
+            e_score_correction_bias=e_score_correction_bias)
+
+        return fused_experts(x,
+                             layer.w13_weight,
+                             layer.w2_weight,
+                             topk_weights=topk_weights,
+                             topk_ids=topk_ids,
+                             inplace=True,
+                             use_fp8_w8a8=True,
+                             w1_scale=layer.w13_weight_scale,
+                             w2_scale=layer.w2_weight_scale,
+                             a1_scale=layer.w13_input_scale,
+                             a2_scale=layer.w2_input_scale)
diff --git a/vllm/model_executor/layers/quantization/quark/schemes/__init__.py b/vllm/model_executor/layers/quantization/quark/schemes/__init__.py
new file mode 100644
index 0000000000000..fb0ba9bd5220c
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/quark/schemes/__init__.py
@@ -0,0 +1,5 @@
+from .quark_scheme import QuarkScheme
+from .quark_w8a8_fp8 import QuarkW8A8Fp8
+from .quark_w8a8_int8 import QuarkW8A8Int8
+
+__all__ = ["QuarkScheme", "QuarkW8A8Fp8", "QuarkW8A8Int8"]
diff --git a/vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py b/vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py
new file mode 100644
index 0000000000000..239597fa4be0e
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py
@@ -0,0 +1,52 @@
+from abc import ABC, abstractmethod
+from typing import Optional
+
+import torch
+
+__all__ = ["QuarkScheme"]
+
+
+class QuarkScheme(ABC):
+    """
+    Abstract class used to describe the weight creation and forward pass 
+    of different quantization schemes supported by Quark.
+    """
+
+    @classmethod
+    @abstractmethod
+    def get_min_capability(cls) -> int:
+        """
+        Get minimum device capability.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def create_weights(self, *args, **kwargs):
+        """
+        Weight creation for the particular scheme. Inputs to this function 
+
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor,
+                      bias: Optional[torch.Tensor]):
+        """
+        Run the forward pass for the particular scheme. This is where 
+        scheme-specific dequant/quant steps/kernels should be applied.
+
+        :param layer: torch.nn.Module with the registered weights and 
+            other parameters relevant to the particular scheme. 
+        :param x: input to the layer
+        :param bias: bias parameter
+
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def process_weights_after_loading(self, layer: torch.nn.Module):
+        """
+        Called after weight loading is complete for any cleanup that
+        needs to occur.
+        """
+        raise NotImplementedError
diff --git a/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py b/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py
new file mode 100644
index 0000000000000..206931ea2ffc0
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py
@@ -0,0 +1,140 @@
+from typing import Callable, List, Optional
+
+import torch
+from torch.nn import Parameter
+
+from vllm.model_executor.layers.quantization.quark.schemes import QuarkScheme
+from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
+    apply_fp8_linear, cutlass_fp8_supported, normalize_e4m3fn_to_e4m3fnuz,
+    requantize_with_max_scale)
+from vllm.model_executor.parameter import (ChannelQuantScaleParameter,
+                                           ModelWeightParameter,
+                                           PerTensorScaleParameter)
+from vllm.platforms import current_platform
+
+__all__ = ["QuarkW8A8Fp8"]
+
+
+class QuarkW8A8Fp8(QuarkScheme):
+
+    def __init__(self, qscheme: str, is_static_input_scheme: Optional[bool]):
+        self.qscheme = qscheme
+        self.is_static_input_scheme = is_static_input_scheme
+        self.cutlass_fp8_supported = cutlass_fp8_supported()
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        # lovelace and up
+        return 89
+
+    def process_weights_after_loading(self, layer) -> None:
+        # If per tensor, when we have a fused module (e.g. QKV) with per
+        # tensor scales (thus N scales being passed to the kernel),
+        # requantize so we can always run per tensor
+        if self.qscheme == "per_tensor":
+            max_w_scale, weight = requantize_with_max_scale(
+                weight=layer.weight,
+                weight_scale=layer.weight_scale,
+                logical_widths=layer.logical_widths,
+            )
+
+            if current_platform.is_rocm():
+                weight, max_w_scale, input_scale = normalize_e4m3fn_to_e4m3fnuz(
+                    weight=weight,
+                    weight_scale=max_w_scale,
+                    input_scale=layer.input_scale)
+                if input_scale is not None:
+                    layer.input_scale = Parameter(input_scale,
+                                                  requires_grad=False)
+
+            layer.weight = Parameter(weight.t(), requires_grad=False)
+            layer.weight_scale = Parameter(max_w_scale, requires_grad=False)
+
+        # If channelwise, scales are already lined up, so just transpose.
+        elif self.qscheme == "per_channel":
+            weight = layer.weight
+
+            if current_platform.is_rocm():
+                weight, weight_scale, input_scale = \
+                    normalize_e4m3fn_to_e4m3fnuz(
+                        weight=weight,
+                        weight_scale=layer.weight_scale,
+                        input_scale=layer.input_scale)
+                if input_scale is not None:
+                    layer.input_scale = Parameter(input_scale,
+                                                  requires_grad=False)
+            else:
+                weight_scale = layer.weight_scale.data
+
+            layer.weight = Parameter(weight.t(), requires_grad=False)
+            # required by torch.compile to be torch.nn.Parameter
+            layer.weight_scale = Parameter(weight_scale, requires_grad=False)
+
+        else:
+            raise ValueError(f"Unknown quantization scheme {self.qscheme}")
+
+        # INPUT SCALE
+        if self.is_static_input_scheme:
+            layer.input_scale = Parameter(layer.input_scale.max(),
+                                          requires_grad=False)
+        else:
+            layer.input_scale = None
+
+    def create_weights(self, layer: torch.nn.Module,
+                       output_partition_sizes: List[int],
+                       input_size_per_partition: int,
+                       params_dtype: torch.dtype, weight_loader: Callable,
+                       **kwargs):
+        output_size_per_partition = sum(output_partition_sizes)
+        layer.logical_widths = output_partition_sizes
+
+        # WEIGHT
+        weight = ModelWeightParameter(data=torch.empty(
+            output_size_per_partition,
+            input_size_per_partition,
+            dtype=torch.float8_e4m3fn),
+                                      input_dim=1,
+                                      output_dim=0,
+                                      weight_loader=weight_loader)
+        layer.register_parameter("weight", weight)
+
+        # WEIGHT SCALE
+        # TODO: update create_xxx_parameter functions to return
+        # the newly added parameters
+        if self.qscheme == "per_channel":
+            weight_scale = ChannelQuantScaleParameter(
+                data=torch.empty((sum(output_partition_sizes), 1),
+                                 dtype=torch.float32),
+                output_dim=0,
+                weight_loader=weight_loader)
+        else:
+            assert self.qscheme == "per_tensor"
+            weight_scale = PerTensorScaleParameter(data=torch.empty(
+                len(output_partition_sizes), dtype=torch.float32),
+                                                   weight_loader=weight_loader)
+
+        # min requirement for fp8 kernels
+        weight_scale[:] = torch.finfo(torch.float32).min
+        layer.register_parameter("weight_scale", weight_scale)
+
+        # INPUT SCALE
+        if self.is_static_input_scheme:
+            input_scale = PerTensorScaleParameter(data=torch.empty(
+                len(output_partition_sizes), dtype=torch.float32),
+                                                  weight_loader=weight_loader)
+            input_scale[:] = torch.finfo(torch.float32).min
+            layer.register_parameter("input_scale", input_scale)
+
+    def apply_weights(self,
+                      layer: torch.nn.Module,
+                      x: torch.Tensor,
+                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+
+        return apply_fp8_linear(
+            input=x,
+            weight=layer.weight,
+            weight_scale=layer.weight_scale,
+            input_scale=layer.input_scale,
+            bias=bias,
+            cutlass_fp8_supported=self.cutlass_fp8_supported,
+            use_per_token_if_dynamic=True)
diff --git a/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py b/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py
new file mode 100644
index 0000000000000..8cb47e9c37e56
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py
@@ -0,0 +1,105 @@
+from typing import Callable, List, Optional, Set
+
+import torch
+
+from vllm.logger import init_logger
+from vllm.model_executor.layers.quantization.kernels.scaled_mm import (
+    ScaledMMLinearLayerConfig, choose_scaled_mm_linear_kernel)
+from vllm.model_executor.layers.quantization.quark.schemes import QuarkScheme
+from vllm.model_executor.parameter import (BasevLLMParameter,
+                                           ChannelQuantScaleParameter,
+                                           ModelWeightParameter,
+                                           PerTensorScaleParameter)
+
+logger = init_logger(__name__)
+
+
+class QuarkW8A8Int8(QuarkScheme):
+    _kernel_backends_being_used: Set[str] = set()
+
+    def __init__(self, qscheme: str, is_static_input_scheme: Optional[bool],
+                 input_symmetric: Optional[bool]):
+        self.qscheme = qscheme
+        self.is_static_input_scheme = is_static_input_scheme
+        self.input_symmetric = input_symmetric
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        # turing and up
+        return 75
+
+    def create_weights(self, layer: torch.nn.Module,
+                       output_partition_sizes: List[int],
+                       input_size_per_partition: int,
+                       params_dtype: torch.dtype, weight_loader: Callable,
+                       **kwargs):
+        self.logical_widths = output_partition_sizes
+
+        scaled_mm_linear_kernel_config = ScaledMMLinearLayerConfig(
+            is_channelwise=(self.qscheme == "per_channel"),
+            is_static_input_scheme=(self.is_static_input_scheme is True),
+            input_symmetric=(self.input_symmetric is True))
+
+        kernel_type = choose_scaled_mm_linear_kernel(
+            scaled_mm_linear_kernel_config)
+
+        if kernel_type.__name__ not in self._kernel_backends_being_used:
+            logger.info("Using %s for QuarkW8A8Int8", kernel_type.__name__)
+            self._kernel_backends_being_used.add(kernel_type.__name__)
+
+        # WEIGHT
+        weight = ModelWeightParameter(data=torch.empty(
+            sum(output_partition_sizes),
+            input_size_per_partition,
+            dtype=torch.int8),
+                                      input_dim=1,
+                                      output_dim=0,
+                                      weight_loader=weight_loader)
+
+        layer.register_parameter("weight", weight)
+
+        # WEIGHT SCALE
+        if self.qscheme == "per_channel":
+            weight_scale = ChannelQuantScaleParameter(
+                data=torch.empty((sum(output_partition_sizes), 1),
+                                 dtype=torch.float32),
+                output_dim=0,
+                weight_loader=weight_loader)
+        else:
+            assert self.qscheme == "per_tensor"
+            weight_scale = PerTensorScaleParameter(data=torch.empty(
+                len(output_partition_sizes), dtype=torch.float32),
+                                                   weight_loader=weight_loader)
+        layer.register_parameter("weight_scale", weight_scale)
+
+        # INPUT SCALE
+        if self.is_static_input_scheme:
+            input_scale = BasevLLMParameter(data=torch.empty(
+                1, dtype=torch.float32),
+                                            weight_loader=weight_loader)
+            layer.register_parameter("input_scale", input_scale)
+
+            if not self.input_symmetric:
+                # Note: quark stores the zp using the same dtype
+                # as the weights
+                # AZP loaded as int8 but used as int32
+                input_zero_point = BasevLLMParameter(
+                    data=torch.empty(1, dtype=torch.int8),
+                    weight_loader=weight_loader)
+                layer.register_parameter("input_zero_point", input_zero_point)
+
+        self.kernel = kernel_type(c=scaled_mm_linear_kernel_config,
+                                  w_q_param_name="weight",
+                                  w_s_param_name="weight_scale",
+                                  i_s_param_name="input_scale",
+                                  i_zp_param_name="input_zero_point",
+                                  azp_adj_param_name="azp_adj")
+
+    # Checkpoints are serialized in quark format, which is
+    # different from the format the kernel may want. Handle repacking here.
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        self.kernel.process_weights_after_loading(layer)
+
+    def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor,
+                      bias: Optional[torch.Tensor]) -> torch.Tensor:
+        return self.kernel.apply_weights(layer, x, bias)
diff --git a/vllm/model_executor/layers/quantization/quark/utils.py b/vllm/model_executor/layers/quantization/quark/utils.py
new file mode 100644
index 0000000000000..742a629bdb1c5
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/quark/utils.py
@@ -0,0 +1,99 @@
+import re
+from typing import Any, Iterable, Optional
+
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+    FUSED_LAYER_NAME_MAPPING)
+
+
+def deep_compare(dict1: Any, dict2: Any) -> bool:
+    if type(dict1) is not type(dict2):
+        return False
+    if isinstance(dict1, dict):
+        if dict1.keys() != dict2.keys():
+            return False
+        return all(deep_compare(dict1[k], dict2[k]) for k in dict1)
+    elif isinstance(dict1, list):
+        return set(dict1) == set(dict2)
+    else:
+        return dict1 == dict2
+
+
+def should_ignore_layer(layer_name: Optional[str],
+                        ignore: Iterable[str]) -> bool:
+    if layer_name is None:
+        return False
+
+    # layer_name = model.layers.0.self_attn.qkv_proj
+    # proj_name = qkv_proj
+    proj_name = layer_name.split(".")[-1]
+
+    # Fused layers like gate_up_proj or qkv_proj will not be fused
+    # in the safetensors checkpoint. So, we convert the name
+    # from the fused version to unfused + check to make sure that
+    # each shard of the fused layer has the same scheme.
+    if proj_name in FUSED_LAYER_NAME_MAPPING:
+        shard_proj_names = FUSED_LAYER_NAME_MAPPING[proj_name]
+
+        # Convert fused_name --> [shard_names]
+        shard_names = [
+            layer_name.replace(proj_name, shard_proj_name)
+            for shard_proj_name in shard_proj_names
+        ]
+
+        # Layer should be ignored if shards are ignored.
+        should_ignore_layer = None
+        for shard_name in shard_names:
+            should_ignore_shard = check_equal_or_regex_match(
+                layer_name=shard_name, targets=ignore)
+
+            # If shard_idx=0, set layer ignore to match shard.
+            if should_ignore_layer is None:
+                should_ignore_layer = should_ignore_shard
+
+            # If shard_idx=1+ confirm scheme matches prior shards.
+            elif should_ignore_shard != should_ignore_layer:
+                raise ValueError(f"Found a different quantization schemes for "
+                                 f"{shard_proj_names} in {layer_name}. vLLM "
+                                 "requires all to use the same scheme.")
+
+    # Unfused layers like down_proj and o_proj will match
+    # the safetensors checkpoint already.
+    else:
+        should_ignore_layer = check_equal_or_regex_match(layer_name=layer_name,
+                                                         targets=ignore)
+
+    assert should_ignore_layer is not None
+    return should_ignore_layer
+
+
+def check_equal_or_regex_match(layer_name: str,
+                               targets: Iterable[str]) -> bool:
+    """
+    Checks whether a layer_name is exactly equal or a regex match for 
+    if target starts with 're:' to any target in list.
+    """
+    for target in targets:
+        if _is_equal_or_regex_match(layer_name, target):
+            return True
+    return False
+
+
+def _is_equal_or_regex_match(value: str,
+                             target: str,
+                             check_contains: bool = False) -> bool:
+    """
+    Checks whether a value is exactly equal or a regex match for target
+    if target starts with 're:'. If check_contains is set to True,
+    additionally checks if the target string is contained within the value.
+    """
+
+    if target.startswith("re:"):
+        pattern = target[3:]
+        if re.match(pattern, value):
+            return True
+    elif check_contains:
+        if target.lower() in value.lower():
+            return True
+    elif target == value:
+        return True
+    return False
diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py
index 089062ab53fc3..91225c0ddc917 100644
--- a/vllm/model_executor/models/aria.py
+++ b/vllm/model_executor/models/aria.py
@@ -13,8 +13,6 @@
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                                RowParallelLinear)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
-from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
-    get_compressed_tensors_cache_scale)
 from vllm.model_executor.layers.sampler import (SamplerOutput,
                                                 SamplingMetadata, get_sampler)
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
@@ -390,12 +388,15 @@ def load_weights(self, weights: Iterable[Tuple[str,
                 # Models trained using ColossalAI may include these tensors in
                 # the checkpoint. Skip them.
                 continue
-            if scale_name := get_compressed_tensors_cache_scale(name):
-                # Loading kv cache scales for compressed-tensors quantization
+            if (self.quant_config is not None and
+                (scale_name := self.quant_config.get_cache_scale(name))):
+                # Loading kv cache scales for quark and
+                # compressed-tensors quantization
                 param = params_dict[scale_name]
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
-                loaded_weight = loaded_weight[0]
+                loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else
+                                 loaded_weight[0])
                 weight_loader(param, loaded_weight)
                 loaded_params.add(scale_name)
                 continue
diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py
index 8d61ece289412..6517422697c04 100644
--- a/vllm/model_executor/models/commandr.py
+++ b/vllm/model_executor/models/commandr.py
@@ -437,6 +437,20 @@ def load_weights(self, weights: Iterable[Tuple[str,
         params_dict = dict(self.named_parameters())
         loaded_params: Set[str] = set()
         for name, loaded_weight in weights:
+
+            if (self.quant_config is not None and
+                (scale_name := self.quant_config.get_cache_scale(name))):
+                # Loading kv cache scales for quark and
+                # compressed-tensors quantization
+                param = params_dict[scale_name]
+                weight_loader = getattr(param, "weight_loader",
+                                        default_weight_loader)
+                loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else
+                                 loaded_weight[0])
+                weight_loader(param, loaded_weight)
+                loaded_params.add(scale_name)
+                continue
+
             for param_name, shard_name, shard_id in stacked_params_mapping:
                 if shard_name not in name:
                     continue
diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py
index 3932d8b52a9d1..ff1f1c2a939ff 100644
--- a/vllm/model_executor/models/dbrx.py
+++ b/vllm/model_executor/models/dbrx.py
@@ -83,7 +83,7 @@ def __init__(
 
     # Define custom weight loader for dbrx model
     def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor,
-                      weight_name: str):
+                      weight_name: str, param_name: str):
         tp_rank = get_tensor_model_parallel_rank()
         param_data = param.data
         shard_size = self.intermediate_size
@@ -91,25 +91,37 @@ def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor,
         # DBRX uses GLU for each experts.
         # GLU has 3 linear layers: w1, v1 and w2.
         if weight_name.endswith("w1"):
-            loaded_weight = torch.reshape(
-                loaded_weight,
-                [-1, self.intermediate_size * self.tp_size, self.d_model],
-            )
-            param_data[:, 0:shard_size, :] = loaded_weight[:, shard, :]
+            if param_name.endswith("weight"):
+                loaded_weight = torch.reshape(
+                    loaded_weight,
+                    [-1, self.intermediate_size * self.tp_size, self.d_model],
+                )
+                param_data[:, 0:shard_size, :] = loaded_weight[:, shard, :]
+            elif param_name.endswith("weight_scale"):
+                param_data[:, 0] = loaded_weight
+            else:
+                param_data = loaded_weight
         if weight_name.endswith("v1"):
-            loaded_weight = torch.reshape(
-                loaded_weight,
-                [-1, self.intermediate_size * self.tp_size, self.d_model],
-            )
-            param_data[:,
-                       shard_size:2 * shard_size, :] = loaded_weight[:,
-                                                                     shard, :]
+            if param_name.endswith("weight"):
+                loaded_weight = torch.reshape(
+                    loaded_weight,
+                    [-1, self.intermediate_size * self.tp_size, self.d_model],
+                )
+                param_data[:, shard_size:2 *
+                           shard_size, :] = loaded_weight[:, shard, :]
+            elif param_name.endswith("weight_scale"):
+                param_data[:, 1] = loaded_weight
+            else:
+                param_data[:] = loaded_weight
         if weight_name.endswith("w2"):
-            loaded_weight = torch.reshape(
-                loaded_weight,
-                [-1, self.intermediate_size * self.tp_size, self.d_model],
-            ).transpose(1, 2)
-            param_data[:] = loaded_weight[:, :, shard]
+            if param_name.endswith("weight"):
+                loaded_weight = torch.reshape(
+                    loaded_weight,
+                    [-1, self.intermediate_size * self.tp_size, self.d_model],
+                ).transpose(1, 2)
+                param_data[:] = loaded_weight[:, :, shard]
+            else:
+                param_data[:] = loaded_weight
 
 
 class DbrxMoE(nn.Module):
@@ -430,14 +442,29 @@ def sample(
 
     def load_weights(self, weights: Iterable[Tuple[str,
                                                    torch.Tensor]]) -> Set[str]:
-
         expert_params_mapping = [(
-            "w13_weight" if weight_name in ["w1", "v1"] else "w2_weight",
+            "w13" if weight_name in ["w1", "v1"] else "w2",
             f"mlp.{weight_name}",
         ) for weight_name in ["w1", "v1", "w2"]]
         params_dict = dict(self.named_parameters(remove_duplicate=False))
         loaded_params: Set[str] = set()
+
         for name, loaded_weight in weights:
+            if (self.quant_config is not None and
+                (scale_name := self.quant_config.get_cache_scale(name))):
+                # Loading kv cache scales for quark and
+                # compressed-tensors quantization
+                param = params_dict[scale_name]
+                weight_loader = getattr(param, "weight_loader",
+                                        default_weight_loader)
+                loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else
+                                 loaded_weight[0])
+                weight_loader(param, loaded_weight)
+                loaded_params.add(scale_name)
+                continue
+
+            if name.endswith(("w1", "w2", "v1")):
+                name = name + "_weight"
             for param_name, weight_name in expert_params_mapping:
                 if weight_name not in name:
                     continue
@@ -446,8 +473,9 @@ def load_weights(self, weights: Iterable[Tuple[str,
                     continue
                 param = params_dict[name]
                 weight_loader = param.weight_loader
-                weight_loader(param, loaded_weight, weight_name)
+                weight_loader(param, loaded_weight, weight_name, name)
                 break
+
             else:
                 # Remapping the name of FP8 kv-scale.
                 name = maybe_remap_kv_scale_name(name, params_dict)
@@ -456,6 +484,9 @@ def load_weights(self, weights: Iterable[Tuple[str,
 
                 if is_pp_missing_parameter(name, self):
                     continue
+                name = maybe_remap_kv_scale_name(name, params_dict)
+                if name is None:
+                    continue
                 param = params_dict[name]
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py
index ad15f835b1609..ac679d6ff43c7 100644
--- a/vllm/model_executor/models/exaone.py
+++ b/vllm/model_executor/models/exaone.py
@@ -39,8 +39,6 @@
                                                RowParallelLinear)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
-    get_compressed_tensors_cache_scale)
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
@@ -439,6 +437,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
 
         self.config = config
         self.lora_config = lora_config
+        self.quant_config = quant_config
 
         self.transformer = ExaoneModel(
             vllm_config=vllm_config,
@@ -532,12 +531,15 @@ def load_weights(self, weights: Iterable[Tuple[str,
             # processed with quantization, LoRA, fine-tuning, etc.
             if self.config.tie_word_embeddings and "lm_head.weight" in name:
                 continue
-            if scale_name := get_compressed_tensors_cache_scale(name):
-                # Loading kv cache scales for compressed-tensors quantization
+            if (self.quant_config is not None and
+                (scale_name := self.quant_config.get_cache_scale(name))):
+                # Loading kv cache scales for quark and
+                # compressed-tensors quantization
                 param = params_dict[scale_name]
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
-                loaded_weight = loaded_weight[0]
+                loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else
+                                 loaded_weight[0])
                 weight_loader(param, loaded_weight)
                 loaded_params.add(scale_name)
                 continue
diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py
index 698b9a5b6b1d6..f0dc7693974be 100644
--- a/vllm/model_executor/models/gemma2.py
+++ b/vllm/model_executor/models/gemma2.py
@@ -31,8 +31,6 @@
                                                RowParallelLinear)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
-    get_compressed_tensors_cache_scale)
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
@@ -254,6 +252,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
         self.config = config
+        self.quant_config = quant_config
 
         self.embed_tokens = VocabParallelEmbedding(
             config.vocab_size,
@@ -329,7 +328,8 @@ def load_weights(self, weights: Iterable[Tuple[str,
         params_dict = dict(self.named_parameters())
         loaded_params: Set[str] = set()
         for name, loaded_weight in weights:
-            if scale_name := get_compressed_tensors_cache_scale(name):
+            if (self.quant_config is not None and
+                (scale_name := self.quant_config.get_cache_scale(name))):
                 # Loading kv cache scales for compressed-tensors quantization
                 param = params_dict[scale_name]
                 weight_loader = getattr(param, "weight_loader",
diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py
index 4829578a56959..56343ca9a71ac 100644
--- a/vllm/model_executor/models/gpt_j.py
+++ b/vllm/model_executor/models/gpt_j.py
@@ -313,6 +313,20 @@ def load_weights(self, weights: Iterable[Tuple[str,
         for name, loaded_weight in weights:
             if "attn.bias" in name or "attn.masked_bias" in name:
                 continue
+
+            if (self.quant_config is not None and
+                (scale_name := self.quant_config.get_cache_scale(name))):
+                # Loading kv cache scales for quark and
+                # compressed-tensors quantization
+                param = params_dict[scale_name]
+                weight_loader = getattr(param, "weight_loader",
+                                        default_weight_loader)
+                loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else
+                                 loaded_weight[0])
+                weight_loader(param, loaded_weight)
+                loaded_params.add(scale_name)
+                continue
+
             for (param_name, weight_name, shard_id) in stacked_params_mapping:
                 if weight_name not in name:
                     continue
diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py
index 3e95926fd1e22..67e04b57658bc 100644
--- a/vllm/model_executor/models/granite.py
+++ b/vllm/model_executor/models/granite.py
@@ -39,8 +39,6 @@
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
-from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
-    get_compressed_tensors_cache_scale)
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
@@ -371,6 +369,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
 
         self.config = config
         self.lora_config = lora_config
+        self.quant_config = quant_config
 
         self.model = GraniteModel(vllm_config=vllm_config,
                                   prefix=maybe_prefix(prefix, "model"))
@@ -474,12 +473,15 @@ def load_weights(self, weights: Iterable[Tuple[str,
             # processed with quantization, LoRA, fine-tuning, etc.
             if self.config.tie_word_embeddings and "lm_head.weight" in name:
                 continue
-            if scale_name := get_compressed_tensors_cache_scale(name):
-                # Loading kv cache scales for compressed-tensors quantization
+            if (self.quant_config is not None and
+                (scale_name := self.quant_config.get_cache_scale(name))):
+                # Loading kv cache scales for quark and
+                # compressed-tensors quantization
                 param = params_dict[scale_name]
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
-                loaded_weight = loaded_weight[0]
+                loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else
+                                 loaded_weight[0])
                 weight_loader(param, loaded_weight)
                 loaded_params.add(scale_name)
                 continue
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index e8732c57fad49..4667f275ecd33 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -38,8 +38,6 @@
                                                RowParallelLinear)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
-    get_compressed_tensors_cache_scale)
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
@@ -306,6 +304,7 @@ def __init__(self,
         lora_config = vllm_config.lora_config
 
         self.config = config
+        self.quant_config = quant_config
         self.padding_idx = config.pad_token_id
         lora_vocab = (lora_config.lora_extra_vocab_size *
                       (lora_config.max_loras or 1)) if lora_config else 0
@@ -396,12 +395,15 @@ def load_weights(self, weights: Iterable[Tuple[str,
                 # Models trained using ColossalAI may include these tensors in
                 # the checkpoint. Skip them.
                 continue
-            if scale_name := get_compressed_tensors_cache_scale(name):
-                # Loading kv cache scales for compressed-tensors quantization
+            if (self.quant_config is not None and
+                (scale_name := self.quant_config.get_cache_scale(name))):
+                # Loading kv cache scales for quark and
+                # compressed-tensors quantization
                 param = params_dict[scale_name]
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
-                loaded_weight = loaded_weight[0]
+                loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else
+                                 loaded_weight[0])
                 weight_loader(param, loaded_weight)
                 loaded_params.add(scale_name)
                 continue
diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py
index a5b364fe5ec85..2c8895e842996 100644
--- a/vllm/model_executor/models/mixtral.py
+++ b/vllm/model_executor/models/mixtral.py
@@ -347,6 +347,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         lora_config = vllm_config.lora_config
         self.config = config
         self.lora_config = lora_config
+        self.quant_config = quant_config
 
         self.model = MixtralModel(vllm_config=vllm_config,
                                   prefix=maybe_prefix(prefix, "model"))
@@ -428,6 +429,19 @@ def load_weights(self, weights: Iterable[Tuple[str,
             if "rotary_emb.inv_freq" in name:
                 continue
 
+            if (self.quant_config is not None and
+                (scale_name := self.quant_config.get_cache_scale(name))):
+                # Loading kv cache scales for quark and
+                # compressed-tensors quantization
+                param = params_dict[scale_name]
+                weight_loader = getattr(param, "weight_loader",
+                                        default_weight_loader)
+                loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else
+                                 loaded_weight[0])
+                weight_loader(param, loaded_weight)
+                loaded_params.add(scale_name)
+                continue
+
             for (param_name, weight_name, shard_id) in stacked_params_mapping:
                 if weight_name not in name:
                     continue
diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py
index b2368ffff5412..bd261f31499cb 100644
--- a/vllm/model_executor/models/mllama.py
+++ b/vllm/model_executor/models/mllama.py
@@ -1116,6 +1116,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
+        self.quant_config = quant_config
         self.vocab_size = config.text_config.vocab_size
         self.hidden_size = config.text_config.hidden_size
         self.max_num_tiles = config.vision_config.max_num_tiles
@@ -1429,6 +1430,18 @@ def load_weights(self, weights: Iterable[Tuple[str,
                 name = name.replace('patch_embedding.weight',
                                     'patch_embedding._linear.weight')
                 loaded_weight = loaded_weight.view(loaded_weight.shape[0], -1)
+            if (self.quant_config is not None and
+                (scale_name := self.quant_config.get_cache_scale(name))):
+                # Loading kv cache scales for quark and
+                # compressed-tensors quantization
+                param = params_dict[scale_name]
+                weight_loader = getattr(param, "weight_loader",
+                                        default_weight_loader)
+                loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else
+                                 loaded_weight[0])
+                weight_loader(param, loaded_weight)
+                updated_params.add(scale_name)
+                continue
             for (param_name, weight_name, shard_id) in stacked_params_mapping:
                 if weight_name not in name:
                     continue
diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py
index 8cc62d5c803cc..e7875e6fb889f 100644
--- a/vllm/model_executor/models/nemotron.py
+++ b/vllm/model_executor/models/nemotron.py
@@ -405,6 +405,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
 
         self.config = config
         self.lora_config = lora_config
+        self.quant_config = quant_config
 
         self.model = NemotronModel(vllm_config=vllm_config,
                                    prefix=maybe_prefix(prefix, "model"))
@@ -489,6 +490,18 @@ def load_weights(self, weights: Iterable[Tuple[str,
                 # Models trained using ColossalAI may include these tensors in
                 # the checkpoint. Skip them.
                 continue
+            if (self.quant_config is not None and
+                (scale_name := self.quant_config.get_cache_scale(name))):
+                # Loading kv cache scales for quark and
+                # compressed-tensors quantization
+                param = params_dict[scale_name]
+                weight_loader = getattr(param, "weight_loader",
+                                        default_weight_loader)
+                loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else
+                                 loaded_weight[0])
+                weight_loader(param, loaded_weight)
+                loaded_params.add(scale_name)
+                continue
             for (param_name, weight_name, shard_id) in stacked_params_mapping:
                 if weight_name not in name:
                     continue
diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py
index 1febd62f2f705..dc76818e22cbb 100644
--- a/vllm/model_executor/models/phimoe.py
+++ b/vllm/model_executor/models/phimoe.py
@@ -546,6 +546,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         lora_config = vllm_config.lora_config
         self.config = config
         self.lora_config = lora_config
+        self.quant_config = vllm_config.quant_config
 
         self.model = PhiMoEModel(vllm_config=vllm_config,
                                  prefix=maybe_prefix(prefix, "model"))
@@ -623,6 +624,19 @@ def load_weights(self, weights: Iterable[Tuple[str,
             if "rotary_emb.inv_freq" in name:
                 continue
 
+            if (self.quant_config is not None and
+                (scale_name := self.quant_config.get_cache_scale(name))):
+                # Loading kv cache scales for quark and
+                # compressed-tensors quantization
+                param = params_dict[scale_name]
+                weight_loader = getattr(param, "weight_loader",
+                                        default_weight_loader)
+                loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else
+                                 loaded_weight[0])
+                weight_loader(param, loaded_weight)
+                loaded_params.add(scale_name)
+                continue
+
             for param_name, weight_name, shard_id in stacked_params_mapping:
                 if weight_name not in name:
                     continue
diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py
index 0a99c87470850..b9c259ad73c40 100644
--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
@@ -279,6 +279,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
                              ))
 
         self.config = config
+        self.quant_config = quant_config
         self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size
 
@@ -364,6 +365,18 @@ def load_weights(self, weights: Iterable[Tuple[str,
         for name, loaded_weight in weights:
             if "rotary_emb.inv_freq" in name:
                 continue
+            if (self.quant_config is not None and
+                (scale_name := self.quant_config.get_cache_scale(name))):
+                # Loading kv cache scales for quark and
+                # compressed-tensors quantization
+                param = params_dict[scale_name]
+                weight_loader = getattr(param, "weight_loader",
+                                        default_weight_loader)
+                loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else
+                                 loaded_weight[0])
+                weight_loader(param, loaded_weight)
+                loaded_params.add(scale_name)
+                continue
             for (param_name, weight_name, shard_id) in stacked_params_mapping:
                 if weight_name not in name:
                     continue
diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py
index e83d316f74de2..b27d2b10850f5 100644
--- a/vllm/model_executor/models/solar.py
+++ b/vllm/model_executor/models/solar.py
@@ -39,8 +39,6 @@
                                                RowParallelLinear)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
-    get_compressed_tensors_cache_scale)
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
@@ -409,6 +407,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         lora_config = vllm_config.lora_config
         self.config = config
         self.lora_config = lora_config
+        self.quant_config = quant_config
 
         self.model = SolarModel(
             vllm_config=vllm_config,
@@ -491,12 +490,15 @@ def load_weights(self, weights: Iterable[Tuple[str,
                 # Models trained using ColossalAI may include these tensors in
                 # the checkpoint. Skip them.
                 continue
-            if scale_name := get_compressed_tensors_cache_scale(name):
-                # Loading kv cache scales for compressed-tensors quantization
+            if (self.quant_config is not None and
+                (scale_name := self.quant_config.get_cache_scale(name))):
+                # Loading kv cache scales for quark and
+                # compressed-tensors quantization
                 param = params_dict[scale_name]
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
-                loaded_weight = loaded_weight[0]
+                loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else
+                                 loaded_weight[0])
                 weight_loader(param, loaded_weight)
                 loaded_params.add(scale_name)
                 continue
diff --git a/vllm/model_executor/parameter.py b/vllm/model_executor/parameter.py
index fc5a3e7fba674..a9ce8af15d3bb 100644
--- a/vllm/model_executor/parameter.py
+++ b/vllm/model_executor/parameter.py
@@ -56,8 +56,14 @@ def __init__(self, data: torch.Tensor, weight_loader: Callable):
     def weight_loader(self):
         return self._weight_loader
 
+    def _is_1d_and_scalar(self, loaded_weight: torch.Tensor):
+        cond1 = self.data.ndim == 1 and self.data.numel() == 1
+        cond2 = loaded_weight.ndim == 0 and loaded_weight.numel() == 1
+        return (cond1 and cond2)
+
     def _assert_and_load(self, loaded_weight: torch.Tensor):
-        assert self.data.shape == loaded_weight.shape
+        assert (self.data.shape == loaded_weight.shape
+                or self._is_1d_and_scalar(loaded_weight))
         self.data.copy_(loaded_weight)
 
     def load_column_parallel_weight(self, loaded_weight: torch.Tensor):
diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index 67a9e816cb658..5ef56406e1935 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -70,7 +70,7 @@ class RocmPlatform(Platform):
 
     supported_quantization: list[str] = [
         "awq", "gptq", "fp8", "compressed_tensors", "compressed-tensors",
-        "fbgemm_fp8", "gguf"
+        "fbgemm_fp8", "gguf", "quark"
     ]
 
     @classmethod

From 57e729e87478d734e8d0075e35aeb4c9bd440e77 Mon Sep 17 00:00:00 2001
From: maang-h <55082429+maang-h@users.noreply.github.com>
Date: Thu, 16 Jan 2025 00:07:45 +0800
Subject: [PATCH 231/462] [Doc]: Update `OpenAI-Compatible Server` documents
 (#12082)

---
 vllm/engine/arg_utils.py            | 16 ++++----
 vllm/entrypoints/openai/cli_args.py | 60 ++++++++++++++---------------
 2 files changed, 38 insertions(+), 38 deletions(-)

diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index c31b206d6f60e..03a8959a7d9ff 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -238,7 +238,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             choices=get_args(TaskOption),
             help='The task to use the model for. Each vLLM instance only '
             'supports one task, even if the same model can be used for '
-            'multiple tasks. When the model only supports one task, "auto" '
+            'multiple tasks. When the model only supports one task, ``"auto"`` '
             'can be used to select it; otherwise, you must specify explicitly '
             'which task to use.')
         parser.add_argument(
@@ -250,7 +250,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
         parser.add_argument(
             '--skip-tokenizer-init',
             action='store_true',
-            help='Skip initialization of tokenizer and detokenizer')
+            help='Skip initialization of tokenizer and detokenizer.')
         parser.add_argument(
             '--revision',
             type=nullable_str,
@@ -401,7 +401,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
         parser.add_argument(
             '--worker-use-ray',
             action='store_true',
-            help='Deprecated, use --distributed-executor-backend=ray.')
+            help='Deprecated, use ``--distributed-executor-backend=ray``.')
         parser.add_argument('--pipeline-parallel-size',
                             '-pp',
                             type=int,
@@ -430,7 +430,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
                             choices=[8, 16, 32, 64, 128],
                             help='Token block size for contiguous chunks of '
                             'tokens. This is ignored on neuron devices and '
-                            'set to max-model-len. On CUDA devices, '
+                            'set to ``--max-model-len``. On CUDA devices, '
                             'only block sizes up to 32 are supported. '
                             'On HPU devices, block size defaults to 128.')
 
@@ -439,12 +439,12 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             action=argparse.BooleanOptionalAction,
             default=EngineArgs.enable_prefix_caching,
             help="Enables automatic prefix caching. "
-            "Use --no-enable-prefix-caching to disable explicitly.",
+            "Use ``--no-enable-prefix-caching`` to disable explicitly.",
         )
         parser.add_argument('--disable-sliding-window',
                             action='store_true',
                             help='Disables sliding window, '
-                            'capping to sliding window size')
+                            'capping to sliding window size.')
         parser.add_argument('--use-v2-block-manager',
                             action='store_true',
                             default=True,
@@ -861,7 +861,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             "of the provided names. The model name in the model "
             "field of a response will be the first name in this "
             "list. If not specified, the model name will be the "
-            "same as the `--model` argument. Noted that this name(s) "
+            "same as the ``--model`` argument. Noted that this name(s) "
             "will also be used in `model_name` tag content of "
             "prometheus metrics, if multiple names provided, metrics "
             "tag will take the first one.")
@@ -881,7 +881,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             default=None,
             help="Valid choices are " +
             ",".join(ALLOWED_DETAILED_TRACE_MODULES) +
-            ". It makes sense to set this only if --otlp-traces-endpoint is"
+            ". It makes sense to set this only if ``--otlp-traces-endpoint`` is"
             " set. If set, it will collect detailed traces for the specified "
             "modules. This involves use of possibly costly and or blocking "
             "operations and hence might have a performance impact.")
diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py
index 22206ef8dbfe6..35445449463e9 100644
--- a/vllm/entrypoints/openai/cli_args.py
+++ b/vllm/entrypoints/openai/cli_args.py
@@ -79,29 +79,29 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
     parser.add_argument("--host",
                         type=nullable_str,
                         default=None,
-                        help="host name")
-    parser.add_argument("--port", type=int, default=8000, help="port number")
+                        help="Host name.")
+    parser.add_argument("--port", type=int, default=8000, help="Port number.")
     parser.add_argument(
         "--uvicorn-log-level",
         type=str,
         default="info",
         choices=['debug', 'info', 'warning', 'error', 'critical', 'trace'],
-        help="log level for uvicorn")
+        help="Log level for uvicorn.")
     parser.add_argument("--allow-credentials",
                         action="store_true",
-                        help="allow credentials")
+                        help="Allow credentials.")
     parser.add_argument("--allowed-origins",
                         type=json.loads,
                         default=["*"],
-                        help="allowed origins")
+                        help="Allowed origins.")
     parser.add_argument("--allowed-methods",
                         type=json.loads,
                         default=["*"],
-                        help="allowed methods")
+                        help="Allowed methods.")
     parser.add_argument("--allowed-headers",
                         type=json.loads,
                         default=["*"],
-                        help="allowed headers")
+                        help="Allowed headers.")
     parser.add_argument("--api-key",
                         type=nullable_str,
                         default=None,
@@ -115,10 +115,10 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
         action=LoRAParserAction,
         help="LoRA module configurations in either 'name=path' format"
         "or JSON format. "
-        "Example (old format): 'name=path' "
+        "Example (old format): ``'name=path'`` "
         "Example (new format): "
-        "'{\"name\": \"name\", \"local_path\": \"path\", "
-        "\"base_model_name\": \"id\"}'")
+        "``{\"name\": \"name\", \"local_path\": \"path\", "
+        "\"base_model_name\": \"id\"}``")
     parser.add_argument(
         "--prompt-adapters",
         type=nullable_str,
@@ -132,7 +132,7 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
                         default=None,
                         help="The file path to the chat template, "
                         "or the template in single-line form "
-                        "for the specified model")
+                        "for the specified model.")
     parser.add_argument(
         '--chat-template-content-format',
         type=str,
@@ -141,38 +141,39 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
         help='The format to render message content within a chat template.'
         '\n\n'
         '* "string" will render the content as a string. '
-        'Example: "Hello World"\n'
+        'Example: ``"Hello World"``\n'
         '* "openai" will render the content as a list of dictionaries, '
         'similar to OpenAI schema. '
-        'Example: [{"type": "text", "text": "Hello world!"}]')
+        'Example: ``[{"type": "text", "text": "Hello world!"}]``')
     parser.add_argument("--response-role",
                         type=nullable_str,
                         default="assistant",
                         help="The role name to return if "
-                        "`request.add_generation_prompt=true`.")
+                        "``request.add_generation_prompt=true``.")
     parser.add_argument("--ssl-keyfile",
                         type=nullable_str,
                         default=None,
-                        help="The file path to the SSL key file")
+                        help="The file path to the SSL key file.")
     parser.add_argument("--ssl-certfile",
                         type=nullable_str,
                         default=None,
-                        help="The file path to the SSL cert file")
+                        help="The file path to the SSL cert file.")
     parser.add_argument("--ssl-ca-certs",
                         type=nullable_str,
                         default=None,
-                        help="The CA certificates file")
+                        help="The CA certificates file.")
     parser.add_argument(
         "--ssl-cert-reqs",
         type=int,
         default=int(ssl.CERT_NONE),
-        help="Whether client certificate is required (see stdlib ssl module's)"
+        help="Whether client certificate is required (see stdlib ssl module's)."
     )
     parser.add_argument(
         "--root-path",
         type=nullable_str,
         default=None,
-        help="FastAPI root_path when app is behind a path based routing proxy")
+        help="FastAPI root_path when app is behind a path based routing proxy."
+    )
     parser.add_argument(
         "--middleware",
         type=nullable_str,
@@ -182,15 +183,15 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
         "We accept multiple --middleware arguments. "
         "The value should be an import path. "
         "If a function is provided, vLLM will add it to the server "
-        "using @app.middleware('http'). "
+        "using ``@app.middleware('http')``. "
         "If a class is provided, vLLM will add it to the server "
-        "using app.add_middleware(). ")
+        "using ``app.add_middleware()``. ")
     parser.add_argument(
         "--return-tokens-as-token-ids",
         action="store_true",
-        help="When --max-logprobs is specified, represents single tokens as "
-        "strings of the form 'token_id:{token_id}' so that tokens that "
-        "are not JSON-encodable can be identified.")
+        help="When ``--max-logprobs`` is specified, represents single tokens "
+        " as strings of the form 'token_id:{token_id}' so that tokens "
+        "that are not JSON-encodable can be identified.")
     parser.add_argument(
         "--disable-frontend-multiprocessing",
         action="store_true",
@@ -205,9 +206,8 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
         "--enable-auto-tool-choice",
         action="store_true",
         default=False,
-        help=
-        "Enable auto tool choice for supported models. Use --tool-call-parser"
-        " to specify which parser to use")
+        help="Enable auto tool choice for supported models. Use "
+        "``--tool-call-parser`` to specify which parser to use.")
 
     valid_tool_parsers = ToolParserManager.tool_parsers.keys()
     parser.add_argument(
@@ -219,7 +219,7 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
         help=
         "Select the tool call parser depending on the model that you're using."
         " This is used to parse the model-generated tool call into OpenAI API "
-        "format. Required for --enable-auto-tool-choice.")
+        "format. Required for ``--enable-auto-tool-choice``.")
 
     parser.add_argument(
         "--tool-parser-plugin",
@@ -228,7 +228,7 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
         help=
         "Special the tool parser plugin write to parse the model-generated tool"
         " into OpenAI API format, the name register in this plugin can be used "
-        "in --tool-call-parser.")
+        "in ``--tool-call-parser``.")
 
     parser = AsyncEngineArgs.add_cli_args(parser)
 
@@ -243,7 +243,7 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
         "--disable-fastapi-docs",
         action='store_true',
         default=False,
-        help="Disable FastAPI's OpenAPI schema, Swagger UI, and ReDoc endpoint"
+        help="Disable FastAPI's OpenAPI schema, Swagger UI, and ReDoc endpoint."
     )
     parser.add_argument(
         "--enable-prompt-tokens-details",

From edce722eaa5e9f0b97bea611531e3341ec2e2e71 Mon Sep 17 00:00:00 2001
From: Joe Runde <Joseph.Runde@ibm.com>
Date: Wed, 15 Jan 2025 09:31:01 -0700
Subject: [PATCH 232/462] [Bugfix] use right truncation for non-generative
 tasks (#12050)

Signed-off-by: Joe Runde <Joseph.Runde@ibm.com>
---
 tests/entrypoints/llm/test_encode.py                | 7 +++++++
 vllm/config.py                                      | 4 ++++
 vllm/transformers_utils/tokenizer_group/__init__.py | 3 ++-
 3 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/tests/entrypoints/llm/test_encode.py b/tests/entrypoints/llm/test_encode.py
index 41163809237e9..3906ad766e0b6 100644
--- a/tests/entrypoints/llm/test_encode.py
+++ b/tests/entrypoints/llm/test_encode.py
@@ -105,3 +105,10 @@ def test_multiple_pooling_params(llm: LLM):
     # pooling_params is None, default params should be applied
     outputs = llm.encode(PROMPTS, pooling_params=None)
     assert len(PROMPTS) == len(outputs)
+
+
+@pytest.mark.skip_global_cleanup
+def test_right_side_truncation(llm: LLM):
+    # Embeddings models should truncate the end of the prompt
+    tokenizer = llm.get_tokenizer()
+    assert tokenizer.truncation_side == "right"
diff --git a/vllm/config.py b/vllm/config.py
index 65cb0d85f172a..4ffc13a05e026 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -357,6 +357,10 @@ def __init__(self,
         supported_tasks, task = self._resolve_task(task, self.hf_config)
         self.supported_tasks = supported_tasks
         self.task: Final = task
+        if self.task in ("draft", "generate"):
+            self.truncation_side = "left"
+        else:
+            self.truncation_side = "right"
 
         self.pooler_config = self._init_pooler_config(override_pooler_config)
         self.logits_processor_pattern = logits_processor_pattern
diff --git a/vllm/transformers_utils/tokenizer_group/__init__.py b/vllm/transformers_utils/tokenizer_group/__init__.py
index d400276796996..09569c564a58d 100644
--- a/vllm/transformers_utils/tokenizer_group/__init__.py
+++ b/vllm/transformers_utils/tokenizer_group/__init__.py
@@ -24,7 +24,8 @@ def init_tokenizer_from_configs(model_config: ModelConfig,
                        max_input_length=None,
                        tokenizer_mode=model_config.tokenizer_mode,
                        trust_remote_code=model_config.trust_remote_code,
-                       revision=model_config.tokenizer_revision)
+                       revision=model_config.tokenizer_revision,
+                       truncation_side=model_config.truncation_side)
 
     return get_tokenizer_group(parallel_config.tokenizer_pool_config,
                                **init_kwargs)

From 70755e819e0ae5d963dab7d81321bdfaef6d955a Mon Sep 17 00:00:00 2001
From: Roger Wang <136131678+ywang96@users.noreply.github.com>
Date: Wed, 15 Jan 2025 11:29:00 -0800
Subject: [PATCH 233/462] [V1][Core] Autotune encoder cache budget (#11895)

Signed-off-by: Roger Wang <ywang@roblox.com>
---
 vllm/config.py                        | 15 ++++--
 vllm/multimodal/registry.py           | 29 ++++++++--
 vllm/v1/core/encoder_cache_manager.py | 78 ++++++++++++++++++++++++++-
 vllm/v1/core/scheduler.py             | 26 ++++++---
 vllm/v1/engine/core.py                |  9 ++--
 vllm/v1/worker/gpu_model_runner.py    | 60 +++++++++++----------
 6 files changed, 167 insertions(+), 50 deletions(-)

diff --git a/vllm/config.py b/vllm/config.py
index 4ffc13a05e026..e64883368a751 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1387,13 +1387,15 @@ class SchedulerConfig:
 
     is_multimodal_model: bool = False
 
-    # FIXME(woosuk & ywang96): Below are placeholder values. We need to
-    # calculate the actual values from the configurations.
-    # Multimodal encoder run compute budget, only used in V1
-    max_num_encoder_input_tokens = 16384
+    # NOTE: The following multimodal encoder budget will be initialized to
+    # max_num_batched_tokens and overridden in case max multimodal embedding
+    # size is larger.
+    # TODO (ywang96): Make these configurable.
+    # Multimodal encoder compute budget, only used in V1
+    max_num_encoder_input_tokens: int = field(default=None)  # type: ignore
 
     # Multimodal encoder cache size, only used in V1
-    encoder_cache_size = 16384
+    encoder_cache_size: int = field(default=None)  # type: ignore
 
     # Whether to perform preemption by swapping or
     # recomputation. If not specified, we determine the mode as follows:
@@ -1467,6 +1469,9 @@ def __post_init__(self) -> None:
                     _MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS,
                 )
 
+        self.max_num_encoder_input_tokens = self.max_num_batched_tokens
+        self.encoder_cache_size = self.max_num_batched_tokens
+
         if self.enable_chunked_prefill:
             logger.info(
                 "Chunked prefill is enabled with max_num_batched_tokens=%d.",
diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index 2961f7c76ca12..aaf7ff34ca573 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -252,11 +252,8 @@ def get_max_tokens_per_item_by_modality(
         model_config: "ModelConfig",
     ) -> Mapping[str, int]:
         """
-        Get the maximum number of tokens per data item from each modality
-        for profiling the memory usage of a model.
-
-        Note:
-            This is currently directly used only in V1.
+        Get the maximum number of tokens per data item from each modality based 
+        on underlying model configuration.
         """
         if self.has_processor(model_config):
             tokenizer = cached_get_tokenizer(
@@ -272,6 +269,28 @@ def get_max_tokens_per_item_by_modality(
             for key, plugin in self._plugins.items()
         }
 
+    def get_max_tokens_per_item_by_nonzero_modality(
+        self,
+        model_config: "ModelConfig",
+    ) -> Mapping[str, int]:
+        """
+        Get the maximum number of tokens per data item from each modality based
+        on underlying model configuration, excluding modalities that user 
+        explicitly disabled via `limit_mm_per_prompt`.
+
+        Note:
+            This is currently directly used only in V1 for profiling the memory 
+            usage of a model.
+        """
+        limits_per_plugin = self._limits_by_model[model_config]
+
+        return {
+            key: max_tokens_per_mm_item
+            for key, max_tokens_per_mm_item in
+            self.get_max_tokens_per_item_by_modality(model_config).items()
+            if limits_per_plugin[key] > 0
+        }
+
     def get_max_tokens_by_modality(
         self,
         model_config: "ModelConfig",
diff --git a/vllm/v1/core/encoder_cache_manager.py b/vllm/v1/core/encoder_cache_manager.py
index 845bd5ea05e3c..0cd8c806a3e47 100644
--- a/vllm/v1/core/encoder_cache_manager.py
+++ b/vllm/v1/core/encoder_cache_manager.py
@@ -1,7 +1,14 @@
-from typing import Dict, List, Set, Tuple
+from typing import TYPE_CHECKING, Dict, List, Set, Tuple
 
+from vllm.logger import init_logger
+from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.v1.request import Request
 
+if TYPE_CHECKING:
+    from vllm.config import ModelConfig, SchedulerConfig
+
+logger = init_logger(__name__)
+
 
 class EncoderCacheManager:
 
@@ -46,3 +53,72 @@ def get_freed_ids(self) -> List[Tuple[str, int]]:
         freed = self.freed
         self.freed = []
         return freed
+
+
+def compute_encoder_budget(
+    model_config: "ModelConfig",
+    scheduler_config: "SchedulerConfig",
+) -> Tuple[int, int]:
+    """Compute the encoder cache budget based on the model and scheduler 
+    configurations.
+
+    Args:
+        model_config: Model configuration.
+        scheduler_config: Scheduler configuration.
+
+    Returns:
+        - Compute budget for encoder execution, in unit of number of tokens 
+            in the input sequence.
+        - Space budget for encoder cache size, in unit of number of tokens 
+            in the input sequence.
+    """
+
+    if not model_config.is_multimodal_model:
+        return 0, 0
+
+    # TODO: handle encoder-decoder models once we support them.
+    (
+        encoder_compute_budget,
+        encoder_cache_size,
+    ) = _compute_encoder_budget_multimodal(model_config, scheduler_config)
+
+    return encoder_compute_budget, encoder_cache_size
+
+
+def _compute_encoder_budget_multimodal(
+    model_config: "ModelConfig",
+    scheduler_config: "SchedulerConfig",
+) -> Tuple[int, int]:
+    """Compute the encoder cache budget based on the model and scheduler 
+    configurations for a multimodal model.
+
+    Args:
+        model_config: Model configuration.
+        scheduler_config: Scheduler configuration.
+
+    Returns:
+        - Compute budget for encoder execution, in unit of number of tokens 
+            in the input sequence.
+        - Space budget for encoder cache size, in unit of number of tokens 
+            in the input sequence.
+    """
+
+    max_tokens_by_modality_dict = MULTIMODAL_REGISTRY.get_max_tokens_per_item_by_nonzero_modality(  # noqa: E501
+        model_config)
+
+    if not max_tokens_by_modality_dict:
+        logger.warning(
+            "All non-text modalities supported by the model have been "
+            "explicitly disabled via limit_mm_per_prompt. Encoder cache will "
+            "not be initialized.")
+        return 0, 0
+
+    _, max_tokens_per_mm_item = max(max_tokens_by_modality_dict.items(),
+                                    key=lambda item: item[1])
+
+    encoder_compute_budget = max(scheduler_config.max_num_encoder_input_tokens,
+                                 max_tokens_per_mm_item)
+    encoder_cache_size = max(scheduler_config.encoder_cache_size,
+                             max_tokens_per_mm_item)
+
+    return encoder_compute_budget, encoder_cache_size
diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py
index 45e67c94f8f15..64df21d59fef4 100644
--- a/vllm/v1/core/scheduler.py
+++ b/vllm/v1/core/scheduler.py
@@ -3,10 +3,11 @@
 from typing import (TYPE_CHECKING, Deque, Dict, Iterable, List, Optional, Set,
                     Tuple, Union)
 
-from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig
+from vllm.config import CacheConfig, LoRAConfig, ModelConfig, SchedulerConfig
 from vllm.logger import init_logger
 from vllm.sampling_params import SamplingParams
-from vllm.v1.core.encoder_cache_manager import EncoderCacheManager
+from vllm.v1.core.encoder_cache_manager import (EncoderCacheManager,
+                                                compute_encoder_budget)
 from vllm.v1.core.kv_cache_manager import KVCacheManager
 from vllm.v1.engine import EngineCoreOutput, EngineCoreOutputs
 from vllm.v1.metrics.stats import SchedulerStats
@@ -25,6 +26,7 @@ class Scheduler:
     def __init__(
         self,
         scheduler_config: SchedulerConfig,
+        model_config: ModelConfig,
         cache_config: CacheConfig,
         lora_config: Optional[LoRAConfig],
     ) -> None:
@@ -69,16 +71,24 @@ def __init__(
         self.running_reqs_data: Dict[str, RunningRequestData] = {}
 
         # Encoder-related.
+        # Calculate encoder cache size if applicable
+        # NOTE: For now we use the same budget for both compute and space.
+        # This can be changed when we make encoder cache for embedding caching
+        # across requests.
+        encoder_compute_budget, encoder_cache_size = compute_encoder_budget(
+            model_config=model_config,
+            scheduler_config=scheduler_config,
+        )
+
         # NOTE(woosuk): Here, "encoder" includes the vision encoder (and
         # projector if needed). Currently, we assume that the encoder also
         # has the Transformer architecture (e.g., ViT).
-        self.max_num_encoder_input_tokens = self.scheduler_config.max_num_encoder_input_tokens  #noqa: E501
-        # NOTE(woosuk): For the models without encoder (e.g., text-only models),
-        # the encoder cache will not be initialized and used, regardless of
-        # the cache size. This is because the memory space for the encoder cache
-        # is preallocated in the profiling run.
+        self.max_num_encoder_input_tokens = encoder_compute_budget
+        # NOTE: For the models without encoder (e.g., text-only models),
+        # the encoder cache will not be initialized because cache size is 0
+        # for these models.
         self.encoder_cache_manager = EncoderCacheManager(
-            cache_size=self.scheduler_config.encoder_cache_size)
+            cache_size=encoder_cache_size)
 
     def schedule(self) -> "SchedulerOutput":
         # NOTE(woosuk) on the scheduling algorithm:
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index e7f90d3c62142..ef616229aa57b 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -54,9 +54,12 @@ def __init__(
         vllm_config.cache_config.num_cpu_blocks = num_cpu_blocks
 
         # Setup scheduler.
-        self.scheduler = Scheduler(vllm_config.scheduler_config,
-                                   vllm_config.cache_config,
-                                   vllm_config.lora_config)
+        self.scheduler = Scheduler(
+            scheduler_config=vllm_config.scheduler_config,
+            model_config=vllm_config.model_config,
+            cache_config=vllm_config.cache_config,
+            lora_config=vllm_config.lora_config,
+        )
 
         self.mm_input_mapper_server = MMInputMapperServer(
             vllm_config.model_config)
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index fb87dc5a8222a..de83640b27cd6 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -20,6 +20,7 @@
                         is_pin_memory_available)
 from vllm.v1.attention.backends.flash_attn import (FlashAttentionBackend,
                                                    FlashAttentionMetadata)
+from vllm.v1.core.encoder_cache_manager import compute_encoder_budget
 from vllm.v1.engine.mm_input_mapper import MMInputMapperClient
 from vllm.v1.outputs import ModelRunnerOutput
 from vllm.v1.sample.metadata import SamplingMetadata
@@ -88,8 +89,12 @@ def __init__(
         self.mm_input_mapper_profiling = MMInputMapperClient(self.model_config)
         self.mm_input_mapper_profiling.use_cache = False
 
-        self.max_num_encoder_input_tokens = self.scheduler_config.max_num_encoder_input_tokens  # noqa: E501
-        self.encoder_cache_size = self.scheduler_config.encoder_cache_size
+        encoder_compute_budget, encoder_cache_size = compute_encoder_budget(
+            model_config=model_config,
+            scheduler_config=scheduler_config,
+        )
+        self.max_num_encoder_input_tokens = encoder_compute_budget
+        self.encoder_cache_size = encoder_cache_size
 
         # Lazy initialization
         # self.model: nn.Module  # Set after load_model
@@ -721,44 +726,30 @@ def profile_run(self) -> None:
         ]
 
         # Profile with multimodal encoder & encoder cache.
-        if self.is_multimodal_model:
-
-            # Create dummy batch of multimodal inputs.
-            dummy_request_data = self.input_registry.dummy_data_for_profiling(
-                model_config=self.model_config,
-                seq_len=self.max_num_tokens,
-                mm_registry=self.mm_registry,
-            )
-            dummy_mm_data = dummy_request_data.multi_modal_data
+        # TODO: handle encoder-decoder models once we support them.
+        if (self.is_multimodal_model and self.max_num_encoder_input_tokens > 0
+                and self.encoder_cache_size > 0):
 
             # NOTE: Currently model is profiled with a single non-text
             # modality with the max possible input tokens even when
             # it supports multiple.
-            max_tokens_by_modality_dict = self.mm_registry.get_max_tokens_per_item_by_modality(  # noqa: E501
+            max_tokens_by_modality_dict = MULTIMODAL_REGISTRY.get_max_tokens_per_item_by_nonzero_modality(  # noqa: E501
                 self.model_config)
-
             dummy_data_modality, max_tokens_per_mm_item = max(
                 max_tokens_by_modality_dict.items(), key=lambda item: item[1])
 
             # Check how many items of this modality can be supported by
-            # the encoder cache budget.
-            encoder_cache_budget = min(self.max_num_encoder_input_tokens,
-                                       self.encoder_cache_size)
-            max_num_mm_items_encoder_budget = encoder_cache_budget // \
-                max_tokens_per_mm_item
-
-            # TODO: Allow users to set encoder_cache_budget in case this
-            # happens.
-            assert max_num_mm_items_encoder_budget > 0, (
-                f"Encoder cache budget={encoder_cache_budget} is too small to "
-                f"support the maximum possible size of multimodal embeddings"
-                f"={max_tokens_per_mm_item}.")
+            # the encoder budget.
+            encoder_budget = min(self.max_num_encoder_input_tokens,
+                                 self.encoder_cache_size)
+
+            max_num_mm_items_encoder_budget = cdiv(encoder_budget,
+                                                   max_tokens_per_mm_item)
 
             # Check how many items of this modality can be supported by
             # the decoder budget.
-            max_mm_items_per_req = max(
-                self.mm_registry.get_mm_limits_per_prompt(
-                    self.model_config).values())
+            max_mm_items_per_req = self.mm_registry.get_mm_limits_per_prompt(
+                self.model_config)[dummy_data_modality]
 
             # NOTE: We do not consider max_num_batched_tokens on purpose
             # because the multimodal embeddings can be generated in advance
@@ -769,6 +760,19 @@ def profile_run(self) -> None:
             max_num_mm_items = min(max_num_mm_items_encoder_budget,
                                    max_num_mm_items_decoder_budget)
 
+            logger.info(
+                "Encoder cache will be initialized with a budget of %s tokens,"
+                " and profiled with %s %s items of the maximum feature size.",
+                encoder_budget, max_num_mm_items, dummy_data_modality)
+
+            # Create dummy batch of multimodal inputs.
+            dummy_request_data = self.input_registry.dummy_data_for_profiling(
+                model_config=self.model_config,
+                seq_len=self.max_num_tokens,
+                mm_registry=self.mm_registry,
+            )
+            dummy_mm_data = dummy_request_data.multi_modal_data
+
             # Dummy data definition in V0 may contain multiple multimodal items
             # (e.g, multiple images) for a single request, therefore here we
             # always replicate first item by max_num_mm_items times since in V1

From ebd8c669efa54a218eb83735fd7ba40922f5f3ad Mon Sep 17 00:00:00 2001
From: Varun Sundar Rabindranath <varunsundar08@gmail.com>
Date: Thu, 16 Jan 2025 01:29:42 +0530
Subject: [PATCH 234/462] [Bugfix] Fix _get_lora_device for HQQ marlin (#12090)

Signed-off-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
---
 vllm/lora/layers.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py
index a933ccaecf15e..dd981ffce8833 100644
--- a/vllm/lora/layers.py
+++ b/vllm/lora/layers.py
@@ -51,6 +51,9 @@ def _get_lora_device(base_layer: nn.Module) -> torch.device:
     # marlin
     elif hasattr(base_layer, "B"):
         return base_layer.B.device
+    # HQQ marlin
+    elif hasattr(base_layer, "W_q"):
+        return base_layer.W_q.device
     else:
         raise ValueError(f"Unsupported base layer: {base_layer}")
 

From cd9d06fb8d1f89fc1bcc9305bc20d57c6d8b73d8 Mon Sep 17 00:00:00 2001
From: tvirolai-amd <teemu.virolainen@amd.com>
Date: Wed, 15 Jan 2025 23:46:03 +0200
Subject: [PATCH 235/462] Allow hip sources to be directly included when
 compiling for rocm. (#12087)

---
 cmake/utils.cmake | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cmake/utils.cmake b/cmake/utils.cmake
index 40430dae10c5b..15b09395a889f 100644
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@@ -58,8 +58,8 @@ function (hipify_sources_target OUT_SRCS NAME ORIG_SRCS)
   #
   set(SRCS ${ORIG_SRCS})
   set(CXX_SRCS ${ORIG_SRCS})
-  list(FILTER SRCS EXCLUDE REGEX "\.(cc)|(cpp)$")
-  list(FILTER CXX_SRCS INCLUDE REGEX "\.(cc)|(cpp)$")
+  list(FILTER SRCS EXCLUDE REGEX "\.(cc)|(cpp)|(hip)$")
+  list(FILTER CXX_SRCS INCLUDE REGEX "\.(cc)|(cpp)|(hip)$")
 
   #
   # Generate ROCm/HIP source file names from CUDA file names.

From fa0050db08660535368ec5ea41d313bdeb69909d Mon Sep 17 00:00:00 2001
From: Elfie Guo <164945471+elfiegg@users.noreply.github.com>
Date: Wed, 15 Jan 2025 20:31:27 -0800
Subject: [PATCH 236/462] [Core] Default to using per_token quantization for
 fp8 when cutlass is supported. (#8651)

Signed-off-by: mgoin <michael@neuralmagic.com>
Co-authored-by: Michael Goin <mgoin@redhat.com>
Co-authored-by: mgoin <michael@neuralmagic.com>
---
 vllm/model_executor/layers/quantization/fp8.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index a1be45a49e94a..4969ee559522e 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -355,7 +355,8 @@ def apply(self,
             input_scale=layer.input_scale,
             bias=bias,
             cutlass_fp8_supported=self.cutlass_fp8_supported,
-            use_per_token_if_dynamic=False)
+            # Default to using per_token quantization if cutlass is supported
+            use_per_token_if_dynamic=self.cutlass_fp8_supported)
 
 
 class Fp8MoEMethod(FusedMoEMethodBase):

From f8ef146f03da8993fb3bf5638b28bef6e931fc51 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Thu, 16 Jan 2025 15:53:43 +0800
Subject: [PATCH 237/462] [Doc] Add documentation for specifying model
 architecture (#12105)

---
 docs/source/serving/offline_inference.md | 53 ++++++++++++++++++++++++
 1 file changed, 53 insertions(+)

diff --git a/docs/source/serving/offline_inference.md b/docs/source/serving/offline_inference.md
index 94703a1c32ade..1f5a54f755f13 100644
--- a/docs/source/serving/offline_inference.md
+++ b/docs/source/serving/offline_inference.md
@@ -31,6 +31,59 @@ Please refer to the above pages for more details about each API.
 This section lists the most common options for running the vLLM engine.
 For a full list, refer to the [Engine Arguments](#engine-args) page.
 
+### Model resolution
+
+vLLM loads HuggingFace-compatible models by inspecting the `architectures` field in `config.json` of the model repository
+and finding the corresponding implementation that is registered to vLLM.
+Nevertheless, our model resolution may fail for the following reasons:
+
+- The `config.json` of the model repository lacks the `architectures` field.
+- Unofficial repositories refer to a model using alternative names which are not recorded in vLLM.
+- The same architecture name is used for multiple models, creating ambiguity as to which model should be loaded.
+
+In those cases, vLLM may throw an error like:
+
+```text
+Traceback (most recent call last):
+...
+  File "vllm/model_executor/models/registry.py", line xxx, in inspect_model_cls
+    for arch in architectures:
+TypeError: 'NoneType' object is not iterable
+```
+
+or:
+
+```text
+  File "vllm/model_executor/models/registry.py", line xxx, in _raise_for_unsupported
+    raise ValueError(
+ValueError: Model architectures ['<arch>'] are not supported for now. Supported architectures: [...]
+```
+
+:::{note}
+The above error is distinct from the following similar but different error:
+
+```text
+  File "vllm/model_executor/models/registry.py", line xxx, in _raise_for_unsupported
+    raise ValueError(
+ValueError: Model architectures ['<arch>'] failed to be inspected. Please check the logs for more details.
+```
+
+This error means that vLLM failed to import the model file. Usually, it is related to missing dependencies or outdated
+binaries in the vLLM build. Please read the logs carefully to determine the real cause of the error.
+:::
+
+To fix this, explicitly specify the model architecture by passing `config.json` overrides to the `hf_overrides` option.
+For example:
+
+```python
+model = LLM(
+    model="cerebras/Cerebras-GPT-1.3B",
+    hf_overrides={"architectures": ["GPT2LMHeadModel"]},  # GPT-2
+)
+```
+
+Our [list of supported models](#supported-models) shows the model architectures that are recognized by vLLM.
+
 ### Reducing memory usage
 
 Large models might cause your machine to run out of memory (OOM). Here are some options that help alleviate this problem.

From 9aa1519f089e8dc2c2bd1b4c74a8ce47d386f0a9 Mon Sep 17 00:00:00 2001
From: Michael Goin <michael@neuralmagic.com>
Date: Thu, 16 Jan 2025 04:59:06 -0500
Subject: [PATCH 238/462] Various cosmetic/comment fixes (#12089)

Signed-off-by: mgoin <michael@neuralmagic.com>
---
 .../compressed_tensors/schemes/compressed_tensors_24.py    | 2 +-
 vllm/model_executor/models/aria.py                         | 3 +--
 vllm/model_executor/models/commandr.py                     | 3 +--
 vllm/model_executor/models/dbrx.py                         | 3 +--
 vllm/model_executor/models/exaone.py                       | 3 +--
 vllm/model_executor/models/gpt_j.py                        | 3 +--
 vllm/model_executor/models/granite.py                      | 3 +--
 vllm/model_executor/models/llama.py                        | 7 +++----
 vllm/model_executor/models/mixtral.py                      | 3 +--
 vllm/model_executor/models/mllama.py                       | 3 +--
 vllm/model_executor/models/nemotron.py                     | 3 +--
 vllm/model_executor/models/phimoe.py                       | 3 +--
 vllm/model_executor/models/qwen2.py                        | 3 +--
 vllm/model_executor/models/solar.py                        | 3 +--
 14 files changed, 16 insertions(+), 29 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py
index bc697ef93b34b..21e6fe7a22616 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py
@@ -42,7 +42,7 @@ def create_weights(self, layer: torch.nn.Module, input_size: int,
 
         if not sparse_cutlass_supported():
             raise ValueError(
-                "Sparse CUTLASS not supported. vLLM must be built with"
+                "Sparse CUTLASS not supported. vLLM must be built with "
                 "CUDA 12.2 or later to use this feature")
 
         self.output_dtype = params_dtype
diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py
index 91225c0ddc917..5b97eced62df0 100644
--- a/vllm/model_executor/models/aria.py
+++ b/vllm/model_executor/models/aria.py
@@ -390,8 +390,7 @@ def load_weights(self, weights: Iterable[Tuple[str,
                 continue
             if (self.quant_config is not None and
                 (scale_name := self.quant_config.get_cache_scale(name))):
-                # Loading kv cache scales for quark and
-                # compressed-tensors quantization
+                # Loading kv cache quantization scales
                 param = params_dict[scale_name]
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py
index 6517422697c04..989056bf5c155 100644
--- a/vllm/model_executor/models/commandr.py
+++ b/vllm/model_executor/models/commandr.py
@@ -440,8 +440,7 @@ def load_weights(self, weights: Iterable[Tuple[str,
 
             if (self.quant_config is not None and
                 (scale_name := self.quant_config.get_cache_scale(name))):
-                # Loading kv cache scales for quark and
-                # compressed-tensors quantization
+                # Loading kv cache quantization scales
                 param = params_dict[scale_name]
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py
index ff1f1c2a939ff..b2aa3c0709bd4 100644
--- a/vllm/model_executor/models/dbrx.py
+++ b/vllm/model_executor/models/dbrx.py
@@ -452,8 +452,7 @@ def load_weights(self, weights: Iterable[Tuple[str,
         for name, loaded_weight in weights:
             if (self.quant_config is not None and
                 (scale_name := self.quant_config.get_cache_scale(name))):
-                # Loading kv cache scales for quark and
-                # compressed-tensors quantization
+                # Loading kv cache quantization scales
                 param = params_dict[scale_name]
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py
index ac679d6ff43c7..eab3bf0756fca 100644
--- a/vllm/model_executor/models/exaone.py
+++ b/vllm/model_executor/models/exaone.py
@@ -533,8 +533,7 @@ def load_weights(self, weights: Iterable[Tuple[str,
                 continue
             if (self.quant_config is not None and
                 (scale_name := self.quant_config.get_cache_scale(name))):
-                # Loading kv cache scales for quark and
-                # compressed-tensors quantization
+                # Loading kv cache quantization scales
                 param = params_dict[scale_name]
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py
index 56343ca9a71ac..08298cc0db36f 100644
--- a/vllm/model_executor/models/gpt_j.py
+++ b/vllm/model_executor/models/gpt_j.py
@@ -316,8 +316,7 @@ def load_weights(self, weights: Iterable[Tuple[str,
 
             if (self.quant_config is not None and
                 (scale_name := self.quant_config.get_cache_scale(name))):
-                # Loading kv cache scales for quark and
-                # compressed-tensors quantization
+                # Loading kv cache quantization scales
                 param = params_dict[scale_name]
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py
index 67e04b57658bc..ddd2d7a16b242 100644
--- a/vllm/model_executor/models/granite.py
+++ b/vllm/model_executor/models/granite.py
@@ -475,8 +475,7 @@ def load_weights(self, weights: Iterable[Tuple[str,
                 continue
             if (self.quant_config is not None and
                 (scale_name := self.quant_config.get_cache_scale(name))):
-                # Loading kv cache scales for quark and
-                # compressed-tensors quantization
+                # Loading kv cache quantization scales
                 param = params_dict[scale_name]
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index 4667f275ecd33..a5bd418801f2c 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -105,9 +105,9 @@ def __init__(self,
                  max_position_embeddings: int = 8192,
                  quant_config: Optional[QuantizationConfig] = None,
                  bias: bool = False,
+                 bias_o_proj: bool = False,
                  cache_config: Optional[CacheConfig] = None,
-                 prefix: str = "",
-                 bias_o_proj: bool = False) -> None:
+                 prefix: str = "") -> None:
         super().__init__()
         layer_idx = extract_layer_index(prefix)
         self.hidden_size = hidden_size
@@ -397,8 +397,7 @@ def load_weights(self, weights: Iterable[Tuple[str,
                 continue
             if (self.quant_config is not None and
                 (scale_name := self.quant_config.get_cache_scale(name))):
-                # Loading kv cache scales for quark and
-                # compressed-tensors quantization
+                # Loading kv cache quantization scales
                 param = params_dict[scale_name]
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py
index 2c8895e842996..da415cdae96ed 100644
--- a/vllm/model_executor/models/mixtral.py
+++ b/vllm/model_executor/models/mixtral.py
@@ -431,8 +431,7 @@ def load_weights(self, weights: Iterable[Tuple[str,
 
             if (self.quant_config is not None and
                 (scale_name := self.quant_config.get_cache_scale(name))):
-                # Loading kv cache scales for quark and
-                # compressed-tensors quantization
+                # Loading kv cache quantization scales
                 param = params_dict[scale_name]
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py
index bd261f31499cb..2554281610a30 100644
--- a/vllm/model_executor/models/mllama.py
+++ b/vllm/model_executor/models/mllama.py
@@ -1432,8 +1432,7 @@ def load_weights(self, weights: Iterable[Tuple[str,
                 loaded_weight = loaded_weight.view(loaded_weight.shape[0], -1)
             if (self.quant_config is not None and
                 (scale_name := self.quant_config.get_cache_scale(name))):
-                # Loading kv cache scales for quark and
-                # compressed-tensors quantization
+                # Loading kv cache quantization scales
                 param = params_dict[scale_name]
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py
index e7875e6fb889f..2340283b69665 100644
--- a/vllm/model_executor/models/nemotron.py
+++ b/vllm/model_executor/models/nemotron.py
@@ -492,8 +492,7 @@ def load_weights(self, weights: Iterable[Tuple[str,
                 continue
             if (self.quant_config is not None and
                 (scale_name := self.quant_config.get_cache_scale(name))):
-                # Loading kv cache scales for quark and
-                # compressed-tensors quantization
+                # Loading kv cache quantization scales
                 param = params_dict[scale_name]
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py
index dc76818e22cbb..881c09ea9db99 100644
--- a/vllm/model_executor/models/phimoe.py
+++ b/vllm/model_executor/models/phimoe.py
@@ -626,8 +626,7 @@ def load_weights(self, weights: Iterable[Tuple[str,
 
             if (self.quant_config is not None and
                 (scale_name := self.quant_config.get_cache_scale(name))):
-                # Loading kv cache scales for quark and
-                # compressed-tensors quantization
+                # Loading kv cache quantization scales
                 param = params_dict[scale_name]
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py
index b9c259ad73c40..d015f60c6d065 100644
--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
@@ -367,8 +367,7 @@ def load_weights(self, weights: Iterable[Tuple[str,
                 continue
             if (self.quant_config is not None and
                 (scale_name := self.quant_config.get_cache_scale(name))):
-                # Loading kv cache scales for quark and
-                # compressed-tensors quantization
+                # Loading kv cache quantization scales
                 param = params_dict[scale_name]
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py
index b27d2b10850f5..37c5a4b5713b8 100644
--- a/vllm/model_executor/models/solar.py
+++ b/vllm/model_executor/models/solar.py
@@ -492,8 +492,7 @@ def load_weights(self, weights: Iterable[Tuple[str,
                 continue
             if (self.quant_config is not None and
                 (scale_name := self.quant_config.get_cache_scale(name))):
-                # Loading kv cache scales for quark and
-                # compressed-tensors quantization
+                # Loading kv cache quantization scales
                 param = params_dict[scale_name]
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)

From dd7c9ad87074b68d201208a196e0a4b2b5ecc27a Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Thu, 16 Jan 2025 18:11:54 +0800
Subject: [PATCH 239/462] [Bugfix] Remove hardcoded `head_size=256` for
 Deepseek v2 and v3 (#12067)

Signed-off-by: Isotr0py <2037008807@qq.com>
---
 tests/kernels/test_attention.py           |  6 +++---
 vllm/config.py                            |  9 ++++++---
 vllm/model_executor/models/deepseek_v2.py | 24 +++++++----------------
 vllm/model_executor/models/deepseek_v3.py | 24 +++++++----------------
 4 files changed, 23 insertions(+), 40 deletions(-)

diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py
index 3e3c0668198ad..124d5d297a574 100644
--- a/tests/kernels/test_attention.py
+++ b/tests/kernels/test_attention.py
@@ -31,9 +31,9 @@
 NUM_PREFILL_SEQS = [3]  # Arbitrary values for testing
 NUM_HEADS = [(40, 40), (64, 8)]  # Arbitrary values for testing
 
-# FlashAttention forward only supports head dimension at most 128
-# https://github.com/ROCmSoftwarePlatform/flash-attention/blob/3d2b6f5d037782cc2c906909a46fb7e2e1b48b25/csrc/flash_attn_rocm/flash_api.cpp#L62
-HEAD_SIZES = [64, 80, 120, 256]
+# This should be sync with get_supported_head_sizes() in
+# vllm.attention.ops.paged_attn.PagedAttention
+HEAD_SIZES = [32, 64, 80, 96, 112, 120, 128, 192, 256]
 
 BLOCK_SIZES = [16, 32]
 USE_ALIBI = [False, True]
diff --git a/vllm/config.py b/vllm/config.py
index e64883368a751..2fe674b857e16 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -733,9 +733,12 @@ def get_head_size(self) -> int:
         if hasattr(self.hf_text_config,
                    "model_type") and (self.hf_text_config.model_type
                                       in ('deepseek_v2', 'deepseek_v3')):
-            # FlashAttention supports only head_size 32, 64, 128, 256,
-            # we need to pad head_size 192 to 256
-            return 256
+            qk_rope_head_dim = getattr(self.hf_text_config, "qk_rope_head_dim",
+                                       0)
+            qk_nope_head_dim = getattr(self.hf_text_config, "qk_nope_head_dim",
+                                       0)
+            if qk_rope_head_dim and qk_nope_head_dim:
+                return qk_rope_head_dim + qk_nope_head_dim
 
         if self.is_attention_free:
             return 0
diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py
index d83cafaf998ab..af6810a140b43 100644
--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -262,14 +262,8 @@ def __init__(
             mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim))
             self.scaling = self.scaling * mscale * mscale
 
-        # self.attn = Attention(self.num_heads,
-        #                       self.qk_head_dim,
-        #                       self.scaling,
-        #                       num_kv_heads=self.num_heads)
-
-        # TODO, support head_size 192
         self.attn = Attention(self.num_local_heads,
-                              256,
+                              self.qk_head_dim,
                               self.scaling,
                               num_kv_heads=self.num_local_heads,
                               cache_config=cache_config,
@@ -319,18 +313,14 @@ def forward(
         k = torch.empty_like(q)
         k[..., :self.qk_nope_head_dim] = k_nope
         k[..., self.qk_nope_head_dim:] = k_pe
-        q = torch.nn.functional.pad(q, [0, 256 - self.qk_head_dim],
-                                    value=0).view(-1,
-                                                  self.num_local_heads * 256)
-        k = torch.nn.functional.pad(k, [0, 256 - self.qk_head_dim],
-                                    value=0).view(-1,
-                                                  self.num_local_heads * 256)
-        v = torch.nn.functional.pad(v, [0, 256 - self.v_head_dim],
-                                    value=0).view(-1,
-                                                  self.num_local_heads * 256)
+        # padding value to qk_head_dim for alignment
+        v = torch.nn.functional.pad(
+            v, [0, self.qk_head_dim - self.v_head_dim],
+            value=0).view(-1, self.num_local_heads * self.qk_head_dim)
         attn_output = self.attn(q, k, v, kv_cache, attn_metadata)
         attn_output = attn_output.view(
-            -1, self.num_local_heads, 256)[..., :self.v_head_dim].reshape(
+            -1, self.num_local_heads,
+            self.qk_head_dim)[..., :self.v_head_dim].reshape(
                 -1, self.num_local_heads * self.v_head_dim)
         output, _ = self.o_proj(attn_output)
         return output
diff --git a/vllm/model_executor/models/deepseek_v3.py b/vllm/model_executor/models/deepseek_v3.py
index ca79b14c55fea..0b44f0d062c40 100644
--- a/vllm/model_executor/models/deepseek_v3.py
+++ b/vllm/model_executor/models/deepseek_v3.py
@@ -269,14 +269,8 @@ def __init__(
             mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim))
             self.scaling = self.scaling * mscale * mscale
 
-        # self.attn = Attention(self.num_heads,
-        #                       self.qk_head_dim,
-        #                       self.scaling,
-        #                       num_kv_heads=self.num_heads)
-
-        # TODO, support head_size 192
         self.attn = Attention(self.num_local_heads,
-                              256,
+                              self.qk_head_dim,
                               self.scaling,
                               num_kv_heads=self.num_local_heads,
                               cache_config=cache_config,
@@ -326,18 +320,14 @@ def forward(
         k = torch.empty_like(q)
         k[..., :self.qk_nope_head_dim] = k_nope
         k[..., self.qk_nope_head_dim:] = k_pe
-        q = torch.nn.functional.pad(q, [0, 256 - self.qk_head_dim],
-                                    value=0).view(-1,
-                                                  self.num_local_heads * 256)
-        k = torch.nn.functional.pad(k, [0, 256 - self.qk_head_dim],
-                                    value=0).view(-1,
-                                                  self.num_local_heads * 256)
-        v = torch.nn.functional.pad(v, [0, 256 - self.v_head_dim],
-                                    value=0).view(-1,
-                                                  self.num_local_heads * 256)
+        # padding value to qk_head_dim for alignment
+        v = torch.nn.functional.pad(
+            v, [0, self.qk_head_dim - self.v_head_dim],
+            value=0).view(-1, self.num_local_heads * self.qk_head_dim)
         attn_output = self.attn(q, k, v, kv_cache, attn_metadata)
         attn_output = attn_output.view(
-            -1, self.num_local_heads, 256)[..., :self.v_head_dim].reshape(
+            -1, self.num_local_heads,
+            self.qk_head_dim)[..., :self.v_head_dim].reshape(
                 -1, self.num_local_heads * self.v_head_dim)
         output, _ = self.o_proj(attn_output)
         return output

From bf53e0c70b0fe17087914cc770fd801e0bf02137 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Thu, 16 Jan 2025 19:58:53 +0800
Subject: [PATCH 240/462] Support torchrun and SPMD-style offline inference
 (#12071)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 .buildkite/test-pipeline.yaml                 |  1 +
 .../offline_inference/torchrun_example.py     | 64 +++++++++++++++
 tests/distributed/test_torchrun_example.py    | 56 +++++++++++++
 tests/engine/test_multiproc_workers.py        |  2 +-
 vllm/config.py                                |  7 +-
 vllm/engine/arg_utils.py                      |  2 +-
 vllm/engine/llm_engine.py                     |  5 ++
 vllm/executor/ray_distributed_executor.py     |  6 +-
 vllm/executor/uniproc_executor.py             | 81 ++++++++++++++++++-
 vllm/lora/layers.py                           |  4 +-
 .../model_executor/layers/logits_processor.py | 16 ++--
 vllm/v1/executor/multiproc_executor.py        |  2 +-
 vllm/worker/worker.py                         |  3 -
 vllm/worker/worker_base.py                    | 29 ++++---
 14 files changed, 248 insertions(+), 30 deletions(-)
 create mode 100644 examples/offline_inference/torchrun_example.py
 create mode 100644 tests/distributed/test_torchrun_example.py

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 74b287c7adbfa..00fed96c1ac8c 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -463,6 +463,7 @@ steps:
   - vllm/worker/worker.py
   - vllm/worker/model_runner.py
   commands:
+  - torchrun --nproc-per-node=2 distributed/test_torchrun_example.py
   - pytest -v -s ./compile/test_basic_correctness.py
   - pytest -v -s ./compile/test_wrapper.py
   - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
diff --git a/examples/offline_inference/torchrun_example.py b/examples/offline_inference/torchrun_example.py
new file mode 100644
index 0000000000000..b6de73eb7266e
--- /dev/null
+++ b/examples/offline_inference/torchrun_example.py
@@ -0,0 +1,64 @@
+"""
+experimental support for tensor-parallel inference with torchrun,
+see https://github.com/vllm-project/vllm/issues/11400 for
+the motivation and use case for this example.
+run the script with `torchrun --nproc-per-node=2 torchrun_example.py`,
+the argument 2 should match the `tensor_parallel_size` below.
+see `tests/distributed/test_torchrun_example.py` for the unit test.
+"""
+
+from vllm import LLM, SamplingParams
+
+# Create prompts, the same across all ranks
+prompts = [
+    "Hello, my name is",
+    "The president of the United States is",
+    "The capital of France is",
+    "The future of AI is",
+]
+
+# Create sampling parameters, the same across all ranks
+sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+
+# Use `distributed_executor_backend="external_launcher"` so that
+# this llm engine/instance only creates one worker.
+llm = LLM(
+    model="facebook/opt-125m",
+    tensor_parallel_size=2,
+    distributed_executor_backend="external_launcher",
+)
+
+outputs = llm.generate(prompts, sampling_params)
+
+# all ranks will have the same outputs
+for output in outputs:
+    prompt = output.prompt
+    generated_text = output.outputs[0].text
+    print(f"Prompt: {prompt!r}, "
+          f"Generated text: {generated_text!r}")
+"""
+Further tips:
+
+1. to communicate control messages across all ranks, use the cpu group,
+a PyTorch ProcessGroup with GLOO backend.
+
+```python
+from vllm.distributed.parallel_state import get_world_group
+cpu_group = get_world_group().cpu_group
+torch_rank = dist.get_rank(group=cpu_group)
+if torch_rank == 0:
+    # do something for rank 0, e.g. saving the results to disk.
+```
+
+2. to communicate data across all ranks, use the model's device group,
+a PyTorch ProcessGroup with NCCL backend.
+```python
+from vllm.distributed.parallel_state import get_world_group
+device_group = get_world_group().device_group
+```
+
+3. to access the model directly in every rank, use the following code:
+```python
+llm.llm_engine.model_executor.driver_worker.worker.model_runner.model
+```
+"""
diff --git a/tests/distributed/test_torchrun_example.py b/tests/distributed/test_torchrun_example.py
new file mode 100644
index 0000000000000..7aa03d7f0402a
--- /dev/null
+++ b/tests/distributed/test_torchrun_example.py
@@ -0,0 +1,56 @@
+# unit test for `examples/offline_inference/torchrun_example.py`
+
+import random
+
+import torch.distributed as dist
+
+from vllm import LLM, SamplingParams
+from vllm.distributed.parallel_state import get_world_group
+
+# Create prompts
+prompts = [
+    "Hello, my name is",
+    "The president of the United States is",
+    "The capital of France is",
+    "The future of AI is",
+]
+
+sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+
+# set different `gpu_memory_utilization` and `swap_space` for different ranks,
+# to test if all ranks agree on the same kv cache configuration.
+llm = LLM(model="facebook/opt-125m",
+          tensor_parallel_size=2,
+          distributed_executor_backend="external_launcher",
+          gpu_memory_utilization=random.uniform(0.7, 0.9),
+          swap_space=random.randint(1, 4))
+
+outputs = llm.generate(prompts, sampling_params)
+
+cpu_group = get_world_group().cpu_group
+
+torch_rank = dist.get_rank(group=cpu_group)
+
+
+def test_consistent_across_ranks(obj):
+    if torch_rank == 0:
+        dist.broadcast_object_list([obj], src=0, group=cpu_group)
+    else:
+        container = [None]
+        dist.broadcast_object_list(container, src=0, group=cpu_group)
+        assert container[0] == obj
+
+
+test_consistent_across_ranks(
+    llm.llm_engine.vllm_config.cache_config.num_cpu_blocks)
+test_consistent_across_ranks(
+    llm.llm_engine.vllm_config.cache_config.num_gpu_blocks)
+
+# all ranks should have the same outputs
+for output in outputs:
+    prompt = output.prompt
+    generated_text = output.outputs[0].text
+    test_consistent_across_ranks(prompt)
+    test_consistent_across_ranks(generated_text)
+    print(f"Rank {torch_rank}, Prompt: {prompt!r}, "
+          f"Generated text: {generated_text!r}")
diff --git a/tests/engine/test_multiproc_workers.py b/tests/engine/test_multiproc_workers.py
index db70a808c008b..04505fcaae24b 100644
--- a/tests/engine/test_multiproc_workers.py
+++ b/tests/engine/test_multiproc_workers.py
@@ -22,7 +22,7 @@ def worker_method(self, worker_input: Any) -> Tuple[int, Any]:
             # simulate error case
             raise worker_input
 
-        return self.rank, input
+        return self.rpc_rank, input
 
 
 def _start_workers() -> Tuple[List[ProcessWorkerWrapper], WorkerMonitor]:
diff --git a/vllm/config.py b/vllm/config.py
index 2fe674b857e16..a5f2161068d2a 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1338,14 +1338,15 @@ def _verify_args(self) -> None:
         from vllm.executor.executor_base import ExecutorBase
         from vllm.platforms import current_platform
         if self.distributed_executor_backend not in (
-                "ray", "mp", "uni", None) and not (isinstance(
+                "ray", "mp", "uni",
+                "external_launcher", None) and not (isinstance(
                     self.distributed_executor_backend, type) and issubclass(
                         self.distributed_executor_backend, ExecutorBase)):
             raise ValueError(
                 "Unrecognized distributed executor backend "
                 f"{self.distributed_executor_backend}. Supported "
-                "values are 'ray', 'mp' 'uni', or custom ExecutorBase"
-                " subclass.")
+                "values are 'ray', 'mp' 'uni', 'external_launcher' or"
+                " custom ExecutorBase subclass.")
         if self.use_ray:
             from vllm.executor import ray_utils
             ray_utils.assert_ray_available()
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 03a8959a7d9ff..a4f4c9558d056 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -388,7 +388,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
         # Parallel arguments
         parser.add_argument(
             '--distributed-executor-backend',
-            choices=['ray', 'mp'],
+            choices=['ray', 'mp', 'uni', 'external_launcher'],
             default=EngineArgs.distributed_executor_backend,
             help='Backend to use for distributed model '
             'workers, either "ray" or "mp" (multiprocessing). If the product '
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 49a1e9f505d9f..5d19ce03d5b58 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -457,6 +457,11 @@ def _get_executor_cls(cls,
                 # JAX-style, single-process, multi-device executor.
                 from vllm.executor.uniproc_executor import UniProcExecutor
                 executor_class = UniProcExecutor
+            elif distributed_executor_backend == "external_launcher":
+                # executor with external launcher
+                from vllm.executor.uniproc_executor import (  # noqa
+                    ExecutorWithExternalLauncher)
+                executor_class = ExecutorWithExternalLauncher
         else:
             from vllm.executor.uniproc_executor import UniProcExecutor
             executor_class = UniProcExecutor
diff --git a/vllm/executor/ray_distributed_executor.py b/vllm/executor/ray_distributed_executor.py
index edceece4b68dc..3baeb63918a62 100644
--- a/vllm/executor/ray_distributed_executor.py
+++ b/vllm/executor/ray_distributed_executor.py
@@ -172,7 +172,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup",
                     scheduling_strategy=scheduling_strategy,
                     **ray_remote_kwargs,
                 )(RayWorkerWrapper).remote(vllm_config=self.vllm_config,
-                                           rank=rank)
+                                           rpc_rank=rank)
             else:
                 worker = ray.remote(
                     num_cpus=0,
@@ -181,7 +181,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup",
                     scheduling_strategy=scheduling_strategy,
                     **ray_remote_kwargs,
                 )(RayWorkerWrapper).remote(vllm_config=self.vllm_config,
-                                           rank=rank)
+                                           rpc_rank=rank)
             worker_metadata.append(
                 RayWorkerMetaData(worker=worker, created_rank=rank))
             rank += 1
@@ -204,7 +204,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup",
                     # as the resource holder for the driver process.
                     self.driver_dummy_worker = worker
                     self.driver_worker = RayWorkerWrapper(
-                        vllm_config=self.vllm_config, rank=0)
+                        vllm_config=self.vllm_config, rpc_rank=0)
                     worker_metadata.pop(i)
                     break
 
diff --git a/vllm/executor/uniproc_executor.py b/vllm/executor/uniproc_executor.py
index da1d77343cf3b..27b83e95ba95b 100644
--- a/vllm/executor/uniproc_executor.py
+++ b/vllm/executor/uniproc_executor.py
@@ -1,5 +1,10 @@
+import os
 from typing import Any, Dict, List, Optional, Tuple
 
+import torch
+import torch.distributed as dist
+
+import vllm.envs as envs
 from vllm.executor.executor_base import ExecutorBase
 from vllm.logger import init_logger
 from vllm.utils import get_distributed_init_method, get_ip, get_open_port
@@ -16,7 +21,7 @@ def _init_executor(self) -> None:
         """Initialize the worker and load the model.
         """
         self.driver_worker = WorkerWrapperBase(vllm_config=self.vllm_config,
-                                               rank=0)
+                                               rpc_rank=0)
         distributed_init_method = get_distributed_init_method(
             get_ip(), get_open_port())
         local_rank = 0
@@ -55,3 +60,77 @@ def check_health(self) -> None:
 
 
 UniProcExecutorAsync = UniProcExecutor
+
+
+class ExecutorWithExternalLauncher(UniProcExecutor):
+    """An executor that uses external launchers to launch engines,
+    specially designed for torchrun-compatible launchers, for
+    offline inference with tensor parallelism.
+
+    see https://github.com/vllm-project/vllm/issues/11400 for
+    the motivation, and examples/offline_inference/torchrun_example.py
+    for the usage example.
+
+    The key idea: although it is tensor-parallel inference, we only
+    create one worker per executor, users will launch multiple
+    engines with torchrun-compatible launchers, and all these engines
+    work together to process the same prompts. When scheduling is
+    deterministic, all the engines will generate the same outputs,
+    and they don't need to synchronize the states with each other.
+    """
+    uses_ray: bool = False
+
+    def _init_executor(self) -> None:
+        """Initialize the worker and load the model.
+        """
+        assert self.vllm_config.parallel_config.pipeline_parallel_size == 1, \
+            ("ExecutorWithExternalLauncher does not "
+            "support pipeline parallelism.")
+        assert self.vllm_config.scheduler_config.delay_factor == 0.0, \
+            ("ExecutorWithExternalLauncher needs deterministic "
+            "execution, so it"
+            "does not support delay_factor in scheduling")
+        assert not envs.VLLM_USE_V1, \
+            ("V1 architecture cannot guarantee deterministic execution, "
+            "so it is not supported in ExecutorWithExternalLauncher.")
+        self.driver_worker = WorkerWrapperBase(vllm_config=self.vllm_config,
+                                               rpc_rank=0)
+        # engines are launched in torchrun-compatible launchers
+        # so we can use the env:// method.
+        # required env vars:
+        # - RANK
+        # - MASTER_ADDR
+        # - MASTER_PORT
+        distributed_init_method = "env://"
+        rank = int(os.environ["RANK"])
+        local_rank = rank
+        is_driver_worker = True
+        kwargs = dict(
+            vllm_config=self.vllm_config,
+            local_rank=local_rank,
+            rank=rank,
+            distributed_init_method=distributed_init_method,
+            is_driver_worker=is_driver_worker,
+        )
+        self.collective_rpc("init_worker", args=([kwargs], ))
+        self.collective_rpc("init_device")
+        self.collective_rpc("load_model")
+
+    def determine_num_available_blocks(self) -> Tuple[int, int]:
+        """
+        Determine the number of available KV blocks.
+        Add an additional all_reduce to get the min across all ranks.
+        Note that even if we have the same `gpu_memory_utilization` and 
+        `swap_space`, the available memory in every rank might still 
+        differ because NCCL can take different amounts of memory in 
+        different ranks. Therefore, it is necessary to test if all ranks 
+        agree on the same KV cache configuration.
+        """
+        a, b = super().determine_num_available_blocks()
+        from vllm.distributed.parallel_state import get_world_group
+        cpu_group = get_world_group().cpu_group
+        a_tensor = torch.tensor([a], device="cpu", dtype=torch.int64)
+        b_tensor = torch.tensor([b], device="cpu", dtype=torch.int64)
+        dist.all_reduce(a_tensor, group=cpu_group, op=dist.ReduceOp.MIN)
+        dist.all_reduce(b_tensor, group=cpu_group, op=dist.ReduceOp.MIN)
+        return a_tensor.item(), b_tensor.item()
diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py
index dd981ffce8833..e6f26d2b74b2f 100644
--- a/vllm/lora/layers.py
+++ b/vllm/lora/layers.py
@@ -940,8 +940,8 @@ def soft_cap(self):
         return self.base_layer.soft_cap
 
     @property
-    def use_gather(self):
-        return self.base_layer.use_gather
+    def use_all_gather(self):
+        return self.base_layer.use_all_gather
 
     @property
     def org_vocab_size(self):
diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py
index 2bc7e458494f7..42decde1d0f79 100644
--- a/vllm/model_executor/layers/logits_processor.py
+++ b/vllm/model_executor/layers/logits_processor.py
@@ -6,6 +6,7 @@
 import torch.nn as nn
 
 import vllm.envs as envs
+from vllm.config import get_current_vllm_config
 from vllm.distributed import (tensor_model_parallel_all_gather,
                               tensor_model_parallel_gather)
 from vllm.model_executor.layers.vocab_parallel_embedding import (
@@ -44,8 +45,10 @@ def __init__(self,
         self.soft_cap = soft_cap
         # Whether to use gather or all-gather to gather the logits.
 
-        self.use_gather = not current_platform.is_tpu(
-        ) and not envs.VLLM_USE_V1
+        parallel_config = get_current_vllm_config().parallel_config
+        self.use_all_gather = current_platform.is_tpu() \
+            or envs.VLLM_USE_V1 \
+            or parallel_config.distributed_executor_backend == "external_launcher" # noqa
 
     def forward(
         self,
@@ -88,16 +91,17 @@ def _get_logits(
         logits = lm_head.linear_method.apply(lm_head,
                                              hidden_states,
                                              bias=embedding_bias)
-        if self.use_gather:
-            # None may be returned for rank > 0
-            logits = tensor_model_parallel_gather(logits)
-        else:
+
+        if self.use_all_gather:
             # Gather is not supported for some devices such as TPUs.
             # Use all-gather instead.
             # NOTE(woosuk): Here, the outputs of every device should not be None
             # because XLA requires strict SPMD among all devices. Every device
             # should execute the same operations after gathering the logits.
             logits = tensor_model_parallel_all_gather(logits)
+        else:
+            # None may be returned for rank > 0
+            logits = tensor_model_parallel_gather(logits)
         # Remove paddings in vocab (if any).
         if logits is not None:
             logits = logits[..., :self.org_vocab_size]
diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py
index cee0fcc0bad68..e111ac7ee8183 100644
--- a/vllm/v1/executor/multiproc_executor.py
+++ b/vllm/v1/executor/multiproc_executor.py
@@ -246,7 +246,7 @@ def __init__(
         ready_path: str,
     ):
         self.rank = rank
-        wrapper = WorkerWrapperBase(vllm_config=vllm_config, rank=rank)
+        wrapper = WorkerWrapperBase(vllm_config=vllm_config, rpc_rank=rank)
         # TODO: move `init_worker` to executor level as a collective rpc call
         all_kwargs: List[Dict] = [
             {} for _ in range(vllm_config.parallel_config.world_size)
diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py
index a3e377ef2b19d..43eeb287d64eb 100644
--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -55,9 +55,6 @@ def __init__(
         self.rank = rank
         self.distributed_init_method = distributed_init_method
         self.is_driver_worker = is_driver_worker
-        if is_driver_worker:
-            assert rank % self.parallel_config.tensor_parallel_size == 0, \
-                   "Driver worker should be rank 0 of tensor parallel group."
         if self.model_config.trust_remote_code:
             # note: lazy import to avoid importing torch before initializing
             from vllm.utils import init_cached_hf_modules
diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py
index 7c14b8344b49e..d464b614b12f1 100644
--- a/vllm/worker/worker_base.py
+++ b/vllm/worker/worker_base.py
@@ -461,7 +461,8 @@ def _execute_model_spmd(
 
 class WorkerWrapperBase:
     """
-    The whole point of this class is to lazily initialize the worker.
+    This class represents one process in an executor/engine. It is responsible
+    for lazily initializing the worker and handling the worker's lifecycle.
     We first instantiate the WorkerWrapper, which remembers the worker module
     and class name. Then, when we call `update_environment_variables`, and the
     real initialization happens in `init_worker`.
@@ -470,9 +471,19 @@ class WorkerWrapperBase:
     def __init__(
         self,
         vllm_config: VllmConfig,
-        rank: int = 0,
+        rpc_rank: int = 0,
     ) -> None:
-        self.rank = rank
+        """
+        Initialize the worker wrapper with the given vllm_config and rpc_rank.
+        Note: rpc_rank is the rank of the worker in the executor. In most cases,
+        it is also the rank of the worker in the distributed group. However,
+        when multiple executors work together, they can be different.
+        e.g. in the case of SPMD-style offline inference with TP=2,
+        users can launch 2 engines/executors, each with only 1 worker.
+        All workers have rpc_rank=0, but they have different ranks in the TP
+        group.
+        """
+        self.rpc_rank = rpc_rank
         self.vllm_config = vllm_config
         self.worker: Optional[WorkerBase] = None
         if vllm_config.model_config is not None:
@@ -485,16 +496,16 @@ def __init__(
 
     def adjust_rank(self, rank_mapping: Dict[int, int]) -> None:
         """
-        Adjust the rank based on the given mapping.
+        Adjust the rpc_rank based on the given mapping.
         It is only used during the initialization of the executor,
-        to adjust the rank of workers after we create all workers.
+        to adjust the rpc_rank of workers after we create all workers.
         """
-        if self.rank in rank_mapping:
-            self.rank = rank_mapping[self.rank]
+        if self.rpc_rank in rank_mapping:
+            self.rpc_rank = rank_mapping[self.rpc_rank]
 
     def update_environment_variables(self, envs_list: List[Dict[str,
                                                                 str]]) -> None:
-        envs = envs_list[self.rank]
+        envs = envs_list[self.rpc_rank]
         key = 'CUDA_VISIBLE_DEVICES'
         if key in envs and key in os.environ:
             # overwriting CUDA_VISIBLE_DEVICES is desired behavior
@@ -507,7 +518,7 @@ def init_worker(self, all_kwargs: List[Dict[str, Any]]) -> None:
         Here we inject some common logic before initializing the worker.
         Arguments are passed to the worker class constructor.
         """
-        kwargs = all_kwargs[self.rank]
+        kwargs = all_kwargs[self.rpc_rank]
         enable_trace_function_call_for_thread(self.vllm_config)
 
         # see https://github.com/NVIDIA/nccl/issues/1234

From 92e793d91a1a4e982662ecca0096e5edcafd21c6 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Thu, 16 Jan 2025 20:19:52 +0800
Subject: [PATCH 241/462] [core] LLM.collective_rpc interface and RLHF example
 (#12084)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 .buildkite/test-pipeline.yaml      |   4 +
 examples/offline_inference/rlhf.py | 191 +++++++++++++++++++++++++++++
 vllm/__init__.py                   |  39 ++++++
 vllm/entrypoints/llm.py            |  25 ++++
 vllm/plugins/__init__.py           |  31 -----
 vllm/worker/worker_base.py         |  15 ++-
 6 files changed, 270 insertions(+), 35 deletions(-)
 create mode 100644 examples/offline_inference/rlhf.py

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 00fed96c1ac8c..7442de245bd80 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -126,11 +126,15 @@ steps:
   - tests/distributed
   - tests/spec_decode/e2e/test_integration_dist_tp4
   - tests/compile
+  - examples/offline_inference/rlhf.py
   commands:
   - pytest -v -s distributed/test_utils.py
   - pytest -v -s compile/test_basic_correctness.py
   - pytest -v -s distributed/test_pynccl.py
   - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
+  # TODO: create a dedicated test section for multi-GPU example tests
+  # when we have multiple distributed example tests
+  - python3 ../examples/offline_inference/rlhf.py
 
 - label: Metrics, Tracing Test # 10min
   num_gpus: 2 
diff --git a/examples/offline_inference/rlhf.py b/examples/offline_inference/rlhf.py
new file mode 100644
index 0000000000000..3bc303dad277f
--- /dev/null
+++ b/examples/offline_inference/rlhf.py
@@ -0,0 +1,191 @@
+"""
+a simple demonstration of RLHF with vLLM, inspired by
+the OpenRLHF framework https://github.com/OpenRLHF/OpenRLHF .
+It follows the design that, training processes and inference processes
+are different, and they live on different GPUs.
+Training processes send prompts to inference processes to generate data,
+and also synchronize the weights of the model by broadcasting the weights
+from the training process to the inference process.
+Note that this is a simple demonstration of one training instance and one
+inference instance. In practice, there could be multiple training instances
+and multiple inference instances. For the full implementation, please refer
+to the OpenRLHF framework.
+"""
+import os
+
+import ray
+import torch
+from ray.util.placement_group import placement_group
+from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
+from transformers import AutoModelForCausalLM
+
+from vllm import LLM, SamplingParams, configure_as_vllm_process
+from vllm.utils import get_ip, get_open_port
+from vllm.worker.worker import Worker
+
+
+def stateless_init_process_group(master_address, master_port, rank, world_size,
+                                 device):
+    """
+    vLLM provides `StatelessProcessGroup` to create a process group
+    without considering the global process group in torch.distributed.
+    It is recommended to create `StatelessProcessGroup`, and then initialize
+    the data-plane communication (NCCL) between external (train processes) 
+    and vLLM workers.
+    """
+    from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
+    from vllm.distributed.utils import StatelessProcessGroup
+    pg = StatelessProcessGroup.create(host=master_address,
+                                      port=master_port,
+                                      rank=rank,
+                                      world_size=world_size)
+    pynccl = PyNcclCommunicator(pg, device=device)
+    return pynccl
+
+
+class MyWorker(Worker):
+    """
+    The `MyWorker` class inherits from `Worker` to provide custom functions.
+    For simplicity, we define the `MyWorker` class in this self-contained 
+    script. Normally, we should define the `MyWorker` class in a separate 
+    file and pass the qualified name of the class to the `worker_cls` 
+    parameter.
+    """
+
+    def init_weight_update_group(self, master_address, master_port,
+                                 rank_offset, world_size):
+        from vllm.distributed.parallel_state import get_world_group
+        rank = get_world_group().rank + rank_offset
+        self.model_update_group = stateless_init_process_group(
+            master_address,
+            master_port,
+            rank,
+            world_size,
+            self.device,
+        )
+
+    def update_weight(self, name, dtype, shape):
+        weight = torch.empty(shape, dtype=dtype, device="cuda")
+        self.model_update_group.broadcast(weight,
+                                          src=0,
+                                          stream=torch.cuda.current_stream())
+
+        self.model_runner.model.load_weights(weights=[(name, weight)])
+
+        del weight
+
+    def check_weights_changed(self):
+        """
+        Check if the weights are updated to 0.
+        """
+        weights_updated = True
+        for name, p in self.model_runner.model.named_parameters():
+            weights_updated = weights_updated and torch.allclose(
+                p, torch.zeros_like(p))
+        return weights_updated
+
+
+class MyLLM(LLM):
+
+    def __init__(self, *args, **kwargs):
+        # a hack to make the script work.
+        # stop ray from manipulating CUDA_VISIBLE_DEVICES
+        # at the top-level
+        del os.environ["CUDA_VISIBLE_DEVICES"]
+        super().__init__(*args, **kwargs)
+
+
+"""
+Start the training process, here we use huggingface transformers 
+as an example to hold a model on GPU 0.
+
+It is important for all the processes outside of vLLM to call
+`configure_as_vllm_process` to set some common environment variables
+the same as vLLM workers.
+"""
+configure_as_vllm_process()
+
+train_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
+train_model.to("cuda:0")
+"""
+Start the inference process, here we use vLLM to hold a model on GPU 1 and 
+GPU 2. For the details on how to use ray, please refer to the ray 
+documentation https://docs.ray.io/en/latest/ .
+"""
+os.environ["CUDA_VISIBLE_DEVICES"] = "1,2"
+ray.init()
+
+pg_inference = placement_group([{"GPU": 1, "CPU": 0}] * 2)
+ray.get(pg_inference.ready())
+scheduling_inference = PlacementGroupSchedulingStrategy(
+    placement_group=pg_inference,
+    placement_group_capture_child_tasks=True,
+    placement_group_bundle_index=0,
+)
+"""
+launch the vLLM inference engine.
+here we use `enforce_eager` to reduce the start time.
+"""
+llm = ray.remote(
+    num_cpus=0,
+    num_gpus=0,
+    scheduling_strategy=scheduling_inference,
+)(MyLLM).remote(
+    model="facebook/opt-125m",
+    enforce_eager=True,
+    worker_cls=MyWorker,
+    tensor_parallel_size=2,
+    distributed_executor_backend="ray",
+)
+
+# Generate texts from the prompts.
+prompts = [
+    "Hello, my name is",
+    "The president of the United States is",
+    "The capital of France is",
+    "The future of AI is",
+]
+
+sampling_params = SamplingParams(temperature=0)
+
+outputs = ray.get(llm.generate.remote(prompts, sampling_params))
+
+for output in outputs:
+    prompt = output.prompt
+    generated_text = output.outputs[0].text
+    print(f"Prompt: {prompt!r}, "
+          f"Generated text: {generated_text!r}")
+
+# set up the communication between the training process
+# and the inference engine.
+master_address = get_ip()
+master_port = get_open_port()
+
+handle = llm.collective_rpc.remote("init_weight_update_group",
+                                   args=(master_address, master_port, 1, 3))
+model_update_group = stateless_init_process_group(master_address, master_port,
+                                                  0, 3, torch.device("cuda:0"))
+ray.get(handle)
+
+# simulate training, modify the weights of the model.
+for name, p in train_model.named_parameters():
+    p.data.zero_()
+
+# sync weight from the training process to the inference engine.
+for name, p in train_model.named_parameters():
+    handle = llm.collective_rpc.remote("update_weight",
+                                       args=(name, p.dtype, p.shape))
+    model_update_group.broadcast(p, src=0, stream=torch.cuda.current_stream())
+    ray.get(handle)
+
+# check if the weights are updated.
+assert all(ray.get(llm.collective_rpc.remote("check_weights_changed")))
+
+# use the updated model to generate texts, they will be nonsense
+# because the weights are all zeros.
+outputs_updated = ray.get(llm.generate.remote(prompts, sampling_params))
+for output in outputs_updated:
+    prompt = output.prompt
+    generated_text = output.outputs[0].text
+    print(f"Prompt: {prompt!r}, "
+          f"Generated text: {generated_text!r}")
diff --git a/vllm/__init__.py b/vllm/__init__.py
index 45252b93e3d54..a533dba561c00 100644
--- a/vllm/__init__.py
+++ b/vllm/__init__.py
@@ -17,6 +17,44 @@
 
 from .version import __version__, __version_tuple__
 
+
+def configure_as_vllm_process():
+    """
+    set some common config/environment variables that should be set
+    for all processes created by vllm and all processes
+    that interact with vllm workers.
+    """
+    import os
+
+    import torch
+
+    # see https://github.com/NVIDIA/nccl/issues/1234
+    os.environ['NCCL_CUMEM_ENABLE'] = '0'
+
+    # see https://github.com/vllm-project/vllm/issues/10480
+    os.environ['TORCHINDUCTOR_COMPILE_THREADS'] = '1'
+    # see https://github.com/vllm-project/vllm/issues/10619
+    torch._inductor.config.compile_threads = 1
+
+    from vllm.platforms import current_platform
+
+    if current_platform.is_xpu():
+        # see https://github.com/pytorch/pytorch/blob/43c5f59/torch/_dynamo/config.py#L158
+        torch._dynamo.config.disable = True
+    elif current_platform.is_hpu():
+        # NOTE(kzawora): PT HPU lazy backend (PT_HPU_LAZY_MODE = 1)
+        # does not support torch.compile
+        # Eager backend (PT_HPU_LAZY_MODE = 0) must be selected for
+        # torch.compile support
+        is_lazy = os.environ.get('PT_HPU_LAZY_MODE', '1') == '1'
+        if is_lazy:
+            torch._dynamo.config.disable = True
+            # NOTE(kzawora) multi-HPU inference with HPUGraphs (lazy-only)
+            # requires enabling lazy collectives
+            # see https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html # noqa: E501
+            os.environ['PT_HPU_ENABLE_LAZY_COLLECTIVES'] = 'true'
+
+
 __all__ = [
     "__version__",
     "__version_tuple__",
@@ -42,4 +80,5 @@
     "AsyncEngineArgs",
     "initialize_ray_cluster",
     "PoolingParams",
+    "configure_as_vllm_process",
 ]
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index acb4db85632a8..b78d5c65a40f8 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -4,6 +4,7 @@
 from typing import (Any, ClassVar, Dict, List, Optional, Sequence, Tuple, Type,
                     Union, cast, overload)
 
+import cloudpickle
 from tqdm import tqdm
 from typing_extensions import deprecated
 
@@ -186,6 +187,13 @@ def __init__(
         if "disable_log_stats" not in kwargs:
             kwargs["disable_log_stats"] = True
 
+        if "worker_cls" in kwargs:
+            worker_cls = kwargs["worker_cls"]
+            # if the worker_cls is not qualified string name,
+            # we serialize it using cloudpickle to avoid pickling issues
+            if isinstance(worker_cls, type):
+                kwargs["worker_cls"] = cloudpickle.dumps(worker_cls)
+
         if compilation_config is not None:
             if isinstance(compilation_config, (int, dict)):
                 compilation_config_instance = CompilationConfig.from_cli(
@@ -455,6 +463,23 @@ def generate(
         outputs = self._run_engine(use_tqdm=use_tqdm)
         return self.engine_class.validate_outputs(outputs, RequestOutput)
 
+    def collective_rpc(self,
+                       method: str,
+                       timeout: Optional[float] = None,
+                       args: Tuple = (),
+                       kwargs: Optional[Dict] = None) -> List[Any]:
+        """
+        Run a method on all workers, with homogeneous arguments.
+        The main extension point for the LLM entrypoint.
+        Users can provide custom worker class through `worker_cls`
+        argument, and implement new methods in the worker class.
+        Then, users can call the new methods through this API.
+        It is recommended to use this API to only pass control messages,
+        and set up data-plane communication to pass data.
+        """
+        return self.llm_engine.model_executor.collective_rpc(
+            method, timeout, args, kwargs)
+
     def beam_search(
         self,
         prompts: List[Union[TokensPrompt, TextPrompt]],
diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py
index e5fa4f0e4a2f6..ff54174f634af 100644
--- a/vllm/plugins/__init__.py
+++ b/vllm/plugins/__init__.py
@@ -1,9 +1,6 @@
 import logging
-import os
 from typing import Callable, Dict
 
-import torch
-
 import vllm.envs as envs
 
 logger = logging.getLogger(__name__)
@@ -50,34 +47,6 @@ def load_general_plugins():
     processes. They should be designed in a way that they can be loaded
     multiple times without causing issues.
     """
-
-    # all processes created by vllm will load plugins,
-    # and here we can inject some common environment variables
-    # for all processes.
-
-    # see https://github.com/vllm-project/vllm/issues/10480
-    os.environ['TORCHINDUCTOR_COMPILE_THREADS'] = '1'
-    # see https://github.com/vllm-project/vllm/issues/10619
-    torch._inductor.config.compile_threads = 1
-
-    from vllm.platforms import current_platform
-
-    if current_platform.is_xpu():
-        # see https://github.com/pytorch/pytorch/blob/43c5f59/torch/_dynamo/config.py#L158
-        torch._dynamo.config.disable = True
-    if current_platform.is_hpu():
-        # NOTE(kzawora): PT HPU lazy backend (PT_HPU_LAZY_MODE = 1)
-        # does not support torch.compile
-        # Eager backend (PT_HPU_LAZY_MODE = 0) must be selected for
-        # torch.compile support
-        is_lazy = os.environ.get('PT_HPU_LAZY_MODE', '1') == '1'
-        if is_lazy:
-            torch._dynamo.config.disable = True
-            # NOTE(kzawora) multi-HPU inference with HPUGraphs (lazy-only)
-            # requires enabling lazy collectives
-            # see https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html # noqa: E501
-            os.environ['PT_HPU_ENABLE_LAZY_COLLECTIVES'] = 'true'
-
     global plugins_loaded
     if plugins_loaded:
         return
diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py
index d464b614b12f1..bced5b9f44228 100644
--- a/vllm/worker/worker_base.py
+++ b/vllm/worker/worker_base.py
@@ -4,6 +4,7 @@
 from abc import ABC, abstractmethod
 from typing import Any, Dict, List, Optional, Set, Tuple, Type, Union
 
+import cloudpickle
 import torch
 
 from vllm.config import ObservabilityConfig, VllmConfig
@@ -521,14 +522,20 @@ def init_worker(self, all_kwargs: List[Dict[str, Any]]) -> None:
         kwargs = all_kwargs[self.rpc_rank]
         enable_trace_function_call_for_thread(self.vllm_config)
 
-        # see https://github.com/NVIDIA/nccl/issues/1234
-        os.environ['NCCL_CUMEM_ENABLE'] = '0'
+        from vllm import configure_as_vllm_process
+        configure_as_vllm_process()
 
         from vllm.plugins import load_general_plugins
         load_general_plugins()
 
-        worker_class = resolve_obj_by_qualname(
-            self.vllm_config.parallel_config.worker_cls)
+        if isinstance(self.vllm_config.parallel_config.worker_cls, str):
+            worker_class = resolve_obj_by_qualname(
+                self.vllm_config.parallel_config.worker_cls)
+        else:
+            assert isinstance(self.vllm_config.parallel_config.worker_cls,
+                              bytes)
+            worker_class = cloudpickle.loads(
+                self.vllm_config.parallel_config.worker_cls)
         self.worker = worker_class(**kwargs)
         assert self.worker is not None
 

From 874f7c292a4f4f5dbb89b12426187e5a70f006d6 Mon Sep 17 00:00:00 2001
From: Roger Wang <136131678+ywang96@users.noreply.github.com>
Date: Thu, 16 Jan 2025 06:54:06 -0800
Subject: [PATCH 242/462] [Bugfix] Fix max image feature size for
 Llava-one-vision (#12104)

Signed-off-by: Roger Wang <ywang@roblox.com>
---
 .../multimodal/processing/test_llava_next.py  | 61 ++++++++++++++++++
 .../processing/test_llava_onevision.py        | 62 +++++++++++++++++++
 vllm/model_executor/models/llava_onevision.py |  8 ++-
 3 files changed, 129 insertions(+), 2 deletions(-)

diff --git a/tests/models/multimodal/processing/test_llava_next.py b/tests/models/multimodal/processing/test_llava_next.py
index 1eec35d9c3c72..6de649f87204d 100644
--- a/tests/models/multimodal/processing/test_llava_next.py
+++ b/tests/models/multimodal/processing/test_llava_next.py
@@ -13,6 +13,67 @@
 from ...utils import build_model_context
 
 
+def _validate_image_max_tokens_one(
+    processor: BaseMultiModalProcessor,
+    max_tokens: int,
+    failed_size_excs: list[tuple[ImageSize, Exception]],
+    image_size: ImageSize,
+) -> None:
+    info = processor.info
+    feature_size = info.get_num_image_tokens(image_width=image_size.width,
+                                             image_height=image_size.height)
+
+    try:
+        assert feature_size <= max_tokens, f"{feature_size} <= {max_tokens}"
+    except Exception as exc:
+        failed_size_excs.append((image_size, exc))
+
+
+@pytest.mark.skip("This test takes around 5 minutes to run. "
+                  "Comment this out to run it manually.")
+@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
+def test_processor_max_tokens(model_id):
+    ctx = build_model_context(
+        model_name=model_id,
+        tokenizer_name=model_id,
+        mm_processor_kwargs=None,
+        limit_mm_per_prompt={"image": 1},
+    )
+    processor = MULTIMODAL_REGISTRY.create_processor(
+        ctx.model_config,
+        tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer),
+    )
+    info = processor.info
+
+    seen_aspect_ratios = set[float]()
+    image_sizes = list[ImageSize]()
+
+    # The aspect ratio of the grid layout is between 1 and 2
+    # NOTE: Assumes that feature size calculation is the same if we
+    # swap the width and height of the image
+    for w, h in itertools.product(range(32, 4096), repeat=2):
+        aspect_ratio = w / h
+        if 1 <= aspect_ratio <= 2 and aspect_ratio not in seen_aspect_ratios:
+            image_sizes.append(ImageSize(w, h))
+            seen_aspect_ratios.add(aspect_ratio)
+
+    failed_size_excs = list[tuple[ImageSize, Exception]]()
+
+    validate_one = partial(
+        _validate_image_max_tokens_one,
+        processor,
+        info.get_max_image_tokens(),  # type: ignore
+        failed_size_excs,
+    )
+    pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")
+
+    if failed_size_excs:
+        msg = "Found failing image sizes:" \
+            + "\n========\n".join(f"[{size}]\n{exc}"
+                                  for size, exc in failed_size_excs)
+        raise AssertionError(msg)
+
+
 def _validate_image_prompt_replacements_one(
     processor: BaseMultiModalProcessor,
     num_imgs: int,
diff --git a/tests/models/multimodal/processing/test_llava_onevision.py b/tests/models/multimodal/processing/test_llava_onevision.py
index 94ea604c58b43..806437d35ec87 100644
--- a/tests/models/multimodal/processing/test_llava_onevision.py
+++ b/tests/models/multimodal/processing/test_llava_onevision.py
@@ -13,6 +13,68 @@
 from ...utils import build_model_context
 
 
+def _validate_image_max_tokens_one(
+    processor: BaseMultiModalProcessor,
+    max_tokens: int,
+    failed_size_excs: list[tuple[ImageSize, Exception]],
+    image_size: ImageSize,
+) -> None:
+    info = processor.info
+    feature_size = info.get_num_image_tokens(image_width=image_size.width,
+                                             image_height=image_size.height)
+
+    try:
+        assert feature_size <= max_tokens, f"{feature_size} <= {max_tokens}"
+    except Exception as exc:
+        failed_size_excs.append((image_size, exc))
+
+
+@pytest.mark.skip("This test takes around 5 minutes to run. "
+                  "Comment this out to run it manually.")
+@pytest.mark.parametrize("model_id",
+                         ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
+def test_processor_max_tokens(model_id):
+    ctx = build_model_context(
+        model_name=model_id,
+        tokenizer_name=model_id,
+        mm_processor_kwargs=None,
+        limit_mm_per_prompt={"image": 1},
+    )
+    processor = MULTIMODAL_REGISTRY.create_processor(
+        ctx.model_config,
+        tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer),
+    )
+    info = processor.info
+
+    seen_aspect_ratios = set[float]()
+    image_sizes = list[ImageSize]()
+
+    # The aspect ratio of the grid layout is between 1 and 6
+    # NOTE: Assumes that feature size calculation is the same if we
+    # swap the width and height of the image
+    for w, h in itertools.product(range(32, 4096), repeat=2):
+        aspect_ratio = w / h
+        if 1 <= aspect_ratio <= 6 and aspect_ratio not in seen_aspect_ratios:
+            image_sizes.append(ImageSize(w, h))
+            seen_aspect_ratios.add(aspect_ratio)
+
+    failed_size_excs = list[tuple[ImageSize, Exception]]()
+
+    validate_one = partial(
+        _validate_image_max_tokens_one,
+        processor,
+        info.get_max_image_tokens(),  # type: ignore
+        failed_size_excs,
+    )
+    pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")
+
+    if failed_size_excs:
+        msg = "Found failing image sizes:" \
+            + "\n========\n".join(f"[{size}]\n{exc}"
+                                  for size, exc in failed_size_excs)
+        raise AssertionError(msg)
+
+
 def _validate_image_prompt_replacements_one(
     processor: BaseMultiModalProcessor,
     num_imgs: int,
diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py
index 78a47e64d9afc..c9283e0c5ba20 100644
--- a/vllm/model_executor/models/llava_onevision.py
+++ b/vllm/model_executor/models/llava_onevision.py
@@ -19,8 +19,8 @@
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
                                     NestedTensors)
-from vllm.multimodal.parse import (MultiModalDataItems, VideoEmbeddingItems,
-                                   VideoProcessorItems)
+from vllm.multimodal.parse import (ImageSize, MultiModalDataItems,
+                                   VideoEmbeddingItems, VideoProcessorItems)
 from vllm.multimodal.processing import PromptReplacement
 from vllm.multimodal.profiling import ProcessorInputs
 from vllm.sequence import IntermediateTensors
@@ -145,6 +145,10 @@ def _get_num_unpadded_features(
 
         return (unpadded_features, newline_features)
 
+    def get_image_size_with_most_features(self) -> ImageSize:
+        # NOTE: This hardcoded value is found via processor tests
+        return ImageSize(width=1153, height=944)
+
     def _get_num_frame_tokens(
         self,
         *,

From 5fd24ec02e0365f96301ac73a31ef06976c256e8 Mon Sep 17 00:00:00 2001
From: Varun Sundar Rabindranath <varunsundar08@gmail.com>
Date: Thu, 16 Jan 2025 21:21:40 +0530
Subject: [PATCH 243/462] [misc] Add LoRA kernel micro benchmarks (#11579)

---
 benchmarks/kernels/benchmark_lora.py | 1147 ++++++++++++++++++++++++++
 benchmarks/kernels/utils.py          |  210 +++++
 2 files changed, 1357 insertions(+)
 create mode 100644 benchmarks/kernels/benchmark_lora.py
 create mode 100644 benchmarks/kernels/utils.py

diff --git a/benchmarks/kernels/benchmark_lora.py b/benchmarks/kernels/benchmark_lora.py
new file mode 100644
index 0000000000000..e1f613e1da509
--- /dev/null
+++ b/benchmarks/kernels/benchmark_lora.py
@@ -0,0 +1,1147 @@
+import argparse
+import copy
+import json
+import pickle
+import time
+from dataclasses import dataclass
+from enum import Enum, auto
+from itertools import product
+from pathlib import Path
+from typing import Any, Callable, Dict, List, Optional, Tuple
+
+import torch
+import torch.utils.benchmark as TBenchmark
+from torch.utils.benchmark import Measurement as TMeasurement
+from utils import ArgPool, Bench, CudaGraphBenchParams
+from weight_shapes import WEIGHT_SHAPES
+
+from vllm.lora.ops.triton_ops.bgmv_expand import bgmv_expand
+from vllm.lora.ops.triton_ops.bgmv_expand_slice import bgmv_expand_slice
+from vllm.lora.ops.triton_ops.bgmv_shrink import bgmv_shrink
+from vllm.lora.ops.triton_ops.sgmv_expand import sgmv_expand
+from vllm.lora.ops.triton_ops.sgmv_shrink import sgmv_shrink
+from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT
+from vllm.utils import FlexibleArgumentParser
+
+DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
+DEFAULT_TP_SIZES = [1]
+DEFAULT_BATCH_SIZES = [
+    1, 16, 32, 64, 128, 192, 256, 320, 384, 448, 512, 640, 768, 896, 1024,
+    2048, 3072, 4096, 5120, 6144, 7168, 8192
+]
+DEFAULT_HIDDEN_SIZES = [1024, 2048, 4096, 8192, 16384]
+DEFAULT_LORA_RANKS = [16]
+DEFAULT_NUM_LORAS = [1, 2, 3, 4]
+DEFAULT_SORT_BY_LORA_IDS = [False, True]
+DEFAULT_SEQ_LENGTHS = [1]
+DEFAULT_EXPAND_FN_ADD_INPUTS = [True, False]
+
+
+# Utilities
+def dtype_to_str(dtype: torch.dtype):
+    if dtype == torch.float16:
+        return "f16"
+    if dtype == torch.bfloat16:
+        return "bf16"
+    if dtype == torch.float32:
+        return "f32"
+    raise ValueError(f"Unsupported dtype {dtype}")
+
+
+def make_rand_lora_weight_tensor(k: int,
+                                 n: int,
+                                 num_loras: int,
+                                 dtype: torch.dtype,
+                                 device: str = "cuda") -> torch.Tensor:
+
+    # LoRA weights column major
+    return torch.rand((num_loras, n, k), dtype=dtype).to(device)
+
+
+def make_rand_tensors(
+    a_shape: Tuple[int],
+    b_shape: Tuple[int],
+    c_shape: Tuple[int],
+    a_dtype: torch.dtype,
+    b_dtype: torch.dtype,
+    c_dtype: torch.dtype,
+    num_slices: int,
+    device: str = "cuda",
+) -> Tuple[torch.Tensor, List[torch.Tensor], torch.Tensor]:
+    """
+    Make LoRA input/output matrices.
+    """
+    A = torch.rand(a_shape, dtype=a_dtype).to(device)
+
+    # LoRA weights column major
+    Bs = [
+        torch.rand(b_shape, dtype=b_dtype).to(device)
+        for _ in range(num_slices)
+    ]
+
+    C = torch.zeros(c_shape, dtype=c_dtype).to(device)
+    return A, Bs, C
+
+
+def make_prompt_lora_mapping(num_prompts: int, num_active_loras: int,
+                             sort_by_lora_id: bool,
+                             device: str) -> torch.Tensor:
+    """
+    All prompts are mapped to a Lora ID in range [0, num_active_loras).
+    where 0 refers to first lora, 1 refers to second lora and so on.
+    """
+    assert num_active_loras > 0
+
+    if not sort_by_lora_id:
+        return torch.randint(0,
+                             num_active_loras, (num_prompts, ),
+                             dtype=torch.long)
+
+    # Divide LoRAs equally and in order.
+    part_size = num_prompts // num_active_loras
+    part_size = max(part_size, 1)
+
+    lora_id = 0
+    prompt_lora_mapping = []
+    while len(prompt_lora_mapping) < num_prompts:
+        prompt_lora_mapping.extend([lora_id] * part_size)
+        lora_id = lora_id + 1 if lora_id + 1 < num_active_loras else lora_id
+    return torch.tensor(prompt_lora_mapping[:num_prompts],
+                        dtype=torch.long,
+                        device=device)
+
+
+def make_token_lora_mapping(num_tokens: int, num_prompts: int,
+                            prompt_lora_mapping: torch.Tensor,
+                            seq_len_tensor: torch.Tensor, device: str):
+    """
+    Make token_lora_mapping from prompt_lora_mapping and seq_lens_tensor
+    """
+    assert prompt_lora_mapping.shape[0] == num_prompts
+
+    # token to lora index mapping
+    token_lora_mapping = [0] * num_tokens
+    current_offset = 0
+    for b_id in range(num_prompts):
+        lora_index = prompt_lora_mapping[b_id].item()
+        s = current_offset
+        e = s + seq_len_tensor[b_id].item()
+        token_lora_mapping[s:e] = [lora_index] * (e - s)
+        current_offset += seq_len_tensor[b_id].item()
+
+    return torch.tensor(token_lora_mapping, dtype=torch.long, device=device)
+
+
+def ref_group_gemm(ref_out: torch.Tensor, input: torch.Tensor,
+                   lora_weights: List[torch.Tensor],
+                   seq_lens_cpu: torch.Tensor,
+                   prompt_lora_mapping_cpu: torch.Tensor, scaling: float,
+                   add_inputs: Optional[bool]):
+    """
+    Torch group gemm reference implementation to test correctness of
+    benchmarking operations.
+    """
+    batches = seq_lens_cpu.size(0)
+    out_list = []
+    current_offset = 0
+    for lora_index, b_length in zip(range(batches), seq_lens_cpu):
+        x = input[current_offset:b_length + current_offset, :]
+        current_offset += b_length
+        w = lora_weights[prompt_lora_mapping_cpu[lora_index]]
+        result = torch.nn.functional.linear(x, w)
+        result *= scaling
+        out_list.append(result)
+    torch.cat(out_list, dim=0)
+
+    cat_result = torch.cat(out_list, dim=0)
+
+    if add_inputs:
+        ref_out += cat_result
+    else:
+        ref_out.copy_(cat_result)
+
+
+class OpType(Enum):
+    """
+    LoRA Ops to benchmark and its properties.
+    """
+    SGMV_SHRINK = auto()
+    BGMV_SHRINK = auto()
+    SGMV_EXPAND = auto()
+    BGMV_EXPAND = auto()
+    BGMV_EXPAND_SLICE = auto()
+
+    @staticmethod
+    def from_str(s: str) -> "OpType":
+        if s.lower() == 'sgmv_shrink':
+            return OpType.SGMV_SHRINK
+        if s.lower() == 'sgmv_expand':
+            return OpType.SGMV_EXPAND
+        if s.lower() == 'bgmv_shrink':
+            return OpType.BGMV_SHRINK
+        if s.lower() == 'bgmv_expand':
+            return OpType.BGMV_EXPAND
+        if s.lower() == "bgmv_expand_slice":
+            return OpType.BGMV_EXPAND_SLICE
+        raise ValueError(f"Unrecognized str {s} to convert to OpType")
+
+    def is_shrink_fn(self) -> bool:
+        return self in [OpType.SGMV_SHRINK, OpType.BGMV_SHRINK]
+
+    def is_expand_fn(self) -> bool:
+        return self in [OpType.SGMV_EXPAND, OpType.BGMV_EXPAND]
+
+    def is_prefill_op(self) -> bool:
+        return self in [OpType.SGMV_SHRINK, OpType.SGMV_EXPAND]
+
+    def is_decode_op(self) -> bool:
+        return self in [
+            OpType.BGMV_SHRINK, OpType.BGMV_EXPAND, OpType.BGMV_EXPAND_SLICE
+        ]
+
+    def is_expand_slice_fn(self) -> bool:
+        return self in [OpType.BGMV_EXPAND_SLICE]
+
+    def num_slices(self) -> List[int]:
+        if self in [OpType.SGMV_EXPAND, OpType.SGMV_SHRINK]:
+            # SGMV kernels supports slices
+            return [1, 2, 3]
+        if self in [OpType.BGMV_SHRINK, OpType.BGMV_EXPAND]:
+            return [1]
+        if self in [OpType.BGMV_EXPAND_SLICE]:
+            return [2, 3]
+        raise ValueError(f"Unrecognized OpType {self}")
+
+    def mkn(self, batch_size: int, seq_length: int, hidden_size: int,
+            lora_rank: int) -> Tuple[int, int, int]:
+        num_tokens = batch_size * seq_length
+        if self.is_shrink_fn():
+            m = num_tokens
+            k = hidden_size
+            n = lora_rank
+        else:
+            assert self.is_expand_fn() or self.is_expand_slice_fn()
+            m = num_tokens
+            k = lora_rank
+            n = hidden_size
+        return m, k, n
+
+    def matmul_dtypes(
+            self, op_dtype: torch.dtype
+    ) -> Tuple[torch.dtype, torch.dtype, torch.dtype]:
+        """
+        return a type, b type and c type for A x B = C
+        """
+        if self.is_shrink_fn():
+            return op_dtype, op_dtype, torch.float32
+        else:
+            assert self.is_expand_fn() or self.is_expand_slice_fn()
+            return torch.float32, op_dtype, op_dtype
+
+    def matmul_shapes(
+            self, batch_size: int, seq_length: int, hidden_size: int,
+            lora_rank: int, num_loras: int,
+            num_slices: int) -> Tuple[Tuple[int], Tuple[int], Tuple[int]]:
+        """
+        Given num_slices, return the shapes of the A, B, and C matrices
+        in A x B = C, for the op_type
+        """
+        m, k, n = self.mkn(batch_size, seq_length, hidden_size, lora_rank)
+
+        b_shape = (num_loras, n, k)  # col-major
+        if self == OpType.SGMV_SHRINK:
+            # SGMV shrink supports num_slices inherently in the kernel
+            return ((m, k), b_shape, (num_slices, m, n))
+        if self == OpType.SGMV_EXPAND:
+            # SGMV expand supports num_slices inherently in the kernel
+            return ((num_slices, m, k), b_shape, (m, n * num_slices))
+        if self == OpType.BGMV_SHRINK:
+            return ((m, k), b_shape, (m, n))
+        if self == OpType.BGMV_EXPAND:
+            return ((m, k), b_shape, (m, n))
+        if self == OpType.BGMV_EXPAND_SLICE:
+            return ((num_slices, m, k), b_shape, (m, n * num_slices))
+
+        raise ValueError(f"Unrecognized op_type {self}")
+
+    def bench_fn(self) -> Callable:
+
+        def emulate_bgmv_expand_slice(kwargs_list: List[Dict[str, Any]]):
+            for x in kwargs_list:
+                bgmv_expand_slice(**x)
+
+        if self == OpType.SGMV_SHRINK:
+            return sgmv_shrink
+        if self == OpType.SGMV_EXPAND:
+            return sgmv_expand
+        if self == OpType.BGMV_SHRINK:
+            return bgmv_shrink
+        if self == OpType.BGMV_EXPAND:
+            return bgmv_expand
+        if self == OpType.BGMV_EXPAND_SLICE:
+            return emulate_bgmv_expand_slice
+        raise ValueError(f"Unrecognized optype {self}")
+
+    def run_ref_group_gemm(self, output: torch.Tensor, input: torch.Tensor,
+                           lora_weights: List[torch.Tensor],
+                           **kwargs) -> Callable:
+        """Each benchmark operation expected the input, lora_weights and outputs
+           in a slightly different format. Refer to self.matmul_shapes().
+           run_ref_group_gemm accounts for those differences in executing a
+           reference group gemm for correctness testing.
+        """
+        w_dtype = lora_weights[0].dtype
+        num_slices = len(lora_weights)
+        if self == OpType.SGMV_SHRINK:
+            for slice_idx in range(num_slices):
+                ref_group_gemm(ref_out=output[slice_idx, :],
+                               input=input,
+                               lora_weights=lora_weights[slice_idx],
+                               **kwargs)
+        if self == OpType.SGMV_EXPAND:
+            hidden_size = lora_weights[0].shape[1]
+            for slice_idx in range(num_slices):
+                slice_offset = slice_idx * hidden_size
+                ref_group_gemm(
+                    ref_out=output[:, slice_offset:slice_offset + hidden_size],
+                    input=input[slice_idx].clone().to(dtype=w_dtype),
+                    lora_weights=lora_weights[slice_idx],
+                    **kwargs)
+        if self == OpType.BGMV_SHRINK:
+            assert num_slices == 1
+            ref_group_gemm(ref_out=output,
+                           input=input,
+                           lora_weights=lora_weights[0],
+                           **kwargs)
+        if self == OpType.BGMV_EXPAND:
+            assert num_slices == 1
+            ref_group_gemm(ref_out=output,
+                           input=input.clone().to(dtype=w_dtype),
+                           lora_weights=lora_weights[0],
+                           **kwargs)
+        if self == OpType.BGMV_EXPAND_SLICE:
+            hidden_size = lora_weights[0].shape[1]
+            for slice_idx in range(num_slices):
+                slice_offset = slice_idx * hidden_size
+                ref_group_gemm(
+                    ref_out=output[:, slice_offset:slice_offset + hidden_size],
+                    input=input[slice_idx].clone().to(dtype=w_dtype),
+                    lora_weights=lora_weights[slice_idx],
+                    **kwargs)
+        raise ValueError(f"Unrecognized optype {self}")
+
+
+@dataclass
+class BenchmarkContext:
+    """
+    LoRA benchmark context
+    """
+    batch_size: int
+    hidden_size: int
+    num_loras: int
+    num_active_loras: int
+    lora_rank: int
+    sort_by_lora_id: bool
+    dtype: torch.dtype
+    seq_length: Optional[int] = None
+    num_slices: Optional[int] = None  # num_slices for slice based ops
+
+    def with_seq_length(self, seq_length: int) -> "BenchmarkContext":
+        ctx = copy.copy(self)
+        ctx.seq_length = seq_length
+        return ctx
+
+    def with_num_slices(self, num_slices: int) -> "BenchmarkContext":
+        ctx = copy.copy(self)
+        ctx.num_slices = num_slices
+        return ctx
+
+    def bench_label(self) -> str:
+        return f"lora-{self.dtype}"
+
+    def bench_sublabel(self, op_type: OpType) -> str:
+        m, k, n = op_type.mkn(self.batch_size, self.seq_length,
+                              self.hidden_size, self.lora_rank)
+        desc = {
+            'bs': self.batch_size,
+            'sl': self.seq_length,
+            'm': m,
+            'k': k,
+            'n': n,
+            'num_loras': self.num_loras,
+            'sort_by_lora': self.sort_by_lora_id,
+            'num_slices': self.num_slices,
+        }
+        return json.dumps(desc)
+
+
+@dataclass
+class BenchmarkTensors:
+    """
+    Input/Output tensors used for benchmarks
+    """
+    # matmul tensors
+    input: torch.Tensor
+    lora_weights_lst: List[torch.Tensor]
+    output: torch.Tensor
+    # metadata tensors
+    seq_lens: torch.Tensor
+    seq_start_loc: torch.Tensor
+    prompt_lora_mapping: torch.Tensor
+    token_lora_mapping: torch.Tensor
+
+    def io_types(self) -> str:
+        return (f"{dtype_to_str(self.input.dtype)}x"
+                f"{dtype_to_str(self.lora_weights_lst[0].dtype)}=>"
+                f"{dtype_to_str(self.output.dtype)}")
+
+    @staticmethod
+    def make(ctx: BenchmarkContext,
+             op_type: OpType,
+             device: str = "cuda") -> "BenchmarkTensors":
+
+        # Make input / output matmul tensors.
+        a_shape, b_shape, c_shape = op_type.matmul_shapes(
+            ctx.batch_size, ctx.seq_length, ctx.hidden_size, ctx.lora_rank,
+            ctx.num_loras, ctx.num_slices)
+        a_type, b_type, c_type = op_type.matmul_dtypes(ctx.dtype)
+        input_tensor, lora_weights, output_tensor = \
+            make_rand_tensors(a_shape, b_shape, c_shape, a_type, b_type, c_type,
+                              num_slices = ctx.num_slices)
+
+        # Make metadata tensors.
+        # Keep the metadata tensors in the CPU for further processing if needed.
+        # The tensors get moved to the GPU before benchmarking.
+        assert ctx.num_active_loras <= ctx.num_loras
+        total_tokens = ctx.batch_size * ctx.seq_length
+
+        # Prepare seq lens tensor
+        seq_len_tensor = torch.randint(ctx.seq_length, ctx.seq_length + 1,
+                                       (ctx.batch_size, ))
+        # Prepare seq_start_loc tensor
+        seq_start_loc_tensor = torch.cumsum(torch.tensor(
+            [0] + seq_len_tensor[:-1].tolist(), dtype=torch.long),
+                                            dim=0)
+        assert total_tokens == seq_len_tensor.sum()
+        # Prepare prompt lora indices tensor
+        prompt_lora_indices_tensor = make_prompt_lora_mapping(
+            ctx.batch_size, ctx.num_active_loras, ctx.sort_by_lora_id, "cpu")
+        # Prepare token lora indices tensor
+        token_lora_indices_tensor = make_token_lora_mapping(
+            total_tokens, ctx.batch_size, prompt_lora_indices_tensor,
+            seq_len_tensor, "cpu")
+
+        return BenchmarkTensors(input_tensor, lora_weights, output_tensor,
+                                seq_len_tensor, seq_start_loc_tensor,
+                                prompt_lora_indices_tensor,
+                                token_lora_indices_tensor)
+
+    def sanity_check(self) -> None:
+        """
+        Fails asserts when non-conformality is detected.
+        """
+        num_tokens = self.input.shape[-2]
+        # check metadata tensors
+        assert torch.sum(self.seq_lens) == num_tokens
+        num_seqs = self.seq_lens.shape[0]
+        assert self.seq_start_loc.shape[0] == num_seqs
+        assert self.prompt_lora_mapping.shape[0] == num_seqs
+        assert self.token_lora_mapping.shape[0] == num_tokens
+
+    def to_device(self, device: str):
+        """
+        Transfer tensors to device if the tensors aren't already on the device
+        """
+
+        def to_device(tensor: torch.Tensor):
+            if tensor.device != device:
+                tensor = tensor.to(device=device)
+            return tensor
+
+        self.input = to_device(self.input)
+        self.output = to_device(self.output)
+        self.seq_lens = to_device(self.seq_lens)
+        self.seq_start_loc = to_device(self.seq_start_loc)
+        self.prompt_lora_mapping = to_device(self.prompt_lora_mapping)
+        self.token_lora_mapping = to_device(self.token_lora_mapping)
+        for i in range(len(self.lora_weights_lst)):
+            self.lora_weights_lst[i] = to_device(self.lora_weights_lst[i])
+
+    def metadata(self) -> Tuple[int, int, int]:
+        """
+        Return num_seqs, num_tokens and max_seq_len
+        """
+        num_seqs = self.seq_lens.shape[0]
+        num_tokens = self.token_lora_mapping.shape[0]
+        max_seq_len = torch.max(self.seq_lens).item()
+        num_slices = len(self.lora_weights_lst)
+        return num_seqs, num_tokens, max_seq_len, num_slices
+
+    def convert_to_sgmv_benchmark_tensors(self):
+        """
+        For sgmv punica kernels, when consecutive sequences have the
+        same LoRA ID, we just merge them together.
+        This happens in punica.py::compute_metadata
+        """
+
+        # Collapse seq_lens and seq_start_loc
+        _, seq_lens = torch.unique_consecutive(self.token_lora_mapping,
+                                               return_counts=True)
+        cum_result = torch.cumsum(seq_lens, dim=0)
+        seq_start_loc = torch.zeros_like(seq_lens)
+        seq_start_loc[1:].copy_(cum_result[:-1])
+
+        # Collapse prompt mapping
+        prompt_lora_mapping = torch.unique_consecutive(
+            self.prompt_lora_mapping)
+
+        assert torch.sum(seq_lens) == torch.sum(self.seq_lens), \
+         f"dont match - new {torch.sum(seq_lens)} vs {torch.sum(self.seq_lens)}"
+
+        self.prompt_lora_mapping = prompt_lora_mapping.to(
+            dtype=self.prompt_lora_mapping.dtype)
+        self.seq_lens = seq_lens.to(dtype=self.seq_lens.dtype)
+        self.seq_start_loc = seq_start_loc.to(dtype=self.seq_start_loc.dtype)
+
+    def as_sgmv_shrink_kwargs(self) -> Dict[str, Any]:
+        self.convert_to_sgmv_benchmark_tensors()
+        self.sanity_check()
+        self.to_device(self.input.device)
+
+        num_seqs, num_tokens, max_seq_len, num_slices = self.metadata()
+
+        # Sanity check matrix shapes.
+        i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[
+            0].shape, self.output.shape
+        # Expected input shape [num_tokens, hidden_size]
+        assert len(i_shape) == 2
+        assert i_shape[0] == num_tokens
+        hidden_size = i_shape[1]
+        # Expected lora weight shape [num_loras, lora_rank, hidden_size]
+        assert len(lw_shape) == 3
+        assert lw_shape[2] == hidden_size
+        lora_rank = lw_shape[1]
+        # Expected output shape [num_slices, num_tokens, lora_rank]
+        assert len(o_shape) == 3
+        assert o_shape == (num_slices, num_tokens, lora_rank)
+
+        return {
+            'inputs': self.input,
+            'lora_a_weights': self.lora_weights_lst,
+            'output_tensor': self.output,
+            'b_seq_start_loc': self.seq_start_loc,
+            'seq_len_tensor': self.seq_lens,
+            'lora_indices_tensor': self.prompt_lora_mapping,
+            'batches': num_seqs,
+            'max_seq_length': max_seq_len,
+            'token_nums': num_tokens,
+            'scaling': 1.0,
+        }
+
+    def as_sgmv_expand_kwargs(self, add_inputs: bool) -> Dict[str, Any]:
+
+        self.convert_to_sgmv_benchmark_tensors()
+        self.sanity_check()
+        self.to_device(self.input.device)
+
+        num_seqs, num_tokens, max_seq_len, num_slices = self.metadata()
+
+        # Sanity check matrix shapes.
+        i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[
+            0].shape, self.output.shape
+        # Expected input shape : [num_slices, num_tokens, lora_rank]
+        assert len(i_shape) == 3
+        assert i_shape[0] == num_slices
+        assert i_shape[1] == num_tokens
+        lora_rank = i_shape[2]
+        # Expected lora weight shape : [num_lora, hidden_size, lora_rank]
+        assert len(lw_shape) == 3
+        assert lw_shape[2] == lora_rank
+        hidden_size = lw_shape[1]
+        # Expected output shape : [num_tokens, hidden_size * num_slices]
+        assert len(o_shape) == 2
+        assert o_shape == (num_tokens, hidden_size * num_slices)
+
+        return {
+            'inputs': self.input,
+            'lora_b_weights': self.lora_weights_lst,
+            'output_tensor': self.output,
+            'b_seq_start_loc': self.seq_start_loc,
+            'seq_len_tensor': self.seq_lens,
+            'lora_indices_tensor': self.prompt_lora_mapping,
+            'batches': num_seqs,
+            'max_seq_length': max_seq_len,
+            'token_nums': num_tokens,
+            'offset_start': 0,
+            'add_inputs': add_inputs,
+        }
+
+    def as_bgmv_shrink_kwargs(self) -> Dict[str, Any]:
+        assert len(self.lora_weights_lst) == 1
+        self.to_device(self.input.device)
+
+        _, num_tokens, _, _ = self.metadata()
+        # Sanity check shapes
+        i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[
+            0].shape, self.output.shape
+        # Expected input shape [num_tokens, hidden_size]
+        assert len(i_shape) == 2
+        assert i_shape[0] == num_tokens
+        hidden_size = i_shape[1]
+        # Expected lora weight shape [num_loras, lora_rank, hidden_size]
+        assert len(lw_shape) == 3
+        assert lw_shape[2] == hidden_size
+        lora_rank = lw_shape[1]
+        # Expected output shape [num_tokens, lora_rank]
+        assert len(o_shape) == 2
+        assert o_shape == (num_tokens, lora_rank)
+
+        return {
+            'inputs': self.input,
+            'lora_a_weights': self.lora_weights_lst[0],
+            'output_tensor': self.output,
+            'lora_indices_tensor': self.token_lora_mapping,
+            'scaling': 1.0
+        }
+
+    def as_bgmv_expand_kwargs(self, add_inputs: bool):
+        assert len(self.lora_weights_lst) == 1
+        self.to_device(self.input.device)
+
+        _, num_tokens, _, _ = self.metadata()
+        # Sanity check shapes
+        i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[
+            0].shape, self.output.shape
+        # Expected input shape [num_tokens, lora_rank]
+        assert len(i_shape) == 2
+        assert i_shape[0] == num_tokens
+        lora_rank = i_shape[1]
+        # Expected lora weight shape [num_loras, hidden_size, lora_rank]
+        assert len(lw_shape) == 3
+        assert lw_shape[2] == lora_rank
+        hidden_size = lw_shape[1]
+        # Expected output shape [num_tokens, hidden_size]
+        assert len(o_shape) == 2
+        assert o_shape == (num_tokens, hidden_size)
+
+        return {
+            'inputs': self.input,
+            'lora_b_weights': self.lora_weights_lst[0],
+            'output_tensor': self.output,
+            'lora_indices_tensor': self.token_lora_mapping,
+            'add_inputs': add_inputs
+        }
+
+    def as_bgmv_expand_slice_kwargs(self, add_inputs: bool) -> Dict[str, Any]:
+
+        _, num_tokens, _, num_slices = self.metadata()
+        # Sanity check shapes
+        i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[
+            0].shape, self.output.shape
+        # Expected input shape [num_slices, num_tokens, lora_rank]
+        assert len(i_shape) == 3
+        assert i_shape[0] == num_slices
+        assert i_shape[1] == num_tokens
+        lora_rank = i_shape[2]
+        # Expected lora weight shape [num_loras, hidden_size, lora_rank]
+        assert len(lw_shape) == 3
+        assert lw_shape[2] == lora_rank
+        hidden_size = lw_shape[1]
+        # Expected output shape [num_tokens, hidden_size * num_slices]
+        assert len(o_shape) == 2
+        assert o_shape == (num_tokens, hidden_size * num_slices)
+
+        self.to_device(self.input.device)
+
+        kwargs_list = []
+        for i in range(num_slices):
+            kwargs_list.append({
+                'inputs': self.input[i],
+                'lora_b_weights': self.lora_weights_lst[i],
+                'output_tensor': self.output,
+                'lora_indices_tensor': self.token_lora_mapping,
+                'slice_offset': i * hidden_size,
+                'slice_size': hidden_size,
+                'add_inputs': add_inputs,
+            })
+        return {'kwargs_list': kwargs_list}
+
+    def bench_fn_kwargs(self,
+                        op_type: OpType,
+                        add_inputs: Optional[bool] = None) -> Dict[str, Any]:
+        if op_type.is_shrink_fn():
+            assert add_inputs is None
+        else:
+            assert add_inputs is not None
+
+        if op_type == OpType.SGMV_SHRINK:
+            return self.as_sgmv_shrink_kwargs()
+        if op_type == OpType.SGMV_EXPAND:
+            return self.as_sgmv_expand_kwargs(add_inputs)
+        if op_type == OpType.BGMV_SHRINK:
+            return self.as_bgmv_shrink_kwargs()
+        if op_type == OpType.BGMV_EXPAND:
+            return self.as_bgmv_expand_kwargs(add_inputs)
+        if op_type == OpType.BGMV_EXPAND_SLICE:
+            return self.as_bgmv_expand_slice_kwargs(add_inputs)
+        raise ValueError(f"Unrecognized optype {self}")
+
+    def test_correctness(self, op_type: OpType,
+                         expand_fn_add_inputs: Optional[bool]) -> bool:
+        """
+        Test correctness of op_type implementation against a grouped gemm
+        reference implementation.
+        """
+        seq_lens_cpu = self.seq_lens.to(device="cpu")
+        prompt_lora_mapping_cpu = self.prompt_lora_mapping.to(device="cpu")
+        ref_output = self.output.clone()
+
+        self.output.zero_()
+        op_type.bench_fn()(
+            **self.bench_fn_kwargs(op_type, expand_fn_add_inputs))
+
+        op_type.run_ref_group_gemm(
+            ref_output,
+            self.input,
+            self.lora_weights_lst,
+            seq_lens_cpu=seq_lens_cpu,
+            prompt_lora_mapping_cpu=prompt_lora_mapping_cpu,
+            scaling=1.0,
+            add_inputs=expand_fn_add_inputs)
+
+        rtol, atol = {
+            torch.float16: (6e-2, 6e-2),
+            torch.bfloat16: (6e-2, 6e-2),
+            torch.float32: (1e-2, 1e-2),
+        }[self.output.dtype]
+
+        return torch.allclose(ref_output, self.output, rtol=rtol, atol=atol)
+
+
+def bench_optype(ctx: BenchmarkContext,
+                 arg_pool_size: int,
+                 op_type: OpType,
+                 cuda_graph_nops: Optional[int] = None,
+                 expand_fn_add_inputs: Optional[bool] = None,
+                 test_correctness: bool = False) -> TMeasurement:
+
+    assert arg_pool_size >= 1
+    if op_type.is_shrink_fn():
+        assert expand_fn_add_inputs is None
+    else:
+        assert expand_fn_add_inputs is not None
+
+    # BenchmarkContext -> BenchmarkTensors
+    bench_tensors : List[BenchmarkTensors] = \
+        [BenchmarkTensors.make(ctx, op_type) for _ in range(arg_pool_size)]
+    for bt in bench_tensors:
+        bt.sanity_check()
+
+    # Test correctness of our implementation.
+    if test_correctness:
+        assert all([
+            bt.test_correctness(op_type, expand_fn_add_inputs)
+            for bt in bench_tensors
+        ])
+
+    # BenchmarkTensors -> Dict (kwargs)
+    kwargs_list = [
+        bt.bench_fn_kwargs(op_type, add_inputs=expand_fn_add_inputs)
+        for bt in bench_tensors
+    ]
+
+    # Clear LoRA optimization hash-maps.
+    _LORA_A_PTR_DICT.clear()
+    _LORA_B_PTR_DICT.clear()
+    # Run bench function so that _LORA_A_PTR_DICT and _LORA_B_PTR_DICT are setup
+    for kwargs in kwargs_list:
+        op_type.bench_fn()(**kwargs)
+    torch.cuda.synchronize()
+
+    # Merge into a single kwargs and qualify arguments as ArgPool
+    kwargs = {k: ArgPool([]) for k in kwargs_list[0]}
+    for _kwargs in kwargs_list:
+        for k, v in _kwargs.items():
+            kwargs[k].values.append(v)
+
+    describe_args = (f"add_inputs={expand_fn_add_inputs}"
+                     if expand_fn_add_inputs is not None else "")
+    description = (
+        f"{op_type.name}({describe_args}) ({bench_tensors[0].io_types()})")
+
+    cuda_graph_params = None
+    if cuda_graph_nops:
+        cuda_graph_params = CudaGraphBenchParams(cuda_graph_nops)
+    timer = None
+    with Bench(cuda_graph_params,
+               ctx.bench_label(), ctx.bench_sublabel(op_type), description,
+               op_type.bench_fn(), **kwargs) as bench:
+        timer = bench.run()
+    return timer
+
+
+def bench_torch_mm(ctx: BenchmarkContext,
+                   arg_pool_size: int,
+                   op_type: OpType,
+                   cuda_graph_nops: Optional[int] = None) -> TMeasurement:
+    """
+    Benchmark basic torch.mm as a roofline.
+
+    When all the input tokens have the same LoRA ID, the LoRA kernels are just
+    a matmul. This torch.mm benchmark serves as a roofline for that case. 
+
+    input op_type is used in determining the m, k, n dimensions for the matmul.
+    """
+
+    batch_size, hidden_size, lora_rank, seq_length, dtype = (ctx.batch_size,
+                                                             ctx.hidden_size,
+                                                             ctx.lora_rank,
+                                                             ctx.seq_length,
+                                                             ctx.dtype)
+
+    m, k, n = op_type.mkn(batch_size, seq_length, hidden_size, lora_rank)
+    # For a fairer comparison.
+    n = n * ctx.num_slices
+
+    # Get matmul input and output tensors for A x B = C
+    As, Bs, Cs = [], [], []
+    for _ in range(arg_pool_size):
+        As.append(torch.rand((m, k), dtype=dtype).to("cuda"))
+        Bs.append(torch.rand((n, k), dtype=dtype).to("cuda").t())
+        Cs.append(torch.rand((m, n), dtype=dtype).to("cuda"))
+
+    # Make torch.mm kwargs
+    mm_kwargs = {'input': ArgPool(As), 'mat2': ArgPool(Bs), 'out': ArgPool(Cs)}
+
+    description = (
+        f"single-lora roofline using torch.mm ({dtype_to_str(dtype)}"
+        f"x{dtype_to_str(dtype)}"
+        f"=>{dtype_to_str(dtype)})")
+    cuda_graph_params = None
+    if cuda_graph_nops:
+        cuda_graph_params = CudaGraphBenchParams(cuda_graph_nops)
+    with Bench(cuda_graph_params, ctx.bench_label(),
+               ctx.bench_sublabel(op_type), description, torch.mm,
+               **mm_kwargs) as bench:
+        return bench.run()
+
+
+# runner
+def use_cuda_graph_recommendation() -> str:
+    return """
+            Triton kernels have a significant launch overhead with
+            launched directly via python. This overhead is more noticeable
+            for small the problem sizes. For these cases, it is recommended
+            to use the script with `--cuda-graph-nops N` to benchmark N
+            consecutive invocations of the benchmarking operations from 
+            inside a CUDA Graph. Note that the returned measurement is for N 
+            invocations of the operation.
+            """
+
+
+def print_timers(timers: List[TMeasurement],
+                 args: Optional[argparse.Namespace] = None):
+    compare = TBenchmark.Compare(timers)
+    compare.print()
+
+    if args and args.cuda_graph_nops:
+        print(
+            f"Note : The timings reported above is for {args.cuda_graph_nops} "
+            "consecutive invocations of the benchmarking functions. "
+            f"Please divide by {args.cuda_graph_nops} for single invocation "
+            "timings.")
+
+    print("Note on Comparison with torch.mm : The torch.mm numbers are "
+          "benchmark numbers of a simple matmul emulating the single lora "
+          "case. It is provided as a roofline for comparing our LoRA Kernel "
+          "implementations. It is expected that the LoRA kernels will be "
+          "slower than torch.mm in cases where num_loras is big. But for "
+          "small num_loras the goal should be to match the torch.mm numbers.")
+
+
+def run(args: argparse.Namespace, bench_ctxs: List[BenchmarkContext]):
+
+    if args.cuda_graph_nops is not None:
+        assert args.cuda_graph_nops > 0
+        print(f"Benchmarking {args.cuda_graph_nops} invocations inside a CUDA "
+              "Graph")
+    else:
+        print(f"CUDA Graphs not enabled.\n{use_cuda_graph_recommendation()}")
+
+    timers = []
+    for bench_ctx in bench_ctxs:
+        for seq_len in args.seq_lengths:
+            bench_ops: List[OpType] = []
+            if seq_len == 1:
+                # bench all decode ops
+                bench_ops = [op for op in args.op_types if op.is_decode_op()]
+            else:
+                # bench all prefill ops
+                bench_ops = [op for op in args.op_types if op.is_prefill_op()]
+
+            seq_len_timers = []
+            for bench_op in bench_ops:
+                for num_slices in bench_op.num_slices():
+                    _ctx = bench_ctx.with_seq_length(seq_len).with_num_slices(
+                        num_slices)
+                    # Benchmark torch.mm as a roofline
+                    seq_len_timers.append(
+                        bench_torch_mm(_ctx, args.arg_pool_size, bench_op,
+                                       args.cuda_graph_nops))
+
+                    # Benchmark bench_op
+                    expand_fn_add_inputs = [
+                        None
+                    ] if bench_op.is_shrink_fn() else args.expand_fn_add_inputs
+                    for add_input_arg in expand_fn_add_inputs:
+                        seq_len_timers.append(
+                            bench_optype(_ctx, args.arg_pool_size, bench_op,
+                                         args.cuda_graph_nops, add_input_arg,
+                                         args.test_correctness))
+
+            print_timers(seq_len_timers)
+            timers.extend(seq_len_timers)
+
+    # Result stdout dump
+    print("== All Results ====")
+    print_timers(timers, args)
+
+    if args.output_directory:
+        # Result file dump
+        od = Path(args.output_directory)
+        if not od.exists():
+            od.mkdir()
+
+        timestamp = int(time.time())
+        pkl_file = od / f"lora_bench-{timestamp}.pkl"
+        print(f"Writing benchmarks to {pkl_file}")
+        with open(pkl_file, "wb") as f:
+            pickle.dump(timers, f)
+
+
+def as_benchmark_contexts(hidden_sizes: List[int], lora_ranks: List[int],
+                          args: argparse.Namespace) -> List[BenchmarkContext]:
+
+    ctxs: List[BenchmarkContext] = []
+    for batch_size, hidden_size, lora_rank, num_loras, sort_by_lora_id in product(  # noqa
+            args.batch_sizes, list(hidden_sizes), lora_ranks, args.num_loras,
+            args.sort_by_lora_id):
+        ctxs.append(
+            BenchmarkContext(
+                batch_size=batch_size,
+                hidden_size=hidden_size,
+                lora_rank=lora_rank,
+                num_loras=num_loras,
+                num_active_loras=args.num_active_loras
+                if args.num_active_loras else num_loras,
+                # To be filled based on the OpType to benchmark
+                seq_length=None,
+                sort_by_lora_id=sort_by_lora_id,
+                dtype=args.dtype,
+                # To be filled based on the OpType to benchmark
+                num_slices=None))
+
+    return ctxs
+
+
+def run_list_bench(args: argparse.Namespace):
+    print(args)
+
+    print("List bench :\n"
+          f"  Hidden Sizes {args.hidden_sizes}"
+          f"  LoRA Ranks {args.lora_ranks}")
+
+    # Get all benchmarking contexts
+    bench_contexts: List[BenchmarkContext] = as_benchmark_contexts(
+        hidden_sizes=args.hidden_sizes, lora_ranks=args.lora_ranks, args=args)
+
+    run(args, bench_contexts)
+
+
+def run_range_bench(args: argparse.Namespace):
+    print(args)
+
+    hidden_sizes = list(
+        range(args.hidden_sizes_start, args.hidden_sizes_end + 1,
+              args.hidden_sizes_increment))
+    lora_ranks = list(
+        range(args.lora_ranks_start, args.lora_ranks_end + 1,
+              args.lora_ranks_increment))
+
+    print("Range bench :\n"
+          f" Hidden Sizes {hidden_sizes}"
+          f" LoRA Ranks {lora_ranks}")
+
+    # Get all benchmarking contexts
+    bench_contexts: List[BenchmarkContext] = as_benchmark_contexts(
+        hidden_sizes=hidden_sizes, lora_ranks=lora_ranks, args=args)
+
+    run(args, bench_contexts)
+
+
+def run_model_bench(args: argparse.Namespace):
+    print(args)
+
+    def hidden_sizes_from_model(model: str, tp_size: int) -> set[int]:
+        hidden_sizes = set()
+        for KN, tp_split_dim in WEIGHT_SHAPES[model]:
+            KN[tp_split_dim] = KN[tp_split_dim] // tp_size
+            hidden_sizes.add(KN[1])
+        return hidden_sizes
+
+    # Get all hidden sizes
+    hidden_sizes: set[int] = set()
+    for model_name, tp_size in product(args.models, args.tp_sizes):
+        hidden_sizes = hidden_sizes.union(
+            hidden_sizes_from_model(model_name, tp_size))
+
+    print("Model bench :\n"
+          f" Hidden Sizes {hidden_sizes}"
+          f" LoRA Ranks {args.lora_ranks}")
+
+    # Get all benchmarking contexts
+    bench_contexts: List[BenchmarkContext] = as_benchmark_contexts(
+        hidden_sizes=hidden_sizes, lora_ranks=args.lora_ranks, args=args)
+
+    run(args, bench_contexts)
+
+
+if __name__ == '__main__':
+
+    def to_torch_dtype(dt):
+        if dt == "torch.float16":
+            return torch.float16
+        if dt == "torch.bfloat16":
+            return torch.bfloat16
+        raise ValueError("unsupported dtype")
+
+    def get_bool(s: str) -> bool:
+        return s.lower() in ['true', '1']
+
+    def add_common_command_args(p: argparse.ArgumentParser):
+        p.add_argument(
+            "--dtype",
+            type=to_torch_dtype,
+            required=True,
+            help="Available options are ['torch.float16', 'torch.bfloat16']")
+
+        p.add_argument(
+            "--arg-pool-size",
+            type=int,
+            default=32,
+            help="Run profiles with a pool of input/output/meta tensors instead"
+            "of simply reusing the same tensors for all runs. A bigger arg-pool"
+            "mitigates hardware caching effects during benchmarking.")
+
+        p.add_argument(
+            "--cuda-graph-nops",
+            type=int,
+            help=("when set profiling is done using cudagraph, "
+                  "with the given number of operations in a graph."
+                  "Note that the measurement returned is the time "
+                  "taken for N consecutive executions of the benchmarking "
+                  "functions, where N is the value of this argument."))
+        p.add_argument("--num-loras",
+                       nargs="+",
+                       type=int,
+                       default=DEFAULT_NUM_LORAS)
+        p.add_argument("--num-active-loras",
+                       type=int,
+                       default=None,
+                       help="Active LoRAs. When None, all LoRAs are active")
+        p.add_argument("--sort-by-lora-id",
+                       nargs="+",
+                       type=get_bool,
+                       default=DEFAULT_SORT_BY_LORA_IDS)
+        p.add_argument("--op-types",
+                       nargs="+",
+                       type=OpType.from_str,
+                       default=list(OpType))
+        p.add_argument('--seq-lengths',
+                       nargs="+",
+                       type=int,
+                       default=DEFAULT_SEQ_LENGTHS)
+        p.add_argument("--batch-sizes",
+                       nargs="+",
+                       type=int,
+                       default=DEFAULT_BATCH_SIZES)
+        p.add_argument("--expand-fn-add-inputs",
+                       nargs="+",
+                       type=get_bool,
+                       default=DEFAULT_EXPAND_FN_ADD_INPUTS)
+        p.add_argument(
+            '-o',
+            '--output-directory',
+            type=str,
+            help=("Output directory to store a the list of benchmarking"
+                  "TMeasurement objects as a pickle file"))
+
+        p.add_argument(
+            "--test-correctness",
+            action='store_true',
+            help=("When enabled, the benchmarking functions are tested"
+                  "for correctness before the actual benchmarking"))
+
+    parser = FlexibleArgumentParser(
+        description=f"""
+Benchmark LoRA kernels:
+    {use_cuda_graph_recommendation()}
+
+    list_bench example:
+        python3 benchmarks/kernels/benchmark_lora.py list_bench --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --hidden-sizes 2048 --lora-ranks 16 --num-loras 1 4 --op-types bgmv_shrink bgmv_expand sgmv_shrink sgmv_expand bgmv_expand_slice --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32
+
+    model_bench example:
+        python3 benchmarks/kernels/benchmark_lora.py model_bench --models meta-llama/Llama-3-8b  --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16  --lora-ranks 16 --num-loras 1 4 --op-types bgmv_shrink bgmv_expand sgmv_shrink sgmv_expand bgmv_expand_slice --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32 
+
+    range_bench example:
+        python3 benchmarks/kernels/benchmark_lora.py range_bench  --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16   --num-loras 1 4 --op-types bgmv_shrink bgmv_expand sgmv_shrink sgmv_expand bgmv_expand_slice --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32 --hidden-sizes-start 1024 --hidden-sizes-end 4096 --hidden-sizes-increment 1024 --lora-ranks-start 8 --lora-ranks-end 24 --lora-ranks-increment 8 
+            """,  # noqa: E501
+        formatter_class=argparse.RawTextHelpFormatter)
+
+    subparsers = parser.add_subparsers(dest="cmd", required=True)
+
+    list_parser = subparsers.add_parser("list_bench")
+    list_parser.add_argument("--hidden-sizes",
+                             nargs="+",
+                             type=int,
+                             default=DEFAULT_HIDDEN_SIZES)
+    list_parser.add_argument("--lora-ranks",
+                             nargs="+",
+                             type=int,
+                             default=DEFAULT_LORA_RANKS)
+    add_common_command_args(list_parser)
+    list_parser.set_defaults(func=run_list_bench)
+
+    range_parser = subparsers.add_parser("range_bench")
+    range_parser.add_argument("--hidden-sizes-start", type=int, required=True)
+    range_parser.add_argument("--hidden-sizes-end", type=int, required=True)
+    range_parser.add_argument("--hidden-sizes-increment",
+                              type=int,
+                              required=True)
+    range_parser.add_argument("--lora-ranks-start", type=int, required=True)
+    range_parser.add_argument("--lora-ranks-end", type=int, required=True)
+    range_parser.add_argument("--lora-ranks-increment",
+                              type=int,
+                              required=True)
+    add_common_command_args(range_parser)
+    range_parser.set_defaults(func=run_range_bench)
+
+    model_parser = subparsers.add_parser("model_bench")
+    model_parser.add_argument("--models",
+                              nargs="+",
+                              type=str,
+                              default=DEFAULT_MODELS,
+                              choices=WEIGHT_SHAPES.keys())
+    model_parser.add_argument("--tp-sizes",
+                              nargs="+",
+                              type=int,
+                              default=DEFAULT_TP_SIZES)
+    model_parser.add_argument("--lora-ranks",
+                              nargs="+",
+                              type=int,
+                              default=DEFAULT_LORA_RANKS)
+    add_common_command_args(model_parser)
+    model_parser.set_defaults(func=run_model_bench)
+
+    args = parser.parse_args()
+    args.func(args)
diff --git a/benchmarks/kernels/utils.py b/benchmarks/kernels/utils.py
new file mode 100644
index 0000000000000..fee877b6f76fa
--- /dev/null
+++ b/benchmarks/kernels/utils.py
@@ -0,0 +1,210 @@
+import dataclasses
+from typing import Any, Callable, Iterable, Optional
+
+import torch
+import torch.utils.benchmark as TBenchmark
+from torch.utils.benchmark import Measurement as TMeasurement
+
+
+@dataclasses.dataclass
+class CudaGraphBenchParams:
+    num_ops_in_cuda_graph: int
+
+
+@dataclasses.dataclass
+class ArgPool:
+    """
+    When some argument of the benchmarking function is annotated with this type,
+    the benchmarking class (BenchMM) will collapse the argument to a pick a
+    single value from the given list of values, during function invocation.
+    For every invocation during a benchmarking run, it will choose a
+    different value from the list.
+    """
+    values: Iterable[Any]
+
+    def __getitem__(self, index):
+        return self.values[index]
+
+
+class Bench:
+
+    class ArgsIterator:
+
+        def __init__(self, args_list, kwargs_list):
+            assert len(args_list) == len(kwargs_list)
+            self.args_list = args_list
+            self.kwargs_list = kwargs_list
+            self.n = len(self.args_list)
+            self.idx = 0
+
+        def __next__(self):
+            while True:
+                yield (self.args_list[self.idx], self.kwargs_list[self.idx])
+                self.idx += 1
+                self.idx = self.idx % self.n
+
+        def reset(self):
+            self.idx = 0
+
+        @property
+        def n_args(self):
+            return self.n
+
+    def __init__(self, cuda_graph_params: Optional[CudaGraphBenchParams],
+                 label: str, sub_label: str, description: str, fn: Callable,
+                 *args, **kwargs):
+
+        self.cuda_graph_params = cuda_graph_params
+        self.use_cuda_graph = self.cuda_graph_params is not None
+        self.label = label
+        self.sub_label = sub_label
+        self.description = description
+        self.fn = fn
+
+        # Process args
+        self._args = args
+        self._kwargs = kwargs
+        self.args_list, self.kwargs_list = self.collapse_argpool(
+            *args, **kwargs)
+        self.args_iterator = self.ArgsIterator(self.args_list,
+                                               self.kwargs_list)
+
+        # Cudagraph runner
+        self.g = None
+        if self.use_cuda_graph:
+            self.g = self.get_cuda_graph_runner()
+
+        # benchmark run params
+        self.min_run_time = 1
+
+    def collapse_argpool(self, *args, **kwargs):
+        argpool_args = [arg for arg in args if isinstance(arg, ArgPool)] + [
+            arg for arg in kwargs.values() if isinstance(arg, ArgPool)
+        ]
+        if len(argpool_args) == 0:
+            return [args], [kwargs]
+
+        # Make sure all argpools are of the same size
+        argpool_size = len(argpool_args[0].values)
+        assert all([argpool_size == len(arg.values) for arg in argpool_args])
+
+        # create copies of the args
+        args_list = []
+        kwargs_list = []
+        for _ in range(argpool_size):
+            args_list.append(args)
+            kwargs_list.append(kwargs.copy())
+
+        for i in range(argpool_size):
+            # collapse args; Just pick the ith value
+            args_list[i] = tuple([
+                arg[i] if isinstance(arg, ArgPool) else arg
+                for arg in args_list[i]
+            ])
+
+            # collapse kwargs
+            kwargs_i = kwargs_list[i]
+            arg_pool_keys = [
+                k for k, v in kwargs_i.items() if isinstance(v, ArgPool)
+            ]
+            for k in arg_pool_keys:
+                # again just pick the ith value
+                kwargs_i[k] = kwargs_i[k][i]
+            kwargs_list[i] = kwargs_i
+
+        return args_list, kwargs_list
+
+    def get_cuda_graph_runner(self):
+        assert self.use_cuda_graph
+        assert self.args_iterator is not None
+
+        num_graph_ops = self.cuda_graph_params.num_ops_in_cuda_graph
+
+        # warmup
+        args_it = self.args_iterator.__next__()
+        for _ in range(2):
+            args, kwargs = next(args_it)
+            self.fn(*args, **kwargs)
+
+        self.args_iterator.reset()
+        args_it = self.args_iterator.__next__()
+        stream = torch.cuda.Stream()
+        with torch.cuda.stream(stream):
+            g = torch.cuda.CUDAGraph()
+            with torch.cuda.graph(g):
+                for _ in range(num_graph_ops):
+                    args, kwargs = next(args_it)
+                    self.fn(*args, **kwargs)
+        return g
+
+    def run_cudagrah(self) -> TMeasurement:
+        assert self.use_cuda_graph
+        globals = {'g': self.g}
+
+        return TBenchmark.Timer(
+            stmt="g.replay()",
+            globals=globals,
+            label=(
+                f"{self.label}"
+                f" | cugraph {self.cuda_graph_params.num_ops_in_cuda_graph} ops"
+            ),
+            sub_label=self.sub_label,
+            description=self.description,
+        ).blocked_autorange(min_run_time=self.min_run_time)
+
+    def run_eager(self) -> TMeasurement:
+        setup = None
+        stmt = None
+        globals = None
+
+        has_arg_pool = self.args_iterator.n_args > 1
+        if has_arg_pool:
+            setup = '''
+                    args_iterator.reset()
+                    args_it = args_iterator.__next__()
+                    '''
+            stmt = '''
+                    args, kwargs = next(args_it)
+                    fn(*args, **kwargs)
+                    '''
+            globals = {'fn': self.fn, 'args_iterator': self.args_iterator}
+        else:
+            # no arg pool. Just use the args and kwargs directly
+            self.args_iterator.reset()
+            args_it = self.args_iterator.__next__()
+            args, kwargs = next(args_it)
+
+            setup = ""
+            stmt = '''
+                    fn(*args, **kwargs)
+                   '''
+            globals = {'fn': self.fn, 'args': args, 'kwargs': kwargs}
+
+        return TBenchmark.Timer(
+            stmt=stmt,
+            setup=setup,
+            globals=globals,
+            label=self.label,
+            sub_label=self.sub_label,
+            description=self.description,
+        ).blocked_autorange(min_run_time=self.min_run_time)
+
+    def run(self) -> TMeasurement:
+        timer = None
+        if self.use_cuda_graph:  # noqa SIM108
+            timer = self.run_cudagrah()
+        else:
+            timer = self.run_eager()
+        if not timer.meets_confidence() or timer.has_warnings:
+            print("Doesn't meet confidence - re-running bench ...")
+            return self.run()
+        return timer
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        if exc_type:
+            print(f"exc type {exc_type}")
+            print(f"exc value {exc_value}")
+            print(f"exc traceback {traceback}")

From 62b06ba23deaca5a0e7602cd2e3a85aeec57f306 Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Fri, 17 Jan 2025 01:14:48 +0800
Subject: [PATCH 244/462] [Model] Add support for deepseek-vl2-tiny model
 (#12068)

Signed-off-by: Isotr0py <2037008807@qq.com>
---
 docs/source/models/supported_models.md        |  5 ++---
 examples/offline_inference/vision_language.py |  2 +-
 .../vision_language_multi_image.py            |  2 +-
 .../vision_language/test_models.py            | 20 +++++++++----------
 tests/models/registry.py                      |  3 +--
 vllm/model_executor/models/deepseek_vl2.py    | 11 +++++++---
 6 files changed, 22 insertions(+), 21 deletions(-)

diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md
index 85d844f3d3f55..d07cde3db5c6e 100644
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -618,7 +618,7 @@ See [this page](#generative-models) for more information on how to use generativ
 * - `DeepseekVLV2ForCausalLM`
   - DeepSeek-VL2
   - T + I<sup>+</sup>
-  - `deepseek-ai/deepseek-vl2-tiny`(WIP), `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2` etc. (see note)
+  - `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2` etc. (see note)
   -
   - ✅︎
   - ✅︎
@@ -768,9 +768,8 @@ See [this page](#generative-models) for more information on how to use generativ
 <sup>+</sup> Multiple items can be inputted per text prompt for this modality.
 
 ````{note}
-The `deepseek-ai/deepseek-vl2-tiny` is not supported yet.
-
 To use `DeepSeek-VL2` series models, you need to install a fork version `deepseek_vl2` package:
+
 ```shell
 pip install git+https://github.com/Isotr0py/DeepSeek-VL2.git
 ```
diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py
index ad32b9fe242e9..8bc715a50e0db 100644
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -70,7 +70,7 @@ def run_chameleon(question: str, modality: str):
 def run_deepseek_vl2(question: str, modality: str):
     assert modality == "image"
 
-    model_name = "deepseek-ai/deepseek-vl2-small"
+    model_name = "deepseek-ai/deepseek-vl2-tiny"
 
     llm = LLM(model=model_name,
               max_model_len=4096,
diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py
index c6cf3f30c31cb..33ef5f316f040 100644
--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@@ -55,7 +55,7 @@ def load_aria(question, image_urls: List[str]) -> ModelRequestData:
 
 
 def load_deepseek_vl2(question: str, image_urls: List[str]):
-    model_name = "deepseek-ai/deepseek-vl2-small"
+    model_name = "deepseek-ai/deepseek-vl2-tiny"
 
     llm = LLM(model=model_name,
               max_model_len=4096,
diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py
index 7620ed1107e8f..5710303548c34 100644
--- a/tests/models/decoder_only/vision_language/test_models.py
+++ b/tests/models/decoder_only/vision_language/test_models.py
@@ -9,6 +9,7 @@
 
 import pytest
 from transformers import AutoModelForVision2Seq
+from transformers import __version__ as TRANSFORMERS_VERSION
 from transformers.utils import is_flash_attn_2_available
 
 from vllm.platforms import current_platform
@@ -189,30 +190,27 @@
         dtype="bfloat16",
     ),
     "deepseek_vl_v2": VLMTestInfo(
-        models=["deepseek-ai/deepseek-vl2-small"],
+        models=["deepseek-ai/deepseek-vl2-tiny"],
         test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
-        dtype="bfloat16",
         prompt_formatter=lambda img_prompt: f"<|User|>: {img_prompt}\n\n<|Assistant|>: ", # noqa: E501
         max_model_len=4096,
         max_num_seqs=2,
         single_image_prompts=IMAGE_ASSETS.prompts({
-            "stop_sign": "<image>\nWhat's the color of the stop sign and car?",
-            "cherry_blossom": "<image>\nWhat's the color of the tower?",
+            "stop_sign": "<image>\nWhat's the content in the center of the image?", # noqa: E501
+            "cherry_blossom": "<image>\nPlease infer the season with reason in details.",   # noqa: E501
         }),
-        multi_image_prompt="image_1:<image>\nimage_2:<image>\nDescribe the two images shortly.",    # noqa: E501
+        multi_image_prompt="image_1:<image>\nimage_2:<image>\nWhich image can we see the car and the tower?",    # noqa: E501
         vllm_runner_kwargs={"hf_overrides": {"architectures": ["DeepseekVLV2ForCausalLM"]}},  # noqa: E501
-        image_size_factors=[(0.10, 0.15)],
         patch_hf_runner=model_utils.deepseekvl2_patch_hf_runner,
         postprocess_inputs=model_utils.cast_dtype_post_processor("images"),
         hf_output_post_proc=model_utils.deepseekvl2_trunc_hf_output,
         stop_str=["<｜end▁of▁sentence｜>", "<｜begin▁of▁sentence｜>"],  # noqa: E501
-        num_logprobs=5,
+        image_size_factors=[(), (1.0, ), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)],
         marks=[
             pytest.mark.skipif(
-                not is_flash_attn_2_available(),
-                reason="Model needs flash-attn for numeric convergence.",
-            ),
-            large_gpu_mark(min_gb=48),
+                TRANSFORMERS_VERSION >= "4.48.0",
+                reason="HF model is not compatible with transformers>=4.48.0",
+            )
         ],
     ),
     "fuyu": VLMTestInfo(
diff --git a/tests/models/registry.py b/tests/models/registry.py
index b0f0f9767a90f..938c838617e8b 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -181,8 +181,7 @@ class _HfExamplesInfo:
                                     trust_remote_code=True),
     "ChatGLMForConditionalGeneration": _HfExamplesInfo("chatglm2-6b",
                                                        is_available_online=False),
-    # TODO(Isotr0py): Use deepseek-vl2-tiny for test after it's supported
-    "DeepseekVLV2ForCausalLM": _HfExamplesInfo("deepseek-ai/deepseek-vl2-small"),   # noqa: E501
+    "DeepseekVLV2ForCausalLM": _HfExamplesInfo("deepseek-ai/deepseek-vl2-tiny"),   # noqa: E501
     "FuyuForCausalLM": _HfExamplesInfo("adept/fuyu-8b"),
     "H2OVLChatModel": _HfExamplesInfo("h2oai/h2ovl-mississippi-800m"),
     "InternVLChatModel": _HfExamplesInfo("OpenGVLab/InternVL2-1B",
diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py
index 99fa941c055d2..4553695022169 100644
--- a/vllm/model_executor/models/deepseek_vl2.py
+++ b/vllm/model_executor/models/deepseek_vl2.py
@@ -356,13 +356,18 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
                 f"Only 2D tile_tag is supported currently, got: {self.tile_tag}"
             )
 
+        if self.text_config.topk_method == "noaux_tc":
+            architectures = ["DeepseekV3ForCausalLM"]
+        elif not self.text_config.use_mla:
+            architectures = ["DeepseekForCausalLM"]
+        else:
+            architectures = ["DeepseekV2ForCausalLM"]
+
         self.language_model = init_vllm_registered_model(
             vllm_config=vllm_config,
             hf_config=self.text_config,
             prefix=maybe_prefix(prefix, "language"),
-            architectures=["DeepseekV3ForCausalLM"]
-            if self.text_config.topk_method == "noaux_tc" else
-            ["DeepseekV2ForCausalLM"],
+            architectures=architectures,
         )
 
         self.make_empty_intermediate_tensors = (

From d06e824006d1ba4b92871347738ce1b89f658499 Mon Sep 17 00:00:00 2001
From: Chen Zhang <zhangch99@outlook.com>
Date: Fri, 17 Jan 2025 04:30:08 +0800
Subject: [PATCH 245/462] [Bugfix] Set enforce_eager automatically for mllama
 (#12127)

Signed-off-by: Chen Zhang <zhangch99@outlook.com>
---
 examples/offline_inference/vision_language.py             | 1 -
 examples/offline_inference/vision_language_multi_image.py | 1 -
 vllm/config.py                                            | 8 +++++---
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py
index 8bc715a50e0db..69228bbf22949 100644
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -325,7 +325,6 @@ def run_mllama(question: str, modality: str):
         model=model_name,
         max_model_len=4096,
         max_num_seqs=16,
-        enforce_eager=True,
         disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
     )
 
diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py
index 33ef5f316f040..cf3c5dd4e0a2c 100644
--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@@ -186,7 +186,6 @@ def load_mllama(question, image_urls: List[str]) -> ModelRequestData:
         model=model_name,
         max_model_len=4096,
         max_num_seqs=16,
-        enforce_eager=True,
         limit_mm_per_prompt={"image": len(image_urls)},
     )
 
diff --git a/vllm/config.py b/vllm/config.py
index a5f2161068d2a..79754bd04102f 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -607,10 +607,12 @@ def _verify_cuda_graph(self) -> None:
         self.max_seq_len_to_capture = min(self.max_seq_len_to_capture,
                                           self.max_model_len)
 
-        if (self.hf_config.model_type == 'deepseek_v3'
+        MODEL_NOT_SUPPORT_CUDA_GRAPH = ['deepseek_v3', 'mllama']
+        if (self.hf_config.model_type in MODEL_NOT_SUPPORT_CUDA_GRAPH
                 and not self.enforce_eager):
-            logger.warning("CUDA graph is not supported for Deepseek V3 yet, "
-                           "fallback to the eager mode.")
+            logger.warning(
+                "CUDA graph is not supported for %s yet, fallback to the eager "
+                "mode.", self.hf_config.model_type)
             self.enforce_eager = True
 
     def _verify_bnb_config(self) -> None:

From ebc73f2828df48f0ffbb99e52f0e4b394a23dbd3 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Fri, 17 Jan 2025 11:12:41 +0800
Subject: [PATCH 246/462] [Bugfix] Fix a path bug in disaggregated prefill
 example script. (#12121)

Signed-off-by: Kuntai Du <kuntai@uchicago.edu>
---
 examples/online_serving/disaggregated_prefill.sh | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/examples/online_serving/disaggregated_prefill.sh b/examples/online_serving/disaggregated_prefill.sh
index 87155273a81d1..2bb2824c6c86f 100644
--- a/examples/online_serving/disaggregated_prefill.sh
+++ b/examples/online_serving/disaggregated_prefill.sh
@@ -3,6 +3,8 @@
 # We will launch 2 vllm instances (1 for prefill and 1 for decode),
 # and then transfer the KV cache between them.
 
+set -xe
+
 echo "🚧🚧 Warning: The usage of disaggregated prefill is experimental and subject to change 🚧🚧"
 sleep 1
 
@@ -69,7 +71,7 @@ wait_for_server 8200
 #   instance
 # NOTE: the usage of this API is subject to change --- in the future we will 
 # introduce "vllm connect" to connect between prefill and decode instances
-python3 ../benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py &
+python3 ../../benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py &
 sleep 1
 
 # serve two example requests

From fead53ba78dbcdd4da616308f1ef1b4a312f8897 Mon Sep 17 00:00:00 2001
From: Kunshang Ji <kunshang.ji@intel.com>
Date: Fri, 17 Jan 2025 12:15:09 +0800
Subject: [PATCH 247/462] [CI]add genai-perf benchmark in nightly benchmark
 (#10704)

Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
---
 .../scripts/run-nightly-benchmarks.sh         | 107 ++++++++++++++++++
 .../tests/genai-perf-tests.json               |  23 ++++
 requirements-test.in                          |   3 +
 requirements-test.txt                         |  67 ++++++++++-
 4 files changed, 196 insertions(+), 4 deletions(-)
 create mode 100644 .buildkite/nightly-benchmarks/tests/genai-perf-tests.json

diff --git a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
index 3f38cf5137535..32bd34c431c89 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
@@ -301,6 +301,104 @@ run_serving_tests() {
   kill_gpu_processes
 }
 
+run_genai_perf_tests() {
+  # run genai-perf tests 
+
+  # $1: a json file specifying genai-perf test cases
+  local genai_perf_test_file
+  genai_perf_test_file=$1
+
+  # Iterate over genai-perf tests
+  jq -c '.[]' "$genai_perf_test_file" | while read -r params; do
+    # get the test name, and append the GPU type back to it.
+    test_name=$(echo "$params" | jq -r '.test_name')    
+    
+    # if TEST_SELECTOR is set, only run the test cases that match the selector
+    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
+      echo "Skip test case $test_name."
+      continue
+    fi
+    
+    # prepend the current serving engine to the test name
+    test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}
+
+    # get common parameters
+    common_params=$(echo "$params" | jq -r '.common_parameters')
+    model=$(echo "$common_params" | jq -r '.model')
+    tp=$(echo "$common_params" | jq -r '.tp')
+    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
+    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
+    port=$(echo "$common_params" | jq -r '.port')
+    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
+    reuse_server=$(echo "$common_params" | jq -r '.reuse_server')
+
+    # get client and server arguments
+    server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters")
+    qps_list=$(echo "$params" | jq -r '.qps_list')
+    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
+    echo "Running over qps list $qps_list"
+
+    # check if there is enough GPU to run the test
+    if [[ $gpu_count -lt $tp ]]; then
+      echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
+      continue
+    fi
+
+    if [[ $reuse_server == "true" ]]; then
+      echo "Reuse previous server for test case $test_name"
+    else
+      kill_gpu_processes
+      bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \
+        "$server_params" "$common_params"
+    fi
+
+    if wait_for_server; then
+      echo ""
+      echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
+    else
+      echo ""
+      echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period."
+      break
+    fi
+
+    # iterate over different QPS
+    for qps in $qps_list; do
+      # remove the surrounding single quote from qps
+      if [[ "$qps" == *"inf"* ]]; then
+        echo "qps was $qps"
+        qps=$num_prompts
+        echo "now qps is $qps"
+      fi
+    
+      new_test_name=$test_name"_qps_"$qps
+      backend=$CURRENT_LLM_SERVING_ENGINE
+      
+      if [[ "$backend" == *"vllm"* ]]; then
+        backend="vllm"
+      fi
+      #TODO: add output dir.
+      client_command="genai-perf profile \
+        -m $model \
+        --service-kind openai \
+        --backend vllm \
+        --endpoint-type chat \
+        --streaming \
+        --url localhost:$port \
+        --request-rate $qps \
+        --num-prompts $num_prompts \
+      "
+
+    echo "Client command: $client_command"
+
+    eval "$client_command"
+
+    #TODO: process/record outputs
+    done
+  done
+
+  kill_gpu_processes
+
+}
 
 prepare_dataset() {
 
@@ -328,12 +426,17 @@ main() {
 
   pip install -U transformers
 
+  pip install -r requirements-dev.txt
+  which genai-perf
+
   # check storage
   df -h
 
   ensure_installed wget
   ensure_installed curl
   ensure_installed jq
+  # genai-perf dependency
+  ensure_installed libb64-0d
 
   prepare_dataset
 
@@ -345,6 +448,10 @@ main() {
   # run the test
   run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json"
 
+  # run genai-perf tests
+  run_genai_perf_tests "$BENCHMARK_ROOT/tests/genai-perf-tests.json"
+  mv artifacts/ $RESULTS_FOLDER/
+
   # upload benchmark results to buildkite
   python3 -m pip install tabulate pandas
   python3 "$BENCHMARK_ROOT/scripts/summary-nightly-results.py"
diff --git a/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json b/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json
new file mode 100644
index 0000000000000..edbe9f2df0ce0
--- /dev/null
+++ b/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json
@@ -0,0 +1,23 @@
+[
+    {
+        "test_name": "llama8B_tp1_genai_perf",
+        "qps_list": [4,8,16,32],
+        "common_parameters": {
+            "model": "meta-llama/Meta-Llama-3-8B-Instruct",
+            "tp": 1,
+            "port": 8000,
+            "num_prompts": 500,
+            "reuse_server": false
+        },
+        "vllm_server_parameters": {
+            "disable_log_stats": "",
+            "disable_log_requests": "",
+            "gpu_memory_utilization": 0.9,
+            "num_scheduler_steps": 10,
+            "max_num_seqs": 512,
+            "dtype": "bfloat16"
+        },
+        "genai_perf_input_parameters": {
+        }
+    }
+]
\ No newline at end of file
diff --git a/requirements-test.in b/requirements-test.in
index 4b4dc376d1fa5..bc76a91ad5356 100644
--- a/requirements-test.in
+++ b/requirements-test.in
@@ -29,4 +29,7 @@ lm-eval[api]==0.4.4 # required for model evaluation test
 bitsandbytes>=0.45.0
 buildkite-test-collector==0.1.9
 
+genai_perf==0.0.8
+tritonclient==2.51.0
+
 numpy < 2.0.0
diff --git a/requirements-test.txt b/requirements-test.txt
index f576e42afcbbf..09e009c2e21f4 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -37,7 +37,7 @@ audioread==3.0.1
     # via librosa
 awscli==1.35.23
     # via -r requirements-test.in
-bitsandbytes>=0.45.0
+bitsandbytes==0.45.0
     # via -r requirements-test.in
 black==24.10.0
     # via datamodel-code-generator
@@ -75,6 +75,8 @@ colorama==0.4.6
     #   tqdm-multiprocess
 contourpy==1.3.0
     # via matplotlib
+cramjam==2.9.0
+    # via fastparquet
 cupy-cuda12x==13.3.0
     # via ray
 cycler==0.12.1
@@ -109,6 +111,8 @@ email-validator==2.2.0
     # via pydantic
 evaluate==0.4.3
     # via lm-eval
+fastparquet==2024.11.0
+    # via genai-perf
 fastrlock==0.8.2
     # via cupy-cuda12x
 filelock==3.16.1
@@ -130,8 +134,11 @@ fsspec[http]==2024.9.0
     # via
     #   datasets
     #   evaluate
+    #   fastparquet
     #   huggingface-hub
     #   torch
+genai-perf==0.0.8
+    # via -r requirements-test.in
 genson==1.3.0
     # via datamodel-code-generator
 h11==0.14.0
@@ -186,6 +193,8 @@ jsonschema==4.23.0
     #   ray
 jsonschema-specifications==2024.10.1
     # via jsonschema
+kaleido==0.2.1
+    # via genai-perf
 kiwisolver==1.4.7
     # via matplotlib
 lazy-loader==0.4
@@ -200,6 +209,8 @@ lm-eval[api]==0.4.4
     # via -r requirements-test.in
 lxml==5.3.0
     # via sacrebleu
+markdown-it-py==3.0.0
+    # via rich
 markupsafe==3.0.2
     # via jinja2
 matplotlib==3.9.2
@@ -209,6 +220,8 @@ mbstrdecoder==1.1.3
     #   dataproperty
     #   pytablewriter
     #   typepy
+mdurl==0.1.2
+    # via markdown-it-py
 mistral-common[opencv]==1.5.1
     # via
     #   -r requirements-test.in
@@ -249,6 +262,8 @@ numpy==1.26.4
     #   datasets
     #   decord
     #   evaluate
+    #   fastparquet
+    #   genai-perf
     #   librosa
     #   matplotlib
     #   mistral-common
@@ -256,15 +271,18 @@ numpy==1.26.4
     #   numexpr
     #   opencv-python-headless
     #   pandas
+    #   patsy
     #   peft
     #   rouge-score
     #   sacrebleu
     #   scikit-learn
     #   scipy
     #   soxr
+    #   statsmodels
     #   tensorizer
     #   torchvision
     #   transformers
+    #   tritonclient
 nvidia-cublas-cu12==12.4.5.8
     # via
     #   nvidia-cudnn-cu12
@@ -306,30 +324,39 @@ packaging==24.1
     #   datamodel-code-generator
     #   datasets
     #   evaluate
+    #   fastparquet
     #   huggingface-hub
     #   lazy-loader
     #   matplotlib
     #   peft
+    #   plotly
     #   pooch
     #   pytest
     #   pytest-rerunfailures
     #   ray
+    #   statsmodels
     #   transformers
     #   typepy
 pandas==2.2.3
     # via
     #   datasets
     #   evaluate
+    #   fastparquet
+    #   genai-perf
+    #   statsmodels
 pathspec==0.12.1
     # via black
 pathvalidate==3.2.1
     # via pytablewriter
+patsy==1.0.1
+    # via statsmodels
 peft==0.13.2
     # via
     #   -r requirements-test.in
     #   lm-eval
 pillow==10.4.0
     # via
+    #   genai-perf
     #   matplotlib
     #   mistral-common
     #   sentence-transformers
@@ -338,6 +365,8 @@ platformdirs==4.3.6
     # via
     #   black
     #   pooch
+plotly==5.24.1
+    # via genai-perf
 pluggy==1.5.0
     # via pytest
 pooch==1.8.2
@@ -360,7 +389,9 @@ psutil==6.1.0
 py==1.11.0
     # via pytest-forked
 pyarrow==18.0.0
-    # via datasets
+    # via
+    #   datasets
+    #   genai-perf
 pyasn1==0.6.1
     # via rsa
 pybind11==2.13.6
@@ -373,6 +404,8 @@ pydantic[email]==2.9.2
     #   mistral-common
 pydantic-core==2.23.4
     # via pydantic
+pygments==2.18.0
+    # via rich
 pyparsing==3.2.0
     # via matplotlib
 pytablewriter==1.2.0
@@ -381,14 +414,18 @@ pytest==8.3.3
     # via
     #   -r requirements-test.in
     #   buildkite-test-collector
+    #   genai-perf
     #   pytest-asyncio
     #   pytest-forked
+    #   pytest-mock
     #   pytest-rerunfailures
     #   pytest-shard
 pytest-asyncio==0.24.0
     # via -r requirements-test.in
 pytest-forked==1.6.0
     # via -r requirements-test.in
+pytest-mock==3.14.0
+    # via genai-perf
 pytest-rerunfailures==14.0
     # via -r requirements-test.in
 pytest-shard==0.1.2
@@ -399,6 +436,8 @@ python-dateutil==2.9.0.post0
     #   matplotlib
     #   pandas
     #   typepy
+python-rapidjson==1.20
+    # via tritonclient
 pytz==2024.2
     # via
     #   pandas
@@ -409,9 +448,11 @@ pyyaml==6.0.2
     #   awscli
     #   datamodel-code-generator
     #   datasets
+    #   genai-perf
     #   huggingface-hub
     #   peft
     #   ray
+    #   responses
     #   timm
     #   transformers
 ray[adag]==2.40.0
@@ -438,8 +479,13 @@ requests==2.32.3
     #   mistral-common
     #   pooch
     #   ray
+    #   responses
     #   tiktoken
     #   transformers
+responses==0.25.3
+    # via genai-perf
+rich==13.9.4
+    # via genai-perf
 rouge-score==0.1.2
     # via lm-eval
 rpds-py==0.20.1
@@ -470,6 +516,7 @@ scipy==1.13.1
     #   librosa
     #   scikit-learn
     #   sentence-transformers
+    #   statsmodels
 sentence-transformers==3.2.1
     # via -r requirements-test.in
 sentencepiece==0.2.0
@@ -490,6 +537,8 @@ soxr==0.5.0.post1
     # via librosa
 sqlitedict==2.1.0
     # via lm-eval
+statsmodels==0.14.4
+    # via genai-perf
 sympy==1.13.1
     # via torch
 tabledata==1.3.3
@@ -499,7 +548,9 @@ tabulate==0.9.0
 tcolorpy==0.1.6
     # via pytablewriter
 tenacity==9.0.0
-    # via lm-eval
+    # via
+    #   lm-eval
+    #   plotly
 tensorizer==2.9.0
     # via -r requirements-test.in
 threadpoolctl==3.5.0
@@ -540,6 +591,7 @@ tqdm-multiprocess==0.0.11
     # via lm-eval
 transformers==4.47.0
     # via
+    #   genai-perf
     #   lm-eval
     #   peft
     #   sentence-transformers
@@ -548,6 +600,10 @@ transformers-stream-generator==0.0.5
     # via -r requirements-test.in
 triton==3.1.0
     # via torch
+tritonclient==2.51.0
+    # via
+    #   -r requirements-test.in
+    #   genai-perf
 typepy[datetime]==1.3.2
     # via
     #   dataproperty
@@ -555,6 +611,7 @@ typepy[datetime]==1.3.2
     #   tabledata
 typing-extensions==4.12.2
     # via
+    #   bitsandbytes
     #   huggingface-hub
     #   librosa
     #   mistral-common
@@ -563,10 +620,12 @@ typing-extensions==4.12.2
     #   torch
 tzdata==2024.2
     # via pandas
-urllib3==1.26.20
+urllib3==2.2.3
     # via
     #   botocore
     #   requests
+    #   responses
+    #   tritonclient
 word2number==1.1
     # via lm-eval
 xxhash==3.5.0

From 1475847a14e3693128fcc4f8740493d12074ed93 Mon Sep 17 00:00:00 2001
From: Yuan Tang <terrytangyuan@gmail.com>
Date: Thu, 16 Jan 2025 23:45:36 -0500
Subject: [PATCH 248/462] [Doc] Add instructions on using Podman when SELinux
 is active (#12136)

Signed-off-by: Yuan Tang <terrytangyuan@gmail.com>
---
 docs/source/deployment/docker.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/docs/source/deployment/docker.md b/docs/source/deployment/docker.md
index 2606e2765c1ae..438be47316f3b 100644
--- a/docs/source/deployment/docker.md
+++ b/docs/source/deployment/docker.md
@@ -42,6 +42,9 @@ DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai
 By default vLLM will build for all GPU types for widest distribution. If you are just building for the
 current GPU type the machine is running on, you can add the argument `--build-arg torch_cuda_arch_list=""`
 for vLLM to find the current GPU type and build for that.
+
+If you are using Podman instead of Docker, you might need to disable SELinux labeling by
+adding `--security-opt label=disable` when running `podman build` command to avoid certain [existing issues](https://github.com/containers/buildah/discussions/4184).
 ```
 
 ## Building for Arm64/aarch64

From b8bfa46a18abe0bf9f48a29e1e8dd2bc1a79af98 Mon Sep 17 00:00:00 2001
From: Yuan Tang <terrytangyuan@gmail.com>
Date: Thu, 16 Jan 2025 23:54:01 -0500
Subject: [PATCH 249/462] [Bugfix] Fix issues in CPU build Dockerfile (#12135)

Signed-off-by: Yuan Tang <terrytangyuan@gmail.com>
---
 Dockerfile.cpu |  6 +++---
 setup.py       | 10 +++-------
 2 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/Dockerfile.cpu b/Dockerfile.cpu
index f163edc27cba8..ebe226cf6d148 100644
--- a/Dockerfile.cpu
+++ b/Dockerfile.cpu
@@ -26,10 +26,10 @@ RUN pip install intel_extension_for_pytorch==2.5.0
 
 WORKDIR /workspace
 
-COPY requirements-build.txt requirements-build.txt
 ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
 ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
 RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \
     pip install --upgrade pip && \
     pip install -r requirements-build.txt
 
@@ -37,9 +37,9 @@ FROM cpu-test-1 AS build
 
 WORKDIR /workspace/vllm
 
-COPY requirements-common.txt requirements-common.txt
-COPY requirements-cpu.txt requirements-cpu.txt
 RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=bind,src=requirements-common.txt,target=requirements-common.txt \
+    --mount=type=bind,src=requirements-cpu.txt,target=requirements-cpu.txt \
     pip install -v -r requirements-cpu.txt
 
 COPY . .
diff --git a/setup.py b/setup.py
index 7dfcec7f9f0c5..978625a069778 100644
--- a/setup.py
+++ b/setup.py
@@ -472,13 +472,9 @@ def get_gaudi_sw_version():
 
 
 def get_vllm_version() -> str:
-    # TODO: Revisit this temporary approach: https://github.com/vllm-project/vllm/issues/9182#issuecomment-2404860236
-    try:
-        version = get_version(
-            write_to="vllm/_version.py",  # TODO: move this to pyproject.toml
-        )
-    except LookupError:
-        version = "0.0.0"
+    version = get_version(
+        write_to="vllm/_version.py",  # TODO: move this to pyproject.toml
+    )
 
     sep = "+" if "+" not in version else "."  # dev versions might contain +
 

From d1adb9b4032dd430bb28b8e91feb8164c3a1ca9c Mon Sep 17 00:00:00 2001
From: Chen Zhang <zhangch99@outlook.com>
Date: Fri, 17 Jan 2025 13:33:22 +0800
Subject: [PATCH 250/462] [BugFix] add more `is not None` check in
 VllmConfig.__post_init__ (#12138)

Signed-off-by: Chen Zhang <zhangch99@outlook.com>
---
 vllm/config.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/config.py b/vllm/config.py
index 79754bd04102f..ac5a4c91b1738 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -3174,7 +3174,8 @@ def __post_init__(self):
 
         if self.compilation_config is None:
             self.compilation_config = CompilationConfig()
-        if envs.VLLM_USE_V1 and not self.model_config.enforce_eager:
+        if envs.VLLM_USE_V1 and self.model_config is not None and \
+            not self.model_config.enforce_eager:
             # NOTE(woosuk): Currently, we use inductor because the piecewise
             # CUDA graphs do not work properly with the custom CUDA kernels.
             # FIXME(woosuk): Disable inductor to reduce the compilation time

From d75ab55f1035309c96814af46da1c5166209854b Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Fri, 17 Jan 2025 14:34:48 +0800
Subject: [PATCH 251/462] [Misc] Add deepseek_vl2 chat template (#12143)

Signed-off-by: Isotr0py <2037008807@qq.com>
---
 examples/template_deepseek_vl2.jinja | 23 +++++++++++++++++++++++
 tests/entrypoints/test_chat_utils.py |  1 +
 2 files changed, 24 insertions(+)
 create mode 100644 examples/template_deepseek_vl2.jinja

diff --git a/examples/template_deepseek_vl2.jinja b/examples/template_deepseek_vl2.jinja
new file mode 100644
index 0000000000000..fbf3d320094d5
--- /dev/null
+++ b/examples/template_deepseek_vl2.jinja
@@ -0,0 +1,23 @@
+{%- if messages[0]['role'] == 'system' -%}
+    {%- set system_message = messages[0]['content'] -%}
+    {%- set messages = messages[1:] -%}
+{%- else -%}
+    {% set system_message = '' -%}
+{%- endif -%}
+
+{{ bos_token + system_message }}
+{%- for message in messages -%}
+    {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
+        {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}
+    {%- endif -%}
+
+    {%- if message['role'] == 'user' -%}
+        {{ '<|User|>: ' + message['content'] + '\n' }}
+    {%- elif message['role'] == 'assistant' -%}
+        {{ '<|Assistant|>: ' + message['content'] + eos_token + '\n' }}
+    {%- endif -%}
+{%- endfor -%}
+
+{%- if add_generation_prompt -%}
+    {{ '<|Assistant|>: ' }}
+{% endif %}
diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py
index 8f242df4a60e3..513b466c10d60 100644
--- a/tests/entrypoints/test_chat_utils.py
+++ b/tests/entrypoints/test_chat_utils.py
@@ -754,6 +754,7 @@ def test_resolve_content_format_hf_defined(model, expected_format):
      ("template_chatglm.jinja", "string"),
      ("template_chatglm2.jinja", "string"),
      ("template_chatml.jinja", "string"),
+     ("template_deepseek_vl2.jinja", "string"),
      ("template_falcon_180b.jinja", "string"),
      ("template_falcon.jinja", "string"),
      ("template_inkbot.jinja", "string"),

From 8027a724611353d2ff3a504f91c5607e94f635b0 Mon Sep 17 00:00:00 2001
From: Divakar Verma <137818590+divakar-amd@users.noreply.github.com>
Date: Fri, 17 Jan 2025 00:49:16 -0600
Subject: [PATCH 252/462] [ROCm][MoE] moe tuning support for rocm (#12049)

Signed-off-by: Divakar Verma <divakar.verma@amd.com>
---
 benchmarks/kernels/benchmark_moe.py | 272 +++++++++++++++++++++++-----
 1 file changed, 224 insertions(+), 48 deletions(-)

diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py
index 8f538c21f7f7e..1d59a01422412 100644
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -1,6 +1,7 @@
 import argparse
 import time
 from datetime import datetime
+from itertools import product
 from typing import Any, Dict, List, Tuple, TypedDict
 
 import ray
@@ -11,7 +12,10 @@
 
 from vllm.model_executor.layers.fused_moe.fused_moe import *
 from vllm.platforms import current_platform
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils import FlexibleArgumentParser, is_navi
+
+FP8_DTYPE = torch.float8_e4m3fnuz if current_platform.is_rocm(
+) and not is_navi() else torch.float8_e4m3fn
 
 
 class BenchmarkConfig(TypedDict):
@@ -80,8 +84,8 @@ def benchmark_config(
         a1_scale = torch.randn(1, dtype=torch.float32)
         a2_scale = torch.randn(1, dtype=torch.float32)
 
-        w1 = w1.to(torch.float8_e4m3fn)
-        w2 = w2.to(torch.float8_e4m3fn)
+        w1 = w1.to(FP8_DTYPE)
+        w2 = w2.to(FP8_DTYPE)
 
     input_gating = torch.empty(num_tokens, num_experts, dtype=torch.float32)
 
@@ -141,28 +145,172 @@ def run():
     return avg
 
 
-def get_configs_compute_bound() -> List[Dict[str, int]]:
-    # Reduced search space for faster tuning.
-    # TODO(woosuk): Increase the search space and use a performance model to
-    # prune the search space.
+def get_rocm_tuning_space(use_fp16):
+    block_mn_range = [16, 32, 64, 128, 256]
+    block_k_range = [16, 32, 64, 128, 256]
+    if not use_fp16:
+        block_k_range.remove(16)  # BLOCK_K=16 not supported for fp8
+    num_warps_range = [1, 2, 4, 8]
+    group_m_range = [1, 4, 8, 16, 32]
+    num_stage_range = [2]
+    waves_per_eu_range = [0]
+    matrix_instr_nonkdim_range = [16, 32] if use_fp16 else []
+    kpack_range = [1, 2] if use_fp16 else []
+
+    param_ranges = {
+        "BLOCK_SIZE_M": block_mn_range,
+        "BLOCK_SIZE_N": block_mn_range,
+        "BLOCK_SIZE_K": block_k_range,
+        "GROUP_SIZE_M": group_m_range,
+        "num_warps": num_warps_range,
+        "num_stages": num_stage_range,
+        "waves_per_eu": waves_per_eu_range,
+    }
+    if use_fp16:
+        param_ranges["matrix_instr_nonkdim"] = matrix_instr_nonkdim_range
+        param_ranges["kpack"] = kpack_range
+
+    return param_ranges
+
+
+def get_configs_compute_bound(use_fp16) -> List[Dict[str, int]]:
     configs: List[BenchmarkConfig] = []
-    for num_stages in [2, 3, 4, 5]:
-        for block_m in [16, 32, 64, 128, 256]:
-            for block_k in [64, 128, 256]:
-                for block_n in [32, 64, 128, 256]:
-                    for num_warps in [4, 8]:
-                        for group_size in [1, 16, 32, 64]:
-                            configs.append({
-                                "BLOCK_SIZE_M": block_m,
-                                "BLOCK_SIZE_N": block_n,
-                                "BLOCK_SIZE_K": block_k,
-                                "GROUP_SIZE_M": group_size,
-                                "num_warps": num_warps,
-                                "num_stages": num_stages,
-                            })
+
+    if current_platform.is_rocm():
+        param_ranges = get_rocm_tuning_space(use_fp16)
+    else:
+        # Reduced search space for faster tuning.
+        # TODO(woosuk): Increase the search space and use a performance model to
+        # prune the search space.
+        block_m_range = [16, 32, 64, 128, 256]
+        block_n_range = [32, 64, 128, 256]
+        block_k_range = [64, 128, 256]
+        num_warps_range = [4, 8]
+        group_m_range = [1, 16, 32, 64]
+        num_stage_range = [2, 3, 4, 5]
+
+        param_ranges = {
+            "BLOCK_SIZE_M": block_m_range,
+            "BLOCK_SIZE_N": block_n_range,
+            "BLOCK_SIZE_K": block_k_range,
+            "GROUP_SIZE_M": group_m_range,
+            "num_warps": num_warps_range,
+            "num_stages": num_stage_range,
+        }
+
+    keys, values = zip(*param_ranges.items())
+    for config_values in product(*values):
+        config = dict(zip(keys, config_values))
+        configs.append(config)
     return configs
 
 
+def prune_rocm_search_space(num_tokens, shard_intermediate_size, hidden_size,
+                            search_space, is_fp16):
+    N1, K1 = shard_intermediate_size, hidden_size
+    N2, K2 = hidden_size, shard_intermediate_size // 2
+    pruned_space_1 = prune_rocm_configs(num_tokens * 2, N1, K1, search_space,
+                                        is_fp16)
+    pruned_space_2 = prune_rocm_configs(num_tokens * 2, N2, K2, search_space,
+                                        is_fp16)
+    search_space = merge_unique_dicts(pruned_space_1, pruned_space_2)
+    return search_space
+
+
+# The following code is inspired by ROCm/Triton GEMM tuning script:
+# https://github.com/ROCm/triton/blob/triton-mlir/scripts/amd/gemm/tune_gemm.py#L89
+def prune_rocm_configs(M, N, K, configs, is_fp16=True):
+    pruned_configs = []
+    elemBytes_a = 2 if is_fp16 else 1
+    elemBytes_b = 2 if is_fp16 else 1
+
+    mfma = 16 if M < 32 or N < 32 else 32
+
+    # TODO (zhanglx): figure out the boundary between large and small gemms
+    large_gemm = False
+    if M >= 2048 and N >= 2048:
+        large_gemm = True
+
+    for config in configs:
+        BLOCK_SIZE_M = config.get("BLOCK_SIZE_M")
+        BLOCK_SIZE_N = config.get("BLOCK_SIZE_N")
+        BLOCK_SIZE_K = config.get("BLOCK_SIZE_K")
+        num_warps = config.get("num_warps")
+
+        if is_fp16:
+            matrix_instr_nonkdim = config.get("matrix_instr_nonkdim")
+            if matrix_instr_nonkdim > mfma:
+                continue
+        if mfma == 4 and BLOCK_SIZE_K < 64:
+            continue
+        # some layouts could not work properly in case
+        # number elements per thread is less 1
+        if BLOCK_SIZE_M * BLOCK_SIZE_N < 64:
+            continue
+        SPLIT_K = config.get("SPLIT_K", 1)
+        GROUP_M = config.get("GROUP_SIZE_M")
+        if is_fp16:
+            if (matrix_instr_nonkdim > BLOCK_SIZE_M
+                    or matrix_instr_nonkdim > BLOCK_SIZE_N):
+                continue
+            if (matrix_instr_nonkdim >= M
+                    and matrix_instr_nonkdim != BLOCK_SIZE_M):
+                continue
+            if (matrix_instr_nonkdim >= N
+                    and matrix_instr_nonkdim != BLOCK_SIZE_N):
+                continue
+        # Skip BLOCK_SIZE that is too large compare to M/N
+        # unless BLOCK_SIZE is already small enough
+        if M * 2 < BLOCK_SIZE_M and BLOCK_SIZE_M != 16:
+            continue
+        if N * 2 < BLOCK_SIZE_N and BLOCK_SIZE_N != 16:
+            continue
+        # skip large split_k when not necessary
+        if SPLIT_K != 1 and not need_split_k(M, N, K):
+            continue
+        # skip split_k that leads to EVEN_K = false
+        leap = SPLIT_K * BLOCK_SIZE_K
+        modv = K % leap
+        if modv != 0:
+            continue
+        # skip large GROUP_M
+        if GROUP_M * BLOCK_SIZE_M > M and GROUP_M != 1:
+            continue
+        # out of shared memory resource
+        # TODO (zhanglx): This does not consider the LDS usage in the epilogue
+        LDS = (BLOCK_SIZE_K * BLOCK_SIZE_M * elemBytes_a +
+               BLOCK_SIZE_K * BLOCK_SIZE_N * elemBytes_b)
+        if LDS > 65536:
+            continue
+        # Skip small block sizes and num_warps for large gemm
+        # For fp16 and f8, we want to only use BLOCK_SIZE >= 64
+        if large_gemm:
+            if BLOCK_SIZE_M < 64 or BLOCK_SIZE_N < 64:
+                continue
+            if BLOCK_SIZE_K < 64:
+                continue
+            if num_warps < 4:
+                continue
+
+        pruned_configs.append(config)
+
+    return pruned_configs
+
+
+def need_split_k(SIZE_M, SIZE_N, SIZE_K):
+    return (SIZE_M < 64 or SIZE_N < 64) and SIZE_K > 1024
+
+
+def merge_unique_dicts(list1, list2):
+    result = []
+    combined_list = list1.copy()
+    combined_list.extend(list2)
+    for dictionary in combined_list:
+        if dictionary not in result:
+            result.append(dictionary)
+    return result
+
+
 @ray.remote(num_gpus=1)
 class BenchmarkWorker:
 
@@ -170,6 +318,10 @@ def __init__(self, seed: int) -> None:
         torch.set_default_device("cuda")
         current_platform.seed_everything(seed)
         self.seed = seed
+        # Get the device ID to allocate tensors and kernels
+        # on the respective GPU. This is required for Ray to work
+        # correctly with multi-GPU tuning on the ROCm platform.
+        self.device_id = int(ray.get_gpu_ids()[0])
 
     def benchmark(
         self,
@@ -217,25 +369,33 @@ def tune(
     ) -> Dict[str, int]:
         best_config = None
         best_time = float("inf")
-        for config in tqdm(search_space):
-            try:
-                kernel_time = benchmark_config(config,
-                                               num_tokens,
-                                               num_experts,
-                                               shard_intermediate_size,
-                                               hidden_size,
-                                               topk,
-                                               dtype,
-                                               use_fp8_w8a8,
-                                               use_int8_w8a16,
-                                               num_iters=10)
-            except triton.runtime.autotuner.OutOfResources:
-                # Some configurations may be invalid and fail to compile.
-                continue
-
-            if kernel_time < best_time:
-                best_time = kernel_time
-                best_config = config
+        if current_platform.is_rocm():
+            is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16)
+            search_space = prune_rocm_search_space(num_tokens,
+                                                   shard_intermediate_size,
+                                                   hidden_size, search_space,
+                                                   is_fp16)
+
+        with torch.cuda.device(self.device_id):
+            for config in tqdm(search_space):
+                try:
+                    kernel_time = benchmark_config(config,
+                                                   num_tokens,
+                                                   num_experts,
+                                                   shard_intermediate_size,
+                                                   hidden_size,
+                                                   topk,
+                                                   dtype,
+                                                   use_fp8_w8a8,
+                                                   use_int8_w8a16,
+                                                   num_iters=20)
+                except triton.runtime.autotuner.OutOfResources:
+                    # Some configurations may be invalid and fail to compile.
+                    continue
+
+                if kernel_time < best_time:
+                    best_time = kernel_time
+                    best_config = config
         now = datetime.now()
         print(f"{now.ctime()}] Completed tuning for batch_size={num_tokens}")
         assert best_config is not None
@@ -244,12 +404,27 @@ def tune(
 
 def sort_config(config: BenchmarkConfig) -> BenchmarkConfig:
     return {
-        "BLOCK_SIZE_M": config["BLOCK_SIZE_M"],
-        "BLOCK_SIZE_N": config["BLOCK_SIZE_N"],
-        "BLOCK_SIZE_K": config["BLOCK_SIZE_K"],
-        "GROUP_SIZE_M": config["GROUP_SIZE_M"],
-        "num_warps": config["num_warps"],
-        "num_stages": config["num_stages"],
+        "BLOCK_SIZE_M":
+        config["BLOCK_SIZE_M"],
+        "BLOCK_SIZE_N":
+        config["BLOCK_SIZE_N"],
+        "BLOCK_SIZE_K":
+        config["BLOCK_SIZE_K"],
+        "GROUP_SIZE_M":
+        config["GROUP_SIZE_M"],
+        "num_warps":
+        config["num_warps"],
+        "num_stages":
+        config["num_stages"],
+        **({
+            "waves_per_eu": config["waves_per_eu"]
+        } if "waves_per_eu" in config else {}),
+        **({
+            "matrix_instr_nonkdim": config["matrix_instr_nonkdim"]
+        } if "matrix_instr_nonkdim" in config else {}),
+        **({
+            "kpack": config["kpack"]
+        } if "kpack" in config else {}),
     }
 
 
@@ -294,7 +469,7 @@ def main(args: argparse.Namespace):
         shard_intermediate_size = 2 * intermediate_size // args.tp_size
 
     hidden_size = config.hidden_size
-    dtype = config.torch_dtype
+    dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
     use_fp8_w8a8 = args.dtype == "fp8_w8a8"
     use_int8_w8a16 = args.dtype == "int8_w8a16"
 
@@ -322,7 +497,8 @@ def _distribute(method: str, inputs: List[Any]) -> List[Any]:
         return ray.get(outputs)
 
     if args.tune:
-        search_space = get_configs_compute_bound()
+        is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16)
+        search_space = get_configs_compute_bound(is_fp16)
         print(f"Start tuning over {len(search_space)} configurations...")
 
         start = time.time()

From 69d765f5a5bbbe1ea9843be19b9480660fc5bc8b Mon Sep 17 00:00:00 2001
From: Chen Zhang <zhangch99@outlook.com>
Date: Fri, 17 Jan 2025 15:39:35 +0800
Subject: [PATCH 253/462] [V1] Move more control of kv cache initialization
 from model_executor to EngineCore (#11960)

Signed-off-by: Chen Zhang <zhangch99@outlook.com>
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>
---
 tests/v1/test_utils.py                 |  62 +++++++++++++
 vllm/attention/layer.py                |   2 +
 vllm/v1/core/kv_cache_utils.py         | 124 +++++++++++++++++++++++++
 vllm/v1/engine/core.py                 |  31 ++++---
 vllm/v1/executor/abstract.py           |  11 ++-
 vllm/v1/executor/multiproc_executor.py |  25 +++--
 vllm/v1/executor/ray_executor.py       |  40 ++++----
 vllm/v1/executor/uniproc_executor.py   |  25 ++---
 vllm/v1/kv_cache_interface.py          | 111 ++++++++++++++++++++++
 vllm/v1/utils.py                       |  56 ++++++++++-
 vllm/v1/worker/gpu_model_runner.py     |  84 ++++++++++++++---
 vllm/v1/worker/gpu_worker.py           |  48 +++-------
 12 files changed, 515 insertions(+), 104 deletions(-)
 create mode 100644 tests/v1/test_utils.py
 create mode 100644 vllm/v1/kv_cache_interface.py

diff --git a/tests/v1/test_utils.py b/tests/v1/test_utils.py
new file mode 100644
index 0000000000000..ac773b611f406
--- /dev/null
+++ b/tests/v1/test_utils.py
@@ -0,0 +1,62 @@
+from typing import List
+
+import torch
+
+from vllm.v1.utils import bind_kv_cache
+
+
+def test_bind_kv_cache():
+    from vllm.attention import Attention
+
+    ctx = {
+        'layers.0.self_attn': Attention(32, 128, 0.1),
+        'layers.1.self_attn': Attention(32, 128, 0.1),
+        'layers.2.self_attn': Attention(32, 128, 0.1),
+        'layers.3.self_attn': Attention(32, 128, 0.1),
+    }
+    kv_cache = {
+        'layers.0.self_attn': torch.zeros((1, )),
+        'layers.1.self_attn': torch.zeros((1, )),
+        'layers.2.self_attn': torch.zeros((1, )),
+        'layers.3.self_attn': torch.zeros((1, )),
+    }
+    runner_kv_caches: List[torch.Tensor] = []
+    bind_kv_cache(kv_cache, ctx, runner_kv_caches)
+    assert ctx['layers.0.self_attn'].kv_cache[0] is kv_cache[
+        'layers.0.self_attn']
+    assert ctx['layers.1.self_attn'].kv_cache[0] is kv_cache[
+        'layers.1.self_attn']
+    assert ctx['layers.2.self_attn'].kv_cache[0] is kv_cache[
+        'layers.2.self_attn']
+    assert ctx['layers.3.self_attn'].kv_cache[0] is kv_cache[
+        'layers.3.self_attn']
+
+    assert runner_kv_caches[0] is kv_cache['layers.0.self_attn']
+    assert runner_kv_caches[1] is kv_cache['layers.1.self_attn']
+    assert runner_kv_caches[2] is kv_cache['layers.2.self_attn']
+    assert runner_kv_caches[3] is kv_cache['layers.3.self_attn']
+
+
+def test_bind_kv_cache_non_attention():
+    from vllm.attention import Attention
+
+    # example from Jamba PP=2
+    ctx = {
+        'model.layers.20.attn': Attention(32, 128, 0.1),
+        'model.layers.28.attn': Attention(32, 128, 0.1),
+    }
+    kv_cache = {
+        'model.layers.20.attn': torch.zeros((1, )),
+        'model.layers.28.attn': torch.zeros((1, )),
+    }
+
+    runner_kv_caches: List[torch.Tensor] = []
+    bind_kv_cache(kv_cache, ctx, runner_kv_caches)
+
+    assert ctx['model.layers.20.attn'].kv_cache[0] is kv_cache[
+        'model.layers.20.attn']
+    assert ctx['model.layers.28.attn'].kv_cache[0] is kv_cache[
+        'model.layers.28.attn']
+
+    assert runner_kv_caches[0] is kv_cache['model.layers.20.attn']
+    assert runner_kv_caches[1] is kv_cache['model.layers.28.attn']
diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index 9b03fd73fe690..e2403306950a3 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -101,7 +101,9 @@ def __init__(
         self.num_heads = num_heads
         self.head_size = head_size
         self.num_kv_heads = num_kv_heads
+        self.sliding_window = sliding_window
         self.backend = backend_name_to_enum(attn_backend.get_name())
+        self.dtype = dtype
 
         # For cuda-alike (CUDA and ROCM) and cpu platforms, we control how
         # torch.compile works by registering the attention as one giant
diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py
index 22a5d2fb08a48..bab99fe37caee 100644
--- a/vllm/v1/core/kv_cache_utils.py
+++ b/vllm/v1/core/kv_cache_utils.py
@@ -3,7 +3,10 @@
 from dataclasses import dataclass
 from typing import Any, List, NamedTuple, Optional, Tuple
 
+from vllm.config import VllmConfig
 from vllm.logger import init_logger
+from vllm.v1.kv_cache_interface import (KVCacheConfig, KVCacheSpec,
+                                        KVCacheTensor)
 from vllm.v1.request import Request
 
 logger = init_logger(__name__)
@@ -305,3 +308,124 @@ def hash_request_tokens(block_size: int,
         ret.append(block_hash)
         parent_block_hash_value = block_hash.hash_value
     return ret
+
+
+def check_enough_kv_cache_memory(vllm_config: VllmConfig,
+                                 kv_cache_spec: KVCacheSpec,
+                                 available_memory: int):
+    """
+    Checks whether `available_memory` is enough for the KV cache to hold at 
+    least one request with the model's max_model_len.
+
+    Args:
+        vllm_config: The global VllmConfig
+        kv_cache_spec: The kv cache spec of the model
+        available_memory: Memory available for KV cache in bytes.
+
+    Raises:
+        ValueError: If there is not enough memory available for the KV cache.
+    """
+
+    if available_memory <= 0:
+        raise ValueError("No available memory for the cache blocks. "
+                         "Try increasing `gpu_memory_utilization` when "
+                         "initializing the engine.")
+
+    max_model_len = vllm_config.model_config.max_model_len
+    needed_memory = 0
+    for layer_spec in kv_cache_spec.values():
+        needed_memory += layer_spec.bytes_for_tokens(max_model_len)
+
+    if needed_memory > available_memory:
+        raise ValueError(
+            f"To serve at least one request with the models's max seq len "
+            f"({max_model_len}), ({needed_memory/1024/1024/1024:.2f} GB KV "
+            f"cache is needed, which is larger than the available KV cache "
+            f"memory ({available_memory/1024/1024/1024:.2f} GB). Try "
+            f"increasing `gpu_memory_utilization` or decreasing "
+            f"`max_model_len` when initializing the engine.")
+
+
+def is_kv_cache_type_uniform(kv_cache_spec: KVCacheSpec) -> bool:
+    """
+    Whether all layers in the given KVCacheSpec have the same type of KV cache.
+
+    Args:
+        kv_cache_spec: The KVCacheSpec of the model
+
+    Returns:
+        True if all layers have the same type, False otherwise.
+    """
+
+    layer_keys = set(layer.type_id for layer in kv_cache_spec.values())
+    return len(layer_keys) == 1
+
+
+def _get_kv_cache_config_uniform_type(vllm_config: VllmConfig,
+                                      kv_cache_spec: KVCacheSpec,
+                                      available_memory: int) -> KVCacheConfig:
+    """
+    Generates the KV cache configuration for a model with one type of KV cache.
+    Divide the available memory equally among all layers.
+
+    Args:
+        vllm_config: The global VllmConfig
+        kv_cache_spec: The kv cache spec of the model
+        available_memory: Memory available for KV cache in bytes.
+
+    Returns:
+        The generated KVCacheConfig
+    """
+
+    page_sizes = {layer.page_size_bytes for layer in kv_cache_spec.values()}
+    assert len(page_sizes) == 1
+    page_size = page_sizes.pop()
+
+    num_blocks = int(available_memory // page_size // len(kv_cache_spec))
+    num_blocks = max(num_blocks, 0)
+
+    if vllm_config.cache_config.num_gpu_blocks_override is not None:
+        num_gpu_blocks_override = \
+            vllm_config.cache_config.num_gpu_blocks_override
+        logger.info(
+            "Overriding num_gpu_blocks=%d with "
+            "num_gpu_blocks_override=%d", num_blocks, num_gpu_blocks_override)
+        num_blocks = num_gpu_blocks_override
+
+    logger.info("# GPU blocks: %d", num_blocks)
+
+    per_layer_size = page_size * num_blocks
+
+    kv_cache_config = KVCacheConfig(
+        num_blocks=num_blocks,
+        tensors={
+            layer_name: KVCacheTensor(size=per_layer_size)
+            for layer_name in kv_cache_spec
+        },
+        groups=[[layer_name for layer_name in kv_cache_spec]],
+        kv_cache_spec=kv_cache_spec)
+    return kv_cache_config
+
+
+def get_kv_cache_config(vllm_config: VllmConfig, kv_cache_spec: KVCacheSpec,
+                        available_memory: int) -> KVCacheConfig:
+    """
+    Generates the KV cache configuration for a model
+    TODO: support hybrid models with more than one type of KV cache.
+
+    Args:
+        vllm_config: The global VllmConfig
+        kv_cache_spec: The kv cache spec of the model
+        available_memory: Memory available for KV cache in bytes.
+
+    Returns:
+        The generated KVCacheConfig
+    """
+    check_enough_kv_cache_memory(vllm_config, kv_cache_spec, available_memory)
+    if is_kv_cache_type_uniform(kv_cache_spec):
+        # KV cache of all layers are the same, which is true for most models.
+        # Allocate the same amount of memory for each layer.
+        return _get_kv_cache_config_uniform_type(vllm_config, kv_cache_spec,
+                                                 available_memory)
+    else:
+        raise NotImplementedError
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index ef616229aa57b..26ebc7edcf03e 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -11,11 +11,12 @@
 import zmq.asyncio
 from msgspec import msgpack
 
-from vllm.config import CacheConfig, VllmConfig
+from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.transformers_utils.config import (
     maybe_register_config_serialize_by_value)
 from vllm.utils import get_exception_traceback, zmq_socket_ctx
+from vllm.v1.core.kv_cache_utils import get_kv_cache_config
 from vllm.v1.core.scheduler import Scheduler
 from vllm.v1.engine import (EngineCoreOutputs, EngineCoreProfile,
                             EngineCoreRequest, EngineCoreRequestType,
@@ -49,7 +50,7 @@ def __init__(
 
         # Setup KV Caches and update CacheConfig after profiling.
         num_gpu_blocks, num_cpu_blocks = self._initialize_kv_caches(
-            vllm_config.cache_config)
+            vllm_config)
         vllm_config.cache_config.num_gpu_blocks = num_gpu_blocks
         vllm_config.cache_config.num_cpu_blocks = num_cpu_blocks
 
@@ -65,21 +66,25 @@ def __init__(
             vllm_config.model_config)
 
     def _initialize_kv_caches(self,
-                              cache_config: CacheConfig) -> Tuple[int, int]:
+                              vllm_config: VllmConfig) -> Tuple[int, int]:
         start = time.time()
-        num_gpu_blocks, _ = self.model_executor.determine_num_available_blocks(
-        )
 
-        if cache_config.num_gpu_blocks_override is not None:
-            num_gpu_blocks_override = cache_config.num_gpu_blocks_override
-            logger.info(
-                "Overriding num_gpu_blocks=%d with "
-                "num_gpu_blocks_override=%d", num_gpu_blocks,
-                num_gpu_blocks_override)
-            num_gpu_blocks = num_gpu_blocks_override
+        # Get all kv cache needed by the model
+        kv_cache_spec = self.model_executor.get_kv_cache_spec()
+
+        # Profiles the peak memory usage of the model to determine how much
+        # memory can be allocated for kv cache.
+        availble_gpu_memory = self.model_executor.determine_available_memory()
 
+        # Get the kv cache tensor size
+        kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec,
+                                              availble_gpu_memory)
+        num_gpu_blocks = kv_cache_config.num_blocks
         num_cpu_blocks = 0
-        self.model_executor.initialize(num_gpu_blocks)
+
+        # Initialize kv cache and warmup the execution
+        self.model_executor.initialize(kv_cache_config)
+
         elapsed = time.time() - start
         logger.info(("init engine (profile, create kv cache, "
                      "warmup model) took %.2f seconds"), elapsed)
diff --git a/vllm/v1/executor/abstract.py b/vllm/v1/executor/abstract.py
index 7c17f60510ae1..5240778ebf330 100644
--- a/vllm/v1/executor/abstract.py
+++ b/vllm/v1/executor/abstract.py
@@ -1,7 +1,8 @@
 from abc import ABC, abstractmethod
-from typing import Tuple, Type
+from typing import Type
 
 from vllm.config import VllmConfig
+from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec
 from vllm.v1.outputs import ModelRunnerOutput
 
 
@@ -31,11 +32,15 @@ def __init__(self, vllm_config: VllmConfig) -> None:
         raise NotImplementedError
 
     @abstractmethod
-    def initialize(self, num_gpu_blocks: int) -> None:
+    def initialize(self, kv_cache_config: KVCacheConfig) -> None:
         raise NotImplementedError
 
     @abstractmethod
-    def determine_num_available_blocks(self) -> Tuple[int, int]:
+    def determine_available_memory(self) -> int:  # in bytes
+        raise NotImplementedError
+
+    @abstractmethod
+    def get_kv_cache_spec(self) -> KVCacheSpec:
         raise NotImplementedError
 
     @abstractmethod
diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py
index e111ac7ee8183..e92acc7cb5e41 100644
--- a/vllm/v1/executor/multiproc_executor.py
+++ b/vllm/v1/executor/multiproc_executor.py
@@ -23,6 +23,7 @@
 from vllm.utils import (get_distributed_init_method, get_mp_context,
                         get_open_port, get_open_zmq_ipc_path, zmq_socket_ctx)
 from vllm.v1.executor.abstract import Executor
+from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec
 from vllm.v1.outputs import ModelRunnerOutput
 from vllm.worker.worker_base import WorkerWrapperBase
 
@@ -90,29 +91,33 @@ def sigusr1_handler(signum, frame):
         for w in self.workers:
             w.worker_response_mq.wait_until_ready()
 
-    def initialize(self, num_gpu_blocks: int) -> None:
+    def initialize(self, kv_cache_config: KVCacheConfig) -> None:
         """
         Initialize the KV caches and begin the model execution loop of the
         underlying workers.
         """
-        logger.info("# GPU blocks: %d", num_gpu_blocks)
-        self.collective_rpc("initialize_cache", args=(num_gpu_blocks, ))
+        self.collective_rpc("initialize_cache", args=(kv_cache_config, ))
         self.collective_rpc("compile_or_warm_up_model")
 
-    def determine_num_available_blocks(self) -> Tuple[int, int]:
+    def determine_available_memory(self) -> int:
         """
-        Determine the number of available KV blocks by invoking the
+        Determine the available memory (in bytes) for KV cache by invoking the
         underlying worker.
         """
-        num_blocks = self.collective_rpc("determine_num_available_blocks")
+        memory_sizes = self.collective_rpc("determine_available_memory")
 
         # Since we use a shared centralized controller, we take the minimum
-        # number of blocks across all workers to make sure all the memory
+        # memory size across all workers to make sure all the memory
         # operators can be applied to all workers.
-        num_gpu_blocks = min(b[0] for b in num_blocks)
-        num_cpu_blocks = min(b[1] for b in num_blocks)
+        return min(memory_sizes)
 
-        return num_gpu_blocks, num_cpu_blocks
+    def get_kv_cache_spec(self) -> KVCacheSpec:
+        """
+        Get all kv cache needed by the model by invoking the underlying worker.
+        """
+        kv_cache_specs = self.collective_rpc("get_kv_cache_spec")
+        assert all(s == kv_cache_specs[0] for s in kv_cache_specs)
+        return kv_cache_specs[0]
 
     def collective_rpc(self,
                        method: str,
diff --git a/vllm/v1/executor/ray_executor.py b/vllm/v1/executor/ray_executor.py
index 79acc60001c99..fd67fa2235770 100644
--- a/vllm/v1/executor/ray_executor.py
+++ b/vllm/v1/executor/ray_executor.py
@@ -10,6 +10,7 @@
 from vllm.v1.executor.abstract import Executor
 from vllm.v1.executor.ray_utils import (RayWorkerWrapper,
                                         initialize_ray_cluster, ray)
+from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec
 from vllm.v1.outputs import ModelRunnerOutput
 
 if ray is not None:
@@ -211,39 +212,40 @@ def _get_worker_kwargs(
             distributed_init_method=distributed_init_method,
         )
 
-    def determine_num_available_blocks(self) -> Tuple[int, int]:
+    def determine_available_memory(self) -> int:
         """
-        Determine the number of available KV blocks.
+        Determine the available GPU memory in bytes.
         
-        This invokes `determine_num_available_blocks` on each worker and takes
+        This invokes `determine_available_memory` on each worker and takes
         the min of the results, guaranteeing that the selected cache sizes are
         compatible with all workers.
-        
-        Returns:
-            - tuple[num_gpu_blocks, num_cpu_blocks]
         """
-        # Get the maximum number of blocks that can be allocated on GPU and CPU.
-        num_blocks = self._run_workers("determine_num_available_blocks")
+
+        memory_sizes = self._run_workers("determine_available_memory")
 
         # Since we use a shared centralized controller, we take the minimum
-        # number of blocks across all workers to make sure all the memory
+        # memory size across all workers to make sure all the memory
         # operators can be applied to all workers.
-        num_gpu_blocks = min(b[0] for b in num_blocks)
-        num_cpu_blocks = min(b[1] for b in num_blocks)
+        return min(memory_sizes)
 
-        return num_gpu_blocks, num_cpu_blocks
-
-    def initialize(self, num_gpu_blocks: int) -> None:
+    def initialize(self, kv_cache_config: KVCacheConfig) -> None:
         """
         Initialize the KV cache in all workers.
         """
-        # NOTE: This is logged in the executor because there can be >1 worker
-        # with other executors. We could log in the engine level, but work
-        # remains to abstract away the device for non-GPU configurations.
-        logger.info("# GPU blocks: %d", num_gpu_blocks)
-        self._run_workers("initialize_cache", num_gpu_blocks)
+        self._run_workers("initialize_cache", kv_cache_config)
         self._run_workers("compile_or_warm_up_model")
 
+    def get_kv_cache_spec(self) -> KVCacheSpec:
+        """
+        Get all kv cache needed by the model
+        
+        This invokes `get_kv_cache_spec` on each worker and asserts that
+        they are identical. The KVCacheSpec is then returned.
+        """
+        kv_cache_specs = self._run_workers("get_kv_cache_spec")
+        assert all(s == kv_cache_specs[0] for s in kv_cache_specs)
+        return kv_cache_specs[0]
+
     def _run_workers(
         self,
         method: str,
diff --git a/vllm/v1/executor/uniproc_executor.py b/vllm/v1/executor/uniproc_executor.py
index c63d7a4c47c15..b3997caac726b 100644
--- a/vllm/v1/executor/uniproc_executor.py
+++ b/vllm/v1/executor/uniproc_executor.py
@@ -1,10 +1,11 @@
 import os
-from typing import Optional, Tuple
+from typing import Optional
 
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.utils import get_distributed_init_method, get_ip, get_open_port
 from vllm.v1.executor.abstract import Executor
+from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec
 from vllm.v1.outputs import ModelRunnerOutput
 from vllm.v1.worker.gpu_worker import Worker
 
@@ -49,20 +50,22 @@ def _create_worker(
             distributed_init_method=distributed_init_method,
         )
 
-    def determine_num_available_blocks(self) -> Tuple[int, int]:
-        """Determine the number of available KV blocks by invoking the
-        underlying worker.
+    def determine_available_memory(self) -> int:
+        """Determine the available memory (in bytes) for KV cache by invoking 
+        the underlying worker.
         """
-        return self.worker.determine_num_available_blocks()
+        return self.worker.determine_available_memory()
 
-    def initialize(self, num_gpu_blocks: int) -> None:
+    def get_kv_cache_spec(self) -> KVCacheSpec:
+        """Get all kv cache needed by the model by invoking the underlying
+        worker.
+        """
+        return self.worker.get_kv_cache_spec()
+
+    def initialize(self, kv_cache_config: KVCacheConfig) -> None:
         """Initialize the KV cache by invoking the underlying worker.
         """
-        # NOTE: This is logged in the executor because there can be >1 worker
-        # with other executors. We could log in the engine level, but work
-        # remains to abstract away the device for non-GPU configurations.
-        logger.info("# GPU blocks: %d", num_gpu_blocks)
-        self.worker.initialize_cache(num_gpu_blocks)
+        self.worker.initialize_cache(kv_cache_config)
         self.worker.compile_or_warm_up_model()
 
     def execute_model(
diff --git a/vllm/v1/kv_cache_interface.py b/vllm/v1/kv_cache_interface.py
new file mode 100644
index 0000000000000..6d5cc32ffc5b8
--- /dev/null
+++ b/vllm/v1/kv_cache_interface.py
@@ -0,0 +1,111 @@
+from dataclasses import dataclass
+from typing import Dict, List
+
+import torch
+
+from vllm.logger import init_logger
+from vllm.utils import cdiv, get_dtype_size
+
+logger = init_logger(__name__)
+
+
+@dataclass
+class KVCacheSpecBase:
+    """
+    A base class for specifying the KV cache format of one layer.
+    """
+
+    # number of tokens in a block
+    block_size: int
+
+    @property
+    def type_id(self) -> str:
+        """
+        The type identifier of this KV cache.
+        Return different strings for layers with different KV cache type (e.g., 
+        different number of tokens like full attention vs sliding window 
+        attention, different KV cache size per token like layers with different 
+        number of heads)
+
+        Returns:
+            The type identifier of this KV cache.
+        """
+        raise NotImplementedError
+
+    @property
+    def page_size_bytes(self) -> int:
+        """
+        The size of a page with `block_size` tokens in bytes.
+
+        Returns:
+            The page size
+        """
+        raise NotImplementedError
+
+    def bytes_for_tokens(self, num_tokens: int) -> int:
+        """
+        The KV cache size for `num_tokens` tokens in bytes. Returns the real
+        memory size after padding `num_tokens` to full blocks.
+
+        Returns:
+            The KV cache size
+        """
+        raise NotImplementedError
+
+
+@dataclass
+class FullAttentionSpec(KVCacheSpecBase):
+    num_kv_heads: int
+    head_size: int
+    dtype: torch.dtype
+
+    @property
+    def type_id(self) -> str:
+        return f"full_attention_{self.block_size}_{self.page_size_bytes}"
+
+    @property
+    def page_size_bytes(self) -> int:
+        return  2 * self.block_size * self.num_kv_heads * self.head_size \
+                * get_dtype_size(self.dtype)
+
+    def bytes_for_tokens(self, num_tokens: int) -> int:
+        return cdiv(num_tokens, self.block_size) * self.page_size_bytes
+
+
+KVCacheSpec = Dict[str, KVCacheSpecBase]
+
+
+@dataclass
+class KVCacheTensor:
+    """
+    A dataclass for specifying how the workers should initialize the KV cache
+    for a layer. Only contains the size of KV cache for that layer for now. Will
+    be extended to support multiple layers sharing the same memory pool.
+    """
+    size: int  # The size of KV cache Tensor in bytes
+
+
+@dataclass
+class KVCacheConfig:
+    """
+    The KV cache configuration of a model.
+    """
+    """The number of KV cache blocks"""
+    num_blocks: int
+    """layer_name -> how to initialize KV cache for that layer"""
+    tensors: Dict[str, KVCacheTensor]
+    """
+    A list of kv-cache groups. Each group includes a set of layers with
+    the same kv-cache spec, and the total page_size of layers inside a group
+    is same across all groups (as the KVCacheManager only supports allocating
+    pages of the same size). For example:
+    1. A model only uses full attention: one group with all layers in the model.
+    2. (not implemented yet) A model with the same number of full attention
+    layers and sliding window attention layers: two groups, one for full
+    attention layers and one for sliding window attention layers.
+    3. (not implemented yet) A model with 2 full attention layers and 4 sliding 
+    window attention layers: three groups, (full * 2), (sw * 2), (sw * 2).
+    """
+    groups: List[List[str]]
+    """the KVCacheSpec of the model"""
+    kv_cache_spec: KVCacheSpec
diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py
index b0a7affbebb7e..8dfcf2dd78606 100644
--- a/vllm/v1/utils.py
+++ b/vllm/v1/utils.py
@@ -1,13 +1,20 @@
 import multiprocessing
 import os
 import weakref
+from collections import defaultdict
 from collections.abc import Sequence
-from typing import (Any, Callable, Dict, Generic, List, Optional, TypeVar,
-                    Union, overload)
+from typing import (TYPE_CHECKING, Any, Callable, Dict, Generic, List,
+                    Optional, TypeVar, Union, overload)
+
+import torch
 
 from vllm.logger import init_logger
+from vllm.model_executor.models.utils import extract_layer_index
 from vllm.utils import get_mp_context, kill_process_tree
 
+if TYPE_CHECKING:
+    from vllm.attention.layer import Attention
+
 logger = init_logger(__name__)
 
 T = TypeVar("T")
@@ -134,3 +141,48 @@ def shutdown(proc: multiprocessing.Process, input_path: str, output_path: str):
         socket_file = ipc_socket.replace("ipc://", "")
         if os and os.path.exists(socket_file):
             os.remove(socket_file)
+
+
+def bind_kv_cache(
+    kv_caches: Dict[str, torch.Tensor],
+    forward_context: Dict[str, "Attention"],
+    runner_kv_caches: List[torch.Tensor],
+) -> None:
+    """
+    Bind the allocated KV cache to both ModelRunner and forward context so
+    that the KV cache can be used in the forward pass.
+
+    This function:
+      1) Fills the ModelRunner's kv cache list (`runner_kv_caches`) with
+         kv_caches.
+      2) Associates each attention layer in the `forward_context` with its 
+         corresponding KV cache in kv_caches.
+
+    Args:
+        kv_caches: The allocated kv_caches with layer names as keys.
+        forward_context: The global forward context containing all Attention 
+        layers with layer names as keys.
+        runner_kv_caches: The kv_cache declared by ModelRunner.
+    """
+    # Bind kv_caches to ModelRunner
+    assert len(runner_kv_caches) == 0
+
+    # Convert kv_caches dict to a list of tensors in the order of layer_index.
+    index2name = defaultdict(list)
+    for layer_name in kv_caches:
+        index2name[extract_layer_index(layer_name)].append(layer_name)
+
+    for layer_index in sorted(index2name.keys()):
+        layer_names = index2name[layer_index]
+        if len(layer_names) > 1:
+            # One typical case is encoder-decoder model, e.g., bart.
+            # The cross attention and self attention in the same decoder layer
+            # has different layer_name but the same layer_index.
+            raise NotImplementedError
+        layer_name = layer_names[0]
+        runner_kv_caches.append(kv_caches[layer_name])
+
+    # Bind kv_caches to forward context
+    for layer_name, kv_cache in kv_caches.items():
+        # NOTE: Use list because of v0 PP virtual engine.
+        forward_context[layer_name].kv_cache = [kv_cache]
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index de83640b27cd6..aa63d9414c296 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -7,6 +7,8 @@
 import torch.distributed
 import torch.nn as nn
 
+from vllm.attention.backends.abstract import AttentionType
+from vllm.attention.layer import Attention
 from vllm.config import CompilationLevel, VllmConfig
 from vllm.distributed.parallel_state import graph_capture
 from vllm.forward_context import set_forward_context
@@ -16,14 +18,16 @@
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
 from vllm.sampling_params import SamplingType
 from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler,
-                        LayerBlockType, bind_kv_cache, cdiv,
-                        is_pin_memory_available)
+                        LayerBlockType, cdiv, is_pin_memory_available)
 from vllm.v1.attention.backends.flash_attn import (FlashAttentionBackend,
                                                    FlashAttentionMetadata)
 from vllm.v1.core.encoder_cache_manager import compute_encoder_budget
 from vllm.v1.engine.mm_input_mapper import MMInputMapperClient
+from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
+                                        KVCacheSpec)
 from vllm.v1.outputs import ModelRunnerOutput
 from vllm.v1.sample.metadata import SamplingMetadata
+from vllm.v1.utils import bind_kv_cache
 from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch
 
 if TYPE_CHECKING:
@@ -856,15 +860,71 @@ def capture_model(self) -> None:
         logger.info("Graph capturing finished in %.0f secs, took %.2f GiB",
                     elapsed_time, cuda_graph_size / (1 << 30))
 
-    def initialize_kv_cache(self, num_blocks: int) -> None:
-        assert len(self.kv_caches) == 0
-        kv_cache_shape = FlashAttentionBackend.get_kv_cache_shape(
-            num_blocks, self.block_size, self.num_kv_heads, self.head_size)
-        for _ in range(self.num_attn_layers):
-            self.kv_caches.append(
-                torch.zeros(kv_cache_shape,
-                            dtype=self.kv_cache_dtype,
-                            device=self.device))
+    def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None:
+        """
+        Initialize KV cache based on `kv_cache_config`.
+        Args:
+            kv_cache_config: Configuration for the KV cache, including the KV 
+            cache size of each layer
+        """
+        if len(kv_cache_config.groups) > 1:
+            raise NotImplementedError(
+                "Hybrid models with more than one KV cache type are not "
+                "supported yet.")
+
+        kv_caches: Dict[str, torch.Tensor] = {}
+
+        for layer_name, layer_spec in kv_cache_config.kv_cache_spec.items():
+            tensor_config = kv_cache_config.tensors[layer_name]
+            assert tensor_config.size % layer_spec.page_size_bytes == 0
+            num_blocks = tensor_config.size // layer_spec.page_size_bytes
+            if isinstance(layer_spec, FullAttentionSpec):
+                kv_cache_shape = FlashAttentionBackend.get_kv_cache_shape(
+                    num_blocks, layer_spec.block_size, layer_spec.num_kv_heads,
+                    layer_spec.head_size)
+                dtype = layer_spec.dtype
+                kv_caches[layer_name] = torch.zeros(kv_cache_shape,
+                                                    dtype=dtype,
+                                                    device=self.device)
+            else:
+                raise NotImplementedError
+
         bind_kv_cache(
+            kv_caches,
             self.vllm_config.compilation_config.static_forward_context,
-            [self.kv_caches])
+            self.kv_caches)
+
+    def get_kv_cache_spec(self) -> KVCacheSpec:
+        """
+        Generates the KVCacheSpec by parsing the kv cache format from each 
+        Attention module in the static forward context.
+        Returns:
+            KVCacheSpec: A dictionary mapping layer names to their KV cache 
+            format. Layers that do not need KV cache are not included.
+        """
+
+        forward_ctx = self.vllm_config.compilation_config.static_forward_context
+        block_size = self.vllm_config.cache_config.block_size
+        kv_cache_spec: KVCacheSpec = {}
+        for layer_name, attn_module in forward_ctx.items():
+            # TODO: Support other attention modules, e.g., sliding window,
+            # cross-attention, MLA.
+            assert isinstance(attn_module, Attention)
+            if attn_module.attn_type == AttentionType.DECODER:
+                kv_cache_spec[layer_name] = FullAttentionSpec(
+                    block_size=block_size,
+                    num_kv_heads=attn_module.num_kv_heads,
+                    head_size=attn_module.head_size,
+                    dtype=attn_module.dtype,
+                )
+            elif attn_module.attn_type in (AttentionType.ENCODER,
+                                           AttentionType.ENCODER_ONLY):
+                # encoder-only attention does not need KV cache.
+                continue
+            elif attn_module.attn_type == AttentionType.ENCODER_DECODER:
+                raise NotImplementedError
+            else:
+                raise ValueError(
+                    f"Unknown attention type: {attn_module.attn_type}")
+
+        return kv_cache_spec
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index 81b247e07ef4a..4fb4197f1822f 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -1,7 +1,7 @@
 """A GPU worker class."""
 import gc
 import os
-from typing import TYPE_CHECKING, Optional, Tuple
+from typing import TYPE_CHECKING, Optional
 
 import torch
 import torch.distributed
@@ -16,6 +16,7 @@
 from vllm.platforms import current_platform
 from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, LayerBlockType, get_dtype_size
 from vllm.v1.core.scheduler import SchedulerOutput
+from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec
 from vllm.v1.outputs import ModelRunnerOutput
 from vllm.v1.worker.gpu_model_runner import GPUModelRunner
 
@@ -112,20 +113,18 @@ def load_model(self) -> None:
         self.model_runner.load_model()
 
     @torch.inference_mode()
-    def determine_num_available_blocks(self) -> Tuple[int, int]:
-        """Profiles the peak memory usage of the model to determine how many
-        KV blocks may be allocated without OOMs.
+    def determine_available_memory(self) -> int:
+        """Profiles the peak memory usage of the model to determine how much 
+        memory can be used for KV cache without OOMs.
 
         The engine will first conduct a profiling of the existing memory usage.
-        Then, it calculate the maximum possible number of GPU and CPU blocks
-        that can be allocated with the remaining free memory.
+        Then, it calculate the free memory that can be used for KV cache in
+        bytes.
 
         .. tip::
             You may limit the usage of GPU memory
             by adjusting the `gpu_memory_utilization` parameter.
         """
-        # Profile the memory usage of the model and get the maximum number of
-        # cache blocks that can be allocated with the remaining free memory.
         torch.cuda.empty_cache()
         torch.cuda.reset_peak_memory_stats()
 
@@ -161,33 +160,14 @@ def determine_num_available_blocks(self) -> Tuple[int, int]:
             total_gpu_memory * self.cache_config.gpu_memory_utilization -
             peak_memory)
 
-        # Calculate the number of blocks that can be allocated with the
-        # profiled peak memory.
-        cache_block_size = _get_cache_block_size(self.cache_config,
-                                                 self.model_config,
-                                                 self.parallel_config)
-        num_gpu_blocks = int(available_kv_cache_memory // cache_block_size)
-        num_gpu_blocks = max(num_gpu_blocks, 0)
-        return num_gpu_blocks, 0
-
-    def initialize_cache(self, num_gpu_blocks: int) -> None:
-        """Allocate GPU and CPU KV cache with the specified number of blocks."""
-        if num_gpu_blocks <= 0:
-            raise ValueError("No available memory for the cache blocks. "
-                             "Try increasing `gpu_memory_utilization` when "
-                             "initializing the engine.")
-
-        max_seq_len = self.cache_config.block_size * num_gpu_blocks
-        max_model_len = self.model_config.max_model_len
-        if max_model_len > max_seq_len:
-            raise ValueError(
-                f"The model's max seq len ({max_model_len}) "
-                "is larger than the maximum number of tokens that can be "
-                f"stored in KV cache ({max_seq_len}). Try increasing "
-                "`gpu_memory_utilization` or decreasing `max_model_len` when "
-                "initializing the engine.")
+        return int(available_kv_cache_memory)
+
+    def get_kv_cache_spec(self) -> KVCacheSpec:
+        return self.model_runner.get_kv_cache_spec()
 
-        self.model_runner.initialize_kv_cache(num_gpu_blocks)
+    def initialize_cache(self, kv_cache_config: KVCacheConfig) -> None:
+        """Allocate GPU KV cache with the specified kv_cache_config."""
+        self.model_runner.initialize_kv_cache(kv_cache_config)
 
     def compile_or_warm_up_model(self) -> None:
         if not self.model_config.enforce_eager:

From 07934cc237d16427d705e5abc3c83e4eb0f9b7f4 Mon Sep 17 00:00:00 2001
From: Jee Jee Li <pandaleefree@gmail.com>
Date: Fri, 17 Jan 2025 19:32:28 +0800
Subject: [PATCH 254/462] [Misc][LoRA] Improve the readability of LoRA error
 messages (#12102)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
---
 .../entrypoints/openai/test_lora_adapters.py  |  69 ++++++++---
 tests/lora/test_lora_checkpoints.py           |  16 +++
 tests/lora/test_lora_huggingface.py           |   3 +
 tests/lora/test_lora_manager.py               |  59 +---------
 tests/lora/test_peft_helper.py                | 109 ++++++++++++++++++
 vllm/engine/multiprocessing/engine.py         |   1 +
 vllm/entrypoints/openai/serving_models.py     |  24 ++--
 vllm/lora/models.py                           |  12 +-
 vllm/lora/peft_helper.py                      |  49 ++++++--
 vllm/lora/worker_manager.py                   |  19 ++-
 10 files changed, 245 insertions(+), 116 deletions(-)
 create mode 100644 tests/lora/test_peft_helper.py

diff --git a/tests/entrypoints/openai/test_lora_adapters.py b/tests/entrypoints/openai/test_lora_adapters.py
index 46a064f6d9e68..6ff99f6faa143 100644
--- a/tests/entrypoints/openai/test_lora_adapters.py
+++ b/tests/entrypoints/openai/test_lora_adapters.py
@@ -17,6 +17,33 @@
 # generation quality here
 LORA_NAME = "typeof/zephyr-7b-beta-lora"
 
+BADREQUEST_CASES = [
+    (
+        "test_rank",
+        {
+            "r": 1024
+        },
+        "is greater than max_lora_rank",
+    ),
+    (
+        "test_bias",
+        {
+            "bias": "all"
+        },
+        "Adapter bias cannot be used without bias_enabled",
+    ),
+    ("test_dora", {
+        "use_dora": True
+    }, "does not yet support DoRA"),
+    (
+        "test_modules_to_save",
+        {
+            "modules_to_save": ["lm_head"]
+        },
+        "only supports modules_to_save being None",
+    ),
+]
+
 
 @pytest.fixture(scope="module")
 def zephyr_lora_files():
@@ -138,32 +165,36 @@ async def test_dynamic_lora_invalid_files(client: openai.AsyncOpenAI,
 
 
 @pytest.mark.asyncio
-async def test_dynamic_lora_invalid_lora_rank(client: openai.AsyncOpenAI,
-                                              tmp_path, zephyr_lora_files):
-    invalid_rank = tmp_path / "invalid_rank"
-
-    # Copy adapter from zephyr_lora_files to invalid_rank
-    shutil.copytree(zephyr_lora_files, invalid_rank)
-
-    with open(invalid_rank / "adapter_config.json") as f:
+@pytest.mark.parametrize("test_name,config_change,expected_error",
+                         BADREQUEST_CASES)
+async def test_dynamic_lora_badrequests(client: openai.AsyncOpenAI, tmp_path,
+                                        zephyr_lora_files, test_name: str,
+                                        config_change: dict,
+                                        expected_error: str):
+    # Create test directory
+    test_dir = tmp_path / test_name
+
+    # Copy adapter files
+    shutil.copytree(zephyr_lora_files, test_dir)
+
+    # Load and modify configuration
+    config_path = test_dir / "adapter_config.json"
+    with open(config_path) as f:
         adapter_config = json.load(f)
+    # Apply configuration changes
+    adapter_config.update(config_change)
 
-    print(adapter_config)
-
-    # assert False
-
-    # Change rank to invalid value
-    adapter_config["r"] = 1024
-    with open(invalid_rank / "adapter_config.json", "w") as f:
+    # Save modified configuration
+    with open(config_path, "w") as f:
         json.dump(adapter_config, f)
 
-    with pytest.raises(openai.BadRequestError,
-                       match="is greater than max_lora_rank"):
+    # Test loading the adapter
+    with pytest.raises(openai.BadRequestError, match=expected_error):
         await client.post("load_lora_adapter",
                           cast_to=str,
                           body={
-                              "lora_name": "invalid-json",
-                              "lora_path": str(invalid_rank)
+                              "lora_name": test_name,
+                              "lora_path": str(test_dir)
                           })
 
 
diff --git a/tests/lora/test_lora_checkpoints.py b/tests/lora/test_lora_checkpoints.py
index 537d95b025a9d..b907af47d08d7 100644
--- a/tests/lora/test_lora_checkpoints.py
+++ b/tests/lora/test_lora_checkpoints.py
@@ -3,6 +3,7 @@
 import pytest
 
 from vllm.lora.models import LoRAModel
+from vllm.lora.peft_helper import PEFTHelper
 from vllm.model_executor.models.baichuan import BaiChuanBaseForCausalLM
 from vllm.model_executor.models.utils import WeightsMapper
 
@@ -30,11 +31,14 @@ def test_load_checkpoints(
         else:
             expected_lora_modules.append(module)
     if lora_name == "baichuan7B":
+        peft_helper = PEFTHelper.from_local_dir(baichuan_lora_files,
+                                                max_position_embeddings=4096)
         # For the baichuan7B model, load it's LoRA,
         # and the test should pass.
         LoRAModel.from_local_checkpoint(
             baichuan_lora_files,
             expected_lora_modules,
+            peft_helper=peft_helper,
             lora_model_id=1,
             device="cpu",
             embedding_modules=embedding_modules,
@@ -43,9 +47,12 @@ def test_load_checkpoints(
         # Test that the target_modules contain prefix
         # such as "model.layers.0.self_atten.W_pack", and
         # the test should pass.
+        peft_helper = PEFTHelper.from_local_dir(baichuan_zero_lora_files,
+                                                max_position_embeddings=4096)
         LoRAModel.from_local_checkpoint(
             baichuan_zero_lora_files,
             expected_lora_modules,
+            peft_helper=peft_helper,
             lora_model_id=1,
             device="cpu",
             embedding_modules=embedding_modules,
@@ -53,9 +60,12 @@ def test_load_checkpoints(
     elif lora_name == "baichuan7B-zero-regex":
         # Test that the `target_modules` in the form of regular expressions,
         # such as `model\\..*(W_pack|o_proj)`, and the test should pass.
+        peft_helper = PEFTHelper.from_local_dir(baichuan_regex_lora_files,
+                                                max_position_embeddings=4096)
         LoRAModel.from_local_checkpoint(
             baichuan_regex_lora_files,
             expected_lora_modules,
+            peft_helper=peft_helper,
             lora_model_id=1,
             device="cpu",
             embedding_modules=embedding_modules,
@@ -64,10 +74,13 @@ def test_load_checkpoints(
         # For the baichuan7B model, load chatglm3-6b's LoRA,
         # and the test should raise the following error.
         expected_error = "Please verify that the loaded LoRA module is correct"  # noqa: E501
+        peft_helper = PEFTHelper.from_local_dir(chatglm3_lora_files,
+                                                max_position_embeddings=4096)
         with pytest.raises(ValueError, match=expected_error):
             LoRAModel.from_local_checkpoint(
                 chatglm3_lora_files,
                 expected_lora_modules,
+                peft_helper=peft_helper,
                 lora_model_id=1,
                 device="cpu",
                 embedding_modules=embedding_modules,
@@ -94,9 +107,12 @@ def test_lora_weights_mapping(baichuan_lora_files):
             ".layers.": ".baichuan_layers.",
         },
     )
+    peft_helper = PEFTHelper.from_local_dir(baichuan_lora_files,
+                                            max_position_embeddings=4096)
     lora_model = LoRAModel.from_local_checkpoint(
         baichuan_lora_files,
         expected_lora_modules,
+        peft_helper=peft_helper,
         lora_model_id=1,
         device="cpu",
         embedding_modules=embedding_modules,
diff --git a/tests/lora/test_lora_huggingface.py b/tests/lora/test_lora_huggingface.py
index e2daf9d135113..1c0ee01c038d0 100644
--- a/tests/lora/test_lora_huggingface.py
+++ b/tests/lora/test_lora_huggingface.py
@@ -3,6 +3,7 @@
 import pytest
 
 from vllm.lora.models import LoRAModel
+from vllm.lora.peft_helper import PEFTHelper
 from vllm.lora.utils import get_adapter_absolute_path
 from vllm.model_executor.models.llama import LlamaForCausalLM
 
@@ -27,9 +28,11 @@ def test_load_checkpoints_from_huggingface(lora_fixture_name, request):
     lora_path = get_adapter_absolute_path(lora_name)
 
     # lora loading should work for either absolute path and hugggingface id.
+    peft_helper = PEFTHelper.from_local_dir(lora_path, 4096)
     lora_model = LoRAModel.from_local_checkpoint(
         lora_path,
         expected_lora_modules,
+        peft_helper=peft_helper,
         lora_model_id=1,
         device="cpu",
         embedding_modules=embedding_modules,
diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py
index ca523c66abe42..9a5b9aabf5078 100644
--- a/tests/lora/test_lora_manager.py
+++ b/tests/lora/test_lora_manager.py
@@ -1,5 +1,3 @@
-import json
-import math
 import os
 from typing import Dict, List
 
@@ -34,56 +32,6 @@
 ] if current_platform.is_cuda_alike() else ["cpu"])
 
 
-def test_peft_helper(sql_lora_files):
-    lora_config_path = os.path.join(sql_lora_files, "adapter_config.json")
-    with open(lora_config_path) as f:
-        config = json.load(f)
-    peft_helper = PEFTHelper.from_dict(config)
-    assert peft_helper.r == 8
-    assert peft_helper.lora_alpha == 16
-    assert peft_helper.target_modules == [
-        "q_proj",
-        "v_proj",
-        "k_proj",
-        "o_proj",
-        "gate_proj",
-        "up_proj",
-        "down_proj",
-        "embed_tokens",
-        "lm_head",
-    ]
-    scaling = peft_helper.lora_alpha / peft_helper.r
-    assert abs(peft_helper.vllm_lora_scaling_factor - scaling) < 1e-3
-
-    # test RSLoRA
-    config = dict(r=8,
-                  lora_alpha=16,
-                  target_modules=["gate_proj"],
-                  use_rslora=True)
-    peft_helper = PEFTHelper.from_dict(config)
-
-    scaling = peft_helper.lora_alpha / math.sqrt(peft_helper.r)
-    assert abs(peft_helper.vllm_lora_scaling_factor - scaling) < 1e-3
-
-    expected_error = "vLLM only supports modules_to_save being None."
-    with pytest.raises(ValueError, match=expected_error):
-        config = dict(
-            r=8,
-            lora_alpha=16,
-            target_modules=["gate_proj"],
-            modules_to_save=["lm_head"],
-        )
-        PEFTHelper.from_dict(config)
-
-    expected_error = "vLLM does not yet support DoRA."
-    with pytest.raises(ValueError, match=expected_error):
-        config = dict(r=8,
-                      lora_alpha=16,
-                      target_modules=["gate_proj"],
-                      use_dora=True)
-        PEFTHelper.from_dict(config)
-
-
 @pytest.mark.parametrize("device", DEVICES)
 def test_from_lora_tensors(sql_lora_files, device):
     tensors = load_file(
@@ -91,11 +39,8 @@ def test_from_lora_tensors(sql_lora_files, device):
     new_embeddings = load_file(
         os.path.join(sql_lora_files, "new_embeddings.safetensors"))
 
-    lora_config_path = os.path.join(sql_lora_files, "adapter_config.json")
-    with open(lora_config_path) as f:
-        config = json.load(f)
-
-    peft_helper = PEFTHelper.from_dict(config)
+    peft_helper = PEFTHelper.from_local_dir(sql_lora_files,
+                                            max_position_embeddings=4096)
     lora_model = LoRAModel.from_lora_tensors(
         1,
         tensors,
diff --git a/tests/lora/test_peft_helper.py b/tests/lora/test_peft_helper.py
new file mode 100644
index 0000000000000..a524d5ce5f34a
--- /dev/null
+++ b/tests/lora/test_peft_helper.py
@@ -0,0 +1,109 @@
+import json
+import math
+import shutil
+
+import pytest
+
+from vllm.config import LoRAConfig
+from vllm.lora.peft_helper import PEFTHelper
+
+ERROR_CASES = [
+    (
+        "test_rank",
+        {
+            "r": 1024
+        },
+        "is greater than max_lora_rank",
+    ),
+    (
+        "test_bias",
+        {
+            "bias": "all"
+        },
+        "Adapter bias cannot be used without bias_enabled",
+    ),
+    ("test_dora", {
+        "use_dora": True
+    }, "does not yet support DoRA"),
+    (
+        "test_modules_to_save",
+        {
+            "modules_to_save": ["lm_head"]
+        },
+        "only supports modules_to_save being None",
+    ),
+]
+
+
+def test_peft_helper_pass(long_context_lora_files_16k_1, tmp_path):
+    peft_helper = PEFTHelper.from_local_dir(long_context_lora_files_16k_1,
+                                            max_position_embeddings=4096)
+    lora_config = LoRAConfig(max_lora_rank=16, max_cpu_loras=3, max_loras=2)
+    peft_helper.validate_legal(lora_config)
+    assert peft_helper.r == 8
+    assert peft_helper.lora_alpha == 16
+    assert peft_helper.target_modules == [
+        "q_proj",
+        "v_proj",
+        "k_proj",
+        "o_proj",
+        "gate_proj",
+        "up_proj",
+        "down_proj",
+        "embed_tokens",
+        "lm_head",
+    ]
+    assert peft_helper.context_length == 16384
+    assert peft_helper.vllm_max_position_embeddings == 4096
+    assert peft_helper.vllm_long_context_scaling_factor == float(
+        math.ceil(peft_helper.context_length /
+                  peft_helper.vllm_max_position_embeddings))
+    # test RSLoRA
+    rslora_config = dict(use_rslora=True)
+    test_dir = tmp_path / "test_rslora"
+    shutil.copytree(long_context_lora_files_16k_1, test_dir)
+
+    # Load and modify configuration
+    config_path = test_dir / "adapter_config.json"
+    with open(config_path) as f:
+        adapter_config = json.load(f)
+    # Apply configuration changes
+    adapter_config.update(rslora_config)
+
+    # Save modified configuration
+    with open(config_path, "w") as f:
+        json.dump(adapter_config, f)
+
+    peft_helper = PEFTHelper.from_local_dir(test_dir,
+                                            max_position_embeddings=4096)
+    peft_helper.validate_legal(lora_config)
+    scaling = peft_helper.lora_alpha / math.sqrt(peft_helper.r)
+    assert abs(peft_helper.vllm_lora_scaling_factor - scaling) < 1e-3
+
+
+@pytest.mark.parametrize("test_name,config_change,expected_error", ERROR_CASES)
+def test_peft_helper_error(
+    sql_lora_files,
+    tmp_path,
+    test_name: str,
+    config_change: dict,
+    expected_error: str,
+):
+    test_dir = tmp_path / test_name
+    shutil.copytree(sql_lora_files, test_dir)
+
+    # Load and modify configuration
+    config_path = test_dir / "adapter_config.json"
+    with open(config_path) as f:
+        adapter_config = json.load(f)
+    # Apply configuration changes
+    adapter_config.update(config_change)
+
+    # Save modified configuration
+    with open(config_path, "w") as f:
+        json.dump(adapter_config, f)
+    lora_config = LoRAConfig(max_lora_rank=16, max_cpu_loras=3, max_loras=2)
+    # Test loading the adapter
+    with pytest.raises(ValueError, match=expected_error):
+        PEFTHelper.from_local_dir(
+            test_dir, max_position_embeddings=4096).validate_legal(lora_config)
diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py
index 8f231de912c95..3aa9d30549f36 100644
--- a/vllm/engine/multiprocessing/engine.py
+++ b/vllm/engine/multiprocessing/engine.py
@@ -296,6 +296,7 @@ def _handle_load_adapter_request(self, request: RPCLoadAdapterRequest):
                                is_engine_errored=False,
                                exception=e)
             self._send_outputs(rpc_err)
+            return
         # Otherwise, send back the successful load message
         self._send_outputs(
             RPCAdapterLoadedResponse(request_id=request.request_id))
diff --git a/vllm/entrypoints/openai/serving_models.py b/vllm/entrypoints/openai/serving_models.py
index a222eafadcb68..fc422f0917bd5 100644
--- a/vllm/entrypoints/openai/serving_models.py
+++ b/vllm/entrypoints/openai/serving_models.py
@@ -157,24 +157,16 @@ async def load_lora_adapter(
         # This will also pre-load it for incoming requests
         try:
             await self.engine_client.add_lora(lora_request)
-        except ValueError as e:
-            # Adapter not found or lora configuration errors
-            if "No adapter found" in str(e):
-                return create_error_response(message=str(e),
-                                             err_type="NotFoundError",
-                                             status_code=HTTPStatus.NOT_FOUND)
-            else:
-                return create_error_response(
-                    message=str(e),
-                    err_type="BadRequestError",
-                    status_code=HTTPStatus.BAD_REQUEST)
         except BaseException as e:
-            # Some other unexpected problem loading the adapter, e.g. malformed
-            # input files.
-            # More detailed error messages for the user would be nicer here
+            error_type = "BadRequestError"
+            status_code = HTTPStatus.BAD_REQUEST
+            if isinstance(e, ValueError) and "No adapter found" in str(e):
+                error_type = "NotFoundError"
+                status_code = HTTPStatus.NOT_FOUND
+
             return create_error_response(message=str(e),
-                                         err_type="BadRequestError",
-                                         status_code=HTTPStatus.BAD_REQUEST)
+                                         err_type=error_type,
+                                         status_code=status_code)
 
         self.lora_requests.append(lora_request)
         logger.info("Loaded new LoRA adapter: name '%s', path '%s'", lora_name,
diff --git a/vllm/lora/models.py b/vllm/lora/models.py
index 5b7225bdc8f37..9809405ca9a61 100644
--- a/vllm/lora/models.py
+++ b/vllm/lora/models.py
@@ -1,5 +1,4 @@
 import copy
-import json
 import math
 import os
 import re
@@ -180,8 +179,8 @@ def from_local_checkpoint(
         cls,
         lora_dir: str,
         expected_lora_modules: List[str],
+        peft_helper: PEFTHelper,
         *,
-        max_position_embeddings: Optional[int] = None,
         lora_model_id: Optional[int] = None,
         device: str = "cuda",
         dtype: Optional[torch.dtype] = None,
@@ -196,9 +195,7 @@ def from_local_checkpoint(
             lora_dir: The local path that has lora data.
             expected_lora_modules: Name of modules that are expected to be
                 replaced by lora.
-            max_position_embeddings: Max position embedding length. Used to
-                scaling the largest context length. If None, the lora model's
-                context length is not scaled.
+            peft_helper: Loaded lora configuration information.
             lora_model_id: Lora model id. If not given, automatically set by
                 a global counter.
             device: Device where the lora model is loaded.
@@ -207,18 +204,13 @@ def from_local_checkpoint(
         Returns:
             Loaded LoRA Model.
         """
-        lora_config_path = os.path.join(lora_dir, "adapter_config.json")
         lora_tensor_path = os.path.join(lora_dir, "adapter_model.safetensors")
         lora_bin_file_path = os.path.join(lora_dir, "adapter_model.bin")
         new_embeddings_tensor_path = os.path.join(
             lora_dir, "new_embeddings.safetensors")
         new_embeddings_bin_file_path = os.path.join(lora_dir,
                                                     "new_embeddings.bin")
-        with open(lora_config_path) as f:
-            config = json.load(f)
 
-        config["vllm_max_position_embeddings"] = max_position_embeddings
-        peft_helper = PEFTHelper.from_dict(config)
         unexpected_modules: List[Union[list[str], str]]
         if os.path.isfile(lora_tensor_path):
             tensors: Dict[str, torch.Tensor] = {}
diff --git a/vllm/lora/peft_helper.py b/vllm/lora/peft_helper.py
index dacfb9ebd1480..b9c506f6e0bfd 100644
--- a/vllm/lora/peft_helper.py
+++ b/vllm/lora/peft_helper.py
@@ -1,9 +1,12 @@
 # Adapted from: https://github.com/huggingface/peft/blob/main/src/peft/tuners/lora/config.py
 
+import json
 import math
+import os
 from dataclasses import MISSING, dataclass, field, fields
-from typing import Literal, Optional, Union
+from typing import List, Literal, Optional, Union
 
+from vllm.config import LoRAConfig
 from vllm.logger import init_logger
 
 logger = init_logger(__name__)
@@ -11,6 +14,12 @@
 
 @dataclass
 class PEFTHelper:
+    """ 
+    A helper class for PEFT configurations, specifically designed for LoRA.
+    This class handles configuration validation, compatibility checks for 
+    various LoRA implementations.
+    """
+
     # Required fields
     r: int
     lora_alpha: int
@@ -29,20 +38,18 @@ class PEFTHelper:
     vllm_max_position_embeddings: Optional[int] = field(default=False)
     vllm_long_context_scaling_factor: Optional[float] = field(default=None)
 
-    def _validate_features(self):
+    def _validate_features(self) -> List[str]:
+        """
+        Check if there are any unsupported Lora features.
+        """
         error_msg = []
-
         if self.modules_to_save:
             error_msg.append("vLLM only supports modules_to_save being None.")
-
         if self.use_dora:
             error_msg.append("vLLM does not yet support DoRA.")
-
-        if error_msg:
-            raise ValueError(f"{', '.join(error_msg)}")
+        return error_msg
 
     def __post_init__(self):
-        self._validate_features()
         if self.use_rslora:
             logger.info_once("Loading LoRA weights trained with rsLoRA.")
             self.vllm_lora_scaling_factor = self.lora_alpha / math.sqrt(self.r)
@@ -78,3 +85,29 @@ def from_dict(cls, config_dict: dict) -> "PEFTHelper":
             for k, v in config_dict.items() if k in class_fields
         }
         return cls(**filtered_dict)
+
+    @classmethod
+    def from_local_dir(cls, lora_path: str,
+                       max_position_embeddings: Optional[int]) -> "PEFTHelper":
+        lora_config_path = os.path.join(lora_path, "adapter_config.json")
+
+        with open(lora_config_path) as f:
+            config = json.load(f)
+        config["vllm_max_position_embeddings"] = max_position_embeddings
+        return cls.from_dict(config)
+
+    def validate_legal(self, lora_config: LoRAConfig) -> None:
+        """
+        Validates the LoRA configuration settings against application 
+        constraints and requirements.
+        """
+        error_msg = self._validate_features()
+        if self.r > lora_config.max_lora_rank:
+            error_msg.append(
+                f"LoRA rank {self.r} is greater than max_lora_rank"
+                f" {lora_config.max_lora_rank}.")
+        if self.bias != "none" and not lora_config.bias_enabled:
+            error_msg.append(
+                "Adapter bias cannot be used without bias_enabled.")
+        if error_msg:
+            raise ValueError(f"{' '.join(error_msg)}")
diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py
index eec462743fe9d..a64296f7fd902 100644
--- a/vllm/lora/worker_manager.py
+++ b/vllm/lora/worker_manager.py
@@ -12,6 +12,7 @@
 from vllm.logger import init_logger
 from vllm.lora.models import (LoRAModel, LoRAModelManager,
                               LRUCacheLoRAModelManager, create_lora_manager)
+from vllm.lora.peft_helper import PEFTHelper
 from vllm.lora.request import LoRARequest
 from vllm.lora.utils import get_adapter_absolute_path
 
@@ -95,6 +96,13 @@ def _load_adapter(self, lora_request: LoRARequest) -> LoRAModel:
             expected_lora_modules = list(set(expected_lora_modules))
             lora_path = get_adapter_absolute_path(lora_request.lora_path)
 
+            peft_helper = PEFTHelper.from_local_dir(
+                lora_path, self.max_position_embeddings)
+
+            # Validates the LoRA configuration against requirements before
+            # loading weights, throwing an exception if validation fails.
+            peft_helper.validate_legal(self.lora_config)
+
             # For some models like Qwen2VL, we need to use hf_to_vllm_mapper
             # to ensure correct loading of lora weights.
             hf_to_vllm_mapper = None
@@ -105,7 +113,7 @@ def _load_adapter(self, lora_request: LoRARequest) -> LoRAModel:
             lora = self._lora_model_cls.from_local_checkpoint(
                 lora_path,
                 expected_lora_modules,
-                max_position_embeddings=self.max_position_embeddings,
+                peft_helper=peft_helper,
                 lora_model_id=lora_request.lora_int_id,
                 device="cpu",
                 dtype=self.lora_config.lora_dtype,
@@ -120,15 +128,14 @@ def _load_adapter(self, lora_request: LoRARequest) -> LoRAModel:
             # - No adapter found to download from huggingface (or in
             #       offline mode)
             # - No local adapter files found at `lora_request.lora_path`
+            # For NotFoundError
             raise ValueError(
                 f"Loading lora {lora_request.lora_name} failed: No adapter "
                 f"found for {lora_path}") from e
         except Exception as e:
-            raise RuntimeError(f"Loading lora {lora_path} failed") from e
-        if lora.rank > self.lora_config.max_lora_rank:
-            raise ValueError(
-                f"LoRA rank {lora.rank} is greater than max_lora_rank "
-                f"{self.lora_config.max_lora_rank}.")
+            # For BadRequestError
+            raise e
+
         if lora.extra_vocab_size > self.lora_config.lora_extra_vocab_size:
             raise ValueError(f"LoRA added vocab size {lora.extra_vocab_size} "
                              f"is greater than lora_extra_vocab_size "

From d4e619457075c0dd917b84644f467f7f8aae10f0 Mon Sep 17 00:00:00 2001
From: "Li, Jiang" <jiang1.li@intel.com>
Date: Fri, 17 Jan 2025 19:39:52 +0800
Subject: [PATCH 255/462] [CI/Build][CPU][Bugfix] Fix CPU CI (#12150)

Signed-off-by: jiang1.li <jiang1.li@intel.com>
---
 .buildkite/run-cpu-test.sh               | 4 ++--
 vllm/model_executor/layers/activation.py | 8 ++++++--
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh
index 9925db7bea593..e19ace782feb5 100644
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -83,6 +83,6 @@ function cpu_tests() {
     tests/lora/test_qwen2vl.py"
 }
 
-# All of CPU tests are expected to be finished less than 25 mins.
+# All of CPU tests are expected to be finished less than 40 mins.
 export -f cpu_tests
-timeout 30m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
+timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py
index af7894b42c560..fb9684ac1c184 100644
--- a/vllm/model_executor/layers/activation.py
+++ b/vllm/model_executor/layers/activation.py
@@ -30,8 +30,10 @@ class FatreluAndMul(CustomOp):
     def __init__(self, threshold: float = 0.):
         super().__init__()
         self.threshold = threshold
-        if current_platform.is_cuda_alike() or current_platform.is_cpu():
+        if current_platform.is_cuda_alike():
             self.op = torch.ops._C.fatrelu_and_mul
+        elif current_platform.is_cpu():
+            self._forward_method = self.forward_native
 
     def forward_native(self, x: torch.Tensor) -> torch.Tensor:
         d = x.shape[-1] // 2
@@ -100,11 +102,13 @@ class MulAndSilu(CustomOp):
 
     def __init__(self):
         super().__init__()
-        if current_platform.is_cuda_alike() or current_platform.is_cpu():
+        if current_platform.is_cuda_alike():
             self.op = torch.ops._C.mul_and_silu
         elif current_platform.is_xpu():
             from vllm._ipex_ops import ipex_ops
             self.op = ipex_ops.silu_and_mul
+        elif current_platform.is_cpu():
+            self._forward_method = self.forward_native
 
     def forward_native(self, x: torch.Tensor) -> torch.Tensor:
         """PyTorch-native implementation equivalent to forward()."""

From 87a0c076afafb93dd082ff3876bea08adca56c56 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Fri, 17 Jan 2025 20:47:01 +0800
Subject: [PATCH 256/462] [core] allow callable in collective_rpc (#12151)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 .buildkite/test-pipeline.yaml                |  4 ++-
 tests/engine/test_custom_executor.py         |  4 +--
 tests/entrypoints/llm/test_collective_rpc.py | 36 ++++++++++++++++++++
 vllm/engine/llm_engine.py                    | 17 +++++++--
 vllm/entrypoints/llm.py                      | 14 +++++---
 vllm/executor/executor_base.py               |  9 ++---
 vllm/executor/mp_distributed_executor.py     | 21 ++++++++----
 vllm/executor/multiproc_worker_utils.py      | 12 +++----
 vllm/executor/ray_distributed_executor.py    | 14 +++++---
 vllm/executor/uniproc_executor.py            | 14 +++-----
 vllm/utils.py                                | 23 +++++++++++++
 vllm/v1/executor/multiproc_executor.py       | 19 ++++++++---
 vllm/worker/worker_base.py                   | 10 +++---
 13 files changed, 147 insertions(+), 50 deletions(-)
 create mode 100644 tests/entrypoints/llm/test_collective_rpc.py

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 7442de245bd80..bff557d7fc92f 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -107,7 +107,7 @@ steps:
   source_file_dependencies:
   - vllm/
   commands:
-  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py
+  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
   - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
   - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
   - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
@@ -466,7 +466,9 @@ steps:
   - vllm/worker/worker_base.py
   - vllm/worker/worker.py
   - vllm/worker/model_runner.py
+  - entrypoints/llm/test_collective_rpc.py
   commands:
+  - pytest -v -s entrypoints/llm/test_collective_rpc.py
   - torchrun --nproc-per-node=2 distributed/test_torchrun_example.py
   - pytest -v -s ./compile/test_basic_correctness.py
   - pytest -v -s ./compile/test_wrapper.py
diff --git a/tests/engine/test_custom_executor.py b/tests/engine/test_custom_executor.py
index 2a057ca488a50..fdfcd4f4c9d50 100644
--- a/tests/engine/test_custom_executor.py
+++ b/tests/engine/test_custom_executor.py
@@ -1,6 +1,6 @@
 import asyncio
 import os
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
 import pytest
 
@@ -18,7 +18,7 @@ class Mock:
 class CustomUniExecutor(UniProcExecutor):
 
     def collective_rpc(self,
-                       method: str,
+                       method: Union[str, Callable],
                        timeout: Optional[float] = None,
                        args: Tuple = (),
                        kwargs: Optional[Dict] = None) -> List[Any]:
diff --git a/tests/entrypoints/llm/test_collective_rpc.py b/tests/entrypoints/llm/test_collective_rpc.py
new file mode 100644
index 0000000000000..22473ce275295
--- /dev/null
+++ b/tests/entrypoints/llm/test_collective_rpc.py
@@ -0,0 +1,36 @@
+import pytest
+
+from vllm import LLM
+
+from ...utils import fork_new_process_for_each_test
+
+
+@pytest.mark.parametrize("tp_size", [1, 2])
+@pytest.mark.parametrize("backend", ["mp", "ray"])
+@fork_new_process_for_each_test
+def test_collective_rpc(tp_size, backend):
+    if tp_size == 1 and backend == "ray":
+        pytest.skip("Skip duplicate test case")
+    if tp_size == 1:
+        backend = None
+
+    # intentionally define the method and class in the test function,
+    # to test if they can be serialized and sent to the workers
+    def echo_rank(self):
+        return self.rank
+
+    from vllm.worker.worker import Worker
+
+    class MyWorker(Worker):
+
+        def echo_rank(self):
+            return self.rank
+
+    llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct",
+              enforce_eager=True,
+              load_format="dummy",
+              tensor_parallel_size=tp_size,
+              distributed_executor_backend=backend,
+              worker_cls=MyWorker)
+    for method in ["echo_rank", echo_rank]:
+        assert llm.collective_rpc(method) == list(range(tp_size))
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 5d19ce03d5b58..88c21f9a6d31b 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -5,10 +5,10 @@
 from contextlib import contextmanager
 from dataclasses import dataclass
 from functools import partial
-from typing import (TYPE_CHECKING, Callable, ClassVar, Deque, Dict, Iterable,
-                    List, Mapping, NamedTuple, Optional)
+from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Deque, Dict,
+                    Iterable, List, Mapping, NamedTuple, Optional)
 from typing import Sequence as GenericSequence
-from typing import Set, Type, Union, cast, overload
+from typing import Set, Tuple, Type, Union, cast, overload
 
 import torch
 from typing_extensions import TypeVar, deprecated
@@ -1816,6 +1816,17 @@ def start_profile(self) -> None:
     def stop_profile(self) -> None:
         self.model_executor.stop_profile()
 
+    def collective_rpc(self,
+                       method: Union[str, Callable],
+                       timeout: Optional[float] = None,
+                       args: Tuple = (),
+                       kwargs: Optional[Dict] = None) -> List[Any]:
+        """
+        See LLM.collective_rpc for more details.
+        """
+        return self.model_executor.collective_rpc(method, timeout, args,
+                                                  kwargs)
+
     def check_health(self) -> None:
         if self.tokenizer:
             self.tokenizer.check_health()
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index b78d5c65a40f8..0cfe6be9ac767 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -1,8 +1,8 @@
 import itertools
 import warnings
 from contextlib import contextmanager
-from typing import (Any, ClassVar, Dict, List, Optional, Sequence, Tuple, Type,
-                    Union, cast, overload)
+from typing import (Any, Callable, ClassVar, Dict, List, Optional, Sequence,
+                    Tuple, Type, Union, cast, overload)
 
 import cloudpickle
 from tqdm import tqdm
@@ -464,7 +464,7 @@ def generate(
         return self.engine_class.validate_outputs(outputs, RequestOutput)
 
     def collective_rpc(self,
-                       method: str,
+                       method: Union[str, Callable],
                        timeout: Optional[float] = None,
                        args: Tuple = (),
                        kwargs: Optional[Dict] = None) -> List[Any]:
@@ -476,9 +476,13 @@ def collective_rpc(self,
         Then, users can call the new methods through this API.
         It is recommended to use this API to only pass control messages,
         and set up data-plane communication to pass data.
+        The method can also be a callable, which will be serialized
+        and sent to all workers to execute.
+        If the method is a callable, it should accept an additional
+        `self` argument, in addition to the arguments passed in `args`
+        and `kwargs`. The `self` argument will be the worker object.
         """
-        return self.llm_engine.model_executor.collective_rpc(
-            method, timeout, args, kwargs)
+        return self.llm_engine.collective_rpc(method, timeout, args, kwargs)
 
     def beam_search(
         self,
diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py
index 00ecadcf92667..d8457cb693cdb 100644
--- a/vllm/executor/executor_base.py
+++ b/vllm/executor/executor_base.py
@@ -1,6 +1,7 @@
 import asyncio
 from abc import ABC, abstractmethod
-from typing import Any, Awaitable, Dict, List, Optional, Set, Tuple, Union
+from typing import (Any, Awaitable, Callable, Dict, List, Optional, Set, Tuple,
+                    Union)
 
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
@@ -47,7 +48,7 @@ def _init_executor(self) -> None:
 
     @abstractmethod
     def collective_rpc(self,
-                       method: str,
+                       method: Union[str, Callable],
                        timeout: Optional[float] = None,
                        args: Tuple = (),
                        kwargs: Optional[Dict] = None) -> List[Any]:
@@ -260,7 +261,7 @@ def _driver_execute_model(
         raise NotImplementedError
 
     def collective_rpc(self,
-                       method: str,
+                       method: Union[str, Callable],
                        timeout: Optional[float] = None,
                        args: Tuple = (),
                        kwargs: Optional[Dict] = None) -> List[Any]:
@@ -269,7 +270,7 @@ def collective_rpc(self,
     @abstractmethod
     def _run_workers(
         self,
-        method: str,
+        method: Union[str, Callable],
         *args,
         async_run_tensor_parallel_workers_only: bool = False,
         max_concurrent_workers: Optional[int] = None,
diff --git a/vllm/executor/mp_distributed_executor.py b/vllm/executor/mp_distributed_executor.py
index d9dde949b844a..8ae88e646aad6 100644
--- a/vllm/executor/mp_distributed_executor.py
+++ b/vllm/executor/mp_distributed_executor.py
@@ -1,5 +1,7 @@
 import asyncio
-from typing import Any, List, Optional
+from typing import Any, Callable, List, Optional, Union
+
+import cloudpickle
 
 from vllm.executor.executor_base import DistributedExecutorBase
 from vllm.executor.multiproc_worker_utils import (
@@ -9,7 +11,7 @@
 from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.sequence import ExecuteModelRequest
 from vllm.utils import (_run_task_with_lock, get_distributed_init_method,
-                        get_ip, get_open_port, make_async)
+                        get_ip, get_open_port, make_async, run_method)
 from vllm.worker.worker_base import WorkerWrapperBase
 
 logger = init_logger(__name__)
@@ -107,7 +109,7 @@ def _driver_execute_model(
 
     def _run_workers(
         self,
-        method: str,
+        method: Union[str, Callable],
         *args,
         async_run_tensor_parallel_workers_only: bool = False,
         max_concurrent_workers: Optional[int] = None,
@@ -121,6 +123,11 @@ def _run_workers(
                 It will also be run asynchronously and return a list of futures
                 rather than blocking on the results.
         """
+        if isinstance(method, str):
+            sent_method = method
+        else:
+            sent_method = cloudpickle.dumps(method)
+        del method
 
         if max_concurrent_workers:
             raise NotImplementedError(
@@ -129,18 +136,18 @@ def _run_workers(
         if async_run_tensor_parallel_workers_only:
             # Run only non-driver workers and just return futures.
             return [
-                worker.execute_method(method, *args, **kwargs)
+                worker.execute_method(sent_method, *args, **kwargs)
                 for worker in self.non_driver_workers
             ]
 
         # Start all remote workers first.
         worker_outputs = [
-            worker.execute_method(method, *args, **kwargs)
+            worker.execute_method(sent_method, *args, **kwargs)
             for worker in self.workers
         ]
 
-        driver_worker_method = getattr(self.driver_worker, method)
-        driver_worker_output = driver_worker_method(*args, **kwargs)
+        driver_worker_output = run_method(self.driver_worker, sent_method,
+                                          args, kwargs)
 
         # Get the results of the workers.
         return [driver_worker_output
diff --git a/vllm/executor/multiproc_worker_utils.py b/vllm/executor/multiproc_worker_utils.py
index c9fb3c664c575..539b6ae2d3572 100644
--- a/vllm/executor/multiproc_worker_utils.py
+++ b/vllm/executor/multiproc_worker_utils.py
@@ -15,7 +15,7 @@
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.triton_utils.importing import HAS_TRITON
-from vllm.utils import _check_multiproc_method, get_mp_context
+from vllm.utils import _check_multiproc_method, get_mp_context, run_method
 
 if HAS_TRITON:
     from vllm.triton_utils import maybe_set_triton_cache_manager
@@ -169,7 +169,7 @@ def __init__(self, result_handler: ResultHandler,
         self.process.start()
 
     def _enqueue_task(self, future: Union[ResultFuture, asyncio.Future],
-                      method: str, args, kwargs):
+                      method: Union[str, bytes], args, kwargs):
         task_id = uuid.uuid4()
         self.tasks[task_id] = future
         try:
@@ -180,12 +180,13 @@ def _enqueue_task(self, future: Union[ResultFuture, asyncio.Future],
             del self.tasks[task_id]
             raise ChildProcessError("worker died") from e
 
-    def execute_method(self, method: str, *args, **kwargs):
+    def execute_method(self, method: Union[str, bytes], *args, **kwargs):
         future: ResultFuture = ResultFuture()
         self._enqueue_task(future, method, args, kwargs)
         return future
 
-    async def execute_method_async(self, method: str, *args, **kwargs):
+    async def execute_method_async(self, method: Union[str, bytes], *args,
+                                   **kwargs):
         future = asyncio.get_running_loop().create_future()
         self._enqueue_task(future, method, args, kwargs)
         return await future
@@ -230,8 +231,7 @@ def _run_worker_process(
             exception = None
             task_id, method, args, kwargs = items
             try:
-                executor = getattr(worker, method)
-                output = executor(*args, **kwargs)
+                output = run_method(worker, method, args, kwargs)
             except SystemExit:
                 raise
             except KeyboardInterrupt:
diff --git a/vllm/executor/ray_distributed_executor.py b/vllm/executor/ray_distributed_executor.py
index 3baeb63918a62..2afd99f99b353 100644
--- a/vllm/executor/ray_distributed_executor.py
+++ b/vllm/executor/ray_distributed_executor.py
@@ -2,8 +2,9 @@
 import os
 from collections import defaultdict
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any, Dict, List, Optional
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union
 
+import cloudpickle
 import msgspec
 
 import vllm.envs as envs
@@ -410,7 +411,7 @@ def execute_model(
 
     def _run_workers(
         self,
-        method: str,
+        method: Union[str, Callable],
         *args,
         async_run_tensor_parallel_workers_only: bool = False,
         max_concurrent_workers: Optional[int] = None,
@@ -426,6 +427,11 @@ def _run_workers(
           rather than blocking on the results.
         - args/kwargs: All workers share the same args/kwargs
         """
+        if isinstance(method, str):
+            sent_method = method
+        else:
+            sent_method = cloudpickle.dumps(method)
+        del method
         if self.use_ray_spmd_worker:
             assert not async_run_tensor_parallel_workers_only, (
                 "async_run_tensor_parallel_workers_only is not supported for "
@@ -440,7 +446,7 @@ def _run_workers(
         if async_run_tensor_parallel_workers_only:
             ray_workers = self.non_driver_workers
         ray_worker_outputs = [
-            worker.execute_method.remote(method, *args, **kwargs)
+            worker.execute_method.remote(sent_method, *args, **kwargs)
             for worker in ray_workers
         ]
 
@@ -455,7 +461,7 @@ def _run_workers(
         if not self.use_ray_spmd_worker:
             # Start the driver worker after all the ray workers.
             driver_worker_output = [
-                self.driver_worker.execute_method(method, *args, **kwargs)
+                self.driver_worker.execute_method(sent_method, *args, **kwargs)
             ]
 
         # Get the results of the ray workers.
diff --git a/vllm/executor/uniproc_executor.py b/vllm/executor/uniproc_executor.py
index 27b83e95ba95b..a5c4dcf0ec7f9 100644
--- a/vllm/executor/uniproc_executor.py
+++ b/vllm/executor/uniproc_executor.py
@@ -1,5 +1,5 @@
 import os
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
 import torch
 import torch.distributed as dist
@@ -7,7 +7,8 @@
 import vllm.envs as envs
 from vllm.executor.executor_base import ExecutorBase
 from vllm.logger import init_logger
-from vllm.utils import get_distributed_init_method, get_ip, get_open_port
+from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
+                        run_method)
 from vllm.worker.worker_base import WorkerWrapperBase
 
 logger = init_logger(__name__)
@@ -39,18 +40,13 @@ def _init_executor(self) -> None:
         self.collective_rpc("load_model")
 
     def collective_rpc(self,
-                       method: str,
+                       method: Union[str, Callable],
                        timeout: Optional[float] = None,
                        args: Tuple = (),
                        kwargs: Optional[Dict] = None) -> List[Any]:
         if kwargs is None:
             kwargs = {}
-        try:
-            func = getattr(self.driver_worker, method)
-        except AttributeError:
-            raise NotImplementedError(f"Method {method} is not implemented.") \
-                from None
-        answer = func(*args, **kwargs)
+        answer = run_method(self.driver_worker, method, args, kwargs)
         return [answer]
 
     def check_health(self) -> None:
diff --git a/vllm/utils.py b/vllm/utils.py
index 7477e7028f5ef..89ba119bb5e55 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -36,6 +36,7 @@
                     overload)
 from uuid import uuid4
 
+import cloudpickle
 import numpy as np
 import numpy.typing as npt
 import psutil
@@ -2166,3 +2167,25 @@ def bind_kv_cache(
         assert len(forward_ctx.kv_cache) == len(kv_cache)
         for ve, ve_kv_cache in enumerate(kv_cache):
             forward_ctx.kv_cache[ve] = ve_kv_cache[kv_cache_idx]
+
+
+def run_method(obj: Any, method: Union[str, bytes, Callable], args: Tuple[Any],
+               kwargs: Dict[str, Any]) -> Any:
+    """
+    Run a method of an object with the given arguments and keyword arguments.
+    If the method is string, it will be converted to a method using getattr.
+    If the method is serialized bytes and will be deserialized using
+    cloudpickle.
+    If the method is a callable, it will be called directly.
+    """
+    if isinstance(method, bytes):
+        func = partial(cloudpickle.loads(method), obj)
+    elif isinstance(method, str):
+        try:
+            func = getattr(obj, method)
+        except AttributeError:
+            raise NotImplementedError(f"Method {method!r} is not"
+                                      " implemented.") from None
+    else:
+        func = partial(method, obj)  # type: ignore
+    return func(*args, **kwargs)
diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py
index e92acc7cb5e41..fd977d07e8d81 100644
--- a/vllm/v1/executor/multiproc_executor.py
+++ b/vllm/v1/executor/multiproc_executor.py
@@ -6,9 +6,11 @@
 import weakref
 from dataclasses import dataclass
 from enum import Enum, auto
+from functools import partial
 from multiprocessing.process import BaseProcess
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
+import cloudpickle
 import psutil
 import zmq
 
@@ -120,7 +122,7 @@ def get_kv_cache_spec(self) -> KVCacheSpec:
         return kv_cache_specs[0]
 
     def collective_rpc(self,
-                       method: str,
+                       method: Union[str, Callable],
                        timeout: Optional[float] = None,
                        args: Tuple = (),
                        kwargs: Optional[Dict] = None) -> List[Any]:
@@ -141,7 +143,12 @@ def collective_rpc(self,
         kwargs = kwargs or {}
 
         try:
-            self.rpc_broadcast_mq.enqueue((method, args, kwargs))
+            if isinstance(method, str):
+                send_method = method
+            else:
+                send_method = cloudpickle.dumps(
+                    method, protocol=pickle.HIGHEST_PROTOCOL)
+            self.rpc_broadcast_mq.enqueue((send_method, args, kwargs))
 
             responses = [None] * self.world_size
             for w in self.workers:
@@ -408,7 +415,11 @@ def worker_busy_loop(self):
             method, args, kwargs = self.rpc_broadcast_mq.dequeue()
 
             try:
-                output = getattr(self.worker, method)(*args, **kwargs)
+                if isinstance(method, str):
+                    func = getattr(self.worker, method)
+                elif isinstance(method, bytes):
+                    func = partial(cloudpickle.loads(method), self.worker)
+                output = func(*args, **kwargs)
             except Exception as e:
                 self.worker_response_mq.enqueue(
                     (WorkerProc.ResponseStatus.FAILURE, e))
diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py
index bced5b9f44228..fb9919f7a7b6a 100644
--- a/vllm/worker/worker_base.py
+++ b/vllm/worker/worker_base.py
@@ -14,7 +14,8 @@
 from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.sequence import ExecuteModelRequest, IntermediateTensors
 from vllm.utils import (enable_trace_function_call_for_thread,
-                        resolve_obj_by_qualname, update_environment_variables)
+                        resolve_obj_by_qualname, run_method,
+                        update_environment_variables)
 from vllm.worker.model_runner_base import (BroadcastableModelInput,
                                            ModelRunnerBase,
                                            ModelRunnerInputBase)
@@ -539,17 +540,16 @@ def init_worker(self, all_kwargs: List[Dict[str, Any]]) -> None:
         self.worker = worker_class(**kwargs)
         assert self.worker is not None
 
-    def execute_method(self, method: str, *args, **kwargs):
+    def execute_method(self, method: Union[str, bytes], *args, **kwargs):
         try:
             target = self if self.worker is None else self.worker
-            executor = getattr(target, method)
-            return executor(*args, **kwargs)
+            return run_method(target, method, args, kwargs)
         except Exception as e:
             # if the driver worker also execute methods,
             # exceptions in the rest worker may cause deadlock in rpc like ray
             # see https://github.com/vllm-project/vllm/issues/3455
             # print the error and inform the user to solve the error
-            msg = (f"Error executing method {method}. "
+            msg = (f"Error executing method {method!r}. "
                    "This might cause deadlock in distributed execution.")
             logger.exception(msg)
             raise e

From 58fd57ff1d6d4ed56bed40aaaf9fe133b93b2efa Mon Sep 17 00:00:00 2001
From: Wallas Henrique <wallashss@users.noreply.github.com>
Date: Fri, 17 Jan 2025 13:24:22 -0300
Subject: [PATCH 257/462] [Bugfix] Fix score api for missing max_model_len
 validation (#12119)

Signed-off-by: Wallas Santos <wallashss@ibm.com>
---
 tests/entrypoints/openai/test_score.py    | 45 ++++++++++++++++---
 vllm/entrypoints/openai/serving_engine.py | 14 +++---
 vllm/entrypoints/openai/serving_score.py  | 54 ++++++++++++++---------
 3 files changed, 80 insertions(+), 33 deletions(-)

diff --git a/tests/entrypoints/openai/test_score.py b/tests/entrypoints/openai/test_score.py
index a803ea4a8d6ad..06e0f93dbe269 100644
--- a/tests/entrypoints/openai/test_score.py
+++ b/tests/entrypoints/openai/test_score.py
@@ -12,6 +12,9 @@
 def server():
     args = [
         "--enforce-eager",
+        # Will be used on tests to compare prompt input length
+        "--max-model-len",
+        "100"
     ]
 
     with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
@@ -20,8 +23,7 @@ def server():
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-async def test_text_1_str_text_2_list(server: RemoteOpenAIServer,
-                                      model_name: str):
+def test_text_1_str_text_2_list(server: RemoteOpenAIServer, model_name: str):
     text_1 = "What is the capital of France?"
     text_2 = [
         "The capital of Brazil is Brasilia.", "The capital of France is Paris."
@@ -45,8 +47,7 @@ async def test_text_1_str_text_2_list(server: RemoteOpenAIServer,
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-async def test_text_1_list_text_2_list(server: RemoteOpenAIServer,
-                                       model_name: str):
+def test_text_1_list_text_2_list(server: RemoteOpenAIServer, model_name: str):
     text_1 = [
         "What is the capital of the United States?",
         "What is the capital of France?"
@@ -73,8 +74,7 @@ async def test_text_1_list_text_2_list(server: RemoteOpenAIServer,
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-async def test_text_1_str_text_2_str(server: RemoteOpenAIServer,
-                                     model_name: str):
+def test_text_1_str_text_2_str(server: RemoteOpenAIServer, model_name: str):
     text_1 = "What is the capital of France?"
     text_2 = "The capital of France is Paris."
 
@@ -91,3 +91,36 @@ async def test_text_1_str_text_2_str(server: RemoteOpenAIServer,
     assert score.data is not None
     assert len(score.data) == 1
     assert score.data[0].score >= 0.9
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+def test_score_max_model_len(server: RemoteOpenAIServer, model_name: str):
+
+    text_1 = "What is the capital of France?" * 20
+    text_2 = [
+        "The capital of Brazil is Brasilia.", "The capital of France is Paris."
+    ]
+
+    score_response = requests.post(server.url_for("score"),
+                                   json={
+                                       "model": model_name,
+                                       "text_1": text_1,
+                                       "text_2": text_2,
+                                   })
+    assert score_response.status_code == 400
+    # Assert just a small fragments of the response
+    assert "Please reduce the length of the input." in \
+        score_response.text
+
+    # Test truncation
+    score_response = requests.post(server.url_for("score"),
+                                   json={
+                                       "model": model_name,
+                                       "text_1": text_1,
+                                       "text_2": text_2,
+                                       "truncate_prompt_tokens": 101
+                                   })
+    assert score_response.status_code == 400
+    assert "Please, select a smaller truncation size." in \
+        score_response.text
diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py
index 88859255f202a..3da447be06430 100644
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -203,15 +203,19 @@ def _validate_input(
     ) -> TextTokensPrompt:
         token_num = len(input_ids)
 
-        # Note: EmbeddingRequest doesn't have max_tokens
-        if isinstance(request,
-                      (EmbeddingChatRequest, EmbeddingCompletionRequest)):
+        # Note: EmbeddingRequest and ScoreRequest doesn't have max_tokens
+        if isinstance(
+                request,
+            (EmbeddingChatRequest, EmbeddingCompletionRequest, ScoreRequest)):
+
+            operation = "score" if isinstance(request, ScoreRequest) \
+                else "embedding generation"
             if token_num > self.max_model_len:
                 raise ValueError(
                     f"This model's maximum context length is "
                     f"{self.max_model_len} tokens. However, you requested "
-                    f"{token_num} tokens in the input for embedding "
-                    f"generation. Please reduce the length of the input.")
+                    f"{token_num} tokens in the input for {operation}. "
+                    f"Please reduce the length of the input.")
             return TextTokensPrompt(prompt=input_text,
                                     prompt_token_ids=input_ids)
 
diff --git a/vllm/entrypoints/openai/serving_score.py b/vllm/entrypoints/openai/serving_score.py
index 5d3e7139d7a17..381edf8fac49e 100644
--- a/vllm/entrypoints/openai/serving_score.py
+++ b/vllm/entrypoints/openai/serving_score.py
@@ -101,6 +101,38 @@ async def create_score(
             if not self.model_config.is_cross_encoder:
                 raise ValueError("Model is not cross encoder.")
 
+            if truncate_prompt_tokens is not None and \
+                truncate_prompt_tokens > self.max_model_len:
+                raise ValueError(
+                    f"truncate_prompt_tokens value ({truncate_prompt_tokens}) "
+                    f"is greater than max_model_len ({self.max_model_len})."
+                    f" Please, select a smaller truncation size.")
+
+            input_pairs = make_pairs(request.text_1, request.text_2)
+            for q, t in input_pairs:
+                request_prompt = f"{q}{tokenizer.sep_token}{t}"
+
+                tokenization_kwargs: Dict[str, Any] = {}
+                if truncate_prompt_tokens is not None:
+                    tokenization_kwargs["truncation"] = True
+                    tokenization_kwargs["max_length"] = truncate_prompt_tokens
+
+                tokenize_async = make_async(tokenizer.__call__,
+                                            executor=self._tokenizer_executor)
+                prompt_inputs = await tokenize_async(text=q,
+                                                     text_pair=t,
+                                                     **tokenization_kwargs)
+
+                input_ids = prompt_inputs["input_ids"]
+                text_token_prompt = \
+                    self._validate_input(request, input_ids, request_prompt)
+                engine_prompt = TokensPrompt(
+                    prompt_token_ids=text_token_prompt["prompt_token_ids"],
+                    token_type_ids=prompt_inputs.get("token_type_ids"))
+
+                request_prompts.append(request_prompt)
+                engine_prompts.append(engine_prompt)
+
         except ValueError as e:
             logger.exception("Error in preprocessing prompt inputs")
             return self.create_error_response(str(e))
@@ -108,28 +140,6 @@ async def create_score(
         # Schedule the request and get the result generator.
         generators: List[AsyncGenerator[PoolingRequestOutput, None]] = []
 
-        input_pairs = make_pairs(request.text_1, request.text_2)
-
-        for q, t in input_pairs:
-            request_prompt = f"{q}{tokenizer.sep_token}{t}"
-
-            tokenization_kwargs: Dict[str, Any] = {}
-            if truncate_prompt_tokens is not None:
-                tokenization_kwargs["truncation"] = True
-                tokenization_kwargs["max_length"] = truncate_prompt_tokens
-
-            tokenize_async = make_async(tokenizer.__call__,
-                                        executor=self._tokenizer_executor)
-            prompt_inputs = await tokenize_async(text=q,
-                                                 text_pair=t,
-                                                 **tokenization_kwargs)
-            engine_prompt = TokensPrompt(
-                prompt_token_ids=prompt_inputs["input_ids"],
-                token_type_ids=prompt_inputs.get("token_type_ids"))
-
-            request_prompts.append(request_prompt)
-            engine_prompts.append(engine_prompt)
-
         try:
             pooling_params = request.to_pooling_params()
 

From 54cacf008f00d35d46273fed4d538cf5740d0965 Mon Sep 17 00:00:00 2001
From: Kunshang Ji <kunshang.ji@intel.com>
Date: Sat, 18 Jan 2025 00:47:53 +0800
Subject: [PATCH 258/462] [Bugfix] Mistral tokenizer encode accept list of str
 (#12149)

Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
---
 vllm/transformers_utils/tokenizers/mistral.py | 38 +++++++++++++++----
 1 file changed, 30 insertions(+), 8 deletions(-)

diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py
index 17d722e3d88fe..d801cf4e4c7b1 100644
--- a/vllm/transformers_utils/tokenizers/mistral.py
+++ b/vllm/transformers_utils/tokenizers/mistral.py
@@ -18,6 +18,7 @@
                                                      Tekkenizer)
 
 from vllm.logger import init_logger
+from vllm.utils import is_list_of
 
 if TYPE_CHECKING:
     from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
@@ -27,7 +28,7 @@
 
 @dataclass
 class Encoding:
-    input_ids: List[int]
+    input_ids: Union[List[int], List[List[int]]]
 
 
 def maybe_serialize_tool_calls(request: ChatCompletionRequest):
@@ -223,17 +224,25 @@ def __len__(self) -> int:
 
     def __call__(
         self,
-        prompt: str,
+        prompt: Union[str, List[str], List[int]],
         add_special_tokens: bool = False,
         truncation: bool = False,
         max_length: Optional[int] = None,
     ):
-        # Mistral Tokenizers should not add special tokens
-        input_ids = self.encode(prompt)
-
-        if truncation:
-            input_ids = input_ids[:max_length]
-
+        input_ids: Union[List[int], List[List[int]]]
+        # For List[str], original prompt text
+        if is_list_of(prompt, str):
+            input_ids_: List[List[int]] = []
+            for p in prompt:
+                each_input_ids = self.encode_one(p, truncation, max_length)
+                input_ids_.append(each_input_ids)
+            input_ids = input_ids_
+        # For List[int], apply chat template output, already tokens.
+        elif is_list_of(prompt, int):
+            input_ids = prompt
+        # For str, single prompt text
+        else:
+            input_ids = self.encode_one(prompt, truncation, max_length)
         return Encoding(input_ids=input_ids)
 
     def get_vocab(self) -> Dict[str, int]:
@@ -245,6 +254,19 @@ def get_added_vocab(self) -> Dict[str, int]:
         # Mistral tokenizers have no added vocabulary
         return {}
 
+    def encode_one(
+        self,
+        prompt: str,
+        truncation: bool = False,
+        max_length: Optional[int] = None,
+    ) -> List[int]:
+        # Mistral Tokenizers should not add special tokens
+        input_ids = self.encode(prompt)
+
+        if truncation:
+            input_ids = input_ids[:max_length]
+        return input_ids
+
     def encode(self, prompt: str) -> List[int]:
         # `encode` should only be used for prompt completion
         # it should never be used for chat_completion.

From b5b57e301e7bce3a90af7a3ed206414c46eb64e0 Mon Sep 17 00:00:00 2001
From: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com>
Date: Fri, 17 Jan 2025 12:12:26 -0500
Subject: [PATCH 259/462] [AMD][FP8] Using MI300 FP8 format on ROCm for
 block_quant (#12134)

Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
---
 .../model_executor/layers/quantization/fp8.py | 33 +++++++++++++++++++
 .../layers/quantization/utils/fp8_utils.py    | 14 ++++++--
 2 files changed, 44 insertions(+), 3 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index 4969ee559522e..26dd5df4e55b2 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -247,6 +247,15 @@ def create_weights(
     def process_weights_after_loading(self, layer: Module) -> None:
         # Block quant doesn't need to process weights after loading
         if self.block_quant:
+            if current_platform.is_rocm():
+                weight, weight_scale, _ = \
+                    normalize_e4m3fn_to_e4m3fnuz(
+                        weight=layer.weight,
+                        weight_scale=layer.weight_scale_inv,
+                        input_scale=layer.input_scale)
+                layer.weight = Parameter(weight, requires_grad=False)
+                layer.weight_scale_inv = Parameter(weight_scale,
+                                                   requires_grad=False)
             return
         layer.weight = torch.nn.Parameter(layer.weight.data,
                                           requires_grad=False)
@@ -495,6 +504,30 @@ def create_weights(self, layer: Module, num_experts: int, hidden_size: int,
     def process_weights_after_loading(self, layer: Module) -> None:
         # Block quant doesn't need to process weights after loading
         if self.block_quant:
+            if current_platform.is_rocm():
+                w13_weight, w13_weight_scale_inv, w13_input_scale = \
+                    normalize_e4m3fn_to_e4m3fnuz(
+                        layer.w13_weight, layer.w13_weight_scale_inv,
+                        layer.w13_input_scale)
+                w2_weight, w2_weight_scale_inv, w2_input_scale = \
+                    normalize_e4m3fn_to_e4m3fnuz(
+                        layer.w2_weight, layer.w2_weight_scale_inv,
+                        layer.w2_input_scale)
+                # Reset the parameter
+                layer.w13_weight = torch.nn.Parameter(w13_weight,
+                                                      requires_grad=False)
+                layer.w13_weight_scale_inv = torch.nn.Parameter(
+                    w13_weight_scale_inv, requires_grad=False)
+                if w13_input_scale is not None:
+                    layer.w13_input_scale = torch.nn.Parameter(
+                        w13_input_scale, requires_grad=False)
+                layer.w2_weight = torch.nn.Parameter(w2_weight,
+                                                     requires_grad=False)
+                layer.w2_weight_scale_inv = torch.nn.Parameter(
+                    w2_weight_scale_inv, requires_grad=False)
+                if w2_input_scale is not None:
+                    layer.w2_input_scale = torch.nn.Parameter(
+                        w2_input_scale, requires_grad=False)
             return
         # If checkpoint is fp16, quantize in place.
         if not self.quant_config.is_checkpoint_fp8_serialized:
diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
index f3c3e130e4161..b6882cc7c837c 100644
--- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
@@ -5,6 +5,8 @@
 import triton
 import triton.language as tl
 
+from vllm.platforms import current_platform
+
 
 def apply_w8a8_block_fp8_linear(
     input: torch.Tensor,
@@ -33,11 +35,14 @@ def apply_w8a8_block_fp8_linear(
 
 
 def input_to_float8(
-    x: torch.Tensor,
-    dtype: torch.dtype = torch.float8_e4m3fn
+        x: torch.Tensor,
+        dtype: Optional[torch.dtype] = None
 ) -> Tuple[torch.Tensor, torch.Tensor]:
     """This function quantizes input values to float8 values "
     "with tensor-wise quantization."""
+    if dtype is None:
+        dtype = (torch.float8_e4m3fnuz
+                 if current_platform.is_rocm() else torch.float8_e4m3fn)
     finfo = torch.finfo(dtype)
     min_val, max_val = x.aminmax()
     amax = torch.maximum(min_val.abs(), max_val.abs()).clamp(min=1e-12)
@@ -125,7 +130,7 @@ def per_token_group_quant_fp8(
     x: torch.Tensor,
     group_size: int,
     eps: float = 1e-10,
-    dtype: torch.dtype = torch.float8_e4m3fn,
+    dtype: Optional[torch.dtype] = None,
 ) -> Tuple[torch.Tensor, torch.Tensor]:
     """Function to perform per-token-group quantization on an input tensor `x`.
     It converts the tensor values into signed float8 values and returns the
@@ -140,6 +145,9 @@ def per_token_group_quant_fp8(
         Tuple[torch.Tensor, torch.Tensor]: The quantized tensor and the
         scaling factor for quantization.
     """
+    if dtype is None:
+        dtype = (torch.float8_e4m3fnuz
+                 if current_platform.is_rocm() else torch.float8_e4m3fn)
     assert (x.shape[-1] % group_size == 0), (
         f"the last dimension of `x` {x.shape[-1]} must be divisible "
         f"by `group_size` {group_size}")

From 7b98a65ae6f011fb31fefa1f563b7d6e554df434 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Sat, 18 Jan 2025 04:29:31 +0800
Subject: [PATCH 260/462] [torch.compile] disable logging when cache is
 disabled (#12043)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 vllm/compilation/backends.py | 30 +++++++++++++++++++++---------
 1 file changed, 21 insertions(+), 9 deletions(-)

diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
index 87655530cead4..157e3f7f39c9c 100644
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -251,15 +251,27 @@ def _check_can_cache(*args, **kwargs):
         def _get_shape_env() -> AlwaysHitShapeEnv:
             return AlwaysHitShapeEnv()
 
-        with patch(# for hijacking the hash of the compiled graph
-                "torch._inductor.codecache.compiled_fx_graph_hash",
-                hijack_compiled_fx_graph_hash), \
-            patch(# for providing a dummy shape environment
-                "torch._inductor.codecache.FxGraphCache._get_shape_env",
-                 _get_shape_env), \
-            patch(# for forcing the graph to be cached
-                "torch._inductor.codecache.FxGraphCache._check_can_cache",
-                _check_can_cache):
+        with ExitStack() as stack:
+            if not cache_data.disabled:
+                # compilation cache is enabled, patch several functions
+
+                # for hijacking the hash of the compiled graph
+                stack.enter_context(
+                    patch("torch._inductor.codecache.compiled_fx_graph_hash",
+                          hijack_compiled_fx_graph_hash))
+
+                # for providing a dummy shape environment
+                stack.enter_context(
+                    patch(
+                        "torch._inductor.codecache.FxGraphCache._get_shape_env",
+                        _get_shape_env))
+
+                # for forcing the graph to be cached
+                stack.enter_context(
+                    patch(
+                        "torch._inductor.codecache.FxGraphCache._check_can_cache",
+                        _check_can_cache))
+
             compiled_graph = compile_fx(graph,
                                         example_inputs,
                                         config_patches=current_config)

From 2b835032275622d70b19b8cd834740336dc26138 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Sat, 18 Jan 2025 10:53:27 +0800
Subject: [PATCH 261/462] [misc] fix cross-node TP (#12166)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 vllm/executor/mp_distributed_executor.py | 38 ++++++++++++++++++++++--
 vllm/platforms/cuda.py                   | 22 --------------
 2 files changed, 36 insertions(+), 24 deletions(-)

diff --git a/vllm/executor/mp_distributed_executor.py b/vllm/executor/mp_distributed_executor.py
index 8ae88e646aad6..a80b0ee8b3122 100644
--- a/vllm/executor/mp_distributed_executor.py
+++ b/vllm/executor/mp_distributed_executor.py
@@ -1,4 +1,5 @@
 import asyncio
+import os
 from typing import Any, Callable, List, Optional, Union
 
 import cloudpickle
@@ -10,8 +11,9 @@
 from vllm.logger import init_logger
 from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.sequence import ExecuteModelRequest
-from vllm.utils import (_run_task_with_lock, get_distributed_init_method,
-                        get_ip, get_open_port, make_async, run_method)
+from vllm.utils import (_run_task_with_lock, cuda_device_count_stateless,
+                        get_distributed_init_method, get_ip, get_open_port,
+                        make_async, run_method, update_environment_variables)
 from vllm.worker.worker_base import WorkerWrapperBase
 
 logger = init_logger(__name__)
@@ -22,7 +24,39 @@ class MultiprocessingDistributedExecutor(DistributedExecutorBase):
 
     uses_ray: bool = False
 
+    def _check_cuda(self) -> None:
+        """Check that the number of GPUs is sufficient for the parallel
+        configuration. Separate from _init_executor to reduce the number of
+        indented blocks.
+        """
+        parallel_config = self.parallel_config
+        world_size = parallel_config.world_size
+        tensor_parallel_size = parallel_config.tensor_parallel_size
+
+        cuda_device_count = cuda_device_count_stateless()
+        # Use confusing message for more common TP-only case.
+        if tensor_parallel_size > cuda_device_count:
+            raise RuntimeError(
+                f"please set tensor_parallel_size ({tensor_parallel_size}) "
+                f"to less than max local gpu count ({cuda_device_count})")
+
+        if world_size > cuda_device_count:
+            raise RuntimeError(
+                f"please ensure that world_size ({world_size}) "
+                f"is less than than max local gpu count ({cuda_device_count})")
+
+        # Set CUDA_VISIBLE_DEVICES for the driver, inherited by workers
+        if "CUDA_VISIBLE_DEVICES" not in os.environ:
+            update_environment_variables({
+                "CUDA_VISIBLE_DEVICES": (",".join(map(str, range(world_size))))
+            })
+
     def _init_executor(self) -> None:
+
+        from vllm.platforms import current_platform
+        if current_platform.is_cuda_alike():
+            self._check_cuda()
+
         # Create the parallel GPU workers.
         world_size = self.parallel_config.world_size
         tensor_parallel_size = self.parallel_config.tensor_parallel_size
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index 8350177b68ade..2587e3a11dde3 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -139,28 +139,6 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
                 else:
                     parallel_config.worker_cls = "vllm.worker.worker.Worker"
 
-        world_size = parallel_config.world_size
-        tensor_parallel_size = parallel_config.tensor_parallel_size
-
-        from vllm.utils import (cuda_device_count_stateless,
-                                update_environment_variables)
-
-        # Set CUDA_VISIBLE_DEVICES for the driver, inherited by workers
-        if "CUDA_VISIBLE_DEVICES" not in os.environ:
-            update_environment_variables({
-                "CUDA_VISIBLE_DEVICES": (",".join(map(str, range(world_size))))
-            })
-
-        cuda_device_count = cuda_device_count_stateless()
-        # Use confusing message for more common TP-only case.
-        assert tensor_parallel_size <= cuda_device_count, (
-            f"please set tensor_parallel_size ({tensor_parallel_size}) "
-            f"to less than max local gpu count ({cuda_device_count})")
-
-        assert world_size <= cuda_device_count, (
-            f"please ensure that world_size ({world_size}) "
-            f"is less than than max local gpu count ({cuda_device_count})")
-
         cache_config = vllm_config.cache_config
         if cache_config and cache_config.block_size is None:
             cache_config.block_size = 16

From c09503ddd657850c66548b5bb28e58bac1c4afb7 Mon Sep 17 00:00:00 2001
From: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com>
Date: Fri, 17 Jan 2025 22:15:53 -0500
Subject: [PATCH 262/462] [AMD][CI/Build][Bugfix] use pytorch stale wheel
 (#12172)

Signed-off-by: hongxyan <hongxyan@amd.com>
---
 Dockerfile.rocm                                          | 6 +++---
 docs/source/getting_started/installation/gpu/rocm.inc.md | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/Dockerfile.rocm b/Dockerfile.rocm
index e733994f8c33e..e922cb207b899 100644
--- a/Dockerfile.rocm
+++ b/Dockerfile.rocm
@@ -51,10 +51,10 @@ RUN --mount=type=cache,target=/root/.cache/pip \
         *"rocm-6.2"*) \
             python3 -m pip uninstall -y torch torchvision \
             && python3 -m pip install --pre \
-                torch==2.6.0.dev20241113+rocm6.2 \
+                torch \
                 'setuptools-scm>=8' \
-                torchvision==0.20.0.dev20241113+rocm6.2 \
-                --extra-index-url https://download.pytorch.org/whl/nightly/rocm6.2;; \
+                torchvision \
+                --extra-index-url https://download.pytorch.org/whl/rocm6.2;; \
         *) ;; esac
 
 ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer
diff --git a/docs/source/getting_started/installation/gpu/rocm.inc.md b/docs/source/getting_started/installation/gpu/rocm.inc.md
index f6f9d3c303f89..4256027e6c40e 100644
--- a/docs/source/getting_started/installation/gpu/rocm.inc.md
+++ b/docs/source/getting_started/installation/gpu/rocm.inc.md
@@ -70,7 +70,7 @@ Currently, there are no pre-built ROCm wheels.
 
     # Install PyTorch
     $ pip uninstall torch -y
-    $ pip install --no-cache-dir --pre torch==2.6.0.dev20241024 --index-url https://download.pytorch.org/whl/nightly/rocm6.2
+    $ pip install --no-cache-dir --pre torch --index-url https://download.pytorch.org/whl/rocm6.2
 
     # Build & install AMD SMI
     $ pip install /opt/rocm/share/amd_smi

From da02cb4b274ab8bcebb8b8e677ff4b43440bc499 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Sat, 18 Jan 2025 12:25:08 +0800
Subject: [PATCH 263/462] [core] further polish memory profiling (#12126)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 tests/test_utils.py   | 26 ++++++------
 vllm/utils.py         | 95 +++++++++++++++++++++++++------------------
 vllm/worker/worker.py | 31 +++++++-------
 3 files changed, 85 insertions(+), 67 deletions(-)

diff --git a/tests/test_utils.py b/tests/test_utils.py
index c68d730af7f8a..d5dc4464e634d 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -9,10 +9,10 @@
 from vllm_test_utils import monitor
 
 from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
-from vllm.utils import (FlexibleArgumentParser, PlaceholderModule,
-                        StoreBoolean, bind_kv_cache, deprecate_kwargs,
-                        get_open_port, memory_profiling, merge_async_iterators,
-                        supports_kw)
+from vllm.utils import (FlexibleArgumentParser, MemorySnapshot,
+                        PlaceholderModule, StoreBoolean, bind_kv_cache,
+                        deprecate_kwargs, get_open_port, memory_profiling,
+                        merge_async_iterators, supports_kw)
 
 from .utils import error_on_warning, fork_new_process_for_each_test
 
@@ -284,14 +284,13 @@ def test_memory_profiling():
     # 512 MiB allocation outside of this instance
     handle1 = lib.cudaMalloc(512 * 1024 * 1024)
 
-    baseline_memory_in_bytes = \
-        torch.cuda.mem_get_info()[1] - torch.cuda.mem_get_info()[0]
+    baseline_snapshot = MemorySnapshot()
 
     # load weights
 
     weights = torch.randn(128, 1024, 1024, device='cuda', dtype=torch.float32)
 
-    weights_memory_in_bytes = 128 * 1024 * 1024 * 4 # 512 MiB
+    weights_memory = 128 * 1024 * 1024 * 4 # 512 MiB
 
     def measure_current_non_torch():
         free, total = torch.cuda.mem_get_info()
@@ -300,8 +299,8 @@ def measure_current_non_torch():
         current_non_torch = current_used - current_torch
         return current_non_torch
 
-    with memory_profiling(baseline_memory_in_bytes=baseline_memory_in_bytes,
-    weights_memory_in_bytes=weights_memory_in_bytes) as result, \
+    with memory_profiling(baseline_snapshot=baseline_snapshot,
+    weights_memory=weights_memory) as result, \
         monitor(measure_current_non_torch) as monitored_values:
         # make a memory spike, 1 GiB
         spike = torch.randn(256, 1024, 1024, device='cuda', dtype=torch.float32)
@@ -316,13 +315,12 @@ def measure_current_non_torch():
     assert measured_diff == 256 * 1024 * 1024
 
     # Check that the memory usage is within 5% of the expected values
-    # 5% tolerance is caused by PyTorch caching allocator,
-    # we cannot control PyTorch's behavior of its internal buffers,
+    # 5% tolerance is caused by cuda runtime.
+    # we cannot control cuda runtime in the granularity of bytes,
     # which causes a small error (<10 MiB in practice)
-    non_torch_ratio = result.non_torch_increase_in_bytes / (256 * 1024 * 1024) # noqa
-    torch_peak_ratio = result.torch_peak_increase_in_bytes / (1024 * 1024 * 1024) # noqa
+    non_torch_ratio = result.non_torch_increase / (256 * 1024 * 1024) # noqa
     assert abs(non_torch_ratio - 1) <= 0.05
-    assert abs(torch_peak_ratio - 1) <= 0.05
+    assert result.torch_peak_increase == 1024 * 1024 * 1024
     del weights
     lib.cudaFree(handle1)
     lib.cudaFree(handle2)
diff --git a/vllm/utils.py b/vllm/utils.py
index 89ba119bb5e55..17bffd2846b46 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -1923,36 +1923,57 @@ def kill_process_tree(pid: int):
 @dataclass
 class MemorySnapshot:
     """Memory snapshot."""
-    torch_peak_in_bytes: int = 0
-    torch_memory_in_bytes: int = 0
+    torch_peak: int = 0
+    cuda_memory: int = 0
+    torch_memory: int = 0
+    non_torch_memory: int = 0
     timestamp: float = 0.0
+    auto_measure: bool = True
+
+    def __post_init__(self):
+        if self.auto_measure:
+            self.measure()
 
     def measure(self):
-        self.torch_peak_in_bytes = torch.cuda.max_memory_reserved()
+        # we measure the torch peak memory usage via allocated_bytes,
+        # rather than `torch.cuda.memory_reserved()` .
+        # After `torch.cuda.reset_peak_memory_stats()`,
+        # `torch.cuda.memory_reserved()` will keep growing, and only shrink
+        # when we call `torch.cuda.empty_cache()` or OOM happens.
+        self.torch_peak = torch.cuda.memory_stats().get(
+            "allocated_bytes.all.peak", 0)
+
+        self.cuda_memory = torch.cuda.mem_get_info(
+        )[1] - torch.cuda.mem_get_info()[0]
+
         # torch.cuda.memory_reserved() is how many bytes
         # PyTorch gets from cuda (by calling cudaMalloc, etc.)
-        self.torch_memory_in_bytes = torch.cuda.memory_reserved()
+        # this is used to measure the non-torch memory usage
+        self.torch_memory = torch.cuda.memory_reserved()
+
+        self.non_torch_memory = self.cuda_memory - self.torch_memory
         self.timestamp = time.time()
 
     def __sub__(self, other: "MemorySnapshot") -> "MemorySnapshot":
-        """support a - b"""
         return MemorySnapshot(
-            torch_peak_in_bytes=self.torch_peak_in_bytes -
-            other.torch_peak_in_bytes,
-            torch_memory_in_bytes=self.torch_memory_in_bytes -
-            other.torch_memory_in_bytes,
-            timestamp=self.timestamp - other.timestamp)
+            torch_peak=self.torch_peak - other.torch_peak,
+            cuda_memory=self.cuda_memory - other.cuda_memory,
+            torch_memory=self.torch_memory - other.torch_memory,
+            non_torch_memory=self.non_torch_memory - other.non_torch_memory,
+            timestamp=self.timestamp - other.timestamp,
+            auto_measure=False,
+        )
 
 
 @dataclass
 class MemoryProfilingResult:
-    """Memory profiling result.
-    """  # noqa
-    baseline_memory_in_bytes: int = 0
-    non_kv_cache_memory_in_bytes: int = 0
-    torch_peak_increase_in_bytes: int = 0
-    non_torch_increase_in_bytes: int = 0
-    weights_memory_in_bytes: float = 0
+    """Memory profiling result. All numbers are in bytes.
+    """
+    non_kv_cache_memory: int = 0
+    torch_peak_increase: int = 0
+    non_torch_increase: int = 0
+    weights_memory: float = 0
+    before_create: MemorySnapshot = field(default_factory=MemorySnapshot)
     before_profile: MemorySnapshot = field(default_factory=MemorySnapshot)
     after_profile: MemorySnapshot = field(default_factory=MemorySnapshot)
     profile_time: float = 0.0
@@ -1960,18 +1981,14 @@ class MemoryProfilingResult:
 
 @contextlib.contextmanager
 def memory_profiling(
-    baseline_memory_in_bytes: int, weights_memory_in_bytes: int
-) -> Generator[MemoryProfilingResult, None, None]:
+        baseline_snapshot: MemorySnapshot,
+        weights_memory: int) -> Generator[MemoryProfilingResult, None, None]:
     """Memory profiling context manager.
-    baseline_memory_in_bytes: memory used by all the components other than
-        the current vLLM instance. It contains: memory used by other processes, memory
-        used by another vLLM instance in the same process, etc. It is usually measured
-        before the current vLLM instance initialize the device. And we assume it is
-        constant during the profiling of the current vLLM instance.
-    weights_memory_in_bytes: memory used by PyTorch when loading the model weights.
+    baseline_snapshot: the memory snapshot before the current vLLM instance.
+    weights_memory: memory used by PyTorch when loading the model weights.
         Note that, before loading the model weights, we also initialize the device
         and distributed environment, which may consume some memory. This part is not
-        included in the weights_memory_in_bytes because PyTorch does not control it.
+        included in the weights_memory because PyTorch does not control it.
 
     The memory in one GPU can be classified into 3 categories:
     1. memory used by anything other than the current vLLM instance.
@@ -2006,20 +2023,21 @@ def memory_profiling(
     b. 2 GiB reserved for the peak activation tensors (category 2)
     c. 1 GiB used by non-torch components (category 3)
 
-    The memory used for loading weights (a.) is directly given from the argument `weights_memory_in_bytes`.
+    The memory used for loading weights (a.) is directly given from the argument `weights_memory`.
 
-    The increase of `torch.cuda.memory_stats()["allocated_bytes.all.peak"]` after profiling gives (b.).
+    The increase of `torch.cuda.memory_stats()["allocated_bytes.all.peak"]` during profiling gives (b.).
 
-    (c.) is tricky. We measure the total memory used in this GPU (`torch.cuda.mem_get_info()[1] - torch.cuda.mem_get_info()[0]`),
-    subtract the baseline memory, the memory used by the model weights, and diff of `torch.cuda.memory_reserved()`.
+    The increase of `non_torch_memory` from creating the current vLLM instance until after profiling to get (c.).
     """ # noqa
+    gc.collect()
+    torch.cuda.empty_cache()
     torch.cuda.reset_peak_memory_stats()
 
     result = MemoryProfilingResult()
 
-    result.baseline_memory_in_bytes = baseline_memory_in_bytes
+    result.before_create = baseline_snapshot
     # the part of memory used for holding the model weights
-    result.weights_memory_in_bytes = weights_memory_in_bytes
+    result.weights_memory = weights_memory
 
     result.before_profile.measure()
 
@@ -2030,13 +2048,12 @@ def memory_profiling(
 
     result.after_profile.measure()
 
-    diff = result.after_profile - result.before_profile
-    result.torch_peak_increase_in_bytes = diff.torch_peak_in_bytes
-    current_cuda_memory_bytes = torch.cuda.mem_get_info(
-    )[1] - torch.cuda.mem_get_info()[0]
-    result.non_torch_increase_in_bytes = current_cuda_memory_bytes - baseline_memory_in_bytes - weights_memory_in_bytes - diff.torch_memory_in_bytes  # noqa
-    result.profile_time = diff.timestamp
-    result.non_kv_cache_memory_in_bytes = result.non_torch_increase_in_bytes + result.torch_peak_increase_in_bytes + result.weights_memory_in_bytes  # noqa
+    diff_profile = result.after_profile - result.before_profile
+    diff_from_create = result.after_profile - result.before_create
+    result.torch_peak_increase = diff_profile.torch_peak
+    result.non_torch_increase = diff_from_create.non_torch_memory
+    result.profile_time = diff_profile.timestamp
+    result.non_kv_cache_memory = result.non_torch_increase + result.torch_peak_increase + result.weights_memory  # noqa
 
 
 # Adapted from: https://github.com/sgl-project/sglang/blob/v0.4.1/python/sglang/srt/utils.py#L630 # noqa: E501
diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py
index 43eeb287d64eb..29d62ddda3dc0 100644
--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -21,7 +21,8 @@
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sequence import (ExecuteModelRequest, IntermediateTensors,
                            SequenceGroupMetadata, SequenceGroupMetadataDelta)
-from vllm.utils import GiB_bytes, bind_kv_cache, memory_profiling
+from vllm.utils import (GiB_bytes, MemorySnapshot, bind_kv_cache,
+                        memory_profiling)
 from vllm.worker.cache_engine import CacheEngine
 from vllm.worker.enc_dec_model_runner import EncoderDecoderModelRunner
 from vllm.worker.model_runner import GPUModelRunnerBase, ModelRunner
@@ -137,7 +138,8 @@ def init_device(self) -> None:
             _check_if_gpu_supports_dtype(self.model_config.dtype)
             gc.collect()
             torch.cuda.empty_cache()
-            self.init_gpu_memory = torch.cuda.mem_get_info()[0]
+            torch.cuda.reset_peak_memory_stats()
+            self.baseline_snapshot = MemorySnapshot()
         else:
             raise RuntimeError(
                 f"Not support device type: {self.device_config.device}")
@@ -192,10 +194,9 @@ def determine_num_available_blocks(self) -> Tuple[int, int]:
 
         # Execute a forward pass with dummy inputs to profile the memory usage
         # of the model.
-        with memory_profiling(baseline_memory_in_bytes=total_gpu_memory -
-                              self.init_gpu_memory,
-                              weights_memory_in_bytes=self.model_runner.
-                              model_memory_usage) as result:
+        with memory_profiling(
+                self.baseline_snapshot,
+                weights_memory=self.model_runner.model_memory_usage) as result:
             self.model_runner.profile_run()
 
         self._assert_memory_footprint_increased_during_profiling()
@@ -203,7 +204,7 @@ def determine_num_available_blocks(self) -> Tuple[int, int]:
         memory_for_current_instance = total_gpu_memory * \
             self.cache_config.gpu_memory_utilization
         available_kv_cache_memory = (memory_for_current_instance -
-                                     result.non_kv_cache_memory_in_bytes)
+                                     result.non_kv_cache_memory)
 
         # Calculate the number of blocks that can be allocated with the
         # profiled peak memory.
@@ -226,11 +227,11 @@ def determine_num_available_blocks(self) -> Tuple[int, int]:
                f"({self.cache_config.gpu_memory_utilization:.2f})"
                f" = {(memory_for_current_instance / GiB_bytes):.2f}GiB\n"
                "model weights take "
-               f"{(result.weights_memory_in_bytes / GiB_bytes):.2f}GiB;"
+               f"{(result.weights_memory / GiB_bytes):.2f}GiB;"
                " non_torch_memory takes "
-               f"{(result.non_torch_increase_in_bytes / GiB_bytes):.2f}GiB;"
+               f"{(result.non_torch_increase / GiB_bytes):.2f}GiB;"
                " PyTorch activation peak memory takes "
-               f"{(result.torch_peak_increase_in_bytes / GiB_bytes):.2f}GiB;"
+               f"{(result.torch_peak_increase / GiB_bytes):.2f}GiB;"
                " the rest of the memory reserved for KV Cache is "
                f"{(available_kv_cache_memory / GiB_bytes):.2f}GiB.")
 
@@ -246,11 +247,13 @@ def determine_num_available_blocks(self) -> Tuple[int, int]:
     def _assert_memory_footprint_increased_during_profiling(self):
         # NOTE(woosuk): Here we assume that the other processes using the same
         # GPU did not change their memory usage during the profiling.
-        free_gpu_memory, _ = torch.cuda.mem_get_info()
-        assert self.init_gpu_memory - free_gpu_memory > 0, (
+        free_gpu_memory, total = torch.cuda.mem_get_info()
+        cuda_memory = total - free_gpu_memory
+        assert self.baseline_snapshot.cuda_memory < cuda_memory, (
             "Error in memory profiling. "
-            f"Initial free memory {self.init_gpu_memory}, current free memory"
-            f" {free_gpu_memory}. This happens when the GPU memory was "
+            f"Initial used memory {self.baseline_snapshot.cuda_memory}, "
+            f"currently used memory {cuda_memory}. "
+            f"This happens when the GPU memory was "
             "not properly cleaned up before initializing the vLLM instance.")
 
     def initialize_cache(self, num_gpu_blocks: int,

From 813f249f022a44aded2a843f0c7108ea0b7d1f6b Mon Sep 17 00:00:00 2001
From: Russell Bryant <rbryant@redhat.com>
Date: Fri, 17 Jan 2025 23:35:21 -0500
Subject: [PATCH 264/462] [Docs] Fix broken link in SECURITY.md (#12175)

Signed-off-by: Russell Bryant <rbryant@redhat.com>
---
 SECURITY.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/SECURITY.md b/SECURITY.md
index de0032d26c87b..47196a1f1221e 100644
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -4,7 +4,7 @@
 
 If you believe you have found a security vulnerability in vLLM, we encourage you to let us know right away. We will investigate all legitimate reports and do our best to quickly fix the problem.
 
-Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new). Reports will then be triaged by the [vulnerability management team](https://docs.vllm.ai/contributing/vulnerability_management/).
+Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new). Reports will then be triaged by the [vulnerability management team](https://docs.vllm.ai/en/latest/contributing/vulnerability_management.html).
 
 ---
 

From 02798ecabed36f4c255f5a12ad6c271f95cd8c4e Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Sat, 18 Jan 2025 13:59:39 +0800
Subject: [PATCH 265/462] [Model] Port deepseek-vl2 processor, remove
 dependency (#12169)

Signed-off-by: Isotr0py <2037008807@qq.com>
---
 .buildkite/test-pipeline.yaml                 |   1 -
 docs/source/models/supported_models.md        |  10 +-
 .../vision_language_multi_image.py            |   2 +-
 .../vision_language/test_models.py            |   2 +-
 .../multimodal/processing/test_common.py      |   3 +
 vllm/model_executor/models/deepseek_vl2.py    |  51 +--
 .../transformers_utils/processors/__init__.py |   4 +
 .../processors/deepseek_vl2.py                | 361 ++++++++++++++++++
 8 files changed, 385 insertions(+), 49 deletions(-)
 create mode 100644 vllm/transformers_utils/processors/__init__.py
 create mode 100644 vllm/transformers_utils/processors/deepseek_vl2.py

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index bff557d7fc92f..d2b140e718501 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -52,7 +52,6 @@ steps:
   - tests/worker
   - tests/standalone_tests/lazy_torch_compile.py
   commands:
-  - pip install git+https://github.com/Isotr0py/DeepSeek-VL2.git  # Used by multimoda processing test
   - python3 standalone_tests/lazy_torch_compile.py
   - pytest -v -s mq_llm_engine # MQLLMEngine
   - pytest -v -s async_engine # AsyncLLMEngine
diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md
index d07cde3db5c6e..2edb610ddf959 100644
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -767,16 +767,10 @@ See [this page](#generative-models) for more information on how to use generativ
 <sup>E</sup> Pre-computed embeddings can be inputted for this modality.  
 <sup>+</sup> Multiple items can be inputted per text prompt for this modality.
 
-````{note}
-To use `DeepSeek-VL2` series models, you need to install a fork version `deepseek_vl2` package:
-
-```shell
-pip install git+https://github.com/Isotr0py/DeepSeek-VL2.git
+```{note}
+To use `DeepSeek-VL2` series models, you have to pass `--hf_overrides '{"architectures": ["DeepseekVLV2ForCausalLM"]}'` when running vLLM.
 ```
 
-Besides, to run `DeepSeek-VL2` series models, you have to pass `--hf_overrides '{"architectures": ["DeepseekVLV2ForCausalLM"]}'` when running vLLM.
-````
-
 ```{note}
 To use `TIGER-Lab/Mantis-8B-siglip-llama3`, you have to pass `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM.
 ```
diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py
index cf3c5dd4e0a2c..43c44fa867e0a 100644
--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@@ -393,7 +393,7 @@ def load_qwen2_vl(question, image_urls: List[str]) -> ModelRequestData:
 
 model_example_map = {
     "aria": load_aria,
-    "deepseek_vl2": load_deepseek_vl2,
+    "deepseek_vl_v2": load_deepseek_vl2,
     "h2ovl_chat": load_h2onvl,
     "idefics3": load_idefics3,
     "internvl_chat": load_internvl,
diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py
index 5710303548c34..ca572cc39e538 100644
--- a/tests/models/decoder_only/vision_language/test_models.py
+++ b/tests/models/decoder_only/vision_language/test_models.py
@@ -190,7 +190,7 @@
         dtype="bfloat16",
     ),
     "deepseek_vl_v2": VLMTestInfo(
-        models=["deepseek-ai/deepseek-vl2-tiny"],
+        models=["Isotr0py/deepseek-vl2-tiny"], # model repo using dynamic module
         test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
         prompt_formatter=lambda img_prompt: f"<|User|>: {img_prompt}\n\n<|Assistant|>: ", # noqa: E501
         max_model_len=4096,
diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py
index 0a38779e0e4f0..1e3e7ea50b122 100644
--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
@@ -22,6 +22,8 @@ def _test_processing_correctness(
 ):
     if model_id == "TIGER-Lab/Mantis-8B-siglip-llama3":
         hf_overrides = {"architectures": ["MantisForConditionalGeneration"]}
+    elif model_id == "deepseek-ai/deepseek-vl2-tiny":
+        hf_overrides = {"architectures": ["DeepseekVLV2ForCausalLM"]}
     else:
         hf_overrides = {}
 
@@ -139,6 +141,7 @@ def _test_processing_correctness(
     ("rhymes-ai/Aria", {"image": True}),
     ("Salesforce/blip2-opt-2.7b", {"image": False}),
     ("facebook/chameleon-7b", {"image": False}),
+    ("deepseek-ai/deepseek-vl2-tiny", {"image": True}),
     ("adept/fuyu-8b", {"image": False}),
     ("llava-hf/llava-1.5-7b-hf", {"image": True}),
     ("llava-hf/llava-v1.6-mistral-7b-hf", {"image": True}),
diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py
index 4553695022169..4d3d1c329a2c0 100644
--- a/vllm/model_executor/models/deepseek_vl2.py
+++ b/vllm/model_executor/models/deepseek_vl2.py
@@ -1,7 +1,7 @@
 # adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/faf18023f24b962b32d9f0a2d89e402a8d383a78/deepseek_vl2/models/modeling_deepseek_vl_v2.py
 """Inference-only Deepseek-VL2 model compatible with HuggingFace weights."""
 import math
-from functools import cached_property, partial
+from functools import cached_property
 from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple,
                     TypedDict, Union)
 
@@ -9,7 +9,7 @@
 import torch.nn as nn
 import torch.nn.functional as F
 from einops import rearrange, repeat
-from transformers import AutoProcessor, BatchFeature, ProcessorMixin
+from transformers import BatchFeature
 
 from vllm.attention import AttentionMetadata
 from vllm.config import VllmConfig
@@ -31,6 +31,8 @@
 from vllm.transformers_utils.configs.deepseek_vl2 import (DeepseekVLV2Config,
                                                           MlpProjectorConfig,
                                                           VisionEncoderConfig)
+from vllm.transformers_utils.processors.deepseek_vl2 import (
+    DeepseekVLV2Processor)
 from vllm.utils import is_list_of
 
 from .interfaces import SupportsMultiModal, SupportsPP
@@ -129,25 +131,8 @@ class DeepseekVL2ProcessingInfo(BaseProcessingInfo):
     def get_hf_config(self):
         return self.ctx.get_hf_config(DeepseekVLV2Config)
 
-    def get_hf_processor(self) -> ProcessorMixin:
-        # TODO(Isotr0py): we should get rid of dependency on deepseek_vl2
-        # in the future, because it's flasky and lack of maintenance.
-        try:
-            from deepseek_vl2.models.processing_deepseek_vl_v2 import (
-                DeepseekVLV2Processor, select_best_resolution)
-            AutoProcessor.register("DeepseekVLV2Processor",
-                                   DeepseekVLV2Processor)
-        except ModuleNotFoundError as exc:
-            raise ModuleNotFoundError(
-                "You need to `pip install "
-                "git+https://github.com/deepseek-ai/DeepSeek-VL2.git` "
-                "to use this model") from exc
-
-        processor = self.ctx.get_hf_processor(DeepseekVLV2Processor)
-        processor.select_best_resolution = partial(
-            select_best_resolution,
-            candidate_resolutions=processor.candidate_resolutions)
-        return processor
+    def get_hf_processor(self) -> DeepseekVLV2Processor:
+        return self.ctx.get_hf_processor(DeepseekVLV2Processor)
 
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": None}
@@ -224,31 +209,21 @@ def _call_hf_processor(
         mm_kwargs: Mapping[str, object],
     ) -> BatchFeature:
         if mm_data:
-            outputs = self.info.ctx.call_hf_processor(
+            processed_outputs = self.info.ctx.call_hf_processor(
                 self.info.get_hf_processor(**mm_kwargs),
                 dict(prompt=prompt, **mm_data),
                 mm_kwargs,
             )
-
-            # Deepseek-vl2 processor don't return BatchFeature,
-            # we need to manually create it
-            processed_outputs = dict(input_ids=outputs["input_ids"])
-            processed_outputs = BatchFeature(data=dict(processed_outputs),
-                                             tensor_type="pt")
-
-            # Remove batch dimension from processor outputs,
-            # because we will try batch to create NestedTensors
             target_dtype = self.info.ctx.model_config.dtype
-            pixel_values = outputs["images"].to(target_dtype).squeeze(0)
-            images_spatial_crop = outputs["images_spatial_crop"].squeeze(0)
+            pixel_values = processed_outputs.pop("pixel_values").to(
+                target_dtype)
+            # split pixel values into patches corresponding to each image
+            images_spatial_crop = processed_outputs["images_spatial_crop"]
             patches_per_image = [
                 x.prod().item() + 1 for x in images_spatial_crop
             ]
-
-            # Rename `images` -> `pixel_values` to avoid confusion
-            processed_outputs["pixel_values"] = list(
-                pixel_values.split(patches_per_image))
-            processed_outputs["images_spatial_crop"] = images_spatial_crop
+            pixel_values = pixel_values.split(patches_per_image)
+            processed_outputs["pixel_values"] = pixel_values
         else:
             tokenizer = self.info.get_tokenizer()
             processed_outputs = tokenizer(prompt,
diff --git a/vllm/transformers_utils/processors/__init__.py b/vllm/transformers_utils/processors/__init__.py
new file mode 100644
index 0000000000000..9c71b8cada32e
--- /dev/null
+++ b/vllm/transformers_utils/processors/__init__.py
@@ -0,0 +1,4 @@
+from vllm.transformers_utils.processors.deepseek_vl2 import (
+    DeepseekVLV2Processor)
+
+__all__ = ["DeepseekVLV2Processor"]
diff --git a/vllm/transformers_utils/processors/deepseek_vl2.py b/vllm/transformers_utils/processors/deepseek_vl2.py
new file mode 100644
index 0000000000000..27cdf6bc22d0e
--- /dev/null
+++ b/vllm/transformers_utils/processors/deepseek_vl2.py
@@ -0,0 +1,361 @@
+# yapf: disable
+# ruff: noqa: E501
+# coding=utf-8
+# adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/ff23960c5cf9e6874b44be38af930cfb0ccbb620/deepseek_vl2/models/processing_deepseek_vl_v2.py
+# Copyright (c) 2023-2024 DeepSeek.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of
+# this software and associated documentation files (the "Software"), to deal in
+# the Software without restriction, including without limitation the rights to
+# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+# the Software, and to permit persons to whom the Software is furnished to do so,
+# subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+import math
+from typing import List, Tuple
+
+import torch
+import torchvision.transforms as T
+from PIL import Image, ImageOps
+from transformers import AutoProcessor, BatchFeature, LlamaTokenizerFast
+from transformers.processing_utils import ProcessorMixin
+
+
+class ImageTransform:
+
+    def __init__(self,
+                 mean: Tuple[float, float, float] = (0.5, 0.5, 0.5),
+                 std: Tuple[float, float, float] = (0.5, 0.5, 0.5),
+                 normalize: bool = True):
+        self.mean = mean
+        self.std = std
+        self.normalize = normalize
+
+        transform_pipelines = [T.ToTensor()]
+
+        if normalize:
+            transform_pipelines.append(T.Normalize(mean, std))
+
+        self.transform = T.Compose(transform_pipelines)
+
+    def __call__(self, pil_img: Image.Image):
+        x = self.transform(pil_img)
+        return x
+
+
+class DeepseekVLV2Processor(ProcessorMixin):
+    tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast")
+    attributes = ["tokenizer"]
+
+    def __init__(
+        self,
+        tokenizer: LlamaTokenizerFast,
+        candidate_resolutions: Tuple[Tuple[int, int]],
+        patch_size: int,
+        downsample_ratio: int,
+        image_mean: Tuple[float, float, float] = (0.5, 0.5, 0.5),
+        image_std: Tuple[float, float, float] = (0.5, 0.5, 0.5),
+        normalize: bool = True,
+        image_token: str = "<image>",
+        pad_token: str = "<｜▁pad▁｜>",
+        add_special_token: bool = False,
+        sft_format: str = "deepseek",
+        mask_prompt: bool = True,
+        ignore_id: int = -100,
+        **kwargs,
+    ):
+
+        self.candidate_resolutions = candidate_resolutions
+        self.image_size = candidate_resolutions[0][0]
+        self.patch_size = patch_size
+        self.image_mean = image_mean
+        self.image_std = image_std
+        self.normalize = normalize
+        self.downsample_ratio = downsample_ratio
+
+        self.image_transform = ImageTransform(mean=image_mean, std=image_std, normalize=normalize)
+        self.tokenizer = tokenizer
+        self.tokenizer.padding_side = 'left'  # must set this，padding side with make a difference in batch inference
+
+        # add the pad_token as special token to use 'tokenizer.pad_token' and 'tokenizer.pad_token_id'
+        if tokenizer.pad_token is None:
+            self.tokenizer.add_special_tokens({'pad_token': pad_token})
+
+        # add image token
+        image_token_id = self.tokenizer.vocab.get(image_token)
+        if image_token_id is None:
+            special_tokens = [image_token]
+            special_tokens_dict = {"additional_special_tokens": special_tokens}
+            self.tokenizer.add_special_tokens(special_tokens_dict)
+        self.image_token_id = self.tokenizer.vocab.get(image_token)
+
+        # add five special tokens for grounding-related tasks
+        # <|ref|>, <|/ref|>, <|det|>, <|/det|>, <|grounding|>
+        special_tokens = ['<|ref|>', '<|/ref|>', '<|det|>', '<|/det|>', '<|grounding|>']
+        special_tokens_dict = {"additional_special_tokens": special_tokens}
+        self.tokenizer.add_special_tokens(special_tokens_dict)
+
+        # add special tokens for SFT data
+        special_tokens = ["<|User|>", "<|Assistant|>"]
+        special_tokens_dict = {"additional_special_tokens": special_tokens}
+        self.tokenizer.add_special_tokens(special_tokens_dict)
+
+        self.image_token = image_token
+        self.pad_token = pad_token
+        self.add_special_token = add_special_token
+        self.sft_format = sft_format
+        self.mask_prompt = mask_prompt
+        self.ignore_id = ignore_id
+
+        super().__init__(
+            tokenizer,
+            **kwargs,
+        )
+
+    def select_best_resolution(self, image_size):
+        # used for cropping
+        original_width, original_height = image_size
+        best_fit = None
+        max_effective_resolution = 0
+        min_wasted_resolution = float("inf")
+
+        for width, height in self.candidate_resolutions:
+            scale = min(width / original_width, height / original_height)
+            downscaled_width, downscaled_height = int(
+                original_width * scale), int(original_height * scale)
+            effective_resolution = min(downscaled_width * downscaled_height,
+                                       original_width * original_height)
+            wasted_resolution = (width * height) - effective_resolution
+
+            if effective_resolution > max_effective_resolution or (
+                    effective_resolution == max_effective_resolution
+                    and wasted_resolution < min_wasted_resolution):
+                max_effective_resolution = effective_resolution
+                min_wasted_resolution = wasted_resolution
+                best_fit = (width, height)
+
+        return best_fit
+
+    @property
+    def bos_id(self):
+        return self.tokenizer.bos_token_id
+
+    @property
+    def eos_id(self):
+        return self.tokenizer.eos_token_id
+
+    @property
+    def pad_id(self):
+        return self.tokenizer.pad_token_id
+
+    def encode(self, text: str, bos: bool = True, eos: bool = False):
+        t = self.tokenizer.encode(text, add_special_tokens=False)
+
+        if bos:
+            t = [self.bos_id] + t
+        if eos:
+            t = t + [self.eos_id]
+
+        return t
+
+    def decode(self, t: List[int], **kwargs) -> str:
+        return self.tokenizer.decode(t, **kwargs)
+
+    def process_one(
+        self,
+        prompt: str,
+        images: List[Image.Image],
+        inference_mode: bool = True,
+        **kwargs,
+    ):
+        """
+
+        Args:
+            prompt (str): the formatted prompt;
+            conversations (List[Dict]): conversations with a list of messages;
+            images (List[ImageType]): the list of images;
+            inference_mode (bool): if True, then remove the last eos token;
+            system_prompt (str): the system prompt;
+            **kwargs:
+
+        Returns:
+            outputs (BaseProcessorOutput): the output of the processor,
+                - input_ids (torch.LongTensor): [N + image tokens]
+                - target_ids (torch.LongTensor): [N + image tokens]
+                - pixel_values (torch.FloatTensor): [n_patches, 3, H, W]
+                - image_id (int): the id of the image token
+                - num_image_tokens (List[int]): the number of image tokens
+        """
+
+        assert (prompt is not None and images is not None
+                ), "prompt and images must be used at the same time."
+
+        sft_format = prompt
+        tokenized_str, images_list, images_seq_mask, images_spatial_crop, num_image_tokens = self.tokenize_with_images(
+            sft_format, images, bos=True, eos=True, cropping=len(images) <= 2)
+        masked_tokenized_str = []
+        for token_index in tokenized_str:
+            if token_index != self.image_token_id:
+                masked_tokenized_str.append(token_index)
+            else:
+                masked_tokenized_str.append(self.ignore_id)
+
+        assert len(tokenized_str) == len(images_seq_mask) == len(masked_tokenized_str), \
+            (f"tokenized_str's length {len(tokenized_str)}, input_ids' length {len(masked_tokenized_str)}, "
+             f"imags_seq_mask's length {len(images_seq_mask)}, are not equal")
+
+        input_ids = torch.LongTensor(tokenized_str)
+        target_ids = torch.LongTensor(masked_tokenized_str)
+        images_seq_mask = torch.tensor(images_seq_mask, dtype=torch.bool)
+
+        # set input_ids < 0 | input_ids == self.image_token_id as ignore_id
+        target_ids[(input_ids < 0) |
+                   (input_ids == self.image_token_id)] = self.ignore_id
+        input_ids[input_ids < 0] = self.pad_id
+
+        if inference_mode:
+            # 去掉结尾的eos token
+            assert input_ids[-1] == self.eos_id
+            input_ids = input_ids[:-1]
+            target_ids = target_ids[:-1]
+            images_seq_mask = images_seq_mask[:-1]
+
+        if len(images_list) == 0:
+            pixel_values = torch.zeros((1, 3, self.image_size, self.image_size))
+            images_spatial_crop = torch.zeros((1, 2), dtype=torch.long)
+        else:
+            pixel_values = torch.stack(images_list, dim=0)
+            images_spatial_crop = torch.tensor(images_spatial_crop, dtype=torch.long)
+
+        input_ids = input_ids.unsqueeze(0)
+
+        prepare = BatchFeature(
+            data=dict(
+                input_ids=input_ids,
+                pixel_values=pixel_values,
+                images_seq_mask=images_seq_mask,
+                images_spatial_crop=images_spatial_crop,
+                num_image_tokens=num_image_tokens,
+            ),
+            tensor_type="pt",
+        )
+        return prepare
+
+    def __call__(
+        self,
+        *,
+        prompt: str,
+        images: List[Image.Image],
+        inference_mode: bool = True,
+        **kwargs,
+    ):
+        """
+
+        Args:
+            prompt (str): the formatted prompt;
+            images (List[ImageType]): the list of images;
+            inference_mode (bool): if True, then remove the last eos token;
+            **kwargs:
+
+        Returns:
+            outputs (BaseProcessorOutput): the output of the processor,
+                - input_ids (torch.LongTensor): [N + image tokens]
+                - images (torch.FloatTensor): [n_images, 3, H, W]
+                - image_id (int): the id of the image token
+                - num_image_tokens (List[int]): the number of image tokens
+        """
+
+        prepare = self.process_one(
+            prompt=prompt,
+            images=images,
+            inference_mode=inference_mode,
+        )
+
+        return prepare
+
+    def tokenize_with_images(
+        self,
+        conversation: str,
+        images: List[Image.Image],
+        bos: bool = True,
+        eos: bool = True,
+        cropping: bool = True,
+    ):
+        """Tokenize text with <image> tags."""
+        assert conversation.count(self.image_token) == len(images)
+        text_splits = conversation.split(self.image_token)
+        images_list, images_seq_mask, images_spatial_crop = [], [], []
+        num_image_tokens = []
+        tokenized_str = []
+        for text_sep, image in zip(text_splits, images):
+            """encode text_sep"""
+            tokenized_sep = self.encode(text_sep, bos=False, eos=False)
+            tokenized_str += tokenized_sep
+            images_seq_mask += [False] * len(tokenized_sep)
+
+            """select best resolution for anyres"""
+            if cropping:
+                best_width, best_height = self.select_best_resolution(image.size)
+            else:
+                best_width, best_height = self.image_size, self.image_size
+
+            """process the global view"""
+            global_view = ImageOps.pad(image, (self.image_size, self.image_size),
+                                       color=tuple(int(x * 255) for x in self.image_transform.mean))
+            images_list.append(self.image_transform(global_view))
+
+            """process the local views"""
+            local_view = ImageOps.pad(image, (best_width, best_height),
+                                      color=tuple(int(x * 255) for x in self.image_transform.mean))
+            for i in range(0, best_height, self.image_size):
+                for j in range(0, best_width, self.image_size):
+                    images_list.append(
+                        self.image_transform(local_view.crop((j, i, j + self.image_size, i + self.image_size))))
+
+            """record height / width crop num"""
+            num_width_tiles, num_height_tiles = best_width // self.image_size, best_height // self.image_size
+            images_spatial_crop.append([num_width_tiles, num_height_tiles])
+
+            """add image tokens"""
+            h = w = math.ceil((self.image_size // self.patch_size) / self.downsample_ratio)
+            # global views tokens h * (w + 1), 1 is for line separator
+            tokenized_image = [self.image_token_id] * h * (w + 1)
+            # add a separator between global and local views
+            tokenized_image += [self.image_token_id]
+            # local views tokens, (num_height_tiles * h) * (num_width_tiles * w + 1)
+            tokenized_image += [self.image_token_id] * (num_height_tiles * h) * (num_width_tiles * w + 1)
+
+            tokenized_str += tokenized_image
+            images_seq_mask += [True] * len(tokenized_image)
+            num_image_tokens.append(len(tokenized_image))
+
+        """process the last text split"""
+        tokenized_sep = self.encode(text_splits[-1], bos=False, eos=False)
+        tokenized_str += tokenized_sep
+        images_seq_mask += [False] * len(tokenized_sep)
+
+        """add the bos and eos tokens"""
+        if bos:
+            tokenized_str = [self.bos_id] + tokenized_str
+            images_seq_mask = [False] + images_seq_mask
+        if eos:
+            tokenized_str = tokenized_str + [self.eos_id]
+            images_seq_mask = images_seq_mask + [False]
+
+        assert len(tokenized_str) == len(
+            images_seq_mask), f"tokenize_with_images func: tokenized_str's length {len(tokenized_str)} is not equal to imags_seq_mask's length {len(images_seq_mask)}"
+
+        return tokenized_str, images_list, images_seq_mask, images_spatial_crop, num_image_tokens
+
+
+AutoProcessor.register("DeepseekVLV2Processor", DeepseekVLV2Processor)

From 6d0e3d372446cde48b387d4d3530e25fc6e06320 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Sat, 18 Jan 2025 14:35:15 +0800
Subject: [PATCH 266/462] [core] clean up executor class hierarchy between v1
 and v0 (#12171)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 vllm/executor/executor_base.py         |  10 -
 vllm/v1/executor/abstract.py           |  87 ++++---
 vllm/v1/executor/multiproc_executor.py |  50 +---
 vllm/v1/executor/ray_executor.py       | 344 -------------------------
 vllm/v1/executor/ray_utils.py          | 280 --------------------
 vllm/v1/executor/uniproc_executor.py   |  88 -------
 6 files changed, 61 insertions(+), 798 deletions(-)
 delete mode 100644 vllm/v1/executor/ray_executor.py
 delete mode 100644 vllm/v1/executor/ray_utils.py
 delete mode 100644 vllm/v1/executor/uniproc_executor.py

diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py
index d8457cb693cdb..e5952b388c543 100644
--- a/vllm/executor/executor_base.py
+++ b/vllm/executor/executor_base.py
@@ -79,16 +79,6 @@ def determine_num_available_blocks(self) -> Tuple[int, int]:
         b = min([r[1] for r in results])
         return a, b
 
-    def initialize(self, num_gpu_blocks: int) -> None:
-        """
-        Initialize the KV caches and begin the model execution loop of the
-        underlying workers.
-        For V1 compatibility.
-        """
-        logger.info("# GPU blocks: %d", num_gpu_blocks)
-        self.collective_rpc("initialize_cache", args=(num_gpu_blocks, ))
-        self.collective_rpc("compile_or_warm_up_model")
-
     def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None:
         """Initialize the KV cache by invoking the underlying worker.
         """
diff --git a/vllm/v1/executor/abstract.py b/vllm/v1/executor/abstract.py
index 5240778ebf330..131be759842c7 100644
--- a/vllm/v1/executor/abstract.py
+++ b/vllm/v1/executor/abstract.py
@@ -1,63 +1,92 @@
-from abc import ABC, abstractmethod
 from typing import Type
 
 from vllm.config import VllmConfig
+from vllm.executor.executor_base import ExecutorBase
+from vllm.executor.ray_distributed_executor import (  # noqa
+    RayDistributedExecutor as RayDistributedExecutorV0)
+from vllm.executor.uniproc_executor import (  # noqa
+    ExecutorWithExternalLauncher as ExecutorWithExternalLauncherV0)
+from vllm.executor.uniproc_executor import (  # noqa
+    UniProcExecutor as UniProcExecutorV0)
 from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec
 from vllm.v1.outputs import ModelRunnerOutput
 
 
-class Executor(ABC):
-    """Abstract class for executors."""
+class Executor(ExecutorBase):
+    """
+    Abstract class for v1 executors, mainly define some methods for v1.
+    For methods shared by v0 and v1, define them in ExecutorBase"""
 
     @staticmethod
     def get_class(vllm_config: VllmConfig) -> Type["Executor"]:
         executor_class: Type[Executor]
+        parallel_config = vllm_config.parallel_config
         distributed_executor_backend = (
-            vllm_config.parallel_config.distributed_executor_backend)
+            parallel_config.distributed_executor_backend)
+        if distributed_executor_backend is None:
+            # If the user does not specify the distributed executor backend,
+            # we will choose the backend based on the world size.
+            if parallel_config.world_size > 1:
+                distributed_executor_backend = "mp"
+            else:
+                distributed_executor_backend = "uni"
+
         if distributed_executor_backend == "ray":
-            from vllm.executor.ray_distributed_executor import (  # noqa
-                RayDistributedExecutor)
             executor_class = RayDistributedExecutor
         elif distributed_executor_backend == "mp":
             from vllm.v1.executor.multiproc_executor import MultiprocExecutor
             executor_class = MultiprocExecutor
+        elif distributed_executor_backend == "uni":
+            executor_class = UniProcExecutor
+        elif distributed_executor_backend == "external_launcher":
+            # TODO: make v1 scheduling deterministic
+            # to support external launcher
+            executor_class = ExecutorWithExternalLauncher
         else:
-            assert (distributed_executor_backend is None)
-            from vllm.v1.executor.uniproc_executor import UniprocExecutor
-            executor_class = UniprocExecutor
+            raise ValueError("Unknown distributed executor backend: "
+                             f"{distributed_executor_backend}")
         return executor_class
 
-    @abstractmethod
-    def __init__(self, vllm_config: VllmConfig) -> None:
-        raise NotImplementedError
-
-    @abstractmethod
     def initialize(self, kv_cache_config: KVCacheConfig) -> None:
-        raise NotImplementedError
+        """
+        Initialize the KV caches and begin the model execution loop of the
+        underlying workers.
+        """
+        self.collective_rpc("initialize_cache", args=(kv_cache_config, ))
+        self.collective_rpc("compile_or_warm_up_model")
 
-    @abstractmethod
     def determine_available_memory(self) -> int:  # in bytes
-        raise NotImplementedError
+        output = self.collective_rpc("determine_available_memory")
+        # Since we use a shared centralized controller, we take the minimum
+        # memory size across all workers to make sure all the memory
+        # operators can be applied to all workers.
+        return min(output)
 
-    @abstractmethod
     def get_kv_cache_spec(self) -> KVCacheSpec:
-        raise NotImplementedError
+        output = self.collective_rpc("get_kv_cache_spec")
+        for x in output:
+            assert x == output[0]
+        return output[0]
 
-    @abstractmethod
     def execute_model(
         self,
         scheduler_output,
     ) -> ModelRunnerOutput:
-        raise NotImplementedError
+        output = self.collective_rpc("execute_model",
+                                     args=(scheduler_output, ))
+        return output[0]
 
-    @abstractmethod
     def profile(self, is_start: bool = True):
-        raise NotImplementedError
+        self.collective_rpc("profile", args=(is_start, ))
+
+
+class UniProcExecutor(UniProcExecutorV0, Executor):
+    pass
+
+
+class ExecutorWithExternalLauncher(ExecutorWithExternalLauncherV0, Executor):
+    pass
 
-    @abstractmethod
-    def shutdown(self):
-        pass
 
-    @abstractmethod
-    def check_health(self) -> None:
-        raise NotImplementedError
+class RayDistributedExecutor(RayDistributedExecutorV0, Executor):
+    pass
diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py
index fd977d07e8d81..93026029ad13e 100644
--- a/vllm/v1/executor/multiproc_executor.py
+++ b/vllm/v1/executor/multiproc_executor.py
@@ -25,8 +25,6 @@
 from vllm.utils import (get_distributed_init_method, get_mp_context,
                         get_open_port, get_open_zmq_ipc_path, zmq_socket_ctx)
 from vllm.v1.executor.abstract import Executor
-from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec
-from vllm.v1.outputs import ModelRunnerOutput
 from vllm.worker.worker_base import WorkerWrapperBase
 
 logger = init_logger(__name__)
@@ -37,7 +35,7 @@
 
 class MultiprocExecutor(Executor):
 
-    def __init__(self, vllm_config: VllmConfig) -> None:
+    def _init_executor(self) -> None:
         # Call self.shutdown at exit to clean up
         # and ensure workers will be terminated.
         self._finalizer = weakref.finalize(self, self.shutdown)
@@ -55,9 +53,6 @@ def sigusr1_handler(signum, frame):
 
         signal.signal(signal.SIGUSR1, sigusr1_handler)
 
-        self.vllm_config = vllm_config
-        self.parallel_config = vllm_config.parallel_config
-
         self.world_size = self.parallel_config.world_size
         tensor_parallel_size = self.parallel_config.tensor_parallel_size
         assert self.world_size == tensor_parallel_size, (
@@ -82,7 +77,8 @@ def sigusr1_handler(signum, frame):
         # Create workers
         self.workers: List[WorkerProcHandle] = []
         for rank in range(self.world_size):
-            worker = WorkerProc.make_worker_process(vllm_config, rank, rank,
+            worker = WorkerProc.make_worker_process(self.vllm_config, rank,
+                                                    rank,
                                                     distributed_init_method,
                                                     scheduler_output_handle)
             self.workers.append(worker)
@@ -93,34 +89,6 @@ def sigusr1_handler(signum, frame):
         for w in self.workers:
             w.worker_response_mq.wait_until_ready()
 
-    def initialize(self, kv_cache_config: KVCacheConfig) -> None:
-        """
-        Initialize the KV caches and begin the model execution loop of the
-        underlying workers.
-        """
-        self.collective_rpc("initialize_cache", args=(kv_cache_config, ))
-        self.collective_rpc("compile_or_warm_up_model")
-
-    def determine_available_memory(self) -> int:
-        """
-        Determine the available memory (in bytes) for KV cache by invoking the
-        underlying worker.
-        """
-        memory_sizes = self.collective_rpc("determine_available_memory")
-
-        # Since we use a shared centralized controller, we take the minimum
-        # memory size across all workers to make sure all the memory
-        # operators can be applied to all workers.
-        return min(memory_sizes)
-
-    def get_kv_cache_spec(self) -> KVCacheSpec:
-        """
-        Get all kv cache needed by the model by invoking the underlying worker.
-        """
-        kv_cache_specs = self.collective_rpc("get_kv_cache_spec")
-        assert all(s == kv_cache_specs[0] for s in kv_cache_specs)
-        return kv_cache_specs[0]
-
     def collective_rpc(self,
                        method: Union[str, Callable],
                        timeout: Optional[float] = None,
@@ -172,18 +140,6 @@ def collective_rpc(self,
             # Re-raise any other exceptions
             raise e
 
-    def execute_model(
-        self,
-        scheduler_output,
-    ) -> ModelRunnerOutput:
-        model_output = self.collective_rpc("execute_model",
-                                           args=(scheduler_output, ))[0]
-        return model_output
-
-    def profile(self, is_start: bool = True):
-        self.collective_rpc("profile", args=(is_start, ))
-        return
-
     def _ensure_worker_termination(self):
         """Ensure that all worker processes are terminated. Assumes workers have
         received termination requests. Waits for processing, then sends
diff --git a/vllm/v1/executor/ray_executor.py b/vllm/v1/executor/ray_executor.py
deleted file mode 100644
index fd67fa2235770..0000000000000
--- a/vllm/v1/executor/ray_executor.py
+++ /dev/null
@@ -1,344 +0,0 @@
-import os
-from collections import defaultdict
-from itertools import islice, repeat
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
-
-import vllm.envs as envs
-from vllm.config import VllmConfig
-from vllm.logger import init_logger
-from vllm.utils import get_distributed_init_method, get_ip, get_open_port
-from vllm.v1.executor.abstract import Executor
-from vllm.v1.executor.ray_utils import (RayWorkerWrapper,
-                                        initialize_ray_cluster, ray)
-from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec
-from vllm.v1.outputs import ModelRunnerOutput
-
-if ray is not None:
-    from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
-
-if TYPE_CHECKING:
-    from ray.util.placement_group import PlacementGroup
-
-logger = init_logger(__name__)
-
-
-class RayExecutor(Executor):
-
-    def __init__(self, vllm_config: VllmConfig) -> None:
-        self.vllm_config = vllm_config
-        self.parallel_config = vllm_config.parallel_config
-        self.model_config = vllm_config.model_config
-        self.forward_dag: Optional[ray.dag.CompiledDAG] = None
-
-        # Disable Ray usage stats collection.
-        ray_usage = os.environ.get("RAY_USAGE_STATS_ENABLED", "0")
-        if ray_usage != "1":
-            os.environ["RAY_USAGE_STATS_ENABLED"] = "0"
-
-        initialize_ray_cluster(self.parallel_config)
-        placement_group = self.parallel_config.placement_group
-
-        # Create the parallel GPU workers.
-        self._init_workers_ray(placement_group)
-
-    def _init_workers_ray(self, placement_group: "PlacementGroup",
-                          **ray_remote_kwargs):
-        # A list of workers to run a model.
-        self.workers: List[RayWorkerWrapper] = []
-        if self.parallel_config.ray_workers_use_nsight:
-            ray_remote_kwargs = self._configure_ray_workers_use_nsight(
-                ray_remote_kwargs)
-
-        # Create the workers.
-        driver_ip = get_ip()
-        for bundle_id, bundle in enumerate(placement_group.bundle_specs):
-            if not bundle.get("GPU", 0):
-                # Skip bundles that don't have GPUs,
-                # as each worker needs one GPU.
-                continue
-            scheduling_strategy = PlacementGroupSchedulingStrategy(
-                placement_group=placement_group,
-                placement_group_capture_child_tasks=True,
-                placement_group_bundle_index=bundle_id,
-            )
-
-            worker = ray.remote(
-                num_cpus=0,
-                num_gpus=1,
-                scheduling_strategy=scheduling_strategy,
-                **ray_remote_kwargs,
-            )(RayWorkerWrapper).remote(vllm_config=self.vllm_config)
-            self.workers.append(worker)
-
-        logger.debug("workers: %s", self.workers)
-        worker_ips = [
-            ray.get(worker.get_node_ip.remote())  # type: ignore[attr-defined]
-            for worker in self.workers
-        ]
-        ip_counts: Dict[str, int] = {}
-        for ip in worker_ips:
-            ip_counts[ip] = ip_counts.get(ip, 0) + 1
-
-        worker_to_ip = dict(zip(self.workers, worker_ips))
-
-        def sort_by_driver_then_worker_ip(worker):
-            """
-            Sort the workers based on 3 properties:
-            1. If the worker is on the same node as the driver (vllm engine),
-                it should be placed first.
-            2. Then, if the worker is on a node with fewer workers, it should
-                be placed first.
-            3. Finally, if the work is on a node with smaller IP address, it
-                should be placed first. This is simply a tiebreaker to make
-                sure the workers are sorted in a deterministic way.
-            """
-            ip = worker_to_ip[worker]
-            return (ip != driver_ip, ip_counts[ip], ip)
-
-        # After sorting, the workers on the same node will be
-        # close to each other, and the workers on the driver
-        # node will be placed first.
-        self.workers = sorted(self.workers, key=sort_by_driver_then_worker_ip)
-
-        # Get the set of GPU IDs used on each node.
-        worker_node_and_gpu_ids = self._run_workers("get_node_and_gpu_ids")
-
-        node_workers = defaultdict(list)  # node id -> list of worker ranks
-        node_gpus = defaultdict(list)  # node id -> list of gpu ids
-
-        for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids):
-            node_workers[node_id].append(i)
-            # `gpu_ids` can be a list of strings or integers.
-            # convert them to integers for consistency.
-            # NOTE: gpu_ids can be larger than 9 (e.g. 16 GPUs),
-            # string sorting is not sufficient.
-            # see https://github.com/vllm-project/vllm/issues/5590
-            gpu_ids = [int(x) for x in gpu_ids]
-            node_gpus[node_id].extend(gpu_ids)
-
-        for node_id, gpu_ids in node_gpus.items():
-            node_gpus[node_id] = sorted(gpu_ids)
-
-        all_ips = set(worker_ips)
-        n_ips = len(all_ips)
-        n_nodes = len(node_workers)
-
-        if n_nodes != n_ips:
-            raise RuntimeError(
-                f"Every node should have a unique IP address. Got {n_nodes}"
-                f" nodes with node ids {list(node_workers.keys())} and "
-                f"{n_ips} unique IP addresses {all_ips}. Please check your"
-                " network configuration. If you set `VLLM_HOST_IP` or "
-                "`HOST_IP` environment variable, make sure it is unique for"
-                " each node.")
-
-        # Set environment variables for the driver and workers.
-        all_args_to_update_environment_variables = [({
-            "CUDA_VISIBLE_DEVICES":
-            ",".join(map(str, node_gpus[node_id])),
-            "VLLM_TRACE_FUNCTION":
-            str(envs.VLLM_TRACE_FUNCTION),
-            "VLLM_USE_V1":
-            str(int(envs.VLLM_USE_V1)),
-            **({
-                "VLLM_ATTENTION_BACKEND": envs.VLLM_ATTENTION_BACKEND
-            } if envs.VLLM_ATTENTION_BACKEND is not None else {})
-        }, ) for (node_id, _) in worker_node_and_gpu_ids]
-
-        self._env_vars_for_all_workers = (
-            all_args_to_update_environment_variables)
-
-        self._run_workers("update_environment_variables",
-                          all_args=self._get_env_vars_to_be_updated())
-
-        if len(node_gpus) == 1:
-            # in single node case, we don't need to get the IP address.
-            # the loopback address is sufficient
-            # NOTE: a node may have several IP addresses, one for each
-            # network interface. `get_ip()` might return any of them,
-            # while they might not work for communication inside the node
-            # if the network setup is complicated. Using the loopback address
-            # solves this issue, as it always works for communication inside
-            # the node.
-            driver_ip = "127.0.0.1"
-        distributed_init_method = get_distributed_init_method(
-            driver_ip, get_open_port())
-
-        # Initialize the actual workers inside worker wrapper.
-        init_worker_all_kwargs = [
-            self._get_worker_kwargs(
-                local_rank=node_workers[node_id].index(rank),
-                rank=rank,
-                distributed_init_method=distributed_init_method,
-            ) for rank, (node_id, _) in enumerate(worker_node_and_gpu_ids)
-        ]
-        self._run_workers("init_worker", all_kwargs=init_worker_all_kwargs)
-        self._run_workers("initialize")
-        self._run_workers("load_model")
-
-    def _configure_ray_workers_use_nsight(self,
-                                          ray_remote_kwargs) -> Dict[str, Any]:
-        # If nsight profiling is enabled, we need to set the profiling
-        # configuration for the ray workers as runtime env.
-        runtime_env = ray_remote_kwargs.setdefault("runtime_env", {})
-        runtime_env.update({
-            "nsight": {
-                "t": "cuda,cudnn,cublas",
-                "o": "'worker_process_%p'",
-                "cuda-graph-trace": "node",
-            }
-        })
-
-        return ray_remote_kwargs
-
-    def _get_env_vars_to_be_updated(self):
-        return self._env_vars_for_all_workers
-
-    def _get_worker_kwargs(
-            self,
-            local_rank: int = 0,
-            rank: int = 0,
-            distributed_init_method: Optional[str] = None) -> Dict[str, Any]:
-        """
-        Return worker init args for a given rank.
-        """
-        if distributed_init_method is None:
-            distributed_init_method = get_distributed_init_method(
-                get_ip(), get_open_port())
-        return dict(
-            vllm_config=self.vllm_config,
-            local_rank=local_rank,
-            rank=rank,
-            distributed_init_method=distributed_init_method,
-        )
-
-    def determine_available_memory(self) -> int:
-        """
-        Determine the available GPU memory in bytes.
-        
-        This invokes `determine_available_memory` on each worker and takes
-        the min of the results, guaranteeing that the selected cache sizes are
-        compatible with all workers.
-        """
-
-        memory_sizes = self._run_workers("determine_available_memory")
-
-        # Since we use a shared centralized controller, we take the minimum
-        # memory size across all workers to make sure all the memory
-        # operators can be applied to all workers.
-        return min(memory_sizes)
-
-    def initialize(self, kv_cache_config: KVCacheConfig) -> None:
-        """
-        Initialize the KV cache in all workers.
-        """
-        self._run_workers("initialize_cache", kv_cache_config)
-        self._run_workers("compile_or_warm_up_model")
-
-    def get_kv_cache_spec(self) -> KVCacheSpec:
-        """
-        Get all kv cache needed by the model
-        
-        This invokes `get_kv_cache_spec` on each worker and asserts that
-        they are identical. The KVCacheSpec is then returned.
-        """
-        kv_cache_specs = self._run_workers("get_kv_cache_spec")
-        assert all(s == kv_cache_specs[0] for s in kv_cache_specs)
-        return kv_cache_specs[0]
-
-    def _run_workers(
-        self,
-        method: str,
-        *args,
-        all_args: Optional[List[Tuple[Any, ...]]] = None,
-        all_kwargs: Optional[List[Dict[str, Any]]] = None,
-        **kwargs,
-    ) -> Any:
-        """
-        Runs the given method on all workers. Can be used in the following
-        ways:
-
-        Args:
-        - args/kwargs: All workers share the same args/kwargs
-        - all_args/all_kwargs: args/kwargs for each worker are specified
-          individually
-        """
-        count = len(self.workers)
-        all_worker_args = repeat(args, count) if all_args is None \
-            else islice(all_args, 0, None)
-        all_worker_kwargs = repeat(kwargs, count) if all_kwargs is None \
-            else islice(all_kwargs, 0, None)
-
-        ray_worker_refs = [
-            worker.execute_method.remote(  # type: ignore[attr-defined]
-                method, *worker_args, **worker_kwargs)
-            for (worker, worker_args, worker_kwargs
-                 ) in zip(self.workers, all_worker_args, all_worker_kwargs)
-        ]
-        return ray.get(ray_worker_refs)
-
-    def execute_model(
-        self,
-        scheduler_output,
-    ) -> ModelRunnerOutput:
-        if self.forward_dag is None:
-            self.forward_dag = self._compiled_ray_dag()
-        # Only the first worker (with rank 0) returns the execution result.
-        # Others return None.
-        output = ray.get(self.forward_dag.execute(scheduler_output))[0]
-        return output
-
-    def profile(self, is_start=True):
-        raise NotImplementedError
-
-    def shutdown(self):
-        if hasattr(self, "forward_dag") and self.forward_dag is not None:
-            self.forward_dag.teardown()
-            import ray
-            for worker in self.workers:
-                ray.kill(worker)
-            self.forward_dag = None
-
-    def check_health(self) -> None:
-        logger.debug("Called check_health.")
-
-    def _check_ray_compiled_graph_installation(self):
-        import pkg_resources
-        from packaging import version
-
-        required_version = version.parse("2.39")
-        current_version = version.parse(
-            pkg_resources.get_distribution("ray").version)
-        if current_version < required_version:
-            raise ValueError(f"Ray version {required_version} is "
-                             f"required, but found {current_version}")
-
-        import importlib.util
-        raycg = importlib.util.find_spec("ray.experimental.compiled_dag_ref")
-        if raycg is None:
-            raise ValueError("Ray Compiled Graph is not installed. "
-                             "Run `pip install ray[adag]` to install it.")
-
-        cupy_spec = importlib.util.find_spec("cupy")
-        if cupy_spec is None and envs.VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL:
-            raise ValueError(
-                "cupy is not installed but required since "
-                "VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL is set."
-                "Run `pip install ray[adag]` and check cupy installation.")
-
-    def _compiled_ray_dag(self):
-        assert self.parallel_config.use_ray
-        self._check_ray_compiled_graph_installation()
-        from ray.dag import InputNode, MultiOutputNode
-
-        with InputNode() as input_batches:
-            outputs = [
-                worker.execute_model.bind(  # type: ignore[attr-defined]
-                    input_batches) for worker in self.workers
-            ]
-            forward_dag = MultiOutputNode(outputs)
-
-        return forward_dag.experimental_compile()
-
-    def __del__(self):
-        self.shutdown()
diff --git a/vllm/v1/executor/ray_utils.py b/vllm/v1/executor/ray_utils.py
deleted file mode 100644
index fc9715b7a5909..0000000000000
--- a/vllm/v1/executor/ray_utils.py
+++ /dev/null
@@ -1,280 +0,0 @@
-import time
-from collections import defaultdict
-from typing import TYPE_CHECKING, Dict, List, Optional, Tuple
-
-from vllm.config import ParallelConfig
-from vllm.logger import init_logger
-from vllm.platforms import current_platform
-from vllm.utils import get_ip
-from vllm.v1.outputs import ModelRunnerOutput
-from vllm.worker.worker_base import WorkerWrapperBase
-
-if TYPE_CHECKING:
-    from vllm.v1.core.scheduler import SchedulerOutput
-
-logger = init_logger(__name__)
-PG_WAIT_TIMEOUT = 60
-
-try:
-    import ray
-    from ray.util import placement_group_table
-    from ray.util.placement_group import PlacementGroup
-    try:
-        from ray._private.state import available_resources_per_node
-    except ImportError:
-        # Ray 2.9.x doesn't expose `available_resources_per_node`
-        from ray._private.state import state as _state
-        available_resources_per_node = _state._available_resources_per_node
-
-    class RayWorkerWrapper(WorkerWrapperBase):
-
-        def __init__(self, *args, **kwargs) -> None:
-            super().__init__(*args, **kwargs)
-            # Since the compiled DAG runs a main execution
-            # in a different thread that calls cuda.set_device.
-            # The flag indicates is set_device is called on
-            # that thread. It will be removed soon.
-            self.compiled_dag_cuda_device_set = False
-
-        def get_node_ip(self) -> str:
-            return get_ip()
-
-        def get_node_and_gpu_ids(self) -> Tuple[str, List[int]]:
-            node_id = ray.get_runtime_context().get_node_id()
-            device_key = current_platform.ray_device_key
-            if not device_key:
-                raise RuntimeError("current platform %s does not support ray.",
-                                   current_platform.device_name)
-            gpu_ids = ray.get_runtime_context().get_accelerator_ids(
-            )[device_key]
-            return node_id, gpu_ids
-
-        def setup_device_if_necessary(self):
-            # TODO(swang): This is needed right now because Ray CG executes
-            # on a background thread, so we need to reset torch's current
-            # device.
-            # We can remove this API after it is fixed in compiled graph.
-            import torch
-            assert self.worker is not None, "Worker is not initialized"
-            if not self.compiled_dag_cuda_device_set:
-                torch.cuda.set_device(self.worker.device)
-                self.compiled_dag_cuda_device_set = True
-
-        def execute_model(
-            self,
-            scheduler_output: "SchedulerOutput",
-        ) -> ModelRunnerOutput:
-            self.setup_device_if_necessary()
-            assert self.worker is not None, "Worker is not initialized"
-            output = self.worker.model_runner.execute_model(scheduler_output)
-            return output
-
-    ray_import_err = None
-
-except ImportError as e:
-    ray = None  # type: ignore
-    ray_import_err = e
-    RayWorkerWrapper = None  # type: ignore
-
-
-def ray_is_available() -> bool:
-    """Returns True if Ray is available."""
-    return ray is not None
-
-
-def assert_ray_available():
-    """
-    Raise an exception if Ray is not available.
-    """
-    if ray is None:
-        raise ValueError("Failed to import Ray, please install Ray with "
-                         "`pip install ray`.") from ray_import_err
-
-
-def _verify_bundles(placement_group: "PlacementGroup",
-                    parallel_config: ParallelConfig, device_str: str):
-    """
-    Verify a given placement group has bundles located in the right place.
-
-    There are 2 rules.
-    - Warn if all tensor parallel workers cannot fit in a single node.
-    - Fail if driver node is not included in a placement group.
-
-    Args:
-        placement_group: The placement group to verify.
-        parallel_config: The parallel configuration.
-        device_str: The required device.
-    """
-    assert ray.is_initialized(), (
-        "Ray is not initialized although distributed-executor-backend is ray.")
-    pg_data = placement_group_table(placement_group)
-    # bundle_idx -> node_id
-    bundle_to_node_ids = pg_data["bundles_to_node_id"]
-    # bundle_idx -> bundle (e.g., {"GPU": 1})
-    bundles = pg_data["bundles"]
-    # node_id -> List of bundle (e.g., {"GPU": 1})
-    node_id_to_bundle: Dict[str, List[Dict[str, float]]] = defaultdict(list)
-
-    for bundle_idx, node_id in bundle_to_node_ids.items():
-        node_id_to_bundle[node_id].append(bundles[bundle_idx])
-    driver_node_id = ray.get_runtime_context().get_node_id()
-
-    if driver_node_id not in node_id_to_bundle:
-        raise RuntimeError(
-            f"driver node id {driver_node_id} is not included in a placement "
-            f"group {placement_group.id}. Node id -> bundles "
-            f"{node_id_to_bundle}. "
-            "You don't have enough GPUs available in a current node. Check "
-            "`ray status` to see if you have available GPUs in a node "
-            f"{driver_node_id} before starting an vLLM engine.")
-
-    for node_id, bundles in node_id_to_bundle.items():
-        if len(bundles) < parallel_config.tensor_parallel_size:
-            logger.warning(
-                "tensor_parallel_size=%d "
-                "is bigger than a reserved number of %ss (%d "
-                "%ss) in a node %s. Tensor parallel workers can be "
-                "spread out to 2+ nodes which can degrade the performance "
-                "unless you have fast interconnect across nodes, like "
-                "Infiniband. To resolve this issue, make sure you have more "
-                "than %d GPUs available at each node.",
-                parallel_config.tensor_parallel_size, device_str, len(bundles),
-                device_str, node_id, parallel_config.tensor_parallel_size)
-
-
-def _wait_until_pg_ready(current_placement_group: "PlacementGroup"):
-    """Wait until a placement group is ready.
-
-    It prints the informative log messages if the placement group is
-    not created within time.
-
-    """
-    # Wait until PG is ready - this will block until all
-    # requested resources are available, and will timeout
-    # if they cannot be provisioned.
-    placement_group_specs = current_placement_group.bundle_specs
-
-    s = time.time()
-    pg_ready_ref = current_placement_group.ready()
-    wait_interval = 10
-    while time.time() - s < PG_WAIT_TIMEOUT:
-        ready, _ = ray.wait([pg_ready_ref], timeout=wait_interval)
-        if len(ready) > 0:
-            break
-
-        # Exponential backoff for warning print.
-        wait_interval *= 2
-        logger.info(
-            "Waiting for creating a placement group of specs for "
-            "%d seconds. specs=%s. Check "
-            "`ray status` to see if you have enough resources.",
-            int(time.time() - s), placement_group_specs)
-
-    try:
-        ray.get(pg_ready_ref, timeout=0)
-    except ray.exceptions.GetTimeoutError:
-        raise ValueError(
-            "Cannot provide a placement group of "
-            f"{placement_group_specs=} within {PG_WAIT_TIMEOUT} seconds. See "
-            "`ray status` to make sure the cluster has enough resources."
-        ) from None
-
-
-def initialize_ray_cluster(
-    parallel_config: ParallelConfig,
-    ray_address: Optional[str] = None,
-):
-    """Initialize the distributed cluster with Ray.
-
-    it will connect to the Ray cluster and create a placement group
-    for the workers, which includes the specification of the resources
-    for each distributed worker.
-
-    Args:
-        parallel_config: The configurations for parallel execution.
-        ray_address: The address of the Ray cluster. If None, uses
-            the default Ray cluster address.
-    """
-    assert_ray_available()
-
-    # Connect to a ray cluster.
-    if current_platform.is_rocm() or current_platform.is_xpu():
-        # Try to connect existing ray instance and create a new one if not found
-        try:
-            ray.init("auto")
-        except ConnectionError:
-            logger.warning(
-                "No existing RAY instance detected. "
-                "A new instance will be launched with current node resources.")
-            ray.init(address=ray_address,
-                     ignore_reinit_error=True,
-                     num_gpus=parallel_config.world_size)
-    else:
-        ray.init(address=ray_address, ignore_reinit_error=True)
-
-    if parallel_config.placement_group:
-        # Placement group is already set.
-        return
-
-    device_str = current_platform.ray_device_key
-    if not device_str:
-        raise ValueError(
-            f"current platform {current_platform.device_name} does not "
-            "support ray.")
-    # Create placement group for worker processes
-    current_placement_group = ray.util.get_current_placement_group()
-    if current_placement_group:
-        # We are in a placement group
-        bundles = current_placement_group.bundle_specs
-        # Verify that we can use the placement group.
-        device_bundles = 0
-        for bundle in bundles:
-            bundle_devices = bundle.get(device_str, 0)
-            if bundle_devices > 1:
-                raise ValueError(
-                    "Placement group bundle cannot have more than 1 "
-                    f"{device_str}.")
-            if bundle_devices:
-                device_bundles += 1
-        if parallel_config.world_size > device_bundles:
-            raise ValueError(
-                f"The number of required {device_str}s exceeds the total "
-                f"number of available {device_str}s in the placement group."
-                f"Required number of devices: {parallel_config.world_size}. "
-                f"Total number of devices: {device_bundles}.")
-    else:
-        num_devices_in_cluster = ray.cluster_resources().get(device_str, 0)
-        if parallel_config.world_size > num_devices_in_cluster:
-            raise ValueError(
-                f"The number of required {device_str}s exceeds the total "
-                f"number of available {device_str}s in the placement group.")
-        # Create a new placement group
-        placement_group_specs: List[Dict[str, float]] = ([{
-            device_str: 1.0
-        } for _ in range(parallel_config.world_size)])
-
-        # vLLM engine is also a worker to execute model with an accelerator,
-        # so it requires to have the device in a current node. Check if
-        # the current node has at least one device.
-        current_ip = get_ip()
-        current_node_id = ray.get_runtime_context().get_node_id()
-        current_node_resource = available_resources_per_node()[current_node_id]
-        if current_node_resource.get(device_str, 0) < 1:
-            raise ValueError(
-                f"Current node has no {device_str} available. "
-                f"{current_node_resource=}. vLLM engine cannot start without "
-                f"{device_str}. Make sure you have at least 1 {device_str} "
-                f"available in a node {current_node_id=} {current_ip=}.")
-        # This way, at least bundle is required to be created in a current
-        # node.
-        placement_group_specs[0][f"node:{current_ip}"] = 0.001
-
-        # By default, Ray packs resources as much as possible.
-        current_placement_group = ray.util.placement_group(
-            placement_group_specs, strategy="PACK")
-        _wait_until_pg_ready(current_placement_group)
-
-    assert current_placement_group is not None
-    _verify_bundles(current_placement_group, parallel_config, device_str)
-    # Set the placement group in the parallel config
-    parallel_config.placement_group = current_placement_group
diff --git a/vllm/v1/executor/uniproc_executor.py b/vllm/v1/executor/uniproc_executor.py
deleted file mode 100644
index b3997caac726b..0000000000000
--- a/vllm/v1/executor/uniproc_executor.py
+++ /dev/null
@@ -1,88 +0,0 @@
-import os
-from typing import Optional
-
-from vllm.config import VllmConfig
-from vllm.logger import init_logger
-from vllm.utils import get_distributed_init_method, get_ip, get_open_port
-from vllm.v1.executor.abstract import Executor
-from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec
-from vllm.v1.outputs import ModelRunnerOutput
-from vllm.v1.worker.gpu_worker import Worker
-
-logger = init_logger(__name__)
-
-
-class UniprocExecutor(Executor):
-
-    def __init__(self, vllm_config: VllmConfig) -> None:
-        self.vllm_config = vllm_config
-        self.model_config = vllm_config.model_config
-        self.cache_config = vllm_config.cache_config
-        self.lora_config = vllm_config.lora_config
-        self.load_config = vllm_config.load_config
-        self.parallel_config = vllm_config.parallel_config
-        self.scheduler_config = vllm_config.scheduler_config
-        self.device_config = vllm_config.device_config
-        self.speculative_config = vllm_config.speculative_config
-        self.prompt_adapter_config = vllm_config.prompt_adapter_config
-        self.observability_config = vllm_config.observability_config
-
-        self.worker: Worker = self._create_worker()
-        self.worker.init_device()
-        self.worker.load_model()
-
-    def _create_worker(
-            self,
-            local_rank: int = 0,
-            rank: int = 0,
-            distributed_init_method: Optional[str] = None) -> Worker:
-        """Return worker init args for a given rank."""
-        # see https://github.com/NVIDIA/nccl/issues/1234
-        os.environ['NCCL_CUMEM_ENABLE'] = '0'
-
-        if distributed_init_method is None:
-            distributed_init_method = get_distributed_init_method(
-                get_ip(), get_open_port())
-        return Worker(
-            vllm_config=self.vllm_config,
-            local_rank=local_rank,
-            rank=rank,
-            distributed_init_method=distributed_init_method,
-        )
-
-    def determine_available_memory(self) -> int:
-        """Determine the available memory (in bytes) for KV cache by invoking 
-        the underlying worker.
-        """
-        return self.worker.determine_available_memory()
-
-    def get_kv_cache_spec(self) -> KVCacheSpec:
-        """Get all kv cache needed by the model by invoking the underlying
-        worker.
-        """
-        return self.worker.get_kv_cache_spec()
-
-    def initialize(self, kv_cache_config: KVCacheConfig) -> None:
-        """Initialize the KV cache by invoking the underlying worker.
-        """
-        self.worker.initialize_cache(kv_cache_config)
-        self.worker.compile_or_warm_up_model()
-
-    def execute_model(
-        self,
-        scheduler_output,
-    ) -> ModelRunnerOutput:
-        output = self.worker.execute_model(scheduler_output)
-        assert output is not None
-        return output
-
-    def profile(self, is_start: bool = True):
-        self.worker.profile(is_start)
-
-    def shutdown(self):
-        pass
-
-    def check_health(self) -> None:
-        # UniprocExecutor will always be healthy as long as
-        # it's running.
-        return

From 32eb0da808ea162a2d6758ff0bd9bdd0934b5fd5 Mon Sep 17 00:00:00 2001
From: yancong <32220263+ice-tong@users.noreply.github.com>
Date: Sun, 19 Jan 2025 08:13:16 +0800
Subject: [PATCH 267/462] [Misc] Support register quantization method
 out-of-tree (#11969)

---
 .../test_register_quantization_config.py      | 117 ++++++++++++++++++
 .../layers/quantization/__init__.py           |  41 ++++++
 2 files changed, 158 insertions(+)
 create mode 100644 tests/quantization/test_register_quantization_config.py

diff --git a/tests/quantization/test_register_quantization_config.py b/tests/quantization/test_register_quantization_config.py
new file mode 100644
index 0000000000000..8e7f44a399ddf
--- /dev/null
+++ b/tests/quantization/test_register_quantization_config.py
@@ -0,0 +1,117 @@
+"""Tests register custom quantization config.
+
+See https://github.com/vllm-project/vllm/issues/11926 for more details.
+
+Run `pytest tests/quantization/test_register_quantization_config.py`.
+"""
+from typing import Any, Dict, List, Optional
+
+import pytest
+import torch
+import torch.nn.functional as F
+
+from vllm.model_executor.layers.linear import LinearBase  # noqa: E501
+from vllm.model_executor.layers.linear import UnquantizedLinearMethod
+from vllm.model_executor.layers.quantization import (
+    get_quantization_config, register_quantization_config)
+from vllm.model_executor.layers.quantization.base_config import (  # noqa: E501
+    QuantizationConfig)
+
+
+class FakeQuantLinearMethod(UnquantizedLinearMethod):
+    """Fake quantization linear method for per-token dynamic quantization."""
+
+    def __init__(self, num_bits: int = 8) -> None:
+        """Initialize the quantization method."""
+        super().__init__()
+        self.num_bits = num_bits
+
+    def apply(self,
+              layer: "torch.nn.Module",
+              x: "torch.Tensor",
+              bias: Optional["torch.Tensor"] = None) -> "torch.Tensor":
+        """Perform fake quantization before the linear layer."""
+
+        # Calculate the scales dynamically
+        max_val = torch.amax(x, dim=(0, -1), keepdims=True)
+        min_val = torch.amin(x, dim=(0, -1), keepdims=True)
+        scales = (max_val - min_val) / (2**self.num_bits - 1)
+
+        # Fake quantize the input
+        quant_x = torch.clamp(torch.round(x / scales), -2**(self.num_bits - 1),
+                              2**(self.num_bits - 1) - 1)
+        dequant_x = quant_x * scales
+
+        return F.linear(dequant_x, layer.weight, bias)
+
+
+@register_quantization_config("custom_quant")
+class CustomQuantConfig(QuantizationConfig):
+    """Custom quantization config for per-token dynamic fake quantization."""
+
+    def __init__(self, num_bits: int = 8) -> None:
+        """Initialize the quantization config."""
+        self.num_bits = num_bits
+
+    def get_name(self) -> str:
+        """Name of the quantization method."""
+        return "custom_quant"
+
+    def get_supported_act_dtypes(self) -> List["torch.dtype"]:
+        """List of supported activation dtypes."""
+        return [torch.float16, torch.bfloat16]
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        """Minimum GPU capability to support the quantization method."""
+        return -1
+
+    @staticmethod
+    def get_config_filenames() -> List[str]:
+        """List of filenames to search for in the model directory."""
+        return []
+
+    @classmethod
+    def from_config(cls, config: Dict[str, Any]) -> "CustomQuantConfig":
+        """Create a config class from the model's quantization config."""
+        return CustomQuantConfig(num_bits=config.get("num_bits", 8))
+
+    def get_quant_method(self, layer: "torch.nn.Module",
+                         prefix: str) -> Optional["FakeQuantLinearMethod"]:
+        """Get the quantize method to use for the quantized layer."""
+        if isinstance(layer, LinearBase):
+            return FakeQuantLinearMethod(num_bits=self.num_bits)
+        return None
+
+
+def test_register_quantization_config():
+    """Test register custom quantization config."""
+
+    # The quantization method `custom_quant` should be registered.
+    assert get_quantization_config("custom_quant") == CustomQuantConfig
+
+    # The quantization method `custom_quant` is already exists,
+    # should raise an error.
+    with pytest.raises(ValueError):
+        register_quantization_config("custom_quant")(CustomQuantConfig)
+
+
+@pytest.mark.parametrize(argnames="model",
+                         argvalues=[
+                             "meta-llama/Meta-Llama-3-8B-Instruct",
+                         ])
+def test_custom_quant(vllm_runner, model):
+    """Test infer with the custom quantization method."""
+    with vllm_runner(model_name=model,
+                     quantization="custom_quant",
+                     enforce_eager=True) as llm:
+
+        model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
+        layer = model.model.layers[0]
+        qkv_proj = layer.self_attn.qkv_proj
+
+        # Check the quantization method is FakeQuantLinearMethod
+        assert isinstance(qkv_proj.quant_method, FakeQuantLinearMethod)
+
+        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        assert output
diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py
index caeb8b95e02f2..d2bde13fcf546 100644
--- a/vllm/model_executor/layers/quantization/__init__.py
+++ b/vllm/model_executor/layers/quantization/__init__.py
@@ -29,6 +29,45 @@
     "quark"
 ]
 
+# The customized quantization methods which will be added to this dict.
+_CUSTOMIZED_METHOD_TO_QUANT_CONFIG = {}
+
+
+def register_quantization_config(quantization: str):
+    """Register a customized vllm quantization config.
+
+    When a quantization method is not supported by vllm, you can register a customized
+    quantization config to support it.
+
+    Args:
+        quantization (str): The quantization method name.
+
+    Examples:
+        >>> from vllm.model_executor.layers.quantization import register_quantization_config
+        >>> from vllm.model_executor.layers.quantization import get_quantization_config
+        >>> from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
+        >>>
+        >>> @register_quantization_config("my_quant")
+        ... class MyQuantConfig(QuantizationConfig):
+        ...     pass
+        >>>
+        >>> get_quantization_config("my_quant")
+        <class 'MyQuantConfig'>
+    """  # noqa: E501
+
+    def _wrapper(quant_config_cls):
+        if quantization in QUANTIZATION_METHODS:
+            raise ValueError(
+                f"The quantization method `{quantization}` is already exists.")
+        if not issubclass(quant_config_cls, QuantizationConfig):
+            raise ValueError("The quantization config must be a subclass of "
+                             "`QuantizationConfig`.")
+        _CUSTOMIZED_METHOD_TO_QUANT_CONFIG[quantization] = quant_config_cls
+        QUANTIZATION_METHODS.append(quantization)
+        return quant_config_cls
+
+    return _wrapper
+
 
 def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
     if quantization not in QUANTIZATION_METHODS:
@@ -84,6 +123,8 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
         "ipex": IPEXConfig,
         "quark": QuarkConfig
     }
+    # Update the `method_to_config` with customized quantization methods.
+    method_to_config.update(_CUSTOMIZED_METHOD_TO_QUANT_CONFIG)
 
     return method_to_config[quantization]
 

From 7a8a48d51e51554645b233f870b71ef43bc70177 Mon Sep 17 00:00:00 2001
From: Simon Mo <simon.mo@hey.com>
Date: Sat, 18 Jan 2025 19:07:15 -0800
Subject: [PATCH 268/462] [V1] Collect env var for usage stats (#12115)

---
 vllm/usage/usage_lib.py | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/vllm/usage/usage_lib.py b/vllm/usage/usage_lib.py
index a9deee881f41a..841df3994fba2 100644
--- a/vllm/usage/usage_lib.py
+++ b/vllm/usage/usage_lib.py
@@ -27,6 +27,17 @@
 
 _GLOBAL_RUNTIME_DATA: Dict[str, Union[str, int, bool]] = {}
 
+_USAGE_ENV_VARS_TO_COLLECT = [
+    "VLLM_USE_MODELSCOPE",
+    "VLLM_USE_TRITON_FLASH_ATTN",
+    "VLLM_ATTENTION_BACKEND",
+    "VLLM_USE_FLASHINFER_SAMPLER",
+    "VLLM_PP_LAYER_PARTITION",
+    "VLLM_USE_TRITON_AWQ",
+    "VLLM_USE_V1",
+    "VLLM_ENABLE_V1_MULTIPROCESSING",
+]
+
 
 def set_runtime_usage_data(key: str, value: Union[str, int, bool]) -> None:
     """Set global usage data that will be sent with every usage heartbeat."""
@@ -122,6 +133,7 @@ def __init__(self) -> None:
         self.gpu_count: Optional[int] = None
         self.gpu_type: Optional[str] = None
         self.gpu_memory_per_device: Optional[int] = None
+        self.env_var_json: Optional[str] = None
 
         # vLLM Information
         self.model_architecture: Optional[str] = None
@@ -176,6 +188,12 @@ def _report_usage_once(self, model_architecture: str,
         self.vllm_version = VLLM_VERSION
         self.model_architecture = model_architecture
 
+        # Environment variables
+        self.env_var_json = json.dumps({
+            env_var: getattr(envs, env_var)
+            for env_var in _USAGE_ENV_VARS_TO_COLLECT
+        })
+
         # Metadata
         self.log_time = _get_current_timestamp_ns()
         self.source = envs.VLLM_USAGE_SOURCE

From 4e94951bb16282c36de6d12ef14a1500f25a3bdf Mon Sep 17 00:00:00 2001
From: Michal Adamczyk <madamczyk@habana.ai>
Date: Sun, 19 Jan 2025 04:12:05 +0100
Subject: [PATCH 269/462] [BUGFIX] Move scores to float32 in case of running
 xgrammar on cpu (#12152)

Signed-off-by: Michal Adamczyk <madamczyk@habana.ai>
---
 vllm/model_executor/guided_decoding/xgrammar_decoding.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/guided_decoding/xgrammar_decoding.py b/vllm/model_executor/guided_decoding/xgrammar_decoding.py
index f10a8fb8e03cf..2d8594cb8aafa 100644
--- a/vllm/model_executor/guided_decoding/xgrammar_decoding.py
+++ b/vllm/model_executor/guided_decoding/xgrammar_decoding.py
@@ -298,8 +298,11 @@ def __call__(self, input_ids: list[int],
         # token_bitmask is a CPU tensor for use with accept_token and
         # fill_next_token_bitmask so we move it to the device of scores
         device_type = scores.device.type
+        dtype = scores.dtype
         if device_type != "cuda":
-            scores = scores.to("cpu").unsqueeze(0)
+            # xgrammar on cpu only supports float32 scores
+            # see: https://github.com/mlc-ai/xgrammar/blob/c1b64920cad24f44f235778c1c00bb52d57da01a/python/xgrammar/kernels/apply_token_bitmask_inplace_cpu.py#L22
+            scores = scores.to("cpu").float().unsqueeze(0)
 
         # Note: In this method, if the tensors have different dimensions
         # on CPU device fails, but on GPU it runs without error. Hence the
@@ -307,7 +310,7 @@ def __call__(self, input_ids: list[int],
         xgr.apply_token_bitmask_inplace(scores,
                                         self.token_bitmask.to(scores.device))
         if device_type != "cuda":
-            scores = scores.to(device_type).squeeze()
+            scores = scores.to(dtype).to(device_type).squeeze()
 
         return scores
 

From 630eb5b5ce6ea59b6480440b7f6064be5ca71ae1 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Sun, 19 Jan 2025 11:16:34 +0800
Subject: [PATCH 270/462] [Bugfix] Fix multi-modal processors for transformers
 4.48 (#12187)

---
 vllm/model_executor/models/llava.py         |  25 ++++-
 vllm/model_executor/models/qwen2_audio.py   |  72 ++++++++----
 vllm/model_executor/models/ultravox.py      |   9 +-
 vllm/transformers_utils/config.py           |   9 +-
 vllm/transformers_utils/configs/__init__.py |   2 +
 vllm/transformers_utils/configs/aria.py     | 118 ++++++++++++++++++++
 6 files changed, 199 insertions(+), 36 deletions(-)

diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index 722fff98d5c19..6cceded43a79d 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -5,9 +5,11 @@
 
 import torch
 import torch.nn as nn
+from packaging.version import Version
 from transformers import (BatchFeature, CLIPVisionConfig, LlavaConfig,
                           PixtralVisionConfig, PretrainedConfig,
                           SiglipVisionConfig)
+from transformers import __version__ as TRANSFORMERS_VERSION
 from transformers.models.llava import LlavaProcessor
 from transformers.models.pixtral import PixtralProcessor
 
@@ -716,6 +718,27 @@ def load_weights(self, weights: Iterable[Tuple[str,
         return loader.load_weights(weights)
 
 
+class MantisProcessingInfo(LlavaProcessingInfo):
+
+    def get_hf_processor(self):
+        hf_config = self.get_hf_config()
+        vision_info = self.get_vision_encoder_info()
+
+        if Version(TRANSFORMERS_VERSION) < Version("4.48"):
+            # BUG: num_additional_image_tokens = 0 but treated as 1,
+            # so we set vision_feature_select_strategy to None to offset this
+            vision_feature_select_strategy = None
+        else:
+            # FIXED: https://github.com/huggingface/transformers/pull/33424/files#diff-6a37acc21efcadaae622b079b2712a131131448ff64262bd219aa346aeec38faL150
+            vision_feature_select_strategy = hf_config.vision_feature_select_strategy  # noqa: E501
+
+        return self.ctx.get_hf_processor(
+            LlavaProcessor,
+            patch_size=vision_info.get_patch_size(),
+            vision_feature_select_strategy=vision_feature_select_strategy,
+        )
+
+
 class MantisMultiModalProcessor(LlavaMultiModalProcessor):
 
     def apply(
@@ -794,7 +817,7 @@ def get_replacement_mantis(item_idx: int):
 # To use this model, please use
 # `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'`
 @MULTIMODAL_REGISTRY.register_processor(MantisMultiModalProcessor,
-                                        info=LlavaProcessingInfo,
+                                        info=MantisProcessingInfo,
                                         dummy_inputs=LlavaDummyInputsBuilder)
 class MantisForConditionalGeneration(LlavaForConditionalGeneration):
     pass
diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py
index 0dff9595c6c08..47d56175261e4 100644
--- a/vllm/model_executor/models/qwen2_audio.py
+++ b/vllm/model_executor/models/qwen2_audio.py
@@ -36,8 +36,9 @@
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
-                                    NestedTensors)
+from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
+                                    MultiModalInputsV2, MultiModalKwargs,
+                                    NestedTensors, PlaceholderRange)
 from vllm.multimodal.parse import (AudioProcessorItems, MultiModalDataItems,
                                    MultiModalDataParser)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
@@ -153,29 +154,24 @@ def _call_hf_processor(
         mm_data: Mapping[str, object],
         mm_kwargs: Mapping[str, Any],
     ) -> BatchFeature:
-        mm_data = dict(mm_data)
-        audios = mm_data.pop("audios", [])
-
-        if audios:
-            mm_data["audios"] = audios
-
-            feature_extractor = self.info.get_feature_extractor(**mm_kwargs)
-            mm_kwargs = dict(
-                **mm_kwargs,
-                sampling_rate=feature_extractor.sampling_rate,
-            )
-        else:
-            # NOTE: WhisperFeatureExtractor cannot handle empty list of audios
-            pass
+        # Text-only input not supported in composite processor
+        if not mm_data or not mm_data.get("audios", []):
+            prompt_ids = self.info.get_tokenizer().encode(prompt)
+            prompt_ids = self._apply_hf_processor_tokens_only(prompt_ids)
+            return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt")
+
+        feature_extractor = self.info.get_feature_extractor(**mm_kwargs)
+        mm_kwargs = dict(
+            **mm_kwargs,
+            sampling_rate=feature_extractor.sampling_rate,
+        )
 
-        processed_outputs = super()._call_hf_processor(
+        return super()._call_hf_processor(
             prompt=prompt,
             mm_data=mm_data,
             mm_kwargs=mm_kwargs,
         )
 
-        return processed_outputs
-
     def _get_mm_fields_config(
         self,
         hf_inputs: BatchFeature,
@@ -192,8 +188,14 @@ def _get_prompt_replacements(
         hf_processor_mm_kwargs: Mapping[str, object],
         out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
-        hf_config = self.info.get_hf_config()
-        placeholder = hf_config.audio_token_index
+        processor = self.info.get_hf_processor()
+
+        # Use getattr with default to be compatible with transformers<4.48
+        audio_token = getattr(processor, "audio_token", "<|AUDIO|>")
+        audio_bos_token = getattr(processor, "audio_bos_token",
+                                  "<|audio_bos|>")
+        audio_eos_token = getattr(processor, "audio_eos_token",
+                                  "<|audio_eos|>")
 
         feature_attention_mask = out_mm_kwargs.get("feature_attention_mask")
         if feature_attention_mask is None:
@@ -214,12 +216,16 @@ def get_replacement_qwen2_audio(item_idx: int):
                     f"The audio {audio} (len={len(audio)}) is too short "
                     "to be represented inside the model")
 
-            return [placeholder] * num_placeholders
+            return "".join([
+                audio_bos_token,
+                audio_token * num_placeholders,
+                audio_eos_token,
+            ])
 
         return [
             PromptReplacement(
                 modality="audio",
-                target=[placeholder],
+                target=audio_token,
                 replacement=get_replacement_qwen2_audio,
             )
         ]
@@ -234,6 +240,26 @@ def _always_apply_prompt_replacements(self) -> bool:
         # tokens than the number of audio items)
         return not hasattr(self.info.get_hf_processor(), "audio_token")
 
+    def apply(
+        self,
+        prompt: Union[str, list[int]],
+        mm_data: MultiModalDataDict,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> MultiModalInputsV2:
+        result = super().apply(prompt, mm_data, hf_processor_mm_kwargs)
+
+        # Only <|AUDIO|> tokens should be considered as placeholders,
+        # so we ignore the audio_bos_token and audio_eos_token
+        result["mm_placeholders"] = {
+            modality: [
+                PlaceholderRange(offset=p["offset"] + 1,
+                                 length=p["length"] - 2) for p in ps
+            ]
+            for modality, ps in result["mm_placeholders"].items()
+        }
+
+        return result
+
 
 @MULTIMODAL_REGISTRY.register_processor(
     Qwen2AudioMultiModalProcessor,
diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py
index 587f18ccaf98f..9301422383696 100644
--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -137,7 +137,7 @@ def _call_hf_processor(
         mm_kwargs: Mapping[str, object],
     ) -> BatchFeature:
         # Text-only input not supported in composite processor
-        if not mm_data:
+        if not mm_data or not mm_data.get("audios", []):
             prompt_ids = self.info.get_tokenizer().encode(prompt)
             prompt_ids = self._apply_hf_processor_tokens_only(prompt_ids)
             return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt")
@@ -146,13 +146,6 @@ def _call_hf_processor(
         audios = mm_data.pop("audios", [])
         assert isinstance(audios, list)
 
-        if not audios:
-            return super()._call_hf_processor(
-                prompt=prompt,
-                mm_data=mm_data,
-                mm_kwargs=mm_kwargs,
-            )
-
         feature_extractor = self.info.get_feature_extractor()
         mm_kwargs = dict(
             **mm_kwargs,
diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index c97acffa1a719..f57dfded0a62f 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -22,10 +22,10 @@
 from vllm.logger import init_logger
 # yapf conflicts with isort for this block
 # yapf: disable
-from vllm.transformers_utils.configs import (ChatGLMConfig, Cohere2Config,
-                                             DbrxConfig, DeepseekVLV2Config,
-                                             EAGLEConfig, ExaoneConfig,
-                                             H2OVLChatConfig,
+from vllm.transformers_utils.configs import (AriaConfig, ChatGLMConfig,
+                                             Cohere2Config, DbrxConfig,
+                                             DeepseekVLV2Config, EAGLEConfig,
+                                             ExaoneConfig, H2OVLChatConfig,
                                              InternVLChatConfig, JAISConfig,
                                              MedusaConfig, MllamaConfig,
                                              MLPSpeculatorConfig, MPTConfig,
@@ -52,6 +52,7 @@
 }
 
 _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {
+    "aria": AriaConfig,
     "chatglm": ChatGLMConfig,
     "cohere2": Cohere2Config,
     "dbrx": DbrxConfig,
diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py
index f065c56124605..807ef4fbfd0c0 100644
--- a/vllm/transformers_utils/configs/__init__.py
+++ b/vllm/transformers_utils/configs/__init__.py
@@ -1,3 +1,4 @@
+from vllm.transformers_utils.configs.aria import AriaConfig
 from vllm.transformers_utils.configs.chatglm import ChatGLMConfig
 from vllm.transformers_utils.configs.cohere2 import Cohere2Config
 from vllm.transformers_utils.configs.dbrx import DbrxConfig
@@ -23,6 +24,7 @@
 from vllm.transformers_utils.configs.ultravox import UltravoxConfig
 
 __all__ = [
+    "AriaConfig",
     "ChatGLMConfig",
     "Cohere2Config",
     "DbrxConfig",
diff --git a/vllm/transformers_utils/configs/aria.py b/vllm/transformers_utils/configs/aria.py
index d253da0d96a34..f4b531225b5d0 100644
--- a/vllm/transformers_utils/configs/aria.py
+++ b/vllm/transformers_utils/configs/aria.py
@@ -1,7 +1,32 @@
+# Copyright 2024 Rhymes AI. All rights reserved.
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+from typing import Mapping
+
+from transformers import PretrainedConfig
 from transformers.models.idefics2.configuration_idefics2 import (
     Idefics2VisionConfig)
 from transformers.models.llama.configuration_llama import LlamaConfig
 
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
 
 class AriaVisionConfig(Idefics2VisionConfig):
     model_type = "aria_vision_model"
@@ -45,3 +70,96 @@ def __init__(
         self.moe_num_experts = moe_num_experts
         self.moe_topk = moe_topk
         self.moe_num_shared_experts = moe_num_shared_experts
+
+
+class AriaConfig(PretrainedConfig):
+    """
+    Configuration class for Aria model.
+    This class handles the configuration for both vision and text components of
+    the Aria model,
+    as well as additional parameters for image token handling and projector
+    mapping.
+
+    Args:
+        vision_config (AriaVisionConfig or dict): Configuration for the vision
+            component.
+        text_config (AriaMoELMConfig or dict): Configuration for the text
+            component.
+        projector_patch_to_query_dict (dict): Mapping of patch sizes to query
+            dimensions.
+        ignore_index (int): Index to ignore in loss calculation.
+        image_token_index (int): Index used to represent image tokens.
+        **kwargs: Additional keyword arguments passed to the parent class.
+    Attributes:
+        model_type (str): Type of the model, set to "aria".
+        is_composition (bool): Whether the model is a composition of multiple
+            components.
+        ignore_index (int): Index to ignore in loss calculation.
+        image_token_index (int): Index used to represent image tokens.
+        projector_patch_to_query_dict (dict): Mapping of patch sizes to query
+            dimensions.
+        vision_config (AriaVisionConfig): Configuration for the vision
+            component.
+        text_config (AriaMoELMConfig): Configuration for the text component.
+    """
+
+    model_type = "aria"
+    is_composition = False
+
+    def __init__(
+        self,
+        vision_config: AriaVisionConfig = AriaVisionConfig(),  # noqa: B008
+        text_config: AriaMoELMConfig = AriaMoELMConfig(),  # noqa: B008
+        projector_patch_to_query_dict: Mapping[int, int] = {
+            1225: 128,
+            4900: 256,
+        },
+        ignore_index=-100,
+        image_token_index=32000,
+        tie_word_embeddings=False,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.ignore_index = ignore_index
+        self.image_token_index = image_token_index
+        self.tie_word_embeddings = tie_word_embeddings
+        attn_implementation = kwargs.pop("attn_implementation", None)
+
+        # Set the default attention implementation to flash_attention_2 if not
+        # specified
+        self._attn_implementation = ("flash_attention_2"
+                                     if attn_implementation is None else
+                                     attn_implementation)
+
+        # Convert the keys and values of projector_patch_to_query_dict to
+        # integers
+        # This ensures consistency even if they were provided as strings
+        self.projector_patch_to_query_dict = {
+            int(k): int(v)
+            for k, v in projector_patch_to_query_dict.items()
+        }
+
+        if isinstance(vision_config, dict) and "model_type" in vision_config:
+            vision_config = AriaVisionConfig(**vision_config)
+            if attn_implementation is None:
+                vision_attn_implementation = "flash_attention_2"
+            elif attn_implementation == "sdpa":
+                logger.warning("SDPA is not supported for vit, using "
+                               "flash_attention_2 instead")
+                vision_attn_implementation = "flash_attention_2"
+            else:
+                vision_attn_implementation = attn_implementation
+            vision_config._attn_implementation = vision_attn_implementation
+
+        self.vision_config = vision_config
+
+        if isinstance(text_config, dict) and "model_type" in text_config:
+            text_attn_implementation = ("sdpa" if attn_implementation is None
+                                        else attn_implementation)
+            text_config = AriaMoELMConfig(**text_config)
+            text_config._attn_implementation = text_attn_implementation
+
+        self.text_config = text_config
+
+        # This is needed for the static kv cache
+        self.num_hidden_layers = self.text_config.num_hidden_layers

From e66faf4809cebf0b2169887151f782fd99bf208f Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Sun, 19 Jan 2025 16:27:26 +0800
Subject: [PATCH 271/462] [torch.compile] store inductor compiled Python file
 (#12182)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 vllm/compilation/backends.py | 80 ++++++++++++++++++++++++++----------
 vllm/config.py               | 13 +-----
 2 files changed, 60 insertions(+), 33 deletions(-)

diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
index 157e3f7f39c9c..d7f4dcb7a20fc 100644
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -25,23 +25,30 @@
 logger = init_logger(__name__)
 
 
+@dataclasses.dataclass
+class InductorArtifact:
+    hash_str: str = ""
+    file_path: str = ""
+
+
 class InductorHashCache:
     """
     Disk format: a Python list of tuples, each tuple is
-    (runtime_shape, graph_index, hash_str)
+    (runtime_shape, graph_index, hash_str, file_path)
     We use list of tuple for readability.
 
     In-memory format: a defaultdict of dict, where the key is
     runtime_shape, and the value is a dict of graph_index to hash_str.
 
-    The data is essentially `Dict[Optional[int], Dict[int, str]]`,
+    The data is essentially `Dict[Optional[int], Dict[int, InductorArtifact]]`,
     we don't use json here because json doesn't support int as key.
 
     TODO: better off-the-shelf solution to serialize the data?
     """
 
     def __init__(self, cache_dir: str, disabled: bool = False):
-        self.cache: defaultdict = defaultdict(dict)
+        self.cache: Dict[Optional[int],
+                         Dict[int, InductorArtifact]] = defaultdict(dict)
         self.disabled = disabled
         self.cache_dir = cache_dir
         self.cache_file_path = os.path.join(cache_dir,
@@ -66,14 +73,25 @@ def deserialize(self, data: str):
         # because it is a safe way to parse Python literals.
         # do not use eval(), it is unsafe.
         list_data = ast.literal_eval(data)
-        for runtime_shape, graph_index, hash_str in list_data:
-            self.cache[runtime_shape][graph_index] = hash_str
+        for item in list_data:
+            runtime_shape = item[0]
+            graph_index = item[1]
+            hash_str = item[2]
+            # for compatibility of old version,
+            # where we don't have file_path.
+            # NOTE: after running the new code, the file_path
+            # will be updated.
+            file_path = "" if len(item) == 3 else item[3]
+            self.cache[runtime_shape][graph_index] = InductorArtifact(
+                hash_str=hash_str, file_path=file_path)
 
     def serialize(self) -> str:
         data = []
-        for runtime_shape, graph_index_to_hash_str in self.cache.items():
-            for graph_index, hash_str in graph_index_to_hash_str.items():
-                data.append((runtime_shape, graph_index, hash_str))
+        for runtime_shape, value in self.cache.items():
+            for graph_index, inductor_artifact in value.items():
+                data.append(
+                    (runtime_shape, graph_index, inductor_artifact.hash_str,
+                     inductor_artifact.file_path))
         printer = pprint.PrettyPrinter(indent=4)
         return printer.pformat(data)
 
@@ -90,13 +108,14 @@ def __contains__(self, key: Tuple[Optional[int], int]) -> bool:
         return runtime_shape in self.cache and graph_index in self.cache[
             runtime_shape]
 
-    def __getitem__(self, key: Tuple[Optional[int], int]) -> str:
+    def __getitem__(self, key: Tuple[Optional[int], int]) -> InductorArtifact:
         if self.disabled:
             raise KeyError("cannot read from disabled cache")
         runtime_shape, graph_index = key
         return self.cache[runtime_shape][graph_index]
 
-    def __setitem__(self, key: Tuple[Optional[int], int], value: str):
+    def __setitem__(self, key: Tuple[Optional[int], int],
+                    value: InductorArtifact):
         # setitem for disabled cache is fine, because we
         # don't actually write to the disk
         runtime_shape, graph_index = key
@@ -181,7 +200,8 @@ def wrap_inductor(graph: fx.GraphModule,
     if (runtime_shape, graph_index) in cache_data:
         # we compiled this graph before
         # so we can directly lookup the compiled graph via hash
-        hash_str = cache_data[(runtime_shape, graph_index)]
+        inductor_artifact = cache_data[(runtime_shape, graph_index)]
+        hash_str = inductor_artifact.hash_str
         if graph_index == 0:
             # adds some info logging for the first graph
             logger.info(
@@ -199,6 +219,7 @@ def wrap_inductor(graph: fx.GraphModule,
                 "Inductor cache lookup failed. Please remove"
                 f"the cache file {cache_data.cache_file_path} and try again."  # noqa
             )
+            inductor_artifact.file_path = inductor_compiled_graph.current_callable.__code__.co_filename  # noqa
 
         # Inductor calling convention (function signature):
         # f(list) -> tuple
@@ -224,19 +245,20 @@ def compiled_graph(*args):
         # the assumption is that we don't have nested Inductor compilation.
         # compiled_fx_graph_hash will only be called once, and we can hook
         # it to get the hash of the compiled graph directly.
-        from torch._inductor.codecache import compiled_fx_graph_hash
+
+        inductor_artifact = InductorArtifact()
+        from torch._inductor.codecache import (FxGraphCache,
+                                               compiled_fx_graph_hash)
+        original_load = FxGraphCache.load
+
+        def hijack_load(*args, **kwargs):
+            inductor_compiled_graph = original_load(*args, **kwargs)
+            inductor_artifact.file_path = inductor_compiled_graph.current_callable.__code__.co_filename  # noqa
+            return inductor_compiled_graph
 
         def hijack_compiled_fx_graph_hash(*args, **kwargs):
             out = compiled_fx_graph_hash(*args, **kwargs)
-            # store the hash in the cache
-            nonlocal cache_data
-            cache_data[(runtime_shape, graph_index)] = out[0]
-            if graph_index == 0:
-                # adds some info logging for the first graph
-                logger.info("Cache the graph of shape %s for later use",
-                            str(runtime_shape))
-            logger.debug("store the %s-th graph for shape %s via hash %s",
-                         graph_index, str(runtime_shape), out[0])
+            inductor_artifact.hash_str = out[0]
             return out
 
         def _check_can_cache(*args, **kwargs):
@@ -255,6 +277,11 @@ def _get_shape_env() -> AlwaysHitShapeEnv:
             if not cache_data.disabled:
                 # compilation cache is enabled, patch several functions
 
+                # hijack to get the compiled graph itself
+                stack.enter_context(
+                    patch("torch._inductor.codecache.FxGraphCache.load",
+                          hijack_load))
+
                 # for hijacking the hash of the compiled graph
                 stack.enter_context(
                     patch("torch._inductor.codecache.compiled_fx_graph_hash",
@@ -275,7 +302,16 @@ def _get_shape_env() -> AlwaysHitShapeEnv:
             compiled_graph = compile_fx(graph,
                                         example_inputs,
                                         config_patches=current_config)
-
+        # store the inductor_artifact in the cache
+        cache_data[(runtime_shape, graph_index)] = inductor_artifact
+        if graph_index == 0:
+            # adds some info logging for the first graph
+            logger.info("Cache the graph of shape %s for later use",
+                        str(runtime_shape))
+        logger.debug(
+            "store the %s-th graph for shape %s via hash %s from file %s",
+            graph_index, str(runtime_shape), inductor_artifact.hash_str,
+            inductor_artifact.file_path)
     # after compiling the last graph, record the end time
     if graph_index == num_graphs - 1:
         now = time.time()
diff --git a/vllm/config.py b/vllm/config.py
index ac5a4c91b1738..4698a05020332 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -2862,17 +2862,8 @@ def model_post_init(self, __context: Any) -> None:
                     "vllm.unified_attention_with_output",
                 ]
             else:
-                # v0 can use full graph compilation without splitting,
-                # splitting is optional.
-                # right now we still need it. kv cache shape
-                # will be included in the graph if we don't split
-                # the graph.
-                # TODO: hide kv cache in static forward context
-                # so that inductor does not see it.
-                self.splitting_ops = [
-                    "vllm.unified_attention",
-                    "vllm.unified_attention_with_output",
-                ]
+                # v0 uses full graph compilation
+                self.splitting_ops = []
 
         for k, v in self.inductor_passes.items():
             if not isinstance(v, str):

From 936db119ed390fadc7de448261226358e153e46c Mon Sep 17 00:00:00 2001
From: gujing <925973396@qq.com>
Date: Sun, 19 Jan 2025 17:59:56 +0800
Subject: [PATCH 272/462] benchmark_serving support --served-model-name param
 (#12109)

Signed-off-by: zibai <zibai.gj@alibaba-inc.com>
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
---
 benchmarks/backend_request_func.py |  9 ++++++---
 benchmarks/benchmark_serving.py    | 13 +++++++++++++
 2 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py
index 9d71e4ecc4a37..a9ab4fc9b621e 100644
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -22,6 +22,7 @@ class RequestFuncInput:
     prompt_len: int
     output_len: int
     model: str
+    model_name: Optional[str] = None
     best_of: int = 1
     logprobs: Optional[int] = None
     extra_body: Optional[dict] = None
@@ -78,7 +79,7 @@ async def async_request_tgi(
                             continue
                         chunk_bytes = chunk_bytes.decode("utf-8")
 
-                        #NOTE: Sometimes TGI returns a ping response without
+                        # NOTE: Sometimes TGI returns a ping response without
                         # any data, we should skip it.
                         if chunk_bytes.startswith(":"):
                             continue
@@ -235,7 +236,8 @@ async def async_request_openai_completions(
 
     async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
         payload = {
-            "model": request_func_input.model,
+            "model": request_func_input.model_name \
+                if request_func_input.model_name else request_func_input.model,
             "prompt": request_func_input.prompt,
             "temperature": 0.0,
             "best_of": request_func_input.best_of,
@@ -328,7 +330,8 @@ async def async_request_openai_chat_completions(
         if request_func_input.multi_modal_content:
             content.append(request_func_input.multi_modal_content)
         payload = {
-            "model": request_func_input.model,
+            "model": request_func_input.model_name \
+                if request_func_input.model_name else request_func_input.model,
             "messages": [
                 {
                     "role": "user",
diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index 4eb0e1f8ac903..53186e10c5452 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -525,6 +525,7 @@ async def benchmark(
     api_url: str,
     base_url: str,
     model_id: str,
+    model_name: str,
     tokenizer: PreTrainedTokenizerBase,
     input_requests: List[Tuple[str, int, int]],
     logprobs: Optional[int],
@@ -553,6 +554,7 @@ async def benchmark(
             "Multi-modal content is only supported on 'openai-chat' backend.")
     test_input = RequestFuncInput(
         model=model_id,
+        model_name=model_name,
         prompt=test_prompt,
         api_url=api_url,
         prompt_len=test_prompt_len,
@@ -573,6 +575,7 @@ async def benchmark(
     if profile:
         print("Starting profiler...")
         profile_input = RequestFuncInput(model=model_id,
+                                         model_name=model_name,
                                          prompt=test_prompt,
                                          api_url=base_url + "/start_profile",
                                          prompt_len=test_prompt_len,
@@ -616,6 +619,7 @@ async def limited_request_func(request_func_input, pbar):
     async for request in get_request(input_requests, request_rate, burstiness):
         prompt, prompt_len, output_len, mm_content = request
         request_func_input = RequestFuncInput(model=model_id,
+                                              model_name=model_name,
                                               prompt=prompt,
                                               api_url=api_url,
                                               prompt_len=prompt_len,
@@ -780,6 +784,7 @@ def main(args: argparse.Namespace):
 
     backend = args.backend
     model_id = args.model
+    model_name = args.served_model_name
     tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model
     tokenizer_mode = args.tokenizer_mode
 
@@ -877,6 +882,7 @@ def main(args: argparse.Namespace):
             api_url=api_url,
             base_url=base_url,
             model_id=model_id,
+            model_name=model_name,
             tokenizer=tokenizer,
             input_requests=input_requests,
             logprobs=args.logprobs,
@@ -1222,5 +1228,12 @@ def main(args: argparse.Namespace):
         'always use the slow tokenizer. \n* '
         '"mistral" will always use the `mistral_common` tokenizer.')
 
+    parser.add_argument("--served-model-name",
+                        type=str,
+                        default=None,
+                        help="The model name used in the API. "
+                        "If not specified, the model name will be the "
+                        "same as the ``--model`` argument. ")
+
     args = parser.parse_args()
     main(args)

From edaae198e72d36e22a10e9e76198fac32f670b49 Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Sun, 19 Jan 2025 19:49:22 +0800
Subject: [PATCH 273/462] [Misc] Add BNB support to GLM4-V model (#12184)

Signed-off-by: Isotr0py <2037008807@qq.com>
---
 vllm/model_executor/model_loader/loader.py    | 15 ++-
 vllm/model_executor/models/chatglm.py         | 95 +++++++++----------
 .../models/glm4_vision_encoder.py             |  3 +-
 3 files changed, 60 insertions(+), 53 deletions(-)

diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index 9fe0db62435a0..e6e37358482fc 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -1105,15 +1105,22 @@ def _load_weights(self, model_config: ModelConfig,
                     weight_name,
                     index,
             ) in self.modules_mapping.inverse_packed_mapping.items():
-                shard_pos = quant_param_name.find(shard_name)
                 # Some models, such as MiniCPM V2.5/2.6, contain both
                 # module names 'kv_proj' and 'qkv_proj'. To prevent 'kv_proj'
                 # from being incorrectly identified as being present in
                 # 'vpm.encoder.layers.0.self_attn.qkv_proj.weight
-                if shard_pos > 0 and quant_param_name[shard_pos - 1] == ".":
+                shard_pos = quant_param_name.find(shard_name)
+                can_correct_rename = (shard_pos > 0) and (
+                    quant_param_name[shard_pos - 1] == ".")
+                # If the quant_param_name is packed, it won't occur in the
+                # param_dict before renaming.
+                new_quant_param_name = quant_param_name.replace(
+                    shard_name, weight_name)
+                need_rename = (quant_param_name not in param_dict) \
+                              and (new_quant_param_name in param_dict)
+                if can_correct_rename and need_rename:
                     shard_index = index
-                    quant_param_name = quant_param_name.replace(
-                        shard_name, weight_name)
+                    quant_param_name = new_quant_param_name
                     break
 
             # Models like Clip/Siglip may skip some layers in initialization,
diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py
index 7e37ce3086e6b..d5f9b4d19e5ca 100644
--- a/vllm/model_executor/models/chatglm.py
+++ b/vllm/model_executor/models/chatglm.py
@@ -41,7 +41,7 @@
 from vllm.transformers_utils.configs import ChatGLMConfig
 
 from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP
-from .utils import (is_pp_missing_parameter,
+from .utils import (AutoWeightsLoader, WeightsMapper, is_pp_missing_parameter,
                     make_empty_intermediate_tensors_factory, make_layers,
                     maybe_prefix)
 
@@ -605,9 +605,50 @@ def forward(
             return IntermediateTensors({"hidden_states": hidden_states})
         return hidden_states
 
+    def load_weights(self, weights: Iterable[Tuple[str,
+                                                   torch.Tensor]]) -> Set[str]:
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("linear_proj.merged_proj", "linear_proj.gate_proj", 0),
+            ("linear_proj.merged_proj", "linear_proj.dense_h_to_4h", 1),
+        ]
+        params_dict = dict(self.named_parameters())
+        loaded_params: Set[str] = set()
+
+        for name, loaded_weight in weights:
+            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                if is_pp_missing_parameter(name, self):
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                if "rotary_pos_emb.inv_freq" in name:
+                    continue
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                if is_pp_missing_parameter(name, self):
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader",
+                                        default_weight_loader)
+                weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+        return loaded_params
+
 
 class ChatGLMBaseModel(nn.Module, SupportsLoRA, SupportsPP):
 
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_substr={".word_embeddings": ""}, )
+
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
@@ -660,52 +701,9 @@ def sample(
         next_tokens = self.sampler(logits, sampling_metadata)
         return next_tokens
 
-    def load_weights(self, weights: Iterable[Tuple[str,
-                                                   torch.Tensor]]) -> Set[str]:
-        # Merge two ColumnParallelLinear into one MergedColumnParallelLinear
-        merged_weights_dict: Dict[str, Dict[str, Optional[torch.Tensor]]] = {
-            "transformer.vision.linear_proj.merged_proj.weight": {
-                "transformer.vision.linear_proj.gate_proj.weight": None,
-                "transformer.vision.linear_proj.dense_h_to_4h.weight": None,
-            }
-        }
-
-        params_dict = dict(self.named_parameters(remove_duplicate=False))
-        loaded_params: Set[str] = set()
-        for name, loaded_weight in weights:
-            is_weight_to_be_merge = False
-            for _, merged_weight_dict in merged_weights_dict.items():
-                if name in merged_weight_dict:
-                    assert merged_weight_dict[name] is None
-                    merged_weight_dict[name] = loaded_weight
-                    is_weight_to_be_merge = True
-            if is_weight_to_be_merge:
-                continue
-            if "rotary_pos_emb.inv_freq" in name:
-                continue
-            if "word_embeddings" in name:
-                name = name.replace(".word_embeddings", "")
-            # Skip loading extra bias for GPTQ models.
-            if name.endswith(".bias") and name not in params_dict:
-                continue
-            if is_pp_missing_parameter(name, self):
-                continue
-            param = params_dict[name]
-            weight_loader = getattr(param, "weight_loader",
-                                    default_weight_loader)
-            weight_loader(param, loaded_weight)
-            loaded_params.add(name)
-
-        for combined_name, merged_weight_dict in merged_weights_dict.items():
-            if combined_name in params_dict:
-                param = params_dict[combined_name]
-                combined_weight = torch.cat(list(merged_weight_dict.values()),
-                                            dim=0)
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
-                weight_loader(param, combined_weight)
-                loaded_params.add(combined_name)
-        return loaded_params
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        loader = AutoWeightsLoader(self)
+        return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
 
 
 class ChatGLM(ChatGLMBaseModel):
@@ -726,6 +724,7 @@ class ChatGLM(ChatGLMBaseModel):
 
 
 class ChatGLMV(ChatGLMBaseModel, SupportsMultiModal):
+
     packed_modules_mapping = {
         "query_key_value": ["query_key_value"],
         "dense_h_to_4h": ["dense_h_to_4h"],
@@ -777,7 +776,7 @@ def __new__(
     ) -> None:
         config = vllm_config.model_config.hf_config
         # Initialize VL
-        if hasattr(config, "visual"):
+        if hasattr(config, "vision_config"):
             return ChatGLMV(vllm_config=vllm_config, prefix=prefix)
         # Initialize LLM
         else:
diff --git a/vllm/model_executor/models/glm4_vision_encoder.py b/vllm/model_executor/models/glm4_vision_encoder.py
index 39a5736eb199b..51922e6f2d03d 100644
--- a/vllm/model_executor/models/glm4_vision_encoder.py
+++ b/vllm/model_executor/models/glm4_vision_encoder.py
@@ -42,7 +42,8 @@ def forward(self, images: torch.Tensor) -> torch.Tensor:
         torch.Tensor
             Transformed tensor with shape (B, L, D)
         """
-        images = images.to(self.proj.weight.device)
+        images = images.to(device=self.proj.weight.device,
+                           dtype=self.proj.weight.dtype)
         x = self.proj(images)
         x = x.flatten(2).transpose(1, 2)
         cls_token = self.cls_embedding.expand(x.shape[0], -1, -1)

From 81763c58a01eda9205f3750177358acc79613e65 Mon Sep 17 00:00:00 2001
From: Roger Wang <136131678+ywang96@users.noreply.github.com>
Date: Sun, 19 Jan 2025 03:52:13 -0800
Subject: [PATCH 274/462] [V1] Add V1 support of Qwen2-VL (#12128)

Signed-off-by: Roger Wang <ywang@roblox.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: imkero <kerorek@outlook.com>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 docs/source/models/supported_models.md        |   2 +-
 .../vision_language/test_qwen2_vl.py          |  18 +--
 vllm/compilation/decorators.py                |  14 +-
 .../model_executor/layers/rotary_embedding.py |  44 +++++-
 vllm/model_executor/models/llava_onevision.py |   6 +-
 vllm/model_executor/models/qwen2.py           |  10 +-
 vllm/model_executor/models/qwen2_vl.py        | 142 ++++++++++--------
 vllm/v1/worker/gpu_input_batch.py             |   3 +
 vllm/v1/worker/gpu_model_runner.py            | 138 ++++++++++++++++-
 9 files changed, 292 insertions(+), 85 deletions(-)

diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md
index 2edb610ddf959..eb1bde9ec0089 100644
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -754,7 +754,7 @@ See [this page](#generative-models) for more information on how to use generativ
   - `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc.
   - ✅︎
   - ✅︎
-  -
+  - ✅︎
 * - `UltravoxModel`
   - Ultravox
   - T + A<sup>E+</sup>
diff --git a/tests/models/decoder_only/vision_language/test_qwen2_vl.py b/tests/models/decoder_only/vision_language/test_qwen2_vl.py
index 16e256e040a74..2fd22f0cc88ec 100644
--- a/tests/models/decoder_only/vision_language/test_qwen2_vl.py
+++ b/tests/models/decoder_only/vision_language/test_qwen2_vl.py
@@ -105,7 +105,7 @@ def batch_make_image_embeddings(
     pixel_values = preprocess_result["pixel_values"]
     image_grid_thw = preprocess_result["image_grid_thw"]
 
-    # pixel values to embeddinds & grid_thws
+    # pixel values to embeddings & grid_thws
     with torch.no_grad():
         visual = llm.llm_engine.model_executor.driver_worker. \
             model_runner.model.visual
@@ -124,11 +124,10 @@ def batch_make_image_embeddings(
     for image_batch in image_batches_:
         cur_batch_image_count = len(image_batch)
         merge_size = image_processor.merge_size
-        cur_batch_embed_len = sum([
-            grid_thw.prod() // merge_size // merge_size
+        cur_batch_embed_len = sum(
+            grid_thw.prod(-1) // merge_size // merge_size
             for grid_thw in image_grid_thw[image_counter:image_counter +
-                                           cur_batch_image_count]
-        ])
+                                           cur_batch_image_count])
 
         result.append({
             "image_embeds":
@@ -187,7 +186,7 @@ def batch_make_video_embeddings(
     pixel_values = preprocess_result["pixel_values_videos"]
     video_grid_thw = preprocess_result["video_grid_thw"]
 
-    # pixel values to embeddinds & grid_thws
+    # pixel values to embeddings & grid_thws
     with torch.no_grad():
         visual = llm.llm_engine.model_executor.driver_worker.\
             model_runner.model.visual
@@ -206,11 +205,10 @@ def batch_make_video_embeddings(
     for video_batch in video_batches_:
         cur_batch_video_count = len(video_batch)
         merge_size = image_processor.merge_size
-        cur_batch_embed_len = sum([
-            grid_thw.prod() // merge_size // merge_size
+        cur_batch_embed_len = sum(
+            grid_thw.prod(-1) // merge_size // merge_size
             for grid_thw in video_grid_thw[video_counter:video_counter +
-                                           cur_batch_video_count]
-        ])
+                                           cur_batch_video_count])
 
         result.append({
             "video_embeds":
diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py
index 10513111ea7f1..38f284794b8db 100644
--- a/vllm/compilation/decorators.py
+++ b/vllm/compilation/decorators.py
@@ -76,8 +76,8 @@ def forward(self, x: torch.Tensor, y: Optional[torch.Tensor]):
     During runtime, when we actually mark dimensions of tensors,
      it depends on the value of arguments:
 
-    - if it is a single integer, the corresponding dimension of the argument
-        will be marked as dynamic.
+    - if it is a single integer (can be negative), the corresponding dimension 
+        of the argument will be marked as dynamic.
     - if it is `None`, ignored.
     - if it is `IntermediateTensors`, all the tensors in the intermediate
         tensors will be marked as dynamic.
@@ -177,10 +177,20 @@ def __call__(self, *args, **kwargs):
             for k, dims in dynamic_arg_dims.items():
                 arg = bound_args.arguments.get(k)
                 if arg is not None:
+                    dims = [dims] if isinstance(dims, int) else dims
                     if isinstance(arg, torch.Tensor):
+                        # In case dims is specified with negative indexing
+                        dims = [
+                            arg.ndim + dim if dim < 0 else dim for dim in dims
+                        ]
                         torch._dynamo.mark_dynamic(arg, dims)
                     elif isinstance(arg, IntermediateTensors):
                         for tensor in arg.tensors.values():
+                            # In case dims is specified with negative indexing
+                            dims = [
+                                tensor.ndim + dim if dim < 0 else dim
+                                for dim in dims
+                            ]
                             torch._dynamo.mark_dynamic(tensor, dims)
                     else:
                         raise ValueError(
diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py
index 3fcd81a3c4213..d071cfe888f05 100644
--- a/vllm/model_executor/layers/rotary_embedding.py
+++ b/vllm/model_executor/layers/rotary_embedding.py
@@ -841,6 +841,37 @@ def get_input_positions(
     ) -> Tuple[List[List[int]], int]:
         """Get mrope input positions and delta value."""
 
+        llm_positions, mrope_position_delta = \
+            MRotaryEmbedding.get_input_positions_tensor(
+                input_tokens,
+                image_grid_thw,
+                video_grid_thw,
+                image_token_id,
+                video_token_id,
+                vision_start_token_id,
+                vision_end_token_id,
+                spatial_merge_size,
+                context_len,
+                seq_len,
+            )
+
+        return llm_positions.tolist(), mrope_position_delta
+
+    @staticmethod
+    def get_input_positions_tensor(
+        input_tokens: List[int],
+        image_grid_thw: Union[List[List[int]], torch.Tensor],
+        video_grid_thw: Union[List[List[int]], torch.Tensor],
+        image_token_id: int,
+        video_token_id: int,
+        vision_start_token_id: int,
+        vision_end_token_id: int,
+        spatial_merge_size: int,
+        context_len: int = 0,
+        seq_len: Optional[int] = None,
+    ) -> Tuple[torch.Tensor, int]:
+        """Get mrope input positions and delta value."""
+
         if isinstance(image_grid_thw, torch.Tensor):
             image_grid_thw = image_grid_thw.tolist()
         if isinstance(video_grid_thw, torch.Tensor):
@@ -916,7 +947,7 @@ def get_input_positions(
                                 len(input_tokens)).item()
         llm_positions = llm_positions[:, context_len:seq_len]
 
-        return llm_positions.tolist(), mrope_position_delta
+        return llm_positions, mrope_position_delta
 
     @staticmethod
     def get_next_input_positions(
@@ -930,6 +961,17 @@ def get_next_input_positions(
                       seq_len + mrope_position_delta)) for _ in range(3)
         ]
 
+    @staticmethod
+    def get_next_input_positions_tensor(
+        mrope_position_delta: int,
+        context_len: int,
+        seq_len: int,
+    ) -> torch.Tensor:
+        return torch.arange(
+            mrope_position_delta + context_len,
+            mrope_position_delta + seq_len,
+        ).expand(3, -1)
+
 
 _ROPE_DICT: Dict[Tuple, RotaryEmbedding] = {}
 
diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py
index c9283e0c5ba20..6faa79f65d8de 100644
--- a/vllm/model_executor/models/llava_onevision.py
+++ b/vllm/model_executor/models/llava_onevision.py
@@ -554,10 +554,12 @@ def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict:
         # Preserve the order of modalities if there are multiple of them
         # from the order of kwargs.
         for input_key in kwargs:
-            if input_key == "pixel_values" and "images" not in modalities:
+            if input_key in ("pixel_values",
+                             "image_embeds") and "images" not in modalities:
                 modalities["images"] = self._parse_and_validate_image_input(
                     **kwargs)
-            if input_key == "pixel_values_videos" and "videos" not in modalities:  # noqa E501
+            if input_key in ("pixel_values_videos",
+                             "video_embeds") and "videos" not in modalities:
                 modalities["videos"] = self._parse_and_validate_video_input(
                     **kwargs)
 
diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py
index d015f60c6d065..82de1c3574090 100644
--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
@@ -256,7 +256,15 @@ def forward(
         return hidden_states, residual
 
 
-@support_torch_compile
+@support_torch_compile(
+    dynamic_arg_dims={
+        "input_ids": 0,
+        # positions is of shape (3, seq_len) if mrope is enabled for qwen2-vl,
+        # otherwise (seq_len, ).
+        "positions": -1,
+        "intermediate_tensors": 0,
+        "inputs_embeds": 0,
+    })
 class Qwen2Model(nn.Module):
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index d00e5d362c8bc..34d5c8ad089a3 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -67,11 +67,15 @@
 
 from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP
 from .utils import (AutoWeightsLoader, WeightsMapper,
-                    init_vllm_registered_model, maybe_prefix)
+                    init_vllm_registered_model, maybe_prefix,
+                    merge_multimodal_embeddings)
 from .vision import get_vit_attn_backend
 
 logger = init_logger(__name__)
 
+# For profile run
+_MAX_FRAMES_PER_VIDEO = 16
+
 # === Vision Inputs === #
 
 
@@ -135,7 +139,7 @@ class Qwen2VLVideoEmbeddingInputs(TypedDict):
     - List[`torch.Tensor`]: A list of tensors holding all videos' features.
         Each tensor holds an video's features.
     - `torch.Tensor`: A tensor holding all videos' features
-      (concatenation of all videos' feature tensors).
+        (concatenation of all videos' feature tensors).
     
     Tensor shape: `(num_image_features, hidden_size)`
     - `num_image_features` varies based on 
@@ -611,6 +615,7 @@ def forward(
 
         # adapter
         x = self.merger(x)
+
         return x
 
     def load_weights(self, weights: Iterable[Tuple[str,
@@ -874,8 +879,8 @@ def get_num_frames_with_most_features(self, seq_len: int) -> int:
         max_image_tokens = self.get_max_image_tokens() * max_images
         max_total_frames = self._get_max_video_frames(seq_len -
                                                       max_image_tokens)
-
-        num_frames = max(max_total_frames // max(max_videos, 1), 1)
+        num_frames = min(max(max_total_frames // max(max_videos, 1), 1),
+                         _MAX_FRAMES_PER_VIDEO)
 
         # Temporary workaround for https://github.com/huggingface/transformers/issues/35412
         if num_frames > 1 and num_frames % 2 == 1:
@@ -955,13 +960,14 @@ def _get_prompt_replacements(
             "image": hf_processor.image_token,
             "video": hf_processor.video_token,
         }
+
         merge_length = image_processor.merge_size**2
 
         def get_replacement_qwen2vl(item_idx: int, modality: str):
             grid_thw = out_mm_kwargs[f"{modality}_grid_thw"][item_idx]
             assert isinstance(grid_thw, torch.Tensor)
 
-            num_tokens = grid_thw.prod() // merge_length
+            num_tokens = grid_thw.prod().item() // merge_length
             return placeholder[modality] * num_tokens
 
         return [
@@ -1047,11 +1053,8 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config: Qwen2VLConfig = vllm_config.model_config.hf_config
-        cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
         multimodal_config = vllm_config.model_config.multimodal_config
-        assert not cache_config.enable_prefix_caching, \
-            "Qwen2-VL currently does not support prefix caching"
 
         self.config = config
         self.multimodal_config = multimodal_config
@@ -1173,59 +1176,82 @@ def _parse_and_validate_video_input(
                                                video_embeds=video_embeds,
                                                video_grid_thw=video_grid_thw)
 
-    def _process_image_input(self,
-                             image_input: Qwen2VLImageInputs) -> torch.Tensor:
+    def _process_image_input(
+            self, image_input: Qwen2VLImageInputs) -> tuple[torch.Tensor, ...]:
+
+        grid_thw = image_input["image_grid_thw"]
+        assert grid_thw.ndim == 2
+
         if image_input["type"] == "image_embeds":
-            return image_input["image_embeds"].type(self.visual.dtype)
+            image_embeds = image_input["image_embeds"].type(self.visual.dtype)
+        else:
+            pixel_values = image_input["pixel_values"].type(self.visual.dtype)
+            image_embeds = self.visual(pixel_values, grid_thw=grid_thw)
+
+        # Split concatenated embeddings for each image item.
+        merge_size = self.visual.spatial_merge_size
+        sizes = grid_thw.prod(-1) // merge_size // merge_size
+
+        return image_embeds.split(sizes.tolist())
+
+    def _process_video_input(
+            self, video_input: Qwen2VLVideoInputs) -> tuple[torch.Tensor, ...]:
 
-        pixel_values = image_input["pixel_values"].type(self.visual.dtype)
-        image_embeds = self.visual(pixel_values,
-                                   grid_thw=image_input["image_grid_thw"])
-        return image_embeds
+        grid_thw = video_input["video_grid_thw"]
+        assert grid_thw.ndim == 2
 
-    def _process_video_input(self,
-                             video_input: Qwen2VLVideoInputs) -> torch.Tensor:
         if video_input["type"] == "video_embeds":
-            return video_input["video_embeds"].type(self.visual.dtype)
+            video_embeds = video_input["video_embeds"].type(self.visual.dtype)
+        else:
+            pixel_values_videos = video_input["pixel_values_videos"].type(
+                self.visual.dtype)
+            video_embeds = self.visual(pixel_values_videos, grid_thw=grid_thw)
 
-        pixel_values_videos = video_input["pixel_values_videos"].type(
-            self.visual.dtype)
-        video_embeds = self.visual(pixel_values_videos,
-                                   grid_thw=video_input["video_grid_thw"])
-        return video_embeds
+        # Split concatenated embeddings for each video item.
+        merge_size = self.visual.spatial_merge_size
+        sizes = grid_thw.prod(-1) // merge_size // merge_size
 
-    def _merge_multimodal_embeddings(
-        self,
-        input_ids: torch.Tensor,
-        inputs_embeds: torch.Tensor,
-        multimodal_embeddings: torch.Tensor,
-        placeholder_token_id: int,
-    ) -> torch.Tensor:
-        mask = (input_ids == placeholder_token_id)
-        inputs_embeds[mask, :] = multimodal_embeddings
-        return inputs_embeds
+        return video_embeds.split(sizes.tolist())
+
+    def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict:
+        modalities = {}
+
+        # Preserve the order of modalities if there are multiple of them
+        # from the order of kwargs.
+        for input_key in kwargs:
+            if input_key in ("pixel_values",
+                             "image_embeds") and "images" not in modalities:
+                modalities["images"] = self._parse_and_validate_image_input(
+                    **kwargs)
+            if input_key in ("pixel_values_videos",
+                             "video_embeds") and "videos" not in modalities:
+                modalities["videos"] = self._parse_and_validate_video_input(
+                    **kwargs)
+
+        return modalities
 
     def get_multimodal_embeddings(
             self, **kwargs) -> Optional[List[Tuple[NestedTensors, str]]]:
 
-        image_input = self._parse_and_validate_image_input(**kwargs)
-        video_input = self._parse_and_validate_video_input(**kwargs)
-        if image_input is None and video_input is None:
+        modalities = self._parse_and_validate_multimodal_inputs(**kwargs)
+        if not modalities:
             return None
 
-        # We make a tuple of each embedding with its modality string. This is a
-        # temporary workaround for models to handle mixed modalities when
-        # get_multimodal_embeddings and get_input_embeddings are called
-        # separately.
-        # TODO(ywang96): Add support for mixed-modality inference for v1.
-        multimodal_embeddings: List[Tuple[NestedTensors, str]] = []
-
-        if image_input is not None:
-            image_embeds = self._process_image_input(image_input)
-            multimodal_embeddings.append((image_embeds, "image"))
-        if video_input is not None:
-            video_embeds = self._process_video_input(video_input)
-            multimodal_embeddings.append((video_embeds, "video"))
+        # The result multimodal_embeddings is tuple of tensors, with each
+        # tensor correspoending to a multimodal data item (image or video).
+        multimodal_embeddings: tuple[torch.Tensor, ...] = ()
+
+        # NOTE: It is important to iterate over the keys in this dictionary
+        # to preserve the order of the modalities.
+        for modality in modalities:
+            if modality == "images":
+                image_input = modalities["images"]
+                vision_embeddings = self._process_image_input(image_input)
+                multimodal_embeddings += vision_embeddings
+            if modality == "videos":
+                video_input = modalities["videos"]
+                video_embeddings = self._process_video_input(video_input)
+                multimodal_embeddings += video_embeddings
 
         return multimodal_embeddings
 
@@ -1237,21 +1263,9 @@ def get_input_embeddings(
     ) -> torch.Tensor:
         inputs_embeds = self.language_model.get_input_embeddings(input_ids)
         if multimodal_embeddings is not None:
-            for embeddings, modality in multimodal_embeddings:
-                if modality == "image":
-                    inputs_embeds = self._merge_multimodal_embeddings(
-                        input_ids,
-                        inputs_embeds,
-                        embeddings,
-                        placeholder_token_id=self.config.image_token_id,
-                    )
-                if modality == "video":
-                    inputs_embeds = self._merge_multimodal_embeddings(
-                        input_ids,
-                        inputs_embeds,
-                        embeddings,
-                        placeholder_token_id=self.config.video_token_id,
-                    )
+            inputs_embeds = merge_multimodal_embeddings(
+                input_ids, inputs_embeds, multimodal_embeddings,
+                [self.config.image_token_id, self.config.video_token_id])
         return inputs_embeds
 
     def forward(
diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py
index 40494e64b22f0..28d8e39053874 100644
--- a/vllm/v1/worker/gpu_input_batch.py
+++ b/vllm/v1/worker/gpu_input_batch.py
@@ -30,6 +30,9 @@ class CachedRequestState:
     num_computed_tokens: int
     output_token_ids: List[int]
 
+    mrope_positions: Optional[torch.Tensor] = None
+    mrope_position_delta: Optional[int] = None
+
     @property
     def num_tokens(self) -> int:
         return len(self.prompt_token_ids) + len(self.output_token_ids)
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index aa63d9414c296..87a1cd7f9e627 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -14,6 +14,7 @@
 from vllm.forward_context import set_forward_context
 from vllm.inputs import INPUT_REGISTRY
 from vllm.logger import init_logger
+from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding
 from vllm.model_executor.model_loader import get_model
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
 from vllm.sampling_params import SamplingType
@@ -139,6 +140,32 @@ def __init__(
         self.positions = torch.zeros(self.max_num_tokens,
                                      dtype=torch.int64,
                                      device=self.device)
+
+        # Only relevant for models using M-RoPE (e.g, Qwen2-VL)
+        if self.model_config.uses_mrope:
+            # NOTE: `mrope_positions` is implemented as a permuted tensor to
+            # satisfy the following properties to allow `torch.compile` to work
+            # properly:
+            # - shape: (3, <variable>)
+            # - stride: (1, 3)
+            # See detailed explanation in https://github.com/vllm-project/vllm/pull/12128#discussion_r1921022256
+
+            # NOTE: When M-RoPE is enabled, position ids are 3D regardless of
+            # the modality of inputs. For text-only inputs, each dimension has
+            # identical position IDs, making M-RoPE functionally equivalent to
+            # 1D-RoPE.
+            # See page 5 of https://arxiv.org/abs/2409.12191
+            self.mrope_positions = torch.zeros((self.max_num_tokens, 3),
+                                               dtype=torch.int64,
+                                               device=self.device)
+            self.mrope_positions_cpu = torch.zeros((self.max_num_tokens, 3),
+                                                   dtype=torch.int64,
+                                                   device="cpu",
+                                                   pin_memory=self.pin_memory)
+
+            self.mrope_positions = self.mrope_positions.permute((1, 0))
+            self.mrope_positions_cpu = self.mrope_positions_cpu.permute((1, 0))
+
         self.inputs_embeds = torch.zeros(
             (self.max_num_tokens, self.hidden_size),
             dtype=self.dtype,
@@ -246,6 +273,35 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
                 num_computed_tokens=new_req_data.num_computed_tokens,
                 output_token_ids=[],
             )
+
+            # Only relevant for models using M-RoPE (e.g, Qwen2-VL)
+            if self.model_config.uses_mrope:
+                image_grid_thw = []
+                video_grid_thw = []
+                for mm_input in self.requests[req_id].mm_inputs:
+                    if mm_input.get("image_grid_thw") is not None:
+                        image_grid_thw.extend(
+                            mm_input["image_grid_thw"].tolist())
+                    if mm_input.get("video_grid_thw") is not None:
+                        video_grid_thw.extend(
+                            mm_input["video_grid_thw"].tolist())
+
+                hf_config = self.model_config.hf_config
+
+                self.requests[req_id].mrope_positions, \
+                    self.requests[req_id].mrope_position_delta = \
+                    MRotaryEmbedding.get_input_positions_tensor(
+                        self.requests[req_id].prompt_token_ids,
+                        image_grid_thw=image_grid_thw,
+                        video_grid_thw=video_grid_thw,
+                        image_token_id=hf_config.image_token_id,
+                        video_token_id=hf_config.video_token_id,
+                        vision_start_token_id=hf_config.vision_start_token_id,
+                        vision_end_token_id=hf_config.vision_end_token_id,
+                        spatial_merge_size=hf_config.vision_config.
+                        spatial_merge_size,
+                    )
+
             req_ids_to_add.append(req_id)
 
         # Update the cached states of the resumed requests.
@@ -313,6 +369,11 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"):
                arange,
                out=positions_np)
 
+        # Calculate M-RoPE positions.
+        # Only relevant for models using M-RoPE (e.g, Qwen2-VL)
+        if self.model_config.uses_mrope:
+            self._calc_mrope_positions(scheduler_output)
+
         # Get token indices.
         # E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
         # -> [0, 1, M, M + 1, M + 2, M + 3, M + 4, 2 * M, 2 * M + 1, 2 * M + 2]
@@ -359,8 +420,16 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"):
         # Copy the tensors to the GPU.
         self.input_ids[:total_num_scheduled_tokens].copy_(
             self.input_ids_cpu[:total_num_scheduled_tokens], non_blocking=True)
-        self.positions[:total_num_scheduled_tokens].copy_(
-            self.positions_cpu[:total_num_scheduled_tokens], non_blocking=True)
+        if self.model_config.uses_mrope:
+            # Only relevant for models using M-RoPE (e.g, Qwen2-VL)
+            self.mrope_positions[:, :total_num_scheduled_tokens].copy_(
+                self.mrope_positions_cpu[:, :total_num_scheduled_tokens],
+                non_blocking=True)
+        else:
+            # Common case (1D positions)
+            self.positions[:total_num_scheduled_tokens].copy_(
+                self.positions_cpu[:total_num_scheduled_tokens],
+                non_blocking=True)
         query_start_loc = self.query_start_loc_cpu[:num_reqs + 1].to(
             self.device, non_blocking=True)
         seq_start_loc = self.seq_start_loc_cpu[:num_reqs + 1].to(
@@ -472,6 +541,61 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"):
         logits_indices = query_start_loc[1:] - 1
         return attn_metadata, logits_indices
 
+    def _calc_mrope_positions(self, scheduler_output: "SchedulerOutput"):
+        mrope_pos_ptr = 0
+        num_reqs = self.input_batch.num_reqs
+        for index, req_id in enumerate(self.input_batch.req_ids[:num_reqs]):
+            assert req_id is not None
+
+            req = self.requests[req_id]
+            assert req.mrope_positions is not None
+
+            num_computed_tokens = \
+                self.input_batch.num_computed_tokens_cpu[index]
+            num_scheduled_tokens = \
+                scheduler_output.num_scheduled_tokens[req_id]
+            num_prompt_tokens = len(req.prompt_token_ids)
+
+            if num_computed_tokens + num_scheduled_tokens > num_prompt_tokens:
+                prompt_part_len = max(0,
+                                      num_prompt_tokens - num_computed_tokens)
+                completion_part_len = max(
+                    0, num_scheduled_tokens - prompt_part_len)
+            else:
+                prompt_part_len = num_scheduled_tokens
+                completion_part_len = 0
+
+            assert num_scheduled_tokens == prompt_part_len + completion_part_len
+
+            if prompt_part_len > 0:
+                # prompt's mrope_positions are pre-computed
+                dst_start = mrope_pos_ptr
+                dst_end = mrope_pos_ptr + prompt_part_len
+                src_start = num_computed_tokens
+                src_end = num_computed_tokens + prompt_part_len
+
+                self.mrope_positions_cpu[:, dst_start:dst_end] = \
+                    req.mrope_positions[:,src_start:src_end]
+
+                mrope_pos_ptr += prompt_part_len
+
+            if completion_part_len > 0:
+                # compute completion's mrope_positions on-the-fly
+                dst_start = mrope_pos_ptr
+                dst_end = mrope_pos_ptr + completion_part_len
+
+                self.mrope_positions_cpu[:, dst_start:dst_end] = \
+                    MRotaryEmbedding.get_next_input_positions_tensor(
+                        req.mrope_position_delta,
+                        context_len=num_computed_tokens +
+                        prompt_part_len,
+                        seq_len=num_computed_tokens +
+                        prompt_part_len +
+                        completion_part_len,
+                    )
+
+                mrope_pos_ptr += completion_part_len
+
     def _prepare_sampling(
         self,
         scheduler_output: "SchedulerOutput",
@@ -618,9 +742,12 @@ def execute_model(
         # Run the decoder.
         # Use persistent buffers for CUDA graphs.
         with set_forward_context(attn_metadata, self.vllm_config):
+            positions = self.mrope_positions[:, :num_input_tokens] \
+                if self.model_config.uses_mrope \
+                else self.positions[:num_input_tokens]
             hidden_states = self.model(
                 input_ids=input_ids,
-                positions=self.positions[:num_input_tokens],
+                positions=positions,
                 kv_caches=self.kv_caches,
                 attn_metadata=None,
                 inputs_embeds=inputs_embeds,
@@ -707,9 +834,12 @@ def _dummy_run(
             input_ids = self.input_ids[:num_tokens]
             inputs_embeds = None
         with set_forward_context(None, self.vllm_config):
+            positions = self.mrope_positions[:, :num_tokens] \
+                if self.model_config.uses_mrope \
+                else self.positions[:num_tokens]
             hidden_states = model(
                 input_ids=input_ids,
-                positions=self.positions[:num_tokens],
+                positions=positions,
                 kv_caches=kv_caches,
                 attn_metadata=None,
                 inputs_embeds=inputs_embeds,

From bbe5f9de7dab1fa905807577faa185d85040213a Mon Sep 17 00:00:00 2001
From: Martin Gleize <mgleize@meta.com>
Date: Sun, 19 Jan 2025 19:40:40 +0100
Subject: [PATCH 275/462] [Model] Support for fairseq2 Llama (#11442)

Signed-off-by: Martin Gleize <mgleize@meta.com>
Co-authored-by: mgleize user <mgleize@a100-st-p4de24xlarge-4.fair-a100.hpcaas>
---
 tests/models/registry.py                     |   1 +
 tests/weight_loading/models.txt              |   3 +-
 tests/weight_loading/test_weight_loading.py  |  13 +-
 vllm/model_executor/layers/linear.py         |  34 +++--
 vllm/model_executor/model_loader/loader.py   |  15 +-
 vllm/model_executor/models/fairseq2_llama.py | 151 +++++++++++++++++++
 vllm/model_executor/models/registry.py       |   1 +
 7 files changed, 197 insertions(+), 21 deletions(-)
 create mode 100644 vllm/model_executor/models/fairseq2_llama.py

diff --git a/tests/models/registry.py b/tests/models/registry.py
index 938c838617e8b..cb0521cfe80a7 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -69,6 +69,7 @@ class _HfExamplesInfo:
     "DeepseekV3ForCausalLM": _HfExamplesInfo("deepseek-ai/DeepSeek-V3",  # noqa: E501
                                          trust_remote_code=True),
     "ExaoneForCausalLM": _HfExamplesInfo("LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct"),  # noqa: E501
+    "Fairseq2LlamaForCausalLM": _HfExamplesInfo("mgleize/fairseq2-dummy-Llama-3.2-1B"),  # noqa: E501
     "FalconForCausalLM": _HfExamplesInfo("tiiuae/falcon-7b"),
     "GemmaForCausalLM": _HfExamplesInfo("google/gemma-2b"),
     "Gemma2ForCausalLM": _HfExamplesInfo("google/gemma-2-9b"),
diff --git a/tests/weight_loading/models.txt b/tests/weight_loading/models.txt
index a06956ce18a93..272206d4502e9 100644
--- a/tests/weight_loading/models.txt
+++ b/tests/weight_loading/models.txt
@@ -30,4 +30,5 @@ marlin, nm-testing/zephyr-beta-7b-marlin-g128, main
 marlin, robertgshaw2/zephyr-7b-beta-channelwise-marlin, main
 qqq, HandH1998/QQQ-Llama-3-8b-g128, main
 qqq, HandH1998/QQQ-Llama-3-8b, main
-hqq, nm-testing/Llama-3.2-1B-Instruct-HQQ, main
\ No newline at end of file
+hqq, nm-testing/Llama-3.2-1B-Instruct-HQQ, main
+None, mgleize/fairseq2-dummy-Llama-3.2-1B, main
\ No newline at end of file
diff --git a/tests/weight_loading/test_weight_loading.py b/tests/weight_loading/test_weight_loading.py
index 199731bdc21fe..7a3786456d0d6 100644
--- a/tests/weight_loading/test_weight_loading.py
+++ b/tests/weight_loading/test_weight_loading.py
@@ -20,12 +20,13 @@ def test_weight_loading(vllm_runner):
     """
     Test parameter weight loading with tp>1.
     """
-    with vllm_runner(model_name=MODEL_NAME,
-                     revision=REVISION,
-                     dtype=torch.half if QUANTIZATION == "gptq" else "auto",
-                     quantization=QUANTIZATION,
-                     max_model_len=MAX_MODEL_LEN,
-                     tensor_parallel_size=2) as model:
+    with vllm_runner(
+            model_name=MODEL_NAME,
+            revision=REVISION,
+            dtype=torch.half if QUANTIZATION == "gptq" else "auto",
+            quantization=None if QUANTIZATION == "None" else QUANTIZATION,
+            max_model_len=MAX_MODEL_LEN,
+            tensor_parallel_size=2) as model:
 
         output = model.generate_greedy("Hello world!", max_tokens=20)
         print(output)
diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index 00ae64bbe6388..52263e96fb9f9 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -344,11 +344,13 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
             param.materialize(loaded_weight.shape, dtype=loaded_weight.dtype)
 
         use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False)
+        is_sharded_weight = getattr(param, "is_sharded_weight", False)
+        # bitsandbytes loads the weights of the specific portion
+        # no need to narrow
+        is_sharded_weight = is_sharded_weight or use_bitsandbytes_4bit
 
         param_data = param.data
-        # bitsandbytes loads the weights of the specific portion
-        # no need to narrow here
-        if output_dim is not None and not use_bitsandbytes_4bit:
+        if output_dim is not None and not is_sharded_weight:
             shard_size = param_data.shape[output_dim]
             start_idx = tp_rank * shard_size
             loaded_weight = loaded_weight.narrow(output_dim, start_idx,
@@ -546,6 +548,11 @@ def weight_loader(self,
 
             use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit",
                                             False)
+            is_sharded_weight = getattr(param, "is_sharded_weight", False)
+            # bitsandbytes loads the weights of the specific portion
+            # no need to narrow
+            is_sharded_weight = is_sharded_weight or use_bitsandbytes_4bit
+
             if use_bitsandbytes_4bit:
                 shard_size = loaded_weight.shape[output_dim]
                 shard_offset = loaded_weight.shape[output_dim] * \
@@ -554,9 +561,7 @@ def weight_loader(self,
             param_data = param_data.narrow(output_dim, shard_offset,
                                            shard_size)
             start_idx = tp_rank * shard_size
-            # bitsandbytes loads the weights of the specific portion
-            # no need to narrow here
-            if not use_bitsandbytes_4bit:
+            if not is_sharded_weight:
                 loaded_weight = loaded_weight.narrow(output_dim, start_idx,
                                                      shard_size)
         # Special case for AQLM codebooks.
@@ -941,6 +946,11 @@ def weight_loader(self,
 
             use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit",
                                             False)
+            is_sharded_weight = getattr(param, "is_sharded_weight", False)
+            # bitsandbytes loads the weights of the specific portion
+            # no need to narrow
+            is_sharded_weight = is_sharded_weight or use_bitsandbytes_4bit
+
             if use_bitsandbytes_4bit:
                 orig_qkv_offsets = {
                     "q": (0, self.num_heads * self.head_size),
@@ -964,9 +974,7 @@ def weight_loader(self,
                 shard_id = tp_rank // self.num_kv_head_replicas
             start_idx = shard_id * shard_size
 
-            # bitsandbytes loads the weights of the specific portion
-            # no need to narrow here
-            if not use_bitsandbytes_4bit:
+            if not is_sharded_weight:
                 loaded_weight = loaded_weight.narrow(output_dim, start_idx,
                                                      shard_size)
 
@@ -1070,6 +1078,10 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
         tp_size = get_tensor_model_parallel_world_size()
         input_dim = getattr(param, "input_dim", None)
         use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False)
+        is_sharded_weight = getattr(param, "is_sharded_weight", False)
+        # bitsandbytes loads the weights of the specific portion
+        # no need to narrow
+        is_sharded_weight = is_sharded_weight or use_bitsandbytes_4bit
 
         # Special case for GGUF
         is_gguf_weight = getattr(param, "is_gguf_weight", False)
@@ -1085,9 +1097,7 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
             param.materialize(tuple(weight_shape), dtype=loaded_weight.dtype)
 
         param_data = param.data
-        # bitsandbytes loads the weights of the specific portion
-        # no need to narrow here
-        if input_dim is not None and not use_bitsandbytes_4bit:
+        if input_dim is not None and not is_sharded_weight:
             shard_size = param_data.shape[input_dim]
             start_idx = tp_rank * shard_size
             loaded_weight = loaded_weight.narrow(input_dim, start_idx,
diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index e6e37358482fc..f697c3245f098 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -182,6 +182,9 @@ class Source:
         fall_back_to_pt: bool = True
         """Whether .pt weights can be used."""
 
+        allow_patterns_overrides: Optional[list[str]] = None
+        """If defined, weights will load exclusively using these patterns."""
+
     def __init__(self, load_config: LoadConfig):
         super().__init__(load_config)
         if load_config.model_loader_extra_config:
@@ -218,6 +221,7 @@ def _prepare_weights(
         model_name_or_path: str,
         revision: Optional[str],
         fall_back_to_pt: bool,
+        allow_patterns_overrides: Optional[list[str]],
     ) -> Tuple[str, List[str], bool]:
         """Prepare weights for the model.
 
@@ -249,6 +253,9 @@ def _prepare_weights(
         if fall_back_to_pt:
             allow_patterns += ["*.pt"]
 
+        if allow_patterns_overrides is not None:
+            allow_patterns = allow_patterns_overrides
+
         if not is_local:
             hf_folder = download_weights_from_hf(
                 model_name_or_path,
@@ -298,7 +305,8 @@ def _get_weights_iterator(
     ) -> Generator[Tuple[str, torch.Tensor], None, None]:
         """Get an iterator for the model weights based on the load format."""
         hf_folder, hf_weights_files, use_safetensors = self._prepare_weights(
-            source.model_or_path, source.revision, source.fall_back_to_pt)
+            source.model_or_path, source.revision, source.fall_back_to_pt,
+            source.allow_patterns_overrides)
         if self.load_config.load_format == LoadFormat.NPCACHE:
             # Currently np_cache only support *.bin checkpoints
             assert use_safetensors is False
@@ -340,6 +348,8 @@ def _get_all_weights(
             prefix="",
             fall_back_to_pt=getattr(model, "fall_back_to_pt_during_load",
                                     True),
+            allow_patterns_overrides=getattr(model, "allow_patterns_overrides",
+                                             None),
         )
         yield from self._get_weights_iterator(primary_weights)
 
@@ -353,7 +363,8 @@ def _get_all_weights(
     def download_model(self, model_config: ModelConfig) -> None:
         self._prepare_weights(model_config.model,
                               model_config.revision,
-                              fall_back_to_pt=True)
+                              fall_back_to_pt=True,
+                              allow_patterns_overrides=None)
 
     def load_model(self, vllm_config: VllmConfig) -> nn.Module:
         device_config = vllm_config.device_config
diff --git a/vllm/model_executor/models/fairseq2_llama.py b/vllm/model_executor/models/fairseq2_llama.py
new file mode 100644
index 0000000000000..b93a68680375d
--- /dev/null
+++ b/vllm/model_executor/models/fairseq2_llama.py
@@ -0,0 +1,151 @@
+# Copyright 2024 The vLLM team.
+# Copyright 2024 Meta Platforms, Inc. and affiliates. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Llama model for fairseq2 weights."""
+
+from typing import Iterable, Set, Tuple
+
+import torch
+from torch.nn import Parameter
+
+from vllm.config import VllmConfig
+from vllm.distributed import (get_tensor_model_parallel_rank,
+                              get_tensor_model_parallel_world_size)
+from vllm.model_executor.layers.linear import set_weight_attrs
+from vllm.model_executor.models.llama import LlamaForCausalLM
+
+from .utils import AutoWeightsLoader, WeightsMapper
+
+
+class Fairseq2LlamaForCausalLM(LlamaForCausalLM):
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__(vllm_config=vllm_config, prefix=prefix)
+        self.tp_rank = get_tensor_model_parallel_rank()
+        self.tp_size = get_tensor_model_parallel_world_size()
+        # For the model loader to read only the relevant checkpoint files
+        self.allow_patterns_overrides = [
+            # either the full checkpoint
+            "model.pt",
+            # or the tp-sharded checkpoint of the current rank
+            f"model.{self.tp_rank}.pt",
+        ]
+
+    def load_weights(self, weights: Iterable[Tuple[str,
+                                                   torch.Tensor]]) -> Set[str]:
+        # fairseq2's serialization adds a wrapper to usual .pt state_dict's:
+        # { "model_key": my_model_name, "my_model_name": state_dict }
+        # which we first need to unpack
+        weights_wrapped = dict(weights)
+        weights = weights_wrapped[
+            weights_wrapped["model_key"]].items()  # type: ignore
+
+        # remap keys
+        fs2_to_vllm_mapper = WeightsMapper(
+            orig_to_new_prefix={
+                "decoder_frontend.embed.": "model.embed_tokens.",
+                "decoder.": "model.",
+                "final_proj.": "lm_head.",
+            },
+            orig_to_new_substr={
+                ".self_attn_layer_norm.": ".input_layernorm.",
+                ".ffn_layer_norm.": ".post_attention_layernorm.",
+                ".self_attn.output_proj.": ".self_attn.o_proj.",
+                ".ffn.gate_proj.": ".mlp.gate_proj.",
+                ".ffn.inner_proj.": ".mlp.up_proj.",
+                ".ffn.output_proj.": ".mlp.down_proj.",
+                ".layer_norm.": ".norm.",
+            },
+        )
+        weights = fs2_to_vllm_mapper.apply(weights)
+
+        params = dict(self.named_parameters())
+
+        loader = AutoWeightsLoader(
+            self,
+            skip_prefixes=(["lm_head."]
+                           if self.config.tie_word_embeddings else None),
+        )
+        return loader.load_weights(
+            (self.reshape_fairseq2_weights(name, loaded_weight, params)
+             for name, loaded_weight in weights))
+
+    def flag_sharded_weights(self, params: dict[str, Parameter]):
+        """Sets the `is_sharded_weight` flag to True for all sharded weights"""
+        for name, param in params.items():
+            modules = name.split(".")
+            if "norm" in name and len(param.size()) < 2:
+                # layer norms are not sharded
+                continue
+            elif any(emb in modules for emb in ["embed_tokens", "lm_head"]):
+                # for now we repeat embedding layers for compatibility
+                continue
+            else:
+                # all other layers are sharded
+                set_weight_attrs(param, {"is_sharded_weight": True})
+
+    def reshape_fairseq2_weights(
+        self,
+        name: str,
+        loaded_weight: torch.Tensor,
+        params: dict[str, Parameter],
+    ) -> Tuple[str, torch.Tensor]:
+        """Reshape fairseq2's weights."""
+
+        def permute(w: torch.Tensor, n_heads: int) -> torch.Tensor:
+            attn_in = self.config.head_dim * n_heads
+            # check for a sharded weight on dim 0
+            if attn_in // self.tp_size == w.size()[0]:
+                attn_in //= self.tp_size
+                n_heads //= self.tp_size
+            attn_out = self.config.hidden_size
+            return (w.view(n_heads, attn_in // n_heads // 2, 2,
+                           attn_out).transpose(1,
+                                               2).reshape(attn_in, attn_out))
+
+        modules = name.split(".")
+
+        # rotary embeds should be sliced
+        if "k_proj" in modules:
+            loaded_weight = permute(loaded_weight,
+                                    self.config.num_key_value_heads)
+
+        elif "q_proj" in modules:
+            loaded_weight = permute(loaded_weight,
+                                    self.config.num_attention_heads)
+
+        # We make the loaded weights compatible with both
+        # full checkpoints and tp sharded checkpoints.
+        # Embeddings are repeated to fit the vocab size.
+        # Other weights are flagged for the weight_loader calls.
+        if any(emb in modules for emb in ["embed_tokens", "lm_head"]):
+            # Embeddings are sharded on dim 0
+            dim = 0
+            # In fairseq2, vocab size has to be divisible by tp_size
+            # so we don't worry about padding
+            if self.tp_size > 1 and loaded_weight.shape[
+                    dim] < self.config.vocab_size:
+                assert loaded_weight.shape[
+                    dim] * self.tp_size == self.config.vocab_size, \
+                        "vocab_size should be divisible by tp_size."
+                repeats = [1] * len(loaded_weight.size())
+                repeats[dim] = self.tp_size
+                # repeat to match vocab size and to be easily 'narrow'able
+                loaded_weight = loaded_weight.repeat(repeats)
+                set_weight_attrs(params[name], {"is_sharded_weight": False})
+                # if embeddings are sharded, the rest is too
+                if "embed_tokens" in modules:
+                    self.flag_sharded_weights(params)
+
+        return name, loaded_weight
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index a71f7f7029c7d..311f91472783b 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -47,6 +47,7 @@
     "DeepseekV3ForCausalLM": ("deepseek_v3", "DeepseekV3ForCausalLM"),
     "ExaoneForCausalLM": ("exaone", "ExaoneForCausalLM"),
     "FalconForCausalLM": ("falcon", "FalconForCausalLM"),
+    "Fairseq2LlamaForCausalLM": ("fairseq2_llama", "Fairseq2LlamaForCausalLM"),
     "GemmaForCausalLM": ("gemma", "GemmaForCausalLM"),
     "Gemma2ForCausalLM": ("gemma2", "Gemma2ForCausalLM"),
     "GlmForCausalLM": ("glm", "GlmForCausalLM"),

From df450aa5671bd3a48929686eb14d8a4324afd91a Mon Sep 17 00:00:00 2001
From: shangmingc <caishangming@linux.alibaba.com>
Date: Mon, 20 Jan 2025 10:56:43 +0800
Subject: [PATCH 276/462] [Bugfix] Fix num_heads value for simple connector
 when tp enabled (#12074)

Signed-off-by: Shangming Cai <caishangming@linux.alibaba.com>
---
 vllm/distributed/kv_transfer/kv_connector/simple_connector.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/distributed/kv_transfer/kv_connector/simple_connector.py b/vllm/distributed/kv_transfer/kv_connector/simple_connector.py
index 4ace03ff1184e..7780e2dfa317d 100644
--- a/vllm/distributed/kv_transfer/kv_connector/simple_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/simple_connector.py
@@ -35,6 +35,7 @@ def __init__(
     ):
 
         self.config = config.kv_transfer_config
+        self.tp_size = config.parallel_config.tensor_parallel_size
 
         if self.config.kv_connector == "PyNcclConnector":
             from vllm.distributed.kv_transfer.kv_pipe.pynccl_pipe import (
@@ -161,7 +162,7 @@ def send_kv_caches_and_hidden_states(
         end_layer = model_executable.model.end_layer
 
         model_config = model_executable.model.config
-        num_heads = model_config.num_key_value_heads
+        num_heads = int(model_config.num_key_value_heads / self.tp_size)
         hidden_size = model_config.hidden_size
         num_attention_heads = model_config.num_attention_heads
         head_size = int(hidden_size / num_attention_heads)

From 51ef828f10acddbe941c38255c5de7f61738abad Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Mon, 20 Jan 2025 11:37:50 +0800
Subject: [PATCH 277/462] [torch.compile] fix sym_tensor_indices (#12191)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 vllm/compilation/backends.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
index d7f4dcb7a20fc..955c25f300512 100644
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -624,9 +624,13 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable:
         ]
 
         # index of tensors that have symbolic shapes (batch size)
+        # for weights and static buffers, they will have concrete shapes.
+        # symbolic shape only happens for input tensors.
+        from torch.fx.experimental.symbolic_shapes import is_symbolic
         self.sym_tensor_indices = [
             i for i, x in enumerate(fake_args)
-            if isinstance(x, torch._subclasses.fake_tensor.FakeTensor)
+            if isinstance(x, torch._subclasses.fake_tensor.FakeTensor) and \
+                any(is_symbolic(d) for d in x.size())
         ]
 
         # compiler managed cudagraph input buffers

From 3ea7b94523f748faf464293ca5fdc4c94e3a3a89 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Mon, 20 Jan 2025 06:58:01 +0000
Subject: [PATCH 278/462] Move linting to `pre-commit` (#11975)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 .../scripts/nightly-annotate.sh               |   2 +-
 .github/workflows/actionlint.yml              |  40 --
 .github/workflows/clang-format.yml            |  53 --
 .github/workflows/codespell.yml               |  45 --
 .github/workflows/doc-lint.yml                |  32 -
 .github/workflows/dummy.yml                   |  20 +
 .github/workflows/matchers/ruff.json          |  17 -
 .github/workflows/mypy.yaml                   |  51 --
 .github/workflows/png-lint.yml                |  37 --
 .github/workflows/pre-commit.yml              |  17 +
 .github/workflows/ruff.yml                    |  52 --
 .github/workflows/shellcheck.yml              |  37 --
 .github/workflows/yapf.yml                    |  38 --
 .pre-commit-config.yaml                       |  73 +++
 csrc/core/scalar_type.hpp                     |   2 +-
 csrc/cpu/cpu_types.hpp                        |   6 +-
 csrc/cpu/cpu_types_arm.hpp                    | 549 +++++++++---------
 csrc/cpu/cpu_types_vsx.hpp                    | 254 ++++----
 csrc/cpu/cpu_types_x86.hpp                    | 311 +++++-----
 csrc/cutlass_extensions/common.hpp            |   3 +-
 docs/source/contributing/overview.md          |  13 +-
 format.sh                                     | 321 ----------
 pyproject.toml                                |   8 +
 requirements-lint.txt                         |  15 +-
 tools/actionlint.sh                           |  13 -
 tools/doc-lint.sh                             |   3 -
 26 files changed, 725 insertions(+), 1287 deletions(-)
 delete mode 100644 .github/workflows/actionlint.yml
 delete mode 100644 .github/workflows/clang-format.yml
 delete mode 100644 .github/workflows/codespell.yml
 delete mode 100644 .github/workflows/doc-lint.yml
 create mode 100644 .github/workflows/dummy.yml
 delete mode 100644 .github/workflows/matchers/ruff.json
 delete mode 100644 .github/workflows/mypy.yaml
 delete mode 100644 .github/workflows/png-lint.yml
 create mode 100644 .github/workflows/pre-commit.yml
 delete mode 100644 .github/workflows/ruff.yml
 delete mode 100644 .github/workflows/shellcheck.yml
 delete mode 100644 .github/workflows/yapf.yml
 create mode 100644 .pre-commit-config.yaml
 delete mode 100755 format.sh
 delete mode 100755 tools/actionlint.sh
 delete mode 100755 tools/doc-lint.sh

diff --git a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
index 686f70dbece6c..69b6b146b3549 100644
--- a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
+++ b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
@@ -43,7 +43,7 @@ main() {
     
 
-    # The figures should be genereated by a separate process outside the CI/CD pipeline
+    # The figures should be generated by a separate process outside the CI/CD pipeline
 
     # # generate figures
     # python3 -m pip install tabulate pandas matplotlib
diff --git a/.github/workflows/actionlint.yml b/.github/workflows/actionlint.yml
deleted file mode 100644
index 0226cf0ca00e9..0000000000000
--- a/.github/workflows/actionlint.yml
+++ /dev/null
@@ -1,40 +0,0 @@
-name: Lint GitHub Actions workflows
-on:
-  push:
-    branches:
-      - "main"
-    paths:
-      - '.github/workflows/*.ya?ml'
-      - '.github/workflows/actionlint.*'
-      - '.github/workflows/matchers/actionlint.json'
-  pull_request:
-    branches:
-      - "main"
-    paths:
-      - '.github/workflows/*.ya?ml'
-      - '.github/workflows/actionlint.*'
-      - '.github/workflows/matchers/actionlint.json'
-
-env:
-  LC_ALL: en_US.UTF-8
-
-defaults:
-  run:
-    shell: bash
-
-permissions:
-  contents: read
-
-jobs:
-  actionlint:
-    runs-on: ubuntu-latest
-    steps:
-      - name: "Checkout"
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-        with:
-          fetch-depth: 0
-
-      - name: "Run actionlint"
-        run: |
-          echo "::add-matcher::.github/workflows/matchers/actionlint.json"
-          tools/actionlint.sh -color
diff --git a/.github/workflows/clang-format.yml b/.github/workflows/clang-format.yml
deleted file mode 100644
index 68149d2dc019f..0000000000000
--- a/.github/workflows/clang-format.yml
+++ /dev/null
@@ -1,53 +0,0 @@
-name: clang-format
-
-on:
-  # Trigger the workflow on push or pull request,
-  # but only for the main branch
-  push:
-    branches:
-      - main
-    paths:
-      - '**/*.h'
-      - '**/*.cpp'
-      - '**/*.cu'
-      - '**/*.cuh'
-      - '.github/workflows/clang-format.yml'
-  pull_request:
-    branches:
-      - main
-    paths:
-      - '**/*.h'
-      - '**/*.cpp'
-      - '**/*.cu'
-      - '**/*.cuh'
-      - '.github/workflows/clang-format.yml'
-
-jobs:
-  clang-format:
-    runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        python-version: ["3.11"]
-    steps:
-    - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-    - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
-      with:
-        python-version: ${{ matrix.python-version }}
-    - name: Install dependencies
-      run: |
-        python -m pip install --upgrade pip
-        pip install clang-format==18.1.5
-    - name: Running clang-format
-      run: |
-        EXCLUDES=(
-            'csrc/moe/topk_softmax_kernels.cu'
-            'csrc/quantization/gguf/ggml-common.h'
-            'csrc/quantization/gguf/dequantize.cuh'
-            'csrc/quantization/gguf/vecdotq.cuh'
-            'csrc/quantization/gguf/mmq.cuh'
-            'csrc/quantization/gguf/mmvq.cuh'
-        )
-        find csrc/ \( -name '*.h' -o -name '*.cpp' -o -name '*.cu' -o -name '*.cuh' \) -print \
-            | grep -vFf <(printf "%s\n" "${EXCLUDES[@]}") \
-            | xargs clang-format --dry-run --Werror
diff --git a/.github/workflows/codespell.yml b/.github/workflows/codespell.yml
deleted file mode 100644
index 68887adaae54b..0000000000000
--- a/.github/workflows/codespell.yml
+++ /dev/null
@@ -1,45 +0,0 @@
-name: codespell
-
-on:
-  # Trigger the workflow on push or pull request,
-  # but only for the main branch
-  push:
-    branches:
-      - main
-    paths:
-      - "**/*.py"
-      - "**/*.md"
-      - "**/*.rst"
-      - pyproject.toml
-      - requirements-lint.txt
-      - .github/workflows/codespell.yml
-  pull_request:
-    branches:
-      - main
-    paths:
-      - "**/*.py"
-      - "**/*.md"
-      - "**/*.rst"
-      - pyproject.toml
-      - requirements-lint.txt
-      - .github/workflows/codespell.yml
-
-jobs:
-  codespell:
-    runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        python-version: ["3.12"]
-    steps:
-    - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-    - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
-      with:
-        python-version: ${{ matrix.python-version }}
-    - name: Install dependencies
-      run: |
-        python -m pip install --upgrade pip
-        pip install -r requirements-lint.txt
-    - name: Spelling check with codespell
-      run: |
-        codespell --toml pyproject.toml
diff --git a/.github/workflows/doc-lint.yml b/.github/workflows/doc-lint.yml
deleted file mode 100644
index 2f5ee8bbfd8c5..0000000000000
--- a/.github/workflows/doc-lint.yml
+++ /dev/null
@@ -1,32 +0,0 @@
-name: Lint documentation
-
-on:
-  push:
-    branches:
-      - main
-    paths:
-      - "docs/**"
-  pull_request:
-    branches:
-      - main
-    paths:
-      - "docs/**"
-
-jobs:
-  doc-lint:
-    runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        python-version: ["3.12"]
-    steps:
-      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-      - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
-        with:
-          python-version: ${{ matrix.python-version }}
-      - name: Install dependencies
-        run: |
-          python -m pip install --upgrade pip
-          pip install -r requirements-lint.txt
-      - name: Linting docs
-        run: tools/doc-lint.sh
diff --git a/.github/workflows/dummy.yml b/.github/workflows/dummy.yml
new file mode 100644
index 0000000000000..ea507fab6b2de
--- /dev/null
+++ b/.github/workflows/dummy.yml
@@ -0,0 +1,20 @@
+name: dummy-checks
+
+on:
+  pull_request:
+
+jobs:
+  mypy:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.12"]
+    steps:
+      - run: echo "This is a dummy step that always passes"
+  ruff:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.12"]
+    steps:
+      - run: echo "This is a dummy step that always passes"
diff --git a/.github/workflows/matchers/ruff.json b/.github/workflows/matchers/ruff.json
deleted file mode 100644
index f6d4479ee1996..0000000000000
--- a/.github/workflows/matchers/ruff.json
+++ /dev/null
@@ -1,17 +0,0 @@
-{
-    "problemMatcher": [
-      {
-        "owner": "ruff",
-        "pattern": [
-          {
-            "regexp": "^(.+?):(\\d+):(\\d+): (\\w+): (.+)$",
-            "file": 1,
-            "line": 2,
-            "column": 3,
-            "code": 4,
-            "message": 5
-          }
-        ]
-      }
-    ]
-  }
diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml
deleted file mode 100644
index 73eeacf1fa562..0000000000000
--- a/.github/workflows/mypy.yaml
+++ /dev/null
@@ -1,51 +0,0 @@
-name: mypy
-
-on:
-  # Trigger the workflow on push or pull request,
-  # but only for the main branch
-  push:
-    branches:
-      - main
-    paths:
-      - '**/*.py'
-      - '.github/workflows/mypy.yaml'
-      - 'tools/mypy.sh'
-      - 'pyproject.toml'
-  pull_request:
-    branches:
-      - main
-    # This workflow is only relevant when one of the following files changes.
-    # However, we have github configured to expect and require this workflow
-    # to run and pass before github with auto-merge a pull request. Until github
-    # allows more flexible auto-merge policy, we can just run this on every PR.
-    # It doesn't take that long to run, anyway.
-    #paths:
-    #  - '**/*.py'
-    #  - '.github/workflows/mypy.yaml'
-    #  - 'tools/mypy.sh'
-    #  - 'pyproject.toml'
-
-jobs:
-  mypy:
-    runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        python-version: ["3.9", "3.10", "3.11", "3.12"]
-    steps:
-    - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-    - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
-      with:
-        python-version: ${{ matrix.python-version }}
-    - name: Install dependencies
-      run: |
-        python -m pip install --upgrade pip
-        pip install mypy==1.11.1
-        pip install types-setuptools
-        pip install types-PyYAML
-        pip install types-requests
-        pip install types-setuptools
-    - name: Mypy
-      run: |
-        echo "::add-matcher::.github/workflows/matchers/mypy.json"
-        tools/mypy.sh 1 ${{ matrix.python-version }}
diff --git a/.github/workflows/png-lint.yml b/.github/workflows/png-lint.yml
deleted file mode 100644
index 4932af943a07b..0000000000000
--- a/.github/workflows/png-lint.yml
+++ /dev/null
@@ -1,37 +0,0 @@
-name: Lint PNG exports from excalidraw
-on:
-  push:
-    branches:
-      - "main"
-    paths:
-      - '*.excalidraw.png'
-      - '.github/workflows/png-lint.yml'
-  pull_request:
-    branches:
-      - "main"
-    paths:
-      - '*.excalidraw.png'
-      - '.github/workflows/png-lint.yml'
-
-env:
-  LC_ALL: en_US.UTF-8
-
-defaults:
-  run:
-    shell: bash
-
-permissions:
-  contents: read
-
-jobs:
-  actionlint:
-    runs-on: ubuntu-latest
-    steps:
-      - name: "Checkout"
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-        with:
-          fetch-depth: 0
-
-      - name: "Run png-lint.sh to check excalidraw exported images"
-        run: |
-          tools/png-lint.sh
diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml
new file mode 100644
index 0000000000000..8c72a709cf330
--- /dev/null
+++ b/.github/workflows/pre-commit.yml
@@ -0,0 +1,17 @@
+name: pre-commit
+
+on:
+  pull_request:
+  push:
+    branches: [main]
+
+jobs:
+  pre-commit:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+    - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
+      with:
+        python-version: "3.12"
+    - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"
+    - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml
deleted file mode 100644
index 7266cc378cfb0..0000000000000
--- a/.github/workflows/ruff.yml
+++ /dev/null
@@ -1,52 +0,0 @@
-name: ruff
-
-on:
-  # Trigger the workflow on push or pull request,
-  # but only for the main branch
-  push:
-    branches:
-      - main
-    paths:
-      - "**/*.py"
-      - pyproject.toml
-      - requirements-lint.txt
-      - .github/workflows/matchers/ruff.json
-      - .github/workflows/ruff.yml
-  pull_request:
-    branches:
-      - main
-    # This workflow is only relevant when one of the following files changes.
-    # However, we have github configured to expect and require this workflow
-    # to run and pass before github with auto-merge a pull request. Until github
-    # allows more flexible auto-merge policy, we can just run this on every PR.
-    # It doesn't take that long to run, anyway.
-    #paths:
-    #  - "**/*.py"
-    #  - pyproject.toml
-    #  - requirements-lint.txt
-    #  - .github/workflows/matchers/ruff.json
-    #  - .github/workflows/ruff.yml
-
-jobs:
-  ruff:
-    runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        python-version: ["3.12"]
-    steps:
-      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-      - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
-        with:
-          python-version: ${{ matrix.python-version }}
-      - name: Install dependencies
-        run: |
-          python -m pip install --upgrade pip
-          pip install -r requirements-lint.txt
-      - name: Analysing the code with ruff
-        run: |
-          echo "::add-matcher::.github/workflows/matchers/ruff.json"
-          ruff check --output-format github .
-      - name: Run isort
-        run: |
-          isort . --check-only
diff --git a/.github/workflows/shellcheck.yml b/.github/workflows/shellcheck.yml
deleted file mode 100644
index 4b1587e373e17..0000000000000
--- a/.github/workflows/shellcheck.yml
+++ /dev/null
@@ -1,37 +0,0 @@
-name: Lint shell scripts
-on:
-  push:
-    branches:
-      - "main"
-    paths:
-      - '**/*.sh'
-      - '.github/workflows/shellcheck.yml'
-  pull_request:
-    branches:
-      - "main"
-    paths:
-      - '**/*.sh'
-      - '.github/workflows/shellcheck.yml'
-
-env:
-  LC_ALL: en_US.UTF-8
-
-defaults:
-  run:
-    shell: bash
-
-permissions:
-  contents: read
-
-jobs:
-  shellcheck:
-    runs-on: ubuntu-latest
-    steps:
-      - name: "Checkout"
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-        with:
-          fetch-depth: 0
-
-      - name: "Check shell scripts"
-        run: |
-          tools/shellcheck.sh
diff --git a/.github/workflows/yapf.yml b/.github/workflows/yapf.yml
deleted file mode 100644
index ff441f94435ad..0000000000000
--- a/.github/workflows/yapf.yml
+++ /dev/null
@@ -1,38 +0,0 @@
-name: yapf
-
-on:
-  # Trigger the workflow on push or pull request,
-  # but only for the main branch
-  push:
-    branches:
-      - main
-    paths:
-      - "**/*.py"
-      - .github/workflows/yapf.yml
-  pull_request:
-    branches:
-      - main
-    paths:
-      - "**/*.py"
-      - .github/workflows/yapf.yml
-
-jobs:
-  yapf:
-    runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        python-version: ["3.12"]
-    steps:
-      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-      - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
-        with:
-          python-version: ${{ matrix.python-version }}
-      - name: Install dependencies
-        run: |
-          python -m pip install --upgrade pip
-          pip install yapf==0.32.0
-          pip install toml==0.10.2
-      - name: Running yapf
-        run: |
-          yapf --diff --recursive .
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000000000..8ea0f37885d9f
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,73 @@
+repos:
+- repo: https://github.com/google/yapf
+  rev: v0.32.0
+  hooks:
+  - id: yapf
+    args: [--in-place, --verbose]
+    additional_dependencies: [toml] # TODO: Remove when yapf is upgraded
+- repo: https://github.com/astral-sh/ruff-pre-commit
+  rev: v0.6.5
+  hooks:
+  - id: ruff
+    args: [--output-format, github]
+- repo: https://github.com/codespell-project/codespell
+  rev: v2.3.0
+  hooks:
+  - id: codespell
+    exclude: 'benchmarks/sonnet.txt|(build|tests/(lora/data|models/fixtures|prompts))/.*'
+- repo: https://github.com/PyCQA/isort
+  rev: 5.13.2
+  hooks:
+  - id: isort
+- repo: https://github.com/pre-commit/mirrors-clang-format
+  rev: v18.1.5
+  hooks:
+  - id: clang-format
+    exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))'
+    types_or: [c++, cuda]
+    args: [--style=file, --verbose]
+- repo: https://github.com/jackdewinter/pymarkdown
+  rev: v0.9.27
+  hooks:
+  - id: pymarkdown
+    files: docs/.*
+- repo: local
+  hooks:
+  - id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
+    name: Run mypy for Python 3.9
+    entry: tools/mypy.sh 1 "3.9"
+    language: python
+    types: [python]
+    additional_dependencies: &mypy_deps [mypy==1.11.1, types-setuptools, types-PyYAML, types-requests]
+  - id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
+    name: Run mypy for Python 3.10
+    entry: tools/mypy.sh 1 "3.10"
+    language: python
+    types: [python]
+    additional_dependencies: *mypy_deps
+  - id: mypy-3.11 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
+    name: Run mypy for Python 3.11
+    entry: tools/mypy.sh 1 "3.11"
+    language: python
+    types: [python]
+    additional_dependencies: *mypy_deps
+  - id: mypy-3.12 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
+    name: Run mypy for Python 3.12
+    entry: tools/mypy.sh 1 "3.12"
+    language: python
+    types: [python]
+    additional_dependencies: *mypy_deps
+  - id: shellcheck
+    name: Lint shell scripts
+    entry: tools/shellcheck.sh
+    language: script
+    types: [shell]
+  - id: png-lint
+    name: Lint PNG exports from excalidraw
+    entry: tools/png-lint.sh
+    language: script
+    types: [png]
+- repo: https://github.com/rhysd/actionlint
+  rev: v1.7.6
+  hooks:
+  - id: actionlint
diff --git a/csrc/core/scalar_type.hpp b/csrc/core/scalar_type.hpp
index 408e736d5bc0f..c2ae554c9f8e8 100644
--- a/csrc/core/scalar_type.hpp
+++ b/csrc/core/scalar_type.hpp
@@ -32,7 +32,7 @@ class ScalarType {
         signed_(signed_),
         bias(bias),
         finite_values_only(finite_values_only),
-        nan_repr(nan_repr){};
+        nan_repr(nan_repr) {};
 
   static constexpr ScalarType int_(uint8_t size_bits, int32_t bias = 0) {
     return ScalarType(0, size_bits - 1, true, bias);
diff --git a/csrc/cpu/cpu_types.hpp b/csrc/cpu/cpu_types.hpp
index 28db0479748bf..a71815106133a 100644
--- a/csrc/cpu/cpu_types.hpp
+++ b/csrc/cpu/cpu_types.hpp
@@ -2,13 +2,13 @@
 #define CPU_TYPES_HPP
 
 #if defined(__x86_64__)
-  //x86 implementation
+  // x86 implementation
   #include "cpu_types_x86.hpp"
 #elif defined(__POWER9_VECTOR__)
-  //ppc implementation
+  // ppc implementation
   #include "cpu_types_vsx.hpp"
 #elif defined(__aarch64__)
-  //arm implementation
+  // arm implementation
   #include "cpu_types_arm.hpp"
 #else
   #warning "unsupported vLLM cpu implementation"
diff --git a/csrc/cpu/cpu_types_arm.hpp b/csrc/cpu/cpu_types_arm.hpp
index ae062a5b86892..990e99f2fc069 100644
--- a/csrc/cpu/cpu_types_arm.hpp
+++ b/csrc/cpu/cpu_types_arm.hpp
@@ -1,48 +1,50 @@
 #include <arm_neon.h>
-#include <torch/all.h> 
+#include <torch/all.h>
 #include <cmath>
 
 namespace vec_op {
 
 #ifdef ARM_BF16_SUPPORT
-  #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...)                                 \
-    AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__)                         \
-    AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)                         \
-    AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)  
+  #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...)         \
+    AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
+    AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)  \
+    AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)
 #else
-  #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...)                                 \
-    AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__)                         \
+  #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...)         \
+    AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
     AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)
 #endif
 
-#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...)                          \
+#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
   AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
 
 #ifndef CPU_OP_GUARD
-#define CPU_KERNEL_GUARD_IN(NAME)
-#define CPU_KERNEL_GUARD_OUT(NAME)
+  #define CPU_KERNEL_GUARD_IN(NAME)
+  #define CPU_KERNEL_GUARD_OUT(NAME)
 #else
-#define CPU_KERNEL_GUARD_IN(NAME)                                              \
-  std::cout << #NAME << " invoked." << std::endl;
-#define CPU_KERNEL_GUARD_OUT(NAME) std::cout << #NAME << " exit." << std::endl;
+  #define CPU_KERNEL_GUARD_IN(NAME) \
+    std::cout << #NAME << " invoked." << std::endl;
+  #define CPU_KERNEL_GUARD_OUT(NAME) \
+    std::cout << #NAME << " exit." << std::endl;
 #endif
 
 #define FORCE_INLINE __attribute__((always_inline)) inline
 
 namespace {
-  template <typename T, T... indexes, typename F>
-  constexpr void unroll_loop_item(std::integer_sequence<T, indexes...>, F &&f) {
-    (f(std::integral_constant<T, indexes>{}), ...);
-  };
-}; 
+template <typename T, T... indexes, typename F>
+constexpr void unroll_loop_item(std::integer_sequence<T, indexes...>, F&& f) {
+  (f(std::integral_constant<T, indexes>{}), ...);
+};
+};  // namespace
 
 template <typename T, T count, typename F,
           typename = std::enable_if_t<std::is_invocable_v<F, T>>>
-constexpr void unroll_loop(F &&f) {
+constexpr void unroll_loop(F&& f) {
   unroll_loop_item(std::make_integer_sequence<T, count>{}, std::forward<F>(f));
 }
 
-template <typename T> struct Vec {
+template <typename T>
+struct Vec {
   constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; };
 };
 
@@ -54,127 +56,124 @@ struct FP16Vec8 : public Vec<FP16Vec8> {
 
   float16x8_t reg;
 
-  explicit FP16Vec8(const void *ptr)
-      : reg(vld1q_f16(static_cast<const __fp16 *>(ptr))) {};
+  explicit FP16Vec8(const void* ptr)
+      : reg(vld1q_f16(static_cast<const __fp16*>(ptr))) {};
 
-  explicit FP16Vec8(const FP32Vec8 &);
+  explicit FP16Vec8(const FP32Vec8&);
 
-  void save(void *ptr) const {
-    vst1q_f16(static_cast<__fp16 *>(ptr), reg);
-  }
+  void save(void* ptr) const { vst1q_f16(static_cast<__fp16*>(ptr), reg); }
 };
 
 struct FP16Vec16 : public Vec<FP16Vec16> {
-    constexpr static int VEC_ELEM_NUM = 16;
-    
-    float16x8x2_t reg; 
-    
-    explicit FP16Vec16(const void *ptr) {
-        reg.val[0] = vld1q_f16(reinterpret_cast<const __fp16*>(ptr));        
-        reg.val[1] = vld1q_f16(reinterpret_cast<const __fp16*>(ptr) + 8);    
-    }
-    
-    explicit FP16Vec16(const FP32Vec16& vec);
-    
-    void save(void *ptr) const {
-        vst1q_f16(reinterpret_cast<__fp16*>(ptr), reg.val[0]);       
-        vst1q_f16(reinterpret_cast<__fp16*>(ptr) + 8, reg.val[1]);   
+  constexpr static int VEC_ELEM_NUM = 16;
+
+  float16x8x2_t reg;
+
+  explicit FP16Vec16(const void* ptr) {
+    reg.val[0] = vld1q_f16(reinterpret_cast<const __fp16*>(ptr));
+    reg.val[1] = vld1q_f16(reinterpret_cast<const __fp16*>(ptr) + 8);
+  }
+
+  explicit FP16Vec16(const FP32Vec16& vec);
+
+  void save(void* ptr) const {
+    vst1q_f16(reinterpret_cast<__fp16*>(ptr), reg.val[0]);
+    vst1q_f16(reinterpret_cast<__fp16*>(ptr) + 8, reg.val[1]);
+  }
+
+  void save(void* ptr, const int elem_num) const {
+    int full_blocks = elem_num / 8;
+    int remainder = elem_num % 8;
+
+    if (full_blocks > 0) {
+      vst1q_f16(reinterpret_cast<__fp16*>(ptr), reg.val[0]);
+      if (full_blocks > 1) {
+        vst1q_f16(reinterpret_cast<__fp16*>(ptr) + 8, reg.val[1]);
+      }
     }
-    
-    void save(void *ptr, const int elem_num) const {
-        int full_blocks = elem_num / 8;   
-        int remainder = elem_num % 8;     
-        
-        if (full_blocks > 0) {
-            vst1q_f16(reinterpret_cast<__fp16*>(ptr), reg.val[0]);
-            if (full_blocks > 1) {
-                vst1q_f16(reinterpret_cast<__fp16*>(ptr) + 8, reg.val[1]);
-            }
-        }
-
-        // Note: below is the unrolled version of the following code:
-        // 
-        // for (int i = 0; i < remainder; ++i) {
-        //     reinterpret_cast<__fp16*>(ptr)[full_blocks * 8 + i] = 
-        //          vgetq_lane_f16(temp, i);
-        // }
-        // 
-        // For macOS build (Clang), the arm/neon intrinsics function 
-        // `vgetq_lane_f16` needs the parameter `i` to be constant at compile 
-        // time. 
-        
-        if (remainder > 0) {
-            float16x8_t temp = reg.val[full_blocks];
-            __fp16* fp16_ptr = reinterpret_cast<__fp16*>(ptr);
-            switch (remainder)
-            {
-            case 1:
-              fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0);
-              break;
-            case 2:
-              fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0);
-              fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1);
-              break;
-            case 3:
-              fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0);
-              fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1);
-              fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2);
-              break;
-            case 4:
-              fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0);
-              fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1);
-              fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2);
-              fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3);
-              break;
-            case 5:
-              fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0);
-              fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1);
-              fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2);
-              fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3);
-              fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4);
-              break;
-            case 6:
-              fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0);
-              fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1);
-              fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2);
-              fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3);
-              fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4);
-              fp16_ptr[full_blocks * 8 + 5] = vgetq_lane_f16(temp, 5);
-              break;
-            case 7:
-              fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0);
-              fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1);
-              fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2);
-              fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3);
-              fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4);
-              fp16_ptr[full_blocks * 8 + 5] = vgetq_lane_f16(temp, 5);
-              fp16_ptr[full_blocks * 8 + 6] = vgetq_lane_f16(temp, 6);
-              break;
-            
-            default:
-              break;
-            }
-        }
+
+    // Note: below is the unrolled version of the following code:
+    //
+    // for (int i = 0; i < remainder; ++i) {
+    //     reinterpret_cast<__fp16*>(ptr)[full_blocks * 8 + i] =
+    //          vgetq_lane_f16(temp, i);
+    // }
+    //
+    // For macOS build (Clang), the arm/neon intrinsics function
+    // `vgetq_lane_f16` needs the parameter `i` to be constant at compile
+    // time.
+
+    if (remainder > 0) {
+      float16x8_t temp = reg.val[full_blocks];
+      __fp16* fp16_ptr = reinterpret_cast<__fp16*>(ptr);
+      switch (remainder) {
+        case 1:
+          fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0);
+          break;
+        case 2:
+          fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0);
+          fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1);
+          break;
+        case 3:
+          fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0);
+          fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1);
+          fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2);
+          break;
+        case 4:
+          fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0);
+          fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1);
+          fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2);
+          fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3);
+          break;
+        case 5:
+          fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0);
+          fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1);
+          fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2);
+          fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3);
+          fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4);
+          break;
+        case 6:
+          fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0);
+          fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1);
+          fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2);
+          fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3);
+          fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4);
+          fp16_ptr[full_blocks * 8 + 5] = vgetq_lane_f16(temp, 5);
+          break;
+        case 7:
+          fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0);
+          fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1);
+          fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2);
+          fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3);
+          fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4);
+          fp16_ptr[full_blocks * 8 + 5] = vgetq_lane_f16(temp, 5);
+          fp16_ptr[full_blocks * 8 + 6] = vgetq_lane_f16(temp, 6);
+          break;
+
+        default:
+          break;
+      }
     }
+  }
 };
 
-
 #ifdef ARM_BF16_SUPPORT
 struct BF16Vec8 : public Vec<BF16Vec8> {
   constexpr static int VEC_ELEM_NUM = 8;
 
   bfloat16x8_t reg;
 
-  explicit BF16Vec8(const void *ptr)
-      : reg(*reinterpret_cast<const bfloat16x8_t *>(ptr)) {};
+  explicit BF16Vec8(const void* ptr)
+      : reg(*reinterpret_cast<const bfloat16x8_t*>(ptr)) {};
 
   explicit BF16Vec8(bfloat16x8_t data) : reg(data) {};
 
-  explicit BF16Vec8(const FP32Vec8 &);
+  explicit BF16Vec8(const FP32Vec8&);
 
-  explicit BF16Vec8(float32x4x2_t v) : reg(vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[0]), v.val[1])) {};  
+  explicit BF16Vec8(float32x4x2_t v)
+      : reg(vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[0]), v.val[1])) {};
 
-  void save(void *ptr) const { *reinterpret_cast<bfloat16x8_t *>(ptr) = reg; }
+  void save(void* ptr) const { *reinterpret_cast<bfloat16x8_t*>(ptr) = reg; }
 };
 
 struct BF16Vec16 : public Vec<BF16Vec16> {
@@ -182,19 +181,18 @@ struct BF16Vec16 : public Vec<BF16Vec16> {
 
   bfloat16x8x2_t reg;
 
-  explicit BF16Vec16(const void *ptr)
-      : reg(*reinterpret_cast<const bfloat16x8x2_t *>(ptr)) {};
+  explicit BF16Vec16(const void* ptr)
+      : reg(*reinterpret_cast<const bfloat16x8x2_t*>(ptr)) {};
 
   explicit BF16Vec16(bfloat16x8x2_t data) : reg(data) {};
 
-  explicit BF16Vec16(const FP32Vec16 &);
+  explicit BF16Vec16(const FP32Vec16&);
 
-  explicit BF16Vec16(float32x4x4_t v) : reg({
-    vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[0]), v.val[1]),
-    vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[2]), v.val[3])
-  }){};
+  explicit BF16Vec16(float32x4x4_t v)
+      : reg({vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[0]), v.val[1]),
+             vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[2]), v.val[3])}) {};
 
-  void save(void *ptr) const { *reinterpret_cast<bfloat16x8x2_t *>(ptr) = reg; };
+  void save(void* ptr) const { *reinterpret_cast<bfloat16x8x2_t*>(ptr) = reg; };
 };
 
 struct BF16Vec32 : public Vec<BF16Vec32> {
@@ -202,19 +200,15 @@ struct BF16Vec32 : public Vec<BF16Vec32> {
 
   bfloat16x8x4_t reg;
 
-  explicit BF16Vec32(const void *ptr)
-      : reg(*reinterpret_cast<const bfloat16x8x4_t *>(ptr)) {};
+  explicit BF16Vec32(const void* ptr)
+      : reg(*reinterpret_cast<const bfloat16x8x4_t*>(ptr)) {};
 
   explicit BF16Vec32(bfloat16x8x4_t data) : reg(data) {};
 
-  explicit BF16Vec32(const BF16Vec8 &vec8_data) : reg({
-    vec8_data.reg,
-    vec8_data.reg,
-    vec8_data.reg,
-    vec8_data.reg
-  }) {};
+  explicit BF16Vec32(const BF16Vec8& vec8_data)
+      : reg({vec8_data.reg, vec8_data.reg, vec8_data.reg, vec8_data.reg}) {};
 
-  void save(void *ptr) const { *reinterpret_cast<bfloat16x8x4_t *>(ptr) = reg; };
+  void save(void* ptr) const { *reinterpret_cast<bfloat16x8x4_t*>(ptr) = reg; };
 };
 #endif
 
@@ -232,11 +226,11 @@ struct FP32Vec4 : public Vec<FP32Vec4> {
 
   explicit FP32Vec4() : reg(vdupq_n_f32(0.0f)) {};
 
-  explicit FP32Vec4(const float *ptr) : reg(vld1q_f32(ptr)) {};
+  explicit FP32Vec4(const float* ptr) : reg(vld1q_f32(ptr)) {};
 
   explicit FP32Vec4(float32x4_t data) : reg(data) {};
 
-  explicit FP32Vec4(const FP32Vec4 &data) : reg(data.reg) {};
+  explicit FP32Vec4(const FP32Vec4& data) : reg(data.reg) {};
 };
 
 struct FP32Vec8 : public Vec<FP32Vec8> {
@@ -252,32 +246,37 @@ struct FP32Vec8 : public Vec<FP32Vec8> {
 
   explicit FP32Vec8() : reg({vmovq_n_f32(0.0), vmovq_n_f32(0.0)}) {};
 
-  explicit FP32Vec8(const float *ptr) : reg({vld1q_f32(ptr), vld1q_f32(ptr + 4)}) {};
+  explicit FP32Vec8(const float* ptr)
+      : reg({vld1q_f32(ptr), vld1q_f32(ptr + 4)}) {};
 
   explicit FP32Vec8(float32x4x2_t data) : reg(data) {};
 
-  explicit FP32Vec8(const FP32Vec8 &data) : reg(data.reg) {};
+  explicit FP32Vec8(const FP32Vec8& data) : reg(data.reg) {};
 
-  explicit FP32Vec8(const FP16Vec8 &v) {
-        reg.val[0] = vcvt_f32_f16(vget_low_f16(v.reg));  
-        reg.val[1] = vcvt_f32_f16(vget_high_f16(v.reg)); 
-    };
+  explicit FP32Vec8(const FP16Vec8& v) {
+    reg.val[0] = vcvt_f32_f16(vget_low_f16(v.reg));
+    reg.val[1] = vcvt_f32_f16(vget_high_f16(v.reg));
+  };
 
-  explicit FP32Vec8(float16x8_t v) : reg({vcvt_f32_f16(vget_low_f16(v)), vcvt_f32_f16(vget_high_f16(v))}) {};
+  explicit FP32Vec8(float16x8_t v)
+      : reg({vcvt_f32_f16(vget_low_f16(v)), vcvt_f32_f16(vget_high_f16(v))}) {};
 
-  #ifdef ARM_BF16_SUPPORT
+#ifdef ARM_BF16_SUPPORT
 
-  explicit FP32Vec8(bfloat16x8_t v) : reg({vcvtq_low_f32_bf16(v), vcvtq_high_f32_bf16(v)}) {};
+  explicit FP32Vec8(bfloat16x8_t v)
+      : reg({vcvtq_low_f32_bf16(v), vcvtq_high_f32_bf16(v)}) {};
 
-  explicit FP32Vec8(const BF16Vec8 &v) : reg({vcvtq_low_f32_bf16(v.reg), vcvtq_high_f32_bf16(v.reg)}) {};
+  explicit FP32Vec8(const BF16Vec8& v)
+      : reg({vcvtq_low_f32_bf16(v.reg), vcvtq_high_f32_bf16(v.reg)}) {};
 
-  #endif
+#endif
 
   float reduce_sum() const {
     AliasReg ar;
     ar.reg = reg;
     float answer = 0;
-    unroll_loop<int, VEC_ELEM_NUM>([&answer, &ar](int i) { answer += ar.values[i]; });
+    unroll_loop<int, VEC_ELEM_NUM>(
+        [&answer, &ar](int i) { answer += ar.values[i]; });
 
     return answer;
   }
@@ -324,10 +323,14 @@ struct FP32Vec8 : public Vec<FP32Vec8> {
     AliasReg ar;
     ar.reg = reg;
 
-    float32x2_t er_vec0 = {static_cast<float32_t>(erf(ar.values[0])), static_cast<float32_t>(erf(ar.values[1]))};
-    float32x2_t er_vec1 = {static_cast<float32_t>(erf(ar.values[2])), static_cast<float32_t>(erf(ar.values[3]))};
-    float32x2_t er_vec2 = {static_cast<float32_t>(erf(ar.values[4])), static_cast<float32_t>(erf(ar.values[5]))};
-    float32x2_t er_vec3 = {static_cast<float32_t>(erf(ar.values[6])), static_cast<float32_t>(erf(ar.values[7]))};
+    float32x2_t er_vec0 = {static_cast<float32_t>(erf(ar.values[0])),
+                           static_cast<float32_t>(erf(ar.values[1]))};
+    float32x2_t er_vec1 = {static_cast<float32_t>(erf(ar.values[2])),
+                           static_cast<float32_t>(erf(ar.values[3]))};
+    float32x2_t er_vec2 = {static_cast<float32_t>(erf(ar.values[4])),
+                           static_cast<float32_t>(erf(ar.values[5]))};
+    float32x2_t er_vec3 = {static_cast<float32_t>(erf(ar.values[6])),
+                           static_cast<float32_t>(erf(ar.values[7]))};
 
     float32x4_t result0 = vcombine_f32(er_vec0, er_vec1);
     float32x4_t result1 = vcombine_f32(er_vec2, er_vec3);
@@ -337,25 +340,29 @@ struct FP32Vec8 : public Vec<FP32Vec8> {
     result.val[1] = result1;
 
     return FP32Vec8(result);
-  } 
+  }
 
-  FP32Vec8 operator*(const FP32Vec8 &b) const {
-    return FP32Vec8(float32x4x2_t({vmulq_f32(reg.val[0], b.reg.val[0]), vmulq_f32(reg.val[1], b.reg.val[1])}));
+  FP32Vec8 operator*(const FP32Vec8& b) const {
+    return FP32Vec8(float32x4x2_t({vmulq_f32(reg.val[0], b.reg.val[0]),
+                                   vmulq_f32(reg.val[1], b.reg.val[1])}));
   }
 
-  FP32Vec8 operator+(const FP32Vec8 &b) const {
-    return FP32Vec8(float32x4x2_t({vaddq_f32(reg.val[0], b.reg.val[0]), vaddq_f32(reg.val[1], b.reg.val[1])}));
+  FP32Vec8 operator+(const FP32Vec8& b) const {
+    return FP32Vec8(float32x4x2_t({vaddq_f32(reg.val[0], b.reg.val[0]),
+                                   vaddq_f32(reg.val[1], b.reg.val[1])}));
   }
 
-  FP32Vec8 operator-(const FP32Vec8 &b) const {
-    return FP32Vec8(float32x4x2_t({vsubq_f32(reg.val[0], b.reg.val[0]), vsubq_f32(reg.val[1], b.reg.val[1])}));
+  FP32Vec8 operator-(const FP32Vec8& b) const {
+    return FP32Vec8(float32x4x2_t({vsubq_f32(reg.val[0], b.reg.val[0]),
+                                   vsubq_f32(reg.val[1], b.reg.val[1])}));
   }
 
-  FP32Vec8 operator/(const FP32Vec8 &b) const {
-    return FP32Vec8(float32x4x2_t({vdivq_f32(reg.val[0], b.reg.val[0]), vdivq_f32(reg.val[1], b.reg.val[1])}));
+  FP32Vec8 operator/(const FP32Vec8& b) const {
+    return FP32Vec8(float32x4x2_t({vdivq_f32(reg.val[0], b.reg.val[0]),
+                                   vdivq_f32(reg.val[1], b.reg.val[1])}));
   }
 
-  void save(float *ptr) const {
+  void save(float* ptr) const {
     vst1q_f32(ptr, reg.val[0]);
     vst1q_f32(ptr + 4, reg.val[1]);
   }
@@ -370,103 +377,100 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
 
   float32x4x4_t reg;
 
-  explicit FP32Vec16(float v) : reg({vmovq_n_f32(v), vmovq_n_f32(v), vmovq_n_f32(v), vmovq_n_f32(v)}) {}
+  explicit FP32Vec16(float v)
+      : reg({vmovq_n_f32(v), vmovq_n_f32(v), vmovq_n_f32(v), vmovq_n_f32(v)}) {}
 
-  explicit FP32Vec16() : reg({vmovq_n_f32(0.0), vmovq_n_f32(0.0), vmovq_n_f32(0.0), vmovq_n_f32(0.0)}) {}
+  explicit FP32Vec16()
+      : reg({vmovq_n_f32(0.0), vmovq_n_f32(0.0), vmovq_n_f32(0.0),
+             vmovq_n_f32(0.0)}) {}
 
-  explicit FP32Vec16(const float *ptr) : reg({vld1q_f32(ptr), vld1q_f32(ptr + 4), vld1q_f32(ptr + 8), vld1q_f32(ptr + 12)}) {}
+  explicit FP32Vec16(const float* ptr)
+      : reg({vld1q_f32(ptr), vld1q_f32(ptr + 4), vld1q_f32(ptr + 8),
+             vld1q_f32(ptr + 12)}) {}
 
   explicit FP32Vec16(float32x4x4_t data) : reg(data) {}
 
-  explicit FP32Vec16(const FP32Vec8 &data) {
-        reg.val[0] = data.reg.val[0]; 
-        reg.val[1] = data.reg.val[1]; 
-        reg.val[2] = data.reg.val[0]; 
-        reg.val[3] = data.reg.val[1]; 
+  explicit FP32Vec16(const FP32Vec8& data) {
+    reg.val[0] = data.reg.val[0];
+    reg.val[1] = data.reg.val[1];
+    reg.val[2] = data.reg.val[0];
+    reg.val[3] = data.reg.val[1];
   }
 
-  explicit FP32Vec16(const FP32Vec16 &data) : reg(data.reg) {}
+  explicit FP32Vec16(const FP32Vec16& data) : reg(data.reg) {}
 
-  explicit FP32Vec16(const FP16Vec8 &v) : FP32Vec16(FP32Vec8(v.reg)) {}
+  explicit FP32Vec16(const FP16Vec8& v) : FP32Vec16(FP32Vec8(v.reg)) {}
 
-  #ifdef ARM_BF16_SUPPORT
-  explicit FP32Vec16(bfloat16x8x2_t v) : reg({
-    vcvtq_low_f32_bf16(v.val[0]),
-    vcvtq_high_f32_bf16(v.val[0]),
-    vcvtq_low_f32_bf16(v.val[1]),
-    vcvtq_high_f32_bf16(v.val[1])
-  }) {};
-  #endif
+#ifdef ARM_BF16_SUPPORT
+  explicit FP32Vec16(bfloat16x8x2_t v)
+      : reg({vcvtq_low_f32_bf16(v.val[0]), vcvtq_high_f32_bf16(v.val[0]),
+             vcvtq_low_f32_bf16(v.val[1]), vcvtq_high_f32_bf16(v.val[1])}) {};
+#endif
 
-  explicit FP32Vec16(const FP32Vec4 &data) {
+  explicit FP32Vec16(const FP32Vec4& data) {
     reg.val[0] = data.reg;
     reg.val[1] = data.reg;
     reg.val[2] = data.reg;
     reg.val[3] = data.reg;
   };
 
-  #ifdef ARM_BF16_SUPPORT
-  explicit FP32Vec16(const BF16Vec16 &v) : reg({
-    vcvtq_low_f32_bf16(v.reg.val[0]),
-    vcvtq_high_f32_bf16(v.reg.val[0]),
-    vcvtq_low_f32_bf16(v.reg.val[1]),
-    vcvtq_high_f32_bf16(v.reg.val[1])
-  }) {};
-
-  explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {};
-  #endif
-
-  explicit FP32Vec16(const FP16Vec16 &v) {
-      reg.val[0] = vcvt_f32_f16(vget_low_f16(v.reg.val[0]));
-      reg.val[1] = vcvt_f32_f16(vget_high_f16(v.reg.val[0]));
-      reg.val[2] = vcvt_f32_f16(vget_low_f16(v.reg.val[1]));
-      reg.val[3] = vcvt_f32_f16(vget_high_f16(v.reg.val[1]));
+#ifdef ARM_BF16_SUPPORT
+  explicit FP32Vec16(const BF16Vec16& v)
+      : reg({vcvtq_low_f32_bf16(v.reg.val[0]),
+             vcvtq_high_f32_bf16(v.reg.val[0]),
+             vcvtq_low_f32_bf16(v.reg.val[1]),
+             vcvtq_high_f32_bf16(v.reg.val[1])}) {};
+
+  explicit FP32Vec16(const BF16Vec8& v) : FP32Vec16(FP32Vec8(v)) {};
+#endif
+
+  explicit FP32Vec16(const FP16Vec16& v) {
+    reg.val[0] = vcvt_f32_f16(vget_low_f16(v.reg.val[0]));
+    reg.val[1] = vcvt_f32_f16(vget_high_f16(v.reg.val[0]));
+    reg.val[2] = vcvt_f32_f16(vget_low_f16(v.reg.val[1]));
+    reg.val[3] = vcvt_f32_f16(vget_high_f16(v.reg.val[1]));
   };
 
-  FP32Vec16 operator+(const FP32Vec16 &b) const {
-    return FP32Vec16(float32x4x4_t({
-        vaddq_f32(reg.val[0], b.reg.val[0]),
-        vaddq_f32(reg.val[1], b.reg.val[1]),
-        vaddq_f32(reg.val[2], b.reg.val[2]),
-        vaddq_f32(reg.val[3], b.reg.val[3])}));
+  FP32Vec16 operator+(const FP32Vec16& b) const {
+    return FP32Vec16(float32x4x4_t({vaddq_f32(reg.val[0], b.reg.val[0]),
+                                    vaddq_f32(reg.val[1], b.reg.val[1]),
+                                    vaddq_f32(reg.val[2], b.reg.val[2]),
+                                    vaddq_f32(reg.val[3], b.reg.val[3])}));
   };
 
-  FP32Vec16 operator*(const FP32Vec16 &b) const {
-    return FP32Vec16(float32x4x4_t({
-        vmulq_f32(reg.val[0], b.reg.val[0]),
-        vmulq_f32(reg.val[1], b.reg.val[1]),
-        vmulq_f32(reg.val[2], b.reg.val[2]),
-        vmulq_f32(reg.val[3], b.reg.val[3])}));
+  FP32Vec16 operator*(const FP32Vec16& b) const {
+    return FP32Vec16(float32x4x4_t({vmulq_f32(reg.val[0], b.reg.val[0]),
+                                    vmulq_f32(reg.val[1], b.reg.val[1]),
+                                    vmulq_f32(reg.val[2], b.reg.val[2]),
+                                    vmulq_f32(reg.val[3], b.reg.val[3])}));
   };
 
-  FP32Vec16 operator-(const FP32Vec16 &b) const {
-    return FP32Vec16(float32x4x4_t({
-        vsubq_f32(reg.val[0], b.reg.val[0]),
-        vsubq_f32(reg.val[1], b.reg.val[1]),
-        vsubq_f32(reg.val[2], b.reg.val[2]),
-        vsubq_f32(reg.val[3], b.reg.val[3])
-    }));
+  FP32Vec16 operator-(const FP32Vec16& b) const {
+    return FP32Vec16(float32x4x4_t({vsubq_f32(reg.val[0], b.reg.val[0]),
+                                    vsubq_f32(reg.val[1], b.reg.val[1]),
+                                    vsubq_f32(reg.val[2], b.reg.val[2]),
+                                    vsubq_f32(reg.val[3], b.reg.val[3])}));
   };
 
-  FP32Vec16 operator/(const FP32Vec16 &b) const {
-    return FP32Vec16(float32x4x4_t({
-        vdivq_f32(reg.val[0], b.reg.val[0]),
-        vdivq_f32(reg.val[1], b.reg.val[1]),
-        vdivq_f32(reg.val[2], b.reg.val[2]),
-        vdivq_f32(reg.val[3], b.reg.val[3])
-    }));
+  FP32Vec16 operator/(const FP32Vec16& b) const {
+    return FP32Vec16(float32x4x4_t({vdivq_f32(reg.val[0], b.reg.val[0]),
+                                    vdivq_f32(reg.val[1], b.reg.val[1]),
+                                    vdivq_f32(reg.val[2], b.reg.val[2]),
+                                    vdivq_f32(reg.val[3], b.reg.val[3])}));
   };
 
   float reduce_sum() const {
     AliasReg ar;
     ar.reg = reg;
     float answer = 0;
-    unroll_loop<int, VEC_ELEM_NUM>([&answer, &ar](int i) { answer += ar.values[i]; });
+    unroll_loop<int, VEC_ELEM_NUM>(
+        [&answer, &ar](int i) { answer += ar.values[i]; });
 
     return answer;
   };
 
-  template <int group_size> float reduce_sub_sum(int idx) {
+  template <int group_size>
+  float reduce_sub_sum(int idx) {
     static_assert(VEC_ELEM_NUM % group_size == 0);
 
     AliasReg ar;
@@ -479,7 +483,7 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
     return answer;
   };
 
-  void save(float *ptr) const {
+  void save(float* ptr) const {
     vst1q_f32(ptr, reg.val[0]);
     vst1q_f32(ptr + 4, reg.val[1]);
     vst1q_f32(ptr + 8, reg.val[2]);
@@ -487,43 +491,59 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
   };
 };
 
-template <typename T> struct VecType { using vec_type = void; };
+template <typename T>
+struct VecType {
+  using vec_type = void;
+};
 
-template <typename T> using vec_t = typename VecType<T>::vec_type;
+template <typename T>
+using vec_t = typename VecType<T>::vec_type;
 
-template <> struct VecType<float> { using vec_type = FP32Vec8; };
+template <>
+struct VecType<float> {
+  using vec_type = FP32Vec8;
+};
 
-template <> struct VecType<c10::Half> { using vec_type = FP16Vec8; };
+template <>
+struct VecType<c10::Half> {
+  using vec_type = FP16Vec8;
+};
 
 #ifdef ARM_BF16_SUPPORT
-template <> struct VecType<c10::BFloat16> { using vec_type = BF16Vec8; };
+template <>
+struct VecType<c10::BFloat16> {
+  using vec_type = BF16Vec8;
+};
 #endif
 
-template <typename T> void storeFP32(float v, T *ptr) { *ptr = v; }
+template <typename T>
+void storeFP32(float v, T* ptr) {
+  *ptr = v;
+}
 
-template <> inline void storeFP32<c10::Half>(float v, c10::Half *ptr) {
-  *reinterpret_cast<__fp16 *>(ptr) = v;
+template <>
+inline void storeFP32<c10::Half>(float v, c10::Half* ptr) {
+  *reinterpret_cast<__fp16*>(ptr) = v;
 }
 
-inline FP16Vec16::FP16Vec16(const FP32Vec16 &v) {
-    float16x4_t low_0 = vcvt_f16_f32(v.reg.val[0]);
-    float16x4_t high_0 = vcvt_f16_f32(v.reg.val[1]);
-    float16x4_t low_1 = vcvt_f16_f32(v.reg.val[2]);
-    float16x4_t high_1 = vcvt_f16_f32(v.reg.val[3]);
+inline FP16Vec16::FP16Vec16(const FP32Vec16& v) {
+  float16x4_t low_0 = vcvt_f16_f32(v.reg.val[0]);
+  float16x4_t high_0 = vcvt_f16_f32(v.reg.val[1]);
+  float16x4_t low_1 = vcvt_f16_f32(v.reg.val[2]);
+  float16x4_t high_1 = vcvt_f16_f32(v.reg.val[3]);
 
-    reg.val[0] = vcombine_f16(low_0, high_0);
-    reg.val[1] = vcombine_f16(low_1, high_1);
+  reg.val[0] = vcombine_f16(low_0, high_0);
+  reg.val[1] = vcombine_f16(low_1, high_1);
 };
 
-inline FP16Vec8 :: FP16Vec8(const FP32Vec8 &v) {
-    float16x4_t lower_half = vcvt_f16_f32(v.reg.val[0]);
-    float16x4_t upper_half = vcvt_f16_f32(v.reg.val[1]);
+inline FP16Vec8 ::FP16Vec8(const FP32Vec8& v) {
+  float16x4_t lower_half = vcvt_f16_f32(v.reg.val[0]);
+  float16x4_t upper_half = vcvt_f16_f32(v.reg.val[1]);
 
-    reg = vcombine_f16(lower_half, upper_half);
+  reg = vcombine_f16(lower_half, upper_half);
 };
 
-inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) {
-
+inline void fma(FP32Vec16& acc, FP32Vec16& a, FP32Vec16& b) {
   acc.reg.val[0] = vfmaq_f32(acc.reg.val[0], a.reg.val[0], b.reg.val[0]);
   acc.reg.val[1] = vfmaq_f32(acc.reg.val[1], a.reg.val[1], b.reg.val[1]);
   acc.reg.val[2] = vfmaq_f32(acc.reg.val[2], a.reg.val[2], b.reg.val[2]);
@@ -531,8 +551,7 @@ inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) {
 };
 
 #ifdef ARM_BF16_SUPPORT
-inline void fma(FP32Vec16 &acc, BF16Vec32 &a, BF16Vec32 &b) {
-
+inline void fma(FP32Vec16& acc, BF16Vec32& a, BF16Vec32& b) {
   float32x4_t a0_low = vcvt_f32_bf16(vget_low_bf16(a.reg.val[0]));
   float32x4_t a0_high = vcvt_f32_bf16(vget_high_bf16(a.reg.val[0]));
   float32x4_t a1_low = vcvt_f32_bf16(vget_low_bf16(a.reg.val[1]));
@@ -551,22 +570,22 @@ inline void fma(FP32Vec16 &acc, BF16Vec32 &a, BF16Vec32 &b) {
 #endif
 
 #ifdef ARM_BF16_SUPPORT
-inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) : reg(vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[0]), v.reg.val[1])) {};
-
-inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) : reg({
-    vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[0]), v.reg.val[1]),
-    vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[2]), v.reg.val[3])
-  }){};
+inline BF16Vec8::BF16Vec8(const FP32Vec8& v)
+    : reg(vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[0]), v.reg.val[1])) {
+      };
+
+inline BF16Vec16::BF16Vec16(const FP32Vec16& v)
+    : reg({vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[0]), v.reg.val[1]),
+           vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[2]),
+                               v.reg.val[3])}) {};
 #endif
 
-inline void prefetch(const void *addr) {
-    __builtin_prefetch(addr, 0, 1);
-};
+inline void prefetch(const void* addr) { __builtin_prefetch(addr, 0, 1); };
 
 #ifdef ARM_BF16_SUPPORT
 template <>
-inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16 *ptr) { 
-  *reinterpret_cast<__bf16 *>(ptr) = vcvth_bf16_f32(v);
+inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16* ptr) {
+  *reinterpret_cast<__bf16*>(ptr) = vcvth_bf16_f32(v);
 };
 #endif
-};
\ No newline at end of file
+};  // namespace vec_op
\ No newline at end of file
diff --git a/csrc/cpu/cpu_types_vsx.hpp b/csrc/cpu/cpu_types_vsx.hpp
index b50bdadc5713d..a8e1be37eb418 100644
--- a/csrc/cpu/cpu_types_vsx.hpp
+++ b/csrc/cpu/cpu_types_vsx.hpp
@@ -9,38 +9,40 @@
 namespace vec_op {
 
 // FIXME: FP16 is not fully supported in Torch-CPU
-#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...)                                 \
-  AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__)                         \
+#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...)         \
+  AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
   AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)
 
-#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...)                          \
+#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
   AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
 
 #ifndef CPU_OP_GUARD
-#define CPU_KERNEL_GUARD_IN(NAME)
-#define CPU_KERNEL_GUARD_OUT(NAME)
+  #define CPU_KERNEL_GUARD_IN(NAME)
+  #define CPU_KERNEL_GUARD_OUT(NAME)
 #else
-#define CPU_KERNEL_GUARD_IN(NAME)                                              \
-  std::cout << #NAME << " invoked." << std::endl;
-#define CPU_KERNEL_GUARD_OUT(NAME) std::cout << #NAME << " exit." << std::endl;
+  #define CPU_KERNEL_GUARD_IN(NAME) \
+    std::cout << #NAME << " invoked." << std::endl;
+  #define CPU_KERNEL_GUARD_OUT(NAME) \
+    std::cout << #NAME << " exit." << std::endl;
 #endif
 
 #define FORCE_INLINE __attribute__((always_inline)) inline
 
 namespace {
 template <typename T, T... indexes, typename F>
-constexpr void unroll_loop_item(std::integer_sequence<T, indexes...>, F &&f) {
+constexpr void unroll_loop_item(std::integer_sequence<T, indexes...>, F&& f) {
   (f(std::integral_constant<T, indexes>{}), ...);
 }
-}; // namespace
+};  // namespace
 
 template <typename T, T count, typename F,
           typename = std::enable_if_t<std::is_invocable_v<F, T>>>
-constexpr void unroll_loop(F &&f) {
+constexpr void unroll_loop(F&& f) {
   unroll_loop_item(std::make_integer_sequence<T, count>{}, std::forward<F>(f));
 }
 
-template <typename T> struct Vec {
+template <typename T>
+struct Vec {
   constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; }
 };
 
@@ -68,12 +70,14 @@ struct BF16Vec8 : public Vec<BF16Vec8> {
 
   __vector signed short reg;
 
-  explicit BF16Vec8(const void *ptr)
-      : reg((__vector signed short)vec_xl(0, (__vector signed short *)ptr)) {}
+  explicit BF16Vec8(const void* ptr)
+      : reg((__vector signed short)vec_xl(0, (__vector signed short*)ptr)) {}
 
-  explicit BF16Vec8(const FP32Vec8 &);
+  explicit BF16Vec8(const FP32Vec8&);
 
-  void save(void *ptr) const { *reinterpret_cast<__vector signed short *>(ptr) = reg; }
+  void save(void* ptr) const {
+    *reinterpret_cast<__vector signed short*>(ptr) = reg;
+  }
 };
 
 struct BF16Vec16 : public Vec<BF16Vec16> {
@@ -81,18 +85,18 @@ struct BF16Vec16 : public Vec<BF16Vec16> {
 
   ss16x8x2_t reg;
 
-  explicit BF16Vec16(const void *ptr) {
+  explicit BF16Vec16(const void* ptr) {
     // Load 256 bits in two parts
-    reg.val[0] = (__vector signed short)vec_xl(0,  (signed short *)ptr);
-    reg.val[1] = (__vector signed short)vec_xl(16, (signed short *)ptr);
+    reg.val[0] = (__vector signed short)vec_xl(0, (signed short*)ptr);
+    reg.val[1] = (__vector signed short)vec_xl(16, (signed short*)ptr);
   }
 
-  explicit BF16Vec16(const FP32Vec16 &);
+  explicit BF16Vec16(const FP32Vec16&);
 
-  void save(void *ptr) const {
+  void save(void* ptr) const {
     // Save 256 bits in two parts
-    vec_xst(reg.val[0], 0, (signed short *)ptr);
-    vec_xst(reg.val[1], 16, (signed short *)ptr);
+    vec_xst(reg.val[0], 0, (signed short*)ptr);
+    vec_xst(reg.val[1], 16, (signed short*)ptr);
   }
 };
 
@@ -102,19 +106,15 @@ struct BF16Vec32 : public Vec<BF16Vec32> {
   constexpr static int VEC_ELEM_NUM = 32;
 
   ss16x8x4_t reg;
-  explicit BF16Vec32(const void *ptr)
-      : reg(*reinterpret_cast<const ss16x8x4_t *>(ptr)) {}
+  explicit BF16Vec32(const void* ptr)
+      : reg(*reinterpret_cast<const ss16x8x4_t*>(ptr)) {}
 
   explicit BF16Vec32(ss16x8x4_t data) : reg(data) {}
 
-  explicit BF16Vec32(const BF16Vec8 &vec8_data) : reg({
-    vec8_data.reg,
-    vec8_data.reg,
-    vec8_data.reg,
-    vec8_data.reg
-  }) {}
+  explicit BF16Vec32(const BF16Vec8& vec8_data)
+      : reg({vec8_data.reg, vec8_data.reg, vec8_data.reg, vec8_data.reg}) {}
 
-  void save(void *ptr) const { *reinterpret_cast<ss16x8x4_t *>(ptr) = reg; }
+  void save(void* ptr) const { *reinterpret_cast<ss16x8x4_t*>(ptr) = reg; }
 };
 
 struct FP32Vec4 : public Vec<FP32Vec4> {
@@ -130,11 +130,11 @@ struct FP32Vec4 : public Vec<FP32Vec4> {
 
   explicit FP32Vec4() : reg(vec_splats(0.0f)) {}
 
-  explicit FP32Vec4(const float *ptr) : reg(vec_xl(0, ptr)) {}
+  explicit FP32Vec4(const float* ptr) : reg(vec_xl(0, ptr)) {}
 
   explicit FP32Vec4(__vector float data) : reg(data) {}
 
-  explicit FP32Vec4(const FP32Vec4 &data) : reg(data.reg) {}
+  explicit FP32Vec4(const FP32Vec4& data) : reg(data.reg) {}
 };
 
 struct FP32Vec8 : public Vec<FP32Vec8> {
@@ -156,19 +156,19 @@ struct FP32Vec8 : public Vec<FP32Vec8> {
     reg.val[1] = vec_splats(0.0f);
   }
 
-  explicit FP32Vec8(const float *ptr) {
+  explicit FP32Vec8(const float* ptr) {
     reg.val[0] = vec_xl(0, ptr);
     reg.val[1] = vec_xl(16, ptr);
   }
 
   explicit FP32Vec8(f32x4x2_t data) : reg(data) {}
 
-  explicit FP32Vec8(const FP32Vec8 &data) {
+  explicit FP32Vec8(const FP32Vec8& data) {
     reg.val[0] = data.reg.val[0];
     reg.val[1] = data.reg.val[1];
   }
 
-  explicit FP32Vec8(const BF16Vec8 &v) {
+  explicit FP32Vec8(const BF16Vec8& v) {
     reg.val[0] = (__vector float)vec_mergeh(zero, v.reg);
     reg.val[1] = (__vector float)vec_mergel(zero, v.reg);
   }
@@ -177,7 +177,8 @@ struct FP32Vec8 : public Vec<FP32Vec8> {
     AliasReg ar;
     ar.reg = reg;
     float result = 0;
-    unroll_loop<int, VEC_ELEM_NUM>([&result, &ar](int i) { result += ar.values[i]; });
+    unroll_loop<int, VEC_ELEM_NUM>(
+        [&result, &ar](int i) { result += ar.values[i]; });
 
     return result;
   }
@@ -230,23 +231,27 @@ struct FP32Vec8 : public Vec<FP32Vec8> {
     return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]}));
   }
 
-  FP32Vec8 operator*(const FP32Vec8 &b) const {
-    return FP32Vec8({vec_mul(reg.val[0], b.reg.val[0]), vec_mul(reg.val[1], b.reg.val[1])});
+  FP32Vec8 operator*(const FP32Vec8& b) const {
+    return FP32Vec8(
+        {vec_mul(reg.val[0], b.reg.val[0]), vec_mul(reg.val[1], b.reg.val[1])});
   }
 
-  FP32Vec8 operator+(const FP32Vec8 &b) const {
-    return FP32Vec8({vec_add(reg.val[0], b.reg.val[0]), vec_add(reg.val[1], b.reg.val[1])});
+  FP32Vec8 operator+(const FP32Vec8& b) const {
+    return FP32Vec8(
+        {vec_add(reg.val[0], b.reg.val[0]), vec_add(reg.val[1], b.reg.val[1])});
   }
 
-  FP32Vec8 operator-(const FP32Vec8 &b) const {
-    return FP32Vec8({vec_sub(reg.val[0], b.reg.val[0]), vec_sub(reg.val[1], b.reg.val[1])});
+  FP32Vec8 operator-(const FP32Vec8& b) const {
+    return FP32Vec8(
+        {vec_sub(reg.val[0], b.reg.val[0]), vec_sub(reg.val[1], b.reg.val[1])});
   }
 
-  FP32Vec8 operator/(const FP32Vec8 &b) const {
-    return FP32Vec8({vec_div(reg.val[0], b.reg.val[0]), vec_div(reg.val[1], b.reg.val[1])});
+  FP32Vec8 operator/(const FP32Vec8& b) const {
+    return FP32Vec8(
+        {vec_div(reg.val[0], b.reg.val[0]), vec_div(reg.val[1], b.reg.val[1])});
   }
 
-  void save(float *ptr) const {
+  void save(float* ptr) const {
     vec_xst(reg.val[0], 0, ptr);
     vec_xst(reg.val[1], 16, ptr);
   }
@@ -275,7 +280,7 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
     reg.val[3] = vec_splats(0.0f);
   }
 
-  explicit FP32Vec16(const float *ptr) {
+  explicit FP32Vec16(const float* ptr) {
     reg.val[0] = vec_xl(0, ptr);
     reg.val[1] = vec_xl(16, ptr);
     reg.val[2] = vec_xl(32, ptr);
@@ -284,78 +289,76 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
 
   explicit FP32Vec16(f32x4x4_t data) : reg(data) {}
 
-  explicit FP32Vec16(const FP32Vec16 &data) {
+  explicit FP32Vec16(const FP32Vec16& data) {
     reg.val[0] = data.reg.val[0];
     reg.val[1] = data.reg.val[1];
     reg.val[2] = data.reg.val[2];
     reg.val[3] = data.reg.val[3];
   }
 
-  explicit FP32Vec16(const FP32Vec4 &data) {
+  explicit FP32Vec16(const FP32Vec4& data) {
     reg.val[0] = data.reg;
     reg.val[1] = data.reg;
     reg.val[2] = data.reg;
     reg.val[3] = data.reg;
   }
 
-  explicit FP32Vec16(const FP32Vec8 &data) {
+  explicit FP32Vec16(const FP32Vec8& data) {
     reg.val[0] = data.reg.val[0];
     reg.val[1] = data.reg.val[1];
     reg.val[2] = data.reg.val[0];
     reg.val[3] = data.reg.val[1];
   }
 
-  explicit FP32Vec16(const BF16Vec16 &v) {
+  explicit FP32Vec16(const BF16Vec16& v) {
     reg.val[0] = (__vector float)vec_mergeh(zero, v.reg.val[0]);
     reg.val[1] = (__vector float)vec_mergel(zero, v.reg.val[0]);
     reg.val[2] = (__vector float)vec_mergeh(zero, v.reg.val[1]);
     reg.val[3] = (__vector float)vec_mergel(zero, v.reg.val[1]);
   }
 
-  explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {}
+  explicit FP32Vec16(const BF16Vec8& v) : FP32Vec16(FP32Vec8(v)) {}
 
-  FP32Vec16 operator*(const FP32Vec16 &b) const {
-    return FP32Vec16(f32x4x4_t({
-        vec_mul(reg.val[0], b.reg.val[0]),
-        vec_mul(reg.val[1], b.reg.val[1]),
-        vec_mul(reg.val[2], b.reg.val[2]),
-        vec_mul(reg.val[3], b.reg.val[3])}));
+  FP32Vec16 operator*(const FP32Vec16& b) const {
+    return FP32Vec16(f32x4x4_t({vec_mul(reg.val[0], b.reg.val[0]),
+                                vec_mul(reg.val[1], b.reg.val[1]),
+                                vec_mul(reg.val[2], b.reg.val[2]),
+                                vec_mul(reg.val[3], b.reg.val[3])}));
   }
 
-  FP32Vec16 operator+(const FP32Vec16 &b) const {
-    return FP32Vec16(f32x4x4_t({
-        vec_add(reg.val[0], b.reg.val[0]),
-        vec_add(reg.val[1], b.reg.val[1]),
-        vec_add(reg.val[2], b.reg.val[2]),
-        vec_add(reg.val[3], b.reg.val[3])}));
+  FP32Vec16 operator+(const FP32Vec16& b) const {
+    return FP32Vec16(f32x4x4_t({vec_add(reg.val[0], b.reg.val[0]),
+                                vec_add(reg.val[1], b.reg.val[1]),
+                                vec_add(reg.val[2], b.reg.val[2]),
+                                vec_add(reg.val[3], b.reg.val[3])}));
   }
 
-  FP32Vec16 operator-(const FP32Vec16 &b) const {
-    return FP32Vec16(f32x4x4_t({
-        vec_sub(reg.val[0], b.reg.val[0]),
-        vec_sub(reg.val[1], b.reg.val[1]),
-        vec_sub(reg.val[2], b.reg.val[2]),
-        vec_sub(reg.val[3], b.reg.val[3])}));
+  FP32Vec16 operator-(const FP32Vec16& b) const {
+    return FP32Vec16(f32x4x4_t({vec_sub(reg.val[0], b.reg.val[0]),
+                                vec_sub(reg.val[1], b.reg.val[1]),
+                                vec_sub(reg.val[2], b.reg.val[2]),
+                                vec_sub(reg.val[3], b.reg.val[3])}));
   }
 
-  FP32Vec16 operator/(const FP32Vec16 &b) const {
-    return FP32Vec16(f32x4x4_t({
-        vec_div(reg.val[0], b.reg.val[0]),
-        vec_div(reg.val[1], b.reg.val[1]),
-        vec_div(reg.val[2], b.reg.val[2]),
-        vec_div(reg.val[3], b.reg.val[3])}));
+  FP32Vec16 operator/(const FP32Vec16& b) const {
+    return FP32Vec16(f32x4x4_t({vec_div(reg.val[0], b.reg.val[0]),
+                                vec_div(reg.val[1], b.reg.val[1]),
+                                vec_div(reg.val[2], b.reg.val[2]),
+                                vec_div(reg.val[3], b.reg.val[3])}));
   }
 
   float reduce_sum() const {
     AliasReg ar;
     ar.reg = reg;
     float result = 0;
-    unroll_loop<int, VEC_ELEM_NUM>([&result, &ar](int i) { result += ar.values[i]; });
+    unroll_loop<int, VEC_ELEM_NUM>(
+        [&result, &ar](int i) { result += ar.values[i]; });
 
     return result;
   }
 
-  template <int group_size> float reduce_sub_sum(int idx) {
+  template <int group_size>
+  float reduce_sub_sum(int idx) {
     static_assert(VEC_ELEM_NUM % group_size == 0);
 
     AliasReg ar;
@@ -368,7 +371,7 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
     return result;
   }
 
-  void save(float *ptr) const {
+  void save(float* ptr) const {
     vec_xst(reg.val[0], 0, ptr);
     vec_xst(reg.val[1], 16, ptr);
     vec_xst(reg.val[2], 32, ptr);
@@ -376,43 +379,62 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
   }
 };
 
-template <typename T> struct VecType { using vec_type = void; };
+template <typename T>
+struct VecType {
+  using vec_type = void;
+};
 
-template <typename T> using vec_t = typename VecType<T>::vec_type;
+template <typename T>
+using vec_t = typename VecType<T>::vec_type;
 
-template <> struct VecType<float> { using vec_type = FP32Vec8; };
+template <>
+struct VecType<float> {
+  using vec_type = FP32Vec8;
+};
 
-template <> struct VecType<c10::BFloat16> { using vec_type = BF16Vec8; };
+template <>
+struct VecType<c10::BFloat16> {
+  using vec_type = BF16Vec8;
+};
 
-template <typename T> void storeFP32(float v, T *ptr) { *ptr = v; }
+template <typename T>
+void storeFP32(float v, T* ptr) {
+  *ptr = v;
+}
 
-inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) {
+inline void fma(FP32Vec16& acc, FP32Vec16& a, FP32Vec16& b) {
   acc = acc + a * b;
 }
 
-template <> inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16 *ptr) {
-  c10::BFloat16 __attribute__((__may_alias__)) *v_ptr =
-      reinterpret_cast<c10::BFloat16 *>(&v);
+template <>
+inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16* ptr) {
+  c10::BFloat16 __attribute__((__may_alias__))* v_ptr =
+      reinterpret_cast<c10::BFloat16*>(&v);
   *ptr = *(v_ptr + 1);
 }
 
 #ifndef __VEC_CLASS_FP_NAN
-#define __VEC_CLASS_FP_NAN (1 << 6)
+  #define __VEC_CLASS_FP_NAN (1 << 6)
 #endif
 
-const static __vector unsigned char omask = { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
+const static __vector unsigned char omask = {0,  1,  4,  5,  8,  9,  12, 13,
+                                             16, 17, 20, 21, 24, 25, 28, 29};
 #ifndef _ARCH_PWR10
-const static __vector unsigned int bias = { 0x00007fff, 0x00007fff, 0x00007fff, 0x00007fff };
-const static __vector unsigned int nan  = { 0x7fc00000, 0x7fc00000, 0x7fc00000, 0x7fc00000 };
-const static __vector unsigned int sh16 = { 16, 16, 16, 16 };
-const static __vector unsigned int one  = { 1, 1, 1, 1 };
+const static __vector unsigned int bias = {0x00007fff, 0x00007fff, 0x00007fff,
+                                           0x00007fff};
+const static __vector unsigned int nan = {0x7fc00000, 0x7fc00000, 0x7fc00000,
+                                          0x7fc00000};
+const static __vector unsigned int sh16 = {16, 16, 16, 16};
+const static __vector unsigned int one = {1, 1, 1, 1};
 #endif
 
-inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) {
+inline BF16Vec8::BF16Vec8(const FP32Vec8& v) {
 #ifdef _ARCH_PWR10
   __vector signed short ret[2];
-  ret[0] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[0]);
-  ret[1] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[1]);
+  ret[0] = (__vector signed short)__builtin_vsx_xvcvspbf16(
+      (__vector unsigned char)v.reg.val[0]);
+  ret[1] = (__vector signed short)__builtin_vsx_xvcvspbf16(
+      (__vector unsigned char)v.reg.val[1]);
   reg = vec_perm(ret[0], ret[1], omask);
 #elif defined(_ARCH_PWR9)
   __vector unsigned int inp0 = (__vector unsigned int)(v.reg.val[0]);
@@ -425,8 +447,10 @@ inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) {
   __vector unsigned int rnd1 = vec_add(lsb1, bias);
   inp0 = vec_add(inp0, rnd0);
   inp1 = vec_add(inp1, rnd1);
-  __vector __bool int sel0 = vec_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN);
-  __vector __bool int sel1 = vec_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN);
+  __vector __bool int sel0 =
+      vec_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN);
+  __vector __bool int sel1 =
+      vec_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN);
   inp0 = vec_sel(inp0, nan, sel0);
   inp1 = vec_sel(inp1, nan, sel1);
   inp0 = vec_sr(inp0, sh16);
@@ -435,13 +459,17 @@ inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) {
 #endif
 }
 
-inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) {
+inline BF16Vec16::BF16Vec16(const FP32Vec16& v) {
 #ifdef _ARCH_PWR10
   __vector signed short ret[4];
-  ret[0] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[0]);
-  ret[1] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[1]);
-  ret[2] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[2]);
-  ret[3] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[3]);
+  ret[0] = (__vector signed short)__builtin_vsx_xvcvspbf16(
+      (__vector unsigned char)v.reg.val[0]);
+  ret[1] = (__vector signed short)__builtin_vsx_xvcvspbf16(
+      (__vector unsigned char)v.reg.val[1]);
+  ret[2] = (__vector signed short)__builtin_vsx_xvcvspbf16(
+      (__vector unsigned char)v.reg.val[2]);
+  ret[3] = (__vector signed short)__builtin_vsx_xvcvspbf16(
+      (__vector unsigned char)v.reg.val[3]);
   reg.val[0] = vec_perm(ret[0], ret[1], omask);
   reg.val[1] = vec_perm(ret[2], ret[3], omask);
 #elif defined(_ARCH_PWR9)
@@ -465,10 +493,14 @@ inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) {
   inp1 = vec_add(inp1, rnd1);
   inp2 = vec_add(inp2, rnd2);
   inp3 = vec_add(inp3, rnd3);
-  __vector __bool int sel0 = vec_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN);
-  __vector __bool int sel1 = vec_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN);
-  __vector __bool int sel2 = vec_test_data_class(v.reg.val[2], __VEC_CLASS_FP_NAN);
-  __vector __bool int sel3 = vec_test_data_class(v.reg.val[3], __VEC_CLASS_FP_NAN);
+  __vector __bool int sel0 =
+      vec_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN);
+  __vector __bool int sel1 =
+      vec_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN);
+  __vector __bool int sel2 =
+      vec_test_data_class(v.reg.val[2], __VEC_CLASS_FP_NAN);
+  __vector __bool int sel3 =
+      vec_test_data_class(v.reg.val[3], __VEC_CLASS_FP_NAN);
   inp0 = vec_sel(inp0, nan, sel0);
   inp1 = vec_sel(inp1, nan, sel1);
   inp2 = vec_sel(inp2, nan, sel2);
@@ -482,10 +514,10 @@ inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) {
 #endif
 }
 
-inline void prefetch(const void *addr) {
+inline void prefetch(const void* addr) {
   __asm__ __volatile__("dcbt 0, %0" : : "r"(addr) : "memory");
 }
 
-}; // namespace vec_op
+};  // namespace vec_op
 
 #endif
diff --git a/csrc/cpu/cpu_types_x86.hpp b/csrc/cpu/cpu_types_x86.hpp
index 4bb4eb0f491ac..a4ef2be2a58ca 100644
--- a/csrc/cpu/cpu_types_x86.hpp
+++ b/csrc/cpu/cpu_types_x86.hpp
@@ -11,39 +11,40 @@ static_assert(false, "AVX2 must be supported for the current implementation.");
 
 namespace vec_op {
 
-#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...)                                 \
-  AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__)                         \
-  AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)                      \
+#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...)            \
+  AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__)    \
+  AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \
   AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)
 
-#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...)                          \
+#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
   AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
 
 #ifndef CPU_OP_GUARD
-#define CPU_KERNEL_GUARD_IN(NAME)
-#define CPU_KERNEL_GUARD_OUT(NAME)
+  #define CPU_KERNEL_GUARD_IN(NAME)
+  #define CPU_KERNEL_GUARD_OUT(NAME)
 #else
-#define CPU_KERNEL_GUARD_IN(NAME)                                              \
-  RECORD_FUNCTION(#NAME, c10::ArrayRef<c10::IValue>({}));
-#define CPU_KERNEL_GUARD_OUT(NAME)
+  #define CPU_KERNEL_GUARD_IN(NAME) \
+    RECORD_FUNCTION(#NAME, c10::ArrayRef<c10::IValue>({}));
+  #define CPU_KERNEL_GUARD_OUT(NAME)
 #endif
 
 #define FORCE_INLINE __attribute__((always_inline)) inline
 
 namespace {
 template <typename T, T... indexes, typename F>
-constexpr void unroll_loop_item(std::integer_sequence<T, indexes...>, F &&f) {
+constexpr void unroll_loop_item(std::integer_sequence<T, indexes...>, F&& f) {
   (f(std::integral_constant<T, indexes>{}), ...);
 }
-}; // namespace
+};  // namespace
 
 template <typename T, T count, typename F,
           typename = std::enable_if_t<std::is_invocable_v<F, T>>>
-constexpr void unroll_loop(F &&f) {
+constexpr void unroll_loop(F&& f) {
   unroll_loop_item(std::make_integer_sequence<T, count>{}, std::forward<F>(f));
 }
 
-template <typename T> struct Vec {
+template <typename T>
+struct Vec {
   constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; }
 };
 
@@ -55,12 +56,12 @@ struct FP16Vec8 : public Vec<FP16Vec8> {
 
   __m128i reg;
 
-  explicit FP16Vec8(const void *ptr)
-      : reg((__m128i)_mm_loadu_si128((__m128i *)ptr)) {}
+  explicit FP16Vec8(const void* ptr)
+      : reg((__m128i)_mm_loadu_si128((__m128i*)ptr)) {}
 
-  explicit FP16Vec8(const FP32Vec8 &);
+  explicit FP16Vec8(const FP32Vec8&);
 
-  void save(void *ptr) const { *reinterpret_cast<__m128i *>(ptr) = reg; }
+  void save(void* ptr) const { *reinterpret_cast<__m128i*>(ptr) = reg; }
 };
 
 struct FP16Vec16 : public Vec<FP16Vec16> {
@@ -68,12 +69,12 @@ struct FP16Vec16 : public Vec<FP16Vec16> {
 
   __m256i reg;
 
-  explicit FP16Vec16(const void *ptr)
-      : reg((__m256i)_mm256_loadu_si256((__m256i *)ptr)) {}
+  explicit FP16Vec16(const void* ptr)
+      : reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {}
 
-  explicit FP16Vec16(const FP32Vec16 &);
+  explicit FP16Vec16(const FP32Vec16&);
 
-  void save(void *ptr) const { *reinterpret_cast<__m256i *>(ptr) = reg; }
+  void save(void* ptr) const { *reinterpret_cast<__m256i*>(ptr) = reg; }
 
   void save(void* ptr, const int elem_num) const {
     constexpr uint32_t M = 0xFFFFFFFF;
@@ -87,12 +88,12 @@ struct BF16Vec8 : public Vec<BF16Vec8> {
 
   __m128i reg;
 
-  explicit BF16Vec8(const void *ptr)
-      : reg((__m128i)_mm_loadu_si128((__m128i *)ptr)) {}
+  explicit BF16Vec8(const void* ptr)
+      : reg((__m128i)_mm_loadu_si128((__m128i*)ptr)) {}
 
-  explicit BF16Vec8(const FP32Vec8 &);
+  explicit BF16Vec8(const FP32Vec8&);
 
-  void save(void *ptr) const { *reinterpret_cast<__m128i *>(ptr) = reg; }
+  void save(void* ptr) const { *reinterpret_cast<__m128i*>(ptr) = reg; }
 };
 
 struct BF16Vec16 : public Vec<BF16Vec16> {
@@ -100,12 +101,12 @@ struct BF16Vec16 : public Vec<BF16Vec16> {
 
   __m256i reg;
 
-  explicit BF16Vec16(const void *ptr)
-      : reg((__m256i)_mm256_loadu_si256((__m256i *)ptr)) {}
+  explicit BF16Vec16(const void* ptr)
+      : reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {}
 
-  explicit BF16Vec16(const FP32Vec16 &);
+  explicit BF16Vec16(const FP32Vec16&);
 
-  void save(void *ptr) const { *reinterpret_cast<__m256i *>(ptr) = reg; }
+  void save(void* ptr) const { *reinterpret_cast<__m256i*>(ptr) = reg; }
 
   void save(void* ptr, const int elem_num) const {
     constexpr uint32_t M = 0xFFFFFFFF;
@@ -120,11 +121,11 @@ struct BF16Vec32 : public Vec<BF16Vec32> {
 
   __m512i reg;
 
-  explicit BF16Vec32(const void *ptr) : reg((__m512i)_mm512_loadu_si512(ptr)) {}
+  explicit BF16Vec32(const void* ptr) : reg((__m512i)_mm512_loadu_si512(ptr)) {}
 
   explicit BF16Vec32(__m512i data) : reg(data) {}
 
-  explicit BF16Vec32(BF16Vec8 &vec8_data)
+  explicit BF16Vec32(BF16Vec8& vec8_data)
       : reg((__m512i)_mm512_inserti32x4(
             _mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512(
                                                       (__m128i)vec8_data.reg),
@@ -132,7 +133,7 @@ struct BF16Vec32 : public Vec<BF16Vec32> {
                                (__m128i)vec8_data.reg, 2),
             (__m128i)vec8_data.reg, 3)) {}
 
-  void save(void *ptr) const { *reinterpret_cast<__m512i *>(ptr) = reg; }
+  void save(void* ptr) const { *reinterpret_cast<__m512i*>(ptr) = reg; }
 };
 #else
 struct BF16Vec32 : public Vec<BF16Vec32> {
@@ -141,24 +142,24 @@ struct BF16Vec32 : public Vec<BF16Vec32> {
   __m256i reg_low;
   __m256i reg_high;
 
-  explicit BF16Vec32(const void *ptr)
-      : reg_low(_mm256_loadu_si256((__m256i const *)ptr)),
-        reg_high(_mm256_loadu_si256((__m256i const *)ptr + 1)) {}
+  explicit BF16Vec32(const void* ptr)
+      : reg_low(_mm256_loadu_si256((__m256i const*)ptr)),
+        reg_high(_mm256_loadu_si256((__m256i const*)ptr + 1)) {}
 
-  explicit BF16Vec32(__m256i low, __m256i high) : reg_low(low),
-                                                  reg_high(high) {}
+  explicit BF16Vec32(__m256i low, __m256i high)
+      : reg_low(low), reg_high(high) {}
 
-  explicit BF16Vec32(BF16Vec8 &vec8_data)
+  explicit BF16Vec32(BF16Vec8& vec8_data)
       : reg_low((__m256i)_mm256_inserti32x4(
-                _mm256_castsi128_si256((__m128i)vec8_data.reg),
-                                       (__m128i)vec8_data.reg, 1)),
+            _mm256_castsi128_si256((__m128i)vec8_data.reg),
+            (__m128i)vec8_data.reg, 1)),
         reg_high((__m256i)_mm256_inserti32x4(
-                _mm256_castsi128_si256((__m128i)vec8_data.reg),
-                                       (__m128i)vec8_data.reg, 1)) {}
+            _mm256_castsi128_si256((__m128i)vec8_data.reg),
+            (__m128i)vec8_data.reg, 1)) {}
 
-  void save(void *ptr) const {
-    *reinterpret_cast<__m256i *>(ptr) = reg_low;
-    *reinterpret_cast<__m256i *>((__m256i *)ptr + 1) = reg_high;
+  void save(void* ptr) const {
+    *reinterpret_cast<__m256i*>(ptr) = reg_low;
+    *reinterpret_cast<__m256i*>((__m256i*)ptr + 1) = reg_high;
   }
 };
 #endif
@@ -176,11 +177,11 @@ struct FP32Vec4 : public Vec<FP32Vec4> {
 
   explicit FP32Vec4() : reg(_mm_set1_ps(0.0)) {}
 
-  explicit FP32Vec4(const float *ptr) : reg(_mm_loadu_ps(ptr)) {}
+  explicit FP32Vec4(const float* ptr) : reg(_mm_loadu_ps(ptr)) {}
 
   explicit FP32Vec4(__m128 data) : reg(data) {}
 
-  explicit FP32Vec4(const FP32Vec4 &data) : reg(data.reg) {}
+  explicit FP32Vec4(const FP32Vec4& data) : reg(data.reg) {}
 };
 
 struct FP32Vec8 : public Vec<FP32Vec8> {
@@ -196,15 +197,15 @@ struct FP32Vec8 : public Vec<FP32Vec8> {
 
   explicit FP32Vec8() : reg(_mm256_set1_ps(0.0)) {}
 
-  explicit FP32Vec8(const float *ptr) : reg(_mm256_loadu_ps(ptr)) {}
+  explicit FP32Vec8(const float* ptr) : reg(_mm256_loadu_ps(ptr)) {}
 
   explicit FP32Vec8(__m256 data) : reg(data) {}
 
-  explicit FP32Vec8(const FP32Vec8 &data) : reg(data.reg) {}
+  explicit FP32Vec8(const FP32Vec8& data) : reg(data.reg) {}
 
-  explicit FP32Vec8(const FP16Vec8 &v) : reg(_mm256_cvtph_ps(v.reg)) {}
+  explicit FP32Vec8(const FP16Vec8& v) : reg(_mm256_cvtph_ps(v.reg)) {}
 
-  explicit FP32Vec8(const BF16Vec8 &v)
+  explicit FP32Vec8(const BF16Vec8& v)
       : reg(_mm256_castsi256_ps(
             _mm256_bslli_epi128(_mm256_cvtepu16_epi32(v.reg), 2))) {}
 
@@ -212,7 +213,8 @@ struct FP32Vec8 : public Vec<FP32Vec8> {
     AliasReg ar;
     ar.reg = reg;
     float result = 0;
-    unroll_loop<int, VEC_ELEM_NUM>([&result, &ar](int i) { result += ar.values[i]; });
+    unroll_loop<int, VEC_ELEM_NUM>(
+        [&result, &ar](int i) { result += ar.values[i]; });
 
     return result;
   }
@@ -244,27 +246,27 @@ struct FP32Vec8 : public Vec<FP32Vec8> {
                                   erf(ar.values[1]), erf(ar.values[0])));
   }
 
-  FP32Vec8 operator*(const FP32Vec8 &b) const {
+  FP32Vec8 operator*(const FP32Vec8& b) const {
     return FP32Vec8(_mm256_mul_ps(reg, b.reg));
   }
 
-  FP32Vec8 operator+(const FP32Vec8 &b) const {
+  FP32Vec8 operator+(const FP32Vec8& b) const {
     return FP32Vec8(_mm256_add_ps(reg, b.reg));
   }
 
-  FP32Vec8 operator-(const FP32Vec8 &b) const {
+  FP32Vec8 operator-(const FP32Vec8& b) const {
     return FP32Vec8(_mm256_sub_ps(reg, b.reg));
   }
 
-  FP32Vec8 operator/(const FP32Vec8 &b) const {
+  FP32Vec8 operator/(const FP32Vec8& b) const {
     return FP32Vec8(_mm256_div_ps(reg, b.reg));
   }
 
-  void save(float *ptr) const { _mm256_storeu_ps(ptr, reg); }
+  void save(float* ptr) const { _mm256_storeu_ps(ptr, reg); }
 };
 
 #ifdef __AVX512F__
-struct INT32Vec16: public Vec<INT32Vec16> {
+struct INT32Vec16 : public Vec<INT32Vec16> {
   constexpr static int VEC_ELEM_NUM = 16;
   union AliasReg {
     __m512i reg;
@@ -272,12 +274,11 @@ struct INT32Vec16: public Vec<INT32Vec16> {
   };
 
   __m512i reg;
-  
-  explicit INT32Vec16(const void* data_ptr) : reg(_mm512_loadu_epi32(data_ptr)) {}
 
-  void save(int32_t* ptr) const {
-    _mm512_storeu_epi32(ptr, reg);
-  }
+  explicit INT32Vec16(const void* data_ptr)
+      : reg(_mm512_loadu_epi32(data_ptr)) {}
+
+  void save(int32_t* ptr) const { _mm512_storeu_epi32(ptr, reg); }
 
   void save(int32_t* ptr, const int elem_num) const {
     constexpr uint32_t M = 0xFFFFFFFF;
@@ -301,11 +302,11 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
 
   explicit FP32Vec16() : reg(_mm512_set1_ps(0.0)) {}
 
-  explicit FP32Vec16(const float *ptr) : reg(_mm512_loadu_ps(ptr)) {}
+  explicit FP32Vec16(const float* ptr) : reg(_mm512_loadu_ps(ptr)) {}
 
   explicit FP32Vec16(__m512 data) : reg(data) {}
 
-  explicit FP32Vec16(const FP32Vec4 &data)
+  explicit FP32Vec16(const FP32Vec4& data)
       : reg((__m512)_mm512_inserti32x4(
             _mm512_inserti32x4(
                 _mm512_inserti32x4(_mm512_castsi128_si512((__m128i)data.reg),
@@ -313,36 +314,37 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
                 (__m128i)data.reg, 2),
             (__m128i)data.reg, 3)) {}
 
-  explicit FP32Vec16(const FP32Vec8 &data)
+  explicit FP32Vec16(const FP32Vec8& data)
       : reg((__m512)_mm512_inserti32x8(
             _mm512_castsi256_si512((__m256i)data.reg), (__m256i)data.reg, 1)) {}
 
-  explicit FP32Vec16(const BF16Vec16 &v)
+  explicit FP32Vec16(const BF16Vec16& v)
       : reg(_mm512_castsi512_ps(
             _mm512_bslli_epi128(_mm512_cvtepu16_epi32(v.reg), 2))) {}
 
-  explicit FP32Vec16(const FP16Vec16 &v) : reg(_mm512_cvtph_ps(v.reg)) {}
+  explicit FP32Vec16(const FP16Vec16& v) : reg(_mm512_cvtph_ps(v.reg)) {}
 
-  explicit FP32Vec16(const FP16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {}
+  explicit FP32Vec16(const FP16Vec8& v) : FP32Vec16(FP32Vec8(v)) {}
 
-  explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {}
+  explicit FP32Vec16(const BF16Vec8& v) : FP32Vec16(FP32Vec8(v)) {}
 
-  explicit FP32Vec16(const INT32Vec16 &v)
-      : reg(_mm512_cvt_roundepi32_ps(v.reg, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC)) {}
+  explicit FP32Vec16(const INT32Vec16& v)
+      : reg(_mm512_cvt_roundepi32_ps(
+            v.reg, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)) {}
 
-  FP32Vec16 operator*(const FP32Vec16 &b) const {
+  FP32Vec16 operator*(const FP32Vec16& b) const {
     return FP32Vec16(_mm512_mul_ps(reg, b.reg));
   }
 
-  FP32Vec16 operator+(const FP32Vec16 &b) const {
+  FP32Vec16 operator+(const FP32Vec16& b) const {
     return FP32Vec16(_mm512_add_ps(reg, b.reg));
   }
 
-  FP32Vec16 operator-(const FP32Vec16 &b) const {
+  FP32Vec16 operator-(const FP32Vec16& b) const {
     return FP32Vec16(_mm512_sub_ps(reg, b.reg));
   }
 
-  FP32Vec16 operator/(const FP32Vec16 &b) const {
+  FP32Vec16 operator/(const FP32Vec16& b) const {
     return FP32Vec16(_mm512_div_ps(reg, b.reg));
   }
 
@@ -370,9 +372,7 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
     return FP32Vec16(_mm512_mask_min_ps(reg, mask, reg, b.reg));
   }
 
-  FP32Vec16 abs() const {
-    return FP32Vec16(_mm512_abs_ps(reg));
-  } 
+  FP32Vec16 abs() const { return FP32Vec16(_mm512_abs_ps(reg)); }
 
   float reduce_sum() const { return _mm512_reduce_add_ps(reg); }
 
@@ -380,14 +380,15 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
 
   float reduce_min() const { return _mm512_reduce_min_ps(reg); }
 
-  template <int group_size> float reduce_sub_sum(int idx) {
+  template <int group_size>
+  float reduce_sub_sum(int idx) {
     static_assert(VEC_ELEM_NUM % group_size == 0);
     constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size));
     __mmask16 mask = _cvtu32_mask16(base_mask << (idx * group_size));
     return _mm512_mask_reduce_add_ps(mask, reg);
   }
 
-  void save(float *ptr) const { _mm512_storeu_ps(ptr, reg); }
+  void save(float* ptr) const { _mm512_storeu_ps(ptr, reg); }
 
   void save(float* ptr, const int elem_num) const {
     constexpr uint32_t M = 0xFFFFFFFF;
@@ -407,32 +408,30 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
   __m256 reg_low;
   __m256 reg_high;
 
-  explicit FP32Vec16(float v) : reg_low(_mm256_set1_ps(v)),
-                                reg_high(_mm256_set1_ps(v)) {}
+  explicit FP32Vec16(float v)
+      : reg_low(_mm256_set1_ps(v)), reg_high(_mm256_set1_ps(v)) {}
 
-  explicit FP32Vec16() : reg_low(_mm256_set1_ps(0.0)),
-                         reg_high(_mm256_set1_ps(0.0)) {}
+  explicit FP32Vec16()
+      : reg_low(_mm256_set1_ps(0.0)), reg_high(_mm256_set1_ps(0.0)) {}
 
-  explicit FP32Vec16(const float *ptr) : reg_low(_mm256_loadu_ps(ptr)),
-                                         reg_high(_mm256_loadu_ps(ptr + 8)) {}
+  explicit FP32Vec16(const float* ptr)
+      : reg_low(_mm256_loadu_ps(ptr)), reg_high(_mm256_loadu_ps(ptr + 8)) {}
 
   explicit FP32Vec16(__m256 low, __m256 high) : reg_low(low), reg_high(high) {}
 
-  explicit FP32Vec16(const FP32Vec16 &data) : reg_low(data.reg_low),
-                                              reg_high(data.reg_high) {}
+  explicit FP32Vec16(const FP32Vec16& data)
+      : reg_low(data.reg_low), reg_high(data.reg_high) {}
 
-  explicit FP32Vec16(const FP32Vec4 &data)
+  explicit FP32Vec16(const FP32Vec4& data)
       : reg_low((__m256)_mm256_inserti128_si256(
-                _mm256_castsi128_si256((__m128i)data.reg),
-                                       (__m128i)data.reg, 1)),
+            _mm256_castsi128_si256((__m128i)data.reg), (__m128i)data.reg, 1)),
         reg_high((__m256)_mm256_inserti128_si256(
-                 _mm256_castsi128_si256((__m128i)data.reg),
-                                       (__m128i)data.reg, 1)) {}
+            _mm256_castsi128_si256((__m128i)data.reg), (__m128i)data.reg, 1)) {}
 
-  explicit FP32Vec16(const FP32Vec8 &data)
+  explicit FP32Vec16(const FP32Vec8& data)
       : reg_low(data.reg), reg_high(data.reg) {}
 
-  explicit FP32Vec16(const FP16Vec16 &v) {
+  explicit FP32Vec16(const FP16Vec16& v) {
     __m128i low = _mm256_extractf128_si256(v.reg, 0);
     __m128i high = _mm256_extractf128_si256(v.reg, 1);
 
@@ -440,9 +439,9 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
     reg_high = _mm256_cvtph_ps(high);
   }
 
-  explicit FP32Vec16(const FP16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {}
+  explicit FP32Vec16(const FP16Vec8& v) : FP32Vec16(FP32Vec8(v)) {}
 
-  explicit FP32Vec16(const BF16Vec16 &v) {
+  explicit FP32Vec16(const BF16Vec16& v) {
     __m128i low = _mm256_extractf128_si256(v.reg, 0);
     __m128i high = _mm256_extractf128_si256(v.reg, 1);
 
@@ -456,24 +455,24 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
     reg_high = _mm256_castsi256_ps(v_high_shifted);
   }
 
-  explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {}
+  explicit FP32Vec16(const BF16Vec8& v) : FP32Vec16(FP32Vec8(v)) {}
 
-  FP32Vec16 operator*(const FP32Vec16 &b) const {
+  FP32Vec16 operator*(const FP32Vec16& b) const {
     return FP32Vec16(_mm256_mul_ps(reg_low, b.reg_low),
                      _mm256_mul_ps(reg_high, b.reg_high));
   }
 
-  FP32Vec16 operator+(const FP32Vec16 &b) const {
+  FP32Vec16 operator+(const FP32Vec16& b) const {
     return FP32Vec16(_mm256_add_ps(reg_low, b.reg_low),
                      _mm256_add_ps(reg_high, b.reg_high));
   }
 
-  FP32Vec16 operator-(const FP32Vec16 &b) const {
+  FP32Vec16 operator-(const FP32Vec16& b) const {
     return FP32Vec16(_mm256_sub_ps(reg_low, b.reg_low),
                      _mm256_sub_ps(reg_high, b.reg_high));
   }
 
-  FP32Vec16 operator/(const FP32Vec16 &b) const {
+  FP32Vec16 operator/(const FP32Vec16& b) const {
     return FP32Vec16(_mm256_div_ps(reg_low, b.reg_low),
                      _mm256_div_ps(reg_high, b.reg_high));
   }
@@ -484,7 +483,8 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
     return low.reduce_sum() + high.reduce_sum();
   }
 
-  template <int group_size> float reduce_sub_sum(int idx) {
+  template <int group_size>
+  float reduce_sub_sum(int idx) {
     float sum = 0.0;
     static_assert(VEC_ELEM_NUM % group_size == 0);
     constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size));
@@ -507,7 +507,7 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
     return sum;
   }
 
-  void save(float *ptr) const {
+  void save(float* ptr) const {
     _mm256_storeu_ps(ptr, reg_low);
     _mm256_storeu_ps(ptr + 8, reg_high);
   }
@@ -515,7 +515,7 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
 #endif
 
 #ifdef __AVX512F__
-struct INT8Vec16: public Vec<INT8Vec16> {
+struct INT8Vec16 : public Vec<INT8Vec16> {
   constexpr static int VEC_ELEM_NUM = 16;
   union AliasReg {
     __m128i reg;
@@ -523,14 +523,12 @@ struct INT8Vec16: public Vec<INT8Vec16> {
   };
 
   __m128i reg;
-  
-  explicit INT8Vec16(const FP32Vec16& vec) : reg(
-    _mm512_cvtepi32_epi8(_mm512_cvt_roundps_epi32(vec.reg, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC))
-  ) {}
 
-  void save(int8_t* ptr) const {
-    _mm_storeu_epi8(ptr, reg);
-  }
+  explicit INT8Vec16(const FP32Vec16& vec)
+      : reg(_mm512_cvtepi32_epi8(_mm512_cvt_roundps_epi32(
+            vec.reg, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC))) {}
+
+  void save(int8_t* ptr) const { _mm_storeu_epi8(ptr, reg); }
 
   void save(int8_t* ptr, const int elem_num) const {
     constexpr uint32_t M = 0xFFFFFFFF;
@@ -540,71 +538,92 @@ struct INT8Vec16: public Vec<INT8Vec16> {
 };
 #endif
 
-template <typename T> struct VecType { using vec_type = void; };
+template <typename T>
+struct VecType {
+  using vec_type = void;
+};
 
-template <typename T> using vec_t = typename VecType<T>::vec_type;
+template <typename T>
+using vec_t = typename VecType<T>::vec_type;
 
-template <> struct VecType<float> { using vec_type = FP32Vec8; };
+template <>
+struct VecType<float> {
+  using vec_type = FP32Vec8;
+};
 
-template <> struct VecType<c10::Half> { using vec_type = FP16Vec8; };
+template <>
+struct VecType<c10::Half> {
+  using vec_type = FP16Vec8;
+};
 
-template <> struct VecType<c10::BFloat16> { using vec_type = BF16Vec8; };
+template <>
+struct VecType<c10::BFloat16> {
+  using vec_type = BF16Vec8;
+};
 
-template <typename T> void storeFP32(float v, T *ptr) { *ptr = v; }
+template <typename T>
+void storeFP32(float v, T* ptr) {
+  *ptr = v;
+}
 
-inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) {
+inline void fma(FP32Vec16& acc, FP32Vec16& a, FP32Vec16& b) {
   acc = acc + a * b;
 }
 
-template <> inline void storeFP32<c10::Half>(float v, c10::Half *ptr) {
-  *reinterpret_cast<unsigned short *>(ptr) =
+template <>
+inline void storeFP32<c10::Half>(float v, c10::Half* ptr) {
+  *reinterpret_cast<unsigned short*>(ptr) =
       _cvtss_sh(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
 }
 
-inline FP16Vec8::FP16Vec8(const FP32Vec8 &v)
+inline FP16Vec8::FP16Vec8(const FP32Vec8& v)
     : reg(_mm256_cvtps_ph(v.reg,
                           _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)) {}
 
 #ifdef __AVX512F__
-inline FP16Vec16::FP16Vec16(const FP32Vec16 &v)
+inline FP16Vec16::FP16Vec16(const FP32Vec16& v)
     : reg(_mm512_cvtps_ph(v.reg,
                           _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)) {}
 #else
-inline FP16Vec16::FP16Vec16(const FP32Vec16 &v)
-    : reg(_mm256_insertf128_si256(_mm256_castsi128_si256(FP16Vec8(FP32Vec8(v.reg_low)).reg), FP16Vec8(FP32Vec8(v.reg_low)).reg, 1)) {}
+inline FP16Vec16::FP16Vec16(const FP32Vec16& v)
+    : reg(_mm256_insertf128_si256(
+          _mm256_castsi128_si256(FP16Vec8(FP32Vec8(v.reg_low)).reg),
+          FP16Vec8(FP32Vec8(v.reg_low)).reg, 1)) {}
 #endif
 
 #ifdef __AVX512BF16__
-template <> inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16 *ptr) {
-  *reinterpret_cast<__bfloat16 *>(ptr) = _mm_cvtness_sbh(v);
+template <>
+inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16* ptr) {
+  *reinterpret_cast<__bfloat16*>(ptr) = _mm_cvtness_sbh(v);
 }
 
-inline BF16Vec8::BF16Vec8(const FP32Vec8 &v)
+inline BF16Vec8::BF16Vec8(const FP32Vec8& v)
     : reg((__m128i)_mm256_cvtneps_pbh(v.reg)) {}
 
-inline BF16Vec16::BF16Vec16(const FP32Vec16 &v)
+inline BF16Vec16::BF16Vec16(const FP32Vec16& v)
     : reg((__m256i)_mm512_cvtneps_pbh(v.reg)) {}
 
-inline void fma(FP32Vec16 &acc, BF16Vec32 &a, BF16Vec32 &b) {
+inline void fma(FP32Vec16& acc, BF16Vec32& a, BF16Vec32& b) {
   acc.reg = _mm512_dpbf16_ps(acc.reg, (__m512bh)a.reg, (__m512bh)b.reg);
 }
 #else
-template <> inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16 *ptr) {
-  c10::BFloat16 __attribute__((__may_alias__)) *v_ptr =
-      reinterpret_cast<c10::BFloat16 *>(&v);
+template <>
+inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16* ptr) {
+  c10::BFloat16 __attribute__((__may_alias__))* v_ptr =
+      reinterpret_cast<c10::BFloat16*>(&v);
   *ptr = *(v_ptr + 1);
 }
 
-#ifdef __AVX512F__
-inline BF16Vec8::BF16Vec8(const FP32Vec8 &v)
+  #ifdef __AVX512F__
+inline BF16Vec8::BF16Vec8(const FP32Vec8& v)
     : reg(_mm256_cvtepi32_epi16(
           _mm256_bsrli_epi128(_mm256_castps_si256(v.reg), 2))) {}
 
-inline BF16Vec16::BF16Vec16(const FP32Vec16 &v)
+inline BF16Vec16::BF16Vec16(const FP32Vec16& v)
     : reg(_mm512_cvtepi32_epi16(
           _mm512_bsrli_epi128(_mm512_castps_si512(v.reg), 2))) {}
-#else
-namespace{
+  #else
+namespace {
 __m128i FP32Vec8_to_BF16Vec8_avx2(__m256 a) {
   __m256i ai = _mm256_castps_si256(a);
   ai = _mm256_srli_epi32(ai, 16);
@@ -612,21 +631,21 @@ __m128i FP32Vec8_to_BF16Vec8_avx2(__m256 a) {
   ai = _mm256_permute4x64_epi64(ai, 0b00111001);
   return _mm256_extracti128_si256(ai, 0);
 }
-}
+}  // namespace
 
-inline BF16Vec8::BF16Vec8(const FP32Vec8 &v)
+inline BF16Vec8::BF16Vec8(const FP32Vec8& v)
     : reg(FP32Vec8_to_BF16Vec8_avx2(v.reg)) {}
 
-inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) {
+inline BF16Vec16::BF16Vec16(const FP32Vec16& v) {
   BF16Vec8 low = BF16Vec8(FP32Vec8(v.reg_low));
   BF16Vec8 high = BF16Vec8(FP32Vec8(v.reg_high));
   reg = _mm256_insertf128_si256(_mm256_castsi128_si256(low.reg), high.reg, 1);
 }
-#endif // __AVX512F__
-#endif // __AVX512BF16__
+  #endif  // __AVX512F__
+#endif    // __AVX512BF16__
 
-inline void prefetch(const void *addr) { _mm_prefetch(addr, _MM_HINT_T1); }
+inline void prefetch(const void* addr) { _mm_prefetch(addr, _MM_HINT_T1); }
 
-}; // namespace vec_op
+};  // namespace vec_op
 
 #endif
diff --git a/csrc/cutlass_extensions/common.hpp b/csrc/cutlass_extensions/common.hpp
index 85e359aa57113..07c9e46c27b06 100644
--- a/csrc/cutlass_extensions/common.hpp
+++ b/csrc/cutlass_extensions/common.hpp
@@ -27,8 +27,7 @@
 inline int get_cuda_max_shared_memory_per_block_opt_in(int const device) {
   int max_shared_mem_per_block_opt_in = 0;
   cudaDeviceGetAttribute(&max_shared_mem_per_block_opt_in,
-                        cudaDevAttrMaxSharedMemoryPerBlockOptin,
-                        device);
+                         cudaDevAttrMaxSharedMemoryPerBlockOptin, device);
   return max_shared_mem_per_block_opt_in;
 }
 
diff --git a/docs/source/contributing/overview.md b/docs/source/contributing/overview.md
index e92104399342d..36cf8e7440eca 100644
--- a/docs/source/contributing/overview.md
+++ b/docs/source/contributing/overview.md
@@ -25,10 +25,12 @@ Check out the [building from source](#build-from-source) documentation for detai
 ```bash
 pip install -r requirements-dev.txt
 
-# linting and formatting
-bash format.sh
-# Static type checking
-mypy
+# Linting, formatting and static type checking
+pre-commit install
+
+# You can manually run pre-commit with
+pre-commit run --all-files
+
 # Unit tests
 pytest tests/
 ```
@@ -88,7 +90,8 @@ If the PR spans more than one category, please include all relevant prefixes.
 The PR needs to meet the following code quality standards:
 
 - We adhere to [Google Python style guide](https://google.github.io/styleguide/pyguide.html) and [Google C++ style guide](https://google.github.io/styleguide/cppguide.html).
-- Pass all linter checks. Please use <gh-file:format.sh> to format your code.
+- Pass all linter checks. Please use `pre-commit` to format your code. See
+  <https://pre-commit.com/#usage> if `pre-commit` is new to you.
 - The code needs to be well-documented to ensure future contributors can easily
   understand the code.
 - Include sufficient tests to ensure the project stays correct and robust. This
diff --git a/format.sh b/format.sh
deleted file mode 100755
index 2277eef93c745..0000000000000
--- a/format.sh
+++ /dev/null
@@ -1,321 +0,0 @@
-#!/usr/bin/env bash
-# YAPF formatter, adapted from ray and skypilot.
-#
-# Usage:
-#    # Do work and commit your work.
-
-#    # Format files that differ from origin/main.
-#    bash format.sh
-
-#    # Commit changed files with message 'Run yapf and ruff'
-#
-#
-# YAPF + Clang formatter (if installed). This script formats all changed files from the last mergebase.
-# You are encouraged to run this locally before pushing changes for review.
-
-# Cause the script to exit if a single command fails
-set -eo pipefail
-
-# this stops git rev-parse from failing if we run this from the .git directory
-builtin cd "$(dirname "${BASH_SOURCE:-$0}")"
-ROOT="$(git rev-parse --show-toplevel)"
-builtin cd "$ROOT" || exit 1
-
-check_command() {
-    if ! command -v "$1" &> /dev/null; then
-        echo "❓❓$1 is not installed, please run \`pip install -r requirements-lint.txt\`"
-        exit 1
-    fi
-}
-
-check_command yapf
-check_command ruff
-check_command mypy
-check_command codespell
-check_command isort
-check_command clang-format
-
-YAPF_VERSION=$(yapf --version | awk '{print $2}')
-RUFF_VERSION=$(ruff --version | awk '{print $2}')
-MYPY_VERSION=$(mypy --version | awk '{print $2}')
-CODESPELL_VERSION=$(codespell --version)
-ISORT_VERSION=$(isort --vn)
-CLANGFORMAT_VERSION=$(clang-format --version | awk '{print $3}')
-PYMARKDOWNLNT_VERSION=$(pymarkdownlnt version | awk '{print $1}')
-
-# # params: tool name, tool version, required version
-tool_version_check() {
-    expected=$(grep "$1" requirements-lint.txt | cut -d'=' -f3)
-    if [[ "$2" != "$expected" ]]; then
-        echo "❓❓Wrong $1 version installed: $expected is required, not $2."
-        exit 1
-    fi
-}
-
-tool_version_check "yapf" "$YAPF_VERSION"
-tool_version_check "ruff" "$RUFF_VERSION"
-tool_version_check "mypy" "$MYPY_VERSION"
-tool_version_check "isort" "$ISORT_VERSION"
-tool_version_check "codespell" "$CODESPELL_VERSION"
-tool_version_check "clang-format" "$CLANGFORMAT_VERSION"
-tool_version_check "pymarkdownlnt" "$PYMARKDOWNLNT_VERSION"
-
-YAPF_FLAGS=(
-    '--recursive'
-    '--parallel'
-)
-
-YAPF_EXCLUDES=(
-    '--exclude' 'build/**'
-)
-
-# Format specified files
-format() {
-    yapf --in-place "${YAPF_FLAGS[@]}" "$@"
-}
-
-# Format files that differ from main branch. Ignores dirs that are not slated
-# for autoformat yet.
-format_changed() {
-    # The `if` guard ensures that the list of filenames is not empty, which
-    # could cause yapf to receive 0 positional arguments, making it hang
-    # waiting for STDIN.
-    #
-    # `diff-filter=ACM` and $MERGEBASE is to ensure we only format files that
-    # exist on both branches.
-    MERGEBASE="$(git merge-base origin/main HEAD)"
-
-    if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then
-        git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs -P 5 \
-             yapf --in-place "${YAPF_EXCLUDES[@]}" "${YAPF_FLAGS[@]}"
-    fi
-
-}
-
-# Format all files
-format_all() {
-    yapf --in-place "${YAPF_FLAGS[@]}" "${YAPF_EXCLUDES[@]}" .
-}
-
-## This flag formats individual files. --files *must* be the first command line
-## arg to use this option.
-if [[ "$1" == '--files' ]]; then
-   format "${@:2}"
-   # If `--all` is passed, then any further arguments are ignored and the
-   # entire python directory is formatted.
-elif [[ "$1" == '--all' ]]; then
-   format_all
-else
-   # Format only the files that changed in last commit.
-   format_changed
-fi
-echo 'vLLM yapf: Done'
-
-# Run mypy
-echo 'vLLM mypy:'
-tools/mypy.sh
-echo 'vLLM mypy: Done'
-
-
-# If git diff returns a file that is in the skip list, the file may be checked anyway:
-# https://github.com/codespell-project/codespell/issues/1915
-# Avoiding the "./" prefix and using "/**" globs for directories appears to solve the problem
-CODESPELL_EXCLUDES=(
-    '--skip' 'tests/prompts/**,./benchmarks/sonnet.txt,*tests/lora/data/**,build/**'
-)
-
-# check spelling of specified files
-spell_check() {
-    codespell "$@"
-}
-
-spell_check_all(){
-  codespell --toml pyproject.toml "${CODESPELL_EXCLUDES[@]}"
-}
-
-# Spelling check of files that differ from main branch.
-spell_check_changed() {
-    # The `if` guard ensures that the list of filenames is not empty, which
-    # could cause ruff to receive 0 positional arguments, making it hang
-    # waiting for STDIN.
-    #
-    # `diff-filter=ACM` and $MERGEBASE is to ensure we only lint files that
-    # exist on both branches.
-    MERGEBASE="$(git merge-base origin/main HEAD)"
-    if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then
-        git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \
-            codespell "${CODESPELL_EXCLUDES[@]}"
-    fi
-}
-
-# Run Codespell
-## This flag runs spell check of individual files. --files *must* be the first command line
-## arg to use this option.
-if [[ "$1" == '--files' ]]; then
-   spell_check "${@:2}"
-   # If `--all` is passed, then any further arguments are ignored and the
-   # entire python directory is linted.
-elif [[ "$1" == '--all' ]]; then
-   spell_check_all
-else
-   # Check spelling only of the files that changed in last commit.
-   spell_check_changed
-fi
-echo 'vLLM codespell: Done'
-
-
-# Lint specified files
-lint() {
-    ruff check "$@"
-}
-
-# Lint files that differ from main branch. Ignores dirs that are not slated
-# for autolint yet.
-lint_changed() {
-    # The `if` guard ensures that the list of filenames is not empty, which
-    # could cause ruff to receive 0 positional arguments, making it hang
-    # waiting for STDIN.
-    #
-    # `diff-filter=ACM` and $MERGEBASE is to ensure we only lint files that
-    # exist on both branches.
-    MERGEBASE="$(git merge-base origin/main HEAD)"
-
-    if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then
-        git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \
-             ruff check
-    fi
-
-}
-
-# Run Ruff
-### This flag lints individual files. --files *must* be the first command line
-### arg to use this option.
-if [[ "$1" == '--files' ]]; then
-   lint "${@:2}"
-   # If `--all` is passed, then any further arguments are ignored and the
-   # entire python directory is linted.
-elif [[ "$1" == '--all' ]]; then
-   lint vllm tests
-else
-   # Format only the files that changed in last commit.
-   lint_changed
-fi
-echo 'vLLM ruff: Done'
-
-# check spelling of specified files
-isort_check() {
-    isort "$@"
-}
-
-isort_check_all(){
-  isort .
-}
-
-# Spelling  check of files that differ from main branch.
-isort_check_changed() {
-    # The `if` guard ensures that the list of filenames is not empty, which
-    # could cause ruff to receive 0 positional arguments, making it hang
-    # waiting for STDIN.
-    #
-    # `diff-filter=ACM` and $MERGEBASE is to ensure we only lint files that
-    # exist on both branches.
-    MERGEBASE="$(git merge-base origin/main HEAD)"
-
-    if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then
-        git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \
-             isort
-    fi
-}
-
-# Run Isort
-# This flag runs spell check of individual files. --files *must* be the first command line
-# arg to use this option.
-if [[ "$1" == '--files' ]]; then
-   isort_check "${@:2}"
-   # If `--all` is passed, then any further arguments are ignored and the
-   # entire python directory is linted.
-elif [[ "$1" == '--all' ]]; then
-   isort_check_all
-else
-   # Check spelling only of the files that changed in last commit.
-   isort_check_changed
-fi
-echo 'vLLM isort: Done'
-
-# Clang-format section
-# Exclude some files for formatting because they are vendored
-# NOTE: Keep up to date with .github/workflows/clang-format.yml
-CLANG_FORMAT_EXCLUDES=(
-    'csrc/moe/topk_softmax_kernels.cu'
-    'csrc/quantization/gguf/ggml-common.h'
-    'csrc/quantization/gguf/dequantize.cuh'
-    'csrc/quantization/gguf/vecdotq.cuh'
-    'csrc/quantization/gguf/mmq.cuh'
-    'csrc/quantization/gguf/mmvq.cuh'
-)
-
-# Format specified files with clang-format
-clang_format() {
-    clang-format -i "$@"
-}
-
-# Format files that differ from main branch with clang-format.
-clang_format_changed() {
-    # The `if` guard ensures that the list of filenames is not empty, which
-    # could cause clang-format to receive 0 positional arguments, making it hang
-    # waiting for STDIN.
-    #
-    # `diff-filter=ACM` and $MERGEBASE is to ensure we only format files that
-    # exist on both branches.
-    MERGEBASE="$(git merge-base origin/main HEAD)"
-
-    # Get the list of changed files, excluding the specified ones
-    changed_files=$(git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.h' '*.cpp' '*.cu' '*.cuh' | (grep -vFf <(printf "%s\n" "${CLANG_FORMAT_EXCLUDES[@]}") || echo -e))
-    if [ -n "$changed_files" ]; then
-        echo "$changed_files" | xargs -P 5 clang-format -i
-    fi
-}
-
-# Format all files with clang-format
-clang_format_all() {
-    find csrc/ \( -name '*.h' -o -name '*.cpp' -o -name '*.cu' -o -name '*.cuh' \) -print \
-        | grep -vFf <(printf "%s\n" "${CLANG_FORMAT_EXCLUDES[@]}") \
-        | xargs clang-format -i
-}
-
-# Run clang-format
-if [[ "$1" == '--files' ]]; then
-   clang_format "${@:2}"
-elif [[ "$1" == '--all' ]]; then
-   clang_format_all
-else
-   clang_format_changed
-fi
-echo 'vLLM clang-format: Done'
-
-echo 'vLLM actionlint:'
-tools/actionlint.sh -color
-echo 'vLLM actionlint: Done'
-
-echo 'vLLM shellcheck:'
-tools/shellcheck.sh
-echo 'vLLM shellcheck: Done'
-
-echo 'excalidraw png check:'
-tools/png-lint.sh
-echo 'excalidraw png check: Done'
-
-if ! git diff --quiet &>/dev/null; then
-    echo 
-    echo "🔍🔍There are files changed by the format checker or by you that are not added and committed:"
-    git --no-pager diff --name-only
-    echo "🔍🔍Format checker passed, but please add, commit and push all the files above to include changes made by the format checker."
-
-    exit 1
-else
-    echo "✨🎉 Format check passed! Congratulations! 🎉✨"
-fi
-
-echo 'vLLM doc-lint:'
-tools/doc-lint.sh
-echo 'vLLM doc-lint: Done'
diff --git a/pyproject.toml b/pyproject.toml
index 82275ccafb572..8f2e20d0f5800 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -15,6 +15,11 @@ build-backend = "setuptools.build_meta"
 [tool.setuptools_scm]
 # version_file = "vllm/_version.py" # currently handled by `setup.py:get_version()`
 
+[tool.yapfignore]
+ignore_patterns = [
+    "build/**",
+]
+
 [tool.ruff]
 # Allow lines to be as long as 80.
 line-length = 80
@@ -52,6 +57,9 @@ ignore = [
     "B007",
     # f-string format
     "UP032",
+    # Python 3.8 typing
+    "UP006", "UP035",
+
 ]
 
 [tool.mypy]
diff --git a/requirements-lint.txt b/requirements-lint.txt
index ffc73f90a0d48..62446f94048df 100644
--- a/requirements-lint.txt
+++ b/requirements-lint.txt
@@ -1,15 +1,2 @@
 # formatting
-yapf==0.32.0
-toml==0.10.2
-tomli==2.0.2
-ruff==0.6.5
-codespell==2.3.0
-isort==5.13.2
-clang-format==18.1.5
-pymarkdownlnt==0.9.26
-
-# type checking
-mypy==1.11.1
-types-PyYAML
-types-requests
-types-setuptools
+pre-commit==4.0.1
diff --git a/tools/actionlint.sh b/tools/actionlint.sh
deleted file mode 100755
index f6a8b5e83a2de..0000000000000
--- a/tools/actionlint.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-
-if command -v actionlint &> /dev/null; then
-    actionlint "$@"
-    exit 0
-elif [ -x ./actionlint ]; then
-    ./actionlint "$@"
-    exit 0
-fi
-
-# download a binary to the current directory - v1.7.3
-bash <(curl https://raw.githubusercontent.com/rhysd/actionlint/aa0a7be8e566b096e64a5df8ff290ec24fa58fbc/scripts/download-actionlint.bash)
-./actionlint "$@"
diff --git a/tools/doc-lint.sh b/tools/doc-lint.sh
deleted file mode 100755
index 19a55ddfa91c4..0000000000000
--- a/tools/doc-lint.sh
+++ /dev/null
@@ -1,3 +0,0 @@
-#!/bin/bash
-
-pymarkdownlnt scan docs -r

From c5c06209ec1d90146dd12095d7bff3326aa6dd15 Mon Sep 17 00:00:00 2001
From: Yuan Tang <terrytangyuan@gmail.com>
Date: Mon, 20 Jan 2025 01:58:29 -0500
Subject: [PATCH 279/462] [DOC] Fix typo in docstring and assert message
 (#12194)

Signed-off-by: Yuan Tang <terrytangyuan@gmail.com>
---
 vllm/engine/output_processor/single_step.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/engine/output_processor/single_step.py b/vllm/engine/output_processor/single_step.py
index da3185f33dbe9..55c56abea0da3 100644
--- a/vllm/engine/output_processor/single_step.py
+++ b/vllm/engine/output_processor/single_step.py
@@ -102,9 +102,9 @@ def process_prompt_logprob(self, seq_group: SequenceGroup,
         
         Args:
           seq_group: the output is associated with this :class:`SequenceGroup`
-          output: the :class:`SequenceGroupOutput` for a single scheduler step
+          outputs: the :class:`SequenceGroupOutput` for a single scheduler step
         """
-        assert len(outputs) == 1, ("Single step should only has 1 output.")
+        assert len(outputs) == 1, "Single step should only have 1 output."
         output = outputs[0]
         assert isinstance(output, CompletionSequenceGroupOutput)
         single_step_process_prompt_logprob(self, seq_group, output)

From d2643128f7741b937435b00fecde7d6b2e351d0c Mon Sep 17 00:00:00 2001
From: Yuan Tang <terrytangyuan@gmail.com>
Date: Mon, 20 Jan 2025 01:59:00 -0500
Subject: [PATCH 280/462] [DOC] Add missing docstring in
 LLMEngine.add_request() (#12195)

Signed-off-by: Yuan Tang <terrytangyuan@gmail.com>
---
 vllm/engine/llm_engine.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 88c21f9a6d31b..b6bba1d67b408 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -689,7 +689,9 @@ def add_request(
                 :class:`~vllm.PoolingParams` for pooling.
             arrival_time: The arrival time of the request. If None, we use
                 the current monotonic time.
+            lora_request: The LoRA request to add.
             trace_headers: OpenTelemetry trace headers.
+            prompt_adapter_request: The prompt adapter request to add.
             priority: The priority of the request.
                 Only applicable with priority scheduling.
 

From 0974c9bc5c0252ecb25f440139936529657452ab Mon Sep 17 00:00:00 2001
From: Yuan Tang <terrytangyuan@gmail.com>
Date: Mon, 20 Jan 2025 01:59:20 -0500
Subject: [PATCH 281/462] [Bugfix] Fix incorrect types in
 LayerwiseProfileResults (#12196)

Signed-off-by: Yuan Tang <terrytangyuan@gmail.com>
---
 vllm/profiler/layerwise_profile.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/vllm/profiler/layerwise_profile.py b/vllm/profiler/layerwise_profile.py
index 33babfebdca1e..29c0edd0ee535 100644
--- a/vllm/profiler/layerwise_profile.py
+++ b/vllm/profiler/layerwise_profile.py
@@ -1,7 +1,7 @@
 import copy
 from collections import defaultdict
 from dataclasses import asdict, dataclass, field
-from typing import Callable, Dict, List, Optional, Tuple, TypeAlias, Union
+from typing import Any, Callable, Dict, List, Optional, Tuple, TypeAlias, Union
 
 import pandas as pd
 from torch._C._autograd import DeviceType, _KinetoEvent, _ProfilerResult
@@ -128,7 +128,7 @@ def export_summary_stats_table_csv(self, filename: str):
         ])
         df.to_csv(filename)
 
-    def convert_stats_to_dict(self) -> str:
+    def convert_stats_to_dict(self) -> dict[str, Any]:
         return {
             "metadata": {
                 "num_running_seqs": self.num_running_seqs
@@ -227,7 +227,7 @@ def _total_cuda_time(self):
             [self._cumulative_cuda_time(root) for root in self._module_tree])
 
     def _build_stats_trees(self):
-        summary_dict: Dict[str, self.StatsTreeNode] = {}
+        summary_dict: Dict[str, _StatsTreeNode] = {}
         total_cuda_time = self._total_cuda_time()
 
         def pct_cuda_time(cuda_time_us):

From 83609791d2ceeb628e0d1f5ea60a64c132eb083c Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Mon, 20 Jan 2025 14:59:46 +0800
Subject: [PATCH 282/462] [Model] Add Qwen2 PRM model support (#12202)

Signed-off-by: Isotr0py <2037008807@qq.com>
---
 docs/source/models/supported_models.md        |  5 +++
 .../embedding/language/test_embedding.py      |  9 ++--
 tests/models/registry.py                      |  1 +
 vllm/model_executor/models/qwen2_rm.py        | 42 +++++++++++++++----
 vllm/model_executor/models/registry.py        |  1 +
 5 files changed, 45 insertions(+), 13 deletions(-)

diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md
index eb1bde9ec0089..3da5aaf713c1f 100644
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -470,6 +470,11 @@ of the whole prompt are extracted from the normalized hidden state corresponding
   - `Qwen/Qwen2.5-Math-RM-72B`, etc.
   - ✅︎
   - ✅︎
+* - `Qwen2ForProcessRewardModel`
+  - Qwen2-based
+  - `Qwen/Qwen2.5-Math-PRM-7B`, `Qwen/Qwen2.5-Math-PRM-72B`, etc.
+  - ✅︎
+  - ✅︎
 ```
 
 If your model is not in the above list, we will try to automatically convert the model using
diff --git a/tests/models/embedding/language/test_embedding.py b/tests/models/embedding/language/test_embedding.py
index 04ab4dd7371a3..bb47d14807b55 100644
--- a/tests/models/embedding/language/test_embedding.py
+++ b/tests/models/embedding/language/test_embedding.py
@@ -17,14 +17,15 @@
                      marks=[pytest.mark.core_model, pytest.mark.cpu_model]),
         pytest.param("sentence-transformers/all-MiniLM-L12-v2"),
         pytest.param("intfloat/multilingual-e5-large"),
-        # [Encoder-decoder]
-        pytest.param("intfloat/e5-mistral-7b-instruct",
-                     marks=[pytest.mark.core_model, pytest.mark.cpu_model]),
+        # [Decoder-only]
         pytest.param("BAAI/bge-multilingual-gemma2",
                      marks=[pytest.mark.core_model]),
-        pytest.param("ssmits/Qwen2-7B-Instruct-embed-base"),
+        pytest.param("intfloat/e5-mistral-7b-instruct",
+                     marks=[pytest.mark.core_model, pytest.mark.cpu_model]),
         pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct"),
         pytest.param("Alibaba-NLP/gte-Qwen2-7B-instruct"),
+        pytest.param("ssmits/Qwen2-7B-Instruct-embed-base"),
+        # [Encoder-decoder]
         pytest.param("sentence-transformers/stsb-roberta-base-v2"),
     ],
 )
diff --git a/tests/models/registry.py b/tests/models/registry.py
index cb0521cfe80a7..9603ea8817cac 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -155,6 +155,7 @@ class _HfExamplesInfo:
     "MistralModel": _HfExamplesInfo("intfloat/e5-mistral-7b-instruct"),
     "Qwen2Model": _HfExamplesInfo("ssmits/Qwen2-7B-Instruct-embed-base"),
     "Qwen2ForRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-RM-72B"),
+    "Qwen2ForProcessRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-PRM-7B"),
     "Qwen2ForSequenceClassification": _HfExamplesInfo("jason9693/Qwen2.5-1.5B-apeach"),  # noqa: E501
     "RobertaModel": _HfExamplesInfo("sentence-transformers/stsb-roberta-base-v2"),  # noqa: E501
     "RobertaForMaskedLM": _HfExamplesInfo("sentence-transformers/all-roberta-large-v1"),  # noqa: E501
diff --git a/vllm/model_executor/models/qwen2_rm.py b/vllm/model_executor/models/qwen2_rm.py
index 988d682d36be3..593ce4857af0f 100644
--- a/vllm/model_executor/models/qwen2_rm.py
+++ b/vllm/model_executor/models/qwen2_rm.py
@@ -12,7 +12,7 @@
 from vllm.config import VllmConfig
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                                RowParallelLinear)
-from vllm.model_executor.layers.pooler import Pooler, PoolingType
+from vllm.model_executor.layers.pooler import Pooler, PoolingType, SimplePooler
 from vllm.model_executor.pooling_metadata import PoolingMetadata
 from vllm.sequence import IntermediateTensors, PoolerOutput
 
@@ -32,7 +32,7 @@ def forward(self, input):
         return self.activation(input)
 
 
-class Qwen2ForRewardModel(nn.Module, SupportsLoRA, SupportsPP):
+class Qwen2RewardBaseModel(nn.Module, SupportsLoRA, SupportsPP):
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
@@ -60,7 +60,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
         lora_config = vllm_config.lora_config
-        pooler_config = vllm_config.model_config.pooler_config
 
         self.config = config
         self.lora_config = lora_config
@@ -74,14 +73,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
                                  config.hidden_size,
                                  quant_config=quant_config),
             ReLU(),
-            RowParallelLinear(config.hidden_size, 1,
+            RowParallelLinear(config.hidden_size,
+                              config.num_labels,
                               quant_config=quant_config),
         )
-        self._pooler = Pooler.from_config_with_defaults(
-            pooler_config,
-            pooling_type=PoolingType.ALL,
-            normalize=False,
-            softmax=False)
+        self._pooler: SimplePooler
         self.make_empty_intermediate_tensors = (
             self.model.make_empty_intermediate_tensors)
 
@@ -115,3 +111,31 @@ def load_weights(self, weights: Iterable[Tuple[str,
         loader = AutoWeightsLoader(self,
                                    ignore_unexpected_prefixes=["lm_head."])
         return loader.load_weights(weights)
+
+
+class Qwen2ForRewardModel(Qwen2RewardBaseModel):
+
+    def __init__(self, *, vllm_config, prefix=""):
+        vllm_config.model_config.hf_config.num_labels = 1
+        super().__init__(vllm_config=vllm_config, prefix=prefix)
+        pooler_config = vllm_config.model_config.pooler_config
+        self._pooler = Pooler.from_config_with_defaults(
+            pooler_config,
+            pooling_type=PoolingType.ALL,
+            normalize=False,
+            softmax=False)
+
+
+class Qwen2ForProcessRewardModel(Qwen2RewardBaseModel):
+
+    def __init__(self, *, vllm_config, prefix=""):
+        vllm_config.model_config.hf_config.num_labels = 2
+        super().__init__(vllm_config=vllm_config, prefix=prefix)
+        pooler_config = vllm_config.model_config.pooler_config
+        self._pooler = Pooler.from_config_with_defaults(
+            pooler_config,
+            pooling_type=PoolingType.STEP,
+            normalize=False,
+            softmax=True,
+            step_tag_id=151651,
+        )
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index 311f91472783b..8d2719ca2d00d 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -127,6 +127,7 @@
     "Qwen2Model": ("qwen2", "Qwen2EmbeddingModel"),
     "Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"),
     "Qwen2ForRewardModel": ("qwen2_rm", "Qwen2ForRewardModel"),
+    "Qwen2ForProcessRewardModel": ("qwen2_rm", "Qwen2ForProcessRewardModel"),
     "TeleChat2ForCausalLM": ("telechat2", "TeleChat2ForCausalLM"),
     # [Multimodal]
     "LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"),  # noqa: E501

From 59a0192fb9bef026086d0a2ed32705d870a9466a Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Mon, 20 Jan 2025 15:00:59 +0800
Subject: [PATCH 283/462] [Core] Interface for accessing model from
 `VllmRunner` (#10353)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 tests/conftest.py                             |   5 +
 tests/engine/test_custom_executor.py          |   4 +-
 .../test_model_load_with_params.py            |  64 ++---
 .../decoder_only/language/test_jamba.py       |   7 +-
 .../decoder_only/language/test_mamba.py       |   7 +-
 .../decoder_only/language/test_models.py      |   7 +-
 .../vision_language/test_qwen2_vl.py          |  49 ++--
 .../embedding/language/test_cls_models.py     |   7 +-
 .../embedding/language/test_embedding.py      |   7 +-
 tests/quantization/test_compressed_tensors.py | 242 ++++++++++--------
 tests/quantization/test_fp8.py                |  52 ++--
 tests/quantization/test_lm_head.py            |  37 +--
 tests/quantization/test_quark.py              |  23 +-
 tests/tensorizer_loader/test_tensorizer.py    |  34 ++-
 vllm/engine/llm_engine.py                     |  17 +-
 vllm/entrypoints/llm.py                       |  52 ++--
 vllm/executor/executor_base.py                |  50 +++-
 vllm/executor/mp_distributed_executor.py      |   2 +-
 .../model_executor/model_loader/tensorizer.py |  17 +-
 vllm/spec_decode/ngram_worker.py              |  12 +-
 .../spec_decode/smaller_tp_proposer_worker.py |  12 +
 vllm/spec_decode/spec_decode_worker.py        |   4 +
 vllm/v1/executor/multiproc_executor.py        |  16 +-
 vllm/v1/worker/gpu_model_runner.py            |   3 +
 vllm/v1/worker/gpu_worker.py                  |   4 +
 vllm/worker/cpu_model_runner.py               |   3 +
 vllm/worker/hpu_model_runner.py               |   4 +
 vllm/worker/model_runner.py                   |   3 +
 vllm/worker/model_runner_base.py              |   9 +-
 vllm/worker/neuron_model_runner.py            |   3 +
 vllm/worker/openvino_model_runner.py          |   3 +
 vllm/worker/openvino_worker.py                |   4 +
 vllm/worker/tpu_model_runner.py               |   3 +
 vllm/worker/worker_base.py                    |  12 +
 vllm/worker/xpu_model_runner.py               |   3 +
 35 files changed, 474 insertions(+), 307 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index 95af4ac1eb17b..279c1bf9a3776 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -244,6 +244,7 @@ def video_assets() -> _VideoAssets:
 
 
 _T = TypeVar("_T", nn.Module, torch.Tensor, BatchEncoding, BatchFeature, dict)
+_R = TypeVar("_R")
 
 
 class HfRunner:
@@ -930,6 +931,10 @@ def score(
         req_outputs = self.model.score(text_1, text_2)
         return [req_output.outputs.score for req_output in req_outputs]
 
+    def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]:
+        executor = self.model.llm_engine.model_executor
+        return executor.apply_model(func)
+
     def __enter__(self):
         return self
 
diff --git a/tests/engine/test_custom_executor.py b/tests/engine/test_custom_executor.py
index fdfcd4f4c9d50..0e33f3662da82 100644
--- a/tests/engine/test_custom_executor.py
+++ b/tests/engine/test_custom_executor.py
@@ -51,7 +51,9 @@ def test_custom_executor(model, tmp_path):
         assert not os.path.exists(".marker")
 
         engine_args = EngineArgs(
-            model=model, distributed_executor_backend=CustomUniExecutor)
+            model=model,
+            distributed_executor_backend=CustomUniExecutor,
+        )
         engine = LLMEngine.from_engine_args(engine_args)
         sampling_params = SamplingParams(max_tokens=1)
 
diff --git a/tests/model_executor/test_model_load_with_params.py b/tests/model_executor/test_model_load_with_params.py
index 0609fd96825e3..9c1f784c1c93b 100644
--- a/tests/model_executor/test_model_load_with_params.py
+++ b/tests/model_executor/test_model_load_with_params.py
@@ -25,13 +25,12 @@ def test_model_loading_with_params(vllm_runner):
     with vllm_runner(model_name=MODEL_NAME,
                      revision=REVISION,
                      dtype="float16",
-                     max_model_len=MAX_MODEL_LEN) as model:
-        output = model.encode("Write a short story about a robot that"
-                              " dreams for the first time.\n")
+                     max_model_len=MAX_MODEL_LEN) as vllm_model:
+        output = vllm_model.encode("Write a short story about a robot that"
+                                   " dreams for the first time.\n")
 
-        model_config = model.model.llm_engine.model_config
-
-        model_tokenizer = model.model.llm_engine.tokenizer
+        model_config = vllm_model.model.llm_engine.model_config
+        model_tokenizer = vllm_model.model.llm_engine.tokenizer
 
         # asserts on the bert model config file
         assert model_config.encoder_config["max_seq_length"] == 512
@@ -46,11 +45,13 @@ def test_model_loading_with_params(vllm_runner):
         assert model_tokenizer.tokenizer_config["do_lower_case"]
         assert model_tokenizer.tokenizer.model_max_length == 512
 
-        model = model.model.llm_engine.model_executor\
-                     .driver_worker.model_runner.model
-        assert isinstance(model, BertEmbeddingModel)
-        assert model._pooler.pooling_type == PoolingType.CLS
-        assert model._pooler.normalize
+        def check_model(model):
+            assert isinstance(model, BertEmbeddingModel)
+            assert model._pooler.pooling_type == PoolingType.CLS
+            assert model._pooler.normalize
+
+        vllm_model.apply_model(check_model)
+
         # assert output
         assert output
 
@@ -64,13 +65,12 @@ def test_roberta_model_loading_with_params(vllm_runner):
     with vllm_runner(model_name=MODEL_NAME_ROBERTA,
                      revision=REVISION_ROBERTA,
                      dtype="float16",
-                     max_model_len=MAX_MODEL_LEN) as model:
-        output = model.encode("Write a short story about a robot that"
-                              " dreams for the first time.\n")
+                     max_model_len=MAX_MODEL_LEN) as vllm_model:
+        output = vllm_model.encode("Write a short story about a robot that"
+                                   " dreams for the first time.\n")
 
-        model_config = model.model.llm_engine.model_config
-
-        model_tokenizer = model.model.llm_engine.tokenizer
+        model_config = vllm_model.model.llm_engine.model_config
+        model_tokenizer = vllm_model.model.llm_engine.tokenizer
 
         # asserts on the bert model config file
         assert model_config.encoder_config["max_seq_length"] == 512
@@ -84,11 +84,12 @@ def test_roberta_model_loading_with_params(vllm_runner):
         assert model_tokenizer.tokenizer_id == "intfloat/multilingual-e5-large"
         assert not model_tokenizer.tokenizer_config["do_lower_case"]
 
-        model = model.model.llm_engine.model_executor\
-                     .driver_worker.model_runner.model
-        assert isinstance(model, RobertaEmbeddingModel)
-        assert model._pooler.pooling_type == PoolingType.MEAN
-        assert model._pooler.normalize
+        def check_model(model):
+            assert isinstance(model, RobertaEmbeddingModel)
+            assert model._pooler.pooling_type == PoolingType.MEAN
+            assert model._pooler.normalize
+
+        vllm_model.apply_model(check_model)
 
         # assert output
         assert output
@@ -103,17 +104,18 @@ def test_facebook_roberta_model_loading_with_params(vllm_runner):
     model_name = "FacebookAI/roberta-base"
     with vllm_runner(model_name=model_name,
                      dtype="float16",
-                     max_model_len=MAX_MODEL_LEN) as model:
-        output = model.encode("Write a short story about a robot that"
-                              " dreams for the first time.\n")
+                     max_model_len=MAX_MODEL_LEN) as vllm_model:
+        output = vllm_model.encode("Write a short story about a robot that"
+                                   " dreams for the first time.\n")
 
-        model_tokenizer = model.model.llm_engine.tokenizer
+        model_tokenizer = vllm_model.model.llm_engine.tokenizer
         assert model_tokenizer.tokenizer_id == model_name
 
-        model = model.model.llm_engine.model_executor\
-                     .driver_worker.model_runner.model
-        assert not hasattr(model, "lm_head")
-        assert isinstance(model, RobertaEmbeddingModel)
-        assert isinstance(model._pooler, CLSPool)
+        def check_model(model):
+            assert isinstance(model, RobertaEmbeddingModel)
+            assert not hasattr(model, "lm_head")
+            assert isinstance(model._pooler, CLSPool)
+
+        vllm_model.apply_model(check_model)
 
         assert output
diff --git a/tests/models/decoder_only/language/test_jamba.py b/tests/models/decoder_only/language/test_jamba.py
index 057b04349e8b7..2e06b10fbb827 100644
--- a/tests/models/decoder_only/language/test_jamba.py
+++ b/tests/models/decoder_only/language/test_jamba.py
@@ -33,10 +33,13 @@ def test_models(
 
     with vllm_runner(model, dtype=dtype) as vllm_model:
         vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+
         # This test is for verifying whether the model's extra_repr
         # can be printed correctly.
-        print(vllm_model.model.llm_engine.model_executor.driver_worker.
-              model_runner.model)
+        def print_model(model):
+            print(model)
+
+        vllm_model.apply_model(print_model)
 
     for i in range(len(example_prompts)):
         hf_output_ids, hf_output_str = hf_outputs[i]
diff --git a/tests/models/decoder_only/language/test_mamba.py b/tests/models/decoder_only/language/test_mamba.py
index 06739e8f02253..1ad4f5aae8f5b 100644
--- a/tests/models/decoder_only/language/test_mamba.py
+++ b/tests/models/decoder_only/language/test_mamba.py
@@ -51,10 +51,13 @@ def test_models(
 
     with vllm_runner(model, dtype=dtype) as vllm_model:
         vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+
         # This test is for verifying whether the model's extra_repr
         # can be printed correctly.
-        print(vllm_model.model.llm_engine.model_executor.driver_worker.
-              model_runner.model)
+        def print_model(model):
+            print(model)
+
+        vllm_model.apply_model(print_model)
 
     for i in range(len(example_prompts)):
         hf_output_ids, hf_output_str = hf_outputs[i]
diff --git a/tests/models/decoder_only/language/test_models.py b/tests/models/decoder_only/language/test_models.py
index 4e110366a09f3..c7efa4edbbc0a 100644
--- a/tests/models/decoder_only/language/test_models.py
+++ b/tests/models/decoder_only/language/test_models.py
@@ -73,10 +73,13 @@ def test_models(
     with vllm_runner(model, dtype=dtype) as vllm_model:
         vllm_outputs = vllm_model.generate_greedy_logprobs(
             example_prompts, max_tokens, num_logprobs)
+
         # This test is for verifying whether the model's extra_repr
         # can be printed correctly.
-        print(vllm_model.model.llm_engine.model_executor.driver_worker.
-              model_runner.model)
+        def print_model(model):
+            print(model)
+
+        vllm_model.apply_model(print_model)
 
     check_logprobs_close(
         outputs_0_lst=hf_outputs,
diff --git a/tests/models/decoder_only/vision_language/test_qwen2_vl.py b/tests/models/decoder_only/vision_language/test_qwen2_vl.py
index 2fd22f0cc88ec..5a485f3d81747 100644
--- a/tests/models/decoder_only/vision_language/test_qwen2_vl.py
+++ b/tests/models/decoder_only/vision_language/test_qwen2_vl.py
@@ -5,7 +5,6 @@
 import torch
 from PIL import Image
 
-from vllm.entrypoints.llm import LLM
 from vllm.multimodal.image import rescale_image_size
 from vllm.multimodal.video import rescale_video_size, sample_frames_from_video
 
@@ -69,7 +68,7 @@ class Qwen2VLPromptVideoEmbeddingInput(TypedDict):
 
 def batch_make_image_embeddings(
         image_batches: List[Union[Image.Image, List[Image.Image]]], processor,
-        llm: LLM) -> List[Qwen2VLPromptImageEmbeddingInput]:
+        llm: VllmRunner) -> List[Qwen2VLPromptImageEmbeddingInput]:
     """batched image embeddings for Qwen2-VL
 
     This will infer all images' embeddings in a single batch, 
@@ -106,16 +105,18 @@ def batch_make_image_embeddings(
     image_grid_thw = preprocess_result["image_grid_thw"]
 
     # pixel values to embeddings & grid_thws
-    with torch.no_grad():
-        visual = llm.llm_engine.model_executor.driver_worker. \
-            model_runner.model.visual
+    def get_image_embeds(model):
+        with torch.no_grad():
+            visual = model.visual
 
-        pixel_values_on_device = pixel_values.to(visual.device,
-                                                 dtype=visual.dtype)
-        image_grid_thw_on_device = image_grid_thw.to(visual.device,
-                                                     dtype=torch.int64)
-        image_embeds = visual(pixel_values_on_device,
-                              grid_thw=image_grid_thw_on_device)
+            pixel_values_on_device = pixel_values.to(visual.device,
+                                                     dtype=visual.dtype)
+            image_grid_thw_on_device = image_grid_thw.to(visual.device,
+                                                         dtype=torch.int64)
+            return visual(pixel_values_on_device,
+                          grid_thw=image_grid_thw_on_device)
+
+    image_embeds = torch.concat(llm.apply_model(get_image_embeds))
 
     # split into original batches
     result: List[Qwen2VLPromptImageEmbeddingInput] = []
@@ -150,7 +151,7 @@ def batch_make_image_embeddings(
 
 def batch_make_video_embeddings(
         video_batches: PromptVideoInput, processor,
-        llm: LLM) -> List[Qwen2VLPromptVideoEmbeddingInput]:
+        llm: VllmRunner) -> List[Qwen2VLPromptVideoEmbeddingInput]:
     """batched video embeddings for Qwen2-VL
 
     A NDArray represents a single video's all frames.
@@ -187,16 +188,18 @@ def batch_make_video_embeddings(
     video_grid_thw = preprocess_result["video_grid_thw"]
 
     # pixel values to embeddings & grid_thws
-    with torch.no_grad():
-        visual = llm.llm_engine.model_executor.driver_worker.\
-            model_runner.model.visual
+    def get_image_embeds(model):
+        with torch.no_grad():
+            visual = model.visual
+
+            pixel_values_on_device = pixel_values.to(visual.device,
+                                                     dtype=visual.dtype)
+            video_grid_thw_on_device = video_grid_thw.to(visual.device,
+                                                         dtype=torch.int64)
+            return visual(pixel_values_on_device,
+                          grid_thw=video_grid_thw_on_device)
 
-        pixel_values_on_device = pixel_values.to(visual.device,
-                                                 dtype=visual.dtype)
-        video_grid_thw_on_device = video_grid_thw.to(visual.device,
-                                                     dtype=torch.int64)
-        video_embeds = visual(pixel_values_on_device,
-                              grid_thw=video_grid_thw_on_device)
+    video_embeds = torch.concat(llm.apply_model(get_image_embeds))
 
     # split into original batches
     result: List[Qwen2VLPromptVideoEmbeddingInput] = []
@@ -278,9 +281,9 @@ def run_embedding_input_test(
                 max_tokens,
                 num_logprobs=num_logprobs,
                 images=batch_make_image_embeddings(
-                    images, processor, vllm_model.model) if images else None,
+                    images, processor, vllm_model) if images else None,
                 videos=batch_make_video_embeddings(
-                    videos, processor, vllm_model.model) if videos else None)
+                    videos, processor, vllm_model) if videos else None)
             for prompts, images, videos in inputs
         ]
 
diff --git a/tests/models/embedding/language/test_cls_models.py b/tests/models/embedding/language/test_cls_models.py
index 6673a9fc22f69..0cbe4afe96c0a 100644
--- a/tests/models/embedding/language/test_cls_models.py
+++ b/tests/models/embedding/language/test_cls_models.py
@@ -24,10 +24,13 @@ def test_classification_models(
 ) -> None:
     with vllm_runner(model, dtype=dtype) as vllm_model:
         vllm_outputs = vllm_model.classify(example_prompts)
+
         # This test is for verifying whether the model's extra_repr
         # can be printed correctly.
-        print(vllm_model.model.llm_engine.model_executor.driver_worker.
-              model_runner.model)
+        def print_model(model):
+            print(model)
+
+        vllm_model.apply_model(print_model)
 
     with hf_runner(model,
                    dtype=dtype,
diff --git a/tests/models/embedding/language/test_embedding.py b/tests/models/embedding/language/test_embedding.py
index bb47d14807b55..e17198e385475 100644
--- a/tests/models/embedding/language/test_embedding.py
+++ b/tests/models/embedding/language/test_embedding.py
@@ -62,10 +62,13 @@ def test_models(
                      max_model_len=None,
                      **vllm_extra_kwargs) as vllm_model:
         vllm_outputs = vllm_model.encode(example_prompts)
+
         # This test is for verifying whether the model's extra_repr
         # can be printed correctly.
-        print(vllm_model.model.llm_engine.model_executor.driver_worker.
-              model_runner.model)
+        def print_model(model):
+            print(model)
+
+        vllm_model.apply_model(print_model)
 
     check_embeddings_close(
         embeddings_0_lst=hf_outputs,
diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py
index 92436889ecffe..0cd86cef0a475 100644
--- a/tests/quantization/test_compressed_tensors.py
+++ b/tests/quantization/test_compressed_tensors.py
@@ -30,50 +30,55 @@
 def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
     model_path, strategy, quant_type, shape_0, is_symmetric = model_args
     with vllm_runner(model_path, enforce_eager=True) as llm:
-        model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
-        layer = model.model.layers[0]
-
-        qkv_proj = layer.self_attn.qkv_proj
-        o_proj = layer.self_attn.o_proj
-        gate_up_proj = layer.mlp.gate_up_proj
-        down_proj = layer.mlp.down_proj
-
-        # assert zp for symmetric and asymmetric cases
-        def zp_valid(zp: Optional[torch.Tensor]):
-            if is_symmetric:
-                return zp is None
-
-            return zp is not None and zp.dtype is torch.int32
-
-        assert zp_valid(qkv_proj.input_zero_point)
-        assert zp_valid(o_proj.input_zero_point)
-        assert zp_valid(gate_up_proj.input_zero_point)
-        assert zp_valid(down_proj.input_zero_point)
-
-        assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
-        assert isinstance(o_proj.quant_method, CompressedTensorsLinearMethod)
-        assert isinstance(gate_up_proj.quant_method,
-                          CompressedTensorsLinearMethod)
-        assert isinstance(down_proj.quant_method,
-                          CompressedTensorsLinearMethod)
-        assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8Int8)
-
-        assert qkv_proj.scheme.strategy == strategy
-        assert qkv_proj.scheme.is_static_input_scheme
-        expected_type = torch.int8
-
-        assert qkv_proj.weight.dtype is expected_type
-        assert o_proj.weight.dtype is expected_type
-        assert gate_up_proj.weight.dtype is expected_type
-
-        if qkv_proj.scheme.strategy == "tensor":
-            # Make sure it is a channelwise buffer
-            # After running process_weights_after_loading
-            assert len(qkv_proj.weight_scale.shape) == 2
-            assert qkv_proj.weight_scale.shape[0] == shape_0
-            assert qkv_proj.weight_scale.shape[1] == 1
-        assert qkv_proj.weight_scale.dtype is torch.float32
-        assert qkv_proj.input_scale.dtype is torch.float32
+
+        def check_model(model):
+            layer = model.model.layers[0]
+
+            qkv_proj = layer.self_attn.qkv_proj
+            o_proj = layer.self_attn.o_proj
+            gate_up_proj = layer.mlp.gate_up_proj
+            down_proj = layer.mlp.down_proj
+
+            # assert zp for symmetric and asymmetric cases
+            def zp_valid(zp: Optional[torch.Tensor]):
+                if is_symmetric:
+                    return zp is None
+
+                return zp is not None and zp.dtype is torch.int32
+
+            assert zp_valid(qkv_proj.input_zero_point)
+            assert zp_valid(o_proj.input_zero_point)
+            assert zp_valid(gate_up_proj.input_zero_point)
+            assert zp_valid(down_proj.input_zero_point)
+
+            assert isinstance(qkv_proj.quant_method,
+                              CompressedTensorsLinearMethod)
+            assert isinstance(o_proj.quant_method,
+                              CompressedTensorsLinearMethod)
+            assert isinstance(gate_up_proj.quant_method,
+                              CompressedTensorsLinearMethod)
+            assert isinstance(down_proj.quant_method,
+                              CompressedTensorsLinearMethod)
+            assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8Int8)
+
+            assert qkv_proj.scheme.strategy == strategy
+            assert qkv_proj.scheme.is_static_input_scheme
+            expected_type = torch.int8
+
+            assert qkv_proj.weight.dtype is expected_type
+            assert o_proj.weight.dtype is expected_type
+            assert gate_up_proj.weight.dtype is expected_type
+
+            if qkv_proj.scheme.strategy == "tensor":
+                # Make sure it is a channelwise buffer
+                # After running process_weights_after_loading
+                assert len(qkv_proj.weight_scale.shape) == 2
+                assert qkv_proj.weight_scale.shape[0] == shape_0
+                assert qkv_proj.weight_scale.shape[1] == 1
+            assert qkv_proj.weight_scale.dtype is torch.float32
+            assert qkv_proj.input_scale.dtype is torch.float32
+
+        llm.apply_model(check_model)
 
         output = llm.generate_greedy(["Hello my name is"], max_tokens=20)
         assert output
@@ -129,16 +134,20 @@ def test_compressed_tensors_no_enforce_eager(vllm_runner):
 def test_compressed_tensors_w8a8_dynamic_per_token(vllm_runner, model_args):
     model_path, strategy = model_args
     with vllm_runner(model_path, dtype=torch.float16) as llm:
-        model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
-        layer = model.model.layers[0]
 
-        qkv_proj = layer.self_attn.qkv_proj
+        def check_model(model):
+            layer = model.model.layers[0]
+
+            qkv_proj = layer.self_attn.qkv_proj
+
+            assert isinstance(qkv_proj.quant_method,
+                              CompressedTensorsLinearMethod)
+            assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8Int8)
+            assert not qkv_proj.scheme.is_static_input_scheme
+            assert qkv_proj.scheme.strategy == strategy
+            assert qkv_proj.weight.dtype is torch.int8
 
-        assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
-        assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8Int8)
-        assert not qkv_proj.scheme.is_static_input_scheme
-        assert qkv_proj.scheme.strategy == strategy
-        assert qkv_proj.weight.dtype is torch.int8
+        llm.apply_model(check_model)
 
         output = llm.generate_greedy(["Hello my name is"], max_tokens=20)
         assert output
@@ -152,19 +161,24 @@ def test_compressed_tensors_w8a8_dynamic_per_token(vllm_runner, model_args):
 def test_compressed_tensors_wNa16(vllm_runner, wNa16_args):
     model, strategy, group, pack_factor = wNa16_args
     with vllm_runner(model) as llm:
-        model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
-        layer = model.model.layers[0]
 
-        qkv_proj = layer.self_attn.qkv_proj
-        assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
-        assert isinstance(qkv_proj.scheme, CompressedTensorsWNA16)
+        def check_model(model):
+            layer = model.model.layers[0]
 
-        assert qkv_proj.scheme.strategy == strategy
-        assert qkv_proj.scheme.group_size == (-1 if group is None else group)
+            qkv_proj = layer.self_attn.qkv_proj
+            assert isinstance(qkv_proj.quant_method,
+                              CompressedTensorsLinearMethod)
+            assert isinstance(qkv_proj.scheme, CompressedTensorsWNA16)
 
-        assert qkv_proj.weight_packed.dtype is torch.int32
-        assert qkv_proj.weight_scale.dtype is torch.float16
-        assert qkv_proj.scheme.pack_factor == pack_factor
+            assert qkv_proj.scheme.strategy == strategy
+            assert qkv_proj.scheme.group_size == (-1
+                                                  if group is None else group)
+
+            assert qkv_proj.weight_packed.dtype is torch.int32
+            assert qkv_proj.weight_scale.dtype is torch.float16
+            assert qkv_proj.scheme.pack_factor == pack_factor
+
+        llm.apply_model(check_model)
 
         output = llm.generate_greedy("Hello my name is", max_tokens=20)
         assert output
@@ -173,14 +187,18 @@ def test_compressed_tensors_wNa16(vllm_runner, wNa16_args):
 def test_compressed_tensors_w4a16_marlin24(vllm_runner):
     model_path = "nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t"
     with vllm_runner(model_path) as llm:
-        model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
-        layer = model.model.layers[0]
 
-        qkv_proj = layer.self_attn.qkv_proj
+        def check_model(model):
+            layer = model.model.layers[0]
+
+            qkv_proj = layer.self_attn.qkv_proj
 
-        assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
-        assert isinstance(qkv_proj.scheme, CompressedTensorsW4A16Sparse24)
-        assert qkv_proj.weight_packed.dtype is torch.int32
+            assert isinstance(qkv_proj.quant_method,
+                              CompressedTensorsLinearMethod)
+            assert isinstance(qkv_proj.scheme, CompressedTensorsW4A16Sparse24)
+            assert qkv_proj.weight_packed.dtype is torch.int32
+
+        llm.apply_model(check_model)
 
         output = llm.generate_greedy("Hello my name is", max_tokens=20)
         assert output
@@ -189,23 +207,27 @@ def test_compressed_tensors_w4a16_marlin24(vllm_runner):
 def test_compressed_tensors_fp8(vllm_runner):
     model_path = "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
     with vllm_runner(model_path) as llm:
-        model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
-        layer = model.model.layers[0]
 
-        qkv_proj = layer.self_attn.qkv_proj
+        def check_model(model):
+            layer = model.model.layers[0]
 
-        assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
-        assert isinstance(
-            qkv_proj.scheme,
-            (CompressedTensorsW8A8Fp8, CompressedTensorsW8A16Fp8))
+            qkv_proj = layer.self_attn.qkv_proj
 
-        assert qkv_proj.input_scale.dtype is torch.float32
+            assert isinstance(qkv_proj.quant_method,
+                              CompressedTensorsLinearMethod)
+            assert isinstance(
+                qkv_proj.scheme,
+                (CompressedTensorsW8A8Fp8, CompressedTensorsW8A16Fp8))
 
-        if isinstance(qkv_proj.scheme, CompressedTensorsW8A8Fp8):
-            assert len(qkv_proj.input_scale.shape) == 0
-            assert qkv_proj.weight.dtype is torch.float8_e4m3fn
-            assert qkv_proj.weight_scale.dtype is torch.float32
-            assert len(qkv_proj.weight_scale.shape) == 0
+            assert qkv_proj.input_scale.dtype is torch.float32
+
+            if isinstance(qkv_proj.scheme, CompressedTensorsW8A8Fp8):
+                assert len(qkv_proj.input_scale.shape) == 0
+                assert qkv_proj.weight.dtype is torch.float8_e4m3fn
+                assert qkv_proj.weight_scale.dtype is torch.float32
+                assert len(qkv_proj.weight_scale.shape) == 0
+
+        llm.apply_model(check_model)
 
         output = llm.generate_greedy("Hello my name is", max_tokens=20)
         assert output
@@ -248,12 +270,15 @@ def _test_2of4_quant_models(qkv_proj, weight_strategy, input_strategy):
 def test_compressed_tensors_2of4_quant_fp8(vllm_runner, args_2of4):
     model, weight_strategy, input_strategy = args_2of4
     with vllm_runner(model) as llm:
-        model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
-        layer = model.model.layers[0]
 
-        qkv_proj = layer.self_attn.qkv_proj
-        assert qkv_proj.scheme.weights_dtype == torch.float8_e4m3fn
-        _test_2of4_quant_models(qkv_proj, weight_strategy, input_strategy)
+        def check_model(model):
+            layer = model.model.layers[0]
+
+            qkv_proj = layer.self_attn.qkv_proj
+            assert qkv_proj.scheme.weights_dtype == torch.float8_e4m3fn
+            _test_2of4_quant_models(qkv_proj, weight_strategy, input_strategy)
+
+        llm.apply_model(check_model)
 
         output = llm.generate_greedy("Hello my name is", max_tokens=20)
         print(output)
@@ -273,12 +298,15 @@ def test_compressed_tensors_2of4_quant_fp8(vllm_runner, args_2of4):
 def test_compressed_tensors_2of4_quant_int8(vllm_runner, args_2of4):
     model, weight_strategy, input_strategy = args_2of4
     with vllm_runner(model) as llm:
-        model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
-        layer = model.model.layers[0]
 
-        qkv_proj = layer.self_attn.qkv_proj
-        assert qkv_proj.scheme.weights_dtype == torch.int8
-        _test_2of4_quant_models(qkv_proj, weight_strategy, input_strategy)
+        def check_model(model):
+            layer = model.model.layers[0]
+
+            qkv_proj = layer.self_attn.qkv_proj
+            assert qkv_proj.scheme.weights_dtype == torch.int8
+            _test_2of4_quant_models(qkv_proj, weight_strategy, input_strategy)
+
+        llm.apply_model(check_model)
 
         output = llm.generate_greedy("Hello my name is", max_tokens=20)
         print(output)
@@ -293,20 +321,24 @@ def test_compressed_tensors_2of4_quant_int8(vllm_runner, args_2of4):
 def test_compressed_tensors_2of4_sparse(vllm_runner, args_2of4):
     model = args_2of4
     with vllm_runner(model) as llm:
-        model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
-        layer = model.model.layers[0]
-
-        qkv_proj = layer.self_attn.qkv_proj
-        assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
-        assert isinstance(qkv_proj.scheme, CompressedTensors24)
-
-        assert qkv_proj.scheme.weight_quant is None
-        assert qkv_proj.scheme.input_quant is None
-        assert not qkv_proj.scheme.quantized
-        assert qkv_proj.quant_method.quantization_config.sparsity_scheme_map
-        sparsity_map = qkv_proj.quant_method.quantization_config.sparsity_scheme_map  # noqa: E501
-        assert sparsity_map.get("Linear").format == "dense"
-        assert sparsity_map.get("Linear").sparsity_structure == "2:4"
+
+        def check_model(model):
+            layer = model.model.layers[0]
+
+            qkv_proj = layer.self_attn.qkv_proj
+            assert isinstance(qkv_proj.quant_method,
+                              CompressedTensorsLinearMethod)
+            assert isinstance(qkv_proj.scheme, CompressedTensors24)
+
+            assert qkv_proj.scheme.weight_quant is None
+            assert qkv_proj.scheme.input_quant is None
+            assert not qkv_proj.scheme.quantized
+            assert qkv_proj.quant_method.quantization_config.sparsity_scheme_map
+            sparsity_map = qkv_proj.quant_method.quantization_config.sparsity_scheme_map  # noqa: E501
+            assert sparsity_map.get("Linear").format == "dense"
+            assert sparsity_map.get("Linear").sparsity_structure == "2:4"
+
+        llm.apply_model(check_model)
 
         output = llm.generate_greedy("Hello my name is", max_tokens=20)
         print(output)
diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py
index a0c1d7e24c503..4bff734746297 100644
--- a/tests/quantization/test_fp8.py
+++ b/tests/quantization/test_fp8.py
@@ -49,13 +49,17 @@ def test_model_load_and_run(vllm_runner, model_id: str, force_marlin: bool,
 def test_kv_cache_model_load_and_run(vllm_runner, model_id: str):
     with vllm_runner(model_id, kv_cache_dtype="fp8") as llm:
 
-        model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
-        attn = model.model.layers[0].self_attn.attn
-        assert isinstance(attn.quant_method, Fp8KVCacheMethod)
-        # NOTE: it is valid for scales to be 1.0 (default value), but we know
-        # these checkpoints have scales < 1.0
-        assert 0.0 < attn._k_scale < 1.0
-        assert 0.0 < attn._v_scale < 1.0
+        def check_model(model):
+            attn = model.model.layers[0].self_attn.attn
+
+            assert isinstance(attn.quant_method, Fp8KVCacheMethod)
+
+            # NOTE: it is valid for scales to be 1.0 (default value), but
+            # we know these checkpoints have scales < 1.0
+            assert 0.0 < attn._k_scale < 1.0
+            assert 0.0 < attn._v_scale < 1.0
+
+        llm.apply_model(check_model)
 
         # note: this does not test accuracy, just that we can run through
         # see lm-eval tests for accuracy
@@ -77,22 +81,24 @@ def test_load_fp16_model(vllm_runner, kv_cache_dtype: str, force_marlin: bool,
                      quantization="fp8",
                      kv_cache_dtype=kv_cache_dtype) as llm:
 
-        model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
-        fc1 = model.model.decoder.layers[0].fc1
-        assert isinstance(fc1.quant_method, Fp8LinearMethod)
-        if kv_cache_dtype == "fp8":
-            attn = model.model.decoder.layers[0].self_attn.attn
-            assert isinstance(attn.quant_method, Fp8KVCacheMethod)
-            assert attn._k_scale == 1.0
-            assert attn._v_scale == 1.0
-
-        if current_platform.has_device_capability(89) and not force_marlin:
-            # For GPUs with hardware support, we keep weights in fp8
-            assert fc1.weight.dtype == torch.float8_e4m3fn
-        else:
-            # For GPUs without hardware support, we pack the fp8 weights
-            # for weight-only quantization using Marlin kernels
-            assert fc1.weight.dtype == torch.int32
+        def check_model(model):
+            fc1 = model.model.decoder.layers[0].fc1
+            assert isinstance(fc1.quant_method, Fp8LinearMethod)
+            if kv_cache_dtype == "fp8":
+                attn = model.model.decoder.layers[0].self_attn.attn
+                assert isinstance(attn.quant_method, Fp8KVCacheMethod)
+                assert attn._k_scale == 1.0
+                assert attn._v_scale == 1.0
+
+            if current_platform.has_device_capability(89) and not force_marlin:
+                # For GPUs with hardware support, we keep weights in fp8
+                assert fc1.weight.dtype == torch.float8_e4m3fn
+            else:
+                # For GPUs without hardware support, we pack the fp8 weights
+                # for weight-only quantization using Marlin kernels
+                assert fc1.weight.dtype == torch.int32
+
+        llm.apply_model(check_model)
 
 
 @pytest.mark.skipif(not is_quant_method_supported("fp8"),
diff --git a/tests/quantization/test_lm_head.py b/tests/quantization/test_lm_head.py
index ad526a4065101..fa2d9645ea47f 100644
--- a/tests/quantization/test_lm_head.py
+++ b/tests/quantization/test_lm_head.py
@@ -28,20 +28,23 @@ def test_lm_head(
     model_lm_head_quant: Tuple[str, bool],
 ) -> None:
     model, lm_head_quantized = model_lm_head_quant
-    vllm_model = vllm_runner(model, dtype=torch.float16, max_model_len=2048)
-
-    lm_head_layer = (vllm_model.model.llm_engine.model_executor.driver_worker.
-                     model_runner.model.lm_head)
-
-    if lm_head_quantized:
-        assert isinstance(
-            lm_head_layer.linear_method,
-            (GPTQLinearMethod, GPTQMarlinLinearMethod, MarlinLinearMethod))
-    else:
-        assert isinstance(lm_head_layer.linear_method,
-                          UnquantizedEmbeddingMethod)
-
-    print(
-        vllm_model.generate_greedy(prompts=["Hello my name is"],
-                                   max_tokens=10)[0][1])
-    del vllm_model
+
+    with vllm_runner(model, dtype=torch.float16,
+                     max_model_len=2048) as vllm_model:
+
+        def check_model(model):
+            lm_head_layer = model.lm_head
+
+            if lm_head_quantized:
+                assert isinstance(lm_head_layer.linear_method,
+                                  (GPTQLinearMethod, GPTQMarlinLinearMethod,
+                                   MarlinLinearMethod))
+            else:
+                assert isinstance(lm_head_layer.linear_method,
+                                  UnquantizedEmbeddingMethod)
+
+        vllm_model.apply_model(check_model)
+
+        print(
+            vllm_model.generate_greedy(prompts=["Hello my name is"],
+                                       max_tokens=10)[0][1])
diff --git a/tests/quantization/test_quark.py b/tests/quantization/test_quark.py
index 27493a682b746..11382ad708faa 100644
--- a/tests/quantization/test_quark.py
+++ b/tests/quantization/test_quark.py
@@ -12,19 +12,22 @@
 def test_quark_fp8(vllm_runner):
     model_path = "amd/Llama-3.1-8B-Instruct-FP8-KV-Quark-test"
     with vllm_runner(model_path) as llm:
-        model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
-        layer = model.model.layers[0]
 
-        qkv_proj = layer.self_attn.qkv_proj
+        def check_model(model):
+            layer = model.model.layers[0]
 
-        assert isinstance(qkv_proj.quant_method, QuarkLinearMethod)
-        assert isinstance(qkv_proj.scheme, QuarkW8A8Fp8)
+            qkv_proj = layer.self_attn.qkv_proj
 
-        if isinstance(qkv_proj.scheme, QuarkW8A8Fp8):
-            assert len(qkv_proj.input_scale.shape) == 0
-            assert qkv_proj.weight.dtype is torch.float8_e4m3fn
-            #assert qkv_proj.weight.dtype is torch.float8_e4m3fnuz
-            assert len(qkv_proj.weight_scale.shape) == 0
+            assert isinstance(qkv_proj.quant_method, QuarkLinearMethod)
+            assert isinstance(qkv_proj.scheme, QuarkW8A8Fp8)
+
+            if isinstance(qkv_proj.scheme, QuarkW8A8Fp8):
+                assert len(qkv_proj.input_scale.shape) == 0
+                assert qkv_proj.weight.dtype is torch.float8_e4m3fn
+                #assert qkv_proj.weight.dtype is torch.float8_e4m3fnuz
+                assert len(qkv_proj.weight_scale.shape) == 0
+
+        llm.apply_model(check_model)
 
         output = llm.generate_greedy("Hello my name is", max_tokens=20)
         assert output
diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py
index bf409d2d97aa1..6e7eec1c6ab34 100644
--- a/tests/tensorizer_loader/test_tensorizer.py
+++ b/tests/tensorizer_loader/test_tensorizer.py
@@ -3,6 +3,7 @@
 import os
 import pathlib
 import subprocess
+from functools import partial
 from unittest.mock import MagicMock, patch
 
 import openai
@@ -24,7 +25,6 @@
 # yapf: enable
 from vllm.utils import PlaceholderModule, import_from_path
 
-from ..conftest import VllmRunner
 from ..utils import VLLM_PATH, RemoteOpenAIServer
 from .conftest import retry_until_skip
 
@@ -58,16 +58,6 @@ def is_curl_installed():
         return False
 
 
-def get_torch_model(vllm_runner: VllmRunner):
-    return vllm_runner \
-        .model \
-        .llm_engine \
-        .model_executor \
-        .driver_worker \
-        .model_runner \
-        .model
-
-
 def write_keyfile(keyfile_path: str):
     encryption_params = EncryptionParams.random()
     pathlib.Path(keyfile_path).parent.mkdir(parents=True, exist_ok=True)
@@ -121,8 +111,10 @@ def test_deserialized_encrypted_vllm_model_has_same_outputs(
 
         config_for_serializing = TensorizerConfig(tensorizer_uri=model_path,
                                                   encryption_keyfile=key_path)
-        serialize_vllm_model(get_torch_model(vllm_model),
-                             config_for_serializing)
+
+        vllm_model.apply_model(
+            partial(serialize_vllm_model,
+                    tensorizer_config=config_for_serializing))
 
     config_for_deserializing = TensorizerConfig(tensorizer_uri=model_path,
                                                 encryption_keyfile=key_path)
@@ -175,8 +167,10 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path):
     with vllm_runner(model_ref, ) as vllm_model:
         model_path = tmp_path / (model_ref + ".tensors")
 
-        serialize_vllm_model(get_torch_model(vllm_model),
-                             TensorizerConfig(tensorizer_uri=model_path))
+        vllm_model.apply_model(
+            partial(
+                serialize_vllm_model,
+                tensorizer_config=TensorizerConfig(tensorizer_uri=model_path)))
 
     with vllm_runner(
             model_ref,
@@ -215,8 +209,10 @@ def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path):
     with vllm_runner(model_ref, ) as vllm_model:
         model_path = tmp_path / (model_ref + ".tensors")
 
-        serialize_vllm_model(get_torch_model(vllm_model),
-                             TensorizerConfig(tensorizer_uri=model_path))
+        vllm_model.apply_model(
+            partial(
+                serialize_vllm_model,
+                tensorizer_config=TensorizerConfig(tensorizer_uri=model_path)))
 
         model_loader_extra_config = {
             "tensorizer_uri": str(model_path),
@@ -337,7 +333,9 @@ def test_vllm_tensorized_model_has_same_outputs(vllm_runner, tmp_path):
 
     with vllm_runner(model_ref) as vllm_model:
         outputs = vllm_model.generate(prompts, sampling_params)
-        serialize_vllm_model(get_torch_model(vllm_model), config)
+
+        vllm_model.apply_model(
+            partial(serialize_vllm_model, tensorizer_config=config))
 
         assert is_vllm_tensorized(config)
 
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index b6bba1d67b408..6a6b4a14a4c49 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -5,10 +5,10 @@
 from contextlib import contextmanager
 from dataclasses import dataclass
 from functools import partial
-from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Deque, Dict,
-                    Iterable, List, Mapping, NamedTuple, Optional)
+from typing import (TYPE_CHECKING, Callable, ClassVar, Deque, Dict, Iterable,
+                    List, Mapping, NamedTuple, Optional)
 from typing import Sequence as GenericSequence
-from typing import Set, Tuple, Type, Union, cast, overload
+from typing import Set, Type, Union, cast, overload
 
 import torch
 from typing_extensions import TypeVar, deprecated
@@ -1818,17 +1818,6 @@ def start_profile(self) -> None:
     def stop_profile(self) -> None:
         self.model_executor.stop_profile()
 
-    def collective_rpc(self,
-                       method: Union[str, Callable],
-                       timeout: Optional[float] = None,
-                       args: Tuple = (),
-                       kwargs: Optional[Dict] = None) -> List[Any]:
-        """
-        See LLM.collective_rpc for more details.
-        """
-        return self.model_executor.collective_rpc(method, timeout, args,
-                                                  kwargs)
-
     def check_health(self) -> None:
         if self.tokenizer:
             self.tokenizer.check_health()
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 0cfe6be9ac767..27386daa4bbc9 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -5,8 +5,9 @@
                     Tuple, Type, Union, cast, overload)
 
 import cloudpickle
+import torch.nn as nn
 from tqdm import tqdm
-from typing_extensions import deprecated
+from typing_extensions import TypeVar, deprecated
 
 from vllm import envs
 from vllm.beam_search import (BeamSearchInstance, BeamSearchOutput,
@@ -42,6 +43,8 @@
 
 logger = init_logger(__name__)
 
+_R = TypeVar("_R", default=Any)
+
 
 class LLM:
     """An LLM for generating texts from given prompts and sampling parameters.
@@ -464,25 +467,42 @@ def generate(
         return self.engine_class.validate_outputs(outputs, RequestOutput)
 
     def collective_rpc(self,
-                       method: Union[str, Callable],
+                       method: Union[str, Callable[..., _R]],
                        timeout: Optional[float] = None,
                        args: Tuple = (),
-                       kwargs: Optional[Dict] = None) -> List[Any]:
+                       kwargs: Optional[Dict[str, Any]] = None) -> List[_R]:
+        """
+        Execute an RPC call on all workers.
+
+        Args:
+            method: Name of the worker method to execute, or a callable that
+                is serialized and sent to all workers to execute.
+
+                If the method is a callable, it should accept an additional
+                `self` argument, in addition to the arguments passed in `args`
+                and `kwargs`. The `self` argument will be the worker object.
+            timeout: Maximum time in seconds to wait for execution. Raises a
+                :exc:`TimeoutError` on timeout. `None` means wait indefinitely.
+            args: Positional arguments to pass to the worker method.
+            kwargs: Keyword arguments to pass to the worker method.
+
+        Returns:
+            A list containing the results from each worker.
+        
+        Note:
+            It is recommended to use this API to only pass control messages,
+            and set up data-plane communication to pass data.
+        """
+        executor = self.llm_engine.model_executor
+        return executor.collective_rpc(method, timeout, args, kwargs)
+
+    def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]:
         """
-        Run a method on all workers, with homogeneous arguments.
-        The main extension point for the LLM entrypoint.
-        Users can provide custom worker class through `worker_cls`
-        argument, and implement new methods in the worker class.
-        Then, users can call the new methods through this API.
-        It is recommended to use this API to only pass control messages,
-        and set up data-plane communication to pass data.
-        The method can also be a callable, which will be serialized
-        and sent to all workers to execute.
-        If the method is a callable, it should accept an additional
-        `self` argument, in addition to the arguments passed in `args`
-        and `kwargs`. The `self` argument will be the worker object.
+        Run a function directly on the model inside each worker,
+        returning the result for each of them.
         """
-        return self.llm_engine.collective_rpc(method, timeout, args, kwargs)
+        executor = self.llm_engine.model_executor
+        return executor.apply_model(func)
 
     def beam_search(
         self,
diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py
index e5952b388c543..859e105f15d97 100644
--- a/vllm/executor/executor_base.py
+++ b/vllm/executor/executor_base.py
@@ -3,6 +3,9 @@
 from typing import (Any, Awaitable, Callable, Dict, List, Optional, Set, Tuple,
                     Union)
 
+import torch.nn as nn
+from typing_extensions import TypeVar
+
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
@@ -11,9 +14,12 @@
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sequence import ExecuteModelRequest, PoolerOutput
 from vllm.utils import make_async
+from vllm.worker.worker_base import WorkerBase
 
 logger = init_logger(__name__)
 
+_R = TypeVar("_R", default=Any)
+
 
 class ExecutorBase(ABC):
     """Base class for all executors.
@@ -44,22 +50,37 @@ def __init__(
 
     @abstractmethod
     def _init_executor(self) -> None:
-        pass
+        raise NotImplementedError
 
     @abstractmethod
     def collective_rpc(self,
-                       method: Union[str, Callable],
+                       method: Union[str, Callable[..., _R]],
                        timeout: Optional[float] = None,
                        args: Tuple = (),
-                       kwargs: Optional[Dict] = None) -> List[Any]:
+                       kwargs: Optional[Dict[str, Any]] = None) -> List[_R]:
         """
-        The main interface of the executor to run a method on all workers,
-        with homogeneous arguments.
-        If the args are heterogeneous, then we can pack them into a list,
-        and unpack them in the method of every worker, because every worker
-        knows their own rank.
+        Execute an RPC call on all workers.
+
+        Args:
+            method: Name of the worker method to execute, or a callable that
+                is serialized and sent to all workers to execute.
+
+                If the method is a callable, it should accept an additional
+                `self` argument, in addition to the arguments passed in `args`
+                and `kwargs`. The `self` argument will be the worker object.
+            timeout: Maximum time in seconds to wait for execution. Raises a
+                :exc:`TimeoutError` on timeout. `None` means wait indefinitely.
+            args: Positional arguments to pass to the worker method.
+            kwargs: Keyword arguments to pass to the worker method.
+
+        Returns:
+            A list containing the results from each worker.
+        
+        Note:
+            It is recommended to use this API to only pass control messages,
+            and set up data-plane communication to pass data.
         """
-        pass
+        raise NotImplementedError
 
     def determine_num_available_blocks(self) -> Tuple[int, int]:
         """Determine the number of available blocks for the GPU KV cache and
@@ -97,6 +118,17 @@ def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None:
         self.collective_rpc("initialize_cache",
                             args=(num_gpu_blocks, num_cpu_blocks))
 
+    def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]:
+        """
+        Run a function directly on the model inside each worker,
+        returning the result for each of them.
+        """
+
+        def rpc_func(worker: WorkerBase) -> _R:
+            return func(worker.get_model())
+
+        return self.collective_rpc(rpc_func)
+
     def execute_model(
         self, execute_model_req: ExecuteModelRequest
     ) -> Optional[List[Union[SamplerOutput, PoolerOutput]]]:
diff --git a/vllm/executor/mp_distributed_executor.py b/vllm/executor/mp_distributed_executor.py
index a80b0ee8b3122..78c86321d861d 100644
--- a/vllm/executor/mp_distributed_executor.py
+++ b/vllm/executor/mp_distributed_executor.py
@@ -148,7 +148,7 @@ def _run_workers(
         async_run_tensor_parallel_workers_only: bool = False,
         max_concurrent_workers: Optional[int] = None,
         **kwargs,
-    ) -> Any:
+    ) -> List[Any]:
         """Runs the given method on all workers.
 
         Args:
diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py
index fbd4937112e11..5b4757072353f 100644
--- a/vllm/model_executor/model_loader/tensorizer.py
+++ b/vllm/model_executor/model_loader/tensorizer.py
@@ -459,16 +459,7 @@ def tensorize_vllm_model(engine_args: EngineArgs,
             stream.write(encryption_params.key)
 
     engine = LLMEngine.from_engine_args(engine_args)
-    if tensorizer_config._is_sharded:
-        # if the engine is a distributed engine (for tensor parallel) then each
-        # worker shard needs to serialize its part of the model.
-        engine.model_executor._run_workers(
-            "save_tensorized_model",
-            tensorizer_config=tensorizer_config,
-        )
-    else:
-        # with a single worker, we can get to the underlying model directly
-        serialize_vllm_model(
-            engine.model_executor.driver_worker.model_runner.model,
-            tensorizer_config,
-        )
+    engine.model_executor.collective_rpc(
+        "save_tensorized_model",
+        kwargs=dict(tensorizer_config=tensorizer_config),
+    )
diff --git a/vllm/spec_decode/ngram_worker.py b/vllm/spec_decode/ngram_worker.py
index bb6b99135580e..e906b1789cde8 100644
--- a/vllm/spec_decode/ngram_worker.py
+++ b/vllm/spec_decode/ngram_worker.py
@@ -2,6 +2,7 @@
 from typing import List, Optional, Set, Tuple
 
 import torch
+import torch.nn as nn
 
 from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.sequence import ExecuteModelRequest
@@ -10,6 +11,10 @@
 from vllm.spec_decode.top1_proposer import Top1Proposer
 
 
+class _DummyModel(nn.Module):
+    pass
+
+
 class NGramWorker(NonLLMProposerWorkerBase):
     """NGramWorker provides a light drafter without need for model.
 
@@ -36,7 +41,6 @@ def set_ngram_window_size(self, ngram_prompt_lookup_min: int,
 
     def init_device(self):
         self.device = torch.device(f"{self.device_type}:{self.local_rank}")
-        self.load_model = lambda *args, **kwargs: None
 
         # Current NGramWorker only supports Top1Proposer
         self._proposer = Top1Proposer(
@@ -45,6 +49,12 @@ def init_device(self):
             vocab_size=self.vocab_size,
         )
 
+    def load_model(self) -> None:
+        pass  # Dummy
+
+    def get_model(self) -> nn.Module:
+        return _DummyModel()
+
     def sampler_output(
         self,
         execute_model_req: ExecuteModelRequest,
diff --git a/vllm/spec_decode/smaller_tp_proposer_worker.py b/vllm/spec_decode/smaller_tp_proposer_worker.py
index 8896b7dbc6b8a..c6ff5e52f9388 100644
--- a/vllm/spec_decode/smaller_tp_proposer_worker.py
+++ b/vllm/spec_decode/smaller_tp_proposer_worker.py
@@ -1,6 +1,7 @@
 from typing import List, Optional, Set, Tuple
 
 import torch
+import torch.nn as nn
 
 from vllm.distributed.parallel_state import (get_tp_group,
                                              init_model_parallel_group,
@@ -15,6 +16,10 @@
 logger = init_logger(__name__)
 
 
+class _DummyModel(nn.Module):
+    pass
+
+
 class SmallerTpProposerWorker(ProposerWorkerBase):
     """Class which allows a speculative draft model to run with smaller tensor
     parallel degree than target model.
@@ -139,6 +144,13 @@ def get_spec_proposals(
             return self._worker.get_spec_proposals(
                 execute_model_req, seq_ids_with_bonus_token_in_last_step)
 
+    def get_model(self) -> nn.Module:
+        if self._is_dummy:
+            return _DummyModel()
+
+        with self._patch_tensor_parallel_group():
+            return self._worker.get_model()
+
     def execute_model(
         self,
         execute_model_req: Optional[ExecuteModelRequest] = None
diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py
index 540d118d65ecb..0d66ede3d907a 100644
--- a/vllm/spec_decode/spec_decode_worker.py
+++ b/vllm/spec_decode/spec_decode_worker.py
@@ -4,6 +4,7 @@
 from typing import Any, Dict, List, Optional, Set, Tuple, Type
 
 import torch
+import torch.nn as nn
 
 from vllm.config import ParallelConfig, SpeculativeConfig, VllmConfig
 from vllm.distributed.communication_op import broadcast_tensor_dict
@@ -403,6 +404,9 @@ def initialize_cache(self, num_gpu_blocks: int,
         self.proposer_worker.initialize_cache(num_gpu_blocks=num_gpu_blocks,
                                               num_cpu_blocks=num_cpu_blocks)
 
+    def get_model(self) -> nn.Module:
+        return self.scorer_worker.get_model()
+
     @torch.inference_mode()
     def execute_model(
         self,
diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py
index 93026029ad13e..f6cf35da0106b 100644
--- a/vllm/v1/executor/multiproc_executor.py
+++ b/vllm/v1/executor/multiproc_executor.py
@@ -94,22 +94,12 @@ def collective_rpc(self,
                        timeout: Optional[float] = None,
                        args: Tuple = (),
                        kwargs: Optional[Dict] = None) -> List[Any]:
-        """
-        Execute an RPC call on workers.
-        
-        Args:
-            method: Name of the worker method to execute
-            timeout: Maximum time in seconds to wait for execution. Rases a
-                     TimeoutError on timeout. None means wait indefinitely.
-            args: Positional arguments to pass to the worker method
-            kwargs: Keyword arguments to pass to the worker method
-
-        Returns:
-            List of results from each worker
-        """
         start_time = time.monotonic()
         kwargs = kwargs or {}
 
+        # NOTE: If the args are heterogeneous, then we pack them into a list,
+        # and unpack them in the method of every worker, because every worker
+        # knows their own rank.
         try:
             if isinstance(method, str):
                 send_method = method
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 87a1cd7f9e627..2350074c23a59 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -689,6 +689,9 @@ def _gather_encoder_outputs(
                 encoder_outputs.append(encoder_output[start_idx:end_idx])
         return encoder_outputs
 
+    def get_model(self) -> nn.Module:
+        return self.model
+
     @torch.inference_mode()
     def execute_model(
         self,
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index 4fb4197f1822f..0929e64d58f1e 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -5,6 +5,7 @@
 
 import torch
 import torch.distributed
+import torch.nn as nn
 
 import vllm.envs as envs
 from vllm.config import CacheConfig, ModelConfig, ParallelConfig, VllmConfig
@@ -176,6 +177,9 @@ def compile_or_warm_up_model(self) -> None:
         # the model initialization and profiling.
         set_random_seed(self.model_config.seed)
 
+    def get_model(self) -> nn.Module:
+        return self.model_runner.get_model()
+
     @torch.inference_mode()
     def execute_model(
         self,
diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py
index 303d9a15e9c3c..abbf6450ab7f6 100644
--- a/vllm/worker/cpu_model_runner.py
+++ b/vllm/worker/cpu_model_runner.py
@@ -509,6 +509,9 @@ def load_model(self) -> None:
             )
             self.model = self.lora_manager.create_lora_manager(self.model)
 
+    def get_model(self) -> nn.Module:
+        return self.model
+
     def _prepare_model_input_tensors(
         self,
         seq_group_metadata_list: List[SequenceGroupMetadata],
diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index 260ffaf27f9a1..4c8f69e449393 100644
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -21,6 +21,7 @@
 import habana_frameworks.torch as htorch
 import habana_frameworks.torch.internal.bridge_config as bc
 import torch
+import torch.nn as nn
 from vllm_hpu_extension.ops import LoraMask as LoraMask
 from vllm_hpu_extension.profiler import (HabanaHighLevelProfiler,
                                          HabanaMemoryProfiler, format_bytes)
@@ -676,6 +677,9 @@ def load_model(self) -> None:
         msg = f"Loading model weights took in total {m.get_summary_string()}"
         logger.info(msg)
 
+    def get_model(self) -> nn.Module:
+        return self.model
+
     def _use_graphs(self, batch_size, seq_len, is_prompt):
         if self.enforce_eager:
             return False
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index ae8b7f97c827d..cb2ff0c934da3 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -1176,6 +1176,9 @@ def load_model(self) -> None:
                 fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE,
                 backend=backend)
 
+    def get_model(self) -> nn.Module:
+        return self.model
+
     def save_sharded_state(
         self,
         path: str,
diff --git a/vllm/worker/model_runner_base.py b/vllm/worker/model_runner_base.py
index c7abad7e0258d..acfd6d0b03f62 100644
--- a/vllm/worker/model_runner_base.py
+++ b/vllm/worker/model_runner_base.py
@@ -7,6 +7,7 @@
                     Optional, Type, TypeVar)
 
 import torch
+import torch.nn as nn
 from torch import is_tensor
 
 from vllm.config import VllmConfig
@@ -264,6 +265,10 @@ def prepare_model_input(
         """
         raise NotImplementedError
 
+    @abstractmethod
+    def get_model(self) -> nn.Module:
+        raise NotImplementedError
+
     def execute_model(
         self,
         model_input: T,
@@ -297,9 +302,9 @@ class ModelRunnerWrapperBase:
 
     def __init__(
         self,
-        moderl_runner: ModelRunnerBase,
+        model_runner: ModelRunnerBase,
     ) -> None:
-        self.model_runner: ModelRunnerBase = moderl_runner
+        self.model_runner: ModelRunnerBase = model_runner
 
     def __getattr__(self, attr):
         return getattr(self.model_runner, attr)
diff --git a/vllm/worker/neuron_model_runner.py b/vllm/worker/neuron_model_runner.py
index a35f5467e1a1f..596c26eac28bd 100644
--- a/vllm/worker/neuron_model_runner.py
+++ b/vllm/worker/neuron_model_runner.py
@@ -113,6 +113,9 @@ def load_model(self) -> None:
             raise NotImplementedError(
                 "Supports only Transformer-NeuronX based models.")
 
+    def get_model(self) -> nn.Module:
+        return self.model
+
     def _prepare_prompt(
         self,
         seq_group_metadata_list: List[SequenceGroupMetadata],
diff --git a/vllm/worker/openvino_model_runner.py b/vllm/worker/openvino_model_runner.py
index a38b5a4e6e8d5..9d0a759ca2f21 100644
--- a/vllm/worker/openvino_model_runner.py
+++ b/vllm/worker/openvino_model_runner.py
@@ -84,6 +84,9 @@ def load_model(self) -> None:
                                kv_cache_dtype=self.kv_cache_dtype,
                                ov_core=self.ov_core)
 
+    def get_model(self) -> nn.Module:
+        return self.model
+
     def _prepare_model_input(
         self,
         seq_group_metadata_list: List[SequenceGroupMetadata],
diff --git a/vllm/worker/openvino_worker.py b/vllm/worker/openvino_worker.py
index 50a155d22c666..f5b46cde3969c 100644
--- a/vllm/worker/openvino_worker.py
+++ b/vllm/worker/openvino_worker.py
@@ -4,6 +4,7 @@
 import openvino as ov
 import torch
 import torch.distributed
+import torch.nn as nn
 
 import vllm.envs as envs
 from vllm.attention import get_attn_backend
@@ -362,6 +363,9 @@ def cache_copy(
     ) -> None:
         self.cache_engine.copy(blocks_to_copy)  # type: ignore
 
+    def get_model(self) -> nn.Module:
+        return self.model_runner.get_model()
+
     @torch.inference_mode()
     def execute_model(
         self,
diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py
index 52c577bccab9c..f5c7bc955a673 100644
--- a/vllm/worker/tpu_model_runner.py
+++ b/vllm/worker/tpu_model_runner.py
@@ -158,6 +158,9 @@ def load_model(self) -> None:
                                    fullgraph=True,
                                    dynamic=False)
 
+    def get_model(self) -> nn.Module:
+        return self.model.model
+
     def _dummy_run(
         self,
         batch_size: int,
diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py
index fb9919f7a7b6a..1104eceef72a3 100644
--- a/vllm/worker/worker_base.py
+++ b/vllm/worker/worker_base.py
@@ -6,6 +6,7 @@
 
 import cloudpickle
 import torch
+import torch.nn as nn
 
 from vllm.config import ObservabilityConfig, VllmConfig
 from vllm.distributed import broadcast_tensor_dict, get_pp_group, get_tp_group
@@ -90,6 +91,11 @@ def start_worker_execution_loop(self) -> None:
                 if output is None:
                     return None
 
+    @abstractmethod
+    def get_model(self) -> nn.Module:
+        raise NotImplementedError
+
+    @abstractmethod
     def execute_model(
         self,
         execute_model_req: Optional[ExecuteModelRequest] = None
@@ -147,6 +153,9 @@ def initialize_cache(self, num_gpu_blocks: int,
                          num_cpu_blocks: int) -> None:
         self.worker.initialize_cache(num_gpu_blocks, num_cpu_blocks)
 
+    def get_model(self) -> nn.Module:
+        return self.worker.get_model()
+
     def execute_model(
         self,
         execute_model_req: Optional[ExecuteModelRequest] = None
@@ -363,6 +372,9 @@ def prepare_input(
         else:
             return self._get_worker_input_from_broadcast()
 
+    def get_model(self) -> nn.Module:
+        return self.model_runner.get_model()
+
     def execute_model(
         self,
         execute_model_req: Optional[ExecuteModelRequest] = None,
diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py
index 82b8f22a5af33..25a2fea1e8eac 100644
--- a/vllm/worker/xpu_model_runner.py
+++ b/vllm/worker/xpu_model_runner.py
@@ -416,6 +416,9 @@ def load_model(self) -> None:
         logger.info("Loading model weights took %.4f GB",
                     self.model_memory_usage / float(2**30))
 
+    def get_model(self) -> nn.Module:
+        return self.model
+
     @property
     def vocab_size(self) -> int:
         return self.model_config.get_vocab_size()

From 5c89a29c22471a0ad5bb05dea9cb891ff97f9623 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Mon, 20 Jan 2025 16:04:49 +0800
Subject: [PATCH 284/462] [misc] add placeholder format.sh (#12206)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 format.sh           | 5 +++++
 tools/shellcheck.sh | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)
 create mode 100755 format.sh

diff --git a/format.sh b/format.sh
new file mode 100755
index 0000000000000..4bcd0be0c96e5
--- /dev/null
+++ b/format.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+
+echo "vLLM linting system has been moved from format.sh to pre-commit hook."
+echo "Please run 'pip install -r requirements-lint.txt' and 'pre-commit install' to install the pre-commit hook."
+echo "Then linters will run automatically before each commit."
diff --git a/tools/shellcheck.sh b/tools/shellcheck.sh
index d99fa77b96351..7efb3cabc64fe 100755
--- a/tools/shellcheck.sh
+++ b/tools/shellcheck.sh
@@ -19,4 +19,4 @@ if ! [ -x "$(command -v shellcheck)" ]; then
 fi
 
 # TODO - fix warnings in .buildkite/run-amd-test.sh
-find . -name "*.sh" -not -path "./.buildkite/run-amd-test.sh" -print0 | xargs -0 -I {} sh -c 'git check-ignore -q "{}" || shellcheck "{}"'
+find . -name "*.sh" ".git" -prune -not -path "./.buildkite/run-amd-test.sh" -print0 | xargs -0 -I {} sh -c 'git check-ignore -q "{}" || shellcheck -s bash "{}"'

From 4001ea126692d9c4e6872936a791a1999c826156 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Mon, 20 Jan 2025 16:41:57 +0800
Subject: [PATCH 285/462] [CI/Build] Remove dummy CI steps (#12208)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 .github/workflows/dummy.yml | 20 --------------------
 1 file changed, 20 deletions(-)
 delete mode 100644 .github/workflows/dummy.yml

diff --git a/.github/workflows/dummy.yml b/.github/workflows/dummy.yml
deleted file mode 100644
index ea507fab6b2de..0000000000000
--- a/.github/workflows/dummy.yml
+++ /dev/null
@@ -1,20 +0,0 @@
-name: dummy-checks
-
-on:
-  pull_request:
-
-jobs:
-  mypy:
-    runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        python-version: ["3.12"]
-    steps:
-      - run: echo "This is a dummy step that always passes"
-  ruff:
-    runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        python-version: ["3.12"]
-    steps:
-      - run: echo "This is a dummy step that always passes"

From 3127e975fb9417d10513e25b80820870f594c627 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Mon, 20 Jan 2025 17:36:24 +0800
Subject: [PATCH 286/462] [CI/Build] Make pre-commit faster (#12212)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 .github/workflows/pre-commit.yml |  2 ++
 .pre-commit-config.yaml          | 16 +++++++++++++++-
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml
index 8c72a709cf330..bf9460151ec1b 100644
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@@ -15,3 +15,5 @@ jobs:
         python-version: "3.12"
     - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"
     - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
+      with:
+        extra_args: --hook-stage manual
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 8ea0f37885d9f..47eddb345edbd 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,3 +1,6 @@
+default_stages:
+  - pre-commit # Run locally
+  - manual # Run in CI
 repos:
 - repo: https://github.com/google/yapf
   rev: v0.32.0
@@ -33,30 +36,41 @@ repos:
     files: docs/.*
 - repo: local
   hooks:
+  - id: mypy-local
+    name: Run mypy for local Python installation
+    entry: tools/mypy.sh
+    language: python
+    types: [python]
+    additional_dependencies: &mypy_deps [mypy==1.11.1, types-setuptools, types-PyYAML, types-requests]
+    stages: [pre-commit] # Don't run in CI
   - id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
     name: Run mypy for Python 3.9
     entry: tools/mypy.sh 1 "3.9"
     language: python
     types: [python]
-    additional_dependencies: &mypy_deps [mypy==1.11.1, types-setuptools, types-PyYAML, types-requests]
+    additional_dependencies: *mypy_deps
+    stages: [manual] # Only run in CI
   - id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
     name: Run mypy for Python 3.10
     entry: tools/mypy.sh 1 "3.10"
     language: python
     types: [python]
     additional_dependencies: *mypy_deps
+    stages: [manual] # Only run in CI
   - id: mypy-3.11 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
     name: Run mypy for Python 3.11
     entry: tools/mypy.sh 1 "3.11"
     language: python
     types: [python]
     additional_dependencies: *mypy_deps
+    stages: [manual] # Only run in CI
   - id: mypy-3.12 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
     name: Run mypy for Python 3.12
     entry: tools/mypy.sh 1 "3.12"
     language: python
     types: [python]
     additional_dependencies: *mypy_deps
+    stages: [manual] # Only run in CI
   - id: shellcheck
     name: Lint shell scripts
     entry: tools/shellcheck.sh

From b37d82791e3c9f7d492db81493d920004de59a26 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Mon, 20 Jan 2025 17:58:48 +0800
Subject: [PATCH 287/462] [Model] Upgrade Aria to transformers 4.48 (#12203)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 examples/offline_inference/vision_language.py |   3 -
 .../vision_language/test_models.py            |   7 +-
 .../multimodal/processing/test_common.py      |  12 +-
 tests/models/registry.py                      |  67 ++++-
 tests/models/test_initialization.py           |  14 +-
 tests/models/test_registry.py                 |   3 +
 vllm/model_executor/models/aria.py            | 275 +++++++-----------
 vllm/transformers_utils/config.py             |   9 +-
 vllm/transformers_utils/configs/__init__.py   |   2 -
 vllm/transformers_utils/configs/aria.py       | 165 -----------
 10 files changed, 178 insertions(+), 379 deletions(-)
 delete mode 100644 vllm/transformers_utils/configs/aria.py

diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py
index 69228bbf22949..f9048c7735ebf 100644
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -26,11 +26,8 @@ def run_aria(question: str, modality: str):
 
     # NOTE: Need L40 (or equivalent) to avoid OOM
     llm = LLM(model=model_name,
-              tokenizer_mode="slow",
-              dtype="bfloat16",
               max_model_len=4096,
               max_num_seqs=2,
-              trust_remote_code=True,
               disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
 
     prompt = (f"<|im_start|>user\n<fim_prefix><|img|><fim_suffix>\n{question}"
diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py
index ca572cc39e538..14d9a739be318 100644
--- a/tests/models/decoder_only/vision_language/test_models.py
+++ b/tests/models/decoder_only/vision_language/test_models.py
@@ -10,7 +10,6 @@
 import pytest
 from transformers import AutoModelForVision2Seq
 from transformers import __version__ as TRANSFORMERS_VERSION
-from transformers.utils import is_flash_attn_2_available
 
 from vllm.platforms import current_platform
 from vllm.utils import identity
@@ -140,9 +139,7 @@
     #### Extended model tests
     "aria": VLMTestInfo(
         models=["rhymes-ai/Aria"],
-        tokenizer_mode="slow",
         test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
-        dtype="bfloat16",
         prompt_formatter=lambda img_prompt: f"<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n ", # noqa: E501
         img_idx_to_prompt=lambda idx: "<fim_prefix><|img|><fim_suffix>\n",
         max_model_len=4096,
@@ -158,8 +155,8 @@
         max_tokens=64,
         marks=[
             pytest.mark.skipif(
-                not is_flash_attn_2_available(),
-                reason="Model needs flash-attn for numeric convergence.",
+                TRANSFORMERS_VERSION < "4.48.0",
+                reason="HF model requires transformers>=4.48.0",
             ),
             large_gpu_mark(min_gb=64),
         ],
diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py
index 1e3e7ea50b122..d6d3d3b34ad46 100644
--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
@@ -11,6 +11,7 @@
 from vllm.multimodal.utils import cached_get_tokenizer
 
 from ....multimodal.utils import random_audio, random_image, random_video
+from ...registry import HF_EXAMPLE_MODELS
 
 
 def _test_processing_correctness(
@@ -20,12 +21,9 @@ def _test_processing_correctness(
     num_batches: int,
     simplify_rate: float,
 ):
-    if model_id == "TIGER-Lab/Mantis-8B-siglip-llama3":
-        hf_overrides = {"architectures": ["MantisForConditionalGeneration"]}
-    elif model_id == "deepseek-ai/deepseek-vl2-tiny":
-        hf_overrides = {"architectures": ["DeepseekVLV2ForCausalLM"]}
-    else:
-        hf_overrides = {}
+    model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
+    model_info.check_available_online(on_fail="skip")
+    model_info.check_transformers_version(on_fail="skip")
 
     limit_mm_per_prompt = {
         modality: 3 if supports_multi else 1
@@ -41,7 +39,7 @@ def _test_processing_correctness(
         seed=0,
         dtype="float16",
         revision=None,
-        hf_overrides=hf_overrides,
+        hf_overrides=model_info.hf_overrides,
         limit_mm_per_prompt=limit_mm_per_prompt,
     )
 
diff --git a/tests/models/registry.py b/tests/models/registry.py
index 9603ea8817cac..23227ea6b9714 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -1,5 +1,9 @@
 from dataclasses import dataclass, field
-from typing import AbstractSet, Mapping, Optional
+from typing import AbstractSet, Any, Literal, Mapping, Optional
+
+import pytest
+from packaging.version import Version
+from transformers import __version__ as TRANSFORMERS_VERSION
 
 
 @dataclass(frozen=True)
@@ -38,6 +42,50 @@ class _HfExamplesInfo:
     trust_remote_code: bool = False
     """The ``trust_remote_code`` level required to load the model."""
 
+    hf_overrides: dict[str, Any] = field(default_factory=dict)
+    """The ``hf_overrides`` required to load the model."""
+
+    def check_transformers_version(
+        self,
+        *,
+        on_fail: Literal["error", "skip"],
+    ) -> None:
+        """
+        If the installed transformers version does not meet the requirements,
+        perform the given action.
+        """
+        if self.min_transformers_version is None:
+            return
+
+        current_version = TRANSFORMERS_VERSION
+        required_version = self.min_transformers_version
+        if Version(current_version) < Version(required_version):
+            msg = (
+                f"You have `transformers=={current_version}` installed, but "
+                f"`transformers>={required_version}` is required to run this "
+                "model")
+
+            if on_fail == "error":
+                raise RuntimeError(msg)
+            else:
+                pytest.skip(msg)
+
+    def check_available_online(
+        self,
+        *,
+        on_fail: Literal["error", "skip"],
+    ) -> None:
+        """
+        If the model is not available online, perform the given action.
+        """
+        if not self.is_available_online:
+            msg = "Model is not available online"
+
+            if on_fail == "error":
+                raise RuntimeError(msg)
+            else:
+                pytest.skip(msg)
+
 
 # yapf: disable
 _TEXT_GENERATION_EXAMPLE_MODELS = {
@@ -48,8 +96,6 @@ class _HfExamplesInfo:
                                          trust_remote_code=True),
     "ArcticForCausalLM": _HfExamplesInfo("Snowflake/snowflake-arctic-instruct",
                                          trust_remote_code=True),
-    "AriaForConditionalGeneration": _HfExamplesInfo("rhymes-ai/Aria",
-                                                    trust_remote_code=True),
     "BaiChuanForCausalLM": _HfExamplesInfo("baichuan-inc/Baichuan-7B",
                                          trust_remote_code=True),
     "BaichuanForCausalLM": _HfExamplesInfo("baichuan-inc/Baichuan2-7B-chat",
@@ -176,6 +222,8 @@ class _HfExamplesInfo:
 
 _MULTIMODAL_EXAMPLE_MODELS = {
     # [Decoder-only]
+    "AriaForConditionalGeneration": _HfExamplesInfo("rhymes-ai/Aria",
+                                                    min_transformers_version="4.48"),
     "Blip2ForConditionalGeneration": _HfExamplesInfo("Salesforce/blip2-opt-2.7b"),  # noqa: E501
     "ChameleonForConditionalGeneration": _HfExamplesInfo("facebook/chameleon-7b"),  # noqa: E501
     "ChatGLMModel": _HfExamplesInfo("THUDM/glm-4v-9b",
@@ -183,7 +231,8 @@ class _HfExamplesInfo:
                                     trust_remote_code=True),
     "ChatGLMForConditionalGeneration": _HfExamplesInfo("chatglm2-6b",
                                                        is_available_online=False),
-    "DeepseekVLV2ForCausalLM": _HfExamplesInfo("deepseek-ai/deepseek-vl2-tiny"),   # noqa: E501
+    "DeepseekVLV2ForCausalLM": _HfExamplesInfo("deepseek-ai/deepseek-vl2-tiny",  # noqa: E501
+                                               hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]}),  # noqa: E501
     "FuyuForCausalLM": _HfExamplesInfo("adept/fuyu-8b"),
     "H2OVLChatModel": _HfExamplesInfo("h2oai/h2ovl-mississippi-800m"),
     "InternVLChatModel": _HfExamplesInfo("OpenGVLab/InternVL2-1B",
@@ -194,7 +243,8 @@ class _HfExamplesInfo:
     "LlavaNextForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-v1.6-mistral-7b-hf"),  # noqa: E501
     "LlavaNextVideoForConditionalGeneration": _HfExamplesInfo("llava-hf/LLaVA-NeXT-Video-7B-hf"),  # noqa: E501
     "LlavaOnevisionForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-onevision-qwen2-0.5b-ov-hf"),  # noqa: E501
-    "MantisForConditionalGeneration": _HfExamplesInfo("TIGER-Lab/Mantis-8B-siglip-llama3"),  # noqa: E501
+    "MantisForConditionalGeneration": _HfExamplesInfo("TIGER-Lab/Mantis-8B-siglip-llama3",  # noqa: E501
+                                                      hf_overrides={"architectures": ["MantisForConditionalGeneration"]}),  # noqa: E501
     "MiniCPMV": _HfExamplesInfo("openbmb/MiniCPM-Llama3-V-2_5",
                                 trust_remote_code=True),
     "MolmoForCausalLM": _HfExamplesInfo("allenai/Molmo-7B-D-0924",
@@ -247,5 +297,12 @@ def get_supported_archs(self) -> AbstractSet[str]:
     def get_hf_info(self, model_arch: str) -> _HfExamplesInfo:
         return self.hf_models[model_arch]
 
+    def find_hf_info(self, model_id: str) -> _HfExamplesInfo:
+        for info in self.hf_models.values():
+            if info.default == model_id:
+                return info
+
+        raise ValueError(f"No example model defined for {model_id}")
+
 
 HF_EXAMPLE_MODELS = HfExampleModels(_EXAMPLE_MODELS)
diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py
index daece7c93c0ef..d3a3aaf670c23 100644
--- a/tests/models/test_initialization.py
+++ b/tests/models/test_initialization.py
@@ -1,9 +1,7 @@
 from unittest.mock import patch
 
 import pytest
-from packaging.version import Version
 from transformers import PretrainedConfig
-from transformers import __version__ as TRANSFORMERS_VERSION
 
 from vllm import LLM
 
@@ -13,16 +11,8 @@
 @pytest.mark.parametrize("model_arch", HF_EXAMPLE_MODELS.get_supported_archs())
 def test_can_initialize(model_arch):
     model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch)
-    if not model_info.is_available_online:
-        pytest.skip("Model is not available online")
-    if model_info.min_transformers_version is not None:
-        current_version = TRANSFORMERS_VERSION
-        required_version = model_info.min_transformers_version
-        if Version(current_version) < Version(required_version):
-            pytest.skip(
-                f"You have `transformers=={current_version}` installed, but "
-                f"`transformers>={required_version}` is required to run this "
-                "model")
+    model_info.check_available_online(on_fail="skip")
+    model_info.check_transformers_version(on_fail="skip")
 
     # Avoid OOM
     def hf_overrides(hf_config: PretrainedConfig) -> PretrainedConfig:
diff --git a/tests/models/test_registry.py b/tests/models/test_registry.py
index 73b70d65e8e0b..ac0366847e334 100644
--- a/tests/models/test_registry.py
+++ b/tests/models/test_registry.py
@@ -21,6 +21,9 @@
 
 @pytest.mark.parametrize("model_arch", ModelRegistry.get_supported_archs())
 def test_registry_imports(model_arch):
+    model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch)
+    model_info.check_transformers_version(on_fail="skip")
+
     # Ensure all model classes can be imported successfully
     model_cls, _ = ModelRegistry.resolve_model_cls(model_arch)
 
diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py
index 5b97eced62df0..503d1a38d9ee9 100644
--- a/vllm/model_executor/models/aria.py
+++ b/vllm/model_executor/models/aria.py
@@ -1,9 +1,11 @@
-from typing import (Callable, Iterable, List, Mapping, Optional, Set, Tuple,
-                    TypedDict, Union)
+from typing import (Iterable, List, Mapping, Optional, Set, Tuple, TypedDict,
+                    Union)
 
 import torch
 import torch.nn as nn
-from transformers import BatchFeature, PretrainedConfig
+from transformers import AriaConfig, AriaTextConfig, BatchFeature
+from transformers.models.aria.modeling_aria import AriaCrossAttention
+from transformers.models.aria.processing_aria import AriaProcessor
 
 from vllm.attention import AttentionMetadata
 from vllm.config import CacheConfig, QuantizationConfig, VllmConfig
@@ -26,10 +28,11 @@
                                         BaseProcessingInfo, PromptReplacement)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
 from vllm.sequence import IntermediateTensors
-from vllm.transformers_utils.configs.aria import (AriaMoELMConfig,
-                                                  AriaVisionConfig)
 
-from .idefics2_vision_model import Idefics2VisionTransformer
+# yapf: disable
+from .idefics2_vision_model import (
+    Idefics2VisionTransformer as Idefics3VisionTransformer)
+# yapf: enable
 from .interfaces import SupportsMultiModal
 from .llama import LlamaDecoderLayer, LlamaMLP, LlamaModel
 from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,
@@ -47,87 +50,22 @@ class AriaImagePixelInputs(TypedDict):
     """
 
 
-class AriaVisionTransformer(Idefics2VisionTransformer):
-    """
-    AriaVisionTransformer is a modified version of Idefics2VisionTransformer
-    that replaces the post-layernorm with an identity layer.
-    """
-
-    def __init__(
-        self,
-        config: AriaVisionConfig,
-        quant_config: Optional[QuantizationConfig] = None,
-        prefix: str = "",
-    ) -> None:
-        super().__init__(config, quant_config, prefix)
-        self.post_layernorm = nn.Identity()
-
-
-class AriaVisionModel(nn.Module):
-    config_class = AriaVisionConfig
+class AriaProjectorMLP(nn.Module):
 
     def __init__(
         self,
-        config: AriaVisionConfig,
-        quant_config: Optional[QuantizationConfig] = None,
-        *,
-        prefix: str = "",
+        in_features: int,
+        hidden_features: int,
+        output_dim: int,
     ) -> None:
         super().__init__()
 
-        self.vision_model = AriaVisionTransformer(
-            config,
-            quant_config,
-            prefix=f"{prefix}.vision_model",
-        )
-
-    def forward(
-        self,
-        pixel_values: torch.Tensor,
-        pixel_mask: Optional[torch.Tensor] = None,
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
-        patch_attention_mask = self._create_patch_attention_mask(pixel_mask)
-
-        vit_oup = self.vision_model(
-            pixel_values=pixel_values,
-            patch_attention_mask=patch_attention_mask,
-        )
-
-        image_atts = self._create_image_attention_mask(patch_attention_mask)
-
-        return vit_oup, image_atts
-
-    def _create_patch_attention_mask(
-            self, pixel_mask: Optional[torch.Tensor]) -> torch.Tensor:
-        if pixel_mask is None:
-            return None
-
-        patches_subgrid = pixel_mask.unfold(
-            dimension=1,
-            size=self.vision_model.config.patch_size,
-            step=self.vision_model.config.patch_size,
-        ).unfold(
-            dimension=2,
-            size=self.vision_model.config.patch_size,
-            step=self.vision_model.config.patch_size,
-        )
-        return (patches_subgrid.sum(dim=(-1, -2)) > 0).bool()
-
-    def _create_image_attention_mask(
-            self, patch_attention_mask: torch.Tensor) -> torch.Tensor:
-        if patch_attention_mask is None:
-            return None
-
-        flattened_mask = patch_attention_mask.flatten(1)
-        return torch.logical_not(flattened_mask)
-
-
-class FFN(nn.Module):
-
-    def __init__(self, embed_dim: int, ff_dim: int, output_dim: int) -> None:
-        super().__init__()
-        self.linear_in = ColumnParallelLinear(embed_dim, ff_dim, bias=False)
-        self.linear_out = RowParallelLinear(ff_dim, output_dim, bias=False)
+        self.linear_in = ColumnParallelLinear(in_features,
+                                              hidden_features,
+                                              bias=False)
+        self.linear_out = RowParallelLinear(hidden_features,
+                                            output_dim,
+                                            bias=False)
         self.act = get_act_fn("gelu_new")
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
@@ -137,46 +75,6 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         return hidden_states
 
 
-class CrossAttention(nn.Module):
-
-    def __init__(self, kv_dim: int, embed_dim: int, num_heads: int) -> None:
-        super().__init__()
-        self.num_heads = num_heads
-        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=False)
-        self.k_proj = nn.Linear(kv_dim, embed_dim, bias=False)
-        self.v_proj = nn.Linear(kv_dim, embed_dim, bias=False)
-
-        self.multihead_attn = nn.MultiheadAttention(embed_dim, num_heads)
-        self.linear = nn.Linear(embed_dim, embed_dim)
-
-        self.layer_norm = nn.LayerNorm(embed_dim)
-        self.ln_kv = nn.LayerNorm(kv_dim)
-
-    def forward(
-        self,
-        x: torch.Tensor,
-        hidden_states: torch.Tensor,
-        attn_mask: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        normed_hidden_states = self.layer_norm(hidden_states)
-        query = self.q_proj(normed_hidden_states).permute(1, 0, 2)
-
-        x = self.ln_kv(x)
-        key = self.k_proj(x).permute(1, 0, 2)
-        value = self.v_proj(x).permute(1, 0, 2)
-
-        attn_output, _ = self.multihead_attn(query,
-                                             key,
-                                             value,
-                                             attn_mask=attn_mask)
-
-        attn_output = attn_output.permute(1, 0, 2)
-
-        attn_output = self.linear(attn_output)
-
-        return attn_output
-
-
 class AriaProjector(nn.Module):
     """
     A projection module with one cross attention layer and one FFN layer, which
@@ -198,42 +96,42 @@ class AriaProjector(nn.Module):
         A tensor with the shape of (batch_size, query_number, output_dim)
     """
 
-    def __init__(
-        self,
-        patch_to_query_dict: dict[int, int],
-        embed_dim: int,
-        num_heads: int,
-        kv_dim: int,
-        ff_dim: int,
-        output_dim: int,
-        norm_layer: Callable[[int], nn.Module] = nn.LayerNorm,
-    ) -> None:
+    def __init__(self, config: AriaConfig) -> None:
         super().__init__()
-        self.patch_to_query_dict = patch_to_query_dict
-        self.embed_dim = embed_dim
-        self.num_heads = num_heads
+
+        self.patch_to_query_dict = config.projector_patch_to_query_dict
+        self.in_features = config.vision_config.hidden_size
+        self.num_heads = config.vision_config.num_attention_heads
+        self.kv_dim = config.vision_config.hidden_size
+        self.hidden_features = config.text_config.hidden_size
+        self.output_dim = config.text_config.hidden_size
 
         self.query = nn.Parameter(
-            torch.empty(max(patch_to_query_dict.values()), self.embed_dim))
+            torch.empty(config.max_value_projector_patch_to_query_dict,
+                        self.in_features))
 
-        self.cross_attn = CrossAttention(kv_dim, embed_dim, num_heads)
+        self.cross_attn = AriaCrossAttention(config)
 
-        self.ln_ffn = norm_layer(embed_dim)
-        self.ffn = FFN(embed_dim, ff_dim, output_dim)
+        self.layer_norm = nn.LayerNorm(self.in_features)
+        self.feed_forward = AriaProjectorMLP(self.in_features,
+                                             self.hidden_features,
+                                             self.output_dim)
 
     def forward(
         self,
         x: torch.Tensor,
         attn_mask: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
-        bs = x.shape[0]
-        queries = self.query.unsqueeze(0).repeat(bs, 1, 1)
+        batch_size, num_patches = x.shape[0], x.shape[1]
 
-        query_num = self.patch_to_query_dict.get(x.shape[1], None)
-        assert (query_num is not None
-                ), f"Query number for {x.shape[1]} patches is not provided"
+        if num_patches not in self.patch_to_query_dict:
+            raise KeyError(f"Number of patches {num_patches} not found in "
+                           "patch_to_query_dict amongst possible values "
+                           f"{self.patch_to_query_dict.keys()}.")
 
-        queries = queries[:, :query_num, :]
+        query_num = self.patch_to_query_dict[num_patches]
+
+        queries = self.query[:query_num].unsqueeze(0).repeat(batch_size, 1, 1)
 
         if attn_mask is not None:
             attn_mask = attn_mask.repeat_interleave(self.num_heads, 0)
@@ -241,7 +139,7 @@ def forward(
 
         attention_out = self.cross_attn(x, queries, attn_mask=attn_mask)
 
-        out = self.ffn(self.ln_ffn(attention_out))
+        out = self.feed_forward(self.layer_norm(attention_out))
 
         return out
 
@@ -278,7 +176,7 @@ def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor,
                 param.data.copy_(loaded_weight.transpose(1, 2))
 
 
-class MoELayer(nn.Module):
+class AriaTextMoELayer(nn.Module):
     """
     Mixture of Experts (MoE) Layer for the AriaMoE model.
 
@@ -289,7 +187,7 @@ class MoELayer(nn.Module):
 
     def __init__(
         self,
-        config: AriaMoELMConfig,
+        config: AriaTextConfig,
         quant_config: Optional[QuantizationConfig],
     ) -> None:
         super().__init__()
@@ -303,15 +201,16 @@ def __init__(
             num_experts=config.moe_num_experts,
             top_k=config.moe_topk,
             hidden_size=config.hidden_size,
-            intermediate_size=config.moe_intermediate_size,
+            intermediate_size=config.intermediate_size,
             quant_config=quant_config,
             reduce_results=True,
         )
         self.shared_experts = LlamaMLP(
             config.hidden_size,
-            config.moe_intermediate_size * config.moe_num_shared_experts,
+            config.intermediate_size * config.moe_num_shared_experts,
             "silu",
             quant_config=quant_config,
+            bias=config.mlp_bias,
         )
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
@@ -329,13 +228,13 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         router_output = torch.nn.functional.linear(hidden_states,
                                                    self.router_weight)
 
-        shared_expert_output = self.shared_experts(hidden_states)
         sparse_expert_output = self.experts(hidden_states, router_output)
+        shared_expert_output = self.shared_experts(hidden_states)
 
         return sparse_expert_output + shared_expert_output
 
 
-class MoEDecoderLayer(LlamaDecoderLayer):
+class AriaTextDecoderLayer(LlamaDecoderLayer):
     """
     Custom Decoder Layer for the AriaMoE model which modifies the standard
     `LlamaDecoderLayer` by replacing the traditional MLP with a Mixture of
@@ -344,16 +243,16 @@ class MoEDecoderLayer(LlamaDecoderLayer):
 
     def __init__(
         self,
-        config: AriaMoELMConfig,
+        config: AriaTextConfig,
         cache_config: Optional[CacheConfig] = None,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
     ) -> None:
         super().__init__(config, cache_config, quant_config, prefix)
-        self.mlp = MoELayer(config, quant_config=quant_config)
+        self.mlp = AriaTextMoELayer(config, quant_config=quant_config)
 
 
-class AriaMoELMModel(LlamaModel):
+class AriaTextModel(LlamaModel):
     """
     Custom LlamaModel for the AriaMoE model which modifies the standard
     LlamaModel by replacing the `LlamaDecoderLayer` with `MoEDecoderLayer`.
@@ -362,7 +261,7 @@ class AriaMoELMModel(LlamaModel):
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__(vllm_config=vllm_config,
                          prefix=prefix,
-                         layer_type=MoEDecoderLayer)
+                         layer_type=AriaTextDecoderLayer)
 
     # Adapted from LlamaModel.load_weights with the modification of adding
     # the expert weights mapping to `stacked_params_mapping`
@@ -434,25 +333,23 @@ def load_weights(self, weights: Iterable[Tuple[str,
         return loaded_params
 
 
-def build_mm_projector(config: PretrainedConfig):
-    return AriaProjector(
-        patch_to_query_dict=config.projector_patch_to_query_dict,
-        embed_dim=config.vision_config.hidden_size,
-        num_heads=config.vision_config.num_attention_heads,
-        kv_dim=config.vision_config.hidden_size,
-        ff_dim=config.text_config.hidden_size,
-        output_dim=config.text_config.hidden_size,
-    )
-
-
 class AriaProcessingInfo(BaseProcessingInfo):
 
     def get_hf_config(self):
-        return self.ctx.get_hf_config()
+        return self.ctx.get_hf_config(AriaConfig)
 
-    def get_vision_config(self) -> AriaVisionConfig:
+    def get_vision_config(self):
         return self.get_hf_config().vision_config
 
+    def get_hf_processor(self):
+        processor = self.ctx.get_hf_processor(AriaProcessor)
+
+        # Patch for https://github.com/huggingface/transformers/issues/35768
+        processor.tokenizer.image_token = "<|img|>"
+        processor.image_token = "<|img|>"
+
+        return processor
+
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": None}
 
@@ -554,10 +451,14 @@ def __init__(
         quant_config = vllm_config.quant_config
 
         self.config = config
-        self.vision_tower = AriaVisionModel(config.vision_config)
-        self.multi_modal_projector = build_mm_projector(config)
+        self.vision_tower = Idefics3VisionTransformer(
+            config.vision_config,
+            quant_config,
+            prefix=f"{prefix}.vision_tower",
+        )
+        self.multi_modal_projector = AriaProjector(config)
         self.vocab_size = config.text_config.vocab_size
-        self.language_model = AriaMoELMModel(
+        self.language_model = AriaTextModel(
             vllm_config=vllm_config.with_hf_config(config.text_config),
             prefix=maybe_prefix(prefix, "language_model.model"),
         )
@@ -608,6 +509,22 @@ def _parse_and_validate_image_input(
             pixel_mask=pixel_mask,
         )
 
+    def _create_patch_attention_mask(
+            self, pixel_mask: Optional[torch.Tensor]) -> torch.Tensor:
+        if pixel_mask is None:
+            return None
+
+        patches_subgrid = pixel_mask.unfold(
+            dimension=1,
+            size=self.vision_tower.config.patch_size,
+            step=self.vision_tower.config.patch_size,
+        ).unfold(
+            dimension=2,
+            size=self.vision_tower.config.patch_size,
+            step=self.vision_tower.config.patch_size,
+        )
+        return (patches_subgrid.sum(dim=(-1, -2)) > 0).bool()
+
     def _process_image_input(
         self, image_input: AriaImagePixelInputs
     ) -> Tuple[torch.Tensor, torch.Tensor]:
@@ -616,9 +533,18 @@ def _process_image_input(
         pixel_values = image_input['pixel_values']
         pixel_mask = image_input['pixel_mask']
 
-        image_feature, image_attn_mask = self.vision_tower(
-            pixel_values, pixel_mask=pixel_mask)
-        return self.multi_modal_projector(image_feature, image_attn_mask)
+        patch_attention_mask = self._create_patch_attention_mask(pixel_mask)
+
+        image_outputs = self.vision_tower(
+            pixel_values=pixel_values,
+            patch_attention_mask=patch_attention_mask,
+        )
+        image_attn_mask = None
+        if patch_attention_mask is not None:
+            flattened_mask = patch_attention_mask.flatten(1)
+            image_attn_mask = torch.logical_not(flattened_mask)
+
+        return self.multi_modal_projector(image_outputs, image_attn_mask)
 
     def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
         image_input = self._parse_and_validate_image_input(**kwargs)
@@ -683,6 +609,5 @@ def sample(
         return next_tokens
 
     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
-
         loader = AutoWeightsLoader(self)
         loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index f57dfded0a62f..c97acffa1a719 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -22,10 +22,10 @@
 from vllm.logger import init_logger
 # yapf conflicts with isort for this block
 # yapf: disable
-from vllm.transformers_utils.configs import (AriaConfig, ChatGLMConfig,
-                                             Cohere2Config, DbrxConfig,
-                                             DeepseekVLV2Config, EAGLEConfig,
-                                             ExaoneConfig, H2OVLChatConfig,
+from vllm.transformers_utils.configs import (ChatGLMConfig, Cohere2Config,
+                                             DbrxConfig, DeepseekVLV2Config,
+                                             EAGLEConfig, ExaoneConfig,
+                                             H2OVLChatConfig,
                                              InternVLChatConfig, JAISConfig,
                                              MedusaConfig, MllamaConfig,
                                              MLPSpeculatorConfig, MPTConfig,
@@ -52,7 +52,6 @@
 }
 
 _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {
-    "aria": AriaConfig,
     "chatglm": ChatGLMConfig,
     "cohere2": Cohere2Config,
     "dbrx": DbrxConfig,
diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py
index 807ef4fbfd0c0..f065c56124605 100644
--- a/vllm/transformers_utils/configs/__init__.py
+++ b/vllm/transformers_utils/configs/__init__.py
@@ -1,4 +1,3 @@
-from vllm.transformers_utils.configs.aria import AriaConfig
 from vllm.transformers_utils.configs.chatglm import ChatGLMConfig
 from vllm.transformers_utils.configs.cohere2 import Cohere2Config
 from vllm.transformers_utils.configs.dbrx import DbrxConfig
@@ -24,7 +23,6 @@
 from vllm.transformers_utils.configs.ultravox import UltravoxConfig
 
 __all__ = [
-    "AriaConfig",
     "ChatGLMConfig",
     "Cohere2Config",
     "DbrxConfig",
diff --git a/vllm/transformers_utils/configs/aria.py b/vllm/transformers_utils/configs/aria.py
deleted file mode 100644
index f4b531225b5d0..0000000000000
--- a/vllm/transformers_utils/configs/aria.py
+++ /dev/null
@@ -1,165 +0,0 @@
-# Copyright 2024 Rhymes AI. All rights reserved.
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-from typing import Mapping
-
-from transformers import PretrainedConfig
-from transformers.models.idefics2.configuration_idefics2 import (
-    Idefics2VisionConfig)
-from transformers.models.llama.configuration_llama import LlamaConfig
-
-from vllm.logger import init_logger
-
-logger = init_logger(__name__)
-
-
-class AriaVisionConfig(Idefics2VisionConfig):
-    model_type = "aria_vision_model"
-
-
-class AriaMoELMConfig(LlamaConfig):
-    """
-    Configuration class for AriaMoE language model.
-
-    This class extends the LlamaConfig to include additional parameters specific
-    to the Mixture of Experts (MoE) architecture.
-    """
-
-    model_type = "aria_moe_lm"
-
-    def __init__(
-        self,
-        moe_intermediate_size: int = 4096,
-        moe_num_experts: int = 8,
-        moe_topk: int = 2,
-        moe_num_shared_experts: int = 2,
-        **kwargs,
-    ):
-        """
-        Initialize the AriaMoELMConfig.
-
-        Args:
-            moe_intermediate_size (int): The intermediate size for MoE layers.
-                Default is 4096.
-            moe_num_experts (int): The number of experts in the MoE layer.
-                Default is 8.
-            moe_topk (int): The number of top experts to route to for each 
-                token. Default is 2.
-            moe_num_shared_experts (int): The number of shared experts. Default
-                is 2. 
-            **kwargs: Additional keyword arguments to be passed to the parent
-                LlamaConfig.
-        """
-        super().__init__(**kwargs)
-        self.moe_intermediate_size = moe_intermediate_size
-        self.moe_num_experts = moe_num_experts
-        self.moe_topk = moe_topk
-        self.moe_num_shared_experts = moe_num_shared_experts
-
-
-class AriaConfig(PretrainedConfig):
-    """
-    Configuration class for Aria model.
-    This class handles the configuration for both vision and text components of
-    the Aria model,
-    as well as additional parameters for image token handling and projector
-    mapping.
-
-    Args:
-        vision_config (AriaVisionConfig or dict): Configuration for the vision
-            component.
-        text_config (AriaMoELMConfig or dict): Configuration for the text
-            component.
-        projector_patch_to_query_dict (dict): Mapping of patch sizes to query
-            dimensions.
-        ignore_index (int): Index to ignore in loss calculation.
-        image_token_index (int): Index used to represent image tokens.
-        **kwargs: Additional keyword arguments passed to the parent class.
-    Attributes:
-        model_type (str): Type of the model, set to "aria".
-        is_composition (bool): Whether the model is a composition of multiple
-            components.
-        ignore_index (int): Index to ignore in loss calculation.
-        image_token_index (int): Index used to represent image tokens.
-        projector_patch_to_query_dict (dict): Mapping of patch sizes to query
-            dimensions.
-        vision_config (AriaVisionConfig): Configuration for the vision
-            component.
-        text_config (AriaMoELMConfig): Configuration for the text component.
-    """
-
-    model_type = "aria"
-    is_composition = False
-
-    def __init__(
-        self,
-        vision_config: AriaVisionConfig = AriaVisionConfig(),  # noqa: B008
-        text_config: AriaMoELMConfig = AriaMoELMConfig(),  # noqa: B008
-        projector_patch_to_query_dict: Mapping[int, int] = {
-            1225: 128,
-            4900: 256,
-        },
-        ignore_index=-100,
-        image_token_index=32000,
-        tie_word_embeddings=False,
-        **kwargs,
-    ):
-        super().__init__(**kwargs)
-        self.ignore_index = ignore_index
-        self.image_token_index = image_token_index
-        self.tie_word_embeddings = tie_word_embeddings
-        attn_implementation = kwargs.pop("attn_implementation", None)
-
-        # Set the default attention implementation to flash_attention_2 if not
-        # specified
-        self._attn_implementation = ("flash_attention_2"
-                                     if attn_implementation is None else
-                                     attn_implementation)
-
-        # Convert the keys and values of projector_patch_to_query_dict to
-        # integers
-        # This ensures consistency even if they were provided as strings
-        self.projector_patch_to_query_dict = {
-            int(k): int(v)
-            for k, v in projector_patch_to_query_dict.items()
-        }
-
-        if isinstance(vision_config, dict) and "model_type" in vision_config:
-            vision_config = AriaVisionConfig(**vision_config)
-            if attn_implementation is None:
-                vision_attn_implementation = "flash_attention_2"
-            elif attn_implementation == "sdpa":
-                logger.warning("SDPA is not supported for vit, using "
-                               "flash_attention_2 instead")
-                vision_attn_implementation = "flash_attention_2"
-            else:
-                vision_attn_implementation = attn_implementation
-            vision_config._attn_implementation = vision_attn_implementation
-
-        self.vision_config = vision_config
-
-        if isinstance(text_config, dict) and "model_type" in text_config:
-            text_attn_implementation = ("sdpa" if attn_implementation is None
-                                        else attn_implementation)
-            text_config = AriaMoELMConfig(**text_config)
-            text_config._attn_implementation = text_attn_implementation
-
-        self.text_config = text_config
-
-        # This is needed for the static kv cache
-        self.num_hidden_layers = self.text_config.num_hidden_layers

From 170eb350793a04ceb18ae86be4ccf97d02ad199f Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Mon, 20 Jan 2025 18:06:24 +0800
Subject: [PATCH 288/462] [misc] print a message to suggest how to bypass
 commit hooks (#12217)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 .pre-commit-config.yaml | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 47eddb345edbd..8d1fc257388a8 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -34,6 +34,10 @@ repos:
   hooks:
   - id: pymarkdown
     files: docs/.*
+- repo: https://github.com/rhysd/actionlint
+  rev: v1.7.6
+  hooks:
+  - id: actionlint
 - repo: local
   hooks:
   - id: mypy-local
@@ -81,7 +85,8 @@ repos:
     entry: tools/png-lint.sh
     language: script
     types: [png]
-- repo: https://github.com/rhysd/actionlint
-  rev: v1.7.6
-  hooks:
-  - id: actionlint
+  - id: suggestion
+    name: Suggestion
+    entry: bash -c 'echo "To bypass pre-commit hooks, add --no-verify to git commit."'
+    language: system
+    verbose: true

From c222f47992ce0bbcd3ccbce24736e045d8689be8 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Mon, 20 Jan 2025 19:35:59 +0800
Subject: [PATCH 289/462] [core][bugfix] configure env var during import vllm
 (#12209)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 examples/offline_inference/rlhf.py |  7 +----
 vllm/__init__.py                   | 49 ++++++++----------------------
 vllm/plugins/__init__.py           | 23 ++++++++++++++
 vllm/worker/worker_base.py         |  3 --
 4 files changed, 37 insertions(+), 45 deletions(-)

diff --git a/examples/offline_inference/rlhf.py b/examples/offline_inference/rlhf.py
index 3bc303dad277f..5c4918008dcb3 100644
--- a/examples/offline_inference/rlhf.py
+++ b/examples/offline_inference/rlhf.py
@@ -19,7 +19,7 @@
 from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
 from transformers import AutoModelForCausalLM
 
-from vllm import LLM, SamplingParams, configure_as_vllm_process
+from vllm import LLM, SamplingParams
 from vllm.utils import get_ip, get_open_port
 from vllm.worker.worker import Worker
 
@@ -98,12 +98,7 @@ def __init__(self, *args, **kwargs):
 """
 Start the training process, here we use huggingface transformers 
 as an example to hold a model on GPU 0.
-
-It is important for all the processes outside of vLLM to call
-`configure_as_vllm_process` to set some common environment variables
-the same as vLLM workers.
 """
-configure_as_vllm_process()
 
 train_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
 train_model.to("cuda:0")
diff --git a/vllm/__init__.py b/vllm/__init__.py
index a533dba561c00..2aabe820d9a84 100644
--- a/vllm/__init__.py
+++ b/vllm/__init__.py
@@ -1,4 +1,7 @@
 """vLLM: a high-throughput and memory-efficient inference engine for LLMs"""
+import os
+
+import torch
 
 from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
@@ -17,43 +20,18 @@
 
 from .version import __version__, __version_tuple__
 
+# set some common config/environment variables that should be set
+# for all processes created by vllm and all processes
+# that interact with vllm workers.
+# they are executed whenever `import vllm` is called.
 
-def configure_as_vllm_process():
-    """
-    set some common config/environment variables that should be set
-    for all processes created by vllm and all processes
-    that interact with vllm workers.
-    """
-    import os
-
-    import torch
-
-    # see https://github.com/NVIDIA/nccl/issues/1234
-    os.environ['NCCL_CUMEM_ENABLE'] = '0'
-
-    # see https://github.com/vllm-project/vllm/issues/10480
-    os.environ['TORCHINDUCTOR_COMPILE_THREADS'] = '1'
-    # see https://github.com/vllm-project/vllm/issues/10619
-    torch._inductor.config.compile_threads = 1
-
-    from vllm.platforms import current_platform
-
-    if current_platform.is_xpu():
-        # see https://github.com/pytorch/pytorch/blob/43c5f59/torch/_dynamo/config.py#L158
-        torch._dynamo.config.disable = True
-    elif current_platform.is_hpu():
-        # NOTE(kzawora): PT HPU lazy backend (PT_HPU_LAZY_MODE = 1)
-        # does not support torch.compile
-        # Eager backend (PT_HPU_LAZY_MODE = 0) must be selected for
-        # torch.compile support
-        is_lazy = os.environ.get('PT_HPU_LAZY_MODE', '1') == '1'
-        if is_lazy:
-            torch._dynamo.config.disable = True
-            # NOTE(kzawora) multi-HPU inference with HPUGraphs (lazy-only)
-            # requires enabling lazy collectives
-            # see https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html # noqa: E501
-            os.environ['PT_HPU_ENABLE_LAZY_COLLECTIVES'] = 'true'
+# see https://github.com/NVIDIA/nccl/issues/1234
+os.environ['NCCL_CUMEM_ENABLE'] = '0'
 
+# see https://github.com/vllm-project/vllm/issues/10480
+os.environ['TORCHINDUCTOR_COMPILE_THREADS'] = '1'
+# see https://github.com/vllm-project/vllm/issues/10619
+torch._inductor.config.compile_threads = 1
 
 __all__ = [
     "__version__",
@@ -80,5 +58,4 @@ def configure_as_vllm_process():
     "AsyncEngineArgs",
     "initialize_ray_cluster",
     "PoolingParams",
-    "configure_as_vllm_process",
 ]
diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py
index ff54174f634af..a78a054917756 100644
--- a/vllm/plugins/__init__.py
+++ b/vllm/plugins/__init__.py
@@ -1,6 +1,9 @@
 import logging
+import os
 from typing import Callable, Dict
 
+import torch
+
 import vllm.envs as envs
 
 logger = logging.getLogger(__name__)
@@ -51,6 +54,26 @@ def load_general_plugins():
     if plugins_loaded:
         return
     plugins_loaded = True
+
+    # some platform-specific configurations
+    from vllm.platforms import current_platform
+
+    if current_platform.is_xpu():
+        # see https://github.com/pytorch/pytorch/blob/43c5f59/torch/_dynamo/config.py#L158
+        torch._dynamo.config.disable = True
+    elif current_platform.is_hpu():
+        # NOTE(kzawora): PT HPU lazy backend (PT_HPU_LAZY_MODE = 1)
+        # does not support torch.compile
+        # Eager backend (PT_HPU_LAZY_MODE = 0) must be selected for
+        # torch.compile support
+        is_lazy = os.environ.get('PT_HPU_LAZY_MODE', '1') == '1'
+        if is_lazy:
+            torch._dynamo.config.disable = True
+            # NOTE(kzawora) multi-HPU inference with HPUGraphs (lazy-only)
+            # requires enabling lazy collectives
+            # see https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html # noqa: E501
+            os.environ['PT_HPU_ENABLE_LAZY_COLLECTIVES'] = 'true'
+
     plugins = load_plugins_by_group(group='vllm.general_plugins')
     # general plugins, we only need to execute the loaded functions
     for func in plugins.values():
diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py
index 1104eceef72a3..c6e6693c54f57 100644
--- a/vllm/worker/worker_base.py
+++ b/vllm/worker/worker_base.py
@@ -535,9 +535,6 @@ def init_worker(self, all_kwargs: List[Dict[str, Any]]) -> None:
         kwargs = all_kwargs[self.rpc_rank]
         enable_trace_function_call_for_thread(self.vllm_config)
 
-        from vllm import configure_as_vllm_process
-        configure_as_vllm_process()
-
         from vllm.plugins import load_general_plugins
         load_general_plugins()
 

From 5f0ec3935a0118fee8cf2764728f765c8cc53d2a Mon Sep 17 00:00:00 2001
From: Chen Zhang <zhangch99@outlook.com>
Date: Mon, 20 Jan 2025 21:54:16 +0800
Subject: [PATCH 290/462] [V1] Remove `_get_cache_block_size` (#12214)

Signed-off-by: Chen Zhang <zhangch99@outlook.com>
---
 vllm/v1/worker/gpu_worker.py | 24 +-----------------------
 1 file changed, 1 insertion(+), 23 deletions(-)

diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index 0929e64d58f1e..bd40112aea5e8 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -8,14 +8,13 @@
 import torch.nn as nn
 
 import vllm.envs as envs
-from vllm.config import CacheConfig, ModelConfig, ParallelConfig, VllmConfig
+from vllm.config import ParallelConfig, VllmConfig
 from vllm.distributed import (ensure_model_parallel_initialized,
                               init_distributed_environment,
                               set_custom_all_reduce)
 from vllm.logger import init_logger
 from vllm.model_executor import set_random_seed
 from vllm.platforms import current_platform
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, LayerBlockType, get_dtype_size
 from vllm.v1.core.scheduler import SchedulerOutput
 from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec
 from vllm.v1.outputs import ModelRunnerOutput
@@ -235,24 +234,3 @@ def _check_if_gpu_supports_dtype(torch_dtype: torch.dtype):
                 f"of at least 8.0. Your {gpu_name} GPU {compute_str}. "
                 "You can use float16 instead by explicitly setting the"
                 "`dtype` flag in CLI, for example: --dtype=half.")
-
-
-def _get_cache_block_size(
-    cache_config: CacheConfig,
-    model_config: ModelConfig,
-    parallel_config: ParallelConfig,
-) -> int:
-    head_size = model_config.get_head_size()
-    num_heads = model_config.get_num_kv_heads(parallel_config)
-    num_attention_layers = model_config.get_num_layers_by_block_type(
-        parallel_config, LayerBlockType.attention)
-
-    key_cache_block = cache_config.block_size * num_heads * head_size
-    value_cache_block = key_cache_block
-    total = num_attention_layers * (key_cache_block + value_cache_block)
-    if cache_config.cache_dtype == "auto":
-        dtype = model_config.dtype
-    else:
-        dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_config.cache_dtype]
-    dtype_size = get_dtype_size(dtype)
-    return dtype_size * total

From 86bfb6dba7c6e0650e7d7498cbd46b49155b2a42 Mon Sep 17 00:00:00 2001
From: wangxiyuan <wangxiyuan1007@gmail.com>
Date: Mon, 20 Jan 2025 23:25:28 +0800
Subject: [PATCH 291/462] [Misc] Pass `attention` to impl backend (#12218)

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
---
 vllm/attention/backends/abstract.py         | 23 +++++++++++++++++----
 vllm/attention/backends/blocksparse_attn.py | 12 +++++------
 vllm/attention/backends/flash_attn.py       | 10 ++++-----
 vllm/attention/backends/flashinfer.py       | 16 +++++++-------
 vllm/attention/backends/hpu_attn.py         |  4 ++--
 vllm/attention/backends/ipex_attn.py        | 18 ++++++++--------
 vllm/attention/backends/pallas.py           |  6 +++---
 vllm/attention/backends/rocm_flash_attn.py  | 20 +++++++++---------
 vllm/attention/backends/torch_sdpa.py       | 18 +++++++---------
 vllm/attention/backends/xformers.py         | 20 ++++++++----------
 vllm/attention/layer.py                     |  8 +++----
 vllm/v1/attention/backends/flash_attn.py    |  9 ++++----
 12 files changed, 86 insertions(+), 78 deletions(-)

diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py
index 737559bfe70ca..e6ddca69bf01b 100644
--- a/vllm/attention/backends/abstract.py
+++ b/vllm/attention/backends/abstract.py
@@ -1,8 +1,8 @@
 from abc import ABC, abstractmethod
 from contextlib import contextmanager
 from dataclasses import dataclass, fields
-from typing import (TYPE_CHECKING, Any, Dict, Generic, List, Optional, Set,
-                    Tuple, Type, TypeVar)
+from typing import (TYPE_CHECKING, Any, Dict, Generic, List, Optional,
+                    Protocol, Set, Tuple, Type, TypeVar)
 
 import torch
 
@@ -223,6 +223,22 @@ def build(self, seq_lens: List[int], query_lens: List[int],
         raise NotImplementedError
 
 
+class AttentionLayer(Protocol):
+
+    _k_scale: float
+    _v_scale: float
+
+    def forward(
+        self,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        kv_cache: torch.Tensor,
+        attn_metadata: AttentionMetadata,
+    ) -> torch.Tensor:
+        ...
+
+
 class AttentionImpl(ABC, Generic[T]):
 
     @abstractmethod
@@ -244,13 +260,12 @@ def __init__(
     @abstractmethod
     def forward(
         self,
+        layer: AttentionLayer,
         query: torch.Tensor,
         key: torch.Tensor,
         value: torch.Tensor,
         kv_cache: torch.Tensor,
         attn_metadata: T,
-        k_scale: float = 1.0,
-        v_scale: float = 1.0,
         output: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         raise NotImplementedError
diff --git a/vllm/attention/backends/blocksparse_attn.py b/vllm/attention/backends/blocksparse_attn.py
index 77cfa8490172b..9089db1126c94 100644
--- a/vllm/attention/backends/blocksparse_attn.py
+++ b/vllm/attention/backends/blocksparse_attn.py
@@ -4,6 +4,7 @@
 import torch
 
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
+                                              AttentionLayer,
                                               AttentionMetadata, AttentionType)
 from vllm.attention.backends.utils import (CommonAttentionState,
                                            CommonMetadataBuilder)
@@ -358,13 +359,12 @@ def __init__(
 
     def forward(
         self,
+        layer: AttentionLayer,
         query: torch.Tensor,
         key: torch.Tensor,
         value: torch.Tensor,
         kv_cache: torch.Tensor,
         attn_metadata: BlocksparseFlashAttentionMetadata,
-        k_scale: float = 1.0,
-        v_scale: float = 1.0,
         output: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         """Forward pass with FlashAttention and PagedAttention.
@@ -401,8 +401,8 @@ def forward(
                 value_cache,
                 attn_metadata.slot_mapping,
                 self.kv_cache_dtype,
-                k_scale,
-                v_scale,
+                layer._k_scale,
+                layer._v_scale,
             )
 
         if prefill_meta := attn_metadata.prefill_metadata:
@@ -439,8 +439,8 @@ def forward(
                 self.num_kv_heads,
                 self.scale,
                 self.alibi_slopes,
-                k_scale,
-                v_scale,
+                layer._k_scale,
+                layer._v_scale,
                 tp_rank=self.tp_rank,
                 blocksparse_local_blocks=self.local_blocks,
                 blocksparse_vert_stride=self.vert_stride,
diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py
index 48b3e8d177ec9..40250ef08b595 100644
--- a/vllm/attention/backends/flash_attn.py
+++ b/vllm/attention/backends/flash_attn.py
@@ -8,6 +8,7 @@
 
 from vllm import _custom_ops as ops
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
+                                              AttentionLayer,
                                               AttentionMetadata,
                                               AttentionMetadataBuilder,
                                               AttentionType)
@@ -634,13 +635,12 @@ def __init__(
 
     def forward(
         self,
+        layer: AttentionLayer,
         query: torch.Tensor,
         key: torch.Tensor,
         value: torch.Tensor,
         kv_cache: torch.Tensor,
         attn_metadata: FlashAttentionMetadata,
-        k_scale: float = 1.0,
-        v_scale: float = 1.0,
         output: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         """Forward pass with FlashAttention.
@@ -657,7 +657,7 @@ def forward(
         NOTE: It in-place updates the output tensor.
         """
         # NOTE(woosuk): FlashAttention does not support FP8 KV cache.
-        assert k_scale == 1.0 and v_scale == 1.0, (
+        assert layer._k_scale == 1.0 and layer._v_scale == 1.0, (
             "key/v_scale is not supported in FlashAttention.")
 
         assert output is not None, "Output tensor must be provided."
@@ -709,8 +709,8 @@ def forward(
                     kv_cache[1],
                     updated_slot_mapping.flatten(),  # type: ignore[union-attr]
                     kv_cache_dtype,
-                    k_scale,
-                    v_scale,
+                    layer._k_scale,
+                    layer._v_scale,
                 )
 
         (num_prefill_query_tokens, num_prefill_kv_tokens,
diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py
index 6ca75fabdfc38..b9cd805e81b45 100644
--- a/vllm/attention/backends/flashinfer.py
+++ b/vllm/attention/backends/flashinfer.py
@@ -23,6 +23,7 @@
 import vllm.envs as envs
 from vllm import _custom_ops as ops
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
+                                              AttentionLayer,
                                               AttentionMetadata,
                                               AttentionMetadataBuilder,
                                               AttentionState, AttentionType)
@@ -792,13 +793,12 @@ def __init__(
 
     def forward(
         self,
+        layer: AttentionLayer,
         query: torch.Tensor,
         key: torch.Tensor,
         value: torch.Tensor,
         kv_cache: torch.Tensor,
         attn_metadata: FlashInferMetadata,
-        k_scale: float = 1.0,
-        v_scale: float = 1.0,
         output: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
 
@@ -826,8 +826,8 @@ def forward(
                 kv_cache[:, 1],
                 attn_metadata.slot_mapping.flatten(),
                 kv_cache_dtype,
-                k_scale,
-                v_scale,
+                layer._k_scale,
+                layer._v_scale,
             )
             # The FlashInfer api requires data to be in fp8_e4m3 or fp8_e5m2
             # to process the cache when the kv_cache_dtype is fp8
@@ -886,8 +886,8 @@ def forward(
                     kv_cache,
                     logits_soft_cap=logits_soft_cap,
                     causal=True,
-                    k_scale=k_scale,
-                    v_scale=v_scale,
+                    k_scale=layer._k_scale,
+                    v_scale=layer._v_scale,
                     window_left=window_left)
         if decode_meta := attn_metadata.decode_metadata:
             assert decode_meta is not None
@@ -897,8 +897,8 @@ def forward(
                 kv_cache,
                 sm_scale=softmax_scale,
                 logits_soft_cap=logits_soft_cap,
-                k_scale=k_scale,
-                v_scale=v_scale,
+                k_scale=layer._k_scale,
+                v_scale=layer._v_scale,
                 window_left=window_left)
 
         if prefill_output is None and decode_output is not None:
diff --git a/vllm/attention/backends/hpu_attn.py b/vllm/attention/backends/hpu_attn.py
index 94a461e0c8c29..80c132c0a8c05 100644
--- a/vllm/attention/backends/hpu_attn.py
+++ b/vllm/attention/backends/hpu_attn.py
@@ -11,6 +11,7 @@
 from vllm_hpu_extension.utils import Matmul, Softmax, VLLMKVCache
 
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
+                                              AttentionLayer,
                                               AttentionMetadata, AttentionType)
 from vllm.attention.backends.utils import CommonAttentionState
 from vllm.attention.ops.hpu_paged_attn import (HPUPagedAttention,
@@ -152,13 +153,12 @@ def __init__(
 
     def forward(
         self,
+        layer: AttentionLayer,
         query: torch.Tensor,
         key: torch.Tensor,
         value: torch.Tensor,
         kv_cache: torch.Tensor,
         attn_metadata: HPUAttentionMetadata,
-        k_scale: float = 1.0,
-        v_scale: float = 1.0,
         output: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         """Forward pass with xFormers and PagedAttention.
diff --git a/vllm/attention/backends/ipex_attn.py b/vllm/attention/backends/ipex_attn.py
index da1d307daa517..cd729a1c8b274 100644
--- a/vllm/attention/backends/ipex_attn.py
+++ b/vllm/attention/backends/ipex_attn.py
@@ -7,6 +7,7 @@
 
 from vllm._ipex_ops import ipex_ops
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
+                                              AttentionLayer,
                                               AttentionMetadata, AttentionType)
 from vllm.attention.backends.utils import CommonAttentionState
 from vllm.attention.ops.paged_attn import (PagedAttention,
@@ -171,13 +172,12 @@ def split_kv_cache(
 
     def forward(
         self,
+        layer: AttentionLayer,
         query: torch.Tensor,
         key: torch.Tensor,
         value: torch.Tensor,
         kv_cache: torch.Tensor,
         attn_metadata: IpexAttnMetadata,  # type: ignore
-        k_scale: float = 1.0,
-        v_scale: float = 1.0,
         output: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         """Forward pass with IPEX varlen_attention and PagedAttention.
@@ -193,7 +193,7 @@ def forward(
         Returns:
             shape = [num_tokens, num_heads * head_size]
         """
-        assert k_scale == 1.0 and v_scale == 1.0
+        assert layer._k_scale == 1.0 and layer._v_scale == 1.0
         num_tokens, hidden_size = query.shape
         # Reshape the query, key, and value tensors.
         query = query.view(-1, self.num_heads, self.head_size)
@@ -210,8 +210,8 @@ def forward(
                 value_cache,
                 attn_metadata.slot_mapping.flatten(),
                 self.kv_cache_dtype,
-                k_scale,
-                v_scale,
+                layer._k_scale,
+                layer._v_scale,
             )
 
         if attn_metadata.is_prompt:
@@ -296,8 +296,8 @@ def forward(
                     max_seq_len,
                     self.alibi_slopes,
                     self.kv_cache_dtype,
-                    k_scale,
-                    v_scale,
+                    layer._k_scale,
+                    layer._v_scale,
                 )
             else:
                 # Run PagedAttention V2.
@@ -329,8 +329,8 @@ def forward(
                     max_seq_len,
                     self.alibi_slopes,
                     self.kv_cache_dtype,
-                    k_scale,
-                    v_scale,
+                    layer._k_scale,
+                    layer._v_scale,
                 )
 
             # Reshape the output tensor.
diff --git a/vllm/attention/backends/pallas.py b/vllm/attention/backends/pallas.py
index 2ac492dd8ae54..f5bf390df6afb 100644
--- a/vllm/attention/backends/pallas.py
+++ b/vllm/attention/backends/pallas.py
@@ -5,6 +5,7 @@
 import torch_xla.experimental.custom_kernel  # Required to register custom ops.
 
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
+                                              AttentionLayer,
                                               AttentionMetadata, AttentionType)
 from vllm.attention.backends.utils import CommonAttentionState
 
@@ -150,13 +151,12 @@ def __init__(
 
     def forward(
         self,
+        layer: AttentionLayer,
         query: torch.Tensor,
         key: torch.Tensor,
         value: torch.Tensor,
         kv_cache: Tuple[torch.Tensor, torch.Tensor],
         attn_metadata: PallasMetadata,
-        k_scale: float = 1.0,
-        v_scale: float = 1.0,
         output: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         """Forward pass with Pallas attention.
@@ -173,7 +173,7 @@ def forward(
         Returns:
             shape = [batch_size, seq_len, num_heads * head_size]
         """
-        assert k_scale == 1.0 and v_scale == 1.0
+        assert layer._k_scale == 1.0 and layer._v_scale == 1.0
         batch_size, seq_len, hidden_size = query.shape
         query = query.view(batch_size, seq_len, self.num_heads, self.head_size)
         key = key.view(batch_size, seq_len, self.num_kv_heads, self.head_size)
diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py
index a91a5af5c3d58..e9f2808ff1674 100644
--- a/vllm/attention/backends/rocm_flash_attn.py
+++ b/vllm/attention/backends/rocm_flash_attn.py
@@ -7,6 +7,7 @@
 import vllm.envs as envs
 from vllm import _custom_ops as ops
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
+                                              AttentionLayer,
                                               AttentionMetadata, AttentionType)
 from vllm.attention.backends.utils import (CommonAttentionState,
                                            CommonMetadataBuilder)
@@ -414,13 +415,12 @@ def repeat_kv(self, x: torch.Tensor, n_rep: int) -> torch.Tensor:
 
     def forward(
         self,
+        layer: AttentionLayer,
         query: torch.Tensor,
         key: torch.Tensor,
         value: torch.Tensor,
         kv_cache: torch.Tensor,
         attn_metadata: ROCmFlashAttentionMetadata,
-        k_scale: float = 1.0,
-        v_scale: float = 1.0,
         output: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         """Forward pass with FlashAttention and PagedAttention.
@@ -458,8 +458,8 @@ def forward(
                 value_cache,
                 attn_metadata.slot_mapping,
                 self.kv_cache_dtype,
-                k_scale,
-                v_scale,
+                layer._k_scale,
+                layer._v_scale,
             )
 
         num_prefill_tokens = attn_metadata.num_prefill_tokens
@@ -567,8 +567,8 @@ def forward(
                     prefill_meta.max_query_len,
                     self.alibi_slopes,
                     self.sliding_window[0],
-                    k_scale,
-                    v_scale,
+                    layer._k_scale,
+                    layer._v_scale,
                 )
 
         if decode_meta := attn_metadata.decode_metadata:
@@ -613,8 +613,8 @@ def forward(
                     max_seq_len,
                     self.alibi_slopes,
                     self.kv_cache_dtype,
-                    k_scale,
-                    v_scale,
+                    layer._k_scale,
+                    layer._v_scale,
                 )
             else:
                 output[num_prefill_tokens:] = PagedAttention.forward_decode(
@@ -628,8 +628,8 @@ def forward(
                     self.num_kv_heads,
                     self.scale,
                     self.alibi_slopes,
-                    k_scale,
-                    v_scale,
+                    layer._k_scale,
+                    layer._v_scale,
                 )
 
         # Reshape the output tensor.
diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py
index ca1c4618615de..7cd2049f0c0a5 100644
--- a/vllm/attention/backends/torch_sdpa.py
+++ b/vllm/attention/backends/torch_sdpa.py
@@ -7,6 +7,7 @@
 from torch.nn.functional import scaled_dot_product_attention
 
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
+                                              AttentionLayer,
                                               AttentionMetadata,
                                               AttentionMetadataBuilder,
                                               AttentionType)
@@ -429,13 +430,12 @@ def __init__(
 
     def forward(
         self,
+        layer: AttentionLayer,
         query: torch.Tensor,
         key: torch.Tensor,
         value: torch.Tensor,
         kv_cache: torch.Tensor,
         attn_metadata: TorchSDPAMetadata,  # type: ignore
-        k_scale: float = 1.0,
-        v_scale: float = 1.0,
         output: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         """Forward pass with torch SDPA and PagedAttention.
@@ -451,7 +451,7 @@ def forward(
         Returns:
             shape = [num_tokens, num_heads * head_size]
         """
-        assert k_scale == 1.0 and v_scale == 1.0
+        assert layer._k_scale == 1.0 and layer._v_scale == 1.0
         attn_type = self.attn_type
         if (attn_type == AttentionType.ENCODER
                 and (not attn_metadata.is_all_encoder_attn_metadata_set)):
@@ -493,11 +493,9 @@ def forward(
                     # Update self-attention KV cache (prefill/decode)
                     updated_slot_mapping = attn_metadata.slot_mapping
 
-                PagedAttention.write_to_paged_cache(key, value, key_cache,
-                                                    value_cache,
-                                                    updated_slot_mapping,
-                                                    self.kv_cache_dtype,
-                                                    k_scale, v_scale)
+                PagedAttention.write_to_paged_cache(
+                    key, value, key_cache, value_cache, updated_slot_mapping,
+                    self.kv_cache_dtype, layer._k_scale, layer._v_scale)
 
         if attn_type != AttentionType.ENCODER:
             # Decoder self-attention supports chunked prefill.
@@ -571,8 +569,8 @@ def forward(
                 self.num_kv_heads,
                 self.scale,
                 self.alibi_slopes,
-                k_scale,
-                v_scale,
+                layer._k_scale,
+                layer._v_scale,
             )
 
         # Reshape the output tensor.
diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py
index 8c8ca8520a9db..38e27434dab2c 100644
--- a/vllm/attention/backends/xformers.py
+++ b/vllm/attention/backends/xformers.py
@@ -10,6 +10,7 @@
                                          LowerTriangularMaskWithTensorBias)
 
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
+                                              AttentionLayer,
                                               AttentionMetadata, AttentionType)
 from vllm.attention.backends.utils import (
     CommonAttentionState, CommonMetadataBuilder,
@@ -412,13 +413,12 @@ def __init__(
 
     def forward(
         self,
+        layer: AttentionLayer,
         query: torch.Tensor,
         key: Optional[torch.Tensor],
         value: Optional[torch.Tensor],
         kv_cache: torch.Tensor,
         attn_metadata: "XFormersMetadata",
-        k_scale: float = 1.0,
-        v_scale: float = 1.0,
         output: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         """Forward pass with xFormers and PagedAttention.
@@ -524,11 +524,9 @@ def forward(
                 # If kv_cache is not provided, the new key and value tensors are
                 # not cached. This happens during the initial memory
                 # profiling run.
-                PagedAttention.write_to_paged_cache(key, value, key_cache,
-                                                    value_cache,
-                                                    updated_slot_mapping,
-                                                    self.kv_cache_dtype,
-                                                    k_scale, v_scale)
+                PagedAttention.write_to_paged_cache(
+                    key, value, key_cache, value_cache, updated_slot_mapping,
+                    self.kv_cache_dtype, layer._k_scale, layer._v_scale)
         (num_prefill_query_tokens, num_prefill_kv_tokens,
         num_decode_query_tokens) = \
             get_num_prefill_decode_query_kv_tokens(attn_metadata, attn_type)
@@ -580,8 +578,8 @@ def forward(
                     prefill_meta.max_query_len,
                     self.alibi_slopes,
                     self.sliding_window,
-                    k_scale,
-                    v_scale,
+                    layer._k_scale,
+                    layer._v_scale,
                 )
                 assert output[:num_prefill_query_tokens].shape == out.shape
                 output[:num_prefill_query_tokens] = out
@@ -607,8 +605,8 @@ def forward(
                 self.num_kv_heads,
                 self.scale,
                 self.alibi_slopes,
-                k_scale,
-                v_scale,
+                layer._k_scale,
+                layer._v_scale,
             )
 
         # Reshape the output tensor.
diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index e2403306950a3..c36f8d08eb4a7 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -243,8 +243,7 @@ def unified_attention(
     attn_metadata = forward_context.attn_metadata
     self = forward_context.attn_layers[layer_name]
     kv_cache = self.kv_cache[forward_context.virtual_engine]
-    return self.impl.forward(query, key, value, kv_cache, attn_metadata,
-                             self._k_scale, self._v_scale)
+    return self.impl.forward(self, query, key, value, kv_cache, attn_metadata)
 
 
 def unified_attention_fake(
@@ -276,13 +275,12 @@ def unified_attention_with_output(
     attn_metadata = forward_context.attn_metadata
     self = forward_context.attn_layers[layer_name]
     kv_cache = self.kv_cache[forward_context.virtual_engine]
-    self.impl.forward(query,
+    self.impl.forward(self,
+                      query,
                       key,
                       value,
                       kv_cache,
                       attn_metadata,
-                      self._k_scale,
-                      self._v_scale,
                       output=output)
 
 
diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py
index 7b0786261a6a6..fd36ea8d8806b 100644
--- a/vllm/v1/attention/backends/flash_attn.py
+++ b/vllm/v1/attention/backends/flash_attn.py
@@ -130,13 +130,12 @@ def __init__(
 
     def forward(
         self,
+        layer: torch.nn.Module,
         query: torch.Tensor,
         key: torch.Tensor,
         value: torch.Tensor,
         kv_cache: torch.Tensor,
         attn_metadata: FlashAttentionMetadata,
-        k_scale: float = 1.0,
-        v_scale: float = 1.0,
         output: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         """Forward pass with FlashAttention.
@@ -151,7 +150,7 @@ def forward(
             shape = [num_tokens, num_heads * head_size]
         """
         # NOTE(woosuk): FlashAttention does not support FP8 KV cache.
-        assert k_scale == 1.0 and v_scale == 1.0, (
+        assert layer._k_scale == 1.0 and layer._v_scale == 1.0, (
             "key/v_scale is not supported in FlashAttention.")
 
         assert output is not None, "Output tensor must be provided."
@@ -183,8 +182,8 @@ def forward(
             value_cache,
             attn_metadata.slot_mapping,
             self.kv_cache_dtype,
-            k_scale,
-            v_scale,
+            layer._k_scale,
+            layer._v_scale,
         )
 
         # Compute attention and update output up to `num_actual_tokens`.

From 18572e3384a6f55a7589dd81e1f3f70f7dd73e3a Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Mon, 20 Jan 2025 23:35:36 +0800
Subject: [PATCH 292/462] [Bugfix] Fix `HfExampleModels.find_hf_info` (#12223)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 tests/models/registry.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tests/models/registry.py b/tests/models/registry.py
index 23227ea6b9714..e99dbd16c47b9 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -302,6 +302,11 @@ def find_hf_info(self, model_id: str) -> _HfExamplesInfo:
             if info.default == model_id:
                 return info
 
+        # Fallback to extras
+        for info in self.hf_models.values():
+            if any(extra == model_id for extra in info.extras.values()):
+                return info
+
         raise ValueError(f"No example model defined for {model_id}")
 
 
From 96663699b2f78eecd44d1d1de9d93c7e054aabc2 Mon Sep 17 00:00:00 2001
From: Chen Zhang <zhangch99@outlook.com>
Date: Mon, 20 Jan 2025 23:49:18 +0800
Subject: [PATCH 293/462] [CI] Pass local python version explicitly to
 pre-commit mypy.sh (#12224)

Signed-off-by: Chen Zhang <zhangch99@outlook.com>
---
 .pre-commit-config.yaml | 2 +-
 tools/mypy.sh           | 6 +++++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 8d1fc257388a8..432bf5ed18dbc 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -42,7 +42,7 @@ repos:
   hooks:
   - id: mypy-local
     name: Run mypy for local Python installation
-    entry: tools/mypy.sh
+    entry: tools/mypy.sh 0 "local"
     language: python
     types: [python]
     additional_dependencies: &mypy_deps [mypy==1.11.1, types-setuptools, types-PyYAML, types-requests]
diff --git a/tools/mypy.sh b/tools/mypy.sh
index bf95e4c526fd1..77d342da1ec82 100755
--- a/tools/mypy.sh
+++ b/tools/mypy.sh
@@ -1,12 +1,16 @@
 #!/bin/bash
 
 CI=${1:-0}
-PYTHON_VERSION=${2:-3.9}
+PYTHON_VERSION=${2:-local}
 
 if [ "$CI" -eq 1 ]; then
     set -e
 fi
 
+if [ $PYTHON_VERSION == "local" ]; then
+    PYTHON_VERSION=$(python -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")')
+fi
+
 run_mypy() {
     echo "Running mypy on $1"
     if [ "$CI" -eq 1 ] && [ -z "$1" ]; then

From 7bd36300679a0876c16a905e2baea41dd59a60a2 Mon Sep 17 00:00:00 2001
From: Roger Wang <136131678+ywang96@users.noreply.github.com>
Date: Mon, 20 Jan 2025 14:19:09 -0800
Subject: [PATCH 294/462] [Misc] Update CODEOWNERS (#12229)

---
 .github/CODEOWNERS | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 3cb91fc0f8232..37733ebacbc7a 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -2,32 +2,33 @@
 # for more info about CODEOWNERS file
 
 # This lists cover the "core" components of vLLM that require careful review
-/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
-/vllm/core @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
-/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
-/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
-/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
-/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
-/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
+/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/multimodal @DarkLight1337 @ywang96
 CMakeLists.txt @tlrmchlsmth
 
 # vLLM V1
-/vllm/v1 @WoosukKwon @robertgshaw2-neuralmagic @njhill @ywang96 @comaniac @alexm-neuralmagic
+/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
 
 # Test ownership
-/tests/async_engine @njhill @robertgshaw2-neuralmagic @simon-mo
+/tests/async_engine @njhill @robertgshaw2-redhat @simon-mo
 /tests/test_inputs.py @DarkLight1337 @ywang96
-/tests/entrypoints @DarkLight1337 @robertgshaw2-neuralmagic @simon-mo
+/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo
 /tests/models @DarkLight1337 @ywang96
 /tests/multimodal @DarkLight1337 @ywang96
 /tests/prefix_caching @comaniac @KuntaiDu
 /tests/spec_decode @njhill @LiuXiaoxuanPKU
 /tests/kernels @tlrmchlsmth @WoosukKwon
-/tests/quantization @mgoin @robertgshaw2-neuralmagic
+/tests/quantization @mgoin @robertgshaw2-redhat
 /.buildkite/lm-eval-harness @mgoin @simon-mo
 /tests/distributed/test_multi_node_assignment.py @youkaichao
 /tests/distributed/test_pipeline_parallel.py @youkaichao
 /tests/distributed/test_same_node.py @youkaichao
-/tests/multi_step @alexm-neuralmagic @comaniac
+/tests/multi_step @alexm-redhat @comaniac
 /tests/weight_loading @mgoin @youkaichao
 /tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac

From af69a6aded526343db8d4a199cdfd1bb84134201 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?I=C5=9F=C4=B1k?= <41375111+isikhi@users.noreply.github.com>
Date: Mon, 20 Jan 2025 22:23:28 +0000
Subject: [PATCH 295/462] fix: update platform detection for M-series arm based
 MacBook processors (#12227)

Signed-off-by: isikhi <huseyin.isik000@gmail.com>
---
 vllm/platforms/__init__.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py
index 6ca95b41dbb07..6033a806d2023 100644
--- a/vllm/platforms/__init__.py
+++ b/vllm/platforms/__init__.py
@@ -101,6 +101,10 @@ def cpu_platform_plugin() -> Optional[str]:
     try:
         from importlib.metadata import version
         is_cpu = "cpu" in version("vllm")
+        if is_cpu == False:
+            import platform
+            is_cpu = platform.machine().lower().startswith("arm")
+
     except Exception:
         pass
 

From da7512215f0b5c589c2747303b171357940c0614 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Tue, 21 Jan 2025 08:31:01 +0800
Subject: [PATCH 296/462] [misc] add cuda runtime version to usage data
 (#12190)

Signed-off-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
---
 vllm/usage/usage_lib.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/vllm/usage/usage_lib.py b/vllm/usage/usage_lib.py
index 841df3994fba2..7f5cc906382af 100644
--- a/vllm/usage/usage_lib.py
+++ b/vllm/usage/usage_lib.py
@@ -130,6 +130,7 @@ def __init__(self) -> None:
         self.total_memory: Optional[int] = None
         self.architecture: Optional[str] = None
         self.platform: Optional[str] = None
+        self.cuda_runtime: Optional[str] = None
         self.gpu_count: Optional[int] = None
         self.gpu_type: Optional[str] = None
         self.gpu_memory_per_device: Optional[int] = None
@@ -169,6 +170,8 @@ def _report_usage_once(self, model_architecture: str,
             self.gpu_count = torch.cuda.device_count()
             self.gpu_type = device_property.name
             self.gpu_memory_per_device = device_property.total_memory
+        if current_platform.is_cuda():
+            self.cuda_runtime = torch.version.cuda
         self.provider = _detect_cloud_provider()
         self.architecture = platform.machine()
         self.platform = platform.platform()

From 06a760d6e8bcd60dc98775678b5b12eef01d82bb Mon Sep 17 00:00:00 2001
From: Cheng Kuan Yong Jason <jasoncky96@gmail.com>
Date: Tue, 21 Jan 2025 08:42:02 +0800
Subject: [PATCH 297/462] [bugfix] catch xgrammar unsupported array constraints
 (#12210)

Signed-off-by: Jason Cheng <jasoncky96@gmail.com>
---
 vllm/model_executor/guided_decoding/utils.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/vllm/model_executor/guided_decoding/utils.py b/vllm/model_executor/guided_decoding/utils.py
index 20abaefbacc51..90dfa62ec4670 100644
--- a/vllm/model_executor/guided_decoding/utils.py
+++ b/vllm/model_executor/guided_decoding/utils.py
@@ -20,6 +20,13 @@ def check_object(obj: dict) -> bool:
                 ]):
             return True
 
+        # Check for array unsupported keywords
+        if obj.get("type") == "array" and any(key in obj for key in [
+                "uniqueItems", "contains", "minContains", "maxContains",
+                "minItems", "maxItems"
+        ]):
+            return True
+
         # Recursively check all nested objects and arrays
         for value in obj.values():
             if isinstance(value, dict):

From 750f4cabfac4bfed679d95074d9550b043e3f8d5 Mon Sep 17 00:00:00 2001
From: Jinzhen Lin <linjinzhen@hotmail.com>
Date: Tue, 21 Jan 2025 08:42:16 +0800
Subject: [PATCH 298/462] [Kernel] optimize moe_align_block_size for cuda graph
 and large num_experts (e.g. DeepSeek-V3) (#12222)

Signed-off-by: Jinzhen Lin <linjinzhen@hotmail.com>
Co-authored-by: Michael Goin <mgoin@redhat.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
---
 csrc/moe/moe_align_sum_kernels.cu | 93 +++++++++++++++++++------------
 vllm/config.py                    |  2 +-
 2 files changed, 58 insertions(+), 37 deletions(-)

diff --git a/csrc/moe/moe_align_sum_kernels.cu b/csrc/moe/moe_align_sum_kernels.cu
index 24341d63fb1f8..715a1b42841f2 100644
--- a/csrc/moe/moe_align_sum_kernels.cu
+++ b/csrc/moe/moe_align_sum_kernels.cu
@@ -21,7 +21,7 @@ __device__ __forceinline__ int32_t index(int32_t total_col, int32_t row,
 }
 }  // namespace
 
-template <typename scalar_t>
+template <typename scalar_t, typename token_cnts_t>
 __global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids,
                                             int32_t* sorted_token_ids,
                                             int32_t* expert_ids,
@@ -32,12 +32,8 @@ __global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids,
   const size_t start_idx = threadIdx.x * tokens_per_thread;
 
   extern __shared__ int32_t shared_mem[];
-
-  int32_t* tokens_cnts =
-      shared_mem;  // 2d tensor with shape (blockDim.x + 1, num_experts)
-  int32_t* cumsum =
-      shared_mem +
-      (blockDim.x + 1) * num_experts;  // 1d tensor with shape (num_experts + 1)
+  int32_t* cumsum = shared_mem;  // 1d tensor with shape (num_experts + 1)
+  token_cnts_t* tokens_cnts = (token_cnts_t*)(shared_mem + blockDim.x + 1);
 
   for (int i = 0; i < num_experts; ++i) {
     tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0;
@@ -74,7 +70,7 @@ __global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids,
                           block_size) *
                       block_size;
     }
-    *total_tokens_post_pad = cumsum[num_experts];
+    *total_tokens_post_pad = static_cast<int32_t>(cumsum[num_experts]);
   }
 
   __syncthreads();
@@ -224,26 +220,44 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
                           torch::Tensor num_tokens_post_pad) {
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
 
-  // If we have very large number of experts, we can no longer use shared
-  // memory.
-  // TODO(simon): the right solution should be calculating the exact right
-  // amount of shared memory and use that. The num_experts >= 256 is just a
-  // temporary solution to unblock Deepseek V3.
-  if (num_experts >= 256) {
+  int device_max_shared_mem;
+  auto dev = topk_ids.get_device();
+  cudaDeviceGetAttribute(&device_max_shared_mem,
+                         cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);
+
+  const int32_t num_thread = max((int32_t)num_experts, WARP_SIZE);
+  const int32_t shared_mem_i32 =
+      ((num_thread + 1) * num_experts + (num_experts + 1)) * sizeof(int32_t);
+  const int32_t shared_mem_i16 =
+      ((num_thread + 1) * num_experts) * sizeof(uint16_t) +
+      (num_experts + 1) * sizeof(int32_t);
+
+  bool use_global_memory = false;
+  bool use_i16 = false; // Use uint16_t for shared memory token counts
+  if (shared_mem_i16 > device_max_shared_mem) {
+    use_global_memory = true;
+  } else if (shared_mem_i32 > device_max_shared_mem &&
+             topk_ids.numel() <= 65535) {
+    // when nelements of topk_ids is smaller than 65535 (max value of uint16),
+    // element value of token_cnts would also smaller than 65535,
+    // so we can use uint16 as dtype of token_cnts
+    use_i16 = true;
+  }
+
+  if (use_global_memory) {
     VLLM_DISPATCH_INTEGRAL_TYPES(
         topk_ids.scalar_type(), "moe_align_block_size_global_mem_kernel", [&] {
           // calc needed amount of shared mem for `tokens_cnts` and `cumsum`
           // tensors
           const int32_t num_thread = max((int32_t)num_experts, WARP_SIZE);
 
-          const int32_t mem_tokens_cnts =
-              ((num_experts + 1) * num_experts) * sizeof(int32_t);
-          const int32_t mem_cumsum = (num_experts + 1) * sizeof(int32_t);
-          // allocate global memory
-          int32_t* tokens_cnts;
-          int32_t* cumsum;
-          cudaMalloc(&tokens_cnts, mem_tokens_cnts);
-          cudaMalloc(&cumsum, mem_cumsum);
+          auto options_int = torch::TensorOptions()
+                                 .dtype(torch::kInt)
+                                 .device(topk_ids.device());
+          torch::Tensor token_cnts_buffer =
+              torch::empty({(num_experts + 1) * num_experts}, options_int);
+          torch::Tensor cumsum_buffer =
+              torch::empty({num_experts + 1}, options_int);
 
           auto kernel =
               vllm::moe::moe_align_block_size_global_mem_kernel<scalar_t>;
@@ -252,25 +266,32 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
               sorted_token_ids.data_ptr<int32_t>(),
               experts_ids.data_ptr<int32_t>(),
               num_tokens_post_pad.data_ptr<int32_t>(), num_experts, block_size,
-              topk_ids.numel(), tokens_cnts, cumsum);
-          cudaFree(tokens_cnts);
-          cudaFree(cumsum);
+              topk_ids.numel(), token_cnts_buffer.data_ptr<int32_t>(),
+              cumsum_buffer.data_ptr<int32_t>());
         });
-  } else {
+  } else if (use_i16) {
     VLLM_DISPATCH_INTEGRAL_TYPES(
         topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] {
-          // calc needed amount of shared mem for `tokens_cnts` and `cumsum`
-          // tensors
-          const int32_t num_thread = max((int32_t)num_experts, WARP_SIZE);
-          const int32_t shared_mem =
-              ((num_thread + 1) * num_experts + (num_experts + 1)) *
-              sizeof(int32_t);
-
           // set dynamic shared mem
-          auto kernel = vllm::moe::moe_align_block_size_kernel<scalar_t>;
+          auto kernel =
+              vllm::moe::moe_align_block_size_kernel<scalar_t, uint16_t>;
+          AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(
+              (void*)kernel, shared_mem_i16));
+          kernel<<<1, num_thread, shared_mem_i16, stream>>>(
+              topk_ids.data_ptr<scalar_t>(),
+              sorted_token_ids.data_ptr<int32_t>(),
+              experts_ids.data_ptr<int32_t>(),
+              num_tokens_post_pad.data_ptr<int32_t>(), num_experts, block_size,
+              topk_ids.numel());
+        });
+  } else {
+    VLLM_DISPATCH_INTEGRAL_TYPES(
+        topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] {
+          auto kernel =
+              vllm::moe::moe_align_block_size_kernel<scalar_t, int32_t>;
           AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(
-              (void*)kernel, shared_mem));
-          kernel<<<1, num_thread, shared_mem, stream>>>(
+              (void*)kernel, shared_mem_i32));
+          kernel<<<1, num_thread, shared_mem_i32, stream>>>(
               topk_ids.data_ptr<scalar_t>(),
               sorted_token_ids.data_ptr<int32_t>(),
               experts_ids.data_ptr<int32_t>(),
diff --git a/vllm/config.py b/vllm/config.py
index 4698a05020332..b0a92b2e21343 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -607,7 +607,7 @@ def _verify_cuda_graph(self) -> None:
         self.max_seq_len_to_capture = min(self.max_seq_len_to_capture,
                                           self.max_model_len)
 
-        MODEL_NOT_SUPPORT_CUDA_GRAPH = ['deepseek_v3', 'mllama']
+        MODEL_NOT_SUPPORT_CUDA_GRAPH = ['mllama']
         if (self.hf_config.model_type in MODEL_NOT_SUPPORT_CUDA_GRAPH
                 and not self.enforce_eager):
             logger.warning(

From ecf67814f1a9e31e9802d93e8bd8b11a1c2810e7 Mon Sep 17 00:00:00 2001
From: Michael Goin <michael@neuralmagic.com>
Date: Mon, 20 Jan 2025 20:23:40 -0500
Subject: [PATCH 299/462] Add quantization and guided decoding CODEOWNERS
 (#12228)

Signed-off-by: mgoin <michael@neuralmagic.com>
---
 .github/CODEOWNERS | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 37733ebacbc7a..bc324d8b988b1 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -9,6 +9,8 @@
 /vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth
+/vllm/model_executor/guided_decoding @mgoin
 /vllm/multimodal @DarkLight1337 @ywang96
 CMakeLists.txt @tlrmchlsmth
 

From d4b62d4641377c104baa2de7807f2c61d091cfe2 Mon Sep 17 00:00:00 2001
From: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com>
Date: Mon, 20 Jan 2025 23:22:23 -0500
Subject: [PATCH 300/462] [AMD][Build] Porting dockerfiles from the ROCm/vllm
 fork (#11777)

Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
---
 Dockerfile.rocm                               | 258 +++++++-----------
 Dockerfile.rocm_base                          | 158 +++++++++++
 .../installation/gpu/rocm.inc.md              |  13 +-
 ...14336,device_name=AMD_Instinct_MI300X.json |  36 +--
 ...=1792,device_name=AMD_Instinct_MI300X.json |  36 +--
 ...=3584,device_name=AMD_Instinct_MI300X.json |  36 +--
 ...=7168,device_name=AMD_Instinct_MI300X.json |  36 +--
 7 files changed, 337 insertions(+), 236 deletions(-)
 create mode 100644 Dockerfile.rocm_base

diff --git a/Dockerfile.rocm b/Dockerfile.rocm
index e922cb207b899..7213a15a2e005 100644
--- a/Dockerfile.rocm
+++ b/Dockerfile.rocm
@@ -1,174 +1,118 @@
-# Default ROCm 6.2 base image
-ARG BASE_IMAGE="rocm/pytorch:rocm6.2_ubuntu20.04_py3.9_pytorch_release_2.3.0"
+# default base image
+ARG REMOTE_VLLM="0"
+ARG USE_CYTHON="0"
+ARG BUILD_RPD="1"
+ARG COMMON_WORKDIR=/app
+ARG BASE_IMAGE=rocm/vllm-dev:base
 
-# Default ROCm ARCHes to build vLLM for.
-ARG PYTORCH_ROCM_ARCH="gfx908;gfx90a;gfx942;gfx1100"
+FROM ${BASE_IMAGE} AS base
 
-# Whether to install CK-based flash-attention
-# If 0, will not install flash-attention
-ARG BUILD_FA="1"
-ARG FA_GFX_ARCHS="gfx90a;gfx942"
-ARG FA_BRANCH="3cea2fb"
-
-# Whether to build triton on rocm
-ARG BUILD_TRITON="1"
-ARG TRITON_BRANCH="e192dba"
-
-### Base image build stage
-FROM $BASE_IMAGE AS base
-
-# Import arg(s) defined before this build stage
-ARG PYTORCH_ROCM_ARCH
+ARG ARG_PYTORCH_ROCM_ARCH
+ENV PYTORCH_ROCM_ARCH=${ARG_PYTORCH_ROCM_ARCH:-${PYTORCH_ROCM_ARCH}}
 
 # Install some basic utilities
-RUN apt-get update && apt-get install python3 python3-pip -y
-RUN apt-get update && apt-get install -y \
-    curl \
-    ca-certificates \
-    sudo \
-    git \
-    bzip2 \
-    libx11-6 \
-    build-essential \
-    wget \
-    unzip \
-    tmux \
-    ccache \
- && rm -rf /var/lib/apt/lists/*
-
-# When launching the container, mount the code directory to /vllm-workspace
-ARG APP_MOUNT=/vllm-workspace
-WORKDIR ${APP_MOUNT}
-
-RUN python3 -m pip install --upgrade pip
-# Remove sccache so it doesn't interfere with ccache
-# TODO: implement sccache support across components
+RUN apt-get update -q -y && apt-get install -q -y \
+    sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev
+# Remove sccache    
+RUN python3 -m pip install --upgrade pip && pip install setuptools_scm
 RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)"
+ARG COMMON_WORKDIR
+WORKDIR ${COMMON_WORKDIR}
+
+
+# -----------------------
+# vLLM fetch stages
+FROM base AS fetch_vllm_0
+ONBUILD COPY ./ vllm/
+FROM base AS fetch_vllm_1
+ARG VLLM_REPO="https://github.com/vllm-project/vllm.git"
+ARG VLLM_BRANCH="main"
+ONBUILD RUN git clone ${VLLM_REPO} \
+	    && cd vllm \
+	    && git checkout ${VLLM_BRANCH}
+FROM fetch_vllm_${REMOTE_VLLM} AS fetch_vllm
+
+# -----------------------
+# vLLM build stages
+FROM fetch_vllm AS build_vllm
+ARG USE_CYTHON
+# Build vLLM
+RUN cd vllm \
+    && python3 -m pip install -r requirements-rocm.txt \
+    && python3 setup.py clean --all  \
+    && if [ ${USE_CYTHON} -eq "1" ]; then python3 setup_cython.py build_ext --inplace; fi \
+    && python3 setup.py bdist_wheel --dist-dir=dist
+FROM scratch AS export_vllm
+ARG COMMON_WORKDIR
+COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/dist/*.whl /
+COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/requirements*.txt /
+COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/benchmarks /benchmarks
+COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/tests /tests
+COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/examples /examples
+COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/.buildkite /.buildkite
+
+# -----------------------
+# Test vLLM image
+FROM base AS test
+
+RUN python3 -m pip install --upgrade pip && rm -rf /var/lib/apt/lists/*
+
+# Install vLLM
+RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
+    cd /install \
+    && pip install -U -r requirements-rocm.txt \
+    && pip uninstall -y vllm \
+    && pip install *.whl
+
+WORKDIR /vllm-workspace
+ARG COMMON_WORKDIR
+COPY --from=build_vllm ${COMMON_WORKDIR}/vllm /vllm-workspace
 
-# Install torch == 2.6.0 on ROCm
-RUN --mount=type=cache,target=/root/.cache/pip \
-    case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
-        *"rocm-6.2"*) \
-            python3 -m pip uninstall -y torch torchvision \
-            && python3 -m pip install --pre \
-                torch \
-                'setuptools-scm>=8' \
-                torchvision \
-                --extra-index-url https://download.pytorch.org/whl/rocm6.2;; \
-        *) ;; esac
+# install development dependencies (for testing)
+RUN cd /vllm-workspace \
+    && rm -rf vllm \
+    && python3 -m pip install -e tests/vllm_test_utils \
+    && python3 -m pip install lm-eval[api]==0.4.4
 
-ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer
-ENV PATH=$PATH:/opt/rocm/bin:/libtorch/bin:
-ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib/:/libtorch/lib:
-ENV CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/libtorch/include:/libtorch/include/torch/csrc/api/include/:/opt/rocm/include/:
-
-ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
-ENV CCACHE_DIR=/root/.cache/ccache
-
-
-### AMD-SMI build stage
-FROM base AS build_amdsmi
-# Build amdsmi wheel always
-RUN cd /opt/rocm/share/amd_smi \
-    && python3 -m pip wheel . --wheel-dir=/install
-
-
-### Flash-Attention wheel build stage
-FROM base AS build_fa
-ARG BUILD_FA
-ARG FA_GFX_ARCHS
-ARG FA_BRANCH
-# Build ROCm flash-attention wheel if `BUILD_FA = 1`
-RUN --mount=type=cache,target=${CCACHE_DIR} \
-    if [ "$BUILD_FA" = "1" ]; then \
-        mkdir -p libs \
-        && cd libs \
-        && git clone https://github.com/ROCm/flash-attention.git \
-        && cd flash-attention \
-        && git checkout "${FA_BRANCH}" \
-        && git submodule update --init \
-        && GPU_ARCHS="${FA_GFX_ARCHS}" python3 setup.py bdist_wheel --dist-dir=/install; \
-    # Create an empty directory otherwise as later build stages expect one
-    else mkdir -p /install; \
-    fi
-
-
-### Triton wheel build stage
-FROM base AS build_triton
-ARG BUILD_TRITON
-ARG TRITON_BRANCH
-# Build triton wheel if `BUILD_TRITON = 1`
-RUN --mount=type=cache,target=${CCACHE_DIR} \
-    if [ "$BUILD_TRITON" = "1" ]; then \
-    mkdir -p libs \
-    && cd libs \
-    && python3 -m pip install ninja cmake wheel pybind11 \
-    && git clone https://github.com/OpenAI/triton.git \
-    && cd triton \
-    && git checkout "${TRITON_BRANCH}" \
-    && cd python \
-    && python3 setup.py bdist_wheel --dist-dir=/install; \
-    # Create an empty directory otherwise as later build stages expect one
-    else mkdir -p /install; \
-    fi
-
-
-### Final vLLM build stage
+# -----------------------
+# Final vLLM image
 FROM base AS final
-# Import the vLLM development directory from the build context
-COPY . .
-ARG GIT_REPO_CHECK=0
-RUN --mount=type=bind,source=.git,target=.git \
-    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
 
-RUN python3 -m pip install --upgrade pip
+RUN python3 -m pip install --upgrade pip && rm -rf /var/lib/apt/lists/*
+# Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt.
+# Manually remove it so that later steps of numpy upgrade can continue
+RUN case "$(which python3)" in \
+        *"/opt/conda/envs/py_3.9"*) \
+            rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/;; \
+        *) ;; esac
 
-# Package upgrades for useful functionality or to avoid dependency issues
-RUN --mount=type=cache,target=/root/.cache/pip \
-    python3 -m pip install --upgrade numba scipy huggingface-hub[cli] pytest-shard
+RUN python3 -m pip install --upgrade huggingface-hub[cli]
+ARG BUILD_RPD
+RUN if [ ${BUILD_RPD} -eq "1" ]; then \
+    git clone -b nvtx_enabled https://github.com/ROCm/rocmProfileData.git \
+    && cd rocmProfileData/rpd_tracer \
+    && pip install -r requirements.txt && cd ../ \
+    && make && make install \
+    && cd hipMarker && python3 setup.py install ; fi
 
+# Install vLLM
+RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
+    cd /install \
+    && pip install -U -r requirements-rocm.txt \
+    && pip uninstall -y vllm \
+    && pip install *.whl
+
+ARG COMMON_WORKDIR
+
+# Copy over the benchmark scripts as well
+COPY --from=export_vllm /benchmarks ${COMMON_WORKDIR}/vllm/benchmarks
+COPY --from=export_vllm /examples ${COMMON_WORKDIR}/vllm/examples
 
-# Workaround for ray >= 2.10.0
 ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
-# Silences the HF Tokenizers warning
 ENV TOKENIZERS_PARALLELISM=false
 
-RUN --mount=type=cache,target=${CCACHE_DIR} \
-    --mount=type=bind,source=.git,target=.git \
-    --mount=type=cache,target=/root/.cache/pip \
-    python3 -m pip install -Ur requirements-rocm.txt \
-    && python3 setup.py clean --all \
-    && python3 setup.py develop
-
-# Copy amdsmi wheel into final image
-RUN --mount=type=bind,from=build_amdsmi,src=/install,target=/install \
-    mkdir -p libs \
-    && cp /install/*.whl libs \
-    # Preemptively uninstall to avoid same-version no-installs
-    && python3 -m pip uninstall -y amdsmi;
-
-# Copy triton wheel(s) into final image if they were built
-RUN --mount=type=bind,from=build_triton,src=/install,target=/install \
-    mkdir -p libs \
-    && if ls /install/*.whl; then \
-        cp /install/*.whl libs \
-        # Preemptively uninstall to avoid same-version no-installs
-        && python3 -m pip uninstall -y triton; fi
-
-# Copy flash-attn wheel(s) into final image if they were built
-RUN --mount=type=bind,from=build_fa,src=/install,target=/install \
-    mkdir -p libs \
-    && if ls /install/*.whl; then \
-        cp /install/*.whl libs \
-        # Preemptively uninstall to avoid same-version no-installs
-        && python3 -m pip uninstall -y flash-attn; fi
-
-# Install wheels that were built to the final image
-RUN --mount=type=cache,target=/root/.cache/pip \
-    if ls libs/*.whl; then \
-    python3 -m pip install libs/*.whl; fi
-
-# install development dependencies (for testing)
-RUN python3 -m pip install -e tests/vllm_test_utils
+# Performance environment variable.
+ENV HIP_FORCE_DEV_KERNARG=1
 
 CMD ["/bin/bash"]
+
diff --git a/Dockerfile.rocm_base b/Dockerfile.rocm_base
new file mode 100644
index 0000000000000..5bbe98b0c2204
--- /dev/null
+++ b/Dockerfile.rocm_base
@@ -0,0 +1,158 @@
+ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:6.3.1-complete
+ARG HIPBLASLT_BRANCH="4d40e36"
+ARG HIPBLAS_COMMON_BRANCH="7c1566b"
+ARG LEGACY_HIPBLASLT_OPTION=
+ARG RCCL_BRANCH="648a58d"
+ARG RCCL_REPO="https://github.com/ROCm/rccl"
+ARG TRITON_BRANCH="e5be006"
+ARG TRITON_REPO="https://github.com/triton-lang/triton.git"
+ARG PYTORCH_BRANCH="8d4926e"
+ARG PYTORCH_VISION_BRANCH="v0.19.1"
+ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git"
+ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
+ARG FA_BRANCH="b7d29fb"
+ARG FA_REPO="https://github.com/ROCm/flash-attention.git"
+
+FROM ${BASE_IMAGE} AS base
+
+ENV PATH=/opt/rocm/llvm/bin:$PATH
+ENV ROCM_PATH=/opt/rocm
+ENV LD_LIBRARY_PATH=/opt/rocm/lib:/usr/local/lib:
+ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942
+ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
+
+ARG PYTHON_VERSION=3.12
+
+RUN mkdir -p /app
+WORKDIR /app
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Install Python and other dependencies
+RUN apt-get update -y \
+    && apt-get install -y software-properties-common git curl sudo vim less \
+    && add-apt-repository ppa:deadsnakes/ppa \
+    && apt-get update -y \
+    && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
+       python${PYTHON_VERSION}-lib2to3 python-is-python3  \
+    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
+    && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
+    && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
+    && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
+    && python3 --version && python3 -m pip --version
+
+RUN pip install -U packaging cmake ninja wheel setuptools pybind11 Cython
+
+FROM base AS build_hipblaslt
+ARG HIPBLASLT_BRANCH
+ARG HIPBLAS_COMMON_BRANCH
+# Set to "--legacy_hipblas_direct" for ROCm<=6.2
+ARG LEGACY_HIPBLASLT_OPTION
+RUN git clone https://github.com/ROCm/hipBLAS-common.git
+RUN cd hipBLAS-common \
+    && git checkout ${HIPBLAS_COMMON_BRANCH} \
+    && mkdir build \
+    && cd build \
+    && cmake .. \
+    && make package \
+    && dpkg -i ./*.deb
+RUN git clone https://github.com/ROCm/hipBLASLt
+RUN cd hipBLASLt \
+    && git checkout ${HIPBLASLT_BRANCH} \
+    && ./install.sh -d --architecture ${PYTORCH_ROCM_ARCH} ${LEGACY_HIPBLASLT_OPTION} \
+    && cd build/release \
+    && make package
+RUN mkdir -p /app/install && cp /app/hipBLASLt/build/release/*.deb /app/hipBLAS-common/build/*.deb /app/install
+
+FROM base AS build_rccl
+ARG RCCL_BRANCH
+ARG RCCL_REPO
+RUN git clone ${RCCL_REPO}
+RUN cd rccl \
+    && git checkout ${RCCL_BRANCH} \
+    && ./install.sh -p --amdgpu_targets ${PYTORCH_ROCM_ARCH}
+RUN mkdir -p /app/install && cp /app/rccl/build/release/*.deb /app/install
+
+FROM base AS build_triton
+ARG TRITON_BRANCH
+ARG TRITON_REPO
+RUN git clone ${TRITON_REPO}
+RUN cd triton \
+    && git checkout ${TRITON_BRANCH} \
+    && cd python \
+    && python3 setup.py bdist_wheel --dist-dir=dist
+RUN mkdir -p /app/install && cp /app/triton/python/dist/*.whl /app/install
+
+FROM base AS build_amdsmi
+RUN cd /opt/rocm/share/amd_smi \
+    && pip wheel . --wheel-dir=dist
+RUN mkdir -p /app/install && cp /opt/rocm/share/amd_smi/dist/*.whl /app/install
+
+FROM base AS build_pytorch
+ARG PYTORCH_BRANCH
+ARG PYTORCH_VISION_BRANCH
+ARG PYTORCH_REPO
+ARG PYTORCH_VISION_REPO
+ARG FA_BRANCH
+ARG FA_REPO
+RUN git clone ${PYTORCH_REPO} pytorch
+RUN cd pytorch && git checkout ${PYTORCH_BRANCH} && \
+    pip install -r requirements.txt && git submodule update --init --recursive \
+    && python3 tools/amd_build/build_amd.py \
+    && CMAKE_PREFIX_PATH=$(python3 -c 'import sys; print(sys.prefix)') python3 setup.py bdist_wheel --dist-dir=dist \
+    && pip install dist/*.whl
+RUN git clone ${PYTORCH_VISION_REPO} vision
+RUN cd vision && git checkout ${PYTORCH_VISION_BRANCH} \
+    && python3 setup.py bdist_wheel --dist-dir=dist \
+    && pip install dist/*.whl
+RUN git clone ${FA_REPO}
+RUN cd flash-attention \
+    && git checkout ${FA_BRANCH} \
+    && git submodule update --init \
+    && MAX_JOBS=64 GPU_ARCHS=${PYTORCH_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist
+RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \
+    && cp /app/vision/dist/*.whl /app/install \
+    && cp /app/flash-attention/dist/*.whl /app/install
+
+FROM base AS final
+RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \
+    dpkg -i /install/*deb \
+    && sed -i 's/, hipblaslt-dev \(.*\), hipcub-dev/, hipcub-dev/g' /var/lib/dpkg/status \
+    && sed -i 's/, hipblaslt \(.*\), hipfft/, hipfft/g' /var/lib/dpkg/status
+RUN --mount=type=bind,from=build_rccl,src=/app/install/,target=/install \
+    dpkg -i /install/*deb \
+    && sed -i 's/, rccl-dev \(.*\), rocalution/, rocalution/g' /var/lib/dpkg/status \
+    && sed -i 's/, rccl \(.*\), rocalution/, rocalution/g' /var/lib/dpkg/status
+RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \
+    pip install /install/*.whl
+RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \
+    pip install /install/*.whl
+RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
+    pip install /install/*.whl
+
+ARG BASE_IMAGE
+ARG HIPBLASLT_BRANCH
+ARG LEGACY_HIPBLASLT_OPTION
+ARG RCCL_BRANCH
+ARG RCCL_REPO
+ARG TRITON_BRANCH
+ARG TRITON_REPO
+ARG PYTORCH_BRANCH
+ARG PYTORCH_VISION_BRANCH
+ARG PYTORCH_REPO
+ARG PYTORCH_VISION_REPO
+ARG FA_BRANCH
+ARG FA_REPO
+RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
+    && echo "HIPBLAS_COMMON_BRANCH: ${HIPBLAS_COMMON_BRANCH}" >> /app/versions.txt \
+    && echo "HIPBLASLT_BRANCH: ${HIPBLASLT_BRANCH}" >> /app/versions.txt \
+    && echo "LEGACY_HIPBLASLT_OPTION: ${LEGACY_HIPBLASLT_OPTION}" >> /app/versions.txt \
+    && echo "RCCL_BRANCH: ${RCCL_BRANCH}" >> /app/versions.txt \
+    && echo "RCCL_REPO: ${RCCL_REPO}" >> /app/versions.txt \
+    && echo "TRITON_BRANCH: ${TRITON_BRANCH}" >> /app/versions.txt \
+    && echo "TRITON_REPO: ${TRITON_REPO}" >> /app/versions.txt \
+    && echo "PYTORCH_BRANCH: ${PYTORCH_BRANCH}" >> /app/versions.txt \
+    && echo "PYTORCH_VISION_BRANCH: ${PYTORCH_VISION_BRANCH}" >> /app/versions.txt \
+    && echo "PYTORCH_REPO: ${PYTORCH_REPO}" >> /app/versions.txt \
+    && echo "PYTORCH_VISION_REPO: ${PYTORCH_VISION_REPO}" >> /app/versions.txt \
+    && echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \
+    && echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt
diff --git a/docs/source/getting_started/installation/gpu/rocm.inc.md b/docs/source/getting_started/installation/gpu/rocm.inc.md
index 4256027e6c40e..8ef1bc95fd522 100644
--- a/docs/source/getting_started/installation/gpu/rocm.inc.md
+++ b/docs/source/getting_started/installation/gpu/rocm.inc.md
@@ -123,11 +123,10 @@ It is important that the user kicks off the docker build using buildkit. Either
 <gh-file:Dockerfile.rocm> uses ROCm 6.2 by default, but also supports ROCm 5.7, 6.0 and 6.1 in older vLLM branches.
 It provides flexibility to customize the build of docker image using the following arguments:
 
-- `BASE_IMAGE`: specifies the base image used when running `docker build`, specifically the PyTorch on ROCm base image.
-- `BUILD_FA`: specifies whether to build CK flash-attention. The default is 1. For [Radeon RX 7900 series (gfx1100)](https://rocm.docs.amd.com/projects/radeon/en/latest/index.html), this should be set to 0 before flash-attention supports this target.
-- `FX_GFX_ARCHS`: specifies the GFX architecture that is used to build CK flash-attention, for example, `gfx90a;gfx942` for MI200 and MI300. The default is `gfx90a;gfx942`
-- `FA_BRANCH`: specifies the branch used to build the CK flash-attention in [ROCm's flash-attention repo](https://github.com/ROCmSoftwarePlatform/flash-attention). The default is `ae7928c`
-- `BUILD_TRITON`: specifies whether to build triton flash-attention. The default value is 1.
+- `BASE_IMAGE`: specifies the base image used when running `docker build`. The default value `rocm/vllm-dev:base` is an image published and maintained by AMD. It is being built using <gh-file:Dockerfile.rocm_base>
+- `USE_CYTHON`: An option to run cython compilation on a subset of python files upon docker build
+- `BUILD_RPD`: Include RocmProfileData profiling tool in the image
+- `ARG_PYTORCH_ROCM_ARCH`: Allows to override the gfx architecture values from the base docker image
 
 Their values can be passed in when running `docker build` with `--build-arg` options.
 
@@ -137,10 +136,10 @@ To build vllm on ROCm 6.2 for MI200 and MI300 series, you can use the default:
 DOCKER_BUILDKIT=1 docker build -f Dockerfile.rocm -t vllm-rocm .
 ```
 
-To build vllm on ROCm 6.2 for Radeon RX7900 series (gfx1100), you should specify `BUILD_FA` as below:
+To build vllm on ROCm 6.2 for Radeon RX7900 series (gfx1100), you should pick the alternative base image:
 
 ```console
-DOCKER_BUILDKIT=1 docker build --build-arg BUILD_FA="0" -f Dockerfile.rocm -t vllm-rocm .
+DOCKER_BUILDKIT=1 docker build --build-arg BASE_IMAGE="rocm/vllm-dev:navi_base" -f Dockerfile.rocm -t vllm-rocm .
 ```
 
 To run the above docker image `vllm-rocm`, use the below command:
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json
index 6a976788f9b10..4d4b752fa5d64 100644
--- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json
@@ -5,7 +5,7 @@
         "BLOCK_SIZE_K": 256,
         "GROUP_SIZE_M": 1,
         "num_warps": 2,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
         "kpack": 1
@@ -16,7 +16,7 @@
         "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 1,
         "num_warps": 2,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
         "kpack": 2
@@ -27,7 +27,7 @@
         "BLOCK_SIZE_K": 256,
         "GROUP_SIZE_M": 1,
         "num_warps": 2,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
         "kpack": 2
@@ -38,7 +38,7 @@
         "BLOCK_SIZE_K": 256,
         "GROUP_SIZE_M": 1,
         "num_warps": 1,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
         "kpack": 2
@@ -49,7 +49,7 @@
         "BLOCK_SIZE_K": 256,
         "GROUP_SIZE_M": 1,
         "num_warps": 4,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
         "kpack": 2
@@ -60,7 +60,7 @@
         "BLOCK_SIZE_K": 64,
         "GROUP_SIZE_M": 1,
         "num_warps": 1,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
         "kpack": 2
@@ -71,7 +71,7 @@
         "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 4,
         "num_warps": 2,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
         "kpack": 1
@@ -82,7 +82,7 @@
         "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 4,
         "num_warps": 2,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
         "kpack": 2
@@ -93,7 +93,7 @@
         "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 4,
         "num_warps": 8,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
         "kpack": 2
@@ -104,7 +104,7 @@
         "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 4,
         "num_warps": 4,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
         "kpack": 2
@@ -115,7 +115,7 @@
         "BLOCK_SIZE_K": 64,
         "GROUP_SIZE_M": 4,
         "num_warps": 8,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
         "kpack": 2
@@ -126,7 +126,7 @@
         "BLOCK_SIZE_K": 64,
         "GROUP_SIZE_M": 4,
         "num_warps": 8,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
         "kpack": 1
@@ -137,7 +137,7 @@
         "BLOCK_SIZE_K": 64,
         "GROUP_SIZE_M": 4,
         "num_warps": 8,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
         "kpack": 2
@@ -148,7 +148,7 @@
         "BLOCK_SIZE_K": 64,
         "GROUP_SIZE_M": 1,
         "num_warps": 8,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 32,
         "kpack": 2
@@ -159,7 +159,7 @@
         "BLOCK_SIZE_K": 64,
         "GROUP_SIZE_M": 1,
         "num_warps": 8,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
         "kpack": 2
@@ -170,7 +170,7 @@
         "BLOCK_SIZE_K": 64,
         "GROUP_SIZE_M": 1,
         "num_warps": 8,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
         "kpack": 2
@@ -181,7 +181,7 @@
         "BLOCK_SIZE_K": 64,
         "GROUP_SIZE_M": 1,
         "num_warps": 8,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
         "kpack": 1
@@ -192,7 +192,7 @@
         "BLOCK_SIZE_K": 64,
         "GROUP_SIZE_M": 1,
         "num_warps": 8,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
         "kpack": 1
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json
index 0a46390b2e31b..a218fc40642c1 100644
--- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json
@@ -5,7 +5,7 @@
         "BLOCK_SIZE_K": 256,
         "GROUP_SIZE_M": 1,
         "num_warps": 2,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
         "kpack": 2
@@ -16,7 +16,7 @@
         "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 1,
         "num_warps": 4,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
         "kpack": 1
@@ -27,7 +27,7 @@
         "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 1,
         "num_warps": 4,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
         "kpack": 2
@@ -38,7 +38,7 @@
         "BLOCK_SIZE_K": 256,
         "GROUP_SIZE_M": 1,
         "num_warps": 2,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
         "kpack": 2
@@ -49,7 +49,7 @@
         "BLOCK_SIZE_K": 64,
         "GROUP_SIZE_M": 1,
         "num_warps": 8,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
         "kpack": 1
@@ -60,7 +60,7 @@
         "BLOCK_SIZE_K": 64,
         "GROUP_SIZE_M": 1,
         "num_warps": 4,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
         "kpack": 1
@@ -71,7 +71,7 @@
         "BLOCK_SIZE_K": 256,
         "GROUP_SIZE_M": 4,
         "num_warps": 2,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
         "kpack": 2
@@ -82,7 +82,7 @@
         "BLOCK_SIZE_K": 64,
         "GROUP_SIZE_M": 1,
         "num_warps": 8,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
         "kpack": 2
@@ -93,7 +93,7 @@
         "BLOCK_SIZE_K": 64,
         "GROUP_SIZE_M": 1,
         "num_warps": 2,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
         "kpack": 1
@@ -104,7 +104,7 @@
         "BLOCK_SIZE_K": 256,
         "GROUP_SIZE_M": 4,
         "num_warps": 4,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
         "kpack": 2
@@ -115,7 +115,7 @@
         "BLOCK_SIZE_K": 64,
         "GROUP_SIZE_M": 4,
         "num_warps": 8,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
         "kpack": 1
@@ -126,7 +126,7 @@
         "BLOCK_SIZE_K": 64,
         "GROUP_SIZE_M": 4,
         "num_warps": 8,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
         "kpack": 1
@@ -137,7 +137,7 @@
         "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 1,
         "num_warps": 8,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
         "kpack": 2
@@ -148,7 +148,7 @@
         "BLOCK_SIZE_K": 64,
         "GROUP_SIZE_M": 1,
         "num_warps": 8,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
         "kpack": 1
@@ -159,7 +159,7 @@
         "BLOCK_SIZE_K": 64,
         "GROUP_SIZE_M": 1,
         "num_warps": 8,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
         "kpack": 1
@@ -170,7 +170,7 @@
         "BLOCK_SIZE_K": 64,
         "GROUP_SIZE_M": 1,
         "num_warps": 8,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
         "kpack": 2
@@ -181,7 +181,7 @@
         "BLOCK_SIZE_K": 64,
         "GROUP_SIZE_M": 1,
         "num_warps": 8,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
         "kpack": 1
@@ -192,7 +192,7 @@
         "BLOCK_SIZE_K": 64,
         "GROUP_SIZE_M": 1,
         "num_warps": 8,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
         "kpack": 1
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json
index 91011e64c7de4..3682cc548f352 100644
--- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json
@@ -5,7 +5,7 @@
         "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 1,
         "num_warps": 2,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
         "kpack": 1
@@ -16,7 +16,7 @@
         "BLOCK_SIZE_K": 64,
         "GROUP_SIZE_M": 1,
         "num_warps": 2,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
         "kpack": 2
@@ -27,7 +27,7 @@
         "BLOCK_SIZE_K": 256,
         "GROUP_SIZE_M": 1,
         "num_warps": 2,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
         "kpack": 2
@@ -38,7 +38,7 @@
         "BLOCK_SIZE_K": 256,
         "GROUP_SIZE_M": 1,
         "num_warps": 2,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
         "kpack": 2
@@ -49,7 +49,7 @@
         "BLOCK_SIZE_K": 256,
         "GROUP_SIZE_M": 1,
         "num_warps": 2,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
         "kpack": 2
@@ -60,7 +60,7 @@
         "BLOCK_SIZE_K": 64,
         "GROUP_SIZE_M": 1,
         "num_warps": 4,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
         "kpack": 1
@@ -71,7 +71,7 @@
         "BLOCK_SIZE_K": 256,
         "GROUP_SIZE_M": 4,
         "num_warps": 2,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
         "kpack": 2
@@ -82,7 +82,7 @@
         "BLOCK_SIZE_K": 256,
         "GROUP_SIZE_M": 1,
         "num_warps": 2,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
         "kpack": 2
@@ -93,7 +93,7 @@
         "BLOCK_SIZE_K": 256,
         "GROUP_SIZE_M": 4,
         "num_warps": 4,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
         "kpack": 2
@@ -104,7 +104,7 @@
         "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 4,
         "num_warps": 4,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
         "kpack": 1
@@ -115,7 +115,7 @@
         "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 4,
         "num_warps": 8,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
         "kpack": 1
@@ -126,7 +126,7 @@
         "BLOCK_SIZE_K": 64,
         "GROUP_SIZE_M": 4,
         "num_warps": 8,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
         "kpack": 1
@@ -137,7 +137,7 @@
         "BLOCK_SIZE_K": 64,
         "GROUP_SIZE_M": 1,
         "num_warps": 8,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 32,
         "kpack": 2
@@ -148,7 +148,7 @@
         "BLOCK_SIZE_K": 64,
         "GROUP_SIZE_M": 1,
         "num_warps": 8,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
         "kpack": 1
@@ -159,7 +159,7 @@
         "BLOCK_SIZE_K": 64,
         "GROUP_SIZE_M": 1,
         "num_warps": 8,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
         "kpack": 2
@@ -170,7 +170,7 @@
         "BLOCK_SIZE_K": 64,
         "GROUP_SIZE_M": 1,
         "num_warps": 8,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
         "kpack": 1
@@ -181,7 +181,7 @@
         "BLOCK_SIZE_K": 64,
         "GROUP_SIZE_M": 1,
         "num_warps": 8,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
         "kpack": 2
@@ -192,7 +192,7 @@
         "BLOCK_SIZE_K": 64,
         "GROUP_SIZE_M": 1,
         "num_warps": 8,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
         "kpack": 1
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json
index f807d4a5abaed..21742854c613f 100644
--- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json
@@ -5,7 +5,7 @@
         "BLOCK_SIZE_K": 256,
         "GROUP_SIZE_M": 1,
         "num_warps": 2,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
         "kpack": 2
@@ -16,7 +16,7 @@
         "BLOCK_SIZE_K": 32,
         "GROUP_SIZE_M": 1,
         "num_warps": 4,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
         "kpack": 1
@@ -27,7 +27,7 @@
         "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 1,
         "num_warps": 4,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
         "kpack": 1
@@ -38,7 +38,7 @@
         "BLOCK_SIZE_K": 256,
         "GROUP_SIZE_M": 1,
         "num_warps": 2,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
         "kpack": 1
@@ -49,7 +49,7 @@
         "BLOCK_SIZE_K": 256,
         "GROUP_SIZE_M": 1,
         "num_warps": 4,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
         "kpack": 2
@@ -60,7 +60,7 @@
         "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 1,
         "num_warps": 8,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
         "kpack": 1
@@ -71,7 +71,7 @@
         "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 4,
         "num_warps": 2,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
         "kpack": 2
@@ -82,7 +82,7 @@
         "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 4,
         "num_warps": 2,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
         "kpack": 1
@@ -93,7 +93,7 @@
         "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 4,
         "num_warps": 4,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
         "kpack": 2
@@ -104,7 +104,7 @@
         "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 4,
         "num_warps": 4,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
         "kpack": 2
@@ -115,7 +115,7 @@
         "BLOCK_SIZE_K": 64,
         "GROUP_SIZE_M": 4,
         "num_warps": 8,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
         "kpack": 1
@@ -126,7 +126,7 @@
         "BLOCK_SIZE_K": 64,
         "GROUP_SIZE_M": 4,
         "num_warps": 8,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 32,
         "kpack": 2
@@ -137,7 +137,7 @@
         "BLOCK_SIZE_K": 64,
         "GROUP_SIZE_M": 1,
         "num_warps": 8,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
         "kpack": 1
@@ -148,7 +148,7 @@
         "BLOCK_SIZE_K": 64,
         "GROUP_SIZE_M": 1,
         "num_warps": 8,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
         "kpack": 1
@@ -159,7 +159,7 @@
         "BLOCK_SIZE_K": 64,
         "GROUP_SIZE_M": 1,
         "num_warps": 8,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
         "kpack": 2
@@ -170,7 +170,7 @@
         "BLOCK_SIZE_K": 64,
         "GROUP_SIZE_M": 1,
         "num_warps": 8,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
         "kpack": 1
@@ -181,7 +181,7 @@
         "BLOCK_SIZE_K": 64,
         "GROUP_SIZE_M": 1,
         "num_warps": 8,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
         "kpack": 2
@@ -192,7 +192,7 @@
         "BLOCK_SIZE_K": 64,
         "GROUP_SIZE_M": 1,
         "num_warps": 8,
-        "num_stages": 0,
+        "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
         "kpack": 1

From 5fe6bf29d657518eb4251981ada9f8c4f34dbbde Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= <nlucches@redhat.com>
Date: Tue, 21 Jan 2025 05:23:14 +0100
Subject: [PATCH 301/462] [BugFix] Fix GGUF tp>1 when vocab_size is not
 divisible by 64 (#12230)

Signed-off-by: NickLucche <nlucches@redhat.com>
---
 tests/models/decoder_only/language/test_gguf.py        | 10 ++++++++++
 vllm/model_executor/layers/vocab_parallel_embedding.py |  4 ++--
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/tests/models/decoder_only/language/test_gguf.py b/tests/models/decoder_only/language/test_gguf.py
index 81b93ebdf0fc0..38cea2462b440 100644
--- a/tests/models/decoder_only/language/test_gguf.py
+++ b/tests/models/decoder_only/language/test_gguf.py
@@ -66,12 +66,20 @@ def gguf_model(self):
     gguf_filename="starcoder2-3b.Q6_K.gguf",
 )
 
+DOLPHIN_CONFIG = GGUFTestConfig(
+    # Test VocabParallelEmbedding sharding issue.
+    original_model="cognitivecomputations/TinyDolphin-2.8-1.1b",
+    gguf_repo="tsunemoto/TinyDolphin-2.8-1.1b-GGUF",
+    gguf_filename="tinydolphin-2.8-1.1b.Q6_K.gguf",
+)
+
 MODELS = [
     LLAMA_CONFIG,
     QWEN2_CONFIG,
     PHI3_CONFIG,
     GPT2_CONFIG,
     STABLELM_CONFIG,
+    DOLPHIN_CONFIG
     # STARCODER_CONFIG, # broken
 ]
 
@@ -107,6 +115,7 @@ def test_models(
 
     # Run unquantized model.
     with vllm_runner(model_name=model.original_model,
+                     enforce_eager=True, # faster tests
                      dtype=dtype,
                      max_model_len=MAX_MODEL_LEN,
                      tensor_parallel_size=tp_size) as original_model:
@@ -115,6 +124,7 @@ def test_models(
 
     # Run gguf model.
     with vllm_runner(model_name=model.gguf_model,
+                     enforce_eager=True,
                      tokenizer_name=model.original_model,
                      dtype=dtype,
                      max_model_len=MAX_MODEL_LEN,
diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py
index 65920aa61ba15..3eb5c39ccf580 100644
--- a/vllm/model_executor/layers/vocab_parallel_embedding.py
+++ b/vllm/model_executor/layers/vocab_parallel_embedding.py
@@ -355,7 +355,7 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
         elif isinstance(param, UninitializedParameter):
             shape = list(loaded_weight.shape)
             if output_dim is not None:
-                shape[output_dim] = shape[output_dim] // self.tp_size
+                shape[output_dim] = self.num_embeddings_per_partition
             param.materialize(tuple(shape), dtype=loaded_weight.dtype)
 
         # If parameter does not have output dim, then it should
@@ -381,7 +381,7 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
         else:
             assert loaded_weight.shape[output_dim] == self.org_vocab_size
 
-        # Copy the data.
+        # Copy the data. Select chunk corresponding to current shard.
         loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size)
 
         if current_platform.is_hpu():

From 2fc6944c5e69d5d0ce15d09a855452c795d75c3c Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Tue, 21 Jan 2025 13:25:03 +0800
Subject: [PATCH 302/462] [ci/build] disable failed and flaky tests (#12240)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 .buildkite/test-pipeline.yaml | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index d2b140e718501..ed8c15358830c 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -477,7 +477,9 @@ steps:
   - pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)'
   - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)'
   - pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)'
-  - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
+  # this test fails consistently.
+  # TODO: investigate and fix
+  # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py
 
@@ -515,7 +517,9 @@ steps:
   - vllm/engine
   - tests/multi_step
   commands:
-  - pytest -v -s multi_step/test_correctness_async_llm.py
+  # this test is quite flaky
+  # TODO: investigate and fix.
+  # - pytest -v -s multi_step/test_correctness_async_llm.py
   - pytest -v -s multi_step/test_correctness_llm.py
 
 - label: Pipeline Parallelism Test # 45min

From 96912550c8399af2632f3f6830f7c3fa9e10a75a Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Tue, 21 Jan 2025 15:31:19 +0800
Subject: [PATCH 303/462] [Misc] Rename `MultiModalInputsV2 ->
 MultiModalInputs` (#12244)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 docs/source/api/multimodal/inputs.md      |  2 +-
 vllm/inputs/data.py                       | 12 ++++++------
 vllm/inputs/preprocess.py                 |  6 +++---
 vllm/model_executor/models/blip2.py       |  4 ++--
 vllm/model_executor/models/chameleon.py   |  4 ++--
 vllm/model_executor/models/fuyu.py        |  4 ++--
 vllm/model_executor/models/llava.py       |  6 +++---
 vllm/model_executor/models/phi3v.py       |  4 ++--
 vllm/model_executor/models/qwen2_audio.py |  4 ++--
 vllm/multimodal/inputs.py                 |  2 +-
 vllm/multimodal/processing.py             | 10 +++++-----
 vllm/multimodal/profiling.py              |  4 ++--
 12 files changed, 31 insertions(+), 31 deletions(-)

diff --git a/docs/source/api/multimodal/inputs.md b/docs/source/api/multimodal/inputs.md
index 76b2fb95a5009..21bd938be9e89 100644
--- a/docs/source/api/multimodal/inputs.md
+++ b/docs/source/api/multimodal/inputs.md
@@ -43,7 +43,7 @@
 ```
 
 ```{eval-rst}
-.. autoclass:: vllm.multimodal.inputs.MultiModalInputsV2
+.. autoclass:: vllm.multimodal.inputs.MultiModalInputs
     :members:
     :show-inheritance:
 ```
diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py
index b8163a7acde1d..57e85779dd587 100644
--- a/vllm/inputs/data.py
+++ b/vllm/inputs/data.py
@@ -9,7 +9,7 @@
 if TYPE_CHECKING:
     from vllm.multimodal import (MultiModalDataDict, MultiModalKwargs,
                                  MultiModalPlaceholderDict)
-    from vllm.multimodal.inputs import MultiModalInputsV2
+    from vllm.multimodal.inputs import MultiModalInputs
 
 
 class TextPrompt(TypedDict):
@@ -207,7 +207,7 @@ def token_inputs(
     return inputs
 
 
-DecoderOnlyInputs = Union[TokenInputs, "MultiModalInputsV2"]
+DecoderOnlyInputs = Union[TokenInputs, "MultiModalInputs"]
 """
 The inputs in :class:`~vllm.LLMEngine` before they are
 passed to the model executor.
@@ -222,14 +222,14 @@ class EncoderDecoderInputs(TypedDict):
 
     This specifies the required data for encoder-decoder models.
     """
-    encoder: Union[TokenInputs, "MultiModalInputsV2"]
+    encoder: Union[TokenInputs, "MultiModalInputs"]
     """The inputs for the encoder portion."""
 
-    decoder: Union[TokenInputs, "MultiModalInputsV2"]
+    decoder: Union[TokenInputs, "MultiModalInputs"]
     """The inputs for the decoder portion."""
 
 
-SingletonInputs = Union[TokenInputs, "MultiModalInputsV2"]
+SingletonInputs = Union[TokenInputs, "MultiModalInputs"]
 """
 A processed :class:`SingletonPrompt` which can be passed to
 :class:`vllm.sequence.Sequence`.
@@ -311,7 +311,7 @@ def multi_modal_hashes(self) -> List[str]:
             return inputs.get("multi_modal_hashes", [])
 
         if inputs["type"] == "multimodal":
-            # only the case when we use MultiModalInputsV2
+            # only the case when we use MultiModalInputs
             return inputs.get("mm_hashes", [])  # type: ignore[return-value]
 
         assert_never(inputs)  # type: ignore[arg-type]
diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py
index 0890883cc984f..70372e0cad22d 100644
--- a/vllm/inputs/preprocess.py
+++ b/vllm/inputs/preprocess.py
@@ -7,7 +7,7 @@
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
-from vllm.multimodal.inputs import MultiModalDataDict, MultiModalInputsV2
+from vllm.multimodal.inputs import MultiModalDataDict, MultiModalInputs
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup
 
@@ -247,7 +247,7 @@ def _process_multimodal(
         mm_data: MultiModalDataDict,
         mm_processor_kwargs: Optional[Mapping[str, object]],
         lora_request: Optional[LoRARequest],
-    ) -> MultiModalInputsV2:
+    ) -> MultiModalInputs:
         """
         Apply the model's multi-modal processor to a multi-modal prompt,
         returning the corresponding token IDs and metadata.
@@ -271,7 +271,7 @@ async def _process_multimodal_async(
         mm_data: MultiModalDataDict,
         mm_processor_kwargs: Optional[Mapping[str, object]],
         lora_request: Optional[LoRARequest],
-    ) -> MultiModalInputsV2:
+    ) -> MultiModalInputs:
         """Async version of :meth:`_process_multimodal`."""
         tokenizer_group = self.get_tokenizer_group()
         tokenizer = await tokenizer_group.get_lora_tokenizer_async(lora_request
diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py
index 917b88e802071..f5c796b1acae6 100644
--- a/vllm/model_executor/models/blip2.py
+++ b/vllm/model_executor/models/blip2.py
@@ -15,7 +15,7 @@
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalInputsV2, MultiModalKwargs,
+                                    MultiModalInputs, MultiModalKwargs,
                                     NestedTensors, PlaceholderRange)
 from vllm.multimodal.parse import MultiModalDataItems
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
@@ -490,7 +490,7 @@ def apply(
         prompt: Union[str, list[int]],
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
-    ) -> MultiModalInputsV2:
+    ) -> MultiModalInputs:
         result = super().apply(prompt, mm_data, hf_processor_mm_kwargs)
 
         # Only <image> tokens should be considered as placeholders,
diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py
index a6634204699c9..e2207865a693d 100644
--- a/vllm/model_executor/models/chameleon.py
+++ b/vllm/model_executor/models/chameleon.py
@@ -29,7 +29,7 @@
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalInputsV2, MultiModalKwargs,
+                                    MultiModalInputs, MultiModalKwargs,
                                     NestedTensors, PlaceholderRange)
 from vllm.multimodal.parse import MultiModalDataItems
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
@@ -159,7 +159,7 @@ def apply(
         prompt: Union[str, list[int]],
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
-    ) -> MultiModalInputsV2:
+    ) -> MultiModalInputs:
         result = super().apply(prompt, mm_data, hf_processor_mm_kwargs)
 
         # Only <image> tokens should be considered as placeholders,
diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py
index 63e7147f84e03..3f16d3ccbd061 100644
--- a/vllm/model_executor/models/fuyu.py
+++ b/vllm/model_executor/models/fuyu.py
@@ -31,7 +31,7 @@
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalInputsV2, MultiModalKwargs,
+                                    MultiModalInputs, MultiModalKwargs,
                                     NestedTensors, PlaceholderRange)
 from vllm.multimodal.parse import (ImageProcessorItems, ImageSize,
                                    MultiModalDataItems)
@@ -232,7 +232,7 @@ def apply(
         prompt: Union[str, list[int]],
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
-    ) -> MultiModalInputsV2:
+    ) -> MultiModalInputs:
         result = super().apply(prompt, mm_data, hf_processor_mm_kwargs)
 
         # Only |SPEAKER| (image) tokens should be considered as placeholders,
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index 6cceded43a79d..a355ae494afd0 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -24,7 +24,7 @@
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalInputsV2, MultiModalKwargs,
+                                    MultiModalInputs, MultiModalKwargs,
                                     NestedTensors)
 from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
                                    ImageSize, MultiModalDataItems)
@@ -746,7 +746,7 @@ def apply(
         prompt: Union[str, list[int]],
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
-    ) -> MultiModalInputsV2:
+    ) -> MultiModalInputs:
         hf_config = self.info.get_hf_config()
         image_token_id = hf_config.image_token_index
 
@@ -805,7 +805,7 @@ def get_replacement_mantis(item_idx: int):
             for modality, placeholders in mm_placeholders.items()
         }
 
-        return MultiModalInputsV2(
+        return MultiModalInputs(
             type="multimodal",
             prompt=prompt,
             prompt_token_ids=prompt_ids,
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index 7a230e5beb367..dd3b0b35c9294 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -31,7 +31,7 @@
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalInputsV2, MultiModalKwargs,
+                                    MultiModalInputs, MultiModalKwargs,
                                     NestedTensors, PlaceholderRange)
 from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
                                    ImageSize, MultiModalDataItems)
@@ -484,7 +484,7 @@ def apply(
         prompt: Union[str, list[int]],
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
-    ) -> MultiModalInputsV2:
+    ) -> MultiModalInputs:
         result = super().apply(prompt, mm_data, hf_processor_mm_kwargs)
 
         # Only <|image|> tokens should be considered as placeholders,
diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py
index 47d56175261e4..9cb8f83ad7873 100644
--- a/vllm/model_executor/models/qwen2_audio.py
+++ b/vllm/model_executor/models/qwen2_audio.py
@@ -37,7 +37,7 @@
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalInputsV2, MultiModalKwargs,
+                                    MultiModalInputs, MultiModalKwargs,
                                     NestedTensors, PlaceholderRange)
 from vllm.multimodal.parse import (AudioProcessorItems, MultiModalDataItems,
                                    MultiModalDataParser)
@@ -245,7 +245,7 @@ def apply(
         prompt: Union[str, list[int]],
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
-    ) -> MultiModalInputsV2:
+    ) -> MultiModalInputs:
         result = super().apply(prompt, mm_data, hf_processor_mm_kwargs)
 
         # Only <|AUDIO|> tokens should be considered as placeholders,
diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py
index 4b63703585214..b35184f6855ab 100644
--- a/vllm/multimodal/inputs.py
+++ b/vllm/multimodal/inputs.py
@@ -491,7 +491,7 @@ def get_items(self, modality: str) -> Sequence[MultiModalKwargsItem]:
 """
 
 
-class MultiModalInputsV2(TypedDict):
+class MultiModalInputs(TypedDict):
     """
     Represents the outputs of
     :class:`vllm.multimodal.processing.BaseMultiModalProcessor`,
diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index fa199a07b4cf8..ff02bcc8e1f2d 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -18,8 +18,8 @@
 
 from .hasher import MultiModalHasher
 from .inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                     MultiModalInputsV2, MultiModalKwargs,
-                     MultiModalKwargsItem, PlaceholderRange)
+                     MultiModalInputs, MultiModalKwargs, MultiModalKwargsItem,
+                     PlaceholderRange)
 from .parse import MultiModalDataItems, MultiModalDataParser
 
 if TYPE_CHECKING:
@@ -609,7 +609,7 @@ def __call__(
         prompt: str,
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
-    ) -> MultiModalInputsV2:
+    ) -> MultiModalInputs:
         return self.apply(prompt, mm_data, hf_processor_mm_kwargs)
 
     def _get_data_parser(self) -> MultiModalDataParser:
@@ -1067,7 +1067,7 @@ def apply(
         prompt: Union[str, list[int]],
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
-    ) -> MultiModalInputsV2:
+    ) -> MultiModalInputs:
         """
         Process multi-modal inputs to be used in vLLM.
 
@@ -1169,7 +1169,7 @@ def apply(
             for modality, placeholders in mm_placeholders.items()
         }
 
-        return MultiModalInputsV2(
+        return MultiModalInputs(
             type="multimodal",
             prompt=prompt,
             prompt_token_ids=prompt_ids,
diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py
index ec580cd6ecddd..20da0f1d8316f 100644
--- a/vllm/multimodal/profiling.py
+++ b/vllm/multimodal/profiling.py
@@ -11,7 +11,7 @@
 from vllm.inputs import DummyData
 from vllm.logger import init_logger
 
-from .inputs import MultiModalDataDict, MultiModalInputsV2
+from .inputs import MultiModalDataDict, MultiModalInputs
 from .processing import BaseMultiModalProcessor, BaseProcessingInfo
 
 logger = init_logger(__name__)
@@ -131,7 +131,7 @@ def _get_dummy_mm_inputs(
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-    ) -> MultiModalInputsV2:
+    ) -> MultiModalInputs:
         factory = self.dummy_inputs
         processor_inputs = factory.get_dummy_processor_inputs(
             seq_len, mm_counts)

From 1f1542afa915e0975d2b63559424403e5e8aae2c Mon Sep 17 00:00:00 2001
From: Jee Jee Li <pandaleefree@gmail.com>
Date: Tue, 21 Jan 2025 15:49:08 +0800
Subject: [PATCH 304/462] [Misc]Add BNB quantization for
 PaliGemmaForConditionalGeneration  (#12237)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
---
 vllm/model_executor/models/paligemma.py | 13 ++++++++++++-
 vllm/model_executor/models/siglip.py    | 14 ++++++++++----
 2 files changed, 22 insertions(+), 5 deletions(-)

diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py
index f9ad0c67adaba..ed9ae1887259e 100644
--- a/vllm/model_executor/models/paligemma.py
+++ b/vllm/model_executor/models/paligemma.py
@@ -136,7 +136,18 @@ def forward(self, image_features: torch.Tensor) -> torch.Tensor:
 @INPUT_REGISTRY.register_input_processor(input_processor_for_paligemma)
 class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal,
                                         SupportsPP):
-
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+    }
+    
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py
index cca42842bc06e..211e5dc80066e 100644
--- a/vllm/model_executor/models/siglip.py
+++ b/vllm/model_executor/models/siglip.py
@@ -344,10 +344,16 @@ def __init__(
 
         self.config = config
         self.activation_fn = get_act_fn(config.hidden_act)
-
-        # For quantization, we require the hidden size to be a multiple of 64
-        quantizable = (config.hidden_size % 64 == 0
-                       and config.intermediate_size % 64 == 0)
+        # Special handling for BNB quantization
+        if quant_config and quant_config.get_name() == "bitsandbytes":
+            quantizable = True
+        else:
+            # For other quantization, we require the hidden size to be a 
+            # multiple of 64
+            quantizable = (
+                config.hidden_size % 64 == 0
+                and config.intermediate_size % 64 == 0
+            )
         self.fc1 = ColumnParallelLinear(
             config.hidden_size,
             config.intermediate_size,

From f2e9f2a3be6f1dce5a6f01b2263488c6533862ac Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Tue, 21 Jan 2025 16:40:39 +0800
Subject: [PATCH 305/462] [Misc] Remove redundant TypeVar from base model
 (#12248)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 vllm/model_executor/models/interfaces_base.py | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/vllm/model_executor/models/interfaces_base.py b/vllm/model_executor/models/interfaces_base.py
index 4c353ae6ffc13..37b91a803d71e 100644
--- a/vllm/model_executor/models/interfaces_base.py
+++ b/vllm/model_executor/models/interfaces_base.py
@@ -3,7 +3,6 @@
 
 import torch
 import torch.nn as nn
-from transformers import PretrainedConfig
 from typing_extensions import TypeIs, TypeVar
 
 from vllm.logger import init_logger
@@ -19,9 +18,6 @@
 
 logger = init_logger(__name__)
 
-# The type of HF config
-C_co = TypeVar("C_co", bound=PretrainedConfig, covariant=True)
-
 # The type of hidden states
 # Currently, T = torch.Tensor for all models except for Medusa
 # which has T = List[torch.Tensor]
@@ -34,7 +30,7 @@
 
 
 @runtime_checkable
-class VllmModel(Protocol[C_co, T_co]):
+class VllmModel(Protocol[T_co]):
     """The interface required for all models in vLLM."""
 
     def __init__(
@@ -97,7 +93,7 @@ def is_vllm_model(
 
 
 @runtime_checkable
-class VllmModelForTextGeneration(VllmModel[C_co, T], Protocol[C_co, T]):
+class VllmModelForTextGeneration(VllmModel[T], Protocol[T]):
     """The interface required for all generative models in vLLM."""
 
     def compute_logits(
@@ -143,7 +139,7 @@ def is_text_generation_model(
 
 
 @runtime_checkable
-class VllmModelForPooling(VllmModel[C_co, T], Protocol[C_co, T]):
+class VllmModelForPooling(VllmModel[T], Protocol[T]):
     """The interface required for all pooling models in vLLM."""
 
     def pooler(

From a94eee4456b05458bafacc17377de4701ac598a0 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Tue, 21 Jan 2025 18:09:39 +0800
Subject: [PATCH 306/462] [Bugfix] Fix mm_limits access for merged multi-modal
 processor (#12252)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 vllm/multimodal/profiling.py |  4 ++--
 vllm/multimodal/registry.py  | 19 ++++++++++++++-----
 2 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py
index 20da0f1d8316f..c68edaff80167 100644
--- a/vllm/multimodal/profiling.py
+++ b/vllm/multimodal/profiling.py
@@ -106,7 +106,7 @@ def processing_info(self) -> BaseProcessingInfo:
     def dummy_inputs(self) -> BaseDummyInputsBuilder[_I]:
         return self.processor.dummy_inputs
 
-    def _get_mm_limits(self) -> Mapping[str, int]:
+    def get_mm_limits(self) -> Mapping[str, int]:
         mm_config = self.processing_info.ctx.get_mm_config()
         mm_limit_per_prompt = mm_config.limit_per_prompt
 
@@ -146,7 +146,7 @@ def get_dummy_data(self, seq_len: int) -> DummyData:
         # Avoid circular import
         from vllm.sequence import SequenceData
 
-        mm_counts = self._get_mm_limits()
+        mm_counts = self.get_mm_limits()
 
         info = self.processing_info
         mm_max_tokens_per_item = info.get_mm_max_tokens_per_item(seq_len)
diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index aaf7ff34ca573..7a4b85385cac9 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -17,7 +17,7 @@
 from .inputs import MultiModalDataDict, MultiModalKwargs, NestedTensors
 from .processing import (BaseMultiModalProcessor, BaseProcessingInfo,
                          ProcessingCache)
-from .profiling import BaseDummyInputsBuilder
+from .profiling import BaseDummyInputsBuilder, MultiModalProfiler
 from .utils import cached_get_tokenizer
 from .video import VideoPlugin
 
@@ -282,13 +282,13 @@ def get_max_tokens_per_item_by_nonzero_modality(
             This is currently directly used only in V1 for profiling the memory 
             usage of a model.
         """
-        limits_per_plugin = self._limits_by_model[model_config]
+        mm_limits = self.get_mm_limits_per_prompt(model_config)
 
         return {
             key: max_tokens_per_mm_item
             for key, max_tokens_per_mm_item in
             self.get_max_tokens_per_item_by_modality(model_config).items()
-            if limits_per_plugin[key] > 0
+            if mm_limits[key] > 0
         }
 
     def get_max_tokens_by_modality(
@@ -304,10 +304,10 @@ def get_max_tokens_by_modality(
         Note:
             This should be called after :meth:`init_mm_limits_per_prompt`.
         """
-        limits_per_plugin = self._limits_by_model[model_config]
+        mm_limits = self.get_mm_limits_per_prompt(model_config)
 
         return {
-            key: limits_per_plugin[key] * max_tokens_per_mm_item
+            key: mm_limits[key] * max_tokens_per_mm_item
             for key, max_tokens_per_mm_item in
             self.get_max_tokens_per_item_by_modality(model_config).items()
         }
@@ -371,6 +371,15 @@ def get_mm_limits_per_prompt(
         Note:
             This should be called after :meth:`init_mm_limits_per_prompt`.
         """
+        if self.has_processor(model_config):
+            tokenizer = cached_get_tokenizer(
+                model_config.tokenizer,
+                trust_remote_code=model_config.trust_remote_code,
+            )
+            processor = self.create_processor(model_config, tokenizer)
+            profiler = MultiModalProfiler(processor)
+            return profiler.get_mm_limits()
+
         return self._limits_by_model[model_config]
 
     def register_processor(

From c81081fece240736e30e0f9a5ed82bb5b483c561 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Tue, 21 Jan 2025 19:32:55 +0800
Subject: [PATCH 307/462] [torch.compile] transparent compilation with more
 logging (#12246)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 vllm/compilation/backends.py   | 32 +++++++++++++++++++++++++-------
 vllm/compilation/decorators.py |  2 ++
 vllm/compilation/wrapper.py    | 22 ++++++++++++++++++++++
 vllm/config.py                 |  1 +
 4 files changed, 50 insertions(+), 7 deletions(-)

diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
index 955c25f300512..b9f96c00284b9 100644
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -524,6 +524,7 @@ def configure_post_pass(self):
 
     def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable:
 
+        vllm_config = self.vllm_config
         if not self.compilation_config.cache_dir:
             # no provided cache dir, generate one based on the known factors
             # that affects the compilation. if none of the factors change,
@@ -532,7 +533,6 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable:
 
             # 1. factors come from the vllm_config (it mainly summarizes how the
             #    model is created)
-            vllm_config = self.vllm_config
             config_hash = vllm_config.compute_hash()
 
             # 2. factors come from the code files that are traced by Dynamo (
@@ -556,20 +556,26 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable:
             hash_key = hashlib.md5(
                 f"{config_hash}_{code_hash}".encode()).hexdigest()[:10]
             cache_dir = os.path.join(
-                envs.VLLM_CACHE_ROOT, "torch_compile_cache", hash_key,
-                f"rank_{vllm_config.parallel_config.rank}")
-        else:
-            cache_dir = self.compilation_config.cache_dir
+                envs.VLLM_CACHE_ROOT,
+                "torch_compile_cache",
+                hash_key,
+            )
+            self.compilation_config.cache_dir = cache_dir
+
+        cache_dir = self.compilation_config.cache_dir
         os.makedirs(cache_dir, exist_ok=True)
+        local_cache_dir = os.path.join(
+            cache_dir, f"rank_{vllm_config.parallel_config.rank}")
+        self.compilation_config.local_cache_dir = local_cache_dir
 
         disabled = envs.VLLM_DISABLE_COMPILE_CACHE
         self.inductor_hash_cache: InductorHashCache = InductorHashCache(
-            cache_dir, disabled=disabled)
+            local_cache_dir, disabled=disabled)
         if disabled:
             logger.info("vLLM's torch.compile cache is disabled.")
         else:
             logger.info("Using cache directory: %s for vLLM's torch.compile",
-                        cache_dir)
+                        local_cache_dir)
 
         # when dynamo calls the backend, it means the bytecode
         # transform and analysis are done
@@ -609,6 +615,18 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable:
                                     self.vllm_config, self.graph_pool,
                                     self).run(*example_inputs)
 
+        graph_path = os.path.join(local_cache_dir, "computation_graph.py")
+        if not os.path.exists(graph_path):
+            # code adapted from https://github.com/thuml/depyf/blob/dab831108a752d1facc00acdd6d4243891845c37/depyf/explain/patched_lazy_format_graph_code.py#L30 # noqa
+            # use `print_readable` because it can include submodules
+            src = "from __future__ import annotations\nimport torch\n" + \
+                self.split_gm.print_readable(print_output=False)
+            src = src.replace("<lambda>", "GraphModule")
+            with open(graph_path, "w") as f:
+                f.write(src)
+
+            logger.debug("Computation graph saved to %s", graph_path)
+
         self._called = True
 
         if not self.compilation_config.use_cudagraph or \
diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py
index 38f284794b8db..17eb0592ced6d 100644
--- a/vllm/compilation/decorators.py
+++ b/vllm/compilation/decorators.py
@@ -198,6 +198,8 @@ def __call__(self, *args, **kwargs):
                             f" {dims} for argument {k} with type {type(arg)}.")
             # here, it is the starting point of the `torch.compile` process
             start_monitoring_torch_compile(self.vllm_config)
+            logger.debug("Start compiling function %s",
+                         self.original_code_object)
 
         # if we don't use custom dispatcher, we can directly call the
         # compiled function and let torch.compile handle the dispatching,
diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py
index e3260a10c02ae..58a8fa76f6ce2 100644
--- a/vllm/compilation/wrapper.py
+++ b/vllm/compilation/wrapper.py
@@ -9,6 +9,9 @@
 
 import vllm.envs as envs
 from vllm.config import CompilationLevel, get_current_vllm_config
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
 
 
 class TorchCompileWrapperWithCustomDispatcher:
@@ -82,6 +85,25 @@ def bytecode_hook(self, old_code: CodeType, new_code: CodeType):
             return
 
         self.compiled_codes.append(new_code)
+        local_cache_dir = self.vllm_config.compilation_config.local_cache_dir
+        if isinstance(local_cache_dir, str):
+            decompiled_file = os.path.join(local_cache_dir,
+                                           "transformed_code.py")
+            if not os.path.exists(decompiled_file):
+                try:
+                    # usually the decompilation will succeed for most models,
+                    # as we guarantee a full-graph compilation in Dynamo.
+                    # but there's no 100% guarantee, since decompliation is
+                    # not a reversible process.
+                    import depyf
+                    src = depyf.decompile(new_code)
+                    with open(decompiled_file, "w") as f:
+                        f.write(src)
+
+                    logger.debug("Dynamo transformed code saved to %s",
+                                 decompiled_file)
+                except Exception:
+                    pass
 
         if self.vllm_config.compilation_config.use_cudagraph and \
             "update" in new_code.co_names:
diff --git a/vllm/config.py b/vllm/config.py
index b0a92b2e21343..b8628db4d2b80 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -2785,6 +2785,7 @@ def model_post_init(self, __context: Any) -> None:
     compile_sizes: List[int] = PrivateAttr
     capture_sizes: List[int] = PrivateAttr
     max_capture_size: int = PrivateAttr
+    local_cache_dir: str = PrivateAttr  # local cache dir for each rank
     # optimization:
     # Intuitively, bs_to_padded_graph_size should be Dict[int, int].
     # since we know all keys are in a range [0, max_capture_size],

From b197a5ccfdaa46b6750feb5efa4c5d8bf4030d44 Mon Sep 17 00:00:00 2001
From: Roger Wang <136131678+ywang96@users.noreply.github.com>
Date: Tue, 21 Jan 2025 05:18:43 -0800
Subject: [PATCH 308/462] [V1][Bugfix] Fix data item ordering in mixed-modality
 inference (#12259)

Signed-off-by: Roger Wang <ywang@roblox.com>
---
 vllm/multimodal/utils.py           | 34 +++++++++++++++++++++++-
 vllm/v1/worker/gpu_model_runner.py | 42 +++++++++++++++++++++---------
 2 files changed, 62 insertions(+), 14 deletions(-)

diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py
index 1c6bbf77b926f..094e0682a068b 100644
--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -1,4 +1,5 @@
 from functools import lru_cache
+from itertools import groupby
 from pathlib import Path
 from typing import TYPE_CHECKING, Optional, TypeVar, Union
 from urllib.parse import ParseResult, urlparse
@@ -26,7 +27,7 @@
 
 if TYPE_CHECKING:
     from .hasher import MultiModalHashDict
-    from .inputs import MultiModalPlaceholderDict
+    from .inputs import MultiModalKwargs, MultiModalPlaceholderDict
 
 
 class MediaConnector:
@@ -477,3 +478,34 @@ def merge_and_sort_multimodal_metadata(
         merged_hashes = None
 
     return sorted_modalities, merged_placeholders, merged_hashes
+
+
+def group_mm_inputs_by_modality(
+        mm_inputs: list["MultiModalKwargs"]) -> list[list["MultiModalKwargs"]]:
+    """Group consecutive MultiModalKwargs from mm_inputs with the same modality 
+    together into the same list for batching purpose. For MultiModalKwargs with 
+    multiple modalities, put them into their own list.
+
+    Args:
+        mm_inputs: List of MultiModalKwargs.
+
+    Returns:
+        list[list[MultiModalKwargs]]: List of list of MultiModalKwargs, each 
+        inner list contains consecutive MultiModalKwargs with same modality, or
+        one with multimodal modalities.
+    """
+    if not mm_inputs:
+        return []
+
+    def modality_group_func(mm_input: "MultiModalKwargs") -> Union[str, int]:
+        # If the input has multiple modalities, return a id as the unique key
+        # for the mm_input input.
+        if len(mm_input.modalities) > 1:
+            return id(mm_input)
+
+        # Otherwise return the modality string
+        return list(mm_input.modalities)[0]
+
+    return [
+        list(group) for _, group in groupby(mm_inputs, key=modality_group_func)
+    ]
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 2350074c23a59..fdf39449a2c59 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -17,6 +17,7 @@
 from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding
 from vllm.model_executor.model_loader import get_model
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
+from vllm.multimodal.utils import group_mm_inputs_by_modality
 from vllm.sampling_params import SamplingType
 from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler,
                         LayerBlockType, cdiv, is_pin_memory_available)
@@ -629,19 +630,34 @@ def _execute_encoder(self, scheduler_output: "SchedulerOutput"):
             for input_id in encoder_input_ids:
                 mm_inputs.append(req_state.mm_inputs[input_id])
                 req_input_ids.append((req_id, input_id))
-        batched_mm_inputs = MultiModalKwargs.batch(mm_inputs)
-        batched_mm_inputs = MultiModalKwargs.as_kwargs(batched_mm_inputs,
-                                                       device=self.device)
-
-        # Run the encoder.
-        # `encoder_outputs` is either of the following:
-        # 1. A tensor of shape [num_images, feature_size, hidden_size]
-        # in case when feature_size is fixed across all images.
-        # 2. A list (length: num_images) of tensors, each of shape
-        # [feature_size, hidden_size] in case when the feature size is
-        # dynamic depending on input images.
-        encoder_outputs = self.model.get_multimodal_embeddings(
-            **batched_mm_inputs)
+
+        # Batch mm inputs as much as we can: if a request in the batch has
+        # multiple modalities or a different modality than the previous one,
+        # we process it separately to preserve item order.
+        # FIXME(ywang96): This is a hacky way to deal with multiple modalities
+        # in the same batch while still being able to benefit from batching
+        # multimodal inputs. The proper solution should be reordering the
+        # encoder outputs.
+        grouped_mm_inputs_list = group_mm_inputs_by_modality(mm_inputs)
+
+        encoder_outputs = []
+        for grouped_mm_inputs in grouped_mm_inputs_list:
+            batched_mm_inputs = MultiModalKwargs.batch(grouped_mm_inputs)
+            batched_mm_inputs = MultiModalKwargs.as_kwargs(batched_mm_inputs,
+                                                           device=self.device)
+
+            # Run the encoder.
+            # `curr_group_outputs` is either of the following:
+            # 1. A tensor of shape (num_items, feature_size, hidden_size)
+            # in case feature_size is fixed across all multimodal items.
+            # 2. A list or tuple (length: num_items) of tensors, each of shape
+            # (feature_size, hidden_size) in case the feature size is dynamic
+            # depending on the input multimodal items.
+            curr_group_outputs = self.model.get_multimodal_embeddings(
+                **batched_mm_inputs)
+
+            for output in curr_group_outputs:
+                encoder_outputs.append(output)
 
         # Cache the encoder outputs.
         for (req_id, input_id), output in zip(req_input_ids, encoder_outputs):

From 9a7c3a0042bb7f44b6c2784220cc3d99391eb4c2 Mon Sep 17 00:00:00 2001
From: Thomas Parnell <tpa@zurich.ibm.com>
Date: Tue, 21 Jan 2025 14:49:08 +0100
Subject: [PATCH 309/462] Remove pytorch comments for outlines +
 compressed-tensors (#12260)

Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
---
 requirements-common.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/requirements-common.txt b/requirements-common.txt
index 6c390bcfd18e6..777b2bb124323 100644
--- a/requirements-common.txt
+++ b/requirements-common.txt
@@ -19,7 +19,7 @@ pillow  # Required for image processing
 prometheus-fastapi-instrumentator >= 7.0.0
 tiktoken >= 0.6.0  # Required for DBRX tokenizer
 lm-format-enforcer >= 0.10.9, < 0.11
-outlines == 0.1.11 # Requires pytorch
+outlines == 0.1.11
 lark == 1.2.2 
 xgrammar >= 0.1.6; platform_machine == "x86_64"
 typing_extensions >= 4.10
@@ -34,6 +34,6 @@ pyyaml
 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
 setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
 einops # Required for Qwen2-VL.
-compressed-tensors == 0.8.1 # required for compressed-tensors, requires pytorch
+compressed-tensors == 0.8.1 # required for compressed-tensors
 depyf==0.18.0 # required for profiling and debugging with compilation config
 cloudpickle # allows pickling lambda functions in model_executor/models/registry.py

From c64612802b72f1e015b98e0c569367e12391f579 Mon Sep 17 00:00:00 2001
From: Mengqing Cao <cmq0113@163.com>
Date: Tue, 21 Jan 2025 22:42:41 +0800
Subject: [PATCH 310/462] [Platform] improve platforms getattr (#12264)

Signed-off-by: Mengqing Cao <cmq0113@163.com>
---
 vllm/platforms/__init__.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py
index 6033a806d2023..d20d35199bf5c 100644
--- a/vllm/platforms/__init__.py
+++ b/vllm/platforms/__init__.py
@@ -217,8 +217,11 @@ def __getattr__(name: str):
             global _init_trace
             _init_trace = "".join(traceback.format_stack())
         return _current_platform
-    else:
+    elif name in globals():
         return globals()[name]
+    else:
+        raise AttributeError(
+            f"No attribute named '{name}' exists in {__name__}.")
 
 
 __all__ = [

From 3aec49e56f60c8ccafe108a8922c731e235a8fcc Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Tue, 21 Jan 2025 23:03:17 +0800
Subject: [PATCH 311/462] [ci/build] update nightly torch for gh200 test
 (#12270)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index 4542bc9cf0bd2..261f5440aee47 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -52,7 +52,7 @@ WORKDIR /workspace
 # after this step
 RUN --mount=type=cache,target=/root/.cache/pip \
     if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
-        python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torch==2.6.0.dev20241210+cu124" "torchvision==0.22.0.dev20241215";  \
+        python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu126 "torch==2.7.0.dev20250121+cu126" "torchvision==0.22.0.dev20250121";  \
     fi
 
 COPY requirements-common.txt requirements-common.txt

From 9705b90bcf66ba6316e70bef442074df7ee6cebf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jannis=20Sch=C3=B6nleber?= <joennlae@gmail.com>
Date: Tue, 21 Jan 2025 18:47:04 +0100
Subject: [PATCH 312/462] [Bugfix] fix race condition that leads to wrong order
 of token returned (#10802)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Jannis Schönleber <joennlae@gmail.com>
---
 vllm/engine/multiprocessing/client.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py
index a9ab899535180..74b98d06c509a 100644
--- a/vllm/engine/multiprocessing/client.py
+++ b/vllm/engine/multiprocessing/client.py
@@ -262,7 +262,14 @@ async def setup(self):
         """Setup the client before it starts sending server requests."""
 
         # Start output_loop
-        self.output_loop = asyncio.create_task(self.run_output_handler_loop())
+        if self.output_loop is None:
+            # only generate once to avoid multiple concurrent output_loops
+            # this will lead to race conditions and wrong orders of tokens
+            # returned by the engine
+            # setup will be called multiple times during the startup of
+            # the engine
+            self.output_loop = asyncio.create_task(
+                self.run_output_handler_loop())
 
         with self.get_data_socket() as socket:
             # Wait until server is ready.
@@ -271,8 +278,9 @@ async def setup(self):
             self.tracing_flag = response.tracing_enabled
 
             # Start health_loop.
-            self.health_loop = asyncio.create_task(
-                self.run_heartbeat_loop(timeout=VLLM_RPC_TIMEOUT))
+            if self.health_loop is None:
+                self.health_loop = asyncio.create_task(
+                    self.run_heartbeat_loop(timeout=VLLM_RPC_TIMEOUT))
 
     def close(self):
         """Destroy the ZeroMQ Context."""

From 1e60f87bb37bc28410e6cf6e9030e9a28ad49d12 Mon Sep 17 00:00:00 2001
From: Jinzhen Lin <linjinzhen@hotmail.com>
Date: Wed, 22 Jan 2025 02:30:28 +0800
Subject: [PATCH 313/462] [Kernel] fix moe_align_block_size error condition
 (#12239)

Signed-off-by: Jinzhen Lin <linjinzhen@hotmail.com>
---
 csrc/moe/moe_align_sum_kernels.cu | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/csrc/moe/moe_align_sum_kernels.cu b/csrc/moe/moe_align_sum_kernels.cu
index 715a1b42841f2..d609ce1697df3 100644
--- a/csrc/moe/moe_align_sum_kernels.cu
+++ b/csrc/moe/moe_align_sum_kernels.cu
@@ -233,15 +233,17 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
       (num_experts + 1) * sizeof(int32_t);
 
   bool use_global_memory = false;
-  bool use_i16 = false; // Use uint16_t for shared memory token counts
-  if (shared_mem_i16 > device_max_shared_mem) {
-    use_global_memory = true;
-  } else if (shared_mem_i32 > device_max_shared_mem &&
+  bool use_i16 = false;  // Use uint16_t for shared memory token counts
+  if (shared_mem_i32 < device_max_shared_mem) {
+    // Do nothing in this case. We're all set to use int32_t token counts
+  } else if (shared_mem_i16 < device_max_shared_mem &&
              topk_ids.numel() <= 65535) {
     // when nelements of topk_ids is smaller than 65535 (max value of uint16),
     // element value of token_cnts would also smaller than 65535,
     // so we can use uint16 as dtype of token_cnts
     use_i16 = true;
+  } else {
+    use_global_memory = true;
   }
 
   if (use_global_memory) {

From 132a1321004bb994e2260fcc02a3c312bd3b1fe0 Mon Sep 17 00:00:00 2001
From: Ricky Xu <xuchen727@hotmail.com>
Date: Tue, 21 Jan 2025 11:51:13 -0800
Subject: [PATCH 314/462] [v1][stats][1/n] Add RequestStatsUpdate and
 RequestStats types  (#10907)

Signed-off-by: rickyx <rickyx@anyscale.com>
---
 tests/v1/test_stats.py    | 300 +++++++++++++++++++++++++
 vllm/v1/stats/__init__.py |   0
 vllm/v1/stats/common.py   | 449 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 749 insertions(+)
 create mode 100644 tests/v1/test_stats.py
 create mode 100644 vllm/v1/stats/__init__.py
 create mode 100644 vllm/v1/stats/common.py

diff --git a/tests/v1/test_stats.py b/tests/v1/test_stats.py
new file mode 100644
index 0000000000000..580392ac5f446
--- /dev/null
+++ b/tests/v1/test_stats.py
@@ -0,0 +1,300 @@
+import pytest
+
+from vllm.sampling_params import SamplingParams
+from vllm.v1.stats.common import RequestStats, RequestStatsUpdate
+
+
+def make_update(
+    request_id: str,
+    update_type: RequestStatsUpdate.Type,
+    monotonic_ts_s: float,
+    **kwargs,
+):
+    if update_type == RequestStatsUpdate.Type.INPUT_PROCESSED:
+        kwargs.setdefault("sampling_params", SamplingParams(n=1))
+        kwargs.setdefault("num_prompt_tokens", 10)
+    elif update_type == RequestStatsUpdate.Type.PREFILLING:
+        kwargs.setdefault("num_computed_tokens", 10)
+        kwargs.setdefault("num_cached_tokens", 10)
+    elif update_type == RequestStatsUpdate.Type.DETOKENIZED:
+        kwargs.setdefault("num_new_tokens", 10)
+    elif update_type == RequestStatsUpdate.Type.FINISHED:
+        kwargs.setdefault("finish_reason", "test_reason")
+
+    return RequestStatsUpdate(
+        request_id=request_id,
+        type=update_type,
+        monotonic_ts_s=monotonic_ts_s,
+        **kwargs,
+    )
+
+
+def test_invalid_request_update():
+    request_id = "test_request"
+    update_specific_required_fields = {
+        RequestStatsUpdate.Type.INPUT_PROCESSED: [
+            "sampling_params",
+            "num_prompt_tokens",
+        ],
+        RequestStatsUpdate.Type.PREFILLING: [
+            "num_computed_tokens",
+            "num_cached_tokens",
+        ],
+        RequestStatsUpdate.Type.DETOKENIZED: ["num_new_tokens"],
+        RequestStatsUpdate.Type.FINISHED: ["finish_reason"],
+    }
+
+    # Missing a required field should raise an assertion error.
+    for update_type in RequestStatsUpdate.Type:
+        required_fields = update_specific_required_fields.get(update_type, [])
+
+        # Try to miss one of the required fields.
+        kwargs = {field: object() for field in required_fields}
+        for field in required_fields:
+            copy_kwargs = kwargs.copy()
+            copy_kwargs.pop(field)
+            with pytest.raises(ValueError):
+                RequestStatsUpdate(
+                    request_id=request_id,
+                    type=update_type,
+                    **copy_kwargs,
+                )
+
+
+def test_invalid_request_update_transition():
+    # Test invalid transition type.
+    for src in RequestStatsUpdate.Type:
+        for dst in RequestStatsUpdate.Type:
+            if dst not in RequestStatsUpdate._VALID_TRANSITIONS[src]:
+                with pytest.raises(AssertionError):
+                    RequestStatsUpdate.check_valid_update(
+                        make_update(
+                            update_type=dst,
+                            request_id="test_request",
+                            monotonic_ts_s=1,
+                        ),
+                        last_update_type=src,
+                        last_updated_ts_s=0,
+                    )
+            else:
+                RequestStatsUpdate.check_valid_update(
+                    make_update(
+                        request_id="test_request",
+                        update_type=dst,
+                        monotonic_ts_s=1,
+                    ),
+                    last_update_type=src,
+                    last_updated_ts_s=0,
+                )
+
+    # Test invalid timestamp.
+    with pytest.raises(AssertionError):
+        RequestStatsUpdate.check_valid_update(
+            make_update(
+                request_id="test_request",
+                update_type=RequestStatsUpdate.Type.ARRIVED,
+                monotonic_ts_s=1,
+            ),
+            last_update_type=None,
+            last_updated_ts_s=2,
+        )
+
+
+def test_lifecycle_updates():
+    request_id = "test_request"
+    stats = RequestStats(request_id=request_id)
+
+    # Test the below scenario:
+    arrived_ts = 0
+    input_processed_ts = 1
+    queued_ts = 2
+    prefilling_ts = 3
+    decoded_ts = 5
+    detokenized_ts = 6
+    decoded_2_ts = 7
+    detokenized_2_ts = 8
+    preempted_ts = 9
+    resumed_ts = 10
+    decoded_3_ts = 11
+    detokenized_3_ts = 12
+    finished_ts = 13
+
+    # Test ARRIVED
+    arrived_update = RequestStatsUpdate(
+        request_id=request_id,
+        type=RequestStatsUpdate.Type.ARRIVED,
+        monotonic_ts_s=arrived_ts,
+    )
+    stats.update_from(arrived_update)
+    assert stats.arrival_ts_s == arrived_ts
+    assert stats.last_updated_ts_s == arrived_ts
+
+    # Test INPUT_PROCESSED
+    sampling_params = SamplingParams(n=1)
+    input_processed_update = RequestStatsUpdate(
+        request_id=request_id,
+        type=RequestStatsUpdate.Type.INPUT_PROCESSED,
+        monotonic_ts_s=input_processed_ts,
+        sampling_params=sampling_params,
+        num_prompt_tokens=6,
+    )
+    stats.update_from(input_processed_update)
+    assert stats.input_processor_end_ts_s == input_processed_ts
+    assert stats.last_updated_ts_s == input_processed_ts
+    assert stats.num_prompt_tokens == 6
+    assert stats.sampling_params == sampling_params
+
+    assert stats.first_token_ts_s is None
+    assert stats.prefill_ts_s is None
+
+    # Test QUEUED
+    queued_update = RequestStatsUpdate(
+        request_id=request_id,
+        type=RequestStatsUpdate.Type.QUEUED,
+        monotonic_ts_s=queued_ts,
+    )
+    stats.update_from(queued_update)
+    assert stats.queued_ts_s == queued_ts
+    assert stats.last_updated_ts_s == queued_ts
+
+    # Test PREFILLING
+    prefilling_update = RequestStatsUpdate(
+        request_id=request_id,
+        type=RequestStatsUpdate.Type.PREFILLING,
+        monotonic_ts_s=prefilling_ts,
+        num_computed_tokens=3,
+        num_cached_tokens=1,
+    )
+    stats.update_from(prefilling_update)
+    assert stats.prefill_ts_s == prefilling_ts
+    assert stats.num_computed_tokens == 3
+    assert stats.num_cached_tokens == 1
+    assert stats.queue_duration_s == prefilling_ts - queued_ts
+
+    # Test DECODING
+    decoded_update = RequestStatsUpdate(
+        request_id=request_id,
+        type=RequestStatsUpdate.Type.DECODING,
+        monotonic_ts_s=decoded_ts,
+    )
+    stats.update_from(decoded_update)
+    assert stats.last_updated_ts_s == decoded_ts
+
+    # Test DETOKENIZED
+    detokenized_update = RequestStatsUpdate(
+        request_id=request_id,
+        type=RequestStatsUpdate.Type.DETOKENIZED,
+        monotonic_ts_s=detokenized_ts,
+        num_new_tokens=1,
+    )
+    stats.update_from(detokenized_update)
+    assert stats.last_updated_ts_s == detokenized_ts
+    assert stats.num_output_tokens == 1
+    # Since arrival
+    assert stats.first_token_latency_s == detokenized_ts - arrived_ts
+    # Since first scheduled
+    assert stats.prefill_latency_s == detokenized_ts - prefilling_ts
+
+    # Test another DECODING and DETOKENIZED should
+    # yield correct inter token latency
+    decoded_update = RequestStatsUpdate(
+        request_id=request_id,
+        type=RequestStatsUpdate.Type.DECODING,
+        monotonic_ts_s=decoded_2_ts,
+    )
+    stats.update_from(decoded_update)
+
+    detokenized_update = RequestStatsUpdate(
+        request_id=request_id,
+        type=RequestStatsUpdate.Type.DETOKENIZED,
+        monotonic_ts_s=detokenized_2_ts,
+        num_new_tokens=1,
+    )
+    stats.update_from(detokenized_update)
+    assert stats.output_token_latency_s_lst == [
+        detokenized_2_ts - detokenized_ts,
+    ]
+    assert stats.num_output_tokens == 2
+
+    # Test PREEMPTED
+    preempted_update = RequestStatsUpdate(
+        request_id=request_id,
+        type=RequestStatsUpdate.Type.PREEMPTED,
+        monotonic_ts_s=preempted_ts,
+    )
+    stats.update_from(preempted_update)
+    assert stats.last_updated_ts_s == preempted_ts
+    assert stats.preempted_ts_s_lst == [preempted_ts]
+    # States should be reset
+    assert stats.num_computed_tokens == 0
+    assert stats.num_cached_tokens == 0
+    # These states should not be reset
+    assert stats.num_output_tokens == 2
+    assert stats.output_token_latency_s_lst == [
+        detokenized_2_ts - detokenized_ts,
+    ]
+    assert stats.prefill_latency_s == prefilling_ts - arrived_ts
+    assert stats.num_prompt_tokens == 6
+    assert stats.prefill_start_ts_s_lst == [prefilling_ts]
+
+    # Test resumed
+    resumed_update = RequestStatsUpdate(
+        request_id=request_id,
+        type=RequestStatsUpdate.Type.PREFILLING,
+        monotonic_ts_s=resumed_ts,
+        num_computed_tokens=6,
+        num_cached_tokens=2,
+    )
+    stats.update_from(resumed_update)
+    # prefill timestamp should not be updated since it's a resumed prefill
+    assert stats.prefill_ts_s == prefilling_ts
+    assert stats.num_computed_tokens == 6
+    assert stats.num_cached_tokens == 2
+    assert stats.prefill_start_ts_s_lst == [
+        prefilling_ts,
+        resumed_ts,
+    ]
+    assert stats.last_updated_ts_s == resumed_ts
+
+    # Test another DECODED/DETOKENIZED should yield correct first token latency.
+    decoded_update = RequestStatsUpdate(
+        request_id=request_id,
+        type=RequestStatsUpdate.Type.DECODING,
+        monotonic_ts_s=decoded_3_ts,
+    )
+    detokenized_update = RequestStatsUpdate(
+        request_id=request_id,
+        type=RequestStatsUpdate.Type.DETOKENIZED,
+        monotonic_ts_s=detokenized_3_ts,
+        num_new_tokens=1,
+    )
+    stats.update_from(decoded_update)
+    stats.update_from(detokenized_update)
+    assert stats.first_token_ts_s == detokenized_ts - arrived_ts
+    assert stats.num_output_tokens == 3
+    assert stats.output_token_latency_s_lst == [
+        detokenized_2_ts - detokenized_ts,
+        detokenized_3_ts - detokenized_2_ts,
+    ]
+
+    # Test FINISHED
+    finished_update = RequestStatsUpdate(
+        request_id=request_id,
+        type=RequestStatsUpdate.Type.FINISHED,
+        monotonic_ts_s=finished_ts,
+        finish_reason="test_reason",
+    )
+    stats.update_from(finished_update)
+    assert stats.last_updated_ts_s == finished_ts
+    assert stats.e2e_latency_s == finished_ts - arrived_ts
+    assert stats.inference_latency_s == finished_ts - prefilling_ts
+    assert stats.prefill_latency_s == detokenized_ts - prefilling_ts
+    assert stats.decode_latency_s == finished_ts - detokenized_ts
+    assert stats.first_token_latency_s == detokenized_ts - arrived_ts
+    assert stats.queue_duration_s == prefilling_ts - queued_ts
+    assert stats.is_finished
+    assert stats.finish_reason == "test_reason"
+
+    # TODO(rickyx): Add model forward/execute time.
+    assert stats.model_forward_duration_s == 0.0
+    assert stats.model_execute_duration_s == 0.0
diff --git a/vllm/v1/stats/__init__.py b/vllm/v1/stats/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/vllm/v1/stats/common.py b/vllm/v1/stats/common.py
new file mode 100644
index 0000000000000..099d82c5904cf
--- /dev/null
+++ b/vllm/v1/stats/common.py
@@ -0,0 +1,449 @@
+import time
+from dataclasses import dataclass
+from dataclasses import field as dataclass_field
+from enum import IntEnum
+from typing import ClassVar, Dict, List, Optional, Set
+
+import msgspec
+from msgspec import field as msgspec_field
+
+from vllm.sampling_params import SamplingParams
+
+
+class RequestStatsUpdate(msgspec.Struct,
+                         array_like=True,
+                         omit_defaults=True,
+                         gc=False):
+    """
+    An update to the request stats.
+
+    This represents a stats update at a specific timestamp with metadata
+    associated with the update.
+
+    NOTE: since there might be multiple processes generating updates at
+    different parts of the engine (e.g. input processor, scheduler, engine core,
+    etc.), we use the monotonic timestamp to record the update to compute any
+    intervals, and explicit wall-clock timestamp should be used for timestamps.
+
+    WARNING: This assumes stats are generated in a single machine. If there are
+    potentially multiple machines, one should always generate the stats updates
+    on one single machine or use something else.
+    """
+
+    class Type(IntEnum):
+        """See `RequestStats` for the lifecycle of a request."""
+
+        # Request arrived at the engine frontend.
+        ARRIVED = 0
+        # Input processed by the input processor.
+        INPUT_PROCESSED = 1
+        # Queued on the engine core.
+        QUEUED = 2
+        # Scheduled running prefill by the scheduler.
+        # A request could be running a new prefill on the prompt tokens or
+        # a resumed prefill on the original prefill tokens + generated output
+        # tokens before preemption.
+        PREFILLING = 3
+        # Preempted by the scheduler.
+        PREEMPTED = 4
+        # Output token is generated by the engine core.
+        DECODING = 5
+        # Token detokenized by the detokenizer.
+        # We will record the timestamp for each output token, as well as the
+        # finish reason.
+        DETOKENIZED = 6
+        # Request finishes (or aborts).
+        FINISHED = 7
+
+    """
+    Valid state updates:
+    ARRIVED
+    │
+    ├──────► INPUT_PROCESSED ──────► QUEUED ──────► PREFILLING ◄────┐
+    │              │                   │              │             │
+    │              │                   │              ▼             │
+    │              │                   │       -──► DECODING        │
+    │              │                   │       |      │             │
+    │              │                   │       |      ▼             │
+    │              │                   │       └─ DETOKENIZED       │
+    │              │                   │              │             │
+    │              │                   │              ▼             │
+    │              ▼                   ▼           PREEMPTED ◄──────┘
+    │              │                   │              │
+    └──────────────┴───────────────────┴──────────────┴
+                                │
+                                ▼
+                FINISHED (All could go to FINISHED)
+    """
+    _VALID_TRANSITIONS: ClassVar[Dict[Type, Set[Type]]] = {
+        Type.ARRIVED: {
+            Type.INPUT_PROCESSED,
+            Type.FINISHED,
+        },
+        Type.INPUT_PROCESSED: {
+            Type.QUEUED,
+            Type.FINISHED,
+        },
+        Type.QUEUED: {
+            Type.PREFILLING,
+            Type.FINISHED,
+        },
+        Type.PREFILLING: {
+            Type.DECODING,
+            Type.PREEMPTED,
+            Type.FINISHED,
+        },
+        Type.DECODING: {
+            Type.DETOKENIZED,
+            Type.FINISHED,
+        },
+        Type.DETOKENIZED: {
+            Type.DECODING,
+            Type.PREEMPTED,
+            Type.FINISHED,
+        },
+        Type.PREEMPTED: {Type.PREFILLING, Type.FINISHED},
+        Type.FINISHED: set(),
+    }
+
+    request_id: str
+
+    type: Type
+
+    # Timestamp when the update is recorded. This is used to record time
+    # intervals between events rather than wall clock time.
+    monotonic_ts_s: float = msgspec_field(
+        default_factory=lambda: time.monotonic())
+
+    ############################################################
+    # Metadata associated with the update.
+    ############################################################
+    # For input_processed. Metadata needed for stats logging.
+    num_prompt_tokens: Optional[int] = None
+    sampling_params: Optional[SamplingParams] = None
+
+    # For running.
+    # Number of tokens computed when scheduled to run.
+    num_computed_tokens: Optional[int] = None
+    # Number of cached tokens when scheduled to run.
+    num_cached_tokens: Optional[int] = None
+
+    # For decoded.
+    # The number of new output tokens generated.
+    num_new_tokens: Optional[int] = None
+
+    # For both detokenized and decoded.
+    # Finished reason.
+    finish_reason: Optional[str] = None
+
+    # Non-optional fields for each update type.
+    _REQUIRED_FIELDS: ClassVar[Dict[Type, List[str]]] = {
+        Type.INPUT_PROCESSED: ["num_prompt_tokens", "sampling_params"],
+        Type.PREFILLING: ["num_computed_tokens", "num_cached_tokens"],
+        Type.DETOKENIZED: ["num_new_tokens"],
+        Type.FINISHED: ["finish_reason"],
+    }
+
+    def __post_init__(self):
+        required_fields = self._REQUIRED_FIELDS.get(self.type, [])
+        for field in required_fields:
+            if getattr(self, field) is None:
+                raise ValueError(
+                    f"Field {field} is required for update type {self.type}.")
+
+    @staticmethod
+    def check_valid_update(
+        update: "RequestStatsUpdate",
+        last_update_type: Optional[Type],
+        last_updated_ts_s: Optional[float],
+    ):
+        if last_update_type is None:
+            assert update.type == RequestStatsUpdate.Type.ARRIVED
+        else:
+            valid_cur_update_types = RequestStatsUpdate._VALID_TRANSITIONS[
+                last_update_type]
+            assert update.type in valid_cur_update_types, (
+                f"Invalid update type: {update.type} for last_update_type: "
+                f"{last_update_type}.")
+
+        if last_updated_ts_s is not None:
+            assert update.monotonic_ts_s >= last_updated_ts_s, (
+                "Update timestamp must be monotonically increasing, but "
+                f"last_updated_ts_s={last_updated_ts_s} and "
+                f"update.monotonic_ts_s={update.monotonic_ts_s}.")
+
+
+@dataclass
+class RequestStats:
+    """Stats associated with a request (`Request`)."""
+
+    ############################################################
+    # Metadata
+    ############################################################
+    request_id: str
+    sampling_params: Optional[SamplingParams] = None
+    num_prompt_tokens: Optional[int] = None
+
+    ############################################################
+    # Metrics and Stats
+    ############################################################
+    # Timestamp when the request was last updated.
+    last_updated_ts_s: Optional[float] = None
+
+    # Last update stats type.
+    last_update_type: Optional[RequestStatsUpdate.Type] = None
+
+    # Timestamp when the request arrived at the llm engine.
+    arrival_ts_s: Optional[float] = None
+
+    # Number of tokens cached. When part of the request prefix is cached,
+    # this will be set.
+    num_cached_tokens: int = 0
+
+    # Number of tokens computed.
+    num_computed_tokens: int = 0
+
+    # The timestamp when the request become waiting in the queue.
+    queued_ts_s: Optional[float] = None
+
+    # When the input processor is completed.
+    input_processor_end_ts_s: Optional[float] = None
+
+    # A sorted list of timestamps when the request was scheduled to prefill.
+    # This could be when:
+    # 1. the request is newly scheduled, so it's a new prefill.
+    # 2. the request was preempted and resumed. It is equivalent to running
+    #    a prefill of the original prefill tokens + generated output tokens
+    #    before preemption.
+    prefill_start_ts_s_lst: List[float] = dataclass_field(default_factory=list)
+
+    # A list of timestamps when a token is decoded by the engine core.
+    decoding_ts_s_lst: List[float] = dataclass_field(default_factory=list)
+
+    # A sorted list of timestamps for each output token.
+    output_token_ts_s_lst: List[float] = dataclass_field(default_factory=list)
+
+    # First token's timestamp.
+    first_token_ts_s: Optional[float] = None
+
+    # TODO(rickyx): we need model runner to surface these.
+    model_forward_duration_s: float = 0.0
+    # Includes model forward, block/sync across workers, cpu-gpu sync time
+    # and sampling time.
+    model_execute_duration_s: float = 0.0
+
+    # A sorted list of timestamps when the request was preempted at the
+    # scheduler.
+    # TODO(rickyx): right now, we don't actually have a good high-level
+    # metric to measure the impact of preemption other than observation of
+    # large P99 TPOT. Ideally we could quantify the impact of preemption by
+    # measuring the number of tokens re-computed due to preemption.
+    preempted_ts_s_lst: List[float] = dataclass_field(default_factory=list)
+
+    # Timestamp when the request was finished at the engine core.
+    finished_ts_s: Optional[float] = None
+
+    # Finish reason.
+    finish_reason: Optional[str] = None
+
+    ############################################################
+    # Derived properties.
+    ############################################################
+    @property
+    def prefill_ts_s(self) -> Optional[float]:
+        """The timestamp when the request started prefilling.
+        Since a request could be preempted in decoding and later resumed
+        to prefill the decoded tokens, we use the first prefill start timestamp.
+        """
+        return (self.prefill_start_ts_s_lst[0]
+                if self.prefill_start_ts_s_lst else None)
+
+    @property
+    def e2e_latency_s(self) -> Optional[float]:
+        if self.finished_ts_s is None or self.arrival_ts_s is None:
+            return None
+        assert self.finished_ts_s >= self.arrival_ts_s
+        return self.finished_ts_s - self.arrival_ts_s
+
+    @property
+    def queue_duration_s(self) -> Optional[float]:
+        """How long the request was waiting to run."""
+        if self.queued_ts_s is None or self.prefill_ts_s is None:
+            # Either not queued or not running yet.
+            return None
+        assert self.queued_ts_s <= self.prefill_ts_s
+        return self.prefill_ts_s - self.queued_ts_s
+
+    @property
+    def inference_latency_s(self) -> Optional[float]:
+        """How long the request was running inference
+        (prefill and decode)."""
+        if self.finished_ts_s is None or self.prefill_ts_s is None:
+            return None
+        assert self.finished_ts_s >= self.prefill_ts_s
+        return self.finished_ts_s - self.prefill_ts_s
+
+    @property
+    def first_token_latency_s(self) -> Optional[float]:
+        if self.first_token_ts_s is None or self.arrival_ts_s is None:
+            return None
+        assert self.first_token_ts_s >= self.arrival_ts_s
+        return self.first_token_ts_s - self.arrival_ts_s
+
+    @property
+    def prefill_latency_s(self) -> Optional[float]:
+        if self.first_token_ts_s is None or self.prefill_ts_s is None:
+            return None
+        assert self.first_token_ts_s >= self.prefill_ts_s
+        return self.first_token_ts_s - self.prefill_ts_s
+
+    @property
+    def decode_latency_s(self) -> Optional[float]:
+        if self.e2e_latency_s is None or self.first_token_latency_s is None:
+            return None
+        assert self.e2e_latency_s >= self.first_token_latency_s
+        return self.e2e_latency_s - self.first_token_latency_s
+
+    @property
+    def output_token_latency_s_lst(self) -> List[float]:
+        if len(self.output_token_ts_s_lst) == 0:
+            return []
+        latency_s_lst = []
+        for i in range(1, len(self.output_token_ts_s_lst)):
+            assert (self.output_token_ts_s_lst[i] >=
+                    self.output_token_ts_s_lst[i - 1])
+            latency_s = (self.output_token_ts_s_lst[i] -
+                         self.output_token_ts_s_lst[i - 1])
+            latency_s_lst.append(latency_s)
+        return latency_s_lst
+
+    @property
+    def num_output_tokens(self) -> int:
+        return len(self.output_token_ts_s_lst)
+
+    @property
+    def is_finished(self) -> bool:
+        return self.finished_ts_s is not None
+
+    def update_from(self, update: "RequestStatsUpdate"):
+        RequestStatsUpdate.check_valid_update(update, self.last_update_type,
+                                              self.last_updated_ts_s)
+        ts = update.monotonic_ts_s
+        self.last_updated_ts_s = ts
+        self.last_update_type = update.type
+        if update.type == RequestStatsUpdate.Type.ARRIVED:
+            self.arrival_ts_s = ts
+        elif update.type == RequestStatsUpdate.Type.INPUT_PROCESSED:
+            self.input_processor_end_ts_s = ts
+            self.sampling_params = update.sampling_params
+            self.num_prompt_tokens = update.num_prompt_tokens
+        elif update.type == RequestStatsUpdate.Type.QUEUED:
+            self.queued_ts_s = ts
+        elif update.type == RequestStatsUpdate.Type.PREFILLING:
+            self.prefill_start_ts_s_lst.append(ts)
+            self.num_cached_tokens = update.num_cached_tokens
+            self.num_computed_tokens = update.num_computed_tokens
+        elif update.type == RequestStatsUpdate.Type.PREEMPTED:
+            self._reset_for_preemption(ts)
+        elif update.type == RequestStatsUpdate.Type.DECODING:
+            self.decoding_ts_s_lst.append(ts)
+        elif update.type == RequestStatsUpdate.Type.DETOKENIZED:
+            self._record_detokenized_output(
+                ts,
+                update.num_new_tokens,
+            )
+        elif update.type == RequestStatsUpdate.Type.FINISHED:
+            self.finished_ts_s = ts
+            self.finish_reason = update.finish_reason
+        else:
+            raise ValueError(f"Unknown update type: {update.type}")
+
+    def _record_detokenized_output(
+        self,
+        ts_s: float,
+        num_new_tokens: int,
+    ):
+        # Update if first output token is generated.
+        if len(self.output_token_ts_s_lst) == 0:
+            self.first_token_ts_s = ts_s
+            assert (
+                self.prefill_ts_s is not None
+            ), "Request must be running before generating output tokens."
+
+        # Some X new tokens were generated at the ts.
+        self.output_token_ts_s_lst.extend([ts_s] * num_new_tokens)
+
+    def _reset_for_preemption(self, ts_s: float):
+        self.preempted_ts_s_lst.append(ts_s)
+        # Reset the computed tokens since it might restart the prefill.
+        self.num_computed_tokens = 0
+        # Cached token count might also change when resumed.
+        self.num_cached_tokens = 0
+        # These stats don't change since they happen before request running.
+        # - arrival_ts_s
+        # - input_processor_end_ts_s
+        # - sampling_params
+        # - num_prompt_tokens
+        # - first_token_ts_s
+        #
+        # These stats are accumulated over preemptions:
+        # - output_token_ts_s_lst
+        # - prefill_start_ts_s_lst (after preemption, it will prefill the
+        #   original prefill tokens and any output tokens generated before
+        #   preemption.)
+
+
+@dataclass
+class KVCacheStats:
+    #   KV Cache Usage in %
+    gpu_cache_usage_sys: float = 0.0
+    gpu_prefix_cache_hit_rate: float = 0.0
+
+
+@dataclass
+class SchedulerStats:
+    """Stats associated with the scheduler."""
+
+    # Number of requests currently running.
+    num_running_reqs: int = 0
+    # Number of requests currently waiting.
+    num_waiting_reqs: int = 0
+
+    kv_cache_stats: KVCacheStats = dataclass_field(
+        default_factory=KVCacheStats)
+
+
+@dataclass
+class EngineCoreProcessStats:
+    """Stats associated with the engine core process."""
+
+    # Number of requests currently in the input queue. None if the engine core
+    # is not running in multiprocess mode.
+    input_queue_size: Optional[int] = None
+    # Number of outputs currently in the output queue. None if the engine core
+    # is not running in multiprocess mode.
+    output_queue_size: Optional[int] = None
+
+
+class EngineCoreStatsSnapshot(msgspec.Struct,
+                              array_like=True,
+                              omit_defaults=True,
+                              gc=False):
+    """
+    A snapshot of the EngineCore's current stats over a period of time.
+    """
+
+    # Snapshot of the scheduler stats.
+    scheduler_stats: SchedulerStats = msgspec_field(
+        default_factory=SchedulerStats)
+
+    # Per request stats updates.
+    requests_stats_updates: List[RequestStatsUpdate] = msgspec_field(
+        default_factory=list)
+
+    # Engine core's queue stats.
+    engine_core_process_stats: EngineCoreProcessStats = msgspec_field(
+        default_factory=EngineCoreProcessStats)
+
+    # TODO(rickyx): Add other components' stats,
+    # e.g. model runner/worker and etc.

From 18fd4a83316868747def6e7e1e2a6caebf8b8ace Mon Sep 17 00:00:00 2001
From: Andy Lo <andylolu24@gmail.com>
Date: Tue, 21 Jan 2025 19:51:35 +0000
Subject: [PATCH 315/462] [Bugfix] Multi-sequence broken (#11898)

Signed-off-by: Andy Lo <andy@mistral.ai>
---
 tests/samplers/test_seeded_generate.py |  7 +-
 vllm/outputs.py                        |  2 +-
 vllm/sequence.py                       | 89 +++++++++++++++-----------
 3 files changed, 59 insertions(+), 39 deletions(-)

diff --git a/tests/samplers/test_seeded_generate.py b/tests/samplers/test_seeded_generate.py
index 88067f19c8f07..bf1ee6c397838 100644
--- a/tests/samplers/test_seeded_generate.py
+++ b/tests/samplers/test_seeded_generate.py
@@ -31,7 +31,7 @@ def test_random_sample_with_seed(
 
     sampling_params = SamplingParams(
         # Parameters to ensure sufficient randomness
-        temperature=2.0,
+        temperature=3.0,
         top_p=min(random.random() + 0.3, 1),
         top_k=random.randint(5, 20),
         n=random.randint(1, 10),
@@ -75,3 +75,8 @@ def test_random_sample_with_seed(
         # verify requests with the same seed match
         assert outputs[1] == outputs[4]
         assert outputs[2] == outputs[5]
+
+        # verify generations within the same parallel sampling group differ
+        for output in outputs:
+            for sub_output_a, sub_output_b in combinations(output, 2):
+                assert sub_output_a != sub_output_b
diff --git a/vllm/outputs.py b/vllm/outputs.py
index b519c159b1531..63df7dcf519b5 100644
--- a/vllm/outputs.py
+++ b/vllm/outputs.py
@@ -172,9 +172,9 @@ def from_seq_group(
         if seq_group.request_id in seq_id_to_seq_group:
             group: SequenceGroupBase = seq_id_to_seq_group[
                 seq_group.request_id]
+            assembled_seq_group = group.maybe_assemble_group(seq_group)
             if finished:
                 group.finish_seq(seq_group)
-            assembled_seq_group = group.maybe_assemble_group(seq_group)
             if assembled_seq_group is None:
                 return None
             return cls.from_seq_group(assembled_seq_group, use_cache,
diff --git a/vllm/sequence.py b/vllm/sequence.py
index 5857f656dfc10..74320db709f94 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -815,7 +815,9 @@ def set_finished_time(self, time: Optional[float]) -> None:
     def get_max_num_running_seqs(self) -> int:
         """The maximum number of sequences running in parallel in the remaining
         lifetime of the request."""
-        return 0 if self.first_seq.is_finished() else 1
+        if self.is_single_seq:
+            return 0 if self.first_seq.is_finished() else 1
+        return self.num_seqs() - self.num_finished_seqs()
 
     def get_seqs(
         self,
@@ -824,7 +826,10 @@ def get_seqs(
         if status is None:
             return self.seqs
 
-        return self.seqs if self.first_seq.status == status else []
+        if self.is_single_seq:
+            return self.seqs if self.first_seq.status == status else []
+
+        return [seq for seq in self.seqs if seq.status == status]
 
     def is_encoder_decoder(self) -> bool:
         return self.encoder_seq is not None
@@ -833,19 +838,22 @@ def get_encoder_seq(self) -> Optional[Sequence]:
         return self.encoder_seq
 
     def get_finished_seqs(self) -> List[Sequence]:
-        return self.seqs if self.first_seq.is_finished() else []
+        if self.is_single_seq:
+            return self.seqs if self.first_seq.is_finished() else []
+
+        return [seq for seq in self.seqs if seq.is_finished()]
 
     def update_num_computed_tokens(self, num_new_computed_tokens: int):
         """Update number of tokens computed so far."""
-        seq = self.first_seq
-        if not seq.is_finished():
-            seq.data.update_num_computed_tokens(num_new_computed_tokens)
+        for seq in self.seqs:
+            if not seq.is_finished():
+                seq.data.update_num_computed_tokens(num_new_computed_tokens)
 
     def get_num_uncomputed_tokens(self) -> int:
         num_uncomputed_tokens = 0
-        seq = self.first_seq
-        if not seq.is_finished():
-            num_uncomputed_tokens += seq.data.get_num_uncomputed_tokens()
+        for seq in self.seqs:
+            if not seq.is_finished():
+                num_uncomputed_tokens += seq.data.get_num_uncomputed_tokens()
         return num_uncomputed_tokens
 
     def num_seqs(self, status: Optional[SequenceStatus] = None) -> int:
@@ -860,10 +868,14 @@ def num_seqs(self, status: Optional[SequenceStatus] = None) -> int:
         return len(self.get_seqs(status))
 
     def num_finished_seqs(self) -> int:
-        return 1 if self.first_seq.is_finished() else 0
+        if self.is_single_seq:
+            return 1 if self.seqs[0].is_finished() else 0
+        return len(self.get_finished_seqs())
 
     def is_finished(self) -> bool:
-        return self.first_seq.is_finished()
+        if self.is_single_seq:
+            return self.first_seq.is_finished()
+        return all(seq.is_finished() for seq in self.seqs)
 
     def is_prefill(self) -> bool:
         return self.first_seq.is_prefill()
@@ -1391,13 +1403,15 @@ class ParallelSampleSequenceGroup(SequenceGroupBase):
     @staticmethod
     def add_request(request_id: str, engine, params, **kwargs):
         original_params = params
-        params = original_params.clone()
-        params.n = 1
         group = ParallelSampleSequenceGroup(request_id)
         seqs = []
         for i in range(original_params.n):
             request_id_i = f"{request_id}_parallel_sample_{i}"
             group.seq_id_to_index[request_id_i] = i
+            params = copy.deepcopy(original_params)
+            params.n = 1
+            if params.seed is not None:
+                params.seed += i
             seq_group = engine._add_processed_request(
                 request_id_i,
                 params=params,
@@ -1432,33 +1446,34 @@ def maybe_assemble_group(
             self, seq_group: SequenceGroup) -> Optional[SequenceGroup]:
 
         # in the streaming mode, we will return the assembled sequence
-        # for the first sequence, and then return None for the rest of
-        # sequences
+        # for the first remaining sequence, and then return None for the
+        # rest of sequences
         if self.streaming:
-            if self.seq_id_to_index[seq_group.request_id] == 0:
+            first_remaining_id = next(iter(self.to_be_finished))
+            if seq_group.request_id == first_remaining_id:
                 return self.assembled_seq_group
             return None
 
         # in the non-streaming mode, we will return the assembled sequence
-        # once after all sequences finish, and then return None for the
+        # when the last sequences finishes, and then return None for the
         # rest of the time
-
-        if len(self.to_be_finished) > 0:
-            return None
-
-        assert self.assembled_seq_group is not None
-        params = self.assembled_seq_group.sampling_params
-        assert isinstance(params, SamplingParams)
-        if not self.output_produced:
-            self.output_produced = True
-            if params._real_n is not None:
-                # Get the top-n sequences.
-                n = params._real_n or params.n
-                seqs = self.assembled_seq_group.seqs
-                sorting_key = lambda seq: seq.get_cumulative_logprob()
-                sorted_seqs = sorted(seqs, key=sorting_key, reverse=True)
-                top_n_seqs = sorted_seqs[:n]
-                self.assembled_seq_group.seqs = top_n_seqs
-            return self.assembled_seq_group
-        if self.output_produced:
-            return None
+        if (len(self.to_be_finished) == 1
+                and seq_group.request_id in self.to_be_finished
+                and seq_group.is_finished()):
+            assert self.assembled_seq_group is not None
+            params = self.assembled_seq_group.sampling_params
+            assert isinstance(params, SamplingParams)
+            if not self.output_produced:
+                self.output_produced = True
+                if params._real_n is not None:
+                    # Get the top-n sequences.
+                    n = params._real_n or params.n
+                    seqs = self.assembled_seq_group.seqs
+                    sorting_key = lambda seq: seq.get_cumulative_logprob()
+                    sorted_seqs = sorted(seqs, key=sorting_key, reverse=True)
+                    top_n_seqs = sorted_seqs[:n]
+                    self.assembled_seq_group.seqs = top_n_seqs
+                return self.assembled_seq_group
+            if self.output_produced:
+                return None
+        return None

From 347eeebe3bd84c1fb59af72be753ec42cc74b799 Mon Sep 17 00:00:00 2001
From: Adrian Cole <64215+codefromthecrypt@users.noreply.github.com>
Date: Tue, 21 Jan 2025 11:51:55 -0800
Subject: [PATCH 316/462] [Misc] Remove experimental dep from tracing.py
 (#12007)

Signed-off-by: Adrian Cole <adrian.cole@elastic.co>
---
 tests/tracing/test_tracing.py | 60 +++++++++++++++++------------------
 vllm/engine/llm_engine.py     | 32 +++++++++----------
 vllm/tracing.py               | 34 ++++++++++++--------
 3 files changed, 66 insertions(+), 60 deletions(-)

diff --git a/tests/tracing/test_tracing.py b/tests/tracing/test_tracing.py
index fe5fc979c66a3..49a16d16eb840 100644
--- a/tests/tracing/test_tracing.py
+++ b/tests/tracing/test_tracing.py
@@ -100,32 +100,32 @@ def test_traces(trace_service):
 
     attributes = decode_attributes(
         request.resource_spans[0].scope_spans[0].spans[0].attributes)
-    assert attributes.get(SpanAttributes.LLM_RESPONSE_MODEL) == model
+    assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model
     assert attributes.get(
-        SpanAttributes.LLM_REQUEST_ID) == outputs[0].request_id
+        SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id
+    assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE
+                          ) == sampling_params.temperature
     assert attributes.get(
-        SpanAttributes.LLM_REQUEST_TEMPERATURE) == sampling_params.temperature
+        SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p
     assert attributes.get(
-        SpanAttributes.LLM_REQUEST_TOP_P) == sampling_params.top_p
-    assert attributes.get(
-        SpanAttributes.LLM_REQUEST_MAX_TOKENS) == sampling_params.max_tokens
-    assert attributes.get(SpanAttributes.LLM_REQUEST_N) == sampling_params.n
-    assert attributes.get(SpanAttributes.LLM_USAGE_PROMPT_TOKENS) == len(
+        SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS) == sampling_params.max_tokens
+    assert attributes.get(SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n
+    assert attributes.get(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len(
         outputs[0].prompt_token_ids)
     completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs)
     assert attributes.get(
-        SpanAttributes.LLM_USAGE_COMPLETION_TOKENS) == completion_tokens
+        SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens
     metrics = outputs[0].metrics
     assert attributes.get(
-        SpanAttributes.LLM_LATENCY_TIME_IN_QUEUE) == metrics.time_in_queue
+        SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE) == metrics.time_in_queue
     ttft = metrics.first_token_time - metrics.arrival_time
     assert attributes.get(
-        SpanAttributes.LLM_LATENCY_TIME_TO_FIRST_TOKEN) == ttft
+        SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft
     e2e_time = metrics.finished_time - metrics.arrival_time
-    assert attributes.get(SpanAttributes.LLM_LATENCY_E2E) == e2e_time
+    assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) == e2e_time
     assert metrics.scheduler_time > 0
-    assert attributes.get(
-        SpanAttributes.LLM_LATENCY_TIME_IN_SCHEDULER) == metrics.scheduler_time
+    assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER
+                          ) == metrics.scheduler_time
     # Model forward and model execute should be none, since detailed traces is
     # not enabled.
     assert metrics.model_forward_time is None
@@ -166,37 +166,37 @@ def test_traces_with_detailed_steps(trace_service):
 
     attributes = decode_attributes(
         request.resource_spans[0].scope_spans[0].spans[0].attributes)
-    assert attributes.get(SpanAttributes.LLM_RESPONSE_MODEL) == model
+    assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model
     assert attributes.get(
-        SpanAttributes.LLM_REQUEST_ID) == outputs[0].request_id
+        SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id
+    assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE
+                          ) == sampling_params.temperature
     assert attributes.get(
-        SpanAttributes.LLM_REQUEST_TEMPERATURE) == sampling_params.temperature
+        SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p
     assert attributes.get(
-        SpanAttributes.LLM_REQUEST_TOP_P) == sampling_params.top_p
-    assert attributes.get(
-        SpanAttributes.LLM_REQUEST_MAX_TOKENS) == sampling_params.max_tokens
-    assert attributes.get(SpanAttributes.LLM_REQUEST_N) == sampling_params.n
-    assert attributes.get(SpanAttributes.LLM_USAGE_PROMPT_TOKENS) == len(
+        SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS) == sampling_params.max_tokens
+    assert attributes.get(SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n
+    assert attributes.get(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len(
         outputs[0].prompt_token_ids)
     completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs)
     assert attributes.get(
-        SpanAttributes.LLM_USAGE_COMPLETION_TOKENS) == completion_tokens
+        SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens
     metrics = outputs[0].metrics
     assert attributes.get(
-        SpanAttributes.LLM_LATENCY_TIME_IN_QUEUE) == metrics.time_in_queue
+        SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE) == metrics.time_in_queue
     ttft = metrics.first_token_time - metrics.arrival_time
     assert attributes.get(
-        SpanAttributes.LLM_LATENCY_TIME_TO_FIRST_TOKEN) == ttft
+        SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft
     e2e_time = metrics.finished_time - metrics.arrival_time
-    assert attributes.get(SpanAttributes.LLM_LATENCY_E2E) == e2e_time
+    assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) == e2e_time
     assert metrics.scheduler_time > 0
-    assert attributes.get(
-        SpanAttributes.LLM_LATENCY_TIME_IN_SCHEDULER) == metrics.scheduler_time
+    assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER
+                          ) == metrics.scheduler_time
     assert metrics.model_forward_time > 0
     assert attributes.get(
-        SpanAttributes.LLM_LATENCY_TIME_IN_MODEL_FORWARD) == pytest.approx(
+        SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_FORWARD) == pytest.approx(
             metrics.model_forward_time / 1000)
     assert metrics.model_execute_time > 0
-    assert attributes.get(SpanAttributes.LLM_LATENCY_TIME_IN_MODEL_EXECUTE
+    assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE
                           ) == metrics.model_execute_time
     assert metrics.model_forward_time < 1000 * metrics.model_execute_time
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 6a6b4a14a4c49..dc6c9c1778dd7 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -1857,46 +1857,44 @@ def create_trace_span(self, seq_group: SequenceGroup) -> None:
             metrics = seq_group.metrics
             ttft = metrics.first_token_time - metrics.arrival_time
             e2e_time = metrics.finished_time - metrics.arrival_time
-            # attribute names are based on
-            # https://github.com/open-telemetry/semantic-conventions/blob/main/docs/gen-ai/llm-spans.md
-            seq_span.set_attribute(SpanAttributes.LLM_RESPONSE_MODEL,
+            seq_span.set_attribute(SpanAttributes.GEN_AI_RESPONSE_MODEL,
                                    self.model_config.model)
-            seq_span.set_attribute(SpanAttributes.LLM_REQUEST_ID,
+            seq_span.set_attribute(SpanAttributes.GEN_AI_REQUEST_ID,
                                    seq_group.request_id)
-            seq_span.set_attribute(SpanAttributes.LLM_REQUEST_TEMPERATURE,
+            seq_span.set_attribute(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE,
                                    seq_group.sampling_params.temperature)
-            seq_span.set_attribute(SpanAttributes.LLM_REQUEST_TOP_P,
+            seq_span.set_attribute(SpanAttributes.GEN_AI_REQUEST_TOP_P,
                                    seq_group.sampling_params.top_p)
-            seq_span.set_attribute(SpanAttributes.LLM_REQUEST_MAX_TOKENS,
+            seq_span.set_attribute(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS,
                                    seq_group.sampling_params.max_tokens)
-            seq_span.set_attribute(SpanAttributes.LLM_REQUEST_N,
+            seq_span.set_attribute(SpanAttributes.GEN_AI_REQUEST_N,
                                    seq_group.sampling_params.n)
-            seq_span.set_attribute(SpanAttributes.LLM_USAGE_NUM_SEQUENCES,
+            seq_span.set_attribute(SpanAttributes.GEN_AI_USAGE_NUM_SEQUENCES,
                                    seq_group.num_seqs())
-            seq_span.set_attribute(SpanAttributes.LLM_USAGE_PROMPT_TOKENS,
+            seq_span.set_attribute(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS,
                                    len(seq_group.prompt_token_ids))
             seq_span.set_attribute(
-                SpanAttributes.LLM_USAGE_COMPLETION_TOKENS,
+                SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS,
                 sum([
                     seq.get_output_len()
                     for seq in seq_group.get_finished_seqs()
                 ]))
-            seq_span.set_attribute(SpanAttributes.LLM_LATENCY_TIME_IN_QUEUE,
+            seq_span.set_attribute(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE,
                                    metrics.time_in_queue)
             seq_span.set_attribute(
-                SpanAttributes.LLM_LATENCY_TIME_TO_FIRST_TOKEN, ttft)
-            seq_span.set_attribute(SpanAttributes.LLM_LATENCY_E2E, e2e_time)
+                SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN, ttft)
+            seq_span.set_attribute(SpanAttributes.GEN_AI_LATENCY_E2E, e2e_time)
             if metrics.scheduler_time is not None:
                 seq_span.set_attribute(
-                    SpanAttributes.LLM_LATENCY_TIME_IN_SCHEDULER,
+                    SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER,
                     metrics.scheduler_time)
             if metrics.model_forward_time is not None:
                 seq_span.set_attribute(
-                    SpanAttributes.LLM_LATENCY_TIME_IN_MODEL_FORWARD,
+                    SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_FORWARD,
                     metrics.model_forward_time / 1000.0)
             if metrics.model_execute_time is not None:
                 seq_span.set_attribute(
-                    SpanAttributes.LLM_LATENCY_TIME_IN_MODEL_EXECUTE,
+                    SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE,
                     metrics.model_execute_time)
 
     def _validate_model_inputs(self, inputs: ProcessorInputs,
diff --git a/vllm/tracing.py b/vllm/tracing.py
index 50068d8cf9c25..72a3f85118d36 100644
--- a/vllm/tracing.py
+++ b/vllm/tracing.py
@@ -16,7 +16,6 @@
         OTEL_EXPORTER_OTLP_TRACES_PROTOCOL)
     from opentelemetry.sdk.trace import TracerProvider
     from opentelemetry.sdk.trace.export import BatchSpanProcessor
-    from opentelemetry.semconv_ai import SpanAttributes as BaseSpanAttributes
     from opentelemetry.trace import SpanKind, Tracer, set_tracer_provider
     from opentelemetry.trace.propagation.tracecontext import (
         TraceContextTextMapPropagator)
@@ -92,21 +91,30 @@ def extract_trace_headers(headers: Mapping[str, str]) -> Mapping[str, str]:
     return {h: headers[h] for h in TRACE_HEADERS if h in headers}
 
 
-class SpanAttributes(BaseSpanAttributes):
-    # The following span attribute names are added here because they are missing
-    # from the Semantic Conventions for LLM.
-    LLM_REQUEST_ID = "gen_ai.request.id"
-    LLM_REQUEST_N = "gen_ai.request.n"
-    LLM_USAGE_NUM_SEQUENCES = "gen_ai.usage.num_sequences"
-    LLM_LATENCY_TIME_IN_QUEUE = "gen_ai.latency.time_in_queue"
-    LLM_LATENCY_TIME_TO_FIRST_TOKEN = "gen_ai.latency.time_to_first_token"
-    LLM_LATENCY_E2E = "gen_ai.latency.e2e"
-    LLM_LATENCY_TIME_IN_SCHEDULER = "gen_ai.latency.time_in_scheduler"
+class SpanAttributes:
+    # Attribute names copied from here to avoid version conflicts:
+    # https://github.com/open-telemetry/semantic-conventions/blob/main/docs/gen-ai/gen-ai-spans.md
+    GEN_AI_USAGE_COMPLETION_TOKENS = "gen_ai.usage.completion_tokens"
+    GEN_AI_USAGE_PROMPT_TOKENS = "gen_ai.usage.prompt_tokens"
+    GEN_AI_REQUEST_MAX_TOKENS = "gen_ai.request.max_tokens"
+    GEN_AI_REQUEST_TOP_P = "gen_ai.request.top_p"
+    GEN_AI_REQUEST_TEMPERATURE = "gen_ai.request.temperature"
+    GEN_AI_RESPONSE_MODEL = "gen_ai.response.model"
+    # Attribute names added until they are added to the semantic conventions:
+    GEN_AI_REQUEST_ID = "gen_ai.request.id"
+    GEN_AI_REQUEST_N = "gen_ai.request.n"
+    GEN_AI_USAGE_NUM_SEQUENCES = "gen_ai.usage.num_sequences"
+    GEN_AI_LATENCY_TIME_IN_QUEUE = "gen_ai.latency.time_in_queue"
+    GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN = "gen_ai.latency.time_to_first_token"
+    GEN_AI_LATENCY_E2E = "gen_ai.latency.e2e"
+    GEN_AI_LATENCY_TIME_IN_SCHEDULER = "gen_ai.latency.time_in_scheduler"
     # Time taken in the forward pass for this across all workers
-    LLM_LATENCY_TIME_IN_MODEL_FORWARD = "gen_ai.latency.time_in_model_forward"
+    GEN_AI_LATENCY_TIME_IN_MODEL_FORWARD = (
+        "gen_ai.latency.time_in_model_forward")
     # Time taken in the model execute function. This will include model
     # forward, block/sync across workers, cpu-gpu sync time and sampling time.
-    LLM_LATENCY_TIME_IN_MODEL_EXECUTE = "gen_ai.latency.time_in_model_execute"
+    GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE = (
+        "gen_ai.latency.time_in_model_execute")
 
 
 def contains_trace_headers(headers: Mapping[str, str]) -> bool:

From fa9ee08121d1d0ffb19c64f4b72ee653589e983b Mon Sep 17 00:00:00 2001
From: wangxiyuan <wangxiyuan1007@gmail.com>
Date: Wed, 22 Jan 2025 03:52:11 +0800
Subject: [PATCH 317/462] [Misc] Set default backend to SDPA for
 get_vit_attn_backend (#12235)

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
---
 vllm/model_executor/models/vision.py | 30 +++++++++++++++-------------
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py
index a1395982af44c..57166f05cd9bf 100644
--- a/vllm/model_executor/models/vision.py
+++ b/vllm/model_executor/models/vision.py
@@ -82,23 +82,25 @@ def get_vit_attn_backend(support_fa: bool = False) -> _Backend:
         if backend_by_env_var is not None:
             selected_backend = backend_name_to_enum(backend_by_env_var)
     if selected_backend is None:
-        # For Volta and Turing GPUs, use xformers instead.
-        device_available = current_platform.has_device_capability(80)
-        if device_available and support_fa:
-            from transformers.utils import is_flash_attn_2_available
-            if is_flash_attn_2_available():
-                selected_backend = _Backend.FLASH_ATTN
+        if current_platform.is_cuda():
+            device_available = current_platform.has_device_capability(80)
+            if device_available and support_fa:
+                from transformers.utils import is_flash_attn_2_available
+                if is_flash_attn_2_available():
+                    selected_backend = _Backend.FLASH_ATTN
+                else:
+                    logger.warning_once(
+                        "Current `vllm-flash-attn` has a bug inside vision "
+                        "module, so we use xformers backend instead. You can "
+                        "run `pip install flash-attn` to use flash-attention "
+                        "backend.")
+                    selected_backend = _Backend.XFORMERS
             else:
-                logger.warning_once(
-                    "Current `vllm-flash-attn` has a bug inside vision module, "
-                    "so we use xformers backend instead. You can run "
-                    "`pip install flash-attn` to use flash-attention backend.")
+                # For Volta and Turing GPUs, use xformers instead.
                 selected_backend = _Backend.XFORMERS
-        elif current_platform.is_cpu() or current_platform.is_rocm():
-            # ROCM doesn't support xformers
-            selected_backend = _Backend.TORCH_SDPA
         else:
-            selected_backend = _Backend.XFORMERS
+            # Default to torch SDPA for other non-GPU platforms.
+            selected_backend = _Backend.TORCH_SDPA
     return selected_backend
 
 
From 9c485d9e252a8834ed15656838d5fbe0dc3a8f2f Mon Sep 17 00:00:00 2001
From: Jani Monoses <jani.monoses@gmail.com>
Date: Tue, 21 Jan 2025 21:56:41 +0200
Subject: [PATCH 318/462] [Core] Free CPU pinned memory on environment cleanup
 (#10477)

---
 vllm/distributed/parallel_state.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index bf8b30cccd5f6..ffdf8b0f48087 100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -1183,6 +1183,11 @@ def cleanup_dist_env_and_memory(shutdown_ray: bool = False):
     from vllm.platforms import current_platform
     if not current_platform.is_cpu():
         torch.cuda.empty_cache()
+    try:
+        torch._C._host_emptyCache()
+    except AttributeError:
+        logger.warning(
+            "torch._C._host_emptyCache() only available in Pytorch >=2.5")
 
 
 def in_the_same_node_as(pg: Union[ProcessGroup, StatelessProcessGroup],

From 2acba47d9bf97135d33355eff303d61a2c8d3d8a Mon Sep 17 00:00:00 2001
From: Divakar Verma <137818590+divakar-amd@users.noreply.github.com>
Date: Tue, 21 Jan 2025 16:47:32 -0600
Subject: [PATCH 319/462] [bugfix] moe tuning. rm is_navi() (#12273)

Signed-off-by: Divakar Verma <divakar.verma@amd.com>
---
 benchmarks/kernels/benchmark_moe.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py
index 1d59a01422412..1fa0da75c79d2 100644
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -12,10 +12,10 @@
 
 from vllm.model_executor.layers.fused_moe.fused_moe import *
 from vllm.platforms import current_platform
-from vllm.utils import FlexibleArgumentParser, is_navi
+from vllm.utils import FlexibleArgumentParser
 
 FP8_DTYPE = torch.float8_e4m3fnuz if current_platform.is_rocm(
-) and not is_navi() else torch.float8_e4m3fn
+) else torch.float8_e4m3fn
 
 
 class BenchmarkConfig(TypedDict):

From 69196a9bc7aefcd132c68a2184f1092ee3377ba9 Mon Sep 17 00:00:00 2001
From: Aleksandr Malyshev <164964928+maleksan85@users.noreply.github.com>
Date: Tue, 21 Jan 2025 15:30:46 -0800
Subject: [PATCH 320/462] [BUGFIX] When skip_tokenize_init and multistep are
 set, execution crashes (#12277)

Signed-off-by: maleksan85 <maleksan@amd.com>
Co-authored-by: maleksan85 <maleksan@amd.com>
---
 vllm/engine/output_processor/multi_step.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/engine/output_processor/multi_step.py b/vllm/engine/output_processor/multi_step.py
index c8b282b1a7676..99c2baf3f4df4 100644
--- a/vllm/engine/output_processor/multi_step.py
+++ b/vllm/engine/output_processor/multi_step.py
@@ -144,7 +144,7 @@ def process_outputs(self,
     def _process_decode_and_stop(self, seq: Sequence,
                                  sampling_params: SamplingParams) -> None:
         new_char_count = 0
-        if sampling_params.detokenize:
+        if sampling_params.detokenize and self.detokenizer:
             new_char_count = self.detokenizer.decode_sequence_inplace(
                 seq, sampling_params)
 

From 09ccc9c8f7943220f0b211639cee0655527e857d Mon Sep 17 00:00:00 2001
From: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com>
Date: Tue, 21 Jan 2025 18:49:22 -0500
Subject: [PATCH 321/462] [Documentation][AMD] Add information about prebuilt
 ROCm vLLM docker for perf validation purpose (#12281)

Signed-off-by: Hongxia Yang <hongxyan@amd.com>
---
 docs/source/getting_started/installation/gpu/rocm.inc.md | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/docs/source/getting_started/installation/gpu/rocm.inc.md b/docs/source/getting_started/installation/gpu/rocm.inc.md
index 8ef1bc95fd522..69238f6e36fb2 100644
--- a/docs/source/getting_started/installation/gpu/rocm.inc.md
+++ b/docs/source/getting_started/installation/gpu/rocm.inc.md
@@ -13,6 +13,14 @@ vLLM supports AMD GPUs with ROCm 6.2.
 
 Currently, there are no pre-built ROCm wheels.
 
+However, the [AMD Infinity hub for vLLM](https://hub.docker.com/r/rocm/vllm/tags) offers a prebuilt, optimized
+docker image designed for validating inference performance on the AMD Instinct™ MI300X accelerator.
+
+```{tip}
+Please check [LLM inference performance validation on AMD Instinct MI300X](https://rocm.docs.amd.com/en/latest/how-to/performance-validation/mi300x/vllm-benchmark.html)
+for instructions on how to use this prebuilt docker image.
+```
+
 ### Build wheel from source
 
 0. Install prerequisites (skip if you are already in an environment/docker with the following installed):

From df76e5af2635dee68bbace85da6c0257f6ee74bb Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Wed, 22 Jan 2025 08:48:13 +0800
Subject: [PATCH 322/462] [VLM] Simplify post-processing of replacement info
 (#12269)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 .../multimodal/processing/test_common.py      |   2 +-
 tests/models/registry.py                      |   3 +-
 tests/multimodal/test_processing.py           |  42 +++---
 vllm/model_executor/models/aria.py            |  10 +-
 vllm/model_executor/models/blip2.py           |  33 ++---
 vllm/model_executor/models/chameleon.py       |  42 ++----
 vllm/model_executor/models/fuyu.py            |  38 ++----
 vllm/model_executor/models/phi3v.py           |  45 +++----
 vllm/model_executor/models/qwen2_audio.py     |  43 ++----
 vllm/multimodal/processing.py                 | 125 ++++++++++++------
 10 files changed, 175 insertions(+), 208 deletions(-)

diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py
index d6d3d3b34ad46..fe5b733c750a8 100644
--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
@@ -35,7 +35,7 @@ def _test_processing_correctness(
         task="auto",
         tokenizer=model_id,
         tokenizer_mode="auto",
-        trust_remote_code=True,
+        trust_remote_code=model_info.trust_remote_code,
         seed=0,
         dtype="float16",
         revision=None,
diff --git a/tests/models/registry.py b/tests/models/registry.py
index e99dbd16c47b9..0bd06dea0ec7f 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -261,7 +261,8 @@ def check_available_online(
                                        trust_remote_code=True),
     "Qwen2AudioForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-Audio-7B-Instruct"),  # noqa: E501
     "Qwen2VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-VL-2B-Instruct"),  # noqa: E501
-    "UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_3"),
+    "UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_3",
+                                     trust_remote_code=True),
     # [Encoder-decoder]
     "MllamaForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-3.2-11B-Vision-Instruct"),  # noqa: E501
     "WhisperForConditionalGeneration": _HfExamplesInfo("openai/whisper-large-v3"),  # noqa: E501
diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py
index 9e58ed4cfde93..13f820d013e2a 100644
--- a/tests/multimodal/test_processing.py
+++ b/tests/multimodal/test_processing.py
@@ -7,12 +7,16 @@
 
 from vllm.config import ModelConfig
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.processing import (PlaceholderInfo, PromptReplacement,
+# yapf conflicts with isort for this block
+# yapf: disable
+from vllm.multimodal.processing import (PlaceholderFeaturesInfo,
+                                        PromptReplacement,
                                         find_mm_placeholders,
                                         find_text_matches, find_token_matches,
                                         iter_token_matches,
                                         replace_text_matches,
                                         replace_token_matches)
+# yapf: enable
 from vllm.multimodal.profiling import MultiModalProfiler
 from vllm.multimodal.utils import cached_get_tokenizer
 from vllm.transformers_utils.tokenizer import AnyTokenizer
@@ -433,19 +437,19 @@ def test_find_replace_tokens(
             [1, 9833, 28747, 32000, 9833, 28747, 32000, 32000, 918],
             {
                 "pattern_1": [
-                    PlaceholderInfo(
+                    PlaceholderFeaturesInfo(
                         modality="pattern_1",
                         item_idx=0,
                         start_idx=6,
-                        replacement=[32000, 32000],
+                        tokens=[32000, 32000],
                     ),
                 ],
                 "pattern_4": [
-                    PlaceholderInfo(
+                    PlaceholderFeaturesInfo(
                         modality="pattern_4",
                         item_idx=0,
                         start_idx=3,
-                        replacement=[32000],
+                        tokens=[32000],
                     ),
                 ],
             }
@@ -455,25 +459,25 @@ def test_find_replace_tokens(
             [1, 32000, 32000, 9833, 28747, 32000, 32000, 1550, 918, 1550],
             {
                 "pattern_1": [
-                    PlaceholderInfo(
+                    PlaceholderFeaturesInfo(
                         modality="pattern_1",
                         item_idx=0,
                         start_idx=1,
-                        replacement=[32000, 32000],
+                        tokens=[32000, 32000],
                     ),
-                    PlaceholderInfo(
+                    PlaceholderFeaturesInfo(
                         modality="pattern_1",
                         item_idx=1,
                         start_idx=5,
-                        replacement=[32000, 32000],
+                        tokens=[32000, 32000],
                     ),
                 ],
                 "pattern_3": [
-                    PlaceholderInfo(
+                    PlaceholderFeaturesInfo(
                         modality="pattern_3",
                         item_idx=0,
                         start_idx=7,
-                        replacement=[1550, 918, 1550],
+                        tokens=[1550, 918, 1550],
                     ),
                 ],
                 # No match for pattern_4 as it has lower priority than pattern_1
@@ -483,33 +487,33 @@ def test_find_replace_tokens(
             [1, 32000, 32000, 32000, 32000, 32000, 1550, 918, 1550],
             {
                 "pattern_1": [
-                    PlaceholderInfo(
+                    PlaceholderFeaturesInfo(
                         modality="pattern_1",
                         item_idx=0,
                         start_idx=1,
-                        replacement=[32000, 32000],
+                        tokens=[32000, 32000],
                     ),
-                    PlaceholderInfo(
+                    PlaceholderFeaturesInfo(
                         modality="pattern_1",
                         item_idx=1,
                         start_idx=3,
-                        replacement=[32000, 32000],
+                        tokens=[32000, 32000],
                     ),
                 ],
                 "pattern_4": [
-                    PlaceholderInfo(
+                    PlaceholderFeaturesInfo(
                         modality="pattern_4",
                         item_idx=0,
                         start_idx=5,
-                        replacement=[32000],
+                        tokens=[32000],
                     ),
                 ],
                 "pattern_3": [
-                    PlaceholderInfo(
+                    PlaceholderFeaturesInfo(
                         modality="pattern_3",
                         item_idx=0,
                         start_idx=6,
-                        replacement=[1550, 918, 1550],
+                        tokens=[1550, 918, 1550],
                     ),
                 ],
             }
diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py
index 503d1a38d9ee9..34ac39b812deb 100644
--- a/vllm/model_executor/models/aria.py
+++ b/vllm/model_executor/models/aria.py
@@ -342,13 +342,7 @@ def get_vision_config(self):
         return self.get_hf_config().vision_config
 
     def get_hf_processor(self):
-        processor = self.ctx.get_hf_processor(AriaProcessor)
-
-        # Patch for https://github.com/huggingface/transformers/issues/35768
-        processor.tokenizer.image_token = "<|img|>"
-        processor.image_token = "<|img|>"
-
-        return processor
+        return self.ctx.get_hf_processor(AriaProcessor)
 
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": None}
@@ -381,7 +375,7 @@ def get_dummy_processor_inputs(
         }
 
         hf_processor = self.info.get_hf_processor()
-        image_token: str = hf_processor.image_token  # type: ignore
+        image_token: str = hf_processor.tokenizer.image_token  # type: ignore
 
         return ProcessorInputs(
             prompt_text=image_token * num_images,
diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py
index f5c796b1acae6..a8e6f3b6c6d41 100644
--- a/vllm/model_executor/models/blip2.py
+++ b/vllm/model_executor/models/blip2.py
@@ -14,12 +14,12 @@
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalInputs, MultiModalKwargs,
-                                    NestedTensors, PlaceholderRange)
+from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
+                                    NestedTensors)
 from vllm.multimodal.parse import MultiModalDataItems
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        BaseProcessingInfo, PromptReplacement)
+                                        BaseProcessingInfo, PromptReplacement,
+                                        PromptReplacementDetails)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
 from vllm.sequence import IntermediateTensors
 
@@ -481,30 +481,13 @@ def _get_prompt_replacements(
             PromptReplacement(
                 modality="image",
                 target="</s>",
-                replacement="<image>" * num_image_tokens + "</s>",
+                replacement=PromptReplacementDetails(
+                    full="<image>" * num_image_tokens + "</s>",
+                    features="<image>" * num_image_tokens,
+                ),
             )
         ]
 
-    def apply(
-        self,
-        prompt: Union[str, list[int]],
-        mm_data: MultiModalDataDict,
-        hf_processor_mm_kwargs: Mapping[str, object],
-    ) -> MultiModalInputs:
-        result = super().apply(prompt, mm_data, hf_processor_mm_kwargs)
-
-        # Only <image> tokens should be considered as placeholders,
-        # so we ignore the trailing bos_token
-        result["mm_placeholders"] = {
-            modality: [
-                PlaceholderRange(offset=p["offset"], length=p["length"] - 1)
-                for p in ps
-            ]
-            for modality, ps in result["mm_placeholders"].items()
-        }
-
-        return result
-
 
 @MULTIMODAL_REGISTRY.register_processor(Blip2MultiModalProcessor,
                                         info=Blip2ProcessingInfo,
diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py
index e2207865a693d..2df50b3c4efe6 100644
--- a/vllm/model_executor/models/chameleon.py
+++ b/vllm/model_executor/models/chameleon.py
@@ -28,12 +28,12 @@
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalInputs, MultiModalKwargs,
-                                    NestedTensors, PlaceholderRange)
+from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
+                                    NestedTensors)
 from vllm.multimodal.parse import MultiModalDataItems
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        BaseProcessingInfo, PromptReplacement)
+                                        BaseProcessingInfo, PromptReplacement,
+                                        PromptReplacementDetails)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
 from vllm.sequence import IntermediateTensors
 
@@ -141,39 +141,23 @@ def _get_prompt_replacements(
         out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
         processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
+        image_tokens = processor.image_token * self.info.get_num_image_tokens()
 
         return [
             PromptReplacement(
                 modality="image",
                 target="<image>",
-                replacement="".join([
-                    processor.image_start_token,
-                    processor.image_token * self.info.get_num_image_tokens(),
-                    processor.image_end_token,
-                ]),
+                replacement=PromptReplacementDetails(
+                    full="".join([
+                        processor.image_start_token,
+                        image_tokens,
+                        processor.image_end_token,
+                    ]),
+                    features=image_tokens,
+                ),
             )
         ]
 
-    def apply(
-        self,
-        prompt: Union[str, list[int]],
-        mm_data: MultiModalDataDict,
-        hf_processor_mm_kwargs: Mapping[str, object],
-    ) -> MultiModalInputs:
-        result = super().apply(prompt, mm_data, hf_processor_mm_kwargs)
-
-        # Only <image> tokens should be considered as placeholders,
-        # so we ignore the image_start_token and image_end_token
-        result["mm_placeholders"] = {
-            modality: [
-                PlaceholderRange(offset=p["offset"] + 1,
-                                 length=p["length"] - 2) for p in ps
-            ]
-            for modality, ps in result["mm_placeholders"].items()
-        }
-
-        return result
-
 
 class ChameleonLayerNorm(nn.LayerNorm):
 
diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py
index 3f16d3ccbd061..4865eb170364c 100644
--- a/vllm/model_executor/models/fuyu.py
+++ b/vllm/model_executor/models/fuyu.py
@@ -16,7 +16,7 @@
 """ PyTorch Fuyu model."""
 import math
 from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple,
-                    TypedDict, Union)
+                    TypedDict)
 
 import torch
 import torch.nn as nn
@@ -30,13 +30,13 @@
 from vllm.model_executor.models.persimmon import PersimmonForCausalLM
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalInputs, MultiModalKwargs,
-                                    NestedTensors, PlaceholderRange)
+from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
+                                    NestedTensors)
 from vllm.multimodal.parse import (ImageProcessorItems, ImageSize,
                                    MultiModalDataItems)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        BaseProcessingInfo, PromptReplacement)
+                                        BaseProcessingInfo, PromptReplacement,
+                                        PromptReplacementDetails)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
 from vllm.sequence import IntermediateTensors
 
@@ -215,9 +215,13 @@ def get_replacement_fuyu(item_idx: int):
                 image_width=image_size.width,
                 image_height=image_size.height,
             )
+            image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
+                            [_NEWLINE_TOKEN_ID]) * nrows
 
-            return (([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows +
-                    [bos_token_id])
+            return PromptReplacementDetails(
+                full=image_tokens + [bos_token_id],
+                features=image_tokens,
+            )
 
         return [
             PromptReplacement(
@@ -227,26 +231,6 @@ def get_replacement_fuyu(item_idx: int):
             )
         ]
 
-    def apply(
-        self,
-        prompt: Union[str, list[int]],
-        mm_data: MultiModalDataDict,
-        hf_processor_mm_kwargs: Mapping[str, object],
-    ) -> MultiModalInputs:
-        result = super().apply(prompt, mm_data, hf_processor_mm_kwargs)
-
-        # Only |SPEAKER| (image) tokens should be considered as placeholders,
-        # so we ignore the trailing bos_token_id
-        result["mm_placeholders"] = {
-            modality: [
-                PlaceholderRange(offset=p["offset"], length=p["length"] - 1)
-                for p in ps
-            ]
-            for modality, ps in result["mm_placeholders"].items()
-        }
-
-        return result
-
 
 @MULTIMODAL_REGISTRY.register_processor(FuyuMultiModalProcessor,
                                         info=FuyuProcessingInfo,
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index dd3b0b35c9294..0fcda81da2800 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -30,15 +30,19 @@
     VocabParallelEmbedding)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalInputs, MultiModalKwargs,
-                                    NestedTensors, PlaceholderRange)
+from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
+                                    NestedTensors)
 from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
                                    ImageSize, MultiModalDataItems)
+# yapf conflicts with isort for this block
+# yapf: disable
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                         BaseProcessingInfo,
                                         BoundPromptReplacement,
-                                        PlaceholderInfo, PromptReplacement)
+                                        PlaceholderFeaturesInfo,
+                                        PromptReplacement,
+                                        PromptReplacementDetails)
+# yapf: enable
 from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
 from vllm.sequence import IntermediateTensors
 from vllm.utils import is_list_of
@@ -437,7 +441,12 @@ def get_replacement_phi3v(item_idx: int):
                     processor=hf_processor,
                 )
 
-            return [_IMAGE_TOKEN_ID] * num_image_tokens + [bos_token_id]
+            image_tokens = [_IMAGE_TOKEN_ID] * num_image_tokens
+
+            return PromptReplacementDetails(
+                full=image_tokens + [bos_token_id],
+                features=image_tokens,
+            )
 
         num_images = mm_items.get_count("image", strict=False)
 
@@ -454,7 +463,7 @@ def _apply_prompt_replacements(
         token_ids: list[int],
         mm_prompt_repls: Mapping[str, Sequence[BoundPromptReplacement]],
         mm_item_counts: Mapping[str, int],
-    ) -> tuple[list[int], str, Mapping[str, list[PlaceholderInfo]]]:
+    ) -> tuple[list[int], str, Mapping[str, list[PlaceholderFeaturesInfo]]]:
         token_ids, text, placeholders = super()._apply_prompt_replacements(
             token_ids=token_ids,
             mm_prompt_repls=mm_prompt_repls,
@@ -467,11 +476,11 @@ def _apply_prompt_replacements(
             token_ids = [token_ids[0], *token_ids[2:]]
             placeholders = {
                 modality: [
-                    PlaceholderInfo(
+                    PlaceholderFeaturesInfo(
                         modality=p.modality,
                         item_idx=p.item_idx,
                         start_idx=p.start_idx - 1,
-                        replacement=p.replacement,
+                        tokens=p.tokens,
                     ) for p in ps
                 ]
                 for modality, ps in placeholders.items()
@@ -479,26 +488,6 @@ def _apply_prompt_replacements(
 
         return token_ids, text, placeholders
 
-    def apply(
-        self,
-        prompt: Union[str, list[int]],
-        mm_data: MultiModalDataDict,
-        hf_processor_mm_kwargs: Mapping[str, object],
-    ) -> MultiModalInputs:
-        result = super().apply(prompt, mm_data, hf_processor_mm_kwargs)
-
-        # Only <|image|> tokens should be considered as placeholders,
-        # so we ignore the trailing bos_token_id
-        result["mm_placeholders"] = {
-            modality: [
-                PlaceholderRange(offset=p["offset"], length=p["length"] - 1)
-                for p in ps
-            ]
-            for modality, ps in result["mm_placeholders"].items()
-        }
-
-        return result
-
 
 @MULTIMODAL_REGISTRY.register_processor(Phi3VMultiModalProcessor,
                                         info=Phi3VProcessingInfo,
diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py
index 9cb8f83ad7873..c21cb815b5283 100644
--- a/vllm/model_executor/models/qwen2_audio.py
+++ b/vllm/model_executor/models/qwen2_audio.py
@@ -36,13 +36,13 @@
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalInputs, MultiModalKwargs,
-                                    NestedTensors, PlaceholderRange)
+from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
+                                    NestedTensors)
 from vllm.multimodal.parse import (AudioProcessorItems, MultiModalDataItems,
                                    MultiModalDataParser)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        BaseProcessingInfo, PromptReplacement)
+                                        BaseProcessingInfo, PromptReplacement,
+                                        PromptReplacementDetails)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
 from vllm.sequence import IntermediateTensors
 
@@ -216,11 +216,16 @@ def get_replacement_qwen2_audio(item_idx: int):
                     f"The audio {audio} (len={len(audio)}) is too short "
                     "to be represented inside the model")
 
-            return "".join([
-                audio_bos_token,
-                audio_token * num_placeholders,
-                audio_eos_token,
-            ])
+            audio_tokens = audio_token * num_placeholders
+
+            return PromptReplacementDetails(
+                full="".join([
+                    audio_bos_token,
+                    audio_tokens,
+                    audio_eos_token,
+                ]),
+                features=audio_tokens,
+            )
 
         return [
             PromptReplacement(
@@ -240,26 +245,6 @@ def _always_apply_prompt_replacements(self) -> bool:
         # tokens than the number of audio items)
         return not hasattr(self.info.get_hf_processor(), "audio_token")
 
-    def apply(
-        self,
-        prompt: Union[str, list[int]],
-        mm_data: MultiModalDataDict,
-        hf_processor_mm_kwargs: Mapping[str, object],
-    ) -> MultiModalInputs:
-        result = super().apply(prompt, mm_data, hf_processor_mm_kwargs)
-
-        # Only <|AUDIO|> tokens should be considered as placeholders,
-        # so we ignore the audio_bos_token and audio_eos_token
-        result["mm_placeholders"] = {
-            modality: [
-                PlaceholderRange(offset=p["offset"] + 1,
-                                 length=p["length"] - 2) for p in ps
-            ]
-            for modality, ps in result["mm_placeholders"].items()
-        }
-
-        return result
-
 
 @MULTIMODAL_REGISTRY.register_processor(
     Qwen2AudioMultiModalProcessor,
diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index ff02bcc8e1f2d..a0993f377dea7 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -1,7 +1,8 @@
 import re
 from abc import ABC, abstractmethod
 from collections import defaultdict
-from collections.abc import Callable, ItemsView, Iterable, Mapping, Sequence
+from collections.abc import (Callable, Generator, ItemsView, Iterable, Mapping,
+                             Sequence)
 from dataclasses import dataclass, field
 from functools import lru_cache
 from typing import (TYPE_CHECKING, Generic, NamedTuple, Optional, Protocol,
@@ -31,6 +32,24 @@
 _PromptSeq = Union[str, list[int]]
 
 
+@dataclass
+class PromptReplacementDetails:
+    full: _PromptSeq
+    """The full replacement."""
+
+    features: _PromptSeq
+    """
+    The part of the replacement that corresponds to placeholder feature tokens.
+    """
+
+    @staticmethod
+    def from_seq(seq: _PromptSeq):
+        return PromptReplacementDetails(full=seq, features=seq)
+
+
+_PromptRepl = Union[_PromptSeq, PromptReplacementDetails]
+
+
 @dataclass
 class PromptReplacement:
     """
@@ -43,8 +62,8 @@ class PromptReplacement:
     target: _PromptSeq
     """The token sequence (or text) to find and replace."""
 
-    replacement: Union[Callable[[int], _PromptSeq],
-                       _PromptSeq] = field(repr=False)
+    replacement: Union[Callable[[int], _PromptRepl],
+                       _PromptRepl] = field(repr=False)
     """
     Given the index of the processed item within :attr:`modality`,
     output the replacement token sequence (or text).
@@ -112,6 +131,14 @@ class _BoundPromptSequence:
     _text: Optional[str]
     _token_ids: Optional[list[int]]
 
+    @staticmethod
+    def from_seq(tokenizer: AnyTokenizer, seq: _PromptSeq):
+        return _BoundPromptSequence(
+            tokenizer=tokenizer,
+            _text=seq if isinstance(seq, str) else None,
+            _token_ids=seq if isinstance(seq, list) else None,
+        )
+
     def __post_init__(self) -> None:
         if self._text is None and self._token_ids is None:
             raise ValueError("At least one of 'text' and 'token_ids' must be "
@@ -134,6 +161,12 @@ def token_ids(self) -> list[int]:
         return self._token_ids
 
 
+@dataclass
+class _BoundPromptReplacementGroup:
+    full: _BoundPromptSequence
+    features: _BoundPromptSequence
+
+
 @dataclass
 class BoundPromptReplacement:
     """
@@ -145,24 +178,18 @@ class BoundPromptReplacement:
     modality: str
 
     _target: _PromptSeq
-    _replacement: Union[Callable[[int], _PromptSeq],
-                        _PromptSeq] = field(repr=False)
+    _replacement: Union[Callable[[int], _PromptRepl],
+                        _PromptRepl] = field(repr=False)
 
     def __post_init__(self) -> None:
-        self._replacement_cache = dict[int, _BoundPromptSequence]()
+        self._replacement_cache = dict[int, _BoundPromptReplacementGroup]()
 
     @property
     def target(self) -> _BoundPromptSequence:
         """The token sequence (or text) to find and replace."""
-        target = self._target
+        return _BoundPromptSequence.from_seq(self.tokenizer, self._target)
 
-        return _BoundPromptSequence(
-            tokenizer=self.tokenizer,
-            _text=target if isinstance(target, str) else None,
-            _token_ids=target if isinstance(target, list) else None,
-        )
-
-    def get_replacement(self, item_idx: int) -> _BoundPromptSequence:
+    def get_replacement(self, item_idx: int) -> _BoundPromptReplacementGroup:
         """
         Given the index of the processed item within :attr:`modality`,
         output the replacement token sequence (or text).
@@ -177,10 +204,16 @@ def get_replacement(self, item_idx: int) -> _BoundPromptSequence:
         else:
             cache_key = None
 
-        bound_replacement = _BoundPromptSequence(
-            tokenizer=self.tokenizer,
-            _text=replacement if isinstance(replacement, str) else None,
-            _token_ids=replacement if isinstance(replacement, list) else None,
+        if not isinstance(replacement, PromptReplacementDetails):
+            replacement = PromptReplacementDetails.from_seq(replacement)
+
+        bound_full = _BoundPromptSequence.from_seq(self.tokenizer,
+                                                   replacement.full)
+        bound_features = _BoundPromptSequence.from_seq(self.tokenizer,
+                                                       replacement.features)
+        bound_replacement = _BoundPromptReplacementGroup(
+            full=bound_full,
+            features=bound_features,
         )
 
         if cache_key is not None:
@@ -197,7 +230,7 @@ class _TokenMatch(NamedTuple):
 def iter_token_matches(
     token_ids: list[int],
     match_ids: list[int],
-) -> Iterable[_TokenMatch]:
+) -> Generator[_TokenMatch]:
     """
     Yield each occurrence of :code:`match_ids` in :code:`token_ids`.
 
@@ -272,15 +305,15 @@ def end_idx(self) -> int:
 
 
 @dataclass
-class PlaceholderInfo:
+class PlaceholderFeaturesInfo:
     modality: str
     item_idx: int
     start_idx: int
-    replacement: list[int]
+    tokens: list[int]
 
     @property
     def length(self) -> int:
-        return len(self.replacement)
+        return len(self.tokens)
 
     def to_range(self) -> PlaceholderRange:
         return PlaceholderRange(
@@ -362,10 +395,10 @@ def _replace_matches(
         replacement = repl_info.get_replacement(item_idx)
 
         if isinstance(prompt, str):
-            repl_seq = replacement.text
+            repl_seq = replacement.full.text
             out_seqs.append(prompt[prev_end_idx:start_idx] + repl_seq)
         else:
-            repl_seq = replacement.token_ids
+            repl_seq = replacement.full.token_ids
             out_seqs.append(prompt[prev_end_idx:start_idx] + repl_seq)
 
         prev_end_idx = end_idx
@@ -408,7 +441,7 @@ def _iter_placeholders(
     mm_prompt_repls: Mapping[str, Sequence[BoundPromptReplacement]],
     prompt: list[int],
     mm_item_counts: Mapping[str, int],
-) -> Iterable[PlaceholderInfo]:
+) -> Iterable[PlaceholderFeaturesInfo]:
     """
     Yield each set of placeholder tokens found in :code:`prompt`.
 
@@ -432,23 +465,33 @@ def _iter_placeholders(
 
             for repl_info in modality_repls:
                 replacement = repl_info.get_replacement(item_idx)
-                repl_tokens = replacement.token_ids
-                repl_len = len(repl_tokens)
-                end_idx = start_idx + repl_len
+                repl_tokens_full = replacement.full.token_ids
+                repl_len_full = len(repl_tokens_full)
+                end_idx_full = start_idx + repl_len_full
 
-                if repl_len == 0 or end_idx > prompt_len:
+                if repl_len_full == 0 or end_idx_full > prompt_len:
                     continue
 
-                if prompt[start_idx:end_idx] == repl_tokens:
-                    yield PlaceholderInfo(
-                        modality=modality,
-                        item_idx=item_idx,
-                        start_idx=start_idx,
-                        replacement=repl_tokens,
-                    )
+                if prompt[start_idx:end_idx_full] == repl_tokens_full:
+                    repl_tokens_feat = replacement.features.token_ids
+
+                    try:
+                        match = next(
+                            iter_token_matches(repl_tokens_full,
+                                               repl_tokens_feat))
+                        yield PlaceholderFeaturesInfo(
+                            modality=modality,
+                            item_idx=item_idx,
+                            start_idx=start_idx + match.start_idx,
+                            tokens=repl_tokens_feat,
+                        )
+                    except StopIteration:
+                        raise AssertionError(
+                            f"{repl_tokens_feat=} should be a "
+                            f"subsequence of {repl_tokens_full=}") from None
 
                     # Exclude overlapping matches
-                    start_idx = end_idx
+                    start_idx = end_idx_full
                     item_idx_by_modality[modality] += 1
                     found = True
                     break
@@ -464,7 +507,7 @@ def find_mm_placeholders(
     mm_prompt_repls: Mapping[str, Sequence[BoundPromptReplacement]],
     prompt: list[int],
     mm_item_counts: Mapping[str, int],
-) -> Mapping[str, list[PlaceholderInfo]]:
+) -> Mapping[str, list[PlaceholderFeaturesInfo]]:
     it = _iter_placeholders(mm_prompt_repls, prompt, mm_item_counts)
     return dict(full_groupby_modality(it))
 
@@ -679,7 +722,7 @@ def _find_mm_placeholders(
         mm_prompt_repls: Mapping[str, Sequence[BoundPromptReplacement]],
         new_token_ids: list[int],
         mm_item_counts: Mapping[str, int],
-    ) -> Mapping[str, list[PlaceholderInfo]]:
+    ) -> Mapping[str, list[PlaceholderFeaturesInfo]]:
         return find_mm_placeholders(mm_prompt_repls, new_token_ids,
                                     mm_item_counts)
 
@@ -948,7 +991,7 @@ def _apply_prompt_replacements(
         token_ids: list[int],
         mm_prompt_repls: Mapping[str, Sequence[BoundPromptReplacement]],
         mm_item_counts: Mapping[str, int],
-    ) -> tuple[list[int], str, Mapping[str, list[PlaceholderInfo]]]:
+    ) -> tuple[list[int], str, Mapping[str, list[PlaceholderFeaturesInfo]]]:
         tokenizer = self.info.get_tokenizer()
 
         mm_token_matches = {
@@ -1037,7 +1080,7 @@ def _validate_mm_kwargs(
 
     def _validate_mm_placeholders(
         self,
-        mm_placeholders: Mapping[str, list[PlaceholderInfo]],
+        mm_placeholders: Mapping[str, list[PlaceholderFeaturesInfo]],
         mm_item_counts: Mapping[str, int],
         *,
         allow_missing: bool = False,

From 64ea24d0b3ca794faabc5f1570a921a7b44118b7 Mon Sep 17 00:00:00 2001
From: "Kevin H. Luu" <kevin@anyscale.com>
Date: Tue, 21 Jan 2025 17:15:27 -0800
Subject: [PATCH 323/462] [ci/lint] Add back default arg for pre-commit
 (#12279)

Signed-off-by: kevin <kevin@anyscale.com>
---
 .github/workflows/pre-commit.yml              |  2 +-
 .../models/decoder_only/language/test_gguf.py | 17 ++++++-------
 vllm/model_executor/models/paligemma.py       |  2 +-
 vllm/model_executor/models/siglip.py          |  8 +++----
 vllm/platforms/__init__.py                    |  2 +-
 vllm/v1/stats/common.py                       | 24 ++++++++++---------
 6 files changed, 26 insertions(+), 29 deletions(-)

diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml
index bf9460151ec1b..06564969dc778 100644
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@@ -16,4 +16,4 @@ jobs:
     - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"
     - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
       with:
-        extra_args: --hook-stage manual
+        extra_args: --all-files --hook-stage manual
diff --git a/tests/models/decoder_only/language/test_gguf.py b/tests/models/decoder_only/language/test_gguf.py
index 38cea2462b440..ad8f8a0c320e9 100644
--- a/tests/models/decoder_only/language/test_gguf.py
+++ b/tests/models/decoder_only/language/test_gguf.py
@@ -74,11 +74,7 @@ def gguf_model(self):
 )
 
 MODELS = [
-    LLAMA_CONFIG,
-    QWEN2_CONFIG,
-    PHI3_CONFIG,
-    GPT2_CONFIG,
-    STABLELM_CONFIG,
+    LLAMA_CONFIG, QWEN2_CONFIG, PHI3_CONFIG, GPT2_CONFIG, STABLELM_CONFIG,
     DOLPHIN_CONFIG
     # STARCODER_CONFIG, # broken
 ]
@@ -114,11 +110,12 @@ def test_models(
             messages, tokenize=False, add_generation_prompt=True)
 
     # Run unquantized model.
-    with vllm_runner(model_name=model.original_model,
-                     enforce_eager=True, # faster tests
-                     dtype=dtype,
-                     max_model_len=MAX_MODEL_LEN,
-                     tensor_parallel_size=tp_size) as original_model:
+    with vllm_runner(
+            model_name=model.original_model,
+            enforce_eager=True,  # faster tests
+            dtype=dtype,
+            max_model_len=MAX_MODEL_LEN,
+            tensor_parallel_size=tp_size) as original_model:
         original_outputs = original_model.generate_greedy_logprobs(
             example_prompts[:-1], max_tokens, num_logprobs)
 
diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py
index ed9ae1887259e..5a28b1ffbb7b4 100644
--- a/vllm/model_executor/models/paligemma.py
+++ b/vllm/model_executor/models/paligemma.py
@@ -147,7 +147,7 @@ class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal,
             "up_proj",
         ],
     }
-    
+
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py
index 211e5dc80066e..1e51018973e8c 100644
--- a/vllm/model_executor/models/siglip.py
+++ b/vllm/model_executor/models/siglip.py
@@ -348,12 +348,10 @@ def __init__(
         if quant_config and quant_config.get_name() == "bitsandbytes":
             quantizable = True
         else:
-            # For other quantization, we require the hidden size to be a 
+            # For other quantization, we require the hidden size to be a
             # multiple of 64
-            quantizable = (
-                config.hidden_size % 64 == 0
-                and config.intermediate_size % 64 == 0
-            )
+            quantizable = (config.hidden_size % 64 == 0
+                           and config.intermediate_size % 64 == 0)
         self.fc1 = ColumnParallelLinear(
             config.hidden_size,
             config.intermediate_size,
diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py
index d20d35199bf5c..ddbdc43ca5710 100644
--- a/vllm/platforms/__init__.py
+++ b/vllm/platforms/__init__.py
@@ -101,7 +101,7 @@ def cpu_platform_plugin() -> Optional[str]:
     try:
         from importlib.metadata import version
         is_cpu = "cpu" in version("vllm")
-        if is_cpu == False:
+        if not is_cpu:
             import platform
             is_cpu = platform.machine().lower().startswith("arm")
 
diff --git a/vllm/v1/stats/common.py b/vllm/v1/stats/common.py
index 099d82c5904cf..500bc356fc179 100644
--- a/vllm/v1/stats/common.py
+++ b/vllm/v1/stats/common.py
@@ -10,10 +10,11 @@
 from vllm.sampling_params import SamplingParams
 
 
-class RequestStatsUpdate(msgspec.Struct,
-                         array_like=True,
-                         omit_defaults=True,
-                         gc=False):
+class RequestStatsUpdate(
+        msgspec.Struct,  # type: ignore
+        array_like=True,
+        omit_defaults=True,
+        gc=False):
     """
     An update to the request stats.
 
@@ -341,8 +342,8 @@ def update_from(self, update: "RequestStatsUpdate"):
             self.queued_ts_s = ts
         elif update.type == RequestStatsUpdate.Type.PREFILLING:
             self.prefill_start_ts_s_lst.append(ts)
-            self.num_cached_tokens = update.num_cached_tokens
-            self.num_computed_tokens = update.num_computed_tokens
+            self.num_cached_tokens = update.num_cached_tokens or 0
+            self.num_computed_tokens = update.num_computed_tokens or 0
         elif update.type == RequestStatsUpdate.Type.PREEMPTED:
             self._reset_for_preemption(ts)
         elif update.type == RequestStatsUpdate.Type.DECODING:
@@ -350,7 +351,7 @@ def update_from(self, update: "RequestStatsUpdate"):
         elif update.type == RequestStatsUpdate.Type.DETOKENIZED:
             self._record_detokenized_output(
                 ts,
-                update.num_new_tokens,
+                update.num_new_tokens or 0,
             )
         elif update.type == RequestStatsUpdate.Type.FINISHED:
             self.finished_ts_s = ts
@@ -425,10 +426,11 @@ class EngineCoreProcessStats:
     output_queue_size: Optional[int] = None
 
 
-class EngineCoreStatsSnapshot(msgspec.Struct,
-                              array_like=True,
-                              omit_defaults=True,
-                              gc=False):
+class EngineCoreStatsSnapshot(
+        msgspec.Struct,  # type: ignore
+        array_like=True,
+        omit_defaults=True,
+        gc=False):
     """
     A snapshot of the EngineCore's current stats over a period of time.
     """

From 016e3676e7eab58b177f92e579ba20173defee01 Mon Sep 17 00:00:00 2001
From: Liangfu Chen <liangfc@amazon.com>
Date: Tue, 21 Jan 2025 18:47:49 -0800
Subject: [PATCH 324/462] [CI] add docker volume prune to neuron CI (#12291)

Signed-off-by: Liangfu Chen <liangfc@amazon.com>
---
 .buildkite/run-neuron-test.sh | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/.buildkite/run-neuron-test.sh b/.buildkite/run-neuron-test.sh
index 189714ebb6d75..0590dad4f311f 100644
--- a/.buildkite/run-neuron-test.sh
+++ b/.buildkite/run-neuron-test.sh
@@ -25,8 +25,11 @@ if [ -f /tmp/neuron-docker-build-timestamp ]; then
     last_build=$(cat /tmp/neuron-docker-build-timestamp)
     current_time=$(date +%s)
     if [ $((current_time - last_build)) -gt 86400 ]; then
+        # Remove dangling images (those that are not tagged and not used by any container)
         docker image prune -f
-        docker system prune -f
+        # Remove unused volumes / force the system prune for old images as well.
+        docker volume prune -f && docker system prune -f
+        # Remove huggingface model artifacts and compiler cache
         rm -rf "${HF_MOUNT:?}/*"
         rm -rf "${NEURON_COMPILE_CACHE_MOUNT:?}/*"
         echo "$current_time" > /tmp/neuron-docker-build-timestamp

From cbdc4ad5a502f38270650a375931db439304094c Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Wed, 22 Jan 2025 12:06:54 +0800
Subject: [PATCH 325/462] [Ci/Build] Fix mypy errors on main (#12296)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 vllm/multimodal/processing.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index a0993f377dea7..1e0338dfb0226 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -43,7 +43,7 @@ class PromptReplacementDetails:
     """
 
     @staticmethod
-    def from_seq(seq: _PromptSeq):
+    def from_seq(seq: _PromptSeq) -> "PromptReplacementDetails":
         return PromptReplacementDetails(full=seq, features=seq)
 
 
@@ -132,7 +132,10 @@ class _BoundPromptSequence:
     _token_ids: Optional[list[int]]
 
     @staticmethod
-    def from_seq(tokenizer: AnyTokenizer, seq: _PromptSeq):
+    def from_seq(
+        tokenizer: AnyTokenizer,
+        seq: _PromptSeq,
+    ) -> "_BoundPromptSequence":
         return _BoundPromptSequence(
             tokenizer=tokenizer,
             _text=seq if isinstance(seq, str) else None,

From 222a9dc350d99e111ba09b29576113c9278e6a3e Mon Sep 17 00:00:00 2001
From: Nick Hill <nickhill@us.ibm.com>
Date: Tue, 21 Jan 2025 21:46:14 -0800
Subject: [PATCH 326/462] [Benchmark] More accurate TPOT calc in
 `benchmark_serving.py` (#12288)

Signed-off-by: Nick Hill <nhill@redhat.com>
---
 benchmarks/backend_request_func.py | 43 ++++++++++++-------
 benchmarks/benchmark_serving.py    | 69 +++++++++++++++++-------------
 2 files changed, 66 insertions(+), 46 deletions(-)

diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py
index a9ab4fc9b621e..f415d109bdfc8 100644
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -35,6 +35,7 @@ class RequestFuncOutput:
     generated_text: str = ""
     success: bool = False
     latency: float = 0.0
+    output_tokens: int = 0
     ttft: float = 0.0  # Time to first token
     itl: List[float] = field(
         default_factory=list)  # List of inter-token latencies
@@ -156,7 +157,7 @@ async def async_request_trt_llm(
                         timestamp = time.perf_counter()
                         # First token
                         if ttft == 0.0:
-                            ttft = time.perf_counter() - st
+                            ttft = timestamp - st
                             output.ttft = ttft
 
                         # Decoding phase
@@ -245,6 +246,9 @@ async def async_request_openai_completions(
             "logprobs": request_func_input.logprobs,
             "stream": True,
             "ignore_eos": request_func_input.ignore_eos,
+            "stream_options": {
+                "include_usage": True,
+            },
         }
         if request_func_input.extra_body:
             payload.update(request_func_input.extra_body)
@@ -256,7 +260,6 @@ async def async_request_openai_completions(
         output.prompt_len = request_func_input.prompt_len
 
         generated_text = ""
-        ttft = 0.0
         st = time.perf_counter()
         most_recent_timestamp = st
         try:
@@ -271,15 +274,16 @@ async def async_request_openai_completions(
 
                         chunk = chunk_bytes.decode("utf-8").removeprefix(
                             "data: ")
-                        if chunk == "[DONE]":
-                            latency = time.perf_counter() - st
-                        else:
+                        if chunk != "[DONE]":
                             data = json.loads(chunk)
 
                             # NOTE: Some completion API might have a last
                             # usage summary response without a token so we
                             # want to check a token was generated
-                            if data["choices"][0]["text"]:
+                            if choices := data.get("choices"):
+                                # Note that text could be empty here
+                                # e.g. for special tokens
+                                text = choices[0].get("text")
                                 timestamp = time.perf_counter()
                                 # First token
                                 if not first_chunk_received:
@@ -293,7 +297,10 @@ async def async_request_openai_completions(
                                                       most_recent_timestamp)
 
                                 most_recent_timestamp = timestamp
-                                generated_text += data["choices"][0]["text"]
+                                generated_text += text
+                            elif usage := data.get("usage"):
+                                output.output_tokens = usage.get(
+                                    "completion_tokens")
                     if first_chunk_received:
                         output.success = True
                     else:
@@ -302,7 +309,7 @@ async def async_request_openai_completions(
                             "Never received a valid chunk to calculate TTFT."
                             "This response will be marked as failed!")
                     output.generated_text = generated_text
-                    output.latency = latency
+                    output.latency = most_recent_timestamp - st
                 else:
                     output.error = response.reason or ""
                     output.success = False
@@ -342,6 +349,9 @@ async def async_request_openai_chat_completions(
             "max_completion_tokens": request_func_input.output_len,
             "stream": True,
             "ignore_eos": request_func_input.ignore_eos,
+            "stream_options": {
+                "include_usage": True,
+            },
         }
         if request_func_input.extra_body:
             payload.update(request_func_input.extra_body)
@@ -368,17 +378,15 @@ async def async_request_openai_chat_completions(
 
                         chunk = chunk_bytes.decode("utf-8").removeprefix(
                             "data: ")
-                        if chunk == "[DONE]":
-                            latency = time.perf_counter() - st
-                        else:
+                        if chunk != "[DONE]":
                             timestamp = time.perf_counter()
                             data = json.loads(chunk)
 
-                            delta = data["choices"][0]["delta"]
-                            if delta.get("content", None):
+                            if choices := data.get("choices"):
+                                content = choices[0]["delta"].get("content")
                                 # First token
                                 if ttft == 0.0:
-                                    ttft = time.perf_counter() - st
+                                    ttft = timestamp - st
                                     output.ttft = ttft
 
                                 # Decoding phase
@@ -386,13 +394,16 @@ async def async_request_openai_chat_completions(
                                     output.itl.append(timestamp -
                                                       most_recent_timestamp)
 
-                                generated_text += delta["content"]
+                                generated_text += content
+                            elif usage := data.get("usage"):
+                                output.output_tokens = usage.get(
+                                    "completion_tokens")
 
                             most_recent_timestamp = timestamp
 
                     output.generated_text = generated_text
                     output.success = True
-                    output.latency = latency
+                    output.latency = most_recent_timestamp - st
                 else:
                     output.error = response.reason or ""
                     output.success = False
diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index 53186e10c5452..bc026b0ec1ca6 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -25,6 +25,7 @@
 import argparse
 import asyncio
 import base64
+import gc
 import io
 import json
 import os
@@ -423,7 +424,7 @@ def calculate_metrics(
     tokenizer: PreTrainedTokenizerBase,
     selected_percentile_metrics: List[str],
     selected_percentiles: List[float],
-    gootput_config_dict: Dict[str, float],
+    goodput_config_dict: Dict[str, float],
 ) -> Tuple[BenchmarkMetrics, List[int]]:
     actual_output_lens: List[int] = []
     total_input = 0
@@ -436,19 +437,23 @@ def calculate_metrics(
     e2els: List[float] = []
     for i in range(len(outputs)):
         if outputs[i].success:
-            # We use the tokenizer to count the number of output tokens for all
-            # serving backends instead of looking at len(outputs[i].itl) since
-            # multiple output tokens may be bundled together
-            # Note : this may inflate the output token count slightly
-            output_len = len(
-                tokenizer(outputs[i].generated_text,
-                          add_special_tokens=False).input_ids)
+            output_len = outputs[i].output_tokens
+
+            if output_len is None:
+                # We use the tokenizer to count the number of output tokens
+                # for some serving backends instead of looking at
+                # len(outputs[i].itl) since multiple output tokens may be
+                # bundled together
+                # Note : this may inflate the output token count slightly
+                output_len = len(
+                    tokenizer(outputs[i].generated_text,
+                              add_special_tokens=False).input_ids)
             actual_output_lens.append(output_len)
             total_input += input_requests[i][1]
             tpot = 0
             if output_len > 1:
-                tpot = (outputs[i].latency - outputs[i].ttft) / (output_len -
-                                                                 1)
+                latency_minus_ttft = outputs[i].latency - outputs[i].ttft
+                tpot = latency_minus_ttft / (output_len - 1)
                 tpots.append(tpot)
             # Note: if output_len <= 1, we regard tpot as 0 for goodput
             all_tpots.append(tpot)
@@ -459,21 +464,21 @@ def calculate_metrics(
         else:
             actual_output_lens.append(0)
 
-    if gootput_config_dict:
+    if goodput_config_dict:
         valid_metrics = []
         slo_values = []
 
-        if "ttft" in gootput_config_dict:
+        if "ttft" in goodput_config_dict:
             valid_metrics.append(ttfts)
-            slo_values.append(gootput_config_dict["ttft"] /
+            slo_values.append(goodput_config_dict["ttft"] /
                               MILLISECONDS_TO_SECONDS_CONVERSION)
-        if "tpot" in gootput_config_dict:
+        if "tpot" in goodput_config_dict:
             valid_metrics.append(all_tpots)
-            slo_values.append(gootput_config_dict["tpot"] /
+            slo_values.append(goodput_config_dict["tpot"] /
                               MILLISECONDS_TO_SECONDS_CONVERSION)
-        if "e2el" in gootput_config_dict:
+        if "e2el" in goodput_config_dict:
             valid_metrics.append(e2els)
-            slo_values.append(gootput_config_dict["e2el"] /
+            slo_values.append(goodput_config_dict["e2el"] /
                               MILLISECONDS_TO_SECONDS_CONVERSION)
 
         for req_metric in zip(*valid_metrics):
@@ -537,7 +542,7 @@ async def benchmark(
     selected_percentile_metrics: List[str],
     selected_percentiles: List[str],
     ignore_eos: bool,
-    gootput_config_dict: Dict[str, float],
+    goodput_config_dict: Dict[str, float],
     max_concurrency: Optional[int],
 ):
     if backend in ASYNC_REQUEST_FUNCS:
@@ -661,7 +666,7 @@ async def limited_request_func(request_func_input, pbar):
         tokenizer=tokenizer,
         selected_percentile_metrics=selected_percentile_metrics,
         selected_percentiles=selected_percentiles,
-        gootput_config_dict=gootput_config_dict,
+        goodput_config_dict=goodput_config_dict,
     )
 
     print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='='))
@@ -673,7 +678,7 @@ async def limited_request_func(request_func_input, pbar):
                                  metrics.total_output))
     print("{:<40} {:<10.2f}".format("Request throughput (req/s):",
                                     metrics.request_throughput))
-    if gootput_config_dict:
+    if goodput_config_dict:
         print("{:<40} {:<10.2f}".format("Request goodput (req/s):",
                                         metrics.request_goodput))
     print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):",
@@ -688,7 +693,7 @@ async def limited_request_func(request_func_input, pbar):
         "total_output_tokens": metrics.total_output,
         "request_throughput": metrics.request_throughput,
         "request_goodput:":
-        metrics.request_goodput if gootput_config_dict else None,
+        metrics.request_goodput if goodput_config_dict else None,
         "output_throughput": metrics.output_throughput,
         "total_token_throughput": metrics.total_token_throughput,
         "input_lens": [output.prompt_len for output in outputs],
@@ -744,11 +749,11 @@ def process_one_metric(
 
 def check_goodput_args(args):
     # Check and parse goodput arguments
-    gootput_config_dict = {}
+    goodput_config_dict = {}
     VALID_NAMES = ["ttft", "tpot", "e2el"]
     if args.goodput:
-        gootput_config_dict = parse_goodput(args.goodput)
-        for slo_name, slo_val in gootput_config_dict.items():
+        goodput_config_dict = parse_goodput(args.goodput)
+        for slo_name, slo_val in goodput_config_dict.items():
             if slo_name not in VALID_NAMES:
                 raise ValueError(
                     f"Invalid metric name found, {slo_name}: {slo_val}. "
@@ -759,22 +764,22 @@ def check_goodput_args(args):
                     f"Invalid value found, {slo_name}: {slo_val}. "
                     "The service level objective value should be "
                     "non-negative.")
-    return gootput_config_dict
+    return goodput_config_dict
 
 
 def parse_goodput(slo_pairs):
-    gootput_config_dict = {}
+    goodput_config_dict = {}
     try:
         for slo_pair in slo_pairs:
             slo_name, slo_val = slo_pair.split(":")
-            gootput_config_dict[slo_name] = float(slo_val)
+            goodput_config_dict[slo_name] = float(slo_val)
     except ValueError as err:
         raise argparse.ArgumentTypeError(
             "Invalid format found for service level objectives. "
             "Specify service level objectives for goodput as \"KEY:VALUE\" "
             "pairs, where the key is a metric name, and the value is a "
             "number in milliseconds.") from err
-    return gootput_config_dict
+    return goodput_config_dict
 
 
 def main(args: argparse.Namespace):
@@ -874,7 +879,11 @@ def main(args: argparse.Namespace):
     else:
         raise ValueError(f"Unknown dataset: {args.dataset_name}")
 
-    gootput_config_dict = check_goodput_args(args)
+    goodput_config_dict = check_goodput_args(args)
+
+    # Avoid GC processing "static" data - reduce pause times.
+    gc.collect()
+    gc.freeze()
 
     benchmark_result = asyncio.run(
         benchmark(
@@ -896,7 +905,7 @@ def main(args: argparse.Namespace):
                 float(p) for p in args.metric_percentiles.split(",")
             ],
             ignore_eos=args.ignore_eos,
-            gootput_config_dict=gootput_config_dict,
+            goodput_config_dict=goodput_config_dict,
             max_concurrency=args.max_concurrency,
         ))
 

From 66818e5b63818a286756653816772faa8622ad89 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Wed, 22 Jan 2025 14:13:52 +0800
Subject: [PATCH 327/462] [core] separate builder init and builder prepare for
 each batch (#12253)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 vllm/attention/backends/abstract.py         | 11 ++++---
 vllm/attention/backends/flash_attn.py       | 11 ++++---
 vllm/attention/backends/flashinfer.py       | 14 ++++----
 vllm/attention/backends/placeholder_attn.py |  8 +++--
 vllm/attention/backends/torch_sdpa.py       |  5 ++-
 vllm/attention/backends/utils.py            | 13 ++++----
 vllm/worker/cpu_model_runner.py             | 24 ++++++++++----
 vllm/worker/model_runner.py                 | 36 ++++++++++++++-------
 vllm/worker/model_runner_base.py            |  5 +++
 vllm/worker/xpu_model_runner.py             | 10 ++++--
 10 files changed, 90 insertions(+), 47 deletions(-)

diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py
index e6ddca69bf01b..2efe142a17b69 100644
--- a/vllm/attention/backends/abstract.py
+++ b/vllm/attention/backends/abstract.py
@@ -65,11 +65,6 @@ def make_metadata(cls, *args, **kwargs) -> "AttentionMetadata":
     def get_builder_cls() -> Type["AttentionMetadataBuilder"]:
         raise NotImplementedError
 
-    @classmethod
-    def make_metadata_builder(cls, *args,
-                              **kwargs) -> "AttentionMetadataBuilder":
-        return cls.get_builder_cls()(*args, **kwargs)
-
     @staticmethod
     @abstractmethod
     def get_kv_cache_shape(
@@ -214,6 +209,12 @@ class AttentionMetadataBuilder(ABC, Generic[T]):
 
     @abstractmethod
     def __init__(self, input_builder: "ModelRunnerInputBuilderBase") -> None:
+        """Create the builder, remember some configuration and parameters."""
+        raise NotImplementedError
+
+    @abstractmethod
+    def prepare(self) -> None:
+        """Prepare for one batch."""
         raise NotImplementedError
 
     @abstractmethod
diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py
index 40250ef08b595..60ed09d0cc44f 100644
--- a/vllm/attention/backends/flash_attn.py
+++ b/vllm/attention/backends/flash_attn.py
@@ -375,6 +375,12 @@ class FlashAttentionMetadataBuilder(
         AttentionMetadataBuilder[FlashAttentionMetadata]):
 
     def __init__(self, input_builder: "ModelInputForGPUBuilder"):
+        self.input_builder = input_builder
+        self.runner = input_builder.runner
+        self.sliding_window = input_builder.sliding_window
+        self.block_size = input_builder.block_size
+
+    def prepare(self):
         self.slot_mapping: List[int] = []
         self.prefill_seq_lens: List[int] = []
         self.context_lens: List[int] = []
@@ -388,11 +394,6 @@ def __init__(self, input_builder: "ModelInputForGPUBuilder"):
         self.num_decode_tokens = 0
         self.has_prefix_cache_hit = False
 
-        self.input_builder = input_builder
-        self.runner = input_builder.runner
-        self.sliding_window = input_builder.sliding_window
-        self.block_size = input_builder.block_size
-
     def _add_seq_group(
             self, inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup",
             chunked_prefill_enabled: bool, prefix_cache_hit: bool):
diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py
index b9cd805e81b45..b8ffbe6dd64dd 100644
--- a/vllm/attention/backends/flashinfer.py
+++ b/vllm/attention/backends/flashinfer.py
@@ -488,6 +488,14 @@ def advance_step(self,
 class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
 
     def __init__(self, input_builder: "ModelInputForGPUBuilder"):
+
+        self.input_builder = input_builder
+        self.runner = input_builder.runner
+
+        self.sliding_window = input_builder.sliding_window
+        self.block_size = input_builder.block_size
+
+    def prepare(self):
         self.slot_mapping: List[int] = []
         self.prefill_seq_lens: List[int] = []
         self.context_lens: List[int] = []
@@ -500,12 +508,6 @@ def __init__(self, input_builder: "ModelInputForGPUBuilder"):
         self.num_prefill_tokens = 0
         self.num_decode_tokens = 0
 
-        self.input_builder = input_builder
-        self.runner = input_builder.runner
-
-        self.sliding_window = input_builder.sliding_window
-        self.block_size = input_builder.block_size
-
         # Please follow https://docs.flashinfer.ai/tutorials/kv_layout.html#page-layout
         # for the precise definition of the following fields.
         # An example:
diff --git a/vllm/attention/backends/placeholder_attn.py b/vllm/attention/backends/placeholder_attn.py
index 534f79b3a60bf..37860494702cf 100644
--- a/vllm/attention/backends/placeholder_attn.py
+++ b/vllm/attention/backends/placeholder_attn.py
@@ -253,6 +253,11 @@ class PlaceholderAttentionMetadataBuilder(
         AttentionMetadataBuilder[PlaceholderAttentionMetadata]):
 
     def __init__(self, input_builder: "ModelInputForGPUBuilder"):
+
+        self.input_builder = input_builder
+        self.runner = input_builder.runner
+
+    def prepare(self):
         self.prefill_seq_lens: List[int] = []
         self.context_lens: List[int] = []
         self.curr_seq_lens: List[int] = []
@@ -263,9 +268,6 @@ def __init__(self, input_builder: "ModelInputForGPUBuilder"):
         self.num_prefill_tokens = 0
         self.num_decode_tokens = 0
 
-        self.input_builder = input_builder
-        self.runner = input_builder.runner
-
     def _add_seq_group(
             self, inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup",
             chunked_prefill_enabled: bool):
diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py
index 7cd2049f0c0a5..8722d7376795a 100644
--- a/vllm/attention/backends/torch_sdpa.py
+++ b/vllm/attention/backends/torch_sdpa.py
@@ -282,7 +282,10 @@ class TorchSDPAMetadataBuilder(AttentionMetadataBuilder[TorchSDPAMetadata]):
 
     def __init__(self, input_builder: ModelInputForCPUBuilder) -> None:
         self.chunked_prefill = input_builder.chunked_prefill
-        self.input_data = input_builder.input_data
+        self.input_builder = input_builder
+
+    def prepare(self):
+        self.input_data = self.input_builder.input_data
 
     def build(self, seq_lens: List[int], query_lens: List[int],
               cuda_graph_pad_size: int, batch_size: int) -> TorchSDPAMetadata:
diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py
index 56cc43430301f..3df7f54cbd8d2 100644
--- a/vllm/attention/backends/utils.py
+++ b/vllm/attention/backends/utils.py
@@ -122,6 +122,13 @@ class CommonMetadataBuilder(AttentionMetadataBuilder[TAttentionMetadata]):
     _metadata_cls: Type[TAttentionMetadata]
 
     def __init__(self, input_builder: "ModelInputForGPUBuilder"):
+        self.input_builder = input_builder
+        self.runner = input_builder.runner
+
+        self.sliding_window = input_builder.sliding_window
+        self.block_size = input_builder.block_size
+
+    def prepare(self):
         self.slot_mapping: List[int] = []
         self.prefill_seq_lens: List[int] = []
         self.context_lens: List[int] = []
@@ -134,12 +141,6 @@ def __init__(self, input_builder: "ModelInputForGPUBuilder"):
         self.num_prefill_tokens = 0
         self.num_decode_tokens = 0
 
-        self.input_builder = input_builder
-        self.runner = input_builder.runner
-
-        self.sliding_window = input_builder.sliding_window
-        self.block_size = input_builder.block_size
-
     def _add_seq_group(
             self, inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup",
             chunked_prefill_enabled: bool):
diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py
index abbf6450ab7f6..4b429b67b36f8 100644
--- a/vllm/worker/cpu_model_runner.py
+++ b/vllm/worker/cpu_model_runner.py
@@ -144,9 +144,7 @@ def __init__(self,
                  runner: "CPUModelRunner",
                  finished_requests_ids: Optional[List[str]] = None) -> None:
         super().__init__()
-        self.seq_group_metadata_list: List[SequenceGroupMetadata] = []
         self.runner = runner
-
         self.chunked_prefill = (runner.scheduler_config.chunked_prefill_enabled
                                 or runner.cache_config.enable_prefix_caching)
         self.model_input_cls = self.runner._model_input_cls
@@ -156,10 +154,17 @@ def __init__(self,
         self.device = self.runner.device
         self.multi_modal_input_mapper = self.runner.multi_modal_input_mapper
         self.enable_lora = self.runner.lora_config is not None
+        if self.runner.attn_backend is not None:
+            # spec decode (e.g. Medusa) does not have atten backend
+            attn_backend = self.runner.attn_backend
+            self.att_metadata_builder = attn_backend.get_builder_cls()(self)
+
+    def prepare(self,
+                finished_requests_ids: Optional[List[str]] = None) -> None:
+        self.seq_group_metadata_list: List[SequenceGroupMetadata] = []
         self.input_data = ModelInputForCPUBuilder.ModelInputData(
             self.runner.model_config.uses_mrope)
-        self.att_metadata_builder = self.runner.attn_backend.get_builder_cls()(
-            self)
+        self.att_metadata_builder.prepare()
 
     def add_seq_group(self, seq_group_metadata: SequenceGroupMetadata):
         self.seq_group_metadata_list.append(seq_group_metadata)
@@ -431,6 +436,7 @@ class CPUModelRunnerBase(ModelRunnerBase[TModelInputForCPU]):
     """
     _model_input_cls: Type[TModelInputForCPU]
     _builder_cls: Type[ModelInputForCPUBuilder]
+    builder: ModelInputForCPUBuilder
 
     def __init__(
         self,
@@ -477,6 +483,10 @@ def __init__(
         # Set after load_model.
         self.lora_manager: Optional[LRUCacheWorkerLoRAManager] = None
 
+        if hasattr(self, "_builder_cls"):
+            # multi-step model runner does not have `_builder_cls`
+            self.builder = self._builder_cls(weakref.proxy(self))
+
     def load_model(self) -> None:
         self.model = get_model(vllm_config=self.vllm_config)
 
@@ -522,10 +532,10 @@ def _prepare_model_input_tensors(
         metadata for possible additional steps, e.g., sampling.
 
         """
-        builder = self._builder_cls(weakref.proxy(self), finished_requests_ids)
-        builder.set_seq_group_list(seq_group_metadata_list)
+        self.builder.prepare(finished_requests_ids)
+        self.builder.set_seq_group_list(seq_group_metadata_list)
 
-        return builder.build()  # type: ignore
+        return self.builder.build()  # type: ignore
 
     # sampler property will be used by spec_decode_worker
     @property
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index cb2ff0c934da3..e311c14111d49 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -457,17 +457,13 @@ def __init__(self,
         self.enable_prompt_adapter = (self.runner.prompt_adapter_config
                                       is not None)
         self.multi_modal_input_mapper = self.runner.multi_modal_input_mapper
-        self.finished_requests_ids = finished_requests_ids
         self.decode_only = True
 
-        # Intermediate data (data in CPU before going to GPU) for
-        # the current sequence group.
-        self.inter_data_list: List[
-            ModelInputForGPUBuilder.InterDataForSeqGroup] = []
-
         # Attention metadata inputs.
-        self.attn_metadata_builder = self.attn_backend.make_metadata_builder(
-            weakref.proxy(self))
+        if self.attn_backend is not None:
+            # spec decode (e.g. Medusa) does not have atten backend
+            self.attn_metadata_builder = self.attn_backend.get_builder_cls()(
+                weakref.proxy(self))
 
         # Engine/Model configurations.
         self.chunked_prefill_enabled = (
@@ -479,6 +475,17 @@ def __init__(self,
             self.block_aligned_sliding_window = \
                 self.sliding_window_blocks * self.block_size
 
+    def prepare(self,
+                finished_requests_ids: Optional[List[str]] = None) -> None:
+        self.finished_requests_ids = finished_requests_ids
+
+        # Intermediate data (data in CPU before going to GPU) for
+        # the current sequence group.
+        self.inter_data_list: List[
+            ModelInputForGPUBuilder.InterDataForSeqGroup] = []
+
+        self.attn_metadata_builder.prepare()
+
     def _compute_lens(self, inter_data: InterDataForSeqGroup, seq_idx: int,
                       seq_group_metadata: SequenceGroupMetadata):
         """Compute context length, sequence length and tokens
@@ -993,6 +1000,7 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
     """
     _model_input_cls: Type[TModelInputForGPU]
     _builder_cls: Type[ModelInputForGPUBuilder]
+    builder: ModelInputForGPUBuilder
 
     def __init__(
         self,
@@ -1093,6 +1101,10 @@ def __init__(
               SamplingMetadataCache() \
                 if self.parallel_config.pipeline_parallel_size == 1 else None
 
+        if hasattr(self, "_builder_cls"):
+            # multi-step model runner does not have `_builder_cls`
+            self.builder = self._builder_cls(weakref.proxy(self))
+
     def load_model(self) -> None:
         logger.info("Starting to load model %s...", self.model_config.model)
         with DeviceMemoryProfiler() as m:
@@ -1226,13 +1238,13 @@ def _prepare_model_input_tensors(
 
         If cuda graph is required, this API automatically pads inputs.
         """
-        builder = self._builder_cls(weakref.proxy(self), finished_requests_ids)
+        self.builder.prepare(finished_requests_ids)
         for seq_group_metadata in seq_group_metadata_list:
-            builder.add_seq_group(seq_group_metadata)
+            self.builder.add_seq_group(seq_group_metadata)
 
-        builder.reset_cached_inter_data()
+        self.builder.reset_cached_inter_data()
 
-        return builder.build()  # type: ignore
+        return self.builder.build()  # type: ignore
 
     @contextmanager
     def set_in_profile_run(self):
diff --git a/vllm/worker/model_runner_base.py b/vllm/worker/model_runner_base.py
index acfd6d0b03f62..aef4bdcdd4bf9 100644
--- a/vllm/worker/model_runner_base.py
+++ b/vllm/worker/model_runner_base.py
@@ -200,6 +200,11 @@ class ModelRunnerInputBuilderBase(ABC, Generic[T]):
     """A builder to create ModelRunnerInputBase objects.
   """
 
+    @abstractmethod
+    def prepare(self,
+                finished_requests_ids: Optional[List[str]] = None) -> None:
+        raise NotImplementedError
+
     @abstractmethod
     def add_seq_group(self, seq_group_metadata):
         """TBA"""
diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py
index 25a2fea1e8eac..053658d047311 100644
--- a/vllm/worker/xpu_model_runner.py
+++ b/vllm/worker/xpu_model_runner.py
@@ -113,7 +113,6 @@ def __init__(self,
                  runner: "XPUModelRunner",
                  finished_requests_ids: Optional[List[str]] = None) -> None:
         super().__init__()
-        self.seq_group_metadata_list: List[SequenceGroupMetadata] = []
         self.runner = runner
         self.model_input_cls = self.runner._model_input_cls
         self.attn_backend = self.runner.attn_backend
@@ -121,6 +120,10 @@ def __init__(self,
         self.block_size = self.runner.block_size
         self.device = self.runner.device
 
+    def prepare(self,
+                finished_requests_ids: Optional[List[str]] = None) -> None:
+        self.seq_group_metadata_list: List[SequenceGroupMetadata] = []
+
     def add_seq_group(self, seq_group_metadata: SequenceGroupMetadata):
         self.seq_group_metadata_list.append(seq_group_metadata)
 
@@ -408,6 +411,8 @@ def __init__(
               SamplingMetadataCache() \
                 if self.parallel_config.pipeline_parallel_size == 1 else None
 
+        self.builder = self._builder_cls(weakref.proxy(self))
+
     def load_model(self) -> None:
         with DeviceMemoryProfiler() as m:
             self.model = get_model(vllm_config=self.vllm_config)
@@ -517,7 +522,8 @@ def _prepare_model_input_tensors(
         metadata for possible additional steps, e.g., sampling.
 
         """
-        builder = self._builder_cls(weakref.proxy(self), finished_requests_ids)
+        builder = self.builder
+        builder.prepare(finished_requests_ids)
         for seq_group_metadata in seq_group_metadata_list:
             builder.add_seq_group(seq_group_metadata)
 

From 4004f144f370f9d42c92e2edce5e5db1b453d57a Mon Sep 17 00:00:00 2001
From: Mengqing Cao <cmq0113@163.com>
Date: Wed, 22 Jan 2025 14:29:31 +0800
Subject: [PATCH 328/462] [Build] update requirements of no-device (#12299)

Signed-off-by: Mengqing Cao <cmq0113@163.com>
---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 978625a069778..66ce22dd65d5e 100644
--- a/setup.py
+++ b/setup.py
@@ -549,7 +549,7 @@ def _read_requirements(filename: str) -> List[str]:
         return resolved_requirements
 
     if _no_device():
-        requirements = _read_requirements("requirements-cuda.txt")
+        requirements = _read_requirements("requirements-cpu.txt")
     elif _is_cuda():
         requirements = _read_requirements("requirements-cuda.txt")
         cuda_major, cuda_minor = torch.version.cuda.split(".")

From 68ad4e3a8d8a66fb2a43be57471ee13a8bec4ec0 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Wed, 22 Jan 2025 14:39:32 +0800
Subject: [PATCH 329/462] [Core] Support fully transparent sleep mode (#11743)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 .buildkite/test-pipeline.yaml         |   2 +
 CMakeLists.txt                        |  25 +++
 csrc/cumem_allocator.cpp              | 310 ++++++++++++++++++++++++++
 setup.py                              |   2 +
 tests/basic_correctness/test_cumem.py | 112 ++++++++++
 vllm/config.py                        |  78 ++++---
 vllm/device_allocator/__init__.py     |   0
 vllm/device_allocator/cumem.py        | 254 +++++++++++++++++++++
 vllm/engine/arg_utils.py              |  11 +-
 vllm/engine/llm_engine.py             |  10 +
 vllm/entrypoints/llm.py               |  23 ++
 vllm/executor/executor_base.py        |  11 +
 vllm/v1/worker/gpu_worker.py          |  40 +++-
 vllm/worker/worker.py                 |  39 +++-
 14 files changed, 877 insertions(+), 40 deletions(-)
 create mode 100644 csrc/cumem_allocator.cpp
 create mode 100644 tests/basic_correctness/test_cumem.py
 create mode 100644 vllm/device_allocator/__init__.py
 create mode 100644 vllm/device_allocator/cumem.py

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index ed8c15358830c..daec46760117d 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -76,7 +76,9 @@ steps:
   - tests/basic_correctness/test_basic_correctness
   - tests/basic_correctness/test_cpu_offload
   - tests/basic_correctness/test_preemption
+  - tests/basic_correctness/test_cumem.py
   commands:
+  - pytest -v -s basic_correctness/test_cumem.py
   - pytest -v -s basic_correctness/test_basic_correctness.py
   - pytest -v -s basic_correctness/test_cpu_offload.py
   - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
diff --git a/CMakeLists.txt b/CMakeLists.txt
index f4b9c3ec9c14f..0945905104f32 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -181,6 +181,31 @@ message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")
 # Define other extension targets
 #
 
+#
+# cumem_allocator extension
+#
+
+set(VLLM_CUMEM_EXT_SRC
+  "csrc/cumem_allocator.cpp")
+
+set_gencode_flags_for_srcs(
+  SRCS "${VLLM_CUMEM_EXT_SRC}"
+  CUDA_ARCHS "${CUDA_ARCHS}")
+
+if(VLLM_GPU_LANG STREQUAL "CUDA")
+  message(STATUS "Enabling cumem allocator extension.")
+  # link against cuda driver library
+  list(APPEND CUMEM_LIBS cuda)
+  define_gpu_extension_target(
+    cumem_allocator
+    DESTINATION vllm
+    LANGUAGE CXX
+    SOURCES ${VLLM_CUMEM_EXT_SRC}
+    LIBRARIES ${CUMEM_LIBS}
+    USE_SABI 3.8
+    WITH_SOABI)
+endif()
+
 #
 # _C extension
 #
diff --git a/csrc/cumem_allocator.cpp b/csrc/cumem_allocator.cpp
new file mode 100644
index 0000000000000..e8555d853b7ac
--- /dev/null
+++ b/csrc/cumem_allocator.cpp
@@ -0,0 +1,310 @@
+// A CUDAPluggableAllocator based on cumem* APIs.
+// Important: allocation size, CUdeviceptr and CUmemGenericAllocationHandle*
+// need to be unsigned long long
+#include <iostream>
+
+extern "C" {
+
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+
+#include <sys/types.h>
+#include <cuda_runtime_api.h>
+#include <cuda.h>
+
+#define CUDA_CHECK(condition)                                                  \
+  do {                                                                         \
+    CUresult error = condition;                                                \
+    if (error != 0) {                                                          \
+      char* error_string;                                                      \
+      cuGetErrorString(error, (const char**)&error_string);                    \
+      std::cerr << "CUDA Error: " << error_string << " at " << __FILE__ << ":" \
+                << __LINE__ << std::endl;                                      \
+    }                                                                          \
+  } while (0)
+
+// Global references to Python callables
+// NOTE: this is borrowed reference, so we don't need to DECREF them.
+// This brings the limitation that the allocator needs to be singleton.
+static PyObject* g_python_malloc_callback = nullptr;
+static PyObject* g_python_free_callback = nullptr;
+
+// ---------------------------------------------------------------------------
+// Helper functions:
+
+void ensure_context(unsigned long long device) {
+  CUcontext pctx;
+  CUDA_CHECK(cuCtxGetCurrent(&pctx));
+  if (!pctx) {
+    // Ensure device context.
+    CUDA_CHECK(cuDevicePrimaryCtxRetain(&pctx, device));
+    CUDA_CHECK(cuCtxSetCurrent(pctx));
+  }
+}
+
+void create_and_map(unsigned long long device, ssize_t size, CUdeviceptr d_mem,
+                    CUmemGenericAllocationHandle* p_memHandle) {
+  ensure_context(device);
+  // Define memory allocation properties
+  CUmemAllocationProp prop = {};
+  prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
+  prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+  prop.location.id = device;
+  prop.allocFlags.compressionType = CU_MEM_ALLOCATION_COMP_NONE;
+
+  // Allocate memory using cuMemCreate
+  CUDA_CHECK(cuMemCreate(p_memHandle, size, &prop, 0));
+  CUDA_CHECK(cuMemMap(d_mem, size, 0, *p_memHandle, 0));
+
+  CUmemAccessDesc accessDesc = {};
+  accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+  accessDesc.location.id = device;
+  accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+
+  CUDA_CHECK(cuMemSetAccess(d_mem, size, &accessDesc, 1));
+  // std::cout << "create_and_map: device=" << device << ", size=" << size << ",
+  // d_mem=" << d_mem << ", p_memHandle=" << p_memHandle << std::endl;
+}
+
+void unmap_and_release(unsigned long long device, ssize_t size,
+                       CUdeviceptr d_mem,
+                       CUmemGenericAllocationHandle* p_memHandle) {
+  // std::cout << "unmap_and_release: device=" << device << ", size=" << size <<
+  // ", d_mem=" << d_mem << ", p_memHandle=" << p_memHandle << std::endl;
+  ensure_context(device);
+  CUDA_CHECK(cuMemUnmap(d_mem, size));
+  CUDA_CHECK(cuMemRelease(*p_memHandle));
+}
+
+PyObject* create_tuple_from_c_integers(unsigned long long a,
+                                       unsigned long long b,
+                                       unsigned long long c,
+                                       unsigned long long d) {
+  // Create a new tuple of size 4
+  PyObject* tuple = PyTuple_New(4);
+  if (!tuple) {
+    return NULL;  // Return NULL on failure
+  }
+
+  // Convert integers to Python objects and set them in the tuple
+  PyTuple_SetItem(
+      tuple, 0,
+      PyLong_FromUnsignedLongLong(a));  // Steals reference to the PyLong
+  PyTuple_SetItem(tuple, 1, PyLong_FromUnsignedLongLong(b));
+  PyTuple_SetItem(tuple, 2, PyLong_FromUnsignedLongLong(c));
+  PyTuple_SetItem(tuple, 3, PyLong_FromUnsignedLongLong(d));
+
+  // Note: PyTuple_SetItem "steals" a reference to each object,
+  // so we do not need to Py_DECREF the PyLong objects explicitly.
+
+  return tuple;  // Return the created tuple
+}
+
+// ---------------------------------------------------------------------------
+// Our exported C functions that call Python:
+
+// use CUstream instead of cudaStream_t, to avoid including cuda_runtime_api.h
+void* my_malloc(ssize_t size, int device, CUstream stream) {
+  ensure_context(device);
+
+  // first allocation, align the size, and reserve an address, and also allocate
+  // a CUmemGenericAllocationHandle
+
+  // Define memory allocation properties
+  CUmemAllocationProp prop = {};
+  prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
+  prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+  prop.location.id = device;
+  prop.allocFlags.compressionType = CU_MEM_ALLOCATION_COMP_NONE;
+
+  // Check if the allocation is supported
+  size_t granularity;
+  CUDA_CHECK(cuMemGetAllocationGranularity(&granularity, &prop,
+                                           CU_MEM_ALLOC_GRANULARITY_MINIMUM));
+
+  size_t alignedSize = ((size + granularity - 1) / granularity) * granularity;
+
+  CUdeviceptr d_mem;
+  CUDA_CHECK(cuMemAddressReserve(&d_mem, alignedSize, 0, 0, 0));
+
+  // allocate the CUmemGenericAllocationHandle
+  CUmemGenericAllocationHandle* p_memHandle =
+      (CUmemGenericAllocationHandle*)malloc(
+          sizeof(CUmemGenericAllocationHandle));
+
+  if (!g_python_malloc_callback) {
+    std::cerr << "ERROR: g_python_malloc_callback not set.\n";
+    return nullptr;
+  }
+
+  // Acquire GIL (not in stable ABI officially, but often works)
+  PyGILState_STATE gstate = PyGILState_Ensure();
+
+  PyObject* arg_tuple = create_tuple_from_c_integers(
+      (unsigned long long)device, (unsigned long long)alignedSize,
+      (unsigned long long)d_mem, (unsigned long long)p_memHandle);
+
+  // Call g_python_malloc_callback
+  PyObject* py_result =
+      PyObject_CallFunctionObjArgs(g_python_malloc_callback, arg_tuple, NULL);
+  Py_DECREF(arg_tuple);
+
+  if (!py_result) {
+    PyErr_Print();
+    PyGILState_Release(gstate);
+    return nullptr;
+  }
+
+  PyGILState_Release(gstate);
+
+  // do the final mapping
+  create_and_map(device, alignedSize, d_mem, p_memHandle);
+
+  return (void*)d_mem;
+}
+
+// use CUstream instead of cudaStream_t, to avoid including cuda_runtime_api.h
+void my_free(void* ptr, ssize_t size, int device, CUstream stream) {
+  // get memory handle from the pointer
+  if (!g_python_free_callback) {
+    std::cerr << "ERROR: g_python_free_callback not set.\n";
+    return;
+  }
+
+  // Acquire GIL (not in stable ABI officially, but often works)
+  PyGILState_STATE gstate = PyGILState_Ensure();
+
+  PyObject* py_ptr =
+      PyLong_FromUnsignedLongLong(reinterpret_cast<unsigned long long>(ptr));
+
+  PyObject* py_result =
+      PyObject_CallFunctionObjArgs(g_python_free_callback, py_ptr, NULL);
+
+  if (!py_result || !PyTuple_Check(py_result) || PyTuple_Size(py_result) != 4) {
+    PyErr_SetString(PyExc_TypeError, "Expected a tuple of size 4");
+    return;
+  }
+
+  unsigned long long recv_device, recv_size;
+  unsigned long long recv_d_mem, recv_p_memHandle;
+  // Unpack the tuple into four C integers
+  if (!PyArg_ParseTuple(py_result, "KKKK", &recv_device, &recv_size,
+                        &recv_d_mem, &recv_p_memHandle)) {
+    // PyArg_ParseTuple sets an error if it fails
+    return;
+  }
+
+  PyGILState_Release(gstate);
+
+  // recv_size == size
+  // recv_device == device
+
+  // Free memory
+
+  CUdeviceptr d_mem = (CUdeviceptr)recv_d_mem;
+  CUmemGenericAllocationHandle* p_memHandle =
+      (CUmemGenericAllocationHandle*)recv_p_memHandle;
+  unmap_and_release(device, size, d_mem, p_memHandle);
+
+  // free address and the handle
+  CUDA_CHECK(cuMemAddressFree(d_mem, size));
+  free(p_memHandle);
+}
+
+// ---------------------------------------------------------------------------
+// Python extension boilerplate:
+
+// Python-exposed function: init_module(python_malloc, python_free)
+static PyObject* py_init_module(PyObject* self, PyObject* args) {
+  PyObject* malloc_callback = nullptr;
+  PyObject* free_callback = nullptr;
+
+  if (!PyArg_ParseTuple(args, "OO", &malloc_callback, &free_callback)) {
+    return nullptr;
+  }
+
+  if (!PyCallable_Check(malloc_callback) || !PyCallable_Check(free_callback)) {
+    PyErr_SetString(PyExc_TypeError, "Both arguments must be callables");
+    return nullptr;
+  }
+
+  // Save the Python callables
+  // This module does not handle GC of these objects, so they must be kept alive
+  // outside of this module.
+  g_python_malloc_callback = malloc_callback;
+  g_python_free_callback = free_callback;
+
+  Py_RETURN_NONE;
+}
+
+static PyObject* python_unmap_and_release(PyObject* self, PyObject* args) {
+  if (!args || !PyTuple_Check(args) || PyTuple_Size(args) != 4) {
+    PyErr_SetString(PyExc_TypeError, "Expected a tuple of size 4");
+    return nullptr;
+  }
+
+  unsigned long long recv_device, recv_size;
+  unsigned long long recv_d_mem, recv_p_memHandle;
+  // Unpack the tuple into four C integers
+  if (!PyArg_ParseTuple(args, "KKKK", &recv_device, &recv_size, &recv_d_mem,
+                        &recv_p_memHandle)) {
+    // PyArg_ParseTuple sets an error if it fails
+    return nullptr;
+  }
+
+  CUdeviceptr d_mem_ptr = (CUdeviceptr)recv_d_mem;
+  CUmemGenericAllocationHandle* p_memHandle =
+      (CUmemGenericAllocationHandle*)recv_p_memHandle;
+
+  unmap_and_release(recv_device, recv_size, d_mem_ptr, p_memHandle);
+
+  Py_RETURN_NONE;
+}
+
+static PyObject* python_create_and_map(PyObject* self, PyObject* args) {
+  if (!args || !PyTuple_Check(args) || PyTuple_Size(args) != 4) {
+    PyErr_SetString(PyExc_TypeError, "Expected a tuple of size 4");
+    return nullptr;
+  }
+
+  unsigned long long recv_device, recv_size;
+  unsigned long long recv_d_mem, recv_p_memHandle;
+  // Unpack the tuple into four C integers
+  if (!PyArg_ParseTuple(args, "KKKK", &recv_device, &recv_size, &recv_d_mem,
+                        &recv_p_memHandle)) {
+    // PyArg_ParseTuple sets an error if it fails
+    return nullptr;
+  }
+
+  CUdeviceptr d_mem_ptr = (CUdeviceptr)recv_d_mem;
+  CUmemGenericAllocationHandle* p_memHandle =
+      (CUmemGenericAllocationHandle*)recv_p_memHandle;
+
+  create_and_map(recv_device, recv_size, d_mem_ptr, p_memHandle);
+
+  Py_RETURN_NONE;
+}
+
+static PyMethodDef module_methods[] = {
+    {"init_module", (PyCFunction)py_init_module, METH_VARARGS,
+     "Initialize module with python_malloc and python_free callables."},
+    {"python_create_and_map", (PyCFunction)python_create_and_map, METH_VARARGS,
+     "Create and map memory on the device."},
+    {"python_unmap_and_release", (PyCFunction)python_unmap_and_release,
+     METH_VARARGS, "Unmap and release memory on the device."},
+    {NULL, NULL, 0, NULL}  // sentinel
+};
+
+static struct PyModuleDef cumem_allocator_module = {
+    PyModuleDef_HEAD_INIT, "cumem_allocator",
+    "cumem-based allocator for CUDAPluggableAllocator", -1, module_methods};
+
+PyMODINIT_FUNC PyInit_cumem_allocator(void) {
+  // Initialize the module
+  PyObject* module = PyModule_Create(&cumem_allocator_module);
+  if (!module) {
+    return NULL;
+  }
+  return module;
+}
+}  // extern "C"
diff --git a/setup.py b/setup.py
index 66ce22dd65d5e..dde5072139660 100644
--- a/setup.py
+++ b/setup.py
@@ -301,6 +301,7 @@ def run(self) -> None:
                 "vllm/vllm_flash_attn/vllm_flash_attn_c.abi3.so",
                 "vllm/vllm_flash_attn/flash_attn_interface.py",
                 "vllm/vllm_flash_attn/__init__.py",
+                "vllm/cumem_allocator.abi3.so",
                 # "vllm/_version.py", # not available in nightly wheels yet
             ]
             file_members = filter(lambda x: x.filename in files_to_copy,
@@ -594,6 +595,7 @@ def _read_requirements(filename: str) -> List[str]:
 if _is_cuda():
     ext_modules.append(
         CMakeExtension(name="vllm.vllm_flash_attn.vllm_flash_attn_c"))
+    ext_modules.append(CMakeExtension(name="vllm.cumem_allocator"))
 
 if _build_custom_ops():
     ext_modules.append(CMakeExtension(name="vllm._C"))
diff --git a/tests/basic_correctness/test_cumem.py b/tests/basic_correctness/test_cumem.py
new file mode 100644
index 0000000000000..53f4ef08f36a2
--- /dev/null
+++ b/tests/basic_correctness/test_cumem.py
@@ -0,0 +1,112 @@
+import torch
+
+from vllm import LLM, SamplingParams
+from vllm.device_allocator.cumem import CuMemAllocator
+from vllm.utils import GiB_bytes
+
+from ..utils import fork_new_process_for_each_test
+
+
+@fork_new_process_for_each_test
+def test_basic_cumem():
+    # some tensors from default memory pool
+    shape = (1024, 1024)
+    x = torch.empty(shape, device='cuda')
+    x.zero_()
+
+    # some tensors from custom memory pool
+    allocator = CuMemAllocator.get_instance()
+    with allocator.use_memory_pool():
+        # custom memory pool
+        y = torch.empty(shape, device='cuda')
+        y.zero_()
+        y += 1
+        z = torch.empty(shape, device='cuda')
+        z.zero_()
+        z += 2
+
+    # they can be used together
+    output = x + y + z
+    assert torch.allclose(output, torch.ones_like(output) * 3)
+
+    free_bytes = torch.cuda.mem_get_info()[0]
+    allocator.sleep()
+    free_bytes_after_sleep = torch.cuda.mem_get_info()[0]
+    assert free_bytes_after_sleep > free_bytes
+    allocator.wake_up()
+
+    # they can be used together
+    output = x + y + z
+    assert torch.allclose(output, torch.ones_like(output) * 3)
+
+
+@fork_new_process_for_each_test
+def test_cumem_with_cudagraph():
+    allocator = CuMemAllocator.get_instance()
+    with allocator.use_memory_pool():
+        weight = torch.eye(1024, device='cuda')
+    with allocator.use_memory_pool(tag="discard"):
+        cache = torch.empty(1024, 1024, device='cuda')
+
+    def model(x):
+        out = x @ weight
+        cache[:out.size(0)].copy_(out)
+        return out + 1
+
+    x = torch.empty(128, 1024, device='cuda')
+
+    # warmup
+    model(x)
+
+    # capture cudagraph
+    model_graph = torch.cuda.CUDAGraph()
+    with torch.cuda.graph(model_graph):
+        y = model(x)
+
+    free_bytes = torch.cuda.mem_get_info()[0]
+    allocator.sleep()
+    free_bytes_after_sleep = torch.cuda.mem_get_info()[0]
+    assert free_bytes_after_sleep > free_bytes
+    allocator.wake_up()
+
+    # after waking up, the content in the weight tensor
+    # should be restored, but the content in the cache tensor
+    # should be discarded
+
+    # this operation is also compatible with cudagraph
+
+    x.random_()
+    model_graph.replay()
+
+    # cache content is as expected
+    assert torch.allclose(x, cache[:x.size(0)])
+
+    # output content is as expected
+    assert torch.allclose(y, x + 1)
+
+
+@fork_new_process_for_each_test
+def test_end_to_end():
+    free, total = torch.cuda.mem_get_info()
+    used_bytes_baseline = total - free  # in case other process is running
+    llm = LLM("meta-llama/Llama-3.2-1B", enable_sleep_mode=True)
+    prompt = "How are you?"
+    sampling_params = SamplingParams(temperature=0, max_tokens=10)
+    output = llm.generate(prompt, sampling_params)
+
+    # the benefit of `llm.sleep(level=2)` is mainly CPU memory usage,
+    # which is difficult to measure in the test. therefore, we only
+    # test sleep level 1 here.
+    llm.sleep(level=1)
+
+    free_gpu_bytes_after_sleep, total = torch.cuda.mem_get_info()
+    used_bytes = total - free_gpu_bytes_after_sleep - used_bytes_baseline
+    # now the memory usage is mostly cudagraph memory pool,
+    # and it should be less than the model weights (1B model, 2GiB weights)
+    assert used_bytes < 2 * GiB_bytes
+
+    llm.wake_up()
+    output2 = llm.generate(prompt, sampling_params)
+
+    # cmp output
+    assert output[0].outputs[0].text == output2[0].outputs[0].text
diff --git a/vllm/config.py b/vllm/config.py
index b8628db4d2b80..69577505fc9bd 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -195,40 +195,43 @@ def compute_hash(self) -> str:
         factors.append(self.rope_theta)
         return hashlib.sha256(str(factors).encode()).hexdigest()
 
-    def __init__(self,
-                 model: str,
-                 task: Union[TaskOption, Literal["draft"]],
-                 tokenizer: str,
-                 tokenizer_mode: str,
-                 trust_remote_code: bool,
-                 dtype: Union[str, torch.dtype],
-                 seed: int,
-                 allowed_local_media_path: str = "",
-                 revision: Optional[str] = None,
-                 code_revision: Optional[str] = None,
-                 rope_scaling: Optional[Dict[str, Any]] = None,
-                 rope_theta: Optional[float] = None,
-                 tokenizer_revision: Optional[str] = None,
-                 max_model_len: Optional[int] = None,
-                 spec_target_max_model_len: Optional[int] = None,
-                 quantization: Optional[str] = None,
-                 quantization_param_path: Optional[str] = None,
-                 enforce_eager: Optional[bool] = None,
-                 max_seq_len_to_capture: Optional[int] = None,
-                 max_logprobs: int = 20,
-                 disable_sliding_window: bool = False,
-                 skip_tokenizer_init: bool = False,
-                 served_model_name: Optional[Union[str, List[str]]] = None,
-                 limit_mm_per_prompt: Optional[Mapping[str, int]] = None,
-                 use_async_output_proc: bool = True,
-                 config_format: ConfigFormat = ConfigFormat.AUTO,
-                 hf_overrides: Optional[HfOverrides] = None,
-                 mm_processor_kwargs: Optional[Dict[str, Any]] = None,
-                 disable_mm_preprocessor_cache: bool = False,
-                 override_neuron_config: Optional[Dict[str, Any]] = None,
-                 override_pooler_config: Optional["PoolerConfig"] = None,
-                 logits_processor_pattern: Optional[str] = None,
-                 generation_config: Optional[str] = None) -> None:
+    def __init__(
+        self,
+        model: str,
+        task: Union[TaskOption, Literal["draft"]],
+        tokenizer: str,
+        tokenizer_mode: str,
+        trust_remote_code: bool,
+        dtype: Union[str, torch.dtype],
+        seed: int,
+        allowed_local_media_path: str = "",
+        revision: Optional[str] = None,
+        code_revision: Optional[str] = None,
+        rope_scaling: Optional[Dict[str, Any]] = None,
+        rope_theta: Optional[float] = None,
+        tokenizer_revision: Optional[str] = None,
+        max_model_len: Optional[int] = None,
+        spec_target_max_model_len: Optional[int] = None,
+        quantization: Optional[str] = None,
+        quantization_param_path: Optional[str] = None,
+        enforce_eager: Optional[bool] = None,
+        max_seq_len_to_capture: Optional[int] = None,
+        max_logprobs: int = 20,
+        disable_sliding_window: bool = False,
+        skip_tokenizer_init: bool = False,
+        served_model_name: Optional[Union[str, List[str]]] = None,
+        limit_mm_per_prompt: Optional[Mapping[str, int]] = None,
+        use_async_output_proc: bool = True,
+        config_format: ConfigFormat = ConfigFormat.AUTO,
+        hf_overrides: Optional[HfOverrides] = None,
+        mm_processor_kwargs: Optional[Dict[str, Any]] = None,
+        disable_mm_preprocessor_cache: bool = False,
+        override_neuron_config: Optional[Dict[str, Any]] = None,
+        override_pooler_config: Optional["PoolerConfig"] = None,
+        logits_processor_pattern: Optional[str] = None,
+        generation_config: Optional[str] = None,
+        enable_sleep_mode: bool = False,
+    ) -> None:
         self.model = model
         self.tokenizer = tokenizer
         self.tokenizer_mode = tokenizer_mode
@@ -277,6 +280,12 @@ def __init__(self,
         self.max_logprobs = max_logprobs
         self.disable_sliding_window = disable_sliding_window
         self.skip_tokenizer_init = skip_tokenizer_init
+        self.enable_sleep_mode = enable_sleep_mode
+
+        from vllm.platforms import current_platform
+
+        if self.enable_sleep_mode and not current_platform.is_cuda():
+            raise ValueError("Sleep mode is only supported on CUDA devices.")
 
         hf_config = get_config(self.model, trust_remote_code, revision,
                                code_revision, config_format)
@@ -348,7 +357,6 @@ def __init__(self,
         self.is_hybrid = self._init_is_hybrid()
         self.has_inner_state = self._init_has_inner_state()
 
-        from vllm.platforms import current_platform
         if current_platform.is_neuron():
             self.override_neuron_config = override_neuron_config
         else:
diff --git a/vllm/device_allocator/__init__.py b/vllm/device_allocator/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/vllm/device_allocator/cumem.py b/vllm/device_allocator/cumem.py
new file mode 100644
index 0000000000000..a43418dbb3b46
--- /dev/null
+++ b/vllm/device_allocator/cumem.py
@@ -0,0 +1,254 @@
+# cumem-based pytorch pluggable allocator to implement sleep mode.
+# other approaches tried but failed:
+# - cuda-python package binding
+# - custom libcuda driver ctypes wrapper
+# both of them failed because of cuda context mismatch.
+# not sure why, they are created from a different context.
+# the only successful approach is to call cuda driver API in C.
+import dataclasses
+from contextlib import contextmanager
+from typing import Callable, Dict, Optional, Tuple, Union
+
+import torch
+
+from vllm.utils import is_pin_memory_available
+
+
+def find_loaded_library(lib_name) -> Optional[str]:
+    """
+    According to according to https://man7.org/linux/man-pages/man5/proc_pid_maps.5.html,
+    the file `/proc/self/maps` contains the memory maps of the process, which includes the
+    shared libraries loaded by the process. We can use this file to find the path of the
+    a loaded library.
+    """ # noqa
+    found_line = None
+    with open("/proc/self/maps") as f:
+        for line in f:
+            if lib_name in line:
+                found_line = line
+                break
+    if found_line is None:
+        # the library is not loaded in the current process
+        return None
+    # if lib_name is libcudart, we need to match a line with:
+    # address /path/to/libcudart-hash.so.11.0
+    start = found_line.index("/")
+    path = found_line[start:].strip()
+    filename = path.split("/")[-1]
+    assert filename.rpartition(".so")[0].startswith(lib_name), \
+        f"Unexpected filename: {filename} for library {lib_name}"
+    return path
+
+
+cumem_available = False
+try:
+    from vllm.cumem_allocator import (init_module, python_create_and_map,
+                                      python_unmap_and_release)
+    from vllm.distributed.device_communicators.cuda_wrapper import (
+        CudaRTLibrary)
+    lib_name = find_loaded_library("cumem_allocator")
+    libcudart = CudaRTLibrary()
+    cumem_available = True
+except ModuleNotFoundError:
+    # rocm platform does not support cumem allocator
+    init_module = None
+    python_create_and_map = None
+    python_unmap_and_release = None
+    CudaRTLibrary = None
+    lib_name = None
+    libcudart = None
+
+# py_device, py_alignedSize, py_d_mem, py_p_memHandle
+HandleType = Tuple[int, int, int, int]
+
+
+@dataclasses.dataclass
+class AllocationData:
+    handle: HandleType
+    tag: str
+    cpu_backup_tensor: Optional[torch.Tensor] = None
+
+
+def create_and_map(allocation_handle: HandleType) -> None:
+    python_create_and_map(*allocation_handle)
+
+
+def unmap_and_release(allocation_handle: HandleType) -> None:
+    python_unmap_and_release(*allocation_handle)
+
+
+def get_pluggable_allocator(
+    python_malloc_fn: Callable[[int],
+                               int], python_free_func: Callable[[int, int],
+                                                                None]
+) -> torch.cuda.memory.CUDAPluggableAllocator:
+    init_module(python_malloc_fn, python_free_func)
+    new_alloc = torch.cuda.memory.CUDAPluggableAllocator(
+        lib_name, 'my_malloc', 'my_free')
+    return new_alloc
+
+
+@contextmanager
+def use_memory_pool_with_allocator(
+        python_malloc_fn: Callable[[int], int],
+        python_free_func: Callable[[int, int], None]) -> None:
+    new_alloc = get_pluggable_allocator(python_malloc_fn, python_free_func)
+    mem_pool = torch.cuda.memory.MemPool(new_alloc._allocator)
+    with torch.cuda.memory.use_mem_pool(mem_pool):
+        yield mem_pool
+
+
+class CuMemAllocator:
+    """
+    A singleton class that manages a memory pool for CUDA tensors.
+    The memory in this pool can be offloaded or discarded when the
+    allocator sleeps.
+
+    Inside the `use_memory_pool(tag)` context, all tensors created will
+    be allocated in the memory pool, and has the same tag as the
+    tag passed to the context.
+
+    When we call `sleep`, all tensors with the specified tag will be
+    offloaded to CPU memory, and the rest of the tensors will be discarded.
+    When we call `wake_up`, all tensors that are previously offloaded
+    will be loaded back to GPU memory, and the rest of the tensors will
+    have empty memory.
+
+    Why it needs to be a singleton?
+    When allocated tensors are garbage collected, PyTorch will call
+    the free callback, which will call the `python_free_callback` method.
+    The C-extension uses a global variable to store the function of an
+    instance of this class. If we create multiple instances of this class,
+    the global variable will be overwritten and the free callback will
+    not work as expected.
+    """
+    instance: "CuMemAllocator" = None
+    default_tag: str = "default"
+
+    @staticmethod
+    def get_instance() -> "CuMemAllocator":
+        """
+        CuMemAllocator is a singleton class.
+        We cannot call the constructor directly.
+        Call this method to get the instance.
+        """
+        assert cumem_available, "cumem allocator is not available"
+        if CuMemAllocator.instance is None:
+            CuMemAllocator.instance = CuMemAllocator()
+        return CuMemAllocator.instance
+
+    def __init__(self):
+        self.pointer_to_data: Dict[int, AllocationData] = {}
+        self.current_tag: str = CuMemAllocator.default_tag
+
+    def python_malloc_callback(self, allocation_handle: HandleType) -> None:
+        """
+        Internal method to store the allocation data
+        when memory is allocated in the memory pool."""
+        py_d_mem = allocation_handle[2]
+        self.pointer_to_data[py_d_mem] = AllocationData(
+            allocation_handle, self.current_tag)
+        return
+
+    def python_free_callback(self, ptr: int) -> HandleType:
+        """
+        Internal method to look up the allocation data
+        when memory is freed in the memory pool."""
+        data = self.pointer_to_data.pop(ptr)
+        if data.cpu_backup_tensor is not None:
+            data.cpu_backup_tensor = None
+        return data.handle
+
+    def sleep(
+            self,
+            offload_tags: Optional[Union[Tuple[str, ...],
+                                         str]] = None) -> None:
+        """
+        Put the allocator in sleep mode.
+        All data in the memory allocation with the specified tag will be 
+        offloaded to CPU memory, and others will be discarded.
+
+        :param offload_tags: The tags of the memory allocation that will be
+            offloaded. The rest of the memory allocation will be discarded.
+        """
+        if offload_tags is None:
+            # by default, allocated tensors are offloaded
+            # when the allocator sleeps
+            offload_tags = (CuMemAllocator.default_tag, )
+        elif isinstance(offload_tags, str):
+            offload_tags = (offload_tags, )
+
+        assert isinstance(offload_tags, tuple)
+
+        for ptr, data in self.pointer_to_data.items():
+            handle = data.handle
+            if data.tag in offload_tags:
+                size_in_bytes = handle[1]
+                cpu_backup_tensor = torch.empty(
+                    size_in_bytes,
+                    dtype=torch.uint8,
+                    device='cpu',
+                    pin_memory=is_pin_memory_available())
+                cpu_ptr = cpu_backup_tensor.data_ptr()
+                libcudart.cudaMemcpy(cpu_ptr, ptr, size_in_bytes)
+                data.cpu_backup_tensor = cpu_backup_tensor
+            unmap_and_release(handle)
+
+    def wake_up(self):
+        """
+        Wake up the allocator from sleep mode.
+        All data that is previously offloaded will be loaded back to GPU 
+        memory, and the rest of the data will have empty memory."""
+        for ptr, data in self.pointer_to_data.items():
+            handle = data.handle
+            create_and_map(handle)
+            if data.cpu_backup_tensor is not None:
+                cpu_backup_tensor = data.cpu_backup_tensor
+                if cpu_backup_tensor is not None:
+                    size_in_bytes = cpu_backup_tensor.numel(
+                    ) * cpu_backup_tensor.element_size()
+                    cpu_ptr = cpu_backup_tensor.data_ptr()
+                    libcudart.cudaMemcpy(ptr, cpu_ptr, size_in_bytes)
+                    data.cpu_backup_tensor = None
+
+    @contextmanager
+    def use_memory_pool(self, tag: Optional[str] = None):
+        """
+        A context manager to use the memory pool.
+        All memory allocation created inside the context will be allocated 
+        in the memory pool, and has the specified tag.
+
+        :param tag: The tag of the memory allocation. If None, the default tag
+            will be used.
+        """
+        if tag is None:
+            tag = CuMemAllocator.default_tag
+
+        assert isinstance(tag, str)
+
+        old_tag = self.current_tag
+        self.current_tag = tag
+        with use_memory_pool_with_allocator(self.python_malloc_callback,
+                                            self.python_free_callback):
+            yield
+            # PyTorch's bug, calling torch.cuda.empty_cache() will error
+            # when using pluggable allocator, see
+            # https://github.com/pytorch/pytorch/issues/145168 .
+            # if we have some memory allocated and then freed,
+            # the memory will not be released.
+            # right now it is fine, because we only use this allocator
+            # during weight loading and kv cache creation, where we only
+            # allocate memory.
+            # TODO: we need to find a way to release the memory,
+            # i.e. calling torch.cuda.empty_cache()
+            self.current_tag = old_tag
+
+    def get_current_usage(self) -> int:
+        """
+        Get the total number of bytes allocated in the memory pool.
+        """
+        sum_bytes: int = 0
+        for ptr, data in self.pointer_to_data.items():
+            handle = data.handle
+            sum_bytes += handle[1]
+        return sum_bytes
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index a4f4c9558d056..ba58614bf8f95 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -197,6 +197,7 @@ class EngineArgs:
     kv_transfer_config: Optional[KVTransferConfig] = None
 
     generation_config: Optional[str] = None
+    enable_sleep_mode: bool = False
 
     def __post_init__(self):
         if not self.tokenizer:
@@ -955,6 +956,12 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             "loaded from model. If set to a folder path, the generation config "
             "will be loaded from the specified folder path.")
 
+        parser.add_argument("--enable-sleep-mode",
+                            action="store_true",
+                            default=False,
+                            help="Enable sleep mode for the engine. "
+                            "(only cuda platform is supported)")
+
         return parser
 
     @classmethod
@@ -999,7 +1006,9 @@ def create_model_config(self) -> ModelConfig:
             override_neuron_config=self.override_neuron_config,
             override_pooler_config=self.override_pooler_config,
             logits_processor_pattern=self.logits_processor_pattern,
-            generation_config=self.generation_config)
+            generation_config=self.generation_config,
+            enable_sleep_mode=self.enable_sleep_mode,
+        )
 
     def create_load_config(self) -> LoadConfig:
         return LoadConfig(
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index dc6c9c1778dd7..5271027d41863 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -1818,6 +1818,16 @@ def start_profile(self) -> None:
     def stop_profile(self) -> None:
         self.model_executor.stop_profile()
 
+    def sleep(self, level: int = 1) -> None:
+        assert self.vllm_config.model_config.enable_sleep_mode, (
+            "Sleep mode is not enabled in the model config")
+        self.model_executor.sleep(level=level)
+
+    def wake_up(self) -> None:
+        assert self.vllm_config.model_config.enable_sleep_mode, (
+            "Sleep mode is not enabled in the model config")
+        self.model_executor.wake_up()
+
     def check_health(self) -> None:
         if self.tokenizer:
             self.tokenizer.check_health()
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 27386daa4bbc9..04056f37f851b 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -1132,6 +1132,29 @@ def start_profile(self) -> None:
     def stop_profile(self) -> None:
         self.llm_engine.stop_profile()
 
+    def sleep(self, level: int = 1):
+        """
+        Put the engine to sleep. The engine should not process any requests.
+        The caller should guarantee that no requests are being processed
+        during the sleep period, before `wake_up` is called.
+
+        :param level: The sleep level. Level 1 sleep will offload the model 
+            weights and discard the kv cache. The content of kv cache is 
+            forgotten. Level 1 sleep is good for sleeping and waking up the 
+            engine to run the same model again. The model weights are backed 
+            up in CPU memory. Please make sure there's enough CPU memory to 
+            store the model weights. Level 2 sleep will discard both the model 
+            weights and the kv cache. The content of both the model weights 
+            and kv cache is forgotten. Level 2 sleep is good for sleeping and 
+            waking up the engine to run a different model or update the model, 
+            where previous model weights are not needed. It reduces CPU memory 
+            pressure.
+        """
+        self.llm_engine.sleep(level=level)
+
+    def wake_up(self):
+        self.llm_engine.wake_up()
+
     # LEGACY
     def _convert_v1_inputs(
         self,
diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py
index 859e105f15d97..069be05eafa50 100644
--- a/vllm/executor/executor_base.py
+++ b/vllm/executor/executor_base.py
@@ -193,6 +193,17 @@ def start_profile(self) -> None:
     def stop_profile(self) -> None:
         self.collective_rpc("stop_profile")
 
+    def sleep(self, level: int = 1):
+        if self.cache_config.enable_prefix_caching:
+            # TODO: support sleep with prefix caching
+            # by resetting the prefix cache state,
+            # after https://github.com/vllm-project/vllm/pull/12284
+            raise ValueError("Cannot sleep when prefix caching is enabled.")
+        self.collective_rpc("sleep", kwargs=dict(level=level))
+
+    def wake_up(self):
+        self.collective_rpc("wake_up")
+
     def save_sharded_state(
         self,
         path: str,
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index bd40112aea5e8..e69b6c90454c4 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -9,12 +9,14 @@
 
 import vllm.envs as envs
 from vllm.config import ParallelConfig, VllmConfig
+from vllm.device_allocator.cumem import CuMemAllocator
 from vllm.distributed import (ensure_model_parallel_initialized,
                               init_distributed_environment,
                               set_custom_all_reduce)
 from vllm.logger import init_logger
 from vllm.model_executor import set_random_seed
 from vllm.platforms import current_platform
+from vllm.utils import GiB_bytes
 from vllm.v1.core.scheduler import SchedulerOutput
 from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec
 from vllm.v1.outputs import ModelRunnerOutput
@@ -77,6 +79,23 @@ def __init__(
         else:
             self.profiler = None
 
+    def sleep(self, level: int = 1) -> None:
+        free_bytes_before_sleep = torch.cuda.mem_get_info()[0]
+        allocator = CuMemAllocator.get_instance()
+        allocator.sleep(offload_tags=("weights", ) if level == 1 else tuple())
+        free_bytes_after_sleep, total = torch.cuda.mem_get_info()
+        freed_bytes = free_bytes_after_sleep - free_bytes_before_sleep
+        used_bytes = total - free_bytes_after_sleep
+        assert freed_bytes >= 0, "Memory usage increased after sleeping."
+        logger.info(
+            "Sleep mode freed %.2f GiB memory, "
+            "%.2f GiB memory is still in use.", freed_bytes / GiB_bytes,
+            used_bytes / GiB_bytes)
+
+    def wake_up(self) -> None:
+        allocator = CuMemAllocator.get_instance()
+        allocator.wake_up()
+
     def init_device(self):
         if self.device_config.device.type == "cuda":
             # torch.distributed.all_reduce does not free the input tensor until
@@ -110,7 +129,17 @@ def init_device(self):
         self.model_runner = GPUModelRunner(self.vllm_config, self.device)
 
     def load_model(self) -> None:
-        self.model_runner.load_model()
+        if self.vllm_config.model_config.enable_sleep_mode:
+            allocator = CuMemAllocator.get_instance()
+            assert allocator.get_current_usage() == 0, (
+                "Sleep mode can only be "
+                "used for one instance per process.")
+            context = allocator.use_memory_pool(tag="weights")
+        else:
+            from contextlib import nullcontext
+            context = nullcontext()
+        with context:
+            self.model_runner.load_model()
 
     @torch.inference_mode()
     def determine_available_memory(self) -> int:
@@ -167,7 +196,14 @@ def get_kv_cache_spec(self) -> KVCacheSpec:
 
     def initialize_cache(self, kv_cache_config: KVCacheConfig) -> None:
         """Allocate GPU KV cache with the specified kv_cache_config."""
-        self.model_runner.initialize_kv_cache(kv_cache_config)
+        if self.vllm_config.model_config.enable_sleep_mode:
+            allocator = CuMemAllocator.get_instance()
+            context = allocator.use_memory_pool(tag="kv_cache")
+        else:
+            from contextlib import nullcontext
+            context = nullcontext()
+        with context:
+            self.model_runner.initialize_kv_cache(kv_cache_config)
 
     def compile_or_warm_up_model(self) -> None:
         if not self.model_config.enforce_eager:
diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py
index 29d62ddda3dc0..c95d2a9a69ff3 100644
--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -8,6 +8,7 @@
 
 import vllm.envs as envs
 from vllm.config import VllmConfig
+from vllm.device_allocator.cumem import CuMemAllocator
 from vllm.distributed import (ensure_kv_transfer_initialized,
                               ensure_model_parallel_initialized,
                               init_distributed_environment,
@@ -120,6 +121,23 @@ def stop_profile(self):
             raise RuntimeError("Profiler is not enabled.")
         self.profiler.stop()
 
+    def sleep(self, level: int = 1) -> None:
+        free_bytes_before_sleep = torch.cuda.mem_get_info()[0]
+        allocator = CuMemAllocator.get_instance()
+        allocator.sleep(offload_tags=("weights", ) if level == 1 else tuple())
+        free_bytes_after_sleep, total = torch.cuda.mem_get_info()
+        freed_bytes = free_bytes_after_sleep - free_bytes_before_sleep
+        used_bytes = total - free_bytes_after_sleep
+        assert freed_bytes >= 0, "Memory usage increased after sleeping."
+        logger.info(
+            "Sleep mode freed %.2f GiB memory, "
+            "%.2f GiB memory is still in use.", freed_bytes / GiB_bytes,
+            used_bytes / GiB_bytes)
+
+    def wake_up(self) -> None:
+        allocator = CuMemAllocator.get_instance()
+        allocator.wake_up()
+
     def init_device(self) -> None:
         if self.device_config.device.type == "cuda":
             # torch.distributed.all_reduce does not free the input tensor until
@@ -151,7 +169,17 @@ def init_device(self) -> None:
         set_random_seed(self.model_config.seed)
 
     def load_model(self):
-        self.model_runner.load_model()
+        if self.vllm_config.model_config.enable_sleep_mode:
+            allocator = CuMemAllocator.get_instance()
+            assert allocator.get_current_usage() == 0, (
+                "Sleep mode can only be "
+                "used for one instance per process.")
+            context = allocator.use_memory_pool(tag="weights")
+        else:
+            from contextlib import nullcontext
+            context = nullcontext()
+        with context:
+            self.model_runner.load_model()
 
     def save_sharded_state(
         self,
@@ -270,7 +298,14 @@ def initialize_cache(self, num_gpu_blocks: int,
         self.cache_config.num_gpu_blocks = num_gpu_blocks
         self.cache_config.num_cpu_blocks = num_cpu_blocks
 
-        self._init_cache_engine()
+        if self.vllm_config.model_config.enable_sleep_mode:
+            allocator = CuMemAllocator.get_instance()
+            context = allocator.use_memory_pool(tag="kv_cache")
+        else:
+            from contextlib import nullcontext
+            context = nullcontext()
+        with context:
+            self._init_cache_engine()
         self._warm_up_model()
 
     def _init_cache_engine(self):

From cd7b6f0857c6622d6b98eae6d6526dfe8dfdec70 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Wed, 22 Jan 2025 19:08:31 +0800
Subject: [PATCH 330/462] [VLM] Avoid unnecessary tokenization (#12310)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 vllm/model_executor/models/blip2.py        | 12 +++++++++--
 vllm/model_executor/models/chameleon.py    | 23 +++++++++++++---------
 vllm/model_executor/models/deepseek_vl2.py |  6 ++++--
 vllm/model_executor/models/fuyu.py         |  5 ++++-
 vllm/model_executor/models/llava.py        | 17 ++++++++--------
 vllm/model_executor/models/qwen2_audio.py  | 20 ++++++++++---------
 vllm/model_executor/models/qwen2_vl.py     | 12 ++++++-----
 vllm/model_executor/models/ultravox.py     | 10 +++++++---
 vllm/transformers_utils/tokenizer.py       |  6 +++++-
 9 files changed, 71 insertions(+), 40 deletions(-)

diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py
index a8e6f3b6c6d41..09c5087c2dc36 100644
--- a/vllm/model_executor/models/blip2.py
+++ b/vllm/model_executor/models/blip2.py
@@ -475,15 +475,23 @@ def _get_prompt_replacements(
         hf_processor_mm_kwargs: Mapping[str, object],
         out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
+        tokenizer = self.info.get_tokenizer()
+        vocab = tokenizer.get_vocab()
+
+        bos_token_id = tokenizer.bos_token_id
+        assert isinstance(bos_token_id, int)
+
+        image_token_id = vocab["image"]
         num_image_tokens = self.info.get_num_image_tokens()
+        image_tokens = [image_token_id] * num_image_tokens
 
         return [
             PromptReplacement(
                 modality="image",
                 target="</s>",
                 replacement=PromptReplacementDetails(
-                    full="<image>" * num_image_tokens + "</s>",
-                    features="<image>" * num_image_tokens,
+                    full=image_tokens + [bos_token_id],
+                    features=image_tokens,
                 ),
             )
         ]
diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py
index 2df50b3c4efe6..e834c9004f140 100644
--- a/vllm/model_executor/models/chameleon.py
+++ b/vllm/model_executor/models/chameleon.py
@@ -122,8 +122,9 @@ def _apply_hf_processor_tokens_only(
     ) -> list[int]:
         # HF processor adds sep token for chat mode
         tokenizer = self.info.get_tokenizer()
-        sep_token_id: int = \
-            tokenizer.vocab[tokenizer.sep_token]  # type: ignore
+        vocab = tokenizer.get_vocab()
+
+        sep_token_id = vocab[tokenizer.sep_token]  # type: ignore
 
         return prompt_tokens + [sep_token_id]
 
@@ -141,18 +142,22 @@ def _get_prompt_replacements(
         out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
         processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
-        image_tokens = processor.image_token * self.info.get_num_image_tokens()
+        tokenizer = self.info.get_tokenizer()
+        vocab = tokenizer.get_vocab()
+
+        image_start_id = vocab[processor.image_start_token]
+        image_token_id = vocab[processor.image_token]
+        image_end_id = vocab[processor.image_end_token]
+
+        num_image_tokens = self.info.get_num_image_tokens()
+        image_tokens = [image_token_id] * num_image_tokens
 
         return [
             PromptReplacement(
                 modality="image",
-                target="<image>",
+                target=[image_token_id],
                 replacement=PromptReplacementDetails(
-                    full="".join([
-                        processor.image_start_token,
-                        image_tokens,
-                        processor.image_end_token,
-                    ]),
+                    full=([image_start_id] + image_tokens + [image_end_id]),
                     features=image_tokens,
                 ),
             )
diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py
index 4d3d1c329a2c0..344832d8b33e6 100644
--- a/vllm/model_executor/models/deepseek_vl2.py
+++ b/vllm/model_executor/models/deepseek_vl2.py
@@ -249,8 +249,10 @@ def _get_prompt_replacements(
         hf_processor_mm_kwargs: Mapping[str, object],
         out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
-        hf_processor = self.info.get_hf_processor()
-        image_token_id: int = hf_processor.image_token_id
+        hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
+
+        image_token_id = hf_processor.image_token_id
+        assert isinstance(image_token_id, int)
 
         def get_replacement_deepseek_vl2(item_idx: int):
             images = mm_items.get_items(
diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py
index 4865eb170364c..dbf9da50cc9de 100644
--- a/vllm/model_executor/models/fuyu.py
+++ b/vllm/model_executor/models/fuyu.py
@@ -183,7 +183,9 @@ def _apply_hf_processor_tokens_only(
     ) -> list[int]:
         # HF processor adds boa_token_id
         tokenizer = self.info.get_tokenizer()
-        boa_token_id: int = tokenizer.vocab["<0x04>"]  # type: ignore
+        vocab = tokenizer.get_vocab()
+
+        boa_token_id = vocab["<0x04>"]
 
         return prompt_tokens + [boa_token_id]
 
@@ -202,6 +204,7 @@ def _get_prompt_replacements(
     ) -> list[PromptReplacement]:
         hf_config = self.info.get_hf_config()
         bos_token_id = hf_config.bos_token_id
+        assert isinstance(bos_token_id, int)
 
         tokenizer = self.info.get_tokenizer()
         eot_token_id = tokenizer.bos_token_id
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index a355ae494afd0..296af2aac5660 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -315,13 +315,14 @@ def _get_prompt_replacements(
         hf_processor_mm_kwargs: Mapping[str, object],
         out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
+        processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
         hf_config = self.info.get_hf_config()
-        image_token_id = hf_config.image_token_index
+        tokenizer = self.info.get_tokenizer()
+        vocab = tokenizer.get_vocab()
 
-        processor = self.info.get_hf_processor()
-        image_token = processor.image_token
-        image_break_token = processor.image_break_token
-        image_end_token = processor.image_end_token
+        image_break_id = vocab[processor.image_break_token]
+        image_token_id = hf_config.image_token_index
+        image_end_id = vocab[processor.image_end_token]
 
         vision_config = hf_config.vision_config
         assert isinstance(vision_config, PixtralVisionConfig)
@@ -336,10 +337,10 @@ def get_replacement(item_idx: int):
                 image_height=image_size.height,
             )
 
-            tokens = ([image_token] * ncols + [image_break_token]) * nrows
-            tokens[-1] = image_end_token
+            tokens = ([image_token_id] * ncols + [image_break_id]) * nrows
+            tokens[-1] = image_end_id
 
-            return "".join(tokens)
+            return tokens
 
         return [
             PromptReplacement(
diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py
index c21cb815b5283..fc5aed5c94abb 100644
--- a/vllm/model_executor/models/qwen2_audio.py
+++ b/vllm/model_executor/models/qwen2_audio.py
@@ -188,7 +188,9 @@ def _get_prompt_replacements(
         hf_processor_mm_kwargs: Mapping[str, object],
         out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
-        processor = self.info.get_hf_processor()
+        processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
+        tokenizer = self.info.get_tokenizer()
+        vocab = tokenizer.get_vocab()
 
         # Use getattr with default to be compatible with transformers<4.48
         audio_token = getattr(processor, "audio_token", "<|AUDIO|>")
@@ -197,6 +199,10 @@ def _get_prompt_replacements(
         audio_eos_token = getattr(processor, "audio_eos_token",
                                   "<|audio_eos|>")
 
+        audio_token_id = vocab[audio_token]
+        audio_bos_id = vocab[audio_bos_token]
+        audio_eos_id = vocab[audio_eos_token]
+
         feature_attention_mask = out_mm_kwargs.get("feature_attention_mask")
         if feature_attention_mask is None:
             audio_output_lengths = []
@@ -208,22 +214,18 @@ def _get_prompt_replacements(
             audio_output_lengths = audio_output_lens.tolist()
 
         def get_replacement_qwen2_audio(item_idx: int):
-            num_placeholders = audio_output_lengths[item_idx]
-            if num_placeholders == 0:
+            num_features = audio_output_lengths[item_idx]
+            if num_features == 0:
                 audios = mm_items.get_items("audio", AudioProcessorItems)
                 audio = audios.get(item_idx)
                 raise ValueError(
                     f"The audio {audio} (len={len(audio)}) is too short "
                     "to be represented inside the model")
 
-            audio_tokens = audio_token * num_placeholders
+            audio_tokens = [audio_token_id] * num_features
 
             return PromptReplacementDetails(
-                full="".join([
-                    audio_bos_token,
-                    audio_tokens,
-                    audio_eos_token,
-                ]),
+                full=[audio_bos_id] + audio_tokens + [audio_eos_id],
                 features=audio_tokens,
             )
 
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index 34d5c8ad089a3..f1ff8badaf35a 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -953,12 +953,14 @@ def _get_prompt_replacements(
         hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
         image_processor = self.info.get_image_processor(
             **hf_processor_mm_kwargs)
+        tokenizer = self.info.get_tokenizer()
+        vocab = tokenizer.get_vocab()
 
         # NOTE: Only Qwen2VLProcessor in transformers 4.47.0 has
         # image_token and video_token registered
         placeholder = {
-            "image": hf_processor.image_token,
-            "video": hf_processor.video_token,
+            "image": vocab[hf_processor.image_token],
+            "video": vocab[hf_processor.video_token],
         }
 
         merge_length = image_processor.merge_size**2
@@ -967,13 +969,13 @@ def get_replacement_qwen2vl(item_idx: int, modality: str):
             grid_thw = out_mm_kwargs[f"{modality}_grid_thw"][item_idx]
             assert isinstance(grid_thw, torch.Tensor)
 
-            num_tokens = grid_thw.prod().item() // merge_length
-            return placeholder[modality] * num_tokens
+            num_tokens = int(grid_thw.prod()) // merge_length
+            return [placeholder[modality]] * num_tokens
 
         return [
             PromptReplacement(
                 modality=modality,
-                target=placeholder[modality],
+                target=[placeholder[modality]],
                 replacement=partial(get_replacement_qwen2vl,
                                     modality=modality),
             ) for modality in ("image", "video")
diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py
index 9301422383696..145df09a427e4 100644
--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -205,16 +205,20 @@ def _get_prompt_replacements(
         out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
         hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
-        placeholder = hf_processor.audio_token_replacement  # type: ignore
+        tokenizer = self.info.get_tokenizer()
+        vocab = tokenizer.get_vocab()
+
+        replacement_id = vocab[
+            hf_processor.audio_token_replacement]  # type: ignore
 
         def get_replacement_ultravox(item_idx: int):
             audio_token_len = out_mm_kwargs["audio_token_len"][item_idx]
-            return placeholder * audio_token_len
+            return [replacement_id] * int(audio_token_len)  # type: ignore
 
         return [
             PromptReplacement(
                 modality="audio",
-                target="<|audio|>",
+                target='<|audio|>',
                 replacement=get_replacement_ultravox,
             )
         ]
diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py
index 294262484f2fb..1f1d67fabb243 100644
--- a/vllm/transformers_utils/tokenizer.py
+++ b/vllm/transformers_utils/tokenizer.py
@@ -67,9 +67,10 @@ def get_cached_tokenizer(tokenizer: AnyTokenizer) -> AnyTokenizer:
     tokenizer_all_special_tokens_extended = (
         tokenizer.all_special_tokens_extended)
     tokenizer_all_special_tokens = set(tokenizer.all_special_tokens)
+    tokenizer_vocab = tokenizer.get_vocab()
     tokenizer_len = len(tokenizer)
 
-    max_token_id = max(tokenizer.get_vocab().values())
+    max_token_id = max(tokenizer_vocab.values())
     # Some tokenizers (e.g., QwenTokenizer) have special tokens that
     # are added and included in the implementation of the vocab_size
     # property, but not in get_vocab(); if there is an implementation
@@ -96,6 +97,9 @@ def all_special_tokens_extended(self):
         def max_token_id(self):
             return max_token_id
 
+        def get_vocab(self):
+            return tokenizer_vocab
+
         def __len__(self):
             return tokenizer_len
 

From 528dbcac7d042d84b5541a1fba968087b7dc2866 Mon Sep 17 00:00:00 2001
From: zhou fan <1247714429@qq.com>
Date: Wed, 22 Jan 2025 19:39:19 +0800
Subject: [PATCH 331/462] [Model][Bugfix]: correct Aria model output (#12309)

Signed-off-by: xffxff <1247714429@qq.com>
---
 examples/offline_inference/vision_language.py |  3 +-
 vllm/model_executor/models/aria.py            | 54 ++++++++++++++++++-
 2 files changed, 54 insertions(+), 3 deletions(-)

diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py
index f9048c7735ebf..415439e88ed59 100644
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -28,9 +28,10 @@ def run_aria(question: str, modality: str):
     llm = LLM(model=model_name,
               max_model_len=4096,
               max_num_seqs=2,
+              dtype="bfloat16",
               disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
 
-    prompt = (f"<|im_start|>user\n<fim_prefix><|img|><fim_suffix>\n{question}"
+    prompt = (f"<|im_start|>user\n<fim_prefix><|img|><fim_suffix>{question}"
               "<|im_end|>\n<|im_start|>assistant\n")
 
     stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519]
diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py
index 34ac39b812deb..8c6873de13627 100644
--- a/vllm/model_executor/models/aria.py
+++ b/vllm/model_executor/models/aria.py
@@ -30,6 +30,7 @@
 from vllm.sequence import IntermediateTensors
 
 # yapf: disable
+from .idefics2_vision_model import Idefics2VisionConfig
 from .idefics2_vision_model import (
     Idefics2VisionTransformer as Idefics3VisionTransformer)
 # yapf: enable
@@ -50,6 +51,53 @@ class AriaImagePixelInputs(TypedDict):
     """
 
 
+class AriaVisionTransformer(Idefics3VisionTransformer):
+
+    def __init__(
+        self,
+        config: Idefics2VisionConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__(config, quant_config, prefix)
+        # Unlike Idefics3VisionTransformer which uses LayerNorm after the
+        # final layer, Aria omits this normalization, so we replace it with an
+        # Identity layer
+        self.post_layernorm = nn.Identity()
+
+    def load_weights(self, weights: Iterable[Tuple[str,
+                                                   torch.Tensor]]) -> Set[str]:
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+        ]
+        params_dict = dict(self.named_parameters())
+        loaded_params: Set[str] = set()
+        for name, loaded_weight in weights:
+
+            # NOTE: post_layernorm is not used in Aria
+            if "post_layernorm" in name:
+                continue
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader",
+                                        default_weight_loader)
+                weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+        return loaded_params
+
+
 class AriaProjectorMLP(nn.Module):
 
     def __init__(
@@ -228,8 +276,10 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         router_output = torch.nn.functional.linear(hidden_states,
                                                    self.router_weight)
 
+        hidden_states_copy = hidden_states.clone()
+        # NOTE: hidden_states will be modified inplace by `FusedMoE`
         sparse_expert_output = self.experts(hidden_states, router_output)
-        shared_expert_output = self.shared_experts(hidden_states)
+        shared_expert_output = self.shared_experts(hidden_states_copy)
 
         return sparse_expert_output + shared_expert_output
 
@@ -445,7 +495,7 @@ def __init__(
         quant_config = vllm_config.quant_config
 
         self.config = config
-        self.vision_tower = Idefics3VisionTransformer(
+        self.vision_tower = AriaVisionTransformer(
             config.vision_config,
             quant_config,
             prefix=f"{prefix}.vision_tower",

From 16366ee8bbdc30aad9776b74121cfc4d8f8c897d Mon Sep 17 00:00:00 2001
From: Roger Wang <136131678+ywang96@users.noreply.github.com>
Date: Wed, 22 Jan 2025 05:06:36 -0800
Subject: [PATCH 332/462] [Bugfix][VLM] Fix mixed-modality inference backward
 compatibility for V0 (#12313)

Signed-off-by: Roger Wang <ywang@roblox.com>
---
 vllm/model_executor/models/llava_onevision.py | 53 ++++++++++++---
 vllm/model_executor/models/qwen2_vl.py        | 67 +++++++++++++------
 2 files changed, 92 insertions(+), 28 deletions(-)

diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py
index 6faa79f65d8de..5b0f35b08646b 100644
--- a/vllm/model_executor/models/llava_onevision.py
+++ b/vllm/model_executor/models/llava_onevision.py
@@ -816,7 +816,7 @@ def apply_pooling(self, image_features, stride=2):
         return image_feature
 
     def get_multimodal_embeddings(
-            self, **kwargs) -> Optional[List[Tuple[NestedTensors, str]]]:
+            self, **kwargs) -> Optional[tuple[torch.Tensor, ...]]:
         modalities = self._parse_and_validate_multimodal_inputs(**kwargs)
         if not modalities:
             return None
@@ -842,8 +842,7 @@ def get_multimodal_embeddings(
     def get_input_embeddings(
         self,
         input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[List[Tuple[NestedTensors,
-                                                   str]]] = None,
+        multimodal_embeddings: Optional[tuple[torch.Tensor, ...]] = None,
     ) -> torch.Tensor:
         inputs_embeds = self.language_model.get_input_embeddings(input_ids)
         if multimodal_embeddings is not None:
@@ -852,6 +851,34 @@ def get_input_embeddings(
                 [self.config.image_token_index, self.config.video_token_index])
         return inputs_embeds
 
+    def get_input_embeddings_v0(
+        self,
+        input_ids: torch.Tensor,
+        image_input: Optional[NestedTensors] = None,
+        video_input: Optional[NestedTensors] = None,
+    ) -> torch.Tensor:
+
+        inputs_embeds = self.get_input_embeddings(input_ids)
+        if image_input is not None:
+            image_embeds = self._process_image_input(image_input)
+            inputs_embeds = merge_multimodal_embeddings(
+                input_ids,
+                inputs_embeds,
+                image_embeds,
+                placeholder_token_id=self.config.image_token_index,
+            )
+
+        if video_input is not None:
+            video_embeds = self._process_video_pixels(video_input)
+            inputs_embeds = merge_multimodal_embeddings(
+                input_ids,
+                inputs_embeds,
+                video_embeds,
+                placeholder_token_id=self.config.video_token_index,
+            )
+
+        return inputs_embeds
+
     def forward(
         self,
         input_ids: torch.Tensor,
@@ -871,13 +898,21 @@ def forward(
         if intermediate_tensors is not None:
             inputs_embeds = None
 
-        # NOTE: In v1, inputs_embeds is always generated at model runner, this
-        # condition is for v0 compatibility.
+        # NOTE: In v1, inputs_embeds is always generated at model runner from
+        # `get_multimodal_embeddings` and `get_input_embeddings`, this
+        # condition is only for v0 compatibility.
         elif inputs_embeds is None:
-            multimodal_embeddings = self.get_multimodal_embeddings(**kwargs)
-            inputs_embeds = self.get_input_embeddings(input_ids,
-                                                      multimodal_embeddings)
-            input_ids = None
+            image_input = self._parse_and_validate_image_input(**kwargs)
+            video_input = self._parse_and_validate_video_input(**kwargs)
+
+            if image_input is None and video_input is None:
+                inputs_embeds = None
+            else:
+                inputs_embeds = self.get_input_embeddings_v0(
+                    input_ids,
+                    image_input=image_input,
+                    video_input=video_input)
+                input_ids = None
 
         hidden_states = self.language_model.model(input_ids,
                                                   positions,
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index f1ff8badaf35a..a2778ee73810e 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -55,7 +55,7 @@
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (ImageItem, ModalityData,
                                     MultiModalFieldConfig, MultiModalKwargs,
-                                    NestedTensors, VideoItem)
+                                    VideoItem)
 from vllm.multimodal.parse import (ImageSize, ModalityDataItems,
                                    MultiModalDataItems, MultiModalDataParser)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
@@ -1233,7 +1233,7 @@ def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict:
         return modalities
 
     def get_multimodal_embeddings(
-            self, **kwargs) -> Optional[List[Tuple[NestedTensors, str]]]:
+            self, **kwargs) -> Optional[tuple[torch.Tensor, ...]]:
 
         modalities = self._parse_and_validate_multimodal_inputs(**kwargs)
         if not modalities:
@@ -1260,8 +1260,7 @@ def get_multimodal_embeddings(
     def get_input_embeddings(
         self,
         input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[List[Tuple[NestedTensors,
-                                                   str]]] = None,
+        multimodal_embeddings: Optional[tuple[torch.Tensor, ...]] = None,
     ) -> torch.Tensor:
         inputs_embeds = self.language_model.get_input_embeddings(input_ids)
         if multimodal_embeddings is not None:
@@ -1270,6 +1269,33 @@ def get_input_embeddings(
                 [self.config.image_token_id, self.config.video_token_id])
         return inputs_embeds
 
+    def get_input_embeddings_v0(
+        self,
+        input_ids: torch.Tensor,
+        image_input: Optional[tuple[torch.Tensor, ...]] = None,
+        video_input: Optional[tuple[torch.Tensor, ...]] = None,
+    ) -> torch.Tensor:
+
+        inputs_embeds = self.get_input_embeddings(input_ids)
+        if image_input is not None:
+            image_embeds = self._process_image_input(image_input)
+            inputs_embeds = merge_multimodal_embeddings(
+                input_ids,
+                inputs_embeds,
+                image_embeds,
+                placeholder_token_id=self.config.image_token_id,
+            )
+
+        if video_input is not None:
+            video_embeds = self._process_video_input(video_input)
+            inputs_embeds = merge_multimodal_embeddings(
+                input_ids,
+                inputs_embeds,
+                video_embeds,
+                placeholder_token_id=self.config.video_token_id,
+            )
+        return inputs_embeds
+
     def forward(
         self,
         input_ids: torch.Tensor,
@@ -1303,22 +1329,25 @@ def forward(
         if intermediate_tensors is not None:
             inputs_embeds = None
 
-        # NOTE: In v1, inputs_embeds is always generated at model runner, this
-        # condition is for v0 compatibility.
+        # NOTE: In v1, inputs_embeds is always generated at model runner from
+        # `get_multimodal_embeddings` and `get_input_embeddings`, this
+        # condition is only for v0 compatibility.
         elif inputs_embeds is None:
-            multimodal_embeddings = self.get_multimodal_embeddings(**kwargs)
-
-            # We need to check for usage of mrope here in case there is
-            # multimodal data.
-            # TODO (ywang96): move this to model runner in V1.
-            if multimodal_embeddings is not None and uses_mrope(self.config):
-                assert positions.ndim == 2 and positions.size(0) == 3, (
-                    "multimodal section rotary embedding requires "
-                    f"(3, seq_len) positions, but got {positions.size()}")
-
-            inputs_embeds = self.get_input_embeddings(input_ids,
-                                                      multimodal_embeddings)
-            input_ids = None
+            image_input = self._parse_and_validate_image_input(**kwargs)
+            video_input = self._parse_and_validate_video_input(**kwargs)
+
+            if image_input is None and video_input is None:
+                inputs_embeds = None
+            else:
+                if uses_mrope(self.config):
+                    assert positions.ndim == 2 and positions.size(0) == 3, (
+                        "multimodal section rotary embedding requires "
+                        f"(3, seq_len) positions, but got {positions.size()}")
+                inputs_embeds = self.get_input_embeddings_v0(
+                    input_ids,
+                    image_input=image_input,
+                    video_input=video_input)
+                input_ids = None
 
         hidden_states = self.language_model.model(
             input_ids=input_ids,

From 6609cdf0195480f3d3bf1352a96acd6fd6ab89d8 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Wed, 22 Jan 2025 22:56:29 +0800
Subject: [PATCH 333/462] [Doc] Add docs for prompt replacement (#12318)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 vllm/model_executor/models/ultravox.py |  2 +-
 vllm/multimodal/processing.py          | 92 ++++++++++++++++++++++----
 2 files changed, 79 insertions(+), 15 deletions(-)

diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py
index 145df09a427e4..d577e545a473b 100644
--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -218,7 +218,7 @@ def get_replacement_ultravox(item_idx: int):
         return [
             PromptReplacement(
                 modality="audio",
-                target='<|audio|>',
+                target="<|audio|>",
                 replacement=get_replacement_ultravox,
             )
         ]
diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index 1e0338dfb0226..750646ac6e431 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -29,41 +29,101 @@
 logger = init_logger(__name__)
 
 _S = TypeVar("_S", str, list[int])
-_PromptSeq = Union[str, list[int]]
+
+PromptSeq = Union[str, list[int]]
+"""A token sequence (list of token IDs) or text."""
 
 
 @dataclass
 class PromptReplacementDetails:
-    full: _PromptSeq
+    """Details about the replacement token sequence or text."""
+
+    full: PromptSeq
     """The full replacement."""
 
-    features: _PromptSeq
+    features: PromptSeq
     """
-    The part of the replacement that corresponds to placeholder feature tokens.
+    The part of the replacement that corresponds to feature placeholders;
+    this will be replaced by the output of the vision encoder during model
+    inference.
     """
 
     @staticmethod
-    def from_seq(seq: _PromptSeq) -> "PromptReplacementDetails":
+    def from_seq(seq: PromptSeq) -> "PromptReplacementDetails":
         return PromptReplacementDetails(full=seq, features=seq)
 
 
-_PromptRepl = Union[_PromptSeq, PromptReplacementDetails]
+PromptRepl = Union[PromptSeq, PromptReplacementDetails]
+"""
+The replacement token sequence or text.
+
+If only part of the replacement corresponds to feature placeholders, you can
+use :class:`PromptReplacementDetails` to specify which part.
+"""
 
 
 @dataclass
 class PromptReplacement:
     """
     Defines how to replace portions of an input prompt with placeholder tokens.
+
+    Example:
+
+        For each image, replace one ``<image>`` input placeholder in the prompt
+        with a number of ``<image>`` feature placeholders
+        equal to the feature size of the vision encoder:
+
+        .. code-block:: python
+
+            PromptReplacement(
+                modality="image",
+                target="<image>",
+                replacement="<image>" * image_feature_size,
+            )
+
+        As above, but further pad the feature placeholders with ``<image_bos>``
+        and `<image_eos>``, which are not supposed to be passed to the vision
+        encoder:
+
+        .. code-block:: python
+
+            PromptReplacement(
+                modality="image",
+                target="<image>",
+                replacement=PromptReplacementDetails(
+                    full="".join([
+                        "<image_bos>",
+                        "<image>" * image_feature_size,
+                        "<image_eos>",
+                    ]),
+                    features="<image>" * image_feature_size,
+                ),
+            )
+
+        To avoid unnecessary tokenization during prompt replacement,
+        we recommended passing token sequences instead of text:
+
+        .. code-block:: python
+
+            PromptReplacement(
+                modality="image",
+                target=[image_token_id],
+                replacement=PromptReplacementDetails(
+                    full=([image_bos_id] + [image_token_id] * image_feature_size
+                          + [image_eos_id]),
+                    features=[image_token_id] * image_feature_size,
+                ),
+            )
     """
 
     modality: str
     """The modality for which the replacement is made."""
 
-    target: _PromptSeq
+    target: PromptSeq
     """The token sequence (or text) to find and replace."""
 
-    replacement: Union[Callable[[int], _PromptRepl],
-                       _PromptRepl] = field(repr=False)
+    replacement: Union[Callable[[int], PromptRepl],
+                       PromptRepl] = field(repr=False)
     """
     Given the index of the processed item within :attr:`modality`,
     output the replacement token sequence (or text).
@@ -126,6 +186,10 @@ def full_groupby_modality(values: Iterable[_M]) -> ItemsView[str, list[_M]]:
 
 @dataclass
 class _BoundPromptSequence:
+    """
+    A :data:`_PromptSeq` bound to a tokenizer to automatically
+    convert between token sequence and text representations.
+    """
     tokenizer: AnyTokenizer = field(repr=False)
 
     _text: Optional[str]
@@ -134,7 +198,7 @@ class _BoundPromptSequence:
     @staticmethod
     def from_seq(
         tokenizer: AnyTokenizer,
-        seq: _PromptSeq,
+        seq: PromptSeq,
     ) -> "_BoundPromptSequence":
         return _BoundPromptSequence(
             tokenizer=tokenizer,
@@ -180,9 +244,9 @@ class BoundPromptReplacement:
     tokenizer: AnyTokenizer = field(repr=False)
     modality: str
 
-    _target: _PromptSeq
-    _replacement: Union[Callable[[int], _PromptRepl],
-                        _PromptRepl] = field(repr=False)
+    _target: PromptSeq
+    _replacement: Union[Callable[[int], PromptRepl],
+                        PromptRepl] = field(repr=False)
 
     def __post_init__(self) -> None:
         self._replacement_cache = dict[int, _BoundPromptReplacementGroup]()
@@ -350,7 +414,7 @@ def find_text_matches(
 
 
 def _resolve_matches(
-    prompt: _PromptSeq,
+    prompt: PromptSeq,
     mm_matches: Mapping[str, Sequence[_PromptReplacementMatch]],
 ) -> list[_PromptReplacementMatch]:
     """

From fc66dee76d9957eab8fdb9d138d1ebaa0fad4649 Mon Sep 17 00:00:00 2001
From: Robin <863579016@qq.com>
Date: Thu, 23 Jan 2025 00:48:41 +0800
Subject: [PATCH 334/462] [Misc] Fix the error in the tip for the
 --lora-modules parameter (#12319)

Signed-off-by: wangerxiao <863579016@qq.com>
---
 vllm/entrypoints/openai/cli_args.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py
index 35445449463e9..4df75a665bab9 100644
--- a/vllm/entrypoints/openai/cli_args.py
+++ b/vllm/entrypoints/openai/cli_args.py
@@ -117,7 +117,7 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
         "or JSON format. "
         "Example (old format): ``'name=path'`` "
         "Example (new format): "
-        "``{\"name\": \"name\", \"local_path\": \"path\", "
+        "``{\"name\": \"name\", \"path\": \"lora_path\", "
         "\"base_model_name\": \"id\"}``")
     parser.add_argument(
         "--prompt-adapters",

From 84bee4bd5c41896d626186c9265f30824b928f7a Mon Sep 17 00:00:00 2001
From: Jee Jee Li <pandaleefree@gmail.com>
Date: Thu, 23 Jan 2025 00:56:54 +0800
Subject: [PATCH 335/462] [Misc]  Improve the readability of BNB error messages
  (#12320)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
---
 vllm/model_executor/model_loader/loader.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index f697c3245f098..e9779878710ee 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -1076,8 +1076,8 @@ def _load_weights(self, model_config: ModelConfig,
         # weight tensor. So TP does not work with pre_quantized bnb models.
         if pre_quant and get_tensor_model_parallel_world_size() > 1:
             raise ValueError(
-                "Prequant BitsAndBytes models with TP is not supported."
-                "Please try with PP.")
+                "Prequant BitsAndBytes models with tensor parallelism is not "
+                "supported. Please try with pipeline parallelism.")
 
         load_8bit = False
         if pre_quant:

From 96f6a7596fed0a8a8b5a13ce1ca2a7e06b1e5adf Mon Sep 17 00:00:00 2001
From: Konrad Zawora <kzawora@habana.ai>
Date: Wed, 22 Jan 2025 19:07:07 +0100
Subject: [PATCH 336/462] [Bugfix] Fix HPU multiprocessing executor (#12167)

Signed-off-by: Konrad Zawora <kzawora@habana.ai>
---
 vllm/config.py            |  2 +-
 vllm/engine/arg_utils.py  |  2 +-
 vllm/platforms/hpu.py     | 18 ++++++++++++++++++
 vllm/worker/hpu_worker.py |  4 ++--
 4 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/vllm/config.py b/vllm/config.py
index 69577505fc9bd..f4548c4466e48 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1293,7 +1293,7 @@ def __post_init__(self) -> None:
                 raise ValueError(f"worker-use-ray can't be used with "
                                  f"distributed executor backend "
                                  f"'{self.distributed_executor_backend}'.")
-        ray_only_devices = ["tpu", "hpu"]
+        ray_only_devices = ["tpu"]
         from vllm.platforms import current_platform
         if (current_platform.device_type in ray_only_devices
                 and self.world_size > 1):
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index ba58614bf8f95..f58c1b55e0c70 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -397,7 +397,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             'or equal to the number of GPUs available, "mp" will be used to '
             'keep processing on a single host. Otherwise, this will default '
             'to "ray" if Ray is installed and fail otherwise. Note that tpu '
-            'and hpu only support Ray for distributed inference.')
+            'only supports Ray for distributed inference.')
 
         parser.add_argument(
             '--worker-use-ray',
diff --git a/vllm/platforms/hpu.py b/vllm/platforms/hpu.py
index 242c2c127979a..a32c262c84efa 100644
--- a/vllm/platforms/hpu.py
+++ b/vllm/platforms/hpu.py
@@ -1,7 +1,9 @@
+import os
 from typing import TYPE_CHECKING, Optional
 
 import torch
 
+from vllm import envs
 from vllm.logger import init_logger
 
 from .interface import Platform, PlatformEnum, _Backend
@@ -58,6 +60,22 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         cache_config = vllm_config.cache_config
         if cache_config and cache_config.block_size is None:
             cache_config.block_size = 128
+        if (parallel_config.distributed_executor_backend == 'mp'
+                and envs.VLLM_WORKER_MULTIPROC_METHOD == 'fork'):
+            if os.environ.get("VLLM_WORKER_MULTIPROC_METHOD",
+                              None) is not None:
+                logger.warning("On HPU, VLLM_WORKER_MULTIPROC_METHOD=fork "
+                               "might cause application hangs on exit. Using "
+                               "VLLM_WORKER_MULTIPROC_METHOD=fork anyway, "
+                               "as it was explicitly requested.")
+            else:
+                logger.warning(
+                    "On HPU, VLLM_WORKER_MULTIPROC_METHOD=fork "
+                    "might cause application hangs on exit. Setting "
+                    "VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. "
+                    "To override that behavior, please set "
+                    "VLLM_WORKER_MULTIPROC_METHOD=fork explicitly.")
+                os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
 
     @classmethod
     def is_pin_memory_available(cls):
diff --git a/vllm/worker/hpu_worker.py b/vllm/worker/hpu_worker.py
index 9401241073c7d..3c570212625c4 100644
--- a/vllm/worker/hpu_worker.py
+++ b/vllm/worker/hpu_worker.py
@@ -130,7 +130,6 @@ def execute_model(
         self,
         execute_model_req: Optional[ExecuteModelRequest] = None,
     ) -> Optional[List[SamplerOutput]]:
-        assert execute_model_req is not None
         # VLLM_HPU_LOG_STEP_GRAPH_COMPILATION     - will log graph compilations per engine step, only when there was any - highly recommended to use alongside PT_HPU_METRICS_GC_DETAILS! # noqa:E501
         # VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL - will log graph compilations per engine step, always, even if there were none # noqa:E501
         # VLLM_HPU_LOG_STEP_CPU_FALLBACKS         - will log cpu fallbacks per engine step, only when there was any # noqa:E501
@@ -144,7 +143,8 @@ def execute_model(
             'VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL', '0') != '0'
         log_cpu_fallbacks = os.environ.get('VLLM_HPU_LOG_STEP_CPU_FALLBACKS',
                                            '0') != '0' or log_cpu_fallbacks_all
-        if log_graph_compilation or log_cpu_fallbacks:
+        if (log_graph_compilation or log_cpu_fallbacks) and \
+            execute_model_req is not None:
             from habana_frameworks.torch.hpu.metrics import metric_localcontext
             seq_group_metadata_list = execute_model_req.seq_group_metadata_list
             is_prompt = any([

From 7206ce4ce112ed117796a59045c968a6d353f691 Mon Sep 17 00:00:00 2001
From: Cody Yu <hao.yu.cody@gmail.com>
Date: Wed, 22 Jan 2025 10:52:27 -0800
Subject: [PATCH 337/462] [Core] Support `reset_prefix_cache` (#12284)

---
 tests/core/block/test_prefix_caching_block.py | 38 ++++++++++++++++
 tests/v1/core/test_prefix_caching.py          | 39 ++++++++++++++++
 vllm/core/block/cpu_gpu_block_allocator.py    |  7 +++
 vllm/core/block/interfaces.py                 | 10 +++++
 vllm/core/block/naive_block.py                | 19 +++++---
 vllm/core/block/prefix_caching_block.py       | 44 ++++++++++++++++++-
 vllm/core/block_manager.py                    |  3 ++
 vllm/core/interfaces.py                       |  5 +++
 vllm/core/placeholder_block_space_manager.py  |  3 ++
 vllm/core/scheduler.py                        |  3 ++
 vllm/engine/async_llm_engine.py               |  3 ++
 vllm/engine/llm_engine.py                     |  8 ++++
 vllm/engine/multiprocessing/__init__.py       |  7 ++-
 vllm/engine/multiprocessing/client.py         | 12 ++++-
 vllm/engine/multiprocessing/engine.py         | 10 ++++-
 vllm/engine/protocol.py                       |  5 +++
 vllm/entrypoints/llm.py                       |  4 ++
 vllm/entrypoints/openai/api_server.py         | 12 +++++
 vllm/envs.py                                  |  7 +++
 vllm/executor/executor_base.py                |  5 ---
 vllm/v1/core/kv_cache_manager.py              | 27 ++++++++++++
 vllm/v1/core/scheduler.py                     |  3 ++
 vllm/v1/engine/__init__.py                    |  9 +++-
 vllm/v1/engine/async_llm.py                   |  3 ++
 vllm/v1/engine/core.py                        | 11 ++++-
 vllm/v1/engine/core_client.py                 | 21 ++++++++-
 vllm/v1/engine/llm_engine.py                  |  3 ++
 27 files changed, 300 insertions(+), 21 deletions(-)

diff --git a/tests/core/block/test_prefix_caching_block.py b/tests/core/block/test_prefix_caching_block.py
index 29ac3a3c86cb4..6642174c17d8b 100644
--- a/tests/core/block/test_prefix_caching_block.py
+++ b/tests/core/block/test_prefix_caching_block.py
@@ -796,6 +796,44 @@ def test_find_cached_blocks_prefix():
             block_hashes=block_hashes_seq1)
         assert len(cached_blocks) == len(blocks_seq1) - num_evicted_blocks
 
+    # Test reset prefix cache
+    @staticmethod
+    @pytest.mark.parametrize("num_blocks", [10])
+    @pytest.mark.parametrize("block_size", [16])
+    def test_reset_prefix_cache(num_blocks: int, block_size: int):
+        """This test case simulates the case of resetting the prefix cache."""
+
+        allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
+                                                block_size=block_size)
+        token_ids = list(range(3 * block_size))
+
+        first_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
+            block_size=block_size,
+            token_ids=token_ids,
+            allocator=allocator,
+        )
+        second_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
+            block_size=block_size,
+            token_ids=token_ids,
+            allocator=allocator,
+        )
+
+        # Free each block in the first chain.
+        for block in first_chain:
+            allocator.free(block)
+
+        # Failed to reset prefix cache because some blocks are not freed yet.
+        assert not allocator.reset_prefix_cache()
+        assert allocator.get_prefix_cache_hit_rate() > 0.0
+
+        # Free each block in the second chain.
+        for block in second_chain:
+            allocator.free(block)
+
+        # Reset prefix cache.
+        assert allocator.reset_prefix_cache()
+        assert allocator.get_prefix_cache_hit_rate() == 0.0
+
     @staticmethod
     def create_immutable_chain(
         block_size: int,
diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py
index fafd9d0ce4455..c5860809f9e62 100644
--- a/tests/v1/core/test_prefix_caching.py
+++ b/tests/v1/core/test_prefix_caching.py
@@ -587,3 +587,42 @@ def test_prefill_not_enough_free_blocks_with_computed_blocks():
     assert {block.ref_cnt for block in block_part1[:3]} == {1}
     # Block 3-5 are free.
     assert {block.ref_cnt for block in block_part1[3:]} == {0}
+
+
+def test_reset_prefix_cache():
+    manager = KVCacheManager(
+        block_size=16,
+        num_gpu_blocks=10,
+        max_model_len=8192,
+        sliding_window=None,
+        enable_caching=True,
+        num_preallocate_tokens=0,
+    )
+
+    full_block_token_ids = [i for i in range(3) for _ in range(16)]
+    unique_token_ids = [3] * 7
+    all_token_ids = full_block_token_ids + unique_token_ids
+    req0 = make_request("0", all_token_ids)
+    blocks = manager.allocate_slots(req0, 55, [])
+    assert [b.block_id for b in blocks] == [0, 1, 2, 3]
+
+    unique_token_ids = [4] * 7
+    all_token_ids = full_block_token_ids + unique_token_ids
+    req1 = make_request("1", all_token_ids)
+    computed_blocks, _ = manager.get_computed_blocks(req1)
+    assert len(req1.kv_block_hashes) == 3
+    assert len(computed_blocks) == 3
+    blocks = manager.allocate_slots(req1, 7, computed_blocks)
+    assert [b.block_id for b in blocks] == [4]
+
+    # Failed to reset prefix cache because some blocks are not freed yet.
+    assert not manager.reset_prefix_cache()
+    assert manager.cached_block_hash_to_block
+
+    # Free the blocks.
+    manager.free(req0)
+    manager.free(req1)
+
+    assert manager.reset_prefix_cache()
+    assert not manager.cached_block_hash_to_block
+    assert all([blk.block_hash is None for blk in manager.block_pool])
diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py
index 3a57487a6cd8a..c3e1665b4464e 100644
--- a/vllm/core/block/cpu_gpu_block_allocator.py
+++ b/vllm/core/block/cpu_gpu_block_allocator.py
@@ -339,6 +339,13 @@ def get_prefix_cache_hit_rate(self, device: Device) -> float:
         assert device in self._allocators
         return self._allocators[device].get_prefix_cache_hit_rate()
 
+    def reset_prefix_cache(self) -> bool:
+        """Reset prefix cache for all devices."""
+        success = True
+        for allocator in self._allocators.values():
+            success = success and allocator.reset_prefix_cache()
+        return success
+
     def get_and_reset_swaps(self) -> List[Tuple[int, int]]:
         """Returns and clears the mapping of source to destination block IDs.
         Will be called after every swapping operations for now, and after every
diff --git a/vllm/core/block/interfaces.py b/vllm/core/block/interfaces.py
index 985a1098b6cd1..cb432db919c73 100644
--- a/vllm/core/block/interfaces.py
+++ b/vllm/core/block/interfaces.py
@@ -192,6 +192,11 @@ def get_prefix_cache_hit_rate(self) -> float:
         """Prefix cache hit rate. -1 means not supported or disabled."""
         pass
 
+    @abstractmethod
+    def reset_prefix_cache(self) -> bool:
+        """Reset prefix cache."""
+        pass
+
     class NoFreeBlocksError(ValueError):
         pass
 
@@ -297,6 +302,11 @@ def get_prefix_cache_hit_rate(self, device: Device) -> float:
         """Prefix cache hit rate. -1 means not supported or disabled."""
         pass
 
+    @abstractmethod
+    def reset_prefix_cache(self) -> bool:
+        """Reset prefix cache."""
+        pass
+
     @abstractmethod
     def find_cached_blocks_prefix(
         self,
diff --git a/vllm/core/block/naive_block.py b/vllm/core/block/naive_block.py
index 9b94918ab38ef..c38ae2dd6761b 100644
--- a/vllm/core/block/naive_block.py
+++ b/vllm/core/block/naive_block.py
@@ -1,5 +1,5 @@
 from collections import deque
-from typing import Deque, FrozenSet, Iterable, List, Optional, Tuple
+from typing import Deque, FrozenSet, Iterable, List, Optional, Tuple, Union
 
 from vllm.core.block.common import (BlockPool, CopyOnWriteTracker, RefCounter,
                                     get_all_blocks_recursively)
@@ -136,16 +136,18 @@ def _allocate_block_id(self) -> BlockId:
         self._refcounter.incr(block_id)
         return block_id
 
-    def _free_block_id(self, block: Block) -> None:
-        block_id = block.block_id
+    def _free_block_id(self, block: Union[Block, BlockId]) -> None:
+        if isinstance(block, Block):
+            block_id = block.block_id
+            block.block_id = None
+        else:
+            block_id = block
         assert block_id is not None
 
         refcount = self._refcounter.decr(block_id)
         if refcount == 0:
             self._free_block_indices.appendleft(block_id)
 
-        block.block_id = None
-
     def free(self, block: Block, keep_block_object: bool = False) -> None:
         # Release the physical block id
         self._free_block_id(block)
@@ -154,6 +156,9 @@ def free(self, block: Block, keep_block_object: bool = False) -> None:
         if not keep_block_object:
             self._block_pool.free_block(block)
 
+    def free_block_id(self, block_id: BlockId) -> None:
+        self._free_block_id(block_id)
+
     def fork(self, last_block: Block) -> List[Block]:
         """Creates a new sequence of blocks that shares the same underlying
         memory as the original sequence.
@@ -325,6 +330,10 @@ def swap_in(self, blocks: List[Block]) -> None:
     def get_prefix_cache_hit_rate(self) -> float:
         return -1
 
+    def reset_prefix_cache(self) -> bool:
+        """No prefix cache for naive block allocator."""
+        return True
+
     def find_cached_blocks_prefix(self, block_hashes: List[int]) -> List[int]:
         # Not applicable for naive block allocator.
         return []
diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py
index 1238303234deb..ccdc5daa9595c 100644
--- a/vllm/core/block/prefix_caching_block.py
+++ b/vllm/core/block/prefix_caching_block.py
@@ -12,6 +12,7 @@
 from vllm.core.block.naive_block import (BlockPool, NaiveBlock,
                                          NaiveBlockAllocator)
 from vllm.core.evictor import EvictionPolicy, Evictor, make_evictor
+from vllm.logger import init_logger
 from vllm.sequence import Sequence
 
 PrefixHash = int
@@ -21,6 +22,8 @@
 # then we know this block hasn't been accessed yet.
 _DEFAULT_LAST_ACCESSED_TIME = -1
 
+logger = init_logger(__name__)
+
 
 class BlockTracker:
     """Used to track the status of a block inside the prefix caching allocator
@@ -105,7 +108,8 @@ def __init__(
 
         # Evitor used to maintain how we want to handle those computed blocks
         # if we find memory pressure is high.
-        self.evictor: Evictor = make_evictor(eviction_policy)
+        self.eviction_policy = eviction_policy
+        self.evictor: Evictor = make_evictor(self.eviction_policy)
 
         # We share the refcounter between allocators. This allows us to promote
         # blocks originally allocated in the hashless allocator to immutable
@@ -428,6 +432,44 @@ def all_block_ids(self) -> FrozenSet[int]:
     def get_prefix_cache_hit_rate(self) -> float:
         return self.metric_data.get_hit_rate()
 
+    def reset_prefix_cache(self) -> bool:
+        """Reset prefix cache. This function may be used in RLHF
+        flows to invalid prefix caching after the weights are updated,
+        or used for resetting prefix caching status for benchmarking.
+
+        Returns:
+            bool: True if the prefix cache is successfully reset,
+            False otherwise.
+        """
+        num_used_blocks = (self.get_num_total_blocks() -
+                           self.get_num_free_blocks())
+        if num_used_blocks > 0:
+            logger.warning(
+                "Failed to reset prefix cache because some "
+                "blocks (%d) are not freed yet", num_used_blocks)
+            return False
+
+        # Free all blocks in the evictor.
+        while (block_id :=
+               self._maybe_allocate_evicted_block_id()) is not None:
+            self._hashless_allocator.free_block_id(block_id)
+
+        # Should not have any cached blocks because all blocks are evicted.
+        assert not self._cached_blocks
+
+        # Reset the evictor.
+        self.evictor = make_evictor(self.eviction_policy)
+
+        # Reset the block tracker.
+        for block_id in self._block_tracker:
+            self._block_tracker[block_id] = BlockTracker()
+
+        # Reset the metrics.
+        self.metric_data = CacheMetricData()
+
+        logger.info("Successfully reset prefix cache")
+        return True
+
     def is_block_cached(self, block: Block) -> bool:
         assert block.content_hash is not None
         return block.content_hash in self._cached_blocks
diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py
index b41e848221882..62a5f0bda061a 100644
--- a/vllm/core/block_manager.py
+++ b/vllm/core/block_manager.py
@@ -455,6 +455,9 @@ def get_num_free_cpu_blocks(self) -> int:
     def get_prefix_cache_hit_rate(self, device: Device) -> float:
         return self.block_allocator.get_prefix_cache_hit_rate(device)
 
+    def reset_prefix_cache(self) -> bool:
+        return self.block_allocator.reset_prefix_cache()
+
     def _can_swap(self,
                   seq_group: SequenceGroup,
                   device: Device,
diff --git a/vllm/core/interfaces.py b/vllm/core/interfaces.py
index b10b8d3f4a5bf..9c7e246e3c4ed 100644
--- a/vllm/core/interfaces.py
+++ b/vllm/core/interfaces.py
@@ -122,6 +122,11 @@ def get_prefix_cache_hit_rate(self, device: Device) -> float:
         """Prefix cache hit rate. -1 means not supported or disabled."""
         pass
 
+    @abstractmethod
+    def reset_prefix_cache(self) -> bool:
+        """Reset prefix cache for all devices."""
+        pass
+
     @abstractmethod
     def get_num_cached_tokens(self, seq: Sequence) -> int:
         pass
diff --git a/vllm/core/placeholder_block_space_manager.py b/vllm/core/placeholder_block_space_manager.py
index a47e594518534..f9924be4a3835 100644
--- a/vllm/core/placeholder_block_space_manager.py
+++ b/vllm/core/placeholder_block_space_manager.py
@@ -90,5 +90,8 @@ def mark_blocks_as_computed(self, seq_group: SequenceGroup,
     def get_prefix_cache_hit_rate(self, device: Device) -> float:
         return -1
 
+    def reset_prefix_cache(self) -> bool:
+        return True
+
     def get_num_cached_tokens(self, seq: Sequence) -> int:
         return 0
diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py
index b3d396f9cedda..b1630b34947bd 100644
--- a/vllm/core/scheduler.py
+++ b/vllm/core/scheduler.py
@@ -504,6 +504,9 @@ def has_unfinished_seqs(self) -> bool:
     def get_prefix_cache_hit_rate(self, device: Device) -> float:
         return self.block_manager.get_prefix_cache_hit_rate(device)
 
+    def reset_prefix_cache(self) -> bool:
+        return self.block_manager.reset_prefix_cache()
+
     def get_num_unfinished_seq_groups(self) -> int:
         return len(self.waiting) + len(self.running) + len(self.swapped)
 
diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index 08fef8250d483..739ea06ae3818 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -1182,6 +1182,9 @@ async def start_profile(self) -> None:
     async def stop_profile(self) -> None:
         self.engine.stop_profile()
 
+    async def reset_prefix_cache(self) -> None:
+        self.engine.reset_prefix_cache()
+
     async def add_lora(self, lora_request: LoRARequest) -> None:
         self.engine.add_lora(lora_request)
 
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 5271027d41863..7da18d5f7d2eb 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -914,6 +914,14 @@ def has_unfinished_requests_for_virtual_engine(
         """
         return self.scheduler[virtual_engine].has_unfinished_seqs()
 
+    def reset_prefix_cache(self) -> bool:
+        """Reset prefix cache for all devices."""
+
+        success = True
+        for scheduler in self.scheduler:
+            success = success and scheduler.reset_prefix_cache()
+        return success
+
     @staticmethod
     def _process_sequence_group_outputs(
         seq_group: SequenceGroup,
diff --git a/vllm/engine/multiprocessing/__init__.py b/vllm/engine/multiprocessing/__init__.py
index 7132f9840001a..d9703b820a779 100644
--- a/vllm/engine/multiprocessing/__init__.py
+++ b/vllm/engine/multiprocessing/__init__.py
@@ -121,6 +121,10 @@ class RPCUProfileRequest(Enum):
     STOP_PROFILE = 2
 
 
+class RPCResetPrefixCacheRequest(Enum):
+    RESET_PREFIX_CACHE = 1
+
+
 @dataclass
 class RPCLoadAdapterRequest:
     lora_request: LoRARequest
@@ -134,7 +138,8 @@ class RPCAdapterLoadedResponse:
 
 
 RPC_REQUEST_T = Union[RPCProcessRequest, RPCAbortRequest, RPCStartupRequest,
-                      RPCUProfileRequest, RPCLoadAdapterRequest]
+                      RPCUProfileRequest, RPCLoadAdapterRequest,
+                      RPCResetPrefixCacheRequest]
 
 REQUEST_OUTPUTS_T = Union[List[RequestOutput], RPCAdapterLoadedResponse,
                           RPCError]
diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py
index 74b98d06c509a..5237f63c34c01 100644
--- a/vllm/engine/multiprocessing/client.py
+++ b/vllm/engine/multiprocessing/client.py
@@ -27,8 +27,9 @@
                                          VLLM_RPC_SUCCESS_STR, RPCAbortRequest,
                                          RPCAdapterLoadedResponse, RPCError,
                                          RPCLoadAdapterRequest,
-                                         RPCProcessRequest, RPCStartupRequest,
-                                         RPCStartupResponse,
+                                         RPCProcessRequest,
+                                         RPCResetPrefixCacheRequest,
+                                         RPCStartupRequest, RPCStartupResponse,
                                          RPCUProfileRequest)
 from vllm.engine.protocol import EngineClient
 # yapf: enable
@@ -675,6 +676,13 @@ async def stop_profile(self) -> None:
         await self._send_one_way_rpc_request(
             request=RPCUProfileRequest.STOP_PROFILE, socket=self.input_socket)
 
+    async def reset_prefix_cache(self) -> None:
+        """Reset the prefix cache"""
+
+        await self._send_one_way_rpc_request(
+            request=RPCResetPrefixCacheRequest.RESET_PREFIX_CACHE,
+            socket=self.input_socket)
+
     async def add_lora(self, lora_request: LoRARequest) -> None:
         """Load a new LoRA adapter into the engine for future requests."""
         # Uses the same I/O as generate requests
diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py
index 3aa9d30549f36..166f89743b3cd 100644
--- a/vllm/engine/multiprocessing/engine.py
+++ b/vllm/engine/multiprocessing/engine.py
@@ -16,8 +16,9 @@
                                          VLLM_RPC_SUCCESS_STR, RPCAbortRequest,
                                          RPCAdapterLoadedResponse, RPCError,
                                          RPCLoadAdapterRequest,
-                                         RPCProcessRequest, RPCStartupRequest,
-                                         RPCStartupResponse,
+                                         RPCProcessRequest,
+                                         RPCResetPrefixCacheRequest,
+                                         RPCStartupRequest, RPCStartupResponse,
                                          RPCUProfileRequest)
 # yapf: enable
 from vllm.logger import init_logger
@@ -237,6 +238,8 @@ def handle_new_input(self):
                         self.stop_profile()
                 elif isinstance(request, RPCLoadAdapterRequest):
                     self._handle_load_adapter_request(request)
+                elif isinstance(request, RPCResetPrefixCacheRequest):
+                    self.reset_prefix_cache()
                 else:
                     raise ValueError("Unknown RPCRequest Type: "
                                      f"{type(request)}")
@@ -361,6 +364,9 @@ def start_profile(self) -> None:
     def stop_profile(self) -> None:
         self.engine.stop_profile()
 
+    def reset_prefix_cache(self) -> bool:
+        return self.engine.reset_prefix_cache()
+
 
 def signal_handler(*_) -> None:
     raise KeyboardInterrupt("MQLLMEngine terminated")
diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py
index f05ff62c4766b..de7b2c1b91f50 100644
--- a/vllm/engine/protocol.py
+++ b/vllm/engine/protocol.py
@@ -271,6 +271,11 @@ async def stop_profile(self) -> None:
         """Start profiling the engine"""
         ...
 
+    @abstractmethod
+    async def reset_prefix_cache(self) -> None:
+        """Reset the prefix cache"""
+        ...
+
     @abstractmethod
     async def add_lora(self, lora_request: LoRARequest) -> None:
         """Load a new LoRA adapter into the engine for future requests."""
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 04056f37f851b..563031cfadc4c 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -1132,6 +1132,9 @@ def start_profile(self) -> None:
     def stop_profile(self) -> None:
         self.llm_engine.stop_profile()
 
+    def reset_prefix_cache(self) -> bool:
+        return self.llm_engine.reset_prefix_cache()
+
     def sleep(self, level: int = 1):
         """
         Put the engine to sleep. The engine should not process any requests.
@@ -1150,6 +1153,7 @@ def sleep(self, level: int = 1):
             where previous model weights are not needed. It reduces CPU memory 
             pressure.
         """
+        self.reset_prefix_cache()
         self.llm_engine.sleep(level=level)
 
     def wake_up(self):
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 1aeefe86cd05e..9bb11907f7402 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -518,6 +518,18 @@ async def create_score_v1(request: ScoreRequest, raw_request: Request):
     },
 }
 
+if envs.VLLM_SERVER_DEV_MODE:
+
+    @router.post("/reset_prefix_cache")
+    async def reset_prefix_cache(raw_request: Request):
+        """
+        Reset the prefix cache. Note that we currently do not check if the
+        prefix cache is successfully reset in the API server.
+        """
+        logger.info("Resetting prefix cache...")
+        await engine_client(raw_request).reset_prefix_cache()
+        return Response(status_code=200)
+
 
 @router.post("/invocations")
 async def invocations(raw_request: Request):
diff --git a/vllm/envs.py b/vllm/envs.py
index b7b597ea15af3..1e68326b2d908 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -72,6 +72,7 @@
     VLLM_ENABLE_V1_MULTIPROCESSING: bool = True
     VLLM_LOG_BATCHSIZE_INTERVAL: float = -1
     VLLM_DISABLE_COMPILE_CACHE: bool = False
+    VLLM_SERVER_DEV_MODE: bool = False
 
 
 def get_default_cache_root():
@@ -467,6 +468,12 @@ def get_default_config_root():
     lambda: float(os.getenv("VLLM_LOG_BATCHSIZE_INTERVAL", "-1")),
     "VLLM_DISABLE_COMPILE_CACHE":
     lambda: bool(int(os.getenv("VLLM_DISABLE_COMPILE_CACHE", "0"))),
+
+    # If set, vllm will run in development mode, which will enable
+    # some additional endpoints for developing and debugging,
+    # e.g. `/reset_prefix_cache`
+    "VLLM_SERVER_DEV_MODE":
+    lambda: bool(int(os.getenv("VLLM_SERVER_DEV_MODE", "0"))),
 }
 
 # end-env-vars-definition
diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py
index 069be05eafa50..6be62d4068572 100644
--- a/vllm/executor/executor_base.py
+++ b/vllm/executor/executor_base.py
@@ -194,11 +194,6 @@ def stop_profile(self) -> None:
         self.collective_rpc("stop_profile")
 
     def sleep(self, level: int = 1):
-        if self.cache_config.enable_prefix_caching:
-            # TODO: support sleep with prefix caching
-            # by resetting the prefix cache state,
-            # after https://github.com/vllm-project/vllm/pull/12284
-            raise ValueError("Cannot sleep when prefix caching is enabled.")
         self.collective_rpc("sleep", kwargs=dict(level=level))
 
     def wake_up(self):
diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py
index bac77443c8560..8c8c8b3b55c0b 100644
--- a/vllm/v1/core/kv_cache_manager.py
+++ b/vllm/v1/core/kv_cache_manager.py
@@ -285,6 +285,33 @@ def free(self, request: Request) -> None:
             if block.ref_cnt == 0:
                 self.free_block_queue.append(block)
 
+    def reset_prefix_cache(self) -> bool:
+        """Reset prefix cache. This function may be used in RLHF
+        flows to invalid prefix caching after the weights are updated,
+        or used for resetting prefix caching status for benchmarking.
+
+        Returns:
+            bool: True if the prefix cache is successfully reset,
+            False otherwise.
+        """
+        num_used_blocks = (self.num_gpu_blocks -
+                           self.free_block_queue.num_free_blocks)
+        if num_used_blocks > 0:
+            logger.warning(
+                "Failed to reset prefix cache because some "
+                "blocks (%d) are not freed yet", num_used_blocks)
+            return False
+
+        # Remove all hashes so that no new blocks will hit.
+        self.cached_block_hash_to_block = defaultdict(dict)
+
+        # Remove all hashes from all blocks.
+        for block in self.block_pool:
+            block.reset_hash()
+
+        logger.info("Successfully reset prefix cache")
+        return True
+
     def get_num_common_prefix_blocks(
         self,
         request: Request,
diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py
index 64df21d59fef4..8ded5e5787133 100644
--- a/vllm/v1/core/scheduler.py
+++ b/vllm/v1/core/scheduler.py
@@ -529,6 +529,9 @@ def get_num_unfinished_requests(self) -> int:
     def has_unfinished_requests(self) -> bool:
         return self.get_num_unfinished_requests() > 0
 
+    def reset_prefix_cache(self) -> bool:
+        return self.kv_cache_manager.reset_prefix_cache()
+
     def make_stats(self) -> SchedulerStats:
         return SchedulerStats(
             num_running_reqs=len(self.running),
diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py
index 6d90c38c72cf5..abe4952c4baff 100644
--- a/vllm/v1/engine/__init__.py
+++ b/vllm/v1/engine/__init__.py
@@ -66,6 +66,11 @@ class EngineCoreProfile:
     is_start: bool
 
 
+@dataclass
+class EngineCoreResetPrefixCache:
+    pass
+
+
 class EngineCoreRequestType(enum.Enum):
     """
     Request types defined as hex byte strings, so it can be sent over sockets
@@ -74,6 +79,8 @@ class EngineCoreRequestType(enum.Enum):
     ADD = b'\x00'
     ABORT = b'\x01'
     PROFILE = b'\x02'
+    RESET_PREFIX_CACHE = b'\x03'
 
 
-EngineCoreRequestUnion = Union[EngineCoreRequest, EngineCoreProfile, List[str]]
+EngineCoreRequestUnion = Union[EngineCoreRequest, EngineCoreProfile,
+                               EngineCoreResetPrefixCache, List[str]]
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index a74699f7513e6..b4d3e441173df 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -321,6 +321,9 @@ async def start_profile(self) -> None:
     async def stop_profile(self) -> None:
         await self.engine_core.profile_async(False)
 
+    async def reset_prefix_cache(self) -> None:
+        await self.engine_core.reset_prefix_cache_async()
+
     @property
     def is_running(self) -> bool:
         return True
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index 26ebc7edcf03e..cf94033a38d96 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -20,7 +20,7 @@
 from vllm.v1.core.scheduler import Scheduler
 from vllm.v1.engine import (EngineCoreOutputs, EngineCoreProfile,
                             EngineCoreRequest, EngineCoreRequestType,
-                            EngineCoreRequestUnion)
+                            EngineCoreRequestUnion, EngineCoreResetPrefixCache)
 from vllm.v1.engine.mm_input_mapper import MMInputMapperServer
 from vllm.v1.executor.abstract import Executor
 from vllm.v1.request import Request, RequestStatus
@@ -135,6 +135,9 @@ def shutdown(self):
     def profile(self, is_start: bool = True):
         self.model_executor.profile(is_start)
 
+    def reset_prefix_cache(self):
+        self.scheduler.reset_prefix_cache()
+
 
 class EngineCoreProc(EngineCore):
     """ZMQ-wrapper for running EngineCore in background process."""
@@ -247,6 +250,8 @@ def _handle_client_request(self, request: EngineCoreRequestUnion) -> None:
             self.add_request(request)
         elif isinstance(request, EngineCoreProfile):
             self.model_executor.profile(request.is_start)
+        elif isinstance(request, EngineCoreResetPrefixCache):
+            self.reset_prefix_cache()
         else:
             # TODO: make an EngineCoreAbort wrapper
             assert isinstance(request, list)
@@ -271,7 +276,9 @@ def process_input_socket(self, input_path: str):
                     request = decoder_add_req.decode(request_data)
                 elif request_type == EngineCoreRequestType.ABORT.value:
                     request = decoder_abort_req.decode(request_data)
-                elif request_type == EngineCoreRequestType.PROFILE.value:
+                elif request_type in (
+                        EngineCoreRequestType.PROFILE.value,
+                        EngineCoreRequestType.RESET_PREFIX_CACHE.value):
                     request = pickle.loads(request_data)
                 else:
                     raise ValueError(f"Unknown RequestType: {request_type}")
diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py
index ac0f0f14bf1ab..19b89003cc69d 100644
--- a/vllm/v1/engine/core_client.py
+++ b/vllm/v1/engine/core_client.py
@@ -14,7 +14,7 @@
                         make_zmq_socket)
 from vllm.v1.engine import (EngineCoreOutputs, EngineCoreProfile,
                             EngineCoreRequest, EngineCoreRequestType,
-                            EngineCoreRequestUnion)
+                            EngineCoreRequestUnion, EngineCoreResetPrefixCache)
 from vllm.v1.engine.core import EngineCore, EngineCoreProc
 from vllm.v1.executor.abstract import Executor
 from vllm.v1.serial_utils import PickleEncoder
@@ -69,6 +69,9 @@ def add_request(self, request: EngineCoreRequest) -> None:
     def profile(self, is_start: bool = True) -> None:
         raise NotImplementedError
 
+    def reset_prefix_cache(self) -> None:
+        raise NotImplementedError
+
     def abort_requests(self, request_ids: List[str]) -> None:
         raise NotImplementedError
 
@@ -81,6 +84,9 @@ async def add_request_async(self, request: EngineCoreRequest) -> None:
     async def profile_async(self, is_start: bool = True) -> None:
         raise NotImplementedError
 
+    async def reset_prefix_cache_async(self) -> None:
+        raise NotImplementedError
+
     async def abort_requests_async(self, request_ids: List[str]) -> None:
         raise NotImplementedError
 
@@ -108,12 +114,15 @@ def abort_requests(self, request_ids: List[str]) -> None:
         if len(request_ids) > 0:
             self.engine_core.abort_requests(request_ids)
 
-    def shutdown(self):
+    def shutdown(self) -> None:
         self.engine_core.shutdown()
 
     def profile(self, is_start: bool = True) -> None:
         self.engine_core.profile(is_start)
 
+    def reset_prefix_cache(self) -> None:
+        self.engine_core.reset_prefix_cache()
+
 
 class MPClient(EngineCoreClient):
     """
@@ -229,6 +238,10 @@ def profile(self, is_start: bool = True) -> None:
         self._send_input(EngineCoreRequestType.PROFILE,
                          EngineCoreProfile(is_start))
 
+    def reset_prefix_cache(self) -> None:
+        self._send_input(EngineCoreRequestType.RESET_PREFIX_CACHE,
+                         EngineCoreResetPrefixCache())
+
 
 class AsyncMPClient(MPClient):
     """Asyncio-compatible client for multi-proc EngineCore."""
@@ -266,3 +279,7 @@ async def abort_requests_async(self, request_ids: List[str]) -> None:
     async def profile_async(self, is_start: bool = True) -> None:
         await self._send_input(EngineCoreRequestType.PROFILE,
                                EngineCoreProfile(is_start))
+
+    async def reset_prefix_cache_async(self) -> None:
+        await self._send_input(EngineCoreRequestType.RESET_PREFIX_CACHE,
+                               EngineCoreResetPrefixCache())
diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py
index f5999ccda6447..55d314ebeb955 100644
--- a/vllm/v1/engine/llm_engine.py
+++ b/vllm/v1/engine/llm_engine.py
@@ -162,6 +162,9 @@ def start_profile(self):
     def stop_profile(self):
         self.engine_core.profile(False)
 
+    def reset_prefix_cache(self):
+        self.engine_core.reset_prefix_cache()
+
     def get_tokenizer_group(
         self,
         group_type: Type[_G] = BaseTokenizerGroup,

From aea94362c9bdd08ed2b346701bdc09d278e85f66 Mon Sep 17 00:00:00 2001
From: Nick Hill <nickhill@us.ibm.com>
Date: Wed, 22 Jan 2025 14:22:12 -0800
Subject: [PATCH 338/462] [Frontend][V1] Online serving performance
 improvements (#12287)

---
 vllm/entrypoints/openai/api_server.py |  6 +++
 vllm/entrypoints/openai/protocol.py   | 30 +++++++++------
 vllm/envs.py                          | 11 ++++++
 vllm/v1/engine/async_llm.py           | 54 ++++++++++++++++++---------
 vllm/v1/engine/core_client.py         | 21 +++++++++--
 vllm/v1/engine/output_processor.py    |  6 ++-
 vllm/v1/request.py                    | 18 +++------
 7 files changed, 101 insertions(+), 45 deletions(-)

diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 9bb11907f7402..f510c41503011 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -1,5 +1,6 @@
 import asyncio
 import atexit
+import gc
 import importlib
 import inspect
 import multiprocessing
@@ -104,6 +105,11 @@ async def _force_log():
             task.add_done_callback(_running_tasks.remove)
         else:
             task = None
+
+        # Mark the startup heap as static so that it's ignored by GC.
+        # Reduces pause times of oldest generation collections.
+        gc.collect()
+        gc.freeze()
         try:
             yield
         finally:
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index 14e41346df775..80403f77d5375 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -3,7 +3,7 @@
 import re
 import time
 from argparse import Namespace
-from typing import Any, Dict, List, Literal, Optional, Union
+from typing import Any, ClassVar, Dict, List, Literal, Optional, Set, Union
 
 import torch
 from pydantic import BaseModel, ConfigDict, Field, model_validator
@@ -42,23 +42,31 @@ class OpenAIBaseModel(BaseModel):
     # OpenAI API does allow extra fields
     model_config = ConfigDict(extra="allow")
 
+    # Cache class field names
+    field_names: ClassVar[Optional[Set[str]]] = None
+
     @model_validator(mode="before")
     @classmethod
     def __log_extra_fields__(cls, data):
-        if isinstance(data, dict):
+
+        field_names = cls.field_names
+        if field_names is None:
+            if not isinstance(data, dict):
+                return data
             # Get all class field names and their potential aliases
             field_names = set()
             for field_name, field in cls.model_fields.items():
                 field_names.add(field_name)
-                if hasattr(field, 'alias') and field.alias:
-                    field_names.add(field.alias)
-
-            # Compare against both field names and aliases
-            extra_fields = data.keys() - field_names
-            if extra_fields:
-                logger.warning(
-                    "The following fields were present in the request "
-                    "but ignored: %s", extra_fields)
+                if alias := getattr(field, 'alias', None):
+                    field_names.add(alias)
+            cls.field_names = field_names
+
+        # Compare against both field names and aliases
+        if any(k not in field_names for k in data):
+            logger.warning(
+                "The following fields were present in the request "
+                "but ignored: %s",
+                data.keys() - field_names)
         return data
 
 
diff --git a/vllm/envs.py b/vllm/envs.py
index 1e68326b2d908..3a15e00e7b50a 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -73,6 +73,7 @@
     VLLM_LOG_BATCHSIZE_INTERVAL: float = -1
     VLLM_DISABLE_COMPILE_CACHE: bool = False
     VLLM_SERVER_DEV_MODE: bool = False
+    VLLM_V1_OUTPUT_PROC_CHUNK_SIZE: int = 128
 
 
 def get_default_cache_root():
@@ -474,6 +475,16 @@ def get_default_config_root():
     # e.g. `/reset_prefix_cache`
     "VLLM_SERVER_DEV_MODE":
     lambda: bool(int(os.getenv("VLLM_SERVER_DEV_MODE", "0"))),
+
+    # Controls the maximum number of requests to handle in a
+    # single asyncio task when processing per-token outputs in the
+    # V1 AsyncLLM interface. It is applicable when handling a high
+    # concurrency of streaming requests.
+    # Setting this too high can result in a higher variance of
+    # inter-message latencies. Setting it too low can negatively impact
+    # TTFT and overall throughput.
+    "VLLM_V1_OUTPUT_PROC_CHUNK_SIZE":
+    lambda: int(os.getenv("VLLM_V1_OUTPUT_PROC_CHUNK_SIZE", "128")),
 }
 
 # end-env-vars-definition
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index b4d3e441173df..1505b62504a2f 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -2,9 +2,12 @@
 import os
 from typing import AsyncGenerator, List, Mapping, Optional, Type, Union
 
+import numpy as np
+
 from vllm.config import ModelConfig, VllmConfig
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.protocol import EngineClient
+from vllm.envs import VLLM_V1_OUTPUT_PROC_CHUNK_SIZE
 from vllm.inputs import INPUT_REGISTRY, InputRegistry, PromptType
 from vllm.inputs.preprocess import InputPreprocessor
 from vllm.logger import init_logger
@@ -16,7 +19,7 @@
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
 from vllm.usage.usage_lib import UsageContext
-from vllm.utils import kill_process_tree
+from vllm.utils import cdiv, kill_process_tree
 from vllm.v1.engine.core_client import EngineCoreClient
 from vllm.v1.engine.output_processor import OutputProcessor
 from vllm.v1.engine.processor import Processor
@@ -205,17 +208,15 @@ async def generate(
 
             # The output_handler task pushes items into the queue.
             # This task pulls from the queue and yields to caller.
-            while True:
+            finished = False
+            while not finished:
                 # Note: drain queue without await if possible (avoids
                 # task switching under load which helps performance).
-                out = q.get_nowait() if q.qsize() > 0 else await q.get()
+                out = q.get_nowait() if not q.empty() else await q.get()
 
                 # Note: both OutputProcessor and EngineCore handle their
                 # own request cleanup based on finished.
-                if out.finished:
-                    yield out
-                    break
-
+                finished = out.finished
                 yield out
 
         # If the request is disconnected by the client, the
@@ -233,22 +234,41 @@ async def _run_output_handler(self):
                 # 1) Pull EngineCoreOutputs from the EngineCore.
                 outputs = await self.engine_core.get_output_async()
 
-                # 2) Process EngineCoreOutputs.
-                processed_outputs = self.output_processor.process_outputs(
-                    outputs.outputs)
-                # NOTE: RequestOutputs are pushed to their queues.
-                assert len(processed_outputs.request_outputs) == 0
-
-                # 3) Abort any reqs that finished due to stop strings.
-                await self.engine_core.abort_requests_async(
-                    processed_outputs.reqs_to_abort)
+                # Split outputs into chunks of at most
+                # VLLM_V1_OUTPUT_PROC_CHUNK_SIZE, so that we don't block the
+                # event loop for too long.
+                num_outputs = len(outputs.outputs)
+                if num_outputs <= VLLM_V1_OUTPUT_PROC_CHUNK_SIZE:
+                    slices = (outputs.outputs, )
+                else:
+                    slices = np.array_split(
+                        outputs.outputs,
+                        cdiv(num_outputs, VLLM_V1_OUTPUT_PROC_CHUNK_SIZE))
+
+                iteration_stats = None
+                for i, outputs_slice in enumerate(slices):
+                    # 2) Process EngineCoreOutputs.
+                    processed_outputs = self.output_processor.process_outputs(
+                        outputs_slice, iteration_stats)
+                    # NOTE: RequestOutputs are pushed to their queues.
+                    assert not processed_outputs.request_outputs
+                    iteration_stats = processed_outputs.iteration_stats
+
+                    # Allow other asyncio tasks to run between chunks
+                    if i + 1 < len(slices):
+                        await asyncio.sleep(0)
+
+                    # 3) Abort any reqs that finished due to stop strings.
+                    await self.engine_core.abort_requests_async(
+                        processed_outputs.reqs_to_abort)
 
                 # 4) Logging.
                 # TODO(rob): make into a coroutine and launch it in
                 # background thread once we add Prometheus.
+                assert iteration_stats is not None
                 self._log_stats(
                     scheduler_stats=outputs.scheduler_stats,
-                    iteration_stats=processed_outputs.iteration_stats,
+                    iteration_stats=iteration_stats,
                 )
 
         except Exception as e:
diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py
index 19b89003cc69d..f3b992d6873e7 100644
--- a/vllm/v1/engine/core_client.py
+++ b/vllm/v1/engine/core_client.py
@@ -1,8 +1,9 @@
+import asyncio
 import os
 import signal
 import weakref
 from abc import ABC, abstractmethod
-from typing import List, Type
+from typing import List, Optional, Type
 
 import msgspec
 import zmq
@@ -255,10 +256,24 @@ def __init__(self, vllm_config: VllmConfig,
             log_stats=True,
         )
 
+        self.outputs_queue: Optional[asyncio.Queue[bytes]] = None
+        self.queue_task: Optional[asyncio.Task] = None
+
     async def get_output_async(self) -> EngineCoreOutputs:
+        if self.outputs_queue is None:
+            # Perform IO in separate task to parallelize as much as possible
+            self.outputs_queue = asyncio.Queue()
+
+            async def process_outputs_socket():
+                assert self.outputs_queue is not None
+                while True:
+                    (frame, ) = await self.output_socket.recv_multipart(
+                        copy=False)
+                    self.outputs_queue.put_nowait(frame.buffer)
+
+            self.queue_task = asyncio.create_task(process_outputs_socket())
 
-        frames = await self.output_socket.recv_multipart(copy=False)
-        return self.decoder.decode(frames[0].buffer)
+        return self.decoder.decode(await self.outputs_queue.get())
 
     async def _send_input(self, request_type: EngineCoreRequestType,
                           request: EngineCoreRequestUnion) -> None:
diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py
index 749f4f5043c97..564eab51bd3a8 100644
--- a/vllm/v1/engine/output_processor.py
+++ b/vllm/v1/engine/output_processor.py
@@ -101,6 +101,7 @@ def add_request(
     def process_outputs(
         self,
         engine_core_outputs: List[EngineCoreOutput],
+        iteration_stats: Optional[IterationStats] = None,
     ) -> OutputProcessorOutput:
         """
         Process the EngineCoreOutputs:
@@ -133,7 +134,8 @@ def process_outputs(
 
         request_outputs: List[RequestOutput] = []
         reqs_to_abort: List[str] = []
-        iteration_stats = IterationStats(self.log_stats)
+        if not iteration_stats:
+            iteration_stats = IterationStats(self.log_stats)
         for engine_core_output in engine_core_outputs:
             req_id = engine_core_output.request_id
             req_state = self.request_states.get(req_id)
@@ -175,8 +177,8 @@ def process_outputs(
             iteration_stats=iteration_stats,
         )
 
+    @staticmethod
     def _make_request_output(
-        self,
         request_state: RequestState,
         detokenizer_output: Optional[DetokenizerOutput],
     ) -> Optional[RequestOutput]:
diff --git a/vllm/v1/request.py b/vllm/v1/request.py
index 45450165eaefe..eefcdaf29e753 100644
--- a/vllm/v1/request.py
+++ b/vllm/v1/request.py
@@ -64,6 +64,12 @@ def __init__(
         # recomputing.
         self._kv_block_hashes: List[BlockHashType] = []
 
+        # Read-only views
+        # Prevent directly appending to the these lists since
+        # they should also be updated simultaneously.
+        self.output_token_ids = ConstantList(self._output_token_ids)
+        self.all_token_ids = ConstantList(self._all_token_ids)
+
     @classmethod
     def from_engine_core_request(cls, request: EngineCoreRequest) -> "Request":
         return cls(
@@ -79,18 +85,6 @@ def from_engine_core_request(cls, request: EngineCoreRequest) -> "Request":
             lora_request=request.lora_request,
         )
 
-    @property
-    def output_token_ids(self) -> ConstantList[int]:
-        # Prevent directly appending to the output_token_ids since
-        # all_token_ids should also be updated simultaneously.
-        return ConstantList(self._output_token_ids)
-
-    @property
-    def all_token_ids(self) -> ConstantList[int]:
-        # Prevent directly appending to the all_token_ids since
-        # output_token_ids should also be updated simultaneously
-        return ConstantList(self._all_token_ids)
-
     def append_output_token_ids(
         self,
         token_ids: Union[int, List[int]],

From 68c4421b6d898c8cfde9da6ef03b4262f7195fce Mon Sep 17 00:00:00 2001
From: rasmith <Randall.Smith@amd.com>
Date: Wed, 22 Jan 2025 18:10:37 -0600
Subject: [PATCH 339/462] [AMD][Quantization] Add TritonScaledMMLinearKernel
 since int8 is broken for AMD (#12282)

Signed-off-by: Randall Smith <Randall.Smith@amd.com>
---
 tests/kernels/test_triton_scaled_mm.py        | 17 +++++++++
 .../kernels/scaled_mm/__init__.py             |  8 ++--
 .../quantization/kernels/scaled_mm/triton.py  | 38 +++++++++++++++++++
 3 files changed, 58 insertions(+), 5 deletions(-)
 create mode 100644 vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py

diff --git a/tests/kernels/test_triton_scaled_mm.py b/tests/kernels/test_triton_scaled_mm.py
index 8e96a2f70d751..a5aab3c2ea4b0 100644
--- a/tests/kernels/test_triton_scaled_mm.py
+++ b/tests/kernels/test_triton_scaled_mm.py
@@ -39,6 +39,23 @@ def get_8bit_types():
     return types
 
 
+# This test is to check regressions for int8 support on ROCm.
+@pytest.mark.parametrize("model_path", [
+    "neuralmagic/Llama-3.2-1B-quantized.w8a8",
+])
+@pytest.mark.parametrize("max_tokens", [32])
+@pytest.mark.parametrize("num_logprobs", [10])
+@pytest.mark.skipif(not current_platform.is_rocm(),
+                    reason="Should only run on ROCm")
+def test_rocm_compressed_tensors_w8a8(vllm_runner, example_prompts, model_path,
+                                      max_tokens, num_logprobs):
+    dtype = "bfloat16"
+
+    with vllm_runner(model_path, dtype=dtype) as vllm_model:
+        vllm_model.generate_greedy_logprobs(example_prompts, max_tokens,
+                                            num_logprobs)
+
+
 @pytest.mark.parametrize("M", [1, 33, 64, 512])
 @pytest.mark.parametrize("N", [256, 971, 20486])
 @pytest.mark.parametrize("K", [128, 496, 1024])
diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py
index 586752d3d34e3..4824a11804163 100644
--- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py
+++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py
@@ -5,8 +5,8 @@
     CutlassScaledMMLinearKernel)
 from vllm.model_executor.layers.quantization.kernels.scaled_mm.ScaledMMLinearKernel import (  # noqa: E501
     ScaledMMLinearKernel, ScaledMMLinearLayerConfig)
-# from vllm.model_executor.layers.quantization.kernels.scaled_mm.triton import (
-#     TritonScaledMMLinear)
+from vllm.model_executor.layers.quantization.kernels.scaled_mm.triton import (
+    TritonScaledMMLinearKernel)
 from vllm.model_executor.layers.quantization.kernels.scaled_mm.xla import (
     XLAScaledMMLinearKernel)
 from vllm.platforms import PlatformEnum, current_platform
@@ -15,9 +15,7 @@
 _POSSIBLE_KERNELS: Dict[PlatformEnum, List[Type[ScaledMMLinearKernel]]] = {
     PlatformEnum.CPU: [CutlassScaledMMLinearKernel],
     PlatformEnum.CUDA: [CutlassScaledMMLinearKernel],
-    # TODO(rob): Create TritonScaledMMLinear kernel. ROCM will
-    # incorrectly attempt to run AZP models if prompted to.
-    PlatformEnum.ROCM: [CutlassScaledMMLinearKernel],
+    PlatformEnum.ROCM: [TritonScaledMMLinearKernel],
     PlatformEnum.TPU: [XLAScaledMMLinearKernel],
 }
 
diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py
new file mode 100644
index 0000000000000..97ec8cb0500d7
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py
@@ -0,0 +1,38 @@
+from typing import Optional, Tuple
+
+import torch
+
+from vllm.platforms import current_platform
+
+from .cutlass import CutlassScaledMMLinearKernel
+from .ScaledMMLinearKernel import ScaledMMLinearLayerConfig
+
+
+class TritonScaledMMLinearKernel(CutlassScaledMMLinearKernel):
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        return 75
+
+    @classmethod
+    def can_implement(
+            cls, c: ScaledMMLinearLayerConfig) -> Tuple[bool, Optional[str]]:
+        if current_platform.is_cpu():
+            return (
+                False,
+                "TritonScaledMMLinearKernel requires Triton which is not " +
+                "currently supported on CPU.")
+        if not c.input_symmetric:
+            return (False,
+                    "TritonScaledMMLinearKernel only supports symmetric " +
+                    "quantization.")
+        return True, None
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        super().process_weights_after_loading(layer)
+
+    def apply_weights(self,
+                      layer: torch.nn.Module,
+                      x: torch.Tensor,
+                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+        return super().apply_weights(layer, x, bias)

From 8d7aa9de718d55dd5e17846c6a88f3cd11c138d4 Mon Sep 17 00:00:00 2001
From: Alexei-V-Ivanov-AMD
 <156011006+Alexei-V-Ivanov-AMD@users.noreply.github.com>
Date: Wed, 22 Jan 2025 20:53:02 -0600
Subject: [PATCH 340/462] [Bugfix] Fixing  AMD LoRA CI test. (#12329)

Signed-off-by: Alexei V. Ivanov <alexei.ivanov@amd.com>
---
 Dockerfile.rocm | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Dockerfile.rocm b/Dockerfile.rocm
index 7213a15a2e005..14c522afd7f9e 100644
--- a/Dockerfile.rocm
+++ b/Dockerfile.rocm
@@ -72,7 +72,8 @@ COPY --from=build_vllm ${COMMON_WORKDIR}/vllm /vllm-workspace
 RUN cd /vllm-workspace \
     && rm -rf vllm \
     && python3 -m pip install -e tests/vllm_test_utils \
-    && python3 -m pip install lm-eval[api]==0.4.4
+    && python3 -m pip install lm-eval[api]==0.4.4 \
+    && python3 -m pip install pytest-shard
 
 # -----------------------
 # Final vLLM image

From 01a55941f5443c16a88889995f8149edcee2eec3 Mon Sep 17 00:00:00 2001
From: Michael Goin <michael@neuralmagic.com>
Date: Wed, 22 Jan 2025 22:18:09 -0500
Subject: [PATCH 341/462] [Docs] Update FP8 KV Cache documentation (#12238)

Signed-off-by: mgoin <michael@neuralmagic.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
---
 .../features/quantization/fp8_e4m3_kvcache.md |  44 ------
 .../features/quantization/fp8_e5m2_kvcache.md |  31 ----
 docs/source/features/quantization/index.md    |   3 +-
 .../quantization/quantized_kvcache.md         | 145 ++++++++++++++++++
 4 files changed, 146 insertions(+), 77 deletions(-)
 delete mode 100644 docs/source/features/quantization/fp8_e4m3_kvcache.md
 delete mode 100644 docs/source/features/quantization/fp8_e5m2_kvcache.md
 create mode 100644 docs/source/features/quantization/quantized_kvcache.md

diff --git a/docs/source/features/quantization/fp8_e4m3_kvcache.md b/docs/source/features/quantization/fp8_e4m3_kvcache.md
deleted file mode 100644
index 1cd67cb8fd336..0000000000000
--- a/docs/source/features/quantization/fp8_e4m3_kvcache.md
+++ /dev/null
@@ -1,44 +0,0 @@
-(fp8-e4m3-kvcache)=
-
-# FP8 E4M3 KV Cache
-
-Quantizing the KV cache to FP8 reduces its memory footprint. This increases the number of tokens that can be stored in the cache,
-improving throughput. OCP (Open Compute Project www.opencompute.org) specifies two common 8-bit floating point data formats: E5M2
-(5 exponent bits and 2 mantissa bits) and E4M3FN (4 exponent bits and 3 mantissa bits), often shortened as E4M3. One benefit of
-the E4M3 format over E5M2 is that floating point numbers are represented in higher precision. However, the small dynamic range of
-FP8 E4M3 (±240.0 can be represented) typically necessitates the use of a higher-precision (typically FP32) scaling factor alongside
-each quantized tensor. For now, only per-tensor (scalar) scaling factors are supported. Development is ongoing to support scaling
-factors of a finer granularity (e.g. per-channel).
-
-These scaling factors can be specified by passing an optional quantization param JSON to the LLM engine at load time. If
-this JSON is not specified, scaling factors default to 1.0. These scaling factors are typically obtained when running an
-unquantized model through a quantizer tool (e.g. AMD quantizer or NVIDIA AMMO).
-
-To install AMMO (AlgorithMic Model Optimization):
-
-```console
-pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com nvidia-ammo
-```
-
-Studies have shown that FP8 E4M3 quantization typically only minimally degrades inference accuracy. The most recent silicon
-offerings e.g. AMD MI300, NVIDIA Hopper or later support native hardware conversion to and from fp32, fp16, bf16, etc.
-Thus, LLM inference is greatly accelerated with minimal accuracy loss.
-
-Here is an example of how to enable this feature:
-
-```python
-# two float8_e4m3fn kv cache scaling factor files are provided under tests/fp8_kv, please refer to
-# https://github.com/vllm-project/vllm/blob/main/examples/other/fp8/README.md to generate kv_cache_scales.json of your own.
-
-from vllm import LLM, SamplingParams
-sampling_params = SamplingParams(temperature=1.3, top_p=0.8)
-llm = LLM(model="meta-llama/Llama-2-7b-chat-hf",
-          kv_cache_dtype="fp8",
-          quantization_param_path="./tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json")
-prompt = "London is the capital of"
-out = llm.generate(prompt, sampling_params)[0].outputs[0].text
-print(out)
-
-# output w/ scaling factors:  England, the United Kingdom, and one of the world's leading financial,
-# output w/o scaling factors:  England, located in the southeastern part of the country. It is known
-```
diff --git a/docs/source/features/quantization/fp8_e5m2_kvcache.md b/docs/source/features/quantization/fp8_e5m2_kvcache.md
deleted file mode 100644
index 3a81ab17f332f..0000000000000
--- a/docs/source/features/quantization/fp8_e5m2_kvcache.md
+++ /dev/null
@@ -1,31 +0,0 @@
-(fp8-kv-cache)=
-
-# FP8 E5M2 KV Cache
-
-The int8/int4 quantization scheme requires additional scale GPU memory storage, which reduces the expected GPU memory benefits.
-The FP8 data format retains 2~3 mantissa bits and can convert float/fp16/bfloat16 and fp8 to each other.
-
-Here is an example of how to enable this feature:
-
-```python
-from vllm import LLM, SamplingParams
-# Sample prompts.
-prompts = [
-    "Hello, my name is",
-    "The president of the United States is",
-    "The capital of France is",
-    "The future of AI is",
-]
-# Create a sampling params object.
-sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
-# Create an LLM.
-llm = LLM(model="facebook/opt-125m", kv_cache_dtype="fp8")
-# Generate texts from the prompts. The output is a list of RequestOutput objects
-# that contain the prompt, generated text, and other information.
-outputs = llm.generate(prompts, sampling_params)
-# Print the outputs.
-for output in outputs:
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-```
diff --git a/docs/source/features/quantization/index.md b/docs/source/features/quantization/index.md
index 861cb165c11c2..56ccdb5f00c34 100644
--- a/docs/source/features/quantization/index.md
+++ b/docs/source/features/quantization/index.md
@@ -14,6 +14,5 @@ bnb
 gguf
 int8
 fp8
-fp8_e5m2_kvcache
-fp8_e4m3_kvcache
+quantized_kvcache
 ```
diff --git a/docs/source/features/quantization/quantized_kvcache.md b/docs/source/features/quantization/quantized_kvcache.md
new file mode 100644
index 0000000000000..95fa5e81e2f74
--- /dev/null
+++ b/docs/source/features/quantization/quantized_kvcache.md
@@ -0,0 +1,145 @@
+(quantized-kvcache)=
+
+# Quantized KV Cache
+
+## FP8 KV Cache
+
+Quantizing the KV cache to FP8 reduces its memory footprint. This increases the number of tokens that can be stored in the cache, improving throughput.
+
+### FP8 Formats
+
+[OCP (Open Compute Project)](https://www.opencompute.org) specifies two common 8-bit floating point data formats:
+
+- E5M2 (5 exponent bits and 2 mantissa bits)
+- E4M3FN (4 exponent bits and 3 mantissa bits, often shortened as E4M3)
+
+The E4M3 format offers higher precision compared to E5M2. However, due to its small dynamic range (±240.0), E4M3 typically requires a higher-precision (FP32) scaling factor alongside each quantized tensor.
+
+### Current Limitations
+
+For now, only per-tensor (scalar) scaling factors are supported. Development is ongoing to support scaling factors of a finer granularity (e.g. per-channel).
+
+### Performance Impact
+
+The current FP8 KV cache implementation primarily benefits throughput by allowing approximately double the amount of space for KV cache allocation. This enables either:
+
+- Processing longer context lengths for individual requests, or
+- Handling more concurrent request batches
+
+However, there are currently no latency improvements as the implementation does not yet include fused dequantization and attention operations. Future releases will support quantized attention with hardware acceleration, which should provide additional performance benefits. While the most recent silicon offerings (e.g. AMD MI300, NVIDIA Hopper or later) support native hardware conversion between FP8 and other formats (fp32, fp16, bf16), this benefit is not yet fully realized.
+
+Studies have shown that FP8 E4M3 quantization typically only minimally degrades inference accuracy, making it a practical choice for throughput optimization.
+
+## Usage Example
+
+Here is an example of how to enable FP8 quantization:
+
+```python
+from vllm import LLM, SamplingParams
+
+sampling_params = SamplingParams(temperature=0.7, top_p=0.8)
+llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", kv_cache_dtype="fp8")
+prompt = "London is the capital of"
+out = llm.generate(prompt, sampling_params)[0].outputs[0].text
+print(out)
+
+# output w/ scaling factors:  England, the United Kingdom, and one of the world's leading financial,
+# output w/o scaling factors: England, located in the southeastern part of the country. It is known
+```
+
+The `kv_cache_dtype` argument specifies the data type for KV cache storage:
+- `"auto"`: Uses the model's default "unquantized" data type
+- `"fp8"` or `"fp8_e4m3"`: Supported on CUDA 11.8+ and ROCm (AMD GPU)
+- `"fp8_e5m2"`: Supported on CUDA 11.8+
+
+## Calibrated Scales for Better Accuracy
+
+For optimal model quality when using FP8 KV Cache, we recommend using calibrated scales tuned to representative inference data. [LLM Compressor](https://github.com/vllm-project/llm-compressor/) is the recommended tool for this process.
+
+### Installation
+
+First, install the required dependencies:
+
+```console
+pip install llmcompressor
+```
+
+### Example Usage
+
+Here's a complete example using `meta-llama/Llama-3.1-8B-Instruct` (most models can use this same pattern):
+
+```python
+from datasets import load_dataset
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from llmcompressor.transformers import oneshot
+
+# Select model and load it
+MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"
+model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto", torch_dtype="auto")
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+
+# Select calibration dataset
+DATASET_ID = "HuggingFaceH4/ultrachat_200k"
+DATASET_SPLIT = "train_sft"
+
+# Configure calibration parameters
+NUM_CALIBRATION_SAMPLES = 512  # 512 samples is a good starting point
+MAX_SEQUENCE_LENGTH = 2048
+
+# Load and preprocess dataset
+ds = load_dataset(DATASET_ID, split=DATASET_SPLIT)
+ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))
+
+def process_and_tokenize(example):
+    text = tokenizer.apply_chat_template(example["messages"], tokenize=False)
+    return tokenizer(
+        text,
+        padding=False,
+        max_length=MAX_SEQUENCE_LENGTH,
+        truncation=True,
+        add_special_tokens=False,
+    )
+
+ds = ds.map(process_and_tokenize, remove_columns=ds.column_names)
+
+# Configure quantization settings
+recipe = """
+quant_stage:
+    quant_modifiers:
+        QuantizationModifier:
+            kv_cache_scheme:
+                num_bits: 8
+                type: float
+                strategy: tensor
+                dynamic: false
+                symmetric: true
+"""
+
+# Apply quantization
+oneshot(
+    model=model,
+    dataset=ds,
+    recipe=recipe,
+    max_seq_length=MAX_SEQUENCE_LENGTH,
+    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
+)
+
+# Save quantized model
+SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-KV"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+tokenizer.save_pretrained(SAVE_DIR)
+```
+
+The above script will create a folder in your current directory containing your quantized model (e.g., `Llama-3.1-8B-Instruct-FP8-KV`) with calibrated scales.
+
+When running the model you must specify `kv_cache_dtype="fp8"` in order to enable the kv cache quantization and use the scales.
+
+```python
+from vllm import LLM, SamplingParams
+
+sampling_params = SamplingParams(temperature=0.7, top_p=0.8)
+llm = LLM(model="Llama-3.1-8B-Instruct-FP8-KV", kv_cache_dtype="fp8")
+prompt = "London is the capital of"
+out = llm.generate(prompt, sampling_params)[0].outputs[0].text
+print(out)
+```

From 7551a340328dec66d5905b01d0113e251c3afb3b Mon Sep 17 00:00:00 2001
From: Russell Bryant <rbryant@redhat.com>
Date: Wed, 22 Jan 2025 22:44:09 -0500
Subject: [PATCH 342/462] [Docs] Document vulnerability disclosure process
 (#12326)

Signed-off-by: Russell Bryant <rbryant@redhat.com>
---
 .../contributing/vulnerability_management.md    | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/docs/source/contributing/vulnerability_management.md b/docs/source/contributing/vulnerability_management.md
index 422dc13e6a644..a9bbfde2af770 100644
--- a/docs/source/contributing/vulnerability_management.md
+++ b/docs/source/contributing/vulnerability_management.md
@@ -41,3 +41,20 @@ You may use the `#security` channel in the [VLLM Slack](https://slack.vllm.ai)
 to discuss security-related topics. However, please do not disclose any
 vulnerabilities in this channel. If you need to report a vulnerability, please
 use the GitHub security advisory system or contact a VMT member privately.
+
+## Vulnerability Disclosure
+
+The process for disclosing vulnerabilities is the following:
+
+- The VMT will work with the project maintainers to develop a fix for the
+  vulnerability.
+- The VMT will coordinate with the reporter and project maintainers to prepare a
+  security advisory that adequately describes the vulnerability and its impact.
+- The VMT will coordinate with the project maintainers to publish a fix and
+  release an update that includes that fix.
+- The VMT will publish the security advisory on GitHub. Release notes will be
+  updated to include a reference to the security advisory.
+
+The VMT and project maintainers will work to minimize the amount of time in
+between disclosing any public information about the vulnerability and making a
+release and advisory available.

From f0ef37233ea0ba5251edaea7362984110411e7eb Mon Sep 17 00:00:00 2001
From: Cody Yu <hao.yu.cody@gmail.com>
Date: Wed, 22 Jan 2025 20:19:21 -0800
Subject: [PATCH 343/462] [V1] Add `uncache_blocks` (#12333)

---
 tests/v1/core/test_prefix_caching.py | 30 +++++++++++++++++++++++++
 vllm/v1/core/kv_cache_manager.py     | 33 ++++++++++++++++++++++++++--
 2 files changed, 61 insertions(+), 2 deletions(-)

diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py
index c5860809f9e62..f434fa8c61a80 100644
--- a/tests/v1/core/test_prefix_caching.py
+++ b/tests/v1/core/test_prefix_caching.py
@@ -626,3 +626,33 @@ def test_reset_prefix_cache():
     assert manager.reset_prefix_cache()
     assert not manager.cached_block_hash_to_block
     assert all([blk.block_hash is None for blk in manager.block_pool])
+
+
+def test_uncache_blocks():
+    manager = KVCacheManager(
+        block_size=16,
+        num_gpu_blocks=10,
+        max_model_len=8192,
+        sliding_window=None,
+        enable_caching=True,
+        num_preallocate_tokens=0,
+    )
+
+    req0 = make_request("0", list(range(30)))
+    blocks = manager.allocate_slots(req0, 30, [])
+    assert [b.block_id for b in blocks] == [0, 1]
+    assert len(manager.cached_block_hash_to_block) == 1
+
+    req0.num_computed_tokens = 30
+
+    # Simulate speculative tokens.
+    for _ in range(5):
+        req0.append_output_token_ids(8)
+    manager.append_slots(req0, 5)
+    assert len(manager.cached_block_hash_to_block) == 2
+
+    # After sampling, assuming only 1 token is accepted.
+    req0.num_computed_tokens = 31
+    num_uncached_blocks = manager.uncache_blocks(req0)
+    assert num_uncached_blocks == 1
+    assert len(manager.cached_block_hash_to_block) == 1
diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py
index 8c8c8b3b55c0b..18fdfdfe4a010 100644
--- a/vllm/v1/core/kv_cache_manager.py
+++ b/vllm/v1/core/kv_cache_manager.py
@@ -285,6 +285,29 @@ def free(self, request: Request) -> None:
             if block.ref_cnt == 0:
                 self.free_block_queue.append(block)
 
+    def uncache_blocks(self, request: Request) -> int:
+        """Uncache the blocks that are no longer full based on the
+        num_computed_tokens in the given request. This happens when
+        the blocks were full and cached due to speculative tokens, but the
+        speculative tokens are not accepted.
+
+        Args:
+            request: The request.
+
+        Returns:
+            The number of uncached blocks.
+        """
+        blocks = self.req_to_blocks[request.request_id]
+        num_computed_tokens = request.num_computed_tokens
+        num_full_blocks = num_computed_tokens // self.block_size
+        num_uncached_blocks = 0
+        for block in blocks[num_full_blocks:]:
+            # If the block is not cached, the following blocks are not cached.
+            if not self._maybe_evict_cached_block(block):
+                break
+            num_uncached_blocks += 1
+        return num_uncached_blocks
+
     def reset_prefix_cache(self) -> bool:
         """Reset prefix cache. This function may be used in RLHF
         flows to invalid prefix caching after the weights are updated,
@@ -386,7 +409,7 @@ def _get_new_blocks(self, num_blocks: int) -> List[KVCacheBlock]:
 
             # If the block is cached, evict it.
             if self.enable_caching:
-                self._evict_cached_block(curr_block)
+                self._maybe_evict_cached_block(curr_block)
 
             curr_block.incr_ref()
             ret.append(curr_block)
@@ -394,13 +417,16 @@ def _get_new_blocks(self, num_blocks: int) -> List[KVCacheBlock]:
 
         return ret
 
-    def _evict_cached_block(self, block: KVCacheBlock) -> None:
+    def _maybe_evict_cached_block(self, block: KVCacheBlock) -> bool:
         """
         If a block is cached in `cached_block_hash_to_block`, we reset its hash
         metadata and evict it from the cache.
 
         Args:
             block: The block to evict.
+
+        Returns:
+            True if the block is evicted, False otherwise.
         """
         block_hash = block.block_hash
         if block_hash and block_hash in self.cached_block_hash_to_block:
@@ -410,6 +436,9 @@ def _evict_cached_block(self, block: KVCacheBlock) -> None:
             if len(self.cached_block_hash_to_block[block_hash]) == 0:
                 del self.cached_block_hash_to_block[block_hash]
 
+            return True
+        return False
+
     def _get_cached_block(self,
                           block_hash: BlockHashType) -> Optional[KVCacheBlock]:
         """Get a cached block by the block hash, or None if cache miss.

From 511627445e819169ded3aaead54062cddfd37a5d Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Thu, 23 Jan 2025 14:56:02 +0800
Subject: [PATCH 344/462] [doc] explain common errors around torch.compile
 (#12340)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 .../source/getting_started/troubleshooting.md | 21 +++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/docs/source/getting_started/troubleshooting.md b/docs/source/getting_started/troubleshooting.md
index 1e290d2b4c0bd..ec4b184bf4604 100644
--- a/docs/source/getting_started/troubleshooting.md
+++ b/docs/source/getting_started/troubleshooting.md
@@ -197,6 +197,27 @@ if __name__ == '__main__':
     llm = vllm.LLM(...)
 ```
 
+## `torch.compile` Error
+
+vLLM heavily depends on `torch.compile` to optimize the model for better performance, which introduces the dependency on the `torch.compile` functionality and the `triton` library. By default, we use `torch.compile` to [optimize some functions](https://github.com/vllm-project/vllm/pull/10406) in the model. Before running vLLM, you can check if `torch.compile` is working as expected by running the following script:
+
+```python
+import torch
+
+@torch.compile
+def f(x):
+    # a simple function to test torch.compile
+    x = x + 1
+    x = x * 2
+    x = x.sin()
+    return x
+
+x = torch.randn(4, 4).cuda()
+print(f(x))
+```
+
+If it raises errors from `torch/_inductor` directory, usually it means you have a custom `triton` library that is not compatible with the version of PyTorch you are using. See [this issue](https://github.com/vllm-project/vllm/issues/12219) for example.
+
 ## Known Issues
 
 - In `v0.5.2`, `v0.5.3`, and `v0.5.3.post1`, there is a bug caused by [zmq](https://github.com/zeromq/pyzmq/issues/2000) , which can occasionally cause vLLM to hang depending on the machine configuration. The solution is to upgrade to the latest version of `vllm` to include the [fix](gh-pr:6759).

From 8ae5ff20093f74671bcdf2c9d4e078839c7a7732 Mon Sep 17 00:00:00 2001
From: liuzhenwei <zhenweiliu@habana.ai>
Date: Thu, 23 Jan 2025 16:35:46 +0800
Subject: [PATCH 345/462] [Hardware][Gaudi][BugFix] Fix dataclass error due to
 triton package update (#12338)

Signed-off-by: zhenwei <zhenweiliu@habana.ai>
---
 requirements-hpu.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements-hpu.txt b/requirements-hpu.txt
index f4fb89ef42834..63a5f8b18f6b8 100644
--- a/requirements-hpu.txt
+++ b/requirements-hpu.txt
@@ -3,7 +3,7 @@
 
 # Dependencies for HPU code
 ray
-triton
+triton==3.1.0
 pandas
 tabulate
 setuptools>=61

From c5b4b11d7f4b69160d6a0d99771cb5c04e923a8d Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Thu, 23 Jan 2025 18:15:33 +0800
Subject: [PATCH 346/462] [Bugfix] Fix k_proj's bias for whisper self attention
 (#12342)

Signed-off-by: Isotr0py <2037008807@qq.com>
---
 vllm/model_executor/models/whisper.py | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py
index c1f3bb0ca33c2..b8512b735da94 100644
--- a/vllm/model_executor/models/whisper.py
+++ b/vllm/model_executor/models/whisper.py
@@ -729,7 +729,22 @@ def sample(
     def load_weights(self, weights: Iterable[Tuple[str,
                                                    torch.Tensor]]) -> Set[str]:
         loader = AutoWeightsLoader(self, skip_prefixes=["proj_out."])
-        loaded_weights = [(name, loaded_weight)
-                          for name, loaded_weight in weights]
         mapper = WeightsMapper({".fc1.": ".mlp.fc1.", ".fc2.": ".mlp.fc2."})
-        return loader.load_weights(loaded_weights, mapper=mapper)
+        # add fake zeros bias for k_proj to state_dict
+        weights = _create_fake_bias_for_k_proj(weights)
+        return loader.load_weights(weights, mapper=mapper)
+
+
+def _create_fake_bias_for_k_proj(
+    weights: Iterable[Tuple[str, torch.Tensor]]
+) -> Iterable[Tuple[str, torch.Tensor]]:
+    """
+    Create full zeros bias for k_proj weight in self-attention layers.
+    So that the bias for k_proj in qkv_proj can be initialized with zeros.
+    """
+    for name, weight in weights:
+        if ".self_attn.k_proj.weight" in name:
+            bias = torch.zeros(weight.size(0))
+            bias_name = name.replace("weight", "bias")
+            yield from [(name, weight), (bias_name, bias)]
+        yield name, weight

From 978b45f39970f81d2857fbb8aada5b44619d258c Mon Sep 17 00:00:00 2001
From: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Date: Thu, 23 Jan 2025 09:45:48 -0500
Subject: [PATCH 347/462] [Kernel] Flash Attention 3 Support (#12093)

Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
---
 CMakeLists.txt                           | 45 +++++++++++------------
 setup.py                                 | 12 ++++---
 tests/kernels/test_cascade_flash_attn.py | 24 +++++++------
 tests/kernels/test_flash_attn.py         | 22 +++++++++---
 vllm/attention/backends/flash_attn.py    | 27 ++++++++++++--
 vllm/envs.py                             | 12 +++++++
 vllm/v1/attention/backends/flash_attn.py | 44 +++++++++++++++++------
 vllm/v1/worker/gpu_model_runner.py       | 46 +++++++++++-------------
 8 files changed, 150 insertions(+), 82 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0945905104f32..5039ac2448f83 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -24,9 +24,6 @@ include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
 # Suppress potential warnings about unused manually-specified variables
 set(ignoreMe "${VLLM_PYTHON_PATH}")
 
-# Prevent installation of dependencies (cutlass) by default.
-install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
-
 #
 # Supported python versions.  These versions will be searched in order, the
 # first match will be selected.  These should be kept in sync with setup.py.
@@ -535,7 +532,7 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
 endif()
 
 # vllm-flash-attn currently only supported on CUDA
-if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda")
+if (NOT VLLM_GPU_LANG STREQUAL "CUDA")
   return()
 endif ()
 
@@ -558,7 +555,7 @@ endif()
 # They should be identical but if they aren't, this is a massive footgun.
 #
 # The vllm-flash-attn install rules are nested under vllm to make sure the library gets installed in the correct place.
-# To only install vllm-flash-attn, use --component vllm_flash_attn_c.
+# To only install vllm-flash-attn, use --component _vllm_fa2_C (for FA2) or --component _vllm_fa3_C (for FA3).
 # If no component is specified, vllm-flash-attn is still installed.
 
 # If VLLM_FLASH_ATTN_SRC_DIR is set, vllm-flash-attn is installed from that directory instead of downloading.
@@ -570,43 +567,41 @@ if (DEFINED ENV{VLLM_FLASH_ATTN_SRC_DIR})
 endif()
 
 if(VLLM_FLASH_ATTN_SRC_DIR)
-  FetchContent_Declare(vllm-flash-attn SOURCE_DIR ${VLLM_FLASH_ATTN_SRC_DIR})
+  FetchContent_Declare(
+          vllm-flash-attn SOURCE_DIR 
+          ${VLLM_FLASH_ATTN_SRC_DIR}
+          BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
+  )
 else()
   FetchContent_Declare(
           vllm-flash-attn
           GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
-          GIT_TAG 96266b1111111f3d11aabefaf3bacbab6a89d03c
+          GIT_TAG 90eacc1af2a7c3de62ea249e929ed5faccf38954
           GIT_PROGRESS TRUE
           # Don't share the vllm-flash-attn build between build types
           BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
   )
 endif()
 
-# Set the parent build flag so that the vllm-flash-attn library does not redo compile flag and arch initialization.
-set(VLLM_PARENT_BUILD ON)
-
-# Ensure the vllm/vllm_flash_attn directory exists before installation
-install(CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/vllm/vllm_flash_attn\")" COMPONENT vllm_flash_attn_c)
-
-# Make sure vllm-flash-attn install rules are nested under vllm/
-install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY FALSE)" COMPONENT vllm_flash_attn_c)
-install(CODE "set(OLD_CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}\")" COMPONENT vllm_flash_attn_c)
-install(CODE "set(CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}/vllm/\")" COMPONENT vllm_flash_attn_c)
 
 # Fetch the vllm-flash-attn library
 FetchContent_MakeAvailable(vllm-flash-attn)
 message(STATUS "vllm-flash-attn is available at ${vllm-flash-attn_SOURCE_DIR}")
 
-# Restore the install prefix
-install(CODE "set(CMAKE_INSTALL_PREFIX \"\${OLD_CMAKE_INSTALL_PREFIX}\")" COMPONENT vllm_flash_attn_c)
-install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" COMPONENT vllm_flash_attn_c)
+# Copy over the vllm-flash-attn python files (duplicated for fa2 and fa3, in
+# case only one is built, in the case both are built redundant work is done)
+install(
+  DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
+  DESTINATION vllm_flash_attn
+  COMPONENT _vllm_fa2_C
+  FILES_MATCHING PATTERN "*.py"
+)
 
-# Copy over the vllm-flash-attn python files
 install(
-        DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
-        DESTINATION vllm/vllm_flash_attn
-        COMPONENT vllm_flash_attn_c
-        FILES_MATCHING PATTERN "*.py"
+  DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
+  DESTINATION vllm_flash_attn
+  COMPONENT _vllm_fa3_C
+  FILES_MATCHING PATTERN "*.py"
 )
 
 # Nothing after vllm-flash-attn, see comment about macros above
diff --git a/setup.py b/setup.py
index dde5072139660..36c89d435c7b7 100644
--- a/setup.py
+++ b/setup.py
@@ -228,8 +228,11 @@ def target_name(s: str) -> str:
 
             # CMake appends the extension prefix to the install path,
             # and outdir already contains that prefix, so we need to remove it.
+            # We assume only the final component of extension prefix is added by
+            # CMake, this is currently true for current extensions but may not
+            # always be the case.
             prefix = outdir
-            for i in range(ext.name.count('.')):
+            if '.' in ext.name:
                 prefix = prefix.parent
 
             # prefix here should actually be the same for all components
@@ -298,7 +301,8 @@ def run(self) -> None:
             files_to_copy = [
                 "vllm/_C.abi3.so",
                 "vllm/_moe_C.abi3.so",
-                "vllm/vllm_flash_attn/vllm_flash_attn_c.abi3.so",
+                "vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so",
+                "vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so",
                 "vllm/vllm_flash_attn/flash_attn_interface.py",
                 "vllm/vllm_flash_attn/__init__.py",
                 "vllm/cumem_allocator.abi3.so",
@@ -593,8 +597,8 @@ def _read_requirements(filename: str) -> List[str]:
     ext_modules.append(CMakeExtension(name="vllm._rocm_C"))
 
 if _is_cuda():
-    ext_modules.append(
-        CMakeExtension(name="vllm.vllm_flash_attn.vllm_flash_attn_c"))
+    ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa2_C"))
+    ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa3_C"))
     ext_modules.append(CMakeExtension(name="vllm.cumem_allocator"))
 
 if _build_custom_ops():
diff --git a/tests/kernels/test_cascade_flash_attn.py b/tests/kernels/test_cascade_flash_attn.py
index 45ec6df4e711e..00eb927205d46 100644
--- a/tests/kernels/test_cascade_flash_attn.py
+++ b/tests/kernels/test_cascade_flash_attn.py
@@ -78,6 +78,7 @@ def test_merge_kernel(
 @pytest.mark.parametrize("block_size", BLOCK_SIZES)
 @pytest.mark.parametrize("soft_cap", [None, 50])
 @pytest.mark.parametrize("num_blocks", [2048])
+@pytest.mark.parametrize("fa_version", [2, 3])
 @torch.inference_mode()
 def test_cascade(
     seq_lens_and_common_prefix: Tuple[List[Tuple[int, int]], int],
@@ -87,8 +88,14 @@ def test_cascade(
     block_size: int,
     soft_cap: Optional[float],
     num_blocks: int,
+    fa_version: int,
 ) -> None:
     torch.set_default_device("cuda")
+    if fa_version == 3 and (torch.cuda.get_device_capability() == (8, 6)
+                            or torch.cuda.get_device_capability() == (8, 9)):
+        pytest.skip("Flash attention version 3 fails on 8.6 and 8.9 due to "
+                    "insufficient shared memory for some shapes")
+
     current_platform.seed_everything(0)
 
     window_size = (-1, -1)
@@ -118,9 +125,7 @@ def test_cascade(
     cu_query_lens = torch.tensor([0] + query_lens,
                                  dtype=torch.int32).cumsum(dim=0,
                                                            dtype=torch.int32)
-    cu_kv_lens = torch.tensor([0] + kv_lens,
-                              dtype=torch.int32).cumsum(dim=0,
-                                                        dtype=torch.int32)
+    kv_lens_tensor = torch.tensor(kv_lens, dtype=torch.int32)
     max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size
     block_tables = torch.randint(0,
                                  num_blocks,
@@ -140,7 +145,7 @@ def test_cascade(
         k=key_cache,
         v=value_cache,
         cu_seqlens_q=cu_query_lens,
-        cu_seqlens_k=cu_kv_lens,
+        seqused_k=kv_lens_tensor,
         max_seqlen_q=max_query_len,
         max_seqlen_k=max_kv_len,
         softmax_scale=scale,
@@ -154,10 +159,8 @@ def test_cascade(
     assert all(common_prefix_len < kv_len for kv_len in kv_lens)
     cu_prefix_query_lens = torch.tensor([0, total_num_query_tokens],
                                         dtype=torch.int32)
-    cu_prefix_kv_lens = torch.tensor([0, common_prefix_len], dtype=torch.int32)
-    cu_suffix_kv_lens = (
-        cu_kv_lens -
-        torch.arange(num_seqs + 1, dtype=torch.int32) * common_prefix_len)
+    prefix_kv_lens = torch.tensor([common_prefix_len], dtype=torch.int32)
+    suffix_kv_lens = kv_lens_tensor - common_prefix_len
     output = torch.empty_like(query)
     cascade_attention(
         output=output,
@@ -167,8 +170,8 @@ def test_cascade(
         cu_query_lens=cu_query_lens,
         max_query_len=max_query_len,
         cu_prefix_query_lens=cu_prefix_query_lens,
-        cu_prefix_kv_lens=cu_prefix_kv_lens,
-        cu_suffix_kv_lens=cu_suffix_kv_lens,
+        prefix_kv_lens=prefix_kv_lens,
+        suffix_kv_lens=suffix_kv_lens,
         max_kv_len=max_kv_len,
         softmax_scale=scale,
         alibi_slopes=None,
@@ -176,6 +179,7 @@ def test_cascade(
         logits_soft_cap=soft_cap if soft_cap is not None else 0,
         block_table=block_tables,
         common_prefix_len=common_prefix_len,
+        fa_version=fa_version,
     )
 
     # Compare the results.
diff --git a/tests/kernels/test_flash_attn.py b/tests/kernels/test_flash_attn.py
index 1ae78d7b46c5b..b22153c86b25f 100644
--- a/tests/kernels/test_flash_attn.py
+++ b/tests/kernels/test_flash_attn.py
@@ -80,6 +80,7 @@ def ref_paged_attn(
 @pytest.mark.parametrize("soft_cap", [None, 10.0, 50.0])
 @pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
 @pytest.mark.parametrize("sliding_window", [None, 256])
+@pytest.mark.parametrize("fa_version", [2, 3])
 @torch.inference_mode()
 def test_flash_attn_with_paged_kv(
     use_out: bool,
@@ -91,8 +92,14 @@ def test_flash_attn_with_paged_kv(
     soft_cap: Optional[float],
     num_blocks: int,
     sliding_window: Optional[int],
+    fa_version: int,
 ) -> None:
     torch.set_default_device("cuda")
+    if fa_version == 3 and (torch.cuda.get_device_capability() == (8, 6)
+                            or torch.cuda.get_device_capability() == (8, 9)):
+        pytest.skip("Flash attention version 3 fails on 8.6 and 8.9 due to "
+                    "insufficient shared memory for some shapes")
+
     current_platform.seed_everything(0)
     num_seqs = len(kv_lens)
     num_query_heads = num_heads[0]
@@ -131,6 +138,7 @@ def test_flash_attn_with_paged_kv(
         cache_seqlens=kv_lens_tensor,
         softcap=soft_cap if soft_cap is not None else 0,
         window_size=window_size,
+        fa_version=fa_version,
     )
     output = output if not use_out else out
     output = output.squeeze(1)
@@ -159,6 +167,7 @@ def test_flash_attn_with_paged_kv(
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("soft_cap", [None, 10.0, 50.0])
 @pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
+@pytest.mark.parametrize("fa_version", [2, 3])
 @torch.inference_mode()
 def test_varlen_with_paged_kv(
     use_out: bool,
@@ -170,8 +179,14 @@ def test_varlen_with_paged_kv(
     block_size: int,
     soft_cap: Optional[float],
     num_blocks: int,
+    fa_version: int,
 ) -> None:
     torch.set_default_device("cuda")
+    if fa_version == 3 and (torch.cuda.get_device_capability() == (8, 6)
+                            or torch.cuda.get_device_capability() == (8, 9)):
+        pytest.skip("Flash attention version 3 fails on 8.6 and 8.9 due to "
+                    "insufficient shared memory for some shapes")
+
     current_platform.seed_everything(0)
     num_seqs = len(seq_lens)
     query_lens = [x[0] for x in seq_lens]
@@ -198,9 +213,7 @@ def test_varlen_with_paged_kv(
     cu_query_lens = torch.tensor([0] + query_lens,
                                  dtype=torch.int32).cumsum(dim=0,
                                                            dtype=torch.int32)
-    cu_kv_lens = torch.tensor([0] + kv_lens,
-                              dtype=torch.int32).cumsum(dim=0,
-                                                        dtype=torch.int32)
+    kv_lens = torch.tensor(kv_lens, dtype=torch.int32)
 
     max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size
     block_tables = torch.randint(0,
@@ -215,7 +228,7 @@ def test_varlen_with_paged_kv(
         v=value_cache,
         out=out,
         cu_seqlens_q=cu_query_lens,
-        cu_seqlens_k=cu_kv_lens,
+        seqused_k=kv_lens,
         max_seqlen_q=max_query_len,
         max_seqlen_k=max_kv_len,
         softmax_scale=scale,
@@ -223,6 +236,7 @@ def test_varlen_with_paged_kv(
         window_size=window_size,
         block_table=block_tables,
         softcap=soft_cap if soft_cap is not None else 0,
+        fa_version=fa_version,
     )
     output = output if not use_out else out
 
diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py
index 60ed09d0cc44f..18acfb82fac58 100644
--- a/vllm/attention/backends/flash_attn.py
+++ b/vllm/attention/backends/flash_attn.py
@@ -17,7 +17,9 @@
     compute_slot_mapping_start_idx, get_num_prefill_decode_query_kv_tokens,
     get_seq_len_block_table_args, is_all_cross_attn_metadata_set,
     is_all_encoder_attn_metadata_set, is_block_tables_empty)
+from vllm.envs import VLLM_FLASH_ATTN_VERSION
 from vllm.multimodal import MultiModalPlaceholderMap
+from vllm.platforms import current_platform
 from vllm.utils import async_tensor_h2d, make_tensor_with_pad
 
 if TYPE_CHECKING:
@@ -25,7 +27,8 @@
                                           ModelInputForGPUWithSamplingMetadata)
 
 from vllm.vllm_flash_attn import (flash_attn_varlen_func,
-                                  flash_attn_with_kvcache)
+                                  flash_attn_with_kvcache,
+                                  is_fa_version_supported)
 
 
 class FlashAttentionBackend(AttentionBackend):
@@ -634,6 +637,20 @@ def __init__(
                 f"Supported head sizes are: {support_head_sizes}.")
         self.attn_type = attn_type
 
+        # if hopper default to FA3, otherwise stick to FA2 for now
+        # TODO(lucas): profile FA3 on ampere to see if it makes sense to
+        #  use FA3 as default for both
+        if current_platform.get_device_capability()[0] >= 9:
+            self.fa_version = 3 if is_fa_version_supported(3) else 2
+        else:
+            self.fa_version = 2
+
+        if VLLM_FLASH_ATTN_VERSION is not None:
+            assert VLLM_FLASH_ATTN_VERSION in [2, 3]
+            self.fa_version = VLLM_FLASH_ATTN_VERSION
+
+        assert is_fa_version_supported(self.fa_version)
+
     def forward(
         self,
         layer: AttentionLayer,
@@ -752,6 +769,7 @@ def forward(
                     alibi_slopes=alibi_slopes,
                     softcap=logits_soft_cap,
                     out=prefill_output,
+                    fa_version=self.fa_version,
                 )
             else:
                 # prefix-enabled attention
@@ -765,7 +783,7 @@ def forward(
                     v=value_cache,
                     cu_seqlens_q=prefill_meta.query_start_loc,
                     max_seqlen_q=prefill_meta.max_query_len,
-                    cu_seqlens_k=prefill_meta.seq_start_loc,
+                    seqused_k=prefill_meta.seq_lens_tensor,
                     max_seqlen_k=max_seq_len,
                     softmax_scale=softmax_scale,
                     causal=True,
@@ -774,6 +792,7 @@ def forward(
                     block_table=prefill_meta.block_tables,
                     softcap=logits_soft_cap,
                     out=prefill_output,
+                    fa_version=self.fa_version,
                 )
 
         if decode_meta := attn_metadata.decode_metadata:
@@ -793,7 +812,7 @@ def forward(
                     v=value_cache,
                     cu_seqlens_q=decode_meta.query_start_loc,
                     max_seqlen_q=decode_meta.max_decode_query_len,
-                    cu_seqlens_k=decode_meta.seq_start_loc,
+                    seqused_k=decode_meta.seq_lens_tensor,
                     max_seqlen_k=decode_meta.max_decode_seq_len,
                     softmax_scale=softmax_scale,
                     causal=True,
@@ -802,6 +821,7 @@ def forward(
                     softcap=logits_soft_cap,
                     block_table=decode_meta.block_tables,
                     out=decode_output,
+                    fa_version=self.fa_version,
                 )
             else:
                 # Use flash_attn_with_kvcache for normal decoding.
@@ -822,6 +842,7 @@ def forward(
                     alibi_slopes=alibi_slopes,
                     softcap=logits_soft_cap,
                     out=decode_output.unsqueeze(1),
+                    fa_version=self.fa_version,
                 )
         return output
 
diff --git a/vllm/envs.py b/vllm/envs.py
index 3a15e00e7b50a..b72e9141ac792 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -11,6 +11,7 @@
     VLLM_NCCL_SO_PATH: Optional[str] = None
     LD_LIBRARY_PATH: Optional[str] = None
     VLLM_USE_TRITON_FLASH_ATTN: bool = False
+    VLLM_FLASH_ATTN_VERSION: Optional[int] = None
     LOCAL_RANK: int = 0
     CUDA_VISIBLE_DEVICES: Optional[str] = None
     VLLM_ENGINE_ITERATION_TIMEOUT_S: int = 60
@@ -90,6 +91,12 @@ def get_default_config_root():
     )
 
 
+def maybe_convert_int(value: Optional[str]) -> Optional[int]:
+    if value is None:
+        return None
+    return int(value)
+
+
 # The begin-* and end* here are used by the documentation generator
 # to extract the used env vars.
 
@@ -203,6 +210,11 @@ def get_default_config_root():
     lambda: (os.environ.get("VLLM_USE_TRITON_FLASH_ATTN", "True").lower() in
              ("true", "1")),
 
+    # Force vllm to use a specific flash-attention version (2 or 3), only valid
+    # when using the flash-attention backend.
+    "VLLM_FLASH_ATTN_VERSION":
+    lambda: maybe_convert_int(os.environ.get("VLLM_FLASH_ATTN_VERSION", None)),
+
     # Internal flag to enable Dynamo fullgraph capture
     "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE":
     lambda: bool(
diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py
index fd36ea8d8806b..1806fec8833a3 100644
--- a/vllm/v1/attention/backends/flash_attn.py
+++ b/vllm/v1/attention/backends/flash_attn.py
@@ -9,8 +9,11 @@
 
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
                                               AttentionMetadata, AttentionType)
+from vllm.envs import VLLM_FLASH_ATTN_VERSION
+from vllm.platforms import current_platform
 from vllm.utils import cdiv
-from vllm.vllm_flash_attn import flash_attn_varlen_func
+from vllm.vllm_flash_attn import (flash_attn_varlen_func,
+                                  is_fa_version_supported)
 
 
 class FlashAttentionBackend(AttentionBackend):
@@ -63,7 +66,7 @@ class FlashAttentionMetadata:
     max_query_len: int
     query_start_loc: torch.Tensor
     max_seq_len: int
-    seq_start_loc: torch.Tensor
+    seq_lens: torch.Tensor
     block_table: torch.Tensor
     slot_mapping: torch.Tensor
 
@@ -71,8 +74,8 @@ class FlashAttentionMetadata:
     use_cascade: bool
     common_prefix_len: int
     cu_prefix_query_lens: Optional[torch.Tensor]
-    cu_prefix_kv_lens: Optional[torch.Tensor]
-    cu_suffix_kv_lens: Optional[torch.Tensor]
+    prefix_kv_lens: Optional[torch.Tensor]
+    suffix_kv_lens: Optional[torch.Tensor]
 
     # For logging.
     num_input_tokens: int = 0  # Number of tokens including padding.
@@ -128,6 +131,20 @@ def __init__(
                                       "are not implemented for "
                                       "FlashAttentionImpl")
 
+        # if hopper default to FA3, otherwise stick to FA2 for now
+        # TODO(lucas): profile FA3 on ampere to see if it makes sense to
+        #  use FA3 as default for both
+        if current_platform.get_device_capability()[0] >= 9:
+            self.fa_version = 3 if is_fa_version_supported(3) else 2
+        else:
+            self.fa_version = 2
+
+        if VLLM_FLASH_ATTN_VERSION is not None:
+            assert VLLM_FLASH_ATTN_VERSION in [2, 3]
+            self.fa_version = VLLM_FLASH_ATTN_VERSION
+
+        assert is_fa_version_supported(self.fa_version)
+
     def forward(
         self,
         layer: torch.nn.Module,
@@ -196,7 +213,7 @@ def forward(
                 out=output[:num_actual_tokens],
                 cu_seqlens_q=attn_metadata.query_start_loc,
                 max_seqlen_q=attn_metadata.max_query_len,
-                cu_seqlens_k=attn_metadata.seq_start_loc,
+                seqused_k=attn_metadata.seq_lens,
                 max_seqlen_k=attn_metadata.max_seq_len,
                 softmax_scale=self.scale,
                 causal=True,
@@ -204,6 +221,7 @@ def forward(
                 window_size=self.sliding_window,
                 block_table=attn_metadata.block_table,
                 softcap=self.logits_soft_cap,
+                fa_version=self.fa_version,
             )
             return output
 
@@ -216,8 +234,8 @@ def forward(
             cu_query_lens=attn_metadata.query_start_loc,
             max_query_len=attn_metadata.max_query_len,
             cu_prefix_query_lens=attn_metadata.cu_prefix_query_lens,
-            cu_prefix_kv_lens=attn_metadata.cu_prefix_kv_lens,
-            cu_suffix_kv_lens=attn_metadata.cu_suffix_kv_lens,
+            prefix_kv_lens=attn_metadata.prefix_kv_lens,
+            suffix_kv_lens=attn_metadata.suffix_kv_lens,
             max_kv_len=attn_metadata.max_seq_len,
             softmax_scale=self.scale,
             alibi_slopes=self.alibi_slopes,
@@ -225,6 +243,7 @@ def forward(
             logits_soft_cap=self.logits_soft_cap,
             block_table=attn_metadata.block_table,
             common_prefix_len=attn_metadata.common_prefix_len,
+            fa_version=self.fa_version,
         )
         return output
 
@@ -305,8 +324,8 @@ def cascade_attention(
     cu_query_lens: torch.Tensor,
     max_query_len: int,
     cu_prefix_query_lens: torch.Tensor,
-    cu_prefix_kv_lens: torch.Tensor,
-    cu_suffix_kv_lens: torch.Tensor,
+    prefix_kv_lens: torch.Tensor,
+    suffix_kv_lens: torch.Tensor,
     max_kv_len: int,
     softmax_scale: float,
     alibi_slopes: Optional[torch.Tensor],
@@ -314,6 +333,7 @@ def cascade_attention(
     logits_soft_cap: float,
     block_table: torch.Tensor,
     common_prefix_len: int,
+    fa_version: int,
 ) -> torch.Tensor:
     assert alibi_slopes is None, ("Cascade attention does not support ALiBi.")
     # TODO: Support sliding window.
@@ -332,7 +352,7 @@ def cascade_attention(
         k=key_cache,
         v=value_cache,
         cu_seqlens_q=cu_prefix_query_lens,
-        cu_seqlens_k=cu_prefix_kv_lens,
+        seqused_k=prefix_kv_lens,
         max_seqlen_q=num_tokens,
         max_seqlen_k=common_prefix_len,
         softmax_scale=softmax_scale,
@@ -341,6 +361,7 @@ def cascade_attention(
         block_table=block_table[:1],
         softcap=logits_soft_cap,
         return_softmax_lse=True,
+        fa_version=fa_version,
     )
 
     # Process suffix per query.
@@ -349,7 +370,7 @@ def cascade_attention(
         k=key_cache,
         v=value_cache,
         cu_seqlens_q=cu_query_lens,
-        cu_seqlens_k=cu_suffix_kv_lens,
+        seqused_k=suffix_kv_lens,
         max_seqlen_q=max_query_len,
         max_seqlen_k=max_kv_len - common_prefix_len,
         softmax_scale=softmax_scale,
@@ -358,6 +379,7 @@ def cascade_attention(
         block_table=block_table[:, num_common_kv_blocks:],
         softcap=logits_soft_cap,
         return_softmax_lse=True,
+        fa_version=fa_version,
     )
 
     # Merge prefix and suffix outputs, and store the result in output.
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index fdf39449a2c59..99d463b940923 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -199,11 +199,11 @@ def __init__(
                                                device="cpu",
                                                pin_memory=self.pin_memory)
         self.query_start_loc_np = self.query_start_loc_cpu.numpy()
-        self.seq_start_loc_cpu = torch.zeros(self.max_num_reqs + 1,
-                                             dtype=torch.int32,
-                                             device="cpu",
-                                             pin_memory=self.pin_memory)
-        self.seq_start_loc_np = self.seq_start_loc_cpu.numpy()
+        self.seq_lens_cpu = torch.zeros(self.max_num_reqs,
+                                        dtype=torch.int32,
+                                        device="cpu",
+                                        pin_memory=self.pin_memory)
+        self.seq_lens_np = self.seq_lens_cpu.numpy()
 
     def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
         # Remove stopped requests from the cached states.
@@ -412,11 +412,10 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"):
         np.cumsum(num_scheduled_tokens,
                   out=self.query_start_loc_np[1:num_reqs + 1])
 
-        seq_lens = (self.input_batch.num_computed_tokens_cpu[:num_reqs] +
-                    num_scheduled_tokens)
-        max_seq_len = seq_lens.max()
-        self.seq_start_loc_np[0] = 0
-        np.cumsum(seq_lens, out=self.seq_start_loc_np[1:num_reqs + 1])
+        self.seq_lens_np[:num_reqs] = (
+            self.input_batch.num_computed_tokens_cpu[:num_reqs] +
+            num_scheduled_tokens)
+        max_seq_len = self.seq_lens_np[:num_reqs].max()
 
         # Copy the tensors to the GPU.
         self.input_ids[:total_num_scheduled_tokens].copy_(
@@ -433,8 +432,8 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"):
                 non_blocking=True)
         query_start_loc = self.query_start_loc_cpu[:num_reqs + 1].to(
             self.device, non_blocking=True)
-        seq_start_loc = self.seq_start_loc_cpu[:num_reqs + 1].to(
-            self.device, non_blocking=True)
+        seq_lens = self.seq_lens_cpu[:num_reqs].to(self.device,
+                                                   non_blocking=True)
         slot_mapping = self.slot_mapping_cpu[:total_num_scheduled_tokens].to(
             self.device, non_blocking=True).long()
 
@@ -506,33 +505,30 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"):
                 [0, total_num_scheduled_tokens],
                 dtype=torch.int32,
                 device=self.device)
-            cu_prefix_kv_lens = torch.tensor([0, common_prefix_len],
-                                             dtype=torch.int32,
-                                             device=self.device)
-            cu_suffix_kv_lens = (
-                self.seq_start_loc_np[:num_reqs + 1] -
-                self.arange_np[:num_reqs + 1] * common_prefix_len)
-            cu_suffix_kv_lens = torch.from_numpy(cu_suffix_kv_lens).to(
-                self.device)
+            prefix_kv_lens = torch.tensor([common_prefix_len],
+                                          dtype=torch.int32,
+                                          device=self.device)
+            suffix_kv_lens = (self.seq_lens_np[:num_reqs] - common_prefix_len)
+            suffix_kv_lens = torch.from_numpy(suffix_kv_lens).to(self.device)
         else:
             cu_prefix_query_lens = None
-            cu_prefix_kv_lens = None
-            cu_suffix_kv_lens = None
+            prefix_kv_lens = None
+            suffix_kv_lens = None
 
         attn_metadata = FlashAttentionMetadata(
             num_actual_tokens=total_num_scheduled_tokens,
             max_query_len=max_num_scheduled_tokens,
             query_start_loc=query_start_loc,
             max_seq_len=max_seq_len,
-            seq_start_loc=seq_start_loc,
+            seq_lens=seq_lens,
             block_table=(
                 self.input_batch.block_table.get_device_tensor()[:num_reqs]),
             slot_mapping=slot_mapping,
             use_cascade=use_cascade,
             common_prefix_len=common_prefix_len,
             cu_prefix_query_lens=cu_prefix_query_lens,
-            cu_prefix_kv_lens=cu_prefix_kv_lens,
-            cu_suffix_kv_lens=cu_suffix_kv_lens,
+            prefix_kv_lens=prefix_kv_lens,
+            suffix_kv_lens=suffix_kv_lens,
         )
         # NOTE(woosuk): Due to chunked prefills, there can be at most 1 partial
         # request in the batch. While we should not sample any token from this

From d07efb31c5efdb16eb386493b326cf3e90047978 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Thu, 23 Jan 2025 22:46:58 +0800
Subject: [PATCH 348/462] [Doc] Troubleshooting errors during model inspection
 (#12351)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 .../source/getting_started/troubleshooting.md | 40 ++++++++++++++++++-
 docs/source/serving/offline_inference.md      | 33 +--------------
 2 files changed, 40 insertions(+), 33 deletions(-)

diff --git a/docs/source/getting_started/troubleshooting.md b/docs/source/getting_started/troubleshooting.md
index ec4b184bf4604..7bfe9b4036adf 100644
--- a/docs/source/getting_started/troubleshooting.md
+++ b/docs/source/getting_started/troubleshooting.md
@@ -22,9 +22,9 @@ It'd be better to store the model in a local disk. Additionally, have a look at
 To isolate the model downloading and loading issue, you can use the `--load-format dummy` argument to skip loading the model weights. This way, you can check if the model downloading and loading is the bottleneck.
 ```
 
-## Model is too large
+## Out of memory
 
-If the model is too large to fit in a single GPU, you might want to [consider tensor parallelism](#distributed-serving) to split the model across multiple GPUs. In that case, every process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). You can convert the model checkpoint to a sharded checkpoint using <gh-file:examples/offline_inference/save_sharded_state.py>. The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism.
+If the model is too large to fit in a single GPU, you will get an out-of-memory (OOM) error. Consider [using tensor parallelism](#distributed-serving) to split the model across multiple GPUs. In that case, every process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). You can convert the model checkpoint to a sharded checkpoint using <gh-file:examples/offline_inference/save_sharded_state.py>. The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism.
 
 ## Enable more logging
 
@@ -218,6 +218,42 @@ print(f(x))
 
 If it raises errors from `torch/_inductor` directory, usually it means you have a custom `triton` library that is not compatible with the version of PyTorch you are using. See [this issue](https://github.com/vllm-project/vllm/issues/12219) for example.
 
+## Model failed to be inspected
+
+If you see an error like:
+
+```text
+  File "vllm/model_executor/models/registry.py", line xxx, in _raise_for_unsupported
+    raise ValueError(
+ValueError: Model architectures ['<arch>'] failed to be inspected. Please check the logs for more details.
+```
+
+It means that vLLM failed to import the model file.
+Usually, it is related to missing dependencies or outdated binaries in the vLLM build.
+Please read the logs carefully to determine the root cause of the error.
+
+## Model not supported
+
+If you see an error like:
+
+```text
+Traceback (most recent call last):
+...
+  File "vllm/model_executor/models/registry.py", line xxx, in inspect_model_cls
+    for arch in architectures:
+TypeError: 'NoneType' object is not iterable
+```
+
+or:
+
+```text
+  File "vllm/model_executor/models/registry.py", line xxx, in _raise_for_unsupported
+    raise ValueError(
+ValueError: Model architectures ['<arch>'] are not supported for now. Supported architectures: [...]
+```
+
+But you are sure that the model is in the [list of supported models](#supported-models), there may be some issue with vLLM's model resolution. In that case, please follow [these steps](#model-resolution) to explicitly specify the vLLM implementation for the model.
+
 ## Known Issues
 
 - In `v0.5.2`, `v0.5.3`, and `v0.5.3.post1`, there is a bug caused by [zmq](https://github.com/zeromq/pyzmq/issues/2000) , which can occasionally cause vLLM to hang depending on the machine configuration. The solution is to upgrade to the latest version of `vllm` to include the [fix](gh-pr:6759).
diff --git a/docs/source/serving/offline_inference.md b/docs/source/serving/offline_inference.md
index 1f5a54f755f13..8a18598665a70 100644
--- a/docs/source/serving/offline_inference.md
+++ b/docs/source/serving/offline_inference.md
@@ -31,6 +31,8 @@ Please refer to the above pages for more details about each API.
 This section lists the most common options for running the vLLM engine.
 For a full list, refer to the [Engine Arguments](#engine-args) page.
 
+(model-resolution)=
+
 ### Model resolution
 
 vLLM loads HuggingFace-compatible models by inspecting the `architectures` field in `config.json` of the model repository
@@ -41,37 +43,6 @@ Nevertheless, our model resolution may fail for the following reasons:
 - Unofficial repositories refer to a model using alternative names which are not recorded in vLLM.
 - The same architecture name is used for multiple models, creating ambiguity as to which model should be loaded.
 
-In those cases, vLLM may throw an error like:
-
-```text
-Traceback (most recent call last):
-...
-  File "vllm/model_executor/models/registry.py", line xxx, in inspect_model_cls
-    for arch in architectures:
-TypeError: 'NoneType' object is not iterable
-```
-
-or:
-
-```text
-  File "vllm/model_executor/models/registry.py", line xxx, in _raise_for_unsupported
-    raise ValueError(
-ValueError: Model architectures ['<arch>'] are not supported for now. Supported architectures: [...]
-```
-
-:::{note}
-The above error is distinct from the following similar but different error:
-
-```text
-  File "vllm/model_executor/models/registry.py", line xxx, in _raise_for_unsupported
-    raise ValueError(
-ValueError: Model architectures ['<arch>'] failed to be inspected. Please check the logs for more details.
-```
-
-This error means that vLLM failed to import the model file. Usually, it is related to missing dependencies or outdated
-binaries in the vLLM build. Please read the logs carefully to determine the real cause of the error.
-:::
-
 To fix this, explicitly specify the model architecture by passing `config.json` overrides to the `hf_overrides` option.
 For example:
 

From 99d01a5e3d5278284bad359ac8b87ee7a551afda Mon Sep 17 00:00:00 2001
From: Roger Wang <136131678+ywang96@users.noreply.github.com>
Date: Thu, 23 Jan 2025 07:13:23 -0800
Subject: [PATCH 349/462] [V1] Simplify M-RoPE (#12352)

Signed-off-by: Roger Wang <ywang@roblox.com>
Co-authored-by: imkero <kerorek@outlook.com>
---
 vllm/v1/worker/gpu_model_runner.py | 24 ++++++++++--------------
 1 file changed, 10 insertions(+), 14 deletions(-)

diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 99d463b940923..bfc0684de0622 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -144,28 +144,24 @@ def __init__(
 
         # Only relevant for models using M-RoPE (e.g, Qwen2-VL)
         if self.model_config.uses_mrope:
-            # NOTE: `mrope_positions` is implemented as a permuted tensor to
-            # satisfy the following properties to allow `torch.compile` to work
-            # properly:
-            # - shape: (3, <variable>)
-            # - stride: (1, 3)
-            # See detailed explanation in https://github.com/vllm-project/vllm/pull/12128#discussion_r1921022256
+            # NOTE: `mrope_positions` is implemented with one additional dummy
+            # position on purpose to make it non-contiguous so that it can work
+            # with torch compile.
+            # See detailed explanation in https://github.com/vllm-project/vllm/pull/12128#discussion_r1926431923
 
             # NOTE: When M-RoPE is enabled, position ids are 3D regardless of
             # the modality of inputs. For text-only inputs, each dimension has
             # identical position IDs, making M-RoPE functionally equivalent to
             # 1D-RoPE.
             # See page 5 of https://arxiv.org/abs/2409.12191
-            self.mrope_positions = torch.zeros((self.max_num_tokens, 3),
+            self.mrope_positions = torch.zeros((3, self.max_num_tokens + 1),
                                                dtype=torch.int64,
                                                device=self.device)
-            self.mrope_positions_cpu = torch.zeros((self.max_num_tokens, 3),
-                                                   dtype=torch.int64,
-                                                   device="cpu",
-                                                   pin_memory=self.pin_memory)
-
-            self.mrope_positions = self.mrope_positions.permute((1, 0))
-            self.mrope_positions_cpu = self.mrope_positions_cpu.permute((1, 0))
+            self.mrope_positions_cpu = torch.zeros(
+                (3, self.max_num_tokens + 1),
+                dtype=torch.int64,
+                device="cpu",
+                pin_memory=self.pin_memory)
 
         self.inputs_embeds = torch.zeros(
             (self.max_num_tokens, self.hidden_size),

From 8c01b8022c78d583394509f8cfa2af4e7ca79279 Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Fri, 24 Jan 2025 01:20:33 +0800
Subject: [PATCH 350/462] [Bugfix] Fix broken internvl2 inference with v1
 (#12360)

Signed-off-by: Isotr0py <2037008807@qq.com>
---
 vllm/multimodal/utils.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py
index 094e0682a068b..900bed5929b3d 100644
--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -503,8 +503,13 @@ def modality_group_func(mm_input: "MultiModalKwargs") -> Union[str, int]:
         if len(mm_input.modalities) > 1:
             return id(mm_input)
 
-        # Otherwise return the modality string
-        return list(mm_input.modalities)[0]
+        elif len(mm_input.modalities) == 1:
+            return list(mm_input.modalities)[0]
+
+        # FIXME(Isotr0py): Modality of mm_input from legacy pipeline is empty,
+        # this is used to make InternVL with legacy pipeline still work with v1.
+        else:
+            return ""
 
     return [
         list(group) for _, group in groupby(mm_inputs, key=modality_group_func)

From 3f50c148fd6476fc9099c04add39e873d9972663 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Fri, 24 Jan 2025 02:00:50 +0800
Subject: [PATCH 351/462] [core] add wake_up doc and some sanity check (#12361)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 vllm/entrypoints/llm.py        | 3 +++
 vllm/executor/executor_base.py | 9 +++++++++
 2 files changed, 12 insertions(+)

diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 563031cfadc4c..1860ed3d7db5a 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -1157,6 +1157,9 @@ def sleep(self, level: int = 1):
         self.llm_engine.sleep(level=level)
 
     def wake_up(self):
+        """
+        Wake up the engine from sleep mode. See the :meth:`sleep` method
+        for more details."""
         self.llm_engine.wake_up()
 
     # LEGACY
diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py
index 6be62d4068572..471d1bfac3119 100644
--- a/vllm/executor/executor_base.py
+++ b/vllm/executor/executor_base.py
@@ -47,6 +47,7 @@ def __init__(
         self.prompt_adapter_config = vllm_config.prompt_adapter_config
         self.observability_config = vllm_config.observability_config
         self._init_executor()
+        self.is_sleeping = False
 
     @abstractmethod
     def _init_executor(self) -> None:
@@ -194,10 +195,18 @@ def stop_profile(self) -> None:
         self.collective_rpc("stop_profile")
 
     def sleep(self, level: int = 1):
+        if self.is_sleeping:
+            logger.warning("Executor is already sleeping.")
+            return
         self.collective_rpc("sleep", kwargs=dict(level=level))
+        self.is_sleeping = True
 
     def wake_up(self):
+        if not self.is_sleeping:
+            logger.warning("Executor is not sleeping.")
+            return
         self.collective_rpc("wake_up")
+        self.is_sleeping = False
 
     def save_sharded_state(
         self,

From 6e650f56a16618db87147d97f699fa407ed1205d Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Fri, 24 Jan 2025 02:01:30 +0800
Subject: [PATCH 352/462] [torch.compile] decouple compile sizes and cudagraph
 sizes (#12243)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 vllm/compilation/backends.py       | 10 ++---
 vllm/config.py                     | 71 ++++++++++++++++--------------
 vllm/engine/metrics.py             |  3 +-
 vllm/v1/worker/gpu_model_runner.py | 18 ++++----
 vllm/v1/worker/gpu_worker.py       | 12 +++++
 vllm/worker/model_runner.py        | 27 +++++++-----
 vllm/worker/worker.py              | 12 +++++
 7 files changed, 95 insertions(+), 58 deletions(-)

diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
index b9f96c00284b9..7f4f97466d503 100644
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -680,7 +680,7 @@ def copy_and_call(*args):
 class ConcreteSizeEntry:
     runtime_shape: int
     need_to_compile: bool  # the size is in compile_sizes
-    use_cudagraph: bool  # the size is in capture_sizes
+    use_cudagraph: bool  # the size is in cudagraph_capture_sizes
 
     compiled: bool = False
     runnable: Callable = None  # type: ignore
@@ -727,8 +727,8 @@ def __init__(self, graph: fx.GraphModule, vllm_config: VllmConfig,
 
         self.compile_sizes: Set[int] = set(
             self.compilation_config.compile_sizes)
-        self.capture_sizes: Set[int] = set(
-            self.compilation_config.capture_sizes
+        self.cudagraph_capture_sizes: Set[int] = set(
+            self.compilation_config.cudagraph_capture_sizes
         ) if self.compilation_config.use_cudagraph else set()
 
         self.first_run_finished = False
@@ -746,11 +746,11 @@ def __init__(self, graph: fx.GraphModule, vllm_config: VllmConfig,
         # to_be_compiled_sizes tracks the remaining sizes to compile,
         # and updates during the compilation process, so we need to copy it
         self.to_be_compiled_sizes: Set[int] = self.compile_sizes.copy()
-        for shape in self.compile_sizes.union(self.capture_sizes):
+        for shape in self.compile_sizes.union(self.cudagraph_capture_sizes):
             self.concrete_size_entries[shape] = ConcreteSizeEntry(
                 runtime_shape=shape,
                 need_to_compile=shape in self.compile_sizes,
-                use_cudagraph=shape in self.capture_sizes,
+                use_cudagraph=shape in self.cudagraph_capture_sizes,
             )
 
     def check_for_ending_compilation(self):
diff --git a/vllm/config.py b/vllm/config.py
index f4548c4466e48..f7547921a05ea 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -2711,10 +2711,11 @@ class CompilationConfig(BaseModel):
         - use_inductor: whether to use inductor compilation.
             - False: inductor compilation is not used. graph runs in eager.
             - True: inductor compilation is used. one graph for symbolic shape
-                is compiled. In addition, compile for cudagraph sizes that are
-                in candidate_compile_sizes, using configurations
-                in inductor_compile_config.
-        - candidate_compile_sizes: sizes to compile for inductor.
+                is compiled. In addition, compile for compile_sizes,
+                using configurations in inductor_compile_config.
+        - compile_sizes: sizes to compile for inductor. In addition
+            to integers, it also supports "cudagraph_capture_sizes" to
+            specify the sizes for cudagraph capture.
         - inductor_compile_config: additional configurations for inductor.
             - None: use default configurations.
         - inductor_passes: additional passes for inductor. It is a dictionary
@@ -2742,7 +2743,7 @@ class CompilationConfig(BaseModel):
     splitting_ops: List[str] = Field(default=None)  # type: ignore
 
     use_inductor: bool = True
-    candidate_compile_sizes: Optional[List[int]] = Field(default=None)
+    compile_sizes: Optional[List[Union[int, str]]] = Field(default=None)
     inductor_compile_config: Dict = Field(default_factory=dict)
     inductor_passes: Dict[str, str] = Field(default_factory=dict)
 
@@ -2790,8 +2791,6 @@ def model_post_init(self, __context: Any) -> None:
     pass_config: PassConfig = Field(default_factory=PassConfig)
 
     # not configurable, computed after init
-    compile_sizes: List[int] = PrivateAttr
-    capture_sizes: List[int] = PrivateAttr
     max_capture_size: int = PrivateAttr
     local_cache_dir: str = PrivateAttr  # local cache dir for each rank
     # optimization:
@@ -2918,43 +2917,47 @@ def init_backend(self, vllm_config: "VllmConfig") -> Union[str, Callable]:
         from vllm.compilation.backends import VllmBackend
         return VllmBackend(vllm_config)
 
-    def init_with_cudagraph_sizes(self, sizes_to_specialize: List[int]):
+    def init_with_cudagraph_sizes(self,
+                                  cudagraph_capture_sizes: List[int]) -> None:
         """To complete the initialization of config,
         we need to know the cudagraph sizes."""
 
         if self.cudagraph_capture_sizes is None:
-            self.capture_sizes = sizes_to_specialize
+            self.cudagraph_capture_sizes = cudagraph_capture_sizes
         else:
-            self.capture_sizes = self.cudagraph_capture_sizes
+            # de-duplicate the sizes provided by the config
+            self.cudagraph_capture_sizes = list(
+                set(self.cudagraph_capture_sizes))
             logger.info(("cudagraph sizes specified by model runner"
                          " %s is overridden by config %s"),
-                        sizes_to_specialize, self.cudagraph_capture_sizes)
-
-        if self.candidate_compile_sizes is None:
-            self.candidate_compile_sizes = []
-        self.compile_sizes = [
-            x for x in self.candidate_compile_sizes if x in self.capture_sizes
-        ]
-        ignored_sizes = [
-            x for x in self.candidate_compile_sizes
-            if x not in self.capture_sizes
-        ]
-        if ignored_sizes:
-            logger.warning(("candidate_compile_sizes %s are ignored "
-                            "because they are not cudagraph capture sizes."),
-                           ignored_sizes)
+                        cudagraph_capture_sizes, self.cudagraph_capture_sizes)
+
+        computed_compile_sizes = []
+        if self.compile_sizes is not None:
+            # de-duplicate the sizes provided by the config
+            self.compile_sizes = list(set(self.compile_sizes))
+            for x in self.compile_sizes:
+                if isinstance(x, str):
+                    assert x == "cudagraph_capture_sizes", \
+                    "Unrecognized size type in compile_sizes, " \
+                    f"expect 'cudagraph_capture_sizes', got {x}"
+                    computed_compile_sizes.extend(self.cudagraph_capture_sizes)
+                else:
+                    assert isinstance(x, int)
+                    computed_compile_sizes.append(x)
+        self.compile_sizes = computed_compile_sizes  # type: ignore
 
         # sort to make sure cudagraph capture sizes are in descending order
-        self.capture_sizes.sort(reverse=True)
-        self.max_capture_size = self.capture_sizes[
-            0] if self.capture_sizes else 0
+        self.cudagraph_capture_sizes.sort(reverse=True)
+        self.max_capture_size = self.cudagraph_capture_sizes[
+            0] if self.cudagraph_capture_sizes else 0
 
         # pre-compute the mapping from batch size to padded graph size
         self.bs_to_padded_graph_size = [
             0 for i in range(self.max_capture_size + 1)
         ]
-        for end, start in zip(self.capture_sizes,
-                              self.capture_sizes[1:] + [0]):
+        for end, start in zip(self.cudagraph_capture_sizes,
+                              self.cudagraph_capture_sizes[1:] + [0]):
             for bs in range(start, end):
                 if bs == start:
                     self.bs_to_padded_graph_size[bs] = start
@@ -3225,14 +3228,14 @@ def _set_cudagraph_sizes(self):
         However, if users specify the cudagraph capture sizes through
         compilation config, we will use the specified sizes instead.
 
-        In the end, `vllm_config.compilation_config.capture_sizes` will be the
-        final sizes to capture cudagraph (in descending order).
+        In the end, `vllm_config.compilation_config.cudagraph_capture_sizes`
+        will be the final sizes to capture cudagraph (in descending order).
 
         During runtime, if batchsize is larger than
-        `vllm_config.compilation_config.capture_sizes`,
+        `vllm_config.compilation_config.cudagraph_capture_sizes`,
         no cudagraph will be used.
         If the batch size is no larger than
-        `vllm_config.compilation_config.capture_sizes`,
+        `vllm_config.compilation_config.cudagraph_capture_sizes`,
         we can quickly find the padded graph size for a given batch size by
         looking up `vllm_config.compilation_config.bs_to_padded_graph_size`.
         """
diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py
index c8aec8dd3afa3..f7ce21d0ae988 100644
--- a/vllm/engine/metrics.py
+++ b/vllm/engine/metrics.py
@@ -120,7 +120,8 @@ def __init__(self, labelnames: List[str], vllm_config: VllmConfig):
             labelnames=labelnames)
         buckets = [1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8096]
         if not vllm_config.model_config.enforce_eager:
-            buckets = vllm_config.compilation_config.capture_sizes.copy()
+            buckets = vllm_config.compilation_config.\
+                cudagraph_capture_sizes.copy()
             buckets.sort()
         self.histogram_iteration_tokens = self._histogram_cls(
             name="vllm:iteration_tokens_total",
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index bfc0684de0622..4b3c325ded906 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -1,6 +1,6 @@
 import gc
 import time
-from typing import TYPE_CHECKING, Dict, List, Tuple, cast
+from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, cast
 
 import numpy as np
 import torch
@@ -128,7 +128,8 @@ def __init__(
         # self.cudagraph_batch_sizes sorts in ascending order.
         # The batch sizes in the config are in descending order.
         self.cudagraph_batch_sizes = list(
-            reversed(self.vllm_config.compilation_config.capture_sizes))
+            reversed(
+                self.vllm_config.compilation_config.cudagraph_capture_sizes))
 
         # Cache the device properties.
         self.device_properties = torch.cuda.get_device_properties(self.device)
@@ -834,10 +835,12 @@ def load_model(self) -> None:
     @torch.inference_mode()
     def _dummy_run(
         self,
-        model: nn.Module,
         num_tokens: int,
-        kv_caches: List[torch.Tensor],
+        kv_caches: Optional[List[torch.Tensor]] = None,
     ) -> torch.Tensor:
+        model = self.model
+        if kv_caches is None:
+            kv_caches = self.kv_caches
         if self.is_multimodal_model:
             input_ids = None
             inputs_embeds = self.inputs_embeds[:num_tokens]
@@ -963,8 +966,7 @@ def profile_run(self) -> None:
             self.encoder_cache["tmp"] = dict(enumerate(dummy_encoder_outputs))
 
         # Trigger compilation for general shape.
-        hidden_states = self._dummy_run(self.model, self.max_num_tokens,
-                                        dummy_kv_caches)
+        hidden_states = self._dummy_run(self.max_num_tokens, dummy_kv_caches)
         logits = self.model.compute_logits(hidden_states, None)
         logits = logits[:self.max_num_tokens]
         # TODO(woosuk): Consider the memory usage of the sampler.
@@ -990,8 +992,8 @@ def capture_model(self) -> None:
             for num_tokens in reversed(self.cudagraph_batch_sizes):
                 for _ in range(self.vllm_config.compilation_config.
                                cudagraph_num_of_warmups):
-                    self._dummy_run(self.model, num_tokens, self.kv_caches)
-                self._dummy_run(self.model, num_tokens, self.kv_caches)
+                    self._dummy_run(num_tokens)
+                self._dummy_run(num_tokens)
 
         end_time = time.perf_counter()
         end_free_gpu_memory = torch.cuda.mem_get_info()[0]
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index e69b6c90454c4..a8cf0aec3f17b 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -206,6 +206,18 @@ def initialize_cache(self, kv_cache_config: KVCacheConfig) -> None:
             self.model_runner.initialize_kv_cache(kv_cache_config)
 
     def compile_or_warm_up_model(self) -> None:
+        # warm up sizes that are not in cudagraph capture sizes,
+        # but users still want to compile for better performance,
+        # e.g. for the max-num-batched token size in chunked prefill.
+        warmup_sizes = self.vllm_config.compilation_config.compile_sizes.copy()
+        if not self.model_config.enforce_eager:
+            warmup_sizes = [
+                x for x in warmup_sizes if x not in
+                self.vllm_config.compilation_config.cudagraph_capture_sizes
+            ]
+        for size in sorted(warmup_sizes, reverse=True):
+            logger.info("Compile and warming up model for size %d", size)
+            self.model_runner._dummy_run(size)
         if not self.model_config.enforce_eager:
             self.model_runner.capture_model()
         # Reset the seed to ensure that the random state is not affected by
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index e311c14111d49..fe504821a0d96 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -1256,13 +1256,19 @@ def set_in_profile_run(self):
 
     @torch.inference_mode()
     def profile_run(self) -> None:
+        max_num_batched_tokens = \
+            self.scheduler_config.max_num_batched_tokens
+        max_num_seqs = self.scheduler_config.max_num_seqs
+        self._dummy_run(max_num_batched_tokens, max_num_seqs)
+
+    def _dummy_run(self,
+                   max_num_batched_tokens: int,
+                   max_num_seqs: int = 1) -> None:
         with self.set_in_profile_run():
             # Enable top-k sampling to reflect the accurate memory usage.
             sampling_params = \
                 SamplingParams(top_p=0.99, top_k=self.vocab_size - 1)
-            max_num_batched_tokens = \
-                self.scheduler_config.max_num_batched_tokens
-            max_num_seqs = self.scheduler_config.max_num_seqs
+
             # This represents the maximum number of different requests
             # that will have unique loras, an therefore the max amount of memory
             # consumption create dummy lora request copies from the lora request
@@ -1491,13 +1497,14 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None:
             for virtual_engine in range(
                     self.parallel_config.pipeline_parallel_size):
                 # Only rank 0 should print progress bar during capture
-                capture_sizes = (
-                    tqdm(
-                        self.vllm_config.compilation_config.capture_sizes,
-                        desc="Capturing CUDA graph shapes",
-                    ) if get_tensor_model_parallel_rank() == 0 else
-                    self.vllm_config.compilation_config.capture_sizes)
-                for batch_size in capture_sizes:
+                cudagraph_capture_sizes = (tqdm(
+                    self.vllm_config.compilation_config.
+                    cudagraph_capture_sizes,
+                    desc="Capturing CUDA graph shapes",
+                ) if get_tensor_model_parallel_rank() == 0 else
+                                           self.vllm_config.compilation_config.
+                                           cudagraph_capture_sizes)
+                for batch_size in cudagraph_capture_sizes:
                     attn_metadata = (
                         self.attn_state.graph_capture_get_metadata_for_batch(
                             batch_size,
diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py
index c95d2a9a69ff3..24bba79fedd75 100644
--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -323,6 +323,18 @@ def _init_cache_engine(self):
                       self.gpu_cache)
 
     def _warm_up_model(self) -> None:
+        # warm up sizes that are not in cudagraph capture sizes,
+        # but users still want to compile for better performance,
+        # e.g. for the max-num-batched token size in chunked prefill.
+        warmup_sizes = self.vllm_config.compilation_config.compile_sizes.copy()
+        if not self.model_config.enforce_eager:
+            warmup_sizes = [
+                x for x in warmup_sizes if x not in
+                self.vllm_config.compilation_config.cudagraph_capture_sizes
+            ]
+        for size in sorted(warmup_sizes, reverse=True):
+            logger.info("Compile and warming up model for size %d", size)
+            self.model_runner._dummy_run(size)
         if not self.model_config.enforce_eager:
             self.model_runner.capture_model(self.gpu_cache)
         # Reset the seed to ensure that the random state is not affected by

From e97f802b2d74861af77997691a7d1c36498f6dca Mon Sep 17 00:00:00 2001
From: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com>
Date: Thu, 23 Jan 2025 13:04:03 -0500
Subject: [PATCH 353/462] [FP8][Kernel] Dynamic kv cache scaling factors
 computation (#11906)

Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
Co-authored-by: Micah Williamson <micah.williamson@amd.com>
---
 .../kernels/benchmark_paged_attention.py      |   4 +-
 csrc/attention/attention_kernels.cuh          |  10 +-
 csrc/attention/paged_attention_v1.cu          |  17 +-
 csrc/attention/paged_attention_v2.cu          |  17 +-
 csrc/cache.h                                  |   6 +-
 csrc/cache_kernels.cu                         |  30 +-
 csrc/cpu/attention.cpp                        |  12 +-
 csrc/cpu/cache.cpp                            |   6 +-
 csrc/cpu/torch_bindings.cpp                   |   6 +-
 csrc/ops.h                                    |  10 +-
 csrc/rocm/attention.cu                        |  17 +-
 csrc/rocm/ops.h                               |   4 +-
 csrc/rocm/torch_bindings.cpp                  |   2 +-
 csrc/torch_bindings.cpp                       |   8 +-
 .../quantization/quantized_kvcache.md         |  10 +-
 examples/other/fp8/README.md                  |  96 -----
 examples/other/fp8/extract_scales.py          | 367 ------------------
 examples/other/fp8/quantizer/README.md        |  32 --
 examples/other/fp8/quantizer/quantize.py      | 367 ------------------
 .../llama2-70b-fp8-kv/kv_cache_scales.json    |  90 -----
 .../llama2-7b-fp8-kv/kv_cache_scales.json     |  42 --
 tests/kernels/test_attention.py               |   2 +-
 tests/kernels/test_blocksparse_attention.py   |   2 +-
 tests/kernels/test_cache.py                   |  10 +-
 tests/kernels/test_prefix_prefill.py          |  10 +
 tests/kernels/utils.py                        |   2 +
 .../models/decoder_only/language/test_fp8.py  |  15 +-
 tests/worker/test_model_input.py              |   3 +
 vllm/_custom_ops.py                           |  20 +-
 vllm/attention/backends/abstract.py           |  10 +-
 vllm/attention/backends/blocksparse_attn.py   |   2 +
 vllm/attention/backends/flash_attn.py         |   5 +-
 vllm/attention/backends/flashinfer.py         |  10 +-
 vllm/attention/backends/ipex_attn.py          |   2 +-
 vllm/attention/backends/pallas.py             |   2 +-
 vllm/attention/backends/placeholder_attn.py   |   3 +
 vllm/attention/backends/rocm_flash_attn.py    |   2 +
 vllm/attention/backends/torch_sdpa.py         |   2 +-
 vllm/attention/backends/utils.py              |   2 +
 vllm/attention/backends/xformers.py           |   2 +
 vllm/attention/layer.py                       |  28 +-
 vllm/attention/ops/ipex_attn.py               |  16 +-
 vllm/attention/ops/paged_attn.py              |  12 +-
 vllm/attention/ops/prefix_prefill.py          |  12 +-
 vllm/config.py                                |  16 +-
 vllm/engine/arg_utils.py                      |  25 +-
 vllm/envs.py                                  |   9 +
 .../layers/quantization/kv_cache.py           |  19 +-
 .../model_loader/weight_utils.py              |  45 +--
 vllm/model_executor/models/exaone.py          |  35 +-
 vllm/model_executor/models/granite.py         |  32 +-
 vllm/model_executor/models/llama.py           |  35 +-
 vllm/model_executor/models/mllama.py          |   7 +-
 vllm/model_executor/models/solar.py           |  35 +-
 vllm/v1/attention/backends/flash_attn.py      |   4 -
 vllm/worker/hpu_model_runner.py               |   7 +-
 vllm/worker/model_runner.py                   |  37 +-
 vllm/worker/openvino_model_runner.py          |   1 +
 vllm/worker/tpu_model_runner.py               |   5 +
 vllm/worker/xpu_model_runner.py               |   2 +
 60 files changed, 276 insertions(+), 1365 deletions(-)
 delete mode 100644 examples/other/fp8/README.md
 delete mode 100644 examples/other/fp8/extract_scales.py
 delete mode 100644 examples/other/fp8/quantizer/README.md
 delete mode 100644 examples/other/fp8/quantizer/quantize.py
 delete mode 100644 tests/fp8_kv/llama2-70b-fp8-kv/kv_cache_scales.json
 delete mode 100644 tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json

diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py
index 14eef00b855ac..219013a38134b 100644
--- a/benchmarks/kernels/benchmark_paged_attention.py
+++ b/benchmarks/kernels/benchmark_paged_attention.py
@@ -98,7 +98,9 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
         start_time = time.perf_counter()
 
         # Using default kv_scale
-        k_scale = v_scale = 1.0
+        k_scale = v_scale = torch.tensor(1.0,
+                                         dtype=torch.float32,
+                                         device=device)
 
         for _ in range(num_iters):
             if version == "v1":
diff --git a/csrc/attention/attention_kernels.cuh b/csrc/attention/attention_kernels.cuh
index 563e1438f0b01..eb216dc8baf10 100644
--- a/csrc/attention/attention_kernels.cuh
+++ b/csrc/attention/attention_kernels.cuh
@@ -105,7 +105,7 @@ __device__ void paged_attention_kernel(
     const int max_num_blocks_per_seq,
     const float* __restrict__ alibi_slopes,  // [num_heads]
     const int q_stride, const int kv_block_stride, const int kv_head_stride,
-    const float k_scale, const float v_scale, const int tp_rank,
+    const float* k_scale, const float* v_scale, const int tp_rank,
     const int blocksparse_local_blocks, const int blocksparse_vert_stride,
     const int blocksparse_block_size, const int blocksparse_head_sliding_step) {
   const int seq_idx = blockIdx.y;
@@ -285,7 +285,7 @@ __device__ void paged_attention_kernel(
           Quant_vec k_vec_quant = *reinterpret_cast<const Quant_vec*>(
               k_ptr + offset1 * BLOCK_SIZE * x + offset2);
           k_vecs[j] = fp8::scaled_convert<K_vec, Quant_vec, KV_DTYPE>(
-              k_vec_quant, k_scale);
+              k_vec_quant, *k_scale);
         }
       }
 
@@ -415,7 +415,7 @@ __device__ void paged_attention_kernel(
               *reinterpret_cast<const V_quant_vec*>(v_ptr + offset);
           // Vector conversion from V_quant_vec to V_vec.
           v_vec = fp8::scaled_convert<V_vec, V_quant_vec, KV_DTYPE>(v_quant_vec,
-                                                                    v_scale);
+                                                                    *v_scale);
         }
         if (block_idx == num_seq_blocks - 1) {
           // NOTE(woosuk): When v_vec contains the tokens that are out of the
@@ -513,7 +513,7 @@ __global__ void paged_attention_v1_kernel(
     const int max_num_blocks_per_seq,
     const float* __restrict__ alibi_slopes,  // [num_heads]
     const int q_stride, const int kv_block_stride, const int kv_head_stride,
-    const float k_scale, const float v_scale, const int tp_rank,
+    const float* k_scale, const float* v_scale, const int tp_rank,
     const int blocksparse_local_blocks, const int blocksparse_vert_stride,
     const int blocksparse_block_size, const int blocksparse_head_sliding_step) {
   paged_attention_kernel<scalar_t, cache_t, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS,
@@ -549,7 +549,7 @@ __global__ void paged_attention_v2_kernel(
     const int max_num_blocks_per_seq,
     const float* __restrict__ alibi_slopes,  // [num_heads]
     const int q_stride, const int kv_block_stride, const int kv_head_stride,
-    const float k_scale, const float v_scale, const int tp_rank,
+    const float* k_scale, const float* v_scale, const int tp_rank,
     const int blocksparse_local_blocks, const int blocksparse_vert_stride,
     const int blocksparse_block_size, const int blocksparse_head_sliding_step) {
   paged_attention_kernel<scalar_t, cache_t, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS,
diff --git a/csrc/attention/paged_attention_v1.cu b/csrc/attention/paged_attention_v1.cu
index 27321148f6dda..9b3a5c4b1014a 100644
--- a/csrc/attention/paged_attention_v1.cu
+++ b/csrc/attention/paged_attention_v1.cu
@@ -41,7 +41,7 @@
           out_ptr, query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, \
           scale, block_tables_ptr, seq_lens_ptr, max_num_blocks_per_seq,    \
           alibi_slopes_ptr, q_stride, kv_block_stride, kv_head_stride,      \
-          k_scale, v_scale, tp_rank, blocksparse_local_blocks,              \
+          k_scale_ptr, v_scale_ptr, tp_rank, blocksparse_local_blocks,      \
           blocksparse_vert_stride, blocksparse_block_size,                  \
           blocksparse_head_sliding_step);
 
@@ -53,10 +53,10 @@ void paged_attention_v1_launcher(
     torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
     torch::Tensor& value_cache, int num_kv_heads, float scale,
     torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len,
-    const std::optional<torch::Tensor>& alibi_slopes, float k_scale,
-    float v_scale, const int tp_rank, const int blocksparse_local_blocks,
-    const int blocksparse_vert_stride, const int blocksparse_block_size,
-    const int blocksparse_head_sliding_step) {
+    const std::optional<torch::Tensor>& alibi_slopes, torch::Tensor& k_scale,
+    torch::Tensor& v_scale, const int tp_rank,
+    const int blocksparse_local_blocks, const int blocksparse_vert_stride,
+    const int blocksparse_block_size, const int blocksparse_head_sliding_step) {
   int num_seqs = query.size(0);
   int num_heads = query.size(1);
   int head_size = query.size(2);
@@ -80,6 +80,8 @@ void paged_attention_v1_launcher(
   CACHE_T* value_cache_ptr = reinterpret_cast<CACHE_T*>(value_cache.data_ptr());
   int* block_tables_ptr = block_tables.data_ptr<int>();
   int* seq_lens_ptr = seq_lens.data_ptr<int>();
+  const float* k_scale_ptr = reinterpret_cast<const float*>(k_scale.data_ptr());
+  const float* v_scale_ptr = reinterpret_cast<const float*>(v_scale.data_ptr());
 
   constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
   int padded_max_seq_len =
@@ -177,8 +179,9 @@ void paged_attention_v1(
     torch::Tensor& seq_lens,      // [num_seqs]
     int64_t block_size, int64_t max_seq_len,
     const std::optional<torch::Tensor>& alibi_slopes,
-    const std::string& kv_cache_dtype, double k_scale, double v_scale,
-    const int64_t tp_rank, const int64_t blocksparse_local_blocks,
+    const std::string& kv_cache_dtype, torch::Tensor& k_scale,
+    torch::Tensor& v_scale, const int64_t tp_rank,
+    const int64_t blocksparse_local_blocks,
     const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
     const int64_t blocksparse_head_sliding_step) {
   const bool is_block_sparse = (blocksparse_vert_stride > 1);
diff --git a/csrc/attention/paged_attention_v2.cu b/csrc/attention/paged_attention_v2.cu
index a453b2243e48c..9935359e02fb1 100644
--- a/csrc/attention/paged_attention_v2.cu
+++ b/csrc/attention/paged_attention_v2.cu
@@ -37,7 +37,7 @@
           exp_sums_ptr, max_logits_ptr, tmp_out_ptr, query_ptr, key_cache_ptr, \
           value_cache_ptr, num_kv_heads, scale, block_tables_ptr,              \
           seq_lens_ptr, max_num_blocks_per_seq, alibi_slopes_ptr, q_stride,    \
-          kv_block_stride, kv_head_stride, k_scale, v_scale, tp_rank,          \
+          kv_block_stride, kv_head_stride, k_scale_ptr, v_scale_ptr, tp_rank,  \
           blocksparse_local_blocks, blocksparse_vert_stride,                   \
           blocksparse_block_size, blocksparse_head_sliding_step);              \
   vllm::paged_attention_v2_reduce_kernel<T, HEAD_SIZE, NUM_THREADS,            \
@@ -54,10 +54,10 @@ void paged_attention_v2_launcher(
     torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
     torch::Tensor& value_cache, int num_kv_heads, float scale,
     torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len,
-    const std::optional<torch::Tensor>& alibi_slopes, float k_scale,
-    float v_scale, const int tp_rank, const int blocksparse_local_blocks,
-    const int blocksparse_vert_stride, const int blocksparse_block_size,
-    const int blocksparse_head_sliding_step) {
+    const std::optional<torch::Tensor>& alibi_slopes, torch::Tensor& k_scale,
+    torch::Tensor& v_scale, const int tp_rank,
+    const int blocksparse_local_blocks, const int blocksparse_vert_stride,
+    const int blocksparse_block_size, const int blocksparse_head_sliding_step) {
   int num_seqs = query.size(0);
   int num_heads = query.size(1);
   int head_size = query.size(2);
@@ -84,6 +84,8 @@ void paged_attention_v2_launcher(
   CACHE_T* value_cache_ptr = reinterpret_cast<CACHE_T*>(value_cache.data_ptr());
   int* block_tables_ptr = block_tables.data_ptr<int>();
   int* seq_lens_ptr = seq_lens.data_ptr<int>();
+  const float* k_scale_ptr = reinterpret_cast<const float*>(k_scale.data_ptr());
+  const float* v_scale_ptr = reinterpret_cast<const float*>(v_scale.data_ptr());
 
   constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
   int max_num_partitions = DIVIDE_ROUND_UP(max_seq_len, PARTITION_SIZE);
@@ -188,8 +190,9 @@ void paged_attention_v2(
     torch::Tensor& seq_lens,      // [num_seqs]
     int64_t block_size, int64_t max_seq_len,
     const std::optional<torch::Tensor>& alibi_slopes,
-    const std::string& kv_cache_dtype, double k_scale, double v_scale,
-    const int64_t tp_rank, const int64_t blocksparse_local_blocks,
+    const std::string& kv_cache_dtype, torch::Tensor& k_scale,
+    torch::Tensor& v_scale, const int64_t tp_rank,
+    const int64_t blocksparse_local_blocks,
     const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
     const int64_t blocksparse_head_sliding_step) {
   const bool is_block_sparse = (blocksparse_vert_stride > 1);
diff --git a/csrc/cache.h b/csrc/cache.h
index 11c4c5001daaa..eedad9fafa3c0 100644
--- a/csrc/cache.h
+++ b/csrc/cache.h
@@ -18,15 +18,15 @@ void copy_blocks(std::vector<torch::Tensor> const& key_caches,
 void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
                        torch::Tensor& key_cache, torch::Tensor& value_cache,
                        torch::Tensor& slot_mapping,
-                       const std::string& kv_cache_dtype, const double k_scale,
-                       const double v_scale);
+                       const std::string& kv_cache_dtype,
+                       torch::Tensor& k_scale, torch::Tensor& v_scale);
 
 void reshape_and_cache_flash(torch::Tensor& key, torch::Tensor& value,
                              torch::Tensor& key_cache,
                              torch::Tensor& value_cache,
                              torch::Tensor& slot_mapping,
                              const std::string& kv_cache_dtype,
-                             const double k_scale, const double v_scale);
+                             torch::Tensor& k_scale, torch::Tensor& v_scale);
 
 // Just for unittest
 void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu
index 8a95279f9a25a..21a0aec0ececc 100644
--- a/csrc/cache_kernels.cu
+++ b/csrc/cache_kernels.cu
@@ -159,8 +159,8 @@ __global__ void reshape_and_cache_kernel(
                                          // block_size]
     const int64_t* __restrict__ slot_mapping,  // [num_tokens]
     const int key_stride, const int value_stride, const int num_heads,
-    const int head_size, const int block_size, const int x, const float k_scale,
-    const float v_scale) {
+    const int head_size, const int block_size, const int x,
+    const float* k_scale, const float* v_scale) {
   const int64_t token_idx = blockIdx.x;
   const int64_t slot_idx = slot_mapping[token_idx];
   if (slot_idx < 0) {
@@ -196,9 +196,9 @@ __global__ void reshape_and_cache_kernel(
       value_cache[tgt_value_idx] = tgt_value;
     } else {
       key_cache[tgt_key_idx] =
-          fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_key, k_scale);
+          fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_key, *k_scale);
       value_cache[tgt_value_idx] =
-          fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_value, v_scale);
+          fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_value, *v_scale);
     }
   }
 }
@@ -214,7 +214,7 @@ __global__ void reshape_and_cache_flash_kernel(
     const int64_t* __restrict__ slot_mapping,  // [num_tokens]
     const int block_stride, const int key_stride, const int value_stride,
     const int num_heads, const int head_size, const int block_size,
-    const float k_scale, const float v_scale) {
+    const float* k_scale, const float* v_scale) {
   const int64_t token_idx = blockIdx.x;
   const int64_t slot_idx = slot_mapping[token_idx];
   // NOTE: slot_idx can be -1 if the token is padded
@@ -239,9 +239,9 @@ __global__ void reshape_and_cache_flash_kernel(
       value_cache[tgt_key_value_idx] = tgt_value;
     } else {
       key_cache[tgt_key_value_idx] =
-          fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_key, k_scale);
+          fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_key, *k_scale);
       value_cache[tgt_key_value_idx] =
-          fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_value, v_scale);
+          fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_value, *v_scale);
     }
   }
 }
@@ -258,7 +258,9 @@ __global__ void reshape_and_cache_flash_kernel(
           reinterpret_cast<CACHE_T*>(key_cache.data_ptr()),           \
           reinterpret_cast<CACHE_T*>(value_cache.data_ptr()),         \
           slot_mapping.data_ptr<int64_t>(), key_stride, value_stride, \
-          num_heads, head_size, block_size, x, k_scale, v_scale);
+          num_heads, head_size, block_size, x,                        \
+          reinterpret_cast<const float*>(k_scale.data_ptr()),         \
+          reinterpret_cast<const float*>(v_scale.data_ptr()));
 
 void reshape_and_cache(
     torch::Tensor& key,    // [num_tokens, num_heads, head_size]
@@ -268,8 +270,8 @@ void reshape_and_cache(
     torch::Tensor&
         value_cache,  // [num_blocks, num_heads, head_size, block_size]
     torch::Tensor& slot_mapping,  // [num_tokens]
-    const std::string& kv_cache_dtype, const double k_scale,
-    const double v_scale) {
+    const std::string& kv_cache_dtype, torch::Tensor& k_scale,
+    torch::Tensor& v_scale) {
   int num_tokens = key.size(0);
   int num_heads = key.size(1);
   int head_size = key.size(2);
@@ -299,7 +301,9 @@ void reshape_and_cache(
           reinterpret_cast<CACHE_T*>(key_cache.data_ptr()),           \
           reinterpret_cast<CACHE_T*>(value_cache.data_ptr()),         \
           slot_mapping.data_ptr<int64_t>(), block_stride, key_stride, \
-          value_stride, num_heads, head_size, block_size, k_scale, v_scale);
+          value_stride, num_heads, head_size, block_size,             \
+          reinterpret_cast<const float*>(k_scale.data_ptr()),         \
+          reinterpret_cast<const float*>(v_scale.data_ptr()));
 
 void reshape_and_cache_flash(
     torch::Tensor& key,        // [num_tokens, num_heads, head_size]
@@ -308,8 +312,8 @@ void reshape_and_cache_flash(
     torch::Tensor&
         value_cache,  // [num_blocks, block_size, num_heads, head_size]
     torch::Tensor& slot_mapping,  // [num_tokens] or [num_actual_tokens]
-    const std::string& kv_cache_dtype, const double k_scale,
-    const double v_scale) {
+    const std::string& kv_cache_dtype, torch::Tensor& k_scale,
+    torch::Tensor& v_scale) {
   // NOTE(woosuk): In vLLM V1, key.size(0) can be different from
   // slot_mapping.size(0) because of padding for CUDA graphs.
   // In vLLM V0, key.size(0) is always equal to slot_mapping.size(0) because
diff --git a/csrc/cpu/attention.cpp b/csrc/cpu/attention.cpp
index ef5b14088c63b..b9764056e8a2d 100644
--- a/csrc/cpu/attention.cpp
+++ b/csrc/cpu/attention.cpp
@@ -460,11 +460,11 @@ void paged_attention_v1(
     torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
     torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
     int64_t max_seq_len, const std::optional<torch::Tensor>& alibi_slopes,
-    const std::string& kv_cache_dtype, double k_scale, double v_scale,
-    const int64_t tp_rank, const int64_t blocksparse_local_blocks,
+    const std::string& kv_cache_dtype, torch::Tensor& k_scale,
+    torch::Tensor& v_scale, const int64_t tp_rank,
+    const int64_t blocksparse_local_blocks,
     const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
     const int64_t blocksparse_head_sliding_step) {
-  TORCH_CHECK(k_scale == 1.0f && v_scale == 1.0f);
   TORCH_CHECK(blocksparse_vert_stride <= 1,
               "CPU backend does not support blocksparse attention yet.");
   VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "paged_attention_v1_impl",
@@ -782,11 +782,11 @@ void paged_attention_v2(
     torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
     torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
     int64_t max_seq_len, const std::optional<torch::Tensor>& alibi_slopes,
-    const std::string& kv_cache_dtype, double k_scale, double v_scale,
-    const int64_t tp_rank, const int64_t blocksparse_local_blocks,
+    const std::string& kv_cache_dtype, torch::Tensor& k_scale,
+    torch::Tensor& v_scale, const int64_t tp_rank,
+    const int64_t blocksparse_local_blocks,
     const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
     const int64_t blocksparse_head_sliding_step) {
-  TORCH_CHECK(k_scale == 1.0f && v_scale == 1.0f);
   TORCH_CHECK(blocksparse_vert_stride <= 1,
               "CPU backend does not support blocksparse attention yet.");
   VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "paged_attention_v2_impl",
diff --git a/csrc/cpu/cache.cpp b/csrc/cpu/cache.cpp
index 31d454328b2c1..e3809acad7453 100644
--- a/csrc/cpu/cache.cpp
+++ b/csrc/cpu/cache.cpp
@@ -107,10 +107,8 @@ void copy_blocks(std::vector<torch::Tensor> const& key_caches,
 void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
                        torch::Tensor& key_cache, torch::Tensor& value_cache,
                        torch::Tensor& slot_mapping,
-                       const std::string& kv_cache_dtype, double k_scale,
-                       double v_scale) {
-  TORCH_CHECK(k_scale == 1.0f && v_scale == 1.0f);
-
+                       const std::string& kv_cache_dtype,
+                       torch::Tensor& k_scale, torch::Tensor& v_scale) {
   int num_tokens = key.size(0);
   int num_heads = key.size(1);
   int head_size = key.size(2);
diff --git a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp
index 74e4d8189d403..5d1c5f4c83d3e 100644
--- a/csrc/cpu/torch_bindings.cpp
+++ b/csrc/cpu/torch_bindings.cpp
@@ -30,7 +30,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
       "    Tensor value_cache, int num_kv_heads, float scale,"
       "    Tensor block_tables, Tensor seq_lens, int block_size,"
       "    int max_seq_len, Tensor? alibi_slopes,"
-      "    str kv_cache_dtype, float k_scale, float v_scale,"
+      "    str kv_cache_dtype, Tensor k_scale, Tensor v_scale,"
       "    int tp_rank, int blocksparse_local_blocks,"
       "    int blocksparse_vert_stride, int blocksparse_block_size,"
       "    int blocksparse_head_sliding_step) -> ()");
@@ -44,7 +44,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
       "    Tensor value_cache, int num_kv_heads, float scale,"
       "    Tensor block_tables, Tensor seq_lens, int block_size,"
       "    int max_seq_len, Tensor? alibi_slopes,"
-      "    str kv_cache_dtype, float k_scale, float v_scale,"
+      "    str kv_cache_dtype, Tensor k_scale, Tensor v_scale,"
       "    int tp_rank, int blocksparse_local_blocks,"
       "    int blocksparse_vert_stride, int blocksparse_block_size,"
       "    int blocksparse_head_sliding_step) -> ()");
@@ -148,7 +148,7 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
       "                  Tensor! key_cache, Tensor! value_cache,"
       "                  Tensor slot_mapping,"
       "                  str kv_cache_dtype,"
-      "                  float k_scale, float v_scale) -> ()");
+      "                  Tensor k_scale, Tensor v_scale) -> ()");
   cache_ops.impl("reshape_and_cache", torch::kCPU, &reshape_and_cache);
 }
 
diff --git a/csrc/ops.h b/csrc/ops.h
index 5a194a0dd3654..346898964010d 100644
--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -34,8 +34,9 @@ void paged_attention_v1(
     torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
     torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
     int64_t max_seq_len, const std::optional<torch::Tensor>& alibi_slopes,
-    const std::string& kv_cache_dtype, double k_scale, double v_scale,
-    const int64_t tp_rank, const int64_t blocksparse_local_blocks,
+    const std::string& kv_cache_dtype, torch::Tensor& k_scale,
+    torch::Tensor& v_scale, const int64_t tp_rank,
+    const int64_t blocksparse_local_blocks,
     const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
     const int64_t blocksparse_head_sliding_step);
 
@@ -45,8 +46,9 @@ void paged_attention_v2(
     torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
     torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
     int64_t max_seq_len, const std::optional<torch::Tensor>& alibi_slopes,
-    const std::string& kv_cache_dtype, double k_scale, double v_scale,
-    const int64_t tp_rank, const int64_t blocksparse_local_blocks,
+    const std::string& kv_cache_dtype, torch::Tensor& k_scale,
+    torch::Tensor& v_scale, const int64_t tp_rank,
+    const int64_t blocksparse_local_blocks,
     const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
     const int64_t blocksparse_head_sliding_step);
 
diff --git a/csrc/rocm/attention.cu b/csrc/rocm/attention.cu
index 0fec9624c457e..9477790629c9f 100644
--- a/csrc/rocm/attention.cu
+++ b/csrc/rocm/attention.cu
@@ -218,7 +218,7 @@ __global__ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_kernel(
     scalar_t* __restrict__ out,  // [num_seqs, num_heads, max_num_partitions,
                                  // head_size]
     scalar_t* __restrict__ final_out,  // [num_seqs, num_heads, head_size]
-    int max_ctx_blocks, float k_scale, float v_scale) {
+    int max_ctx_blocks, const float* k_scale_ptr, const float* v_scale_ptr) {
   constexpr int NWARPS = NUM_THREADS / WARP_SIZE;
   const int warpid = threadIdx.x / WARP_SIZE;
   const int laneid = threadIdx.x % WARP_SIZE;
@@ -406,7 +406,7 @@ __global__ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_kernel(
             // Vlocalb8[h][b * BLOCK_SIZE / 8 + d] = v_ptrh8be[d];
             const _B8x8 Vlocalb8 = v_ptrh8be[d];
             Vlocal[h][b * BLOCK_SIZE / 8 + d] =
-                scaled_convert_b8x8<scalar_t, KV_DTYPE>(Vlocalb8, v_scale);
+                scaled_convert_b8x8<scalar_t, KV_DTYPE>(Vlocalb8, *v_scale_ptr);
           }
         }
       }
@@ -416,7 +416,7 @@ __global__ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_kernel(
   #pragma unroll
       for (int d = 0; d < KHELOOP; d++) {
         Klocal[d] =
-            scaled_convert_b8x8<scalar_t, KV_DTYPE>(Klocalb8[d], k_scale);
+            scaled_convert_b8x8<scalar_t, KV_DTYPE>(Klocalb8[d], *k_scale_ptr);
       }
     }
 
@@ -890,7 +890,7 @@ __global__ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_kernel(
     scalar_t* __restrict__ out,  // [num_seqs, num_heads, max_num_partitions,
                                  // head_size]
     scalar_t* __restrict__ final_out,  // [num_seqs, num_heads, head_size]
-    int max_ctx_blocks, float k_scale, float v_scale) {
+    int max_ctx_blocks, const float* k_scale, const float* v_scale) {
   UNREACHABLE_CODE
 }
 
@@ -919,7 +919,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
           block_tables_ptr, context_lens_ptr, max_num_blocks_per_seq,         \
           alibi_slopes_ptr, q_stride, kv_block_stride, kv_head_stride,        \
           exp_sums_ptr, max_logits_ptr, tmp_out_ptr, out_ptr, max_ctx_blocks, \
-          k_scale, v_scale);
+          k_scale_ptr, v_scale_ptr);
 
 template <typename T, typename KVT, vllm::Fp8KVCacheDataType KV_DTYPE,
           int BLOCK_SIZE, int HEAD_SIZE, int PARTITION_SIZE = 512>
@@ -929,7 +929,7 @@ void paged_attention_custom_launcher(
     torch::Tensor& value_cache, const int num_kv_heads, float scale,
     torch::Tensor& block_tables, torch::Tensor& context_lens,
     int max_context_len, const std::optional<torch::Tensor>& alibi_slopes,
-    float k_scale, float v_scale) {
+    torch::Tensor& k_scale, torch::Tensor& v_scale) {
   int num_seqs = query.size(0);
   int num_heads = query.size(1);
   int head_size = query.size(2);
@@ -953,6 +953,8 @@ void paged_attention_custom_launcher(
   KVT* value_cache_ptr = reinterpret_cast<KVT*>(value_cache.data_ptr());
   int* block_tables_ptr = block_tables.data_ptr<int>();
   int* context_lens_ptr = context_lens.data_ptr<int>();
+  const float* k_scale_ptr = reinterpret_cast<const float*>(k_scale.data_ptr());
+  const float* v_scale_ptr = reinterpret_cast<const float*>(v_scale.data_ptr());
 
   const int max_ctx_blocks = DIVIDE_ROUND_UP(max_context_len, BLOCK_SIZE);
   const int max_num_partitions =
@@ -1087,7 +1089,8 @@ void paged_attention(
     torch::Tensor& context_lens,  // [num_seqs]
     int64_t block_size, int64_t max_context_len,
     const std::optional<torch::Tensor>& alibi_slopes,
-    const std::string& kv_cache_dtype, double k_scale, double v_scale) {
+    const std::string& kv_cache_dtype, torch::Tensor& k_scale,
+    torch::Tensor& v_scale) {
   const int head_size = query.size(2);
   if (kv_cache_dtype == "auto") {
     if (query.dtype() == at::ScalarType::Half) {
diff --git a/csrc/rocm/ops.h b/csrc/rocm/ops.h
index 34b2f9ce8a4c4..ba161951772ad 100644
--- a/csrc/rocm/ops.h
+++ b/csrc/rocm/ops.h
@@ -10,5 +10,5 @@ void paged_attention(torch::Tensor& out, torch::Tensor& exp_sums,
                      torch::Tensor& context_lens, int64_t block_size,
                      int64_t max_context_len,
                      const std::optional<torch::Tensor>& alibi_slopes,
-                     const std::string& kv_cache_dtype, double k_scale,
-                     double v_scale);
+                     const std::string& kv_cache_dtype, torch::Tensor& k_scale,
+                     torch::Tensor& v_scale);
diff --git a/csrc/rocm/torch_bindings.cpp b/csrc/rocm/torch_bindings.cpp
index a283d4263d293..a5d2e2f97a3ed 100644
--- a/csrc/rocm/torch_bindings.cpp
+++ b/csrc/rocm/torch_bindings.cpp
@@ -27,7 +27,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, rocm_ops) {
       "                int max_context_len,"
       "                Tensor? alibi_slopes,"
       "                str kv_cache_dtype,"
-      "                float k_scale, float v_scale) -> ()");
+      "                Tensor k_scale, Tensor v_scale) -> ()");
   rocm_ops.impl("paged_attention", torch::kCUDA, &paged_attention);
 }
 
diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp
index fb53d122487d3..ec63170d511f0 100644
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@@ -30,7 +30,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
       "    Tensor value_cache, int num_kv_heads, float scale,"
       "    Tensor block_tables, Tensor seq_lens, int block_size,"
       "    int max_seq_len, Tensor? alibi_slopes,"
-      "    str kv_cache_dtype, float k_scale, float v_scale,"
+      "    str kv_cache_dtype, Tensor k_scale, Tensor v_scale,"
       "    int tp_rank, int blocksparse_local_blocks,"
       "    int blocksparse_vert_stride, int blocksparse_block_size,"
       "    int blocksparse_head_sliding_step) -> ()");
@@ -44,7 +44,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
       "    Tensor value_cache, int num_kv_heads, float scale,"
       "    Tensor block_tables, Tensor seq_lens, int block_size,"
       "    int max_seq_len, Tensor? alibi_slopes,"
-      "    str kv_cache_dtype, float k_scale, float v_scale,"
+      "    str kv_cache_dtype, Tensor k_scale, Tensor v_scale,"
       "    int tp_rank, int blocksparse_local_blocks,"
       "    int blocksparse_vert_stride, int blocksparse_block_size,"
       "    int blocksparse_head_sliding_step) -> ()");
@@ -449,7 +449,7 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
       "                  Tensor! key_cache, Tensor! value_cache,"
       "                  Tensor slot_mapping,"
       "                  str kv_cache_dtype,"
-      "                  float k_scale, float v_scale) -> ()");
+      "                  Tensor k_scale, Tensor v_scale) -> ()");
   cache_ops.impl("reshape_and_cache", torch::kCUDA, &reshape_and_cache);
 
   // Reshape the key and value tensors and cache them.
@@ -459,7 +459,7 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
       "                        Tensor! value_cache,"
       "                        Tensor slot_mapping,"
       "                        str kv_cache_dtype,"
-      "                        float k_scale, float v_scale) -> ()");
+      "                        Tensor k_scale, Tensor v_scale) -> ()");
   cache_ops.impl("reshape_and_cache_flash", torch::kCUDA,
                  &reshape_and_cache_flash);
 
diff --git a/docs/source/features/quantization/quantized_kvcache.md b/docs/source/features/quantization/quantized_kvcache.md
index 95fa5e81e2f74..9f36c2949e0dd 100644
--- a/docs/source/features/quantization/quantized_kvcache.md
+++ b/docs/source/features/quantization/quantized_kvcache.md
@@ -35,16 +35,18 @@ Studies have shown that FP8 E4M3 quantization typically only minimally degrades
 Here is an example of how to enable FP8 quantization:
 
 ```python
+# To calculate kv cache scales on the fly enable the calculate_kv_scales
+# parameter
+
 from vllm import LLM, SamplingParams
 
 sampling_params = SamplingParams(temperature=0.7, top_p=0.8)
-llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", kv_cache_dtype="fp8")
+llm = LLM(model="meta-llama/Llama-2-7b-chat-hf",
+          kv_cache_dtype="fp8",
+          calculate_kv_scales=True)
 prompt = "London is the capital of"
 out = llm.generate(prompt, sampling_params)[0].outputs[0].text
 print(out)
-
-# output w/ scaling factors:  England, the United Kingdom, and one of the world's leading financial,
-# output w/o scaling factors: England, located in the southeastern part of the country. It is known
 ```
 
 The `kv_cache_dtype` argument specifies the data type for KV cache storage:
diff --git a/examples/other/fp8/README.md b/examples/other/fp8/README.md
deleted file mode 100644
index 4e8031d954113..0000000000000
--- a/examples/other/fp8/README.md
+++ /dev/null
@@ -1,96 +0,0 @@
-# FP8 KV Cache 
-
-This utility extracts the KV cache scaling factors from a quantized HF (Hugging Face) model. The extracted scaling factors are saved to a JSON file, which can later be used by vLLM (variable-length language model) during runtime. This tool is particularly useful when the KV cache data type is FP8 and is intended for use on ROCm (AMD GPU) platforms.
-
-## Prerequisites
-
-- Python 3.x
-- PyTorch
-- NumPy
-- Hugging Face Transformers
-- Hugging Face Hub
-- AMMO 
-
-Before incorporating the FP8 datatype for inference workloads, you must adhere to the following steps:
-1. Install all necessary prerequisites and dependencies. 
-2. Convert HF model into a quantized HF model. 
-3. Extract KV Cache Scaling Factors from quantized HF model.
-4. Load KV Cache Scaling Factors into VLLM.
-
-### 2. Convert HF model into a quantized HF model.
-Note: The following steps are adapted from the [TensorRT-LLM repository](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/quantization/README.md).
-
-`quantize.py` (examples/other/fp8/quantizer/quantize.py) uses the quantization toolkit  (AMMO) to calibrate the PyTorch models and export TensorRT-LLM checkpoints. Each TensorRT-LLM checkpoint contains a config file (in .json format) and one or several rank weight files (in .safetensors format).
-
-The detailed quantization toolkit (AMMO) conversion guide for FP8 can be found at `examples/other/fp8/quantizer/README.md`.
-
-### 3. Extract KV Cache Scaling Factors from quantized HF model.
-`extract_scales.py` (examples/other/fp8/extract_scales.py) can be utilized to extract the KV cache scaling factors from your quantized HF model, however at the moment, this tool exclusively supports Llama 2 models. It is also important to note the following:
-1. **File Structure**: The utility operates under the assumption that all parameters, including KV cache scaling factors, corresponding to a particular Tensor Parallelism (TP) rank are stored in a single file. These files must adhere to a specific naming convention where the TP rank is immediately identified after a specific keyword (e.g., "rank") in the filename.
-
-2. **TP Decomposition**: The utility assumes consistency between the TP decomposition employed by the quantizer tool and that used by vLLM.
-
-3. **AMMO Compatibility**: Currently, the generated KV cache scaling factors for AMMO remain uniform across all TP ranks.
-
-```python
-# prerequisites:
-# - Quantized HF LLaMa 2 model 
-python3 examples/other/fp8/extract_scales.py --help
-Usage: extract_scales.py [-h] --quantized_model QUANTIZED_MODEL [--load_format {auto,safetensors,npz,pt}] [--output_dir OUTPUT_DIR] [--output_name OUTPUT_NAME] [--tp_size TP_SIZE]
-
-KV Scale Extraction Example
-
-optional arguments:
---quantized_model: Specify either the local path to, or name of, a quantized HF model. It is expected that the quantization format is FP8_E4M3, for use on ROCm (AMD GPU).
-Optional arguments:
---cache_dir: Specify a cache directory to use in the event of a HF model download. (Default: None)
---load_format: Specify the format of the model's tensor files containing the KV cache scaling factors. (Choices: auto, safetensors, npz, pt; Default: auto)
---revision: Specify the model's revision number. (Default: None)
---output_dir: Specify the output directory. By default the KV cache scaling factors will be saved in the model directory. (Default: None)
---output_name: Specify the output filename. (Default: kv_cache_scales.json)
---tp_size: Specify the tensor-parallel (TP) size that the quantized model should correspond to. If specified, during KV cache scaling factor extraction the observed TP size will be checked against this and an error will be raised if there is a mismatch. (Default: None)
-```
-```python
-Example:
-python3 examples/other/fp8/extract_scales.py --quantized_model <QUANTIZED_MODEL_DIR> --tp_size <TENSOR_PARALLEL_SIZE> --output_dir <PATH_TO_OUTPUT_DIR>
-```
-### 4. Load KV Cache Scaling Factors into VLLM.
-This script evaluates the inference throughput of language models using various backends such as vLLM. It measures the time taken to process a given number of prompts and generate sequences for each prompt. The recently generated KV cache scaling factors are now integrated into the benchmarking process and allow for KV cache scaling factors to be utilized for FP8.
-```
-# prerequisites:
-# -  LLaMa 2 kv_cache_scales.json file
-
-python3 benchmarks/benchmark_throughput.py --help 
-usage: benchmark_throughput.py [-h] [--backend {vllm,hf,mii}] [--dataset DATASET] [--input-len INPUT_LEN] [--output-len OUTPUT_LEN] [--model MODEL]
-                               [--tokenizer TOKENIZER] [--quantization {awq,gptq,None}] [--tensor-parallel-size TENSOR_PARALLEL_SIZE] [--n N]
-                               [--use-beam-search] [--num-prompts NUM_PROMPTS] [--seed SEED] [--hf-max-batch-size HF_MAX_BATCH_SIZE] [--trust-remote-code]
-                               [--max-model-len MAX_MODEL_LEN] [--dtype {auto,half,float16,bfloat16,float,float32}] [--enforce-eager] [--kv-cache-dtype {auto,fp8}]
-                               [--quantization-param-path KV_CACHE_quantization_param_path]
-
-Benchmark Throughput Example  
-optional arguments:
-  -h, --help  show this help message and exit
-  --backend {vllm,hf,mii}
-  --dataset DATASET  Path to the dataset.
-  --input-len INPUT_LEN  Input prompt length for each request
-  --output-len OUTPUT_LEN  Output length for each request. Overrides the output length from the dataset.
-  --model MODEL
-  --tokenizer TOKENIZER
-  --quantization {awq,gptq,None}, -q {awq,gptq,None}
-  --tensor-parallel-size TENSOR_PARALLEL_SIZE, -tp TENSOR_PARALLEL_SIZE
-  --n N  Number of generated sequences per prompt.
-  --use-beam-search
-  --num-prompts NUM_PROMPTS  Number of prompts to process.
-  --seed SEED
-  --hf-max-batch-size HF_MAX_BATCH_SIZE   Maximum batch size for HF backend.
-  --trust-remote-code trust remote code from huggingface
-  --max-model-len MAX_MODEL_LEN  Maximum length of a sequence (including prompt and output). If None, will be derived from the model.
-  --dtype {auto,half,float16,bfloat16,float,float32}  data type for model weights and activations. The "auto" option will use FP16 precision for FP32 and FP16 models, and BF16 precision for BF16 models.
-  --enforce-eager  enforce eager execution
-  --kv-cache-dtype {auto,fp8} Data type for kv cache storage. If "auto", will use model data type. FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead supported ```for common inference criteria.
-  --quantization-param-path QUANT_PARAM_JSON Path to the JSON file containing the KV cache scaling factors. This should generally be supplied, when KV cache dtype is FP8. Otherwise, KV cache scaling factors default to 1.0, which may cause accuracy issues. FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria.
-```
-Example:
-```console
-python3 benchmarks/benchmark_throughput.py --input-len <INPUT_LEN> --output-len <OUTPUT_LEN> -tp <TENSOR_PARALLEL_SIZE> --kv-cache-dtype fp8 --quantization-param-path <path/to/kv_cache_scales.json> --model <path-to-llama2>
-```
diff --git a/examples/other/fp8/extract_scales.py b/examples/other/fp8/extract_scales.py
deleted file mode 100644
index 1dce9d7e993a0..0000000000000
--- a/examples/other/fp8/extract_scales.py
+++ /dev/null
@@ -1,367 +0,0 @@
-import argparse
-import glob
-import json
-import os
-from typing import Any, Callable, Dict, List, Optional, Tuple
-
-import numpy as np
-import torch
-from safetensors.torch import safe_open
-
-from vllm.model_executor.layers.quantization.schema import QuantParamSchema
-
-
-# Adapted from vllm/model_executor/model_loader/weight_utils.py
-# The main differences are that we add the NPZ format and simplify
-# its functionality drastically for our purposes (e.g. we assume that
-# the quantized model exists locally and there is no need to download it)
-def _prepare_hf_weights(
-    quantized_model_dir: str,
-    load_format: str = "auto",
-    fall_back_to_pt: bool = True,
-) -> Tuple[List[str], bool]:
-    if not os.path.isdir(quantized_model_dir):
-        raise FileNotFoundError(
-            f"The quantized model directory `{quantized_model_dir}` "
-            "does not exist.")
-    use_safetensors = False
-    # Some quantized models use .pt files for storing the weights.
-    if load_format == "auto":
-        allow_patterns = ["*.safetensors", "*.bin"]
-    elif load_format == "safetensors":
-        use_safetensors = True
-        allow_patterns = ["*.safetensors"]
-    elif load_format == "pt":
-        allow_patterns = ["*.pt"]
-    elif load_format == "npz":
-        allow_patterns = ["*.npz"]
-    else:
-        raise ValueError(f"Unknown load_format: {load_format}")
-    if fall_back_to_pt:
-        allow_patterns += ["*.pt"]
-
-    hf_weights_files: List[str] = []
-    for pattern in allow_patterns:
-        hf_weights_files += glob.glob(
-            os.path.join(quantized_model_dir, pattern))
-        if len(hf_weights_files) > 0:
-            if pattern == "*.safetensors":
-                use_safetensors = True
-            break
-
-    if not use_safetensors:
-        # Exclude files that are not needed for inference.
-        # https://github.com/huggingface/transformers/blob/v4.34.0/src/transformers/trainer.py#L227-L233
-        blacklist = [
-            "training_args.bin",
-            "optimizer.bin",
-            "optimizer.pt",
-            "scheduler.pt",
-            "scaler.pt",
-        ]
-        hf_weights_files = [
-            f for f in hf_weights_files
-            if not any(f.endswith(x) for x in blacklist)
-        ]
-
-    if len(hf_weights_files) == 0:
-        raise RuntimeError(
-            f"Cannot find any model weights with `{quantized_model_dir}`")
-
-    return hf_weights_files, use_safetensors
-
-
-# Adapted from vllm/model_executor/model_loader/weight_utils.py
-def _hf_tensorfile_iterator(filename: str, load_format: str,
-                            use_safetensors: bool):
-    if load_format == "npz":
-        assert not use_safetensors
-        with np.load(filename) as data:
-            for name in data.files:
-                param = torch.from_numpy(data[name])
-                yield name, param
-    elif use_safetensors:
-        with safe_open(filename, framework="pt") as f:
-            for name in f.keys():  # NOQA: SIM118
-                param = f.get_tensor(name)
-                yield name, param
-    else:
-        state = torch.load(filename, map_location="cpu")
-        for name, param in state.items():
-            yield name, param
-        del state
-        torch.cuda.empty_cache()
-
-
-def _kv_scales_extractor(
-        hf_tensor_files: List[str],
-        use_safetensors: bool,
-        rank_keyword: str = "rank",
-        expected_tp_size: Optional[int] = None) -> Dict[int, Dict[int, float]]:
-    """
-    Given a list of files containing tensor data, attempt to extract KV cache
-    scales from these files. Intended as a helper function taking in the output
-    from _prepare_hf_weights.
-    Args:
-    rank_keyword        Matches the number immediately after this keyword in the
-                        tensor filename to determine the TP rank corresponding
-                        to said tensor file
-    expected_tp_size    If specified, the TP size of the tensor files is checked
-                        against this and an error is raised if they don't match.
-    Returns a dictionary mapping TP ranks to their relevant KV cache scales.
-    The per-rank scales are themselves represented as a dictionary of layer
-    indices to the respective per-layer scale.
-    """
-    for char in rank_keyword:
-        assert not char.isdecimal(
-        ), f"Rank keyword {rank_keyword} contains a numeric character!"
-    rank_scales_map: Dict[int, Dict[int, float]] = {}
-    for tensor_file in hf_tensor_files:
-        try:
-            rank_idx = tensor_file.find(rank_keyword)
-            if rank_idx != -1:
-                start_idx = rank_idx + len(rank_keyword)
-                stop_idx = start_idx
-                while stop_idx < len(
-                        tensor_file) and tensor_file[stop_idx].isdecimal():
-                    stop_idx += 1
-                if stop_idx == start_idx:
-                    raise RuntimeError("Did not find rank # in filename.")
-                rank = int(tensor_file[start_idx:stop_idx])
-            elif len(hf_tensor_files) == 1:
-                # Since there is only one tensor file, we can assume
-                # that it's intended for TP rank 0
-                rank = 0
-            else:
-                raise RuntimeError(
-                    f"Filename does not contain '{rank_keyword}'.")
-        except RuntimeError:
-            print("Unable to determine TP rank "
-                  f"corresponding to file '{tensor_file}'")
-            raise
-
-        if rank not in rank_scales_map:
-            layer_scales_map: Dict[int, float] = {}
-            rank_scales_map[rank] = layer_scales_map
-        else:
-            raise RuntimeError(
-                f"Tensor file '{tensor_file}' shares TP rank {rank} "
-                "with another tensor file.")
-
-        module_delimiter = ":" if args.load_format == "npz" else "."
-        for name, param in _hf_tensorfile_iterator(tensor_file,
-                                                   args.load_format,
-                                                   use_safetensors):
-            if "kv_cache_scaling_factor" in name:
-                nums = [
-                    int(s) for s in name.split(module_delimiter)
-                    if s.isdecimal()
-                ]
-                assert len(
-                    nums) == 1, f"Could not determine layer idx for {name}"
-                layer_idx = nums[0]
-                assert layer_idx not in layer_scales_map, f"Duplicate scaling"\
-                    f" factor corresponding to layer {layer_idx}"
-                try:
-                    layer_scales_map[layer_idx] = param.item()
-                except RuntimeError:
-                    print(
-                        "This utility supports only per-tensor scalar scales "
-                        f"for now. The tensor\n {name} = {param} \nis an "
-                        "invalid scale factor.")
-                    raise
-
-    if all(
-            len(layer_scales_map) == 0
-            for layer_scales_map in rank_scales_map.values()):
-        # Note: this is true even if the rank_scales_map is empty
-        print("WARNING: No KV cache scale factors found. No output saved.")
-        return None
-    empirical_tp_world_size = max(rank_scales_map.keys()) + 1
-    if expected_tp_size is not None:
-        assert expected_tp_size == empirical_tp_world_size, \
-            f"User expected TP world size = {expected_tp_size} " \
-            "from model but tool is expecting TP world size = " \
-            f"{empirical_tp_world_size} from model instead."
-    for i in range(empirical_tp_world_size):
-        assert i in rank_scales_map, "Expected TP world size = "\
-            f"{empirical_tp_world_size} but did not find KV " \
-            f"cache scaling factors for TP rank {i}"
-    print(f"Found TP world size = {empirical_tp_world_size} "
-          "when extracting KV cache scales!")
-    return rank_scales_map
-
-
-def _metadata_extractor(quantized_model_dir: str,
-                        metadata_extract_fns: \
-                        Dict[str, Callable[[Dict[str, Any]], Any]]) \
-                        -> Dict[str, Any]:
-    """
-    Given a directory containing quantized model files, this function
-    aims to extract metadata from the JSON files within this directory.
-    Each JSON file is expected to represent a dictionary in JSON
-    format (referred to as a "JSON-dictionary"). Metadata extraction is
-    defined by a dictionary called metadata_extract_fns, where each
-    metadata field name is mapped to an extraction function.
-
-    These extraction functions are designed to take a JSON-dictionary
-    as their only argument  and return the corresponding metadata.
-    While extraction functions are permitted to raise  exceptions, they
-    should only raise a KeyError or ValueError if the metadata field
-    cannot  be extracted from the current JSON-dictionary, yet there's
-    a possibility of finding it in another JSON-dictionary.
-
-    The function returns a dictionary that maps metadata fields to
-    their extracted data. The keys of this dictionary correspond exactly
-    to those in metadata_extract_fns. If any fields fail to be extracted,
-    their corresponding values are set to None, and a warning is printed.
-    """
-    if not os.path.isdir(quantized_model_dir):
-        raise FileNotFoundError(
-            f"The quantized model directory `{quantized_model_dir}` "
-            "does not exist.")
-    metadata_files = glob.glob(os.path.join(quantized_model_dir, "*.json"))
-
-    result: Dict[str, Any] = {}
-    for file in metadata_files:
-        with open(file) as f:
-            try:
-                metadata = json.load(f)
-            except json.JSONDecodeError:
-                print(f"Could not parse `{file}` as a valid metadata file,"
-                      " skipping it.")
-                continue
-            if not isinstance(metadata, dict):
-                print(f"The file `{file}` does not correspond to a "
-                      "JSON-serialized dictionary, skipping it.")
-                continue
-            for metadata_name, extract_fn in metadata_extract_fns.items():
-                try:
-                    metadata_info = extract_fn(metadata)
-                    if metadata_name not in result:
-                        result[metadata_name] = metadata_info
-                    elif metadata_info != result[metadata_name]:
-                        raise RuntimeError(
-                            "Metadata mismatch! Originally found "
-                            f"{metadata_name} = {result[metadata_name]} but "
-                            f"now found {metadata_name} = {metadata_info} in "
-                            f"`{file}`")
-                except KeyError:
-                    # It is possible that a given file does not contain some
-                    # of our selected metadata as it could be located in some
-                    # other metadata file.
-                    # 'EFINAE': extract_fn failure is not an error.
-                    pass
-                except ValueError:
-                    # See above.
-                    pass
-
-    # Warn if we cannot find any of the requested metadata
-    for metadata_name in metadata_extract_fns:
-        if metadata_name not in result:
-            print("WARNING: Unable to find requested metadata field "
-                  f"`{metadata_name}`, setting it to None.")
-            result[metadata_name] = None
-
-    return result
-
-
-def main(args):
-    metadata_extract_fns = {
-        "model_type": lambda json_dict: json_dict["layers"][0]["decoder_type"],
-        "tp_size": lambda json_dict: int(json_dict["tensor_parallel"]),
-        "model_dtype": lambda json_dict: json_dict["dtype"]
-    }
-    recovered_metadata = _metadata_extractor(args.quantized_model,
-                                             metadata_extract_fns)
-    if args.tp_size is not None:
-        metadata_tp_size = recovered_metadata["tp_size"]
-        if metadata_tp_size is not None:
-            assert args.tp_size == metadata_tp_size, \
-              f"User expected TP world size = {args.tp_size} " \
-              f"but found TP world size = {metadata_tp_size} from metadata!"
-    expected_tp_size = args.tp_size or recovered_metadata["tp_size"]
-    rank_keyword = "rank"
-    hf_tensor_files, use_safetensors = _prepare_hf_weights(
-        args.quantized_model, args.load_format)
-    rank_scales_map = _kv_scales_extractor(hf_tensor_files, use_safetensors,
-                                           rank_keyword, expected_tp_size)
-    # Postprocess: formatting to the current schema. Consider pulling it
-    # out into a dedicated function should it ever become more complicated.
-    rank_scales_map = {
-        rank: {k: scale[k]
-               for k in sorted(scale.keys())}
-        for rank, scale in rank_scales_map.items()
-    }
-    # TODO: Expand this with activation and weights scaling factors when
-    # they are used in the future
-    schema = QuantParamSchema(
-        model_type=recovered_metadata["model_type"],
-        kv_cache={
-            "dtype": ("float8_e4m3fn" if len(rank_scales_map) > 0 else
-                      recovered_metadata["model_dtype"]),
-            "scaling_factor":
-            rank_scales_map
-        },
-    )
-
-    if args.output_dir is None:
-        output_file = os.path.join(args.quantized_model, args.output_name)
-    else:
-        if not os.path.isdir(args.output_dir):
-            os.makedirs(args.output_dir, exist_ok=True)
-        output_file = os.path.join(args.output_dir, args.output_name)
-
-    with open(output_file, 'w') as f:
-        f.write(schema.model_dump_json(indent=4))
-        print(f"Completed! KV cache scaling factors saved to {output_file}")
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        description="This simple utility extracts the "
-        "KV cache scaling factors from a quantized HF model "
-        "and saves them to a JSON file compatible with later "
-        "use by vLLM (pass this file to the appropriate "
-        "runtime typically using the argument "
-        "--quantization-param-path <filename>). This is only used "
-        "if the KV cache dtype is FP8 and on ROCm (AMD GPU).")
-    parser.add_argument(
-        "--quantized-model",
-        help="Specify the directory containing a single quantized HF model. "
-        "It is expected that the quantization format is FP8_E4M3, for use "
-        "on ROCm (AMD GPU).",
-        required=True)
-    parser.add_argument(
-        "--load_format",
-        help="Optionally specify the format of the model's tensor files "
-        "containing the KV cache scaling factors.",
-        choices=["auto", "safetensors", "npz", "pt"],
-        default="auto")
-    parser.add_argument(
-        "--output-dir",
-        help="Optionally specify the output directory. By default the "
-        "KV cache scaling factors will be saved in the model directory, "
-        "however you can override this behavior here.",
-        default=None)
-    parser.add_argument(
-        "--output-name",
-        help="Optionally specify the output filename.",
-        # TODO: Change this once additional scaling factors are enabled
-        default="kv_cache_scales.json")
-    parser.add_argument(
-        "--tp-size",
-        help="Optionally specify the tensor-parallel (TP) size that the "
-        "quantized model should correspond to. If specified, during KV "
-        "cache scaling factor extraction the observed TP size will be "
-        "checked against this and an error will be raised if there is "
-        "a mismatch. If not specified, the quantized model's expected "
-        "TP size is instead inferred from the largest TP rank observed. "
-        "The expected TP size is cross-checked against the TP ranks "
-        "observed in the quantized model and an error is raised if any "
-        "discrepancies are found.",
-        default=None,
-        type=int)
-    args = parser.parse_args()
-
-    main(args)
diff --git a/examples/other/fp8/quantizer/README.md b/examples/other/fp8/quantizer/README.md
deleted file mode 100644
index d0895e97dc341..0000000000000
--- a/examples/other/fp8/quantizer/README.md
+++ /dev/null
@@ -1,32 +0,0 @@
-### Quantizer Utilities
-`quantize.py`: NVIDIA Quantization utilities using TensorRT-Model-Optimizer, ported
-from TensorRT-LLM: [`examples/quantization/quantize.py`](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/quantization/quantize.py)
-
-### Prerequisite
-
-#### AMMO (AlgorithMic Model Optimization) Installation: nvidia-ammo 0.7.1 or later
-`pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com nvidia-ammo` 
-
-#### AMMO Download (code and docs)
-`https://developer.nvidia.com/downloads/assets/cuda/files/nvidia-ammo/nvidia_ammo-0.5.0.tar.gz`
-`https://developer.nvidia.com/downloads/assets/cuda/files/nvidia-ammo/nvidia_ammo-0.7.1.tar.gz`
-
-### Usage
-
-#### Run on H100 system for speed if FP8; number of GPUs depends on the model size
-
-#### Example: quantize Llama2-7b model from HF to FP8 with FP8 KV Cache:
-`python quantize.py --model-dir ./ll2-7b --dtype float16 --qformat fp8 --kv-cache-dtype fp8 --output-dir ./ll2_7b_fp8 --calib-size 512 --tp-size 1`
-
-Outputs: model structure, quantized model & parameters (with scaling factors) are in JSON and Safetensors (npz is generated only for the reference)
-```
-# ll ./ll2_7b_fp8/
-total 19998244
-drwxr-xr-x 2 root root        4096 Feb  7 01:08 ./
-drwxrwxr-x 8 1060 1061        4096 Feb  7 01:08 ../
--rw-r--r-- 1 root root      176411 Feb  7 01:08 llama_tp1.json
--rw-r--r-- 1 root root 13477087480 Feb  7 01:09 llama_tp1_rank0.npz
--rw-r--r-- 1 root root  7000893272 Feb  7 01:08 rank0.safetensors
-#
-```
-
diff --git a/examples/other/fp8/quantizer/quantize.py b/examples/other/fp8/quantizer/quantize.py
deleted file mode 100644
index d75cc8b3d1cf7..0000000000000
--- a/examples/other/fp8/quantizer/quantize.py
+++ /dev/null
@@ -1,367 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # noqa: E501
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Adapted from examples/quantization/hf_ptq.py
-"""
-
-import argparse
-import copy
-import json
-import random
-import time
-
-import ammo.torch.quantization as atq
-import numpy as np
-import torch
-from ammo.torch.export import export_model_config
-from datasets import load_dataset
-from torch.utils.data import DataLoader
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-RAND_SEED = 1234
-MAX_SEQ_LEN = 2048
-
-EMPTY_CFG = {
-    "quant_cfg": {
-        "*weight_quantizer": {
-            "enable": False,
-        },
-        "*input_quantizer": {
-            "enable": False
-        },
-        "*lm_head*": {
-            "enable": False
-        },
-        "*output_layer*": {
-            "enable": False
-        },
-        "default": {
-            "enable": False
-        },
-    },
-    "algorithm": "max",
-}
-
-KV_CACHE_CFG = {
-    "*.query_key_value.output_quantizer": {
-        "num_bits": 8,
-        "axis": None,
-        "enable": True
-    },
-    "*.Wqkv.output_quantizer": {
-        "num_bits": 8,
-        "axis": None,
-        "enable": True
-    },
-    "*.W_pack.output_quantizer": {
-        "num_bits": 8,
-        "axis": None,
-        "enable": True
-    },
-    "*.c_attn.output_quantizer": {
-        "num_bits": 8,
-        "axis": None,
-        "enable": True
-    },
-    "*.k_proj.output_quantizer": {
-        "num_bits": 8,
-        "axis": None,
-        "enable": True
-    },
-    "*.v_proj.output_quantizer": {
-        "num_bits": 8,
-        "axis": None,
-        "enable": True
-    },
-}
-
-QUANT_CFG_CHOICES = {
-    "int8_sq": atq.INT8_SMOOTHQUANT_CFG,
-    "fp8": atq.FP8_DEFAULT_CFG,
-    "int4_awq": atq.INT4_AWQ_CFG,
-    "w4a8_awq": atq.W4A8_AWQ_BETA_CFG,
-    "int8_wo": EMPTY_CFG,
-    "int4_wo": EMPTY_CFG,
-    "full_prec": EMPTY_CFG,
-}
-
-MODEL_NAME_PATTERN_MAP = {
-    "GPT2": "gpt2",
-    "Xverse": "llama",
-    "Llama": "llama",
-    "Mistral": "llama",
-    "GPTJ": "gptj",
-    "FalconForCausalLM": "falcon",
-    "RWForCausalLM": "falcon",
-    "baichuan": "baichuan",
-    "MPT": "mpt",
-    "Bloom": "bloom",
-    "ChatGLM": "chatglm",
-    "QWen": "qwen",
-}
-
-
-def get_tokenizer(ckpt_path, max_seq_len=MAX_SEQ_LEN, model_type=None):
-    print(f"Initializing tokenizer from {ckpt_path}")
-    tokenizer = AutoTokenizer.from_pretrained(
-        ckpt_path,
-        model_max_length=max_seq_len,
-        padding_side="left",
-        trust_remote_code=True,
-    )
-    if model_type and model_type == "qwen":
-        # qwen use token id 151643 as pad and eos tokens
-        tokenizer.pad_token = tokenizer.convert_ids_to_tokens(151643)
-        tokenizer.eos_token = tokenizer.convert_ids_to_tokens(151643)
-
-    # can't set attribute 'pad_token' for "<unk>"
-    if tokenizer.pad_token != "<unk>":
-        tokenizer.pad_token = tokenizer.eos_token
-    if tokenizer.pad_token is None:
-        tokenizer.pad_token = tokenizer.eos_token
-    assert (tokenizer.pad_token
-            is not None), f"Pad token for {model_type} cannot be set!"
-
-    return tokenizer
-
-
-def get_model(ckpt_path, dtype="fp16", device="cuda"):
-    print(f"Initializing model from {ckpt_path}")
-    if dtype == "bf16" or dtype == "bfloat16":
-        dtype = torch.bfloat16
-    elif dtype == "fp16" or dtype == "float16":
-        dtype = torch.float16
-    elif dtype == "fp32" or dtype == "float32":
-        dtype = torch.float32
-    else:
-        raise NotImplementedError(f"Unknown dtype {dtype}")
-
-    # model_kwargs = {"torch_dtype": dtype}
-    model_kwargs = {"torch_dtype": "auto"}
-
-    model = AutoModelForCausalLM.from_pretrained(ckpt_path,
-                                                 device_map="auto",
-                                                 **model_kwargs,
-                                                 trust_remote_code=True)
-    model.eval()
-
-    model_dtype = next(model.parameters()).dtype
-    if dtype != model_dtype:
-        print("[TensorRT-LLM][WARNING] The manually set model data type is "
-              f"{dtype}, but the data type of the HuggingFace model is "
-              f"{model_dtype}.")
-
-    return model
-
-
-def get_model_type(model):
-    for k, v in MODEL_NAME_PATTERN_MAP.items():
-        if k.lower() in type(model).__name__.lower():
-            return v
-    return None
-
-
-def get_calib_dataloader(data="cnn_dailymail",
-                         tokenizer=None,
-                         batch_size=1,
-                         calib_size=512,
-                         block_size=512,
-                         device=None):
-    print("Loading calibration dataset")
-    if data == "pileval":
-        dataset = load_dataset(
-            "json",
-            data_files="https://the-eye.eu/public/AI/pile/val.jsonl.zst",
-            split="train")
-        dataset = dataset["text"][:calib_size]
-    elif data == "cnn_dailymail":
-        dataset = load_dataset("cnn_dailymail", name="3.0.0", split="train")
-        dataset = dataset["article"][:calib_size]
-    else:
-        raise NotImplementedError
-
-    batch_encoded = tokenizer.batch_encode_plus(dataset,
-                                                return_tensors="pt",
-                                                padding="max_length",
-                                                truncation=True,
-                                                max_length=block_size)
-    if device:
-        batch_encoded = batch_encoded.to(device)
-    batch_encoded = batch_encoded["input_ids"]
-
-    calib_dataloader = DataLoader(batch_encoded,
-                                  batch_size=batch_size,
-                                  shuffle=False)
-
-    return calib_dataloader
-
-
-def quantize_model(model, quant_cfg, calib_dataloader=None):
-
-    def calibrate_loop():
-        if calib_dataloader is None:
-            return
-        """Adjusts weights and scaling factors based on selected algorithms."""
-        for idx, data in enumerate(calib_dataloader):
-            print(f"Calibrating batch {idx}")
-            model(data)
-
-    print("Starting quantization...")
-    start_time = time.time()
-    atq.quantize(model, quant_cfg, forward_loop=calibrate_loop)
-    end_time = time.time()
-    print("Quantization done. Total time used: {:.2f} s.".format(end_time -
-                                                                 start_time))
-
-    return model
-
-
-def main(args):
-    if not torch.cuda.is_available():
-        raise OSError("GPU is required for inference.")
-
-    random.seed(RAND_SEED)
-    np.random.seed(RAND_SEED)
-
-    model = get_model(args.model_dir, args.dtype, args.device)
-    model_type = get_model_type(model)
-    tokenizer = get_tokenizer(args.model_dir, model_type=model_type)
-
-    if args.qformat in ["full_prec", "int8_wo", "int4_wo"
-                        ] and args.kv_cache_dtype is None:
-        print(f"No quantization applied, export {args.dtype} model")
-    else:
-        if "awq" in args.qformat:
-            if args.calib_size > 32:
-                print("AWQ calibration could take longer with calib_size = "
-                      f"{args.calib_size}, Using calib_size=32 instead")
-                args.calib_size = 32
-            print("\nAWQ calibration could take longer than other calibration "
-                  "methods. Please increase the batch size to speed up the "
-                  "calibration process. Batch size can be set by adding the "
-                  "argument --batch_size <batch_size> to the command line.\n")
-
-        calib_dataloader = get_calib_dataloader(
-            tokenizer=tokenizer,
-            batch_size=args.batch_size,
-            calib_size=args.calib_size,
-            device=args.device,
-        )
-
-        if args.qformat in QUANT_CFG_CHOICES:
-            quant_cfg = QUANT_CFG_CHOICES[args.qformat]
-        else:
-            raise ValueError(
-                f"Unsupported quantization format: {args.qformat}")
-
-        if "awq" in args.qformat:
-            quant_cfg = copy.deepcopy(QUANT_CFG_CHOICES[args.qformat])
-            weight_quantizer = quant_cfg["quant_cfg"][
-                "*weight_quantizer"]  # type: ignore
-            if isinstance(weight_quantizer, list):
-                weight_quantizer = weight_quantizer[0]
-            weight_quantizer["block_sizes"][-1] = args.awq_block_size
-
-        if args.kv_cache_dtype is not None:
-            if args.kv_cache_dtype == "fp8":
-                for value in KV_CACHE_CFG.values():
-                    value.update({"num_bits": (4, 3)})  # type: ignore
-            quant_cfg["quant_cfg"].update(KV_CACHE_CFG)  # type: ignore
-
-        print(quant_cfg)
-
-        model = quantize_model(model, quant_cfg, calib_dataloader)
-
-    with torch.inference_mode():
-        if model_type is None:
-            print(f"Unknown model type {type(model).__name__}. Continue "
-                  "exporting...")
-            model_type = f"unknown:{type(model).__name__}"
-
-        export_path = args.output_dir
-        start_time = time.time()
-
-        if args.qformat == "int4_awq" and model_type == "qwen":
-            torch.save(model.state_dict(), export_path)
-        else:
-            export_npz = (model_type not in [
-                'gptj', 'falcon', 'chatglm', 'mpt', 'llama', 'baichuan'
-            ])
-
-            # export safetensors
-            export_model_config(
-                model,
-                model_type,
-                getattr(torch, args.dtype),
-                export_dir=export_path,
-                inference_tensor_parallel=args.tp_size,
-                inference_pipeline_parallel=args.pp_size,
-                # export_tensorrt_llm_config=(not export_npz),
-                export_tensorrt_llm_config=False,
-                export_npz=export_npz)
-
-            # Workaround for wo quantization
-            if args.qformat in ["int8_wo", "int4_wo", "full_prec"]:
-                with open(f"{export_path}/config.json") as f:
-                    tensorrt_llm_config = json.load(f)
-                if args.qformat == "int8_wo":
-                    tensorrt_llm_config["quantization"]["quant_algo"] = 'W8A16'
-                elif args.qformat == "int4_wo":
-                    tensorrt_llm_config["quantization"]["quant_algo"] = 'W4A16'
-                else:
-                    tensorrt_llm_config["quantization"]["quant_algo"] = None
-                with open(f"{export_path}/config.json", "w") as f:
-                    json.dump(tensorrt_llm_config, f, indent=4)
-
-        end_time = time.time()
-        print("Quantized model exported to {} \nTotal time used {:.2f} s.".
-              format(export_path, end_time - start_time))
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description=__doc__)
-    parser.add_argument("--model-dir",
-                        help="Specify where the HuggingFace model is",
-                        required=True)
-    parser.add_argument("--device", default="cuda")
-    parser.add_argument("--dtype", help="Model data type.", default="float16")
-    parser.add_argument(
-        "--qformat",
-        help="Quantization format.",
-        default="full_prec",
-        choices=[
-            "fp8", "int8_sq", "int4_awq", "w4a8_awq", "int8_wo", "int4_wo",
-            "full_prec"
-        ],
-    )
-    parser.add_argument("--batch-size",
-                        help="Batch size for calibration.",
-                        type=int,
-                        default=1)
-    parser.add_argument("--calib-size",
-                        help="Number of samples for calibration.",
-                        type=int,
-                        default=512)
-    parser.add_argument("--output-dir", default="exported_model")
-    parser.add_argument("--tp-size", type=int, default=1)
-    parser.add_argument("--pp-size", type=int, default=1)
-    parser.add_argument("--awq-block-size", type=int, default=128)
-    parser.add_argument("--kv-cache-dtype",
-                        help="KV Cache dtype.",
-                        default=None,
-                        choices=["int8", "fp8", None])
-    args = parser.parse_args()
-
-    main(args)
diff --git a/tests/fp8_kv/llama2-70b-fp8-kv/kv_cache_scales.json b/tests/fp8_kv/llama2-70b-fp8-kv/kv_cache_scales.json
deleted file mode 100644
index a548f0a9611f6..0000000000000
--- a/tests/fp8_kv/llama2-70b-fp8-kv/kv_cache_scales.json
+++ /dev/null
@@ -1,90 +0,0 @@
-{
-    "model_type": "llama",
-    "kv_cache": {
-        "dtype": "float8_e4m3fn",
-        "scaling_factor": {
-            "0": {
-                "0": 0.0230364128947258,
-                "1": 0.01979283057153225,
-                "2": 0.0241350457072258,
-                "3": 0.0308314748108387,
-                "4": 0.0430733822286129,
-                "5": 0.0370396226644516,
-                "6": 0.0306222103536129,
-                "7": 0.0357491634786129,
-                "8": 0.0358189195394516,
-                "9": 0.0443289652466774,
-                "10": 0.0433175228536129,
-                "11": 0.0416782945394516,
-                "12": 0.0366908498108387,
-                "13": 0.0432477705180645,
-                "14": 0.0410505048930645,
-                "15": 0.0457589291036129,
-                "16": 0.0418526791036129,
-                "17": 0.0432477705180645,
-                "18": 0.0469447560608387,
-                "19": 0.0514787957072258,
-                "20": 0.0541294664144516,
-                "21": 0.0587681382894516,
-                "22": 0.0625,
-                "23": 0.0585588738322258,
-                "24": 0.0600237175822258,
-                "25": 0.0588030144572258,
-                "26": 0.0531180277466774,
-                "27": 0.06396484375,
-                "28": 0.0603027381002903,
-                "29": 0.0582101047039032,
-                "30": 0.0625348836183548,
-                "31": 0.0585588738322258,
-                "32": 0.0582798570394516,
-                "33": 0.0575125589966774,
-                "34": 0.0590820349752903,
-                "35": 0.0614188089966774,
-                "36": 0.0631975457072258,
-                "37": 0.0615931935608387,
-                "38": 0.0601283498108387,
-                "39": 0.0571986623108387,
-                "40": 0.0670340433716774,
-                "41": 0.0523507259786129,
-                "42": 0.0547223798930645,
-                "43": 0.0631975457072258,
-                "44": 0.0663713738322258,
-                "45": 0.0603376142680645,
-                "46": 0.0652204304933548,
-                "47": 0.0734514519572258,
-                "48": 0.0693708211183548,
-                "49": 0.0725446492433548,
-                "50": 0.0627790242433548,
-                "51": 0.0691266804933548,
-                "52": 0.0688825398683548,
-                "53": 0.068429134786129,
-                "54": 0.0605119988322258,
-                "55": 0.0799386203289032,
-                "56": 0.0853097140789032,
-                "57": 0.0661969929933548,
-                "58": 0.0689871683716774,
-                "59": 0.0724051371216774,
-                "60": 0.0541643425822258,
-                "61": 0.0626743882894516,
-                "62": 0.0628487765789032,
-                "63": 0.0607212632894516,
-                "64": 0.0589076466858387,
-                "65": 0.0451660193502903,
-                "66": 0.0453055277466774,
-                "67": 0.0414341539144516,
-                "68": 0.0385044664144516,
-                "69": 0.0414341539144516,
-                "70": 0.0466308631002903,
-                "71": 0.0399693101644516,
-                "72": 0.0437011756002903,
-                "73": 0.0434221550822258,
-                "74": 0.0428989976644516,
-                "75": 0.0401785746216774,
-                "76": 0.0431082621216774,
-                "77": 0.0484444759786129,
-                "78": 0.0417829267680645,
-                "79": 0.0418178029358387
-            }
-        }
-    }
-}
\ No newline at end of file
diff --git a/tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json b/tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json
deleted file mode 100644
index bb734039e982b..0000000000000
--- a/tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json
+++ /dev/null
@@ -1,42 +0,0 @@
-{
-    "model_type": "llama",
-    "kv_cache": {
-        "dtype": "float8_e4m3fn",
-        "scaling_factor": {
-            "0": {
-                "0": 0.0152239128947258,
-                "1": 0.0188860222697258,
-                "2": 0.0354178324341774,
-                "3": 0.0376674123108387,
-                "4": 0.0418526791036129,
-                "5": 0.0433175228536129,
-                "6": 0.0397600457072258,
-                "7": 0.0424455925822258,
-                "8": 0.0415387861430645,
-                "9": 0.0408412404358387,
-                "10": 0.0395856611430645,
-                "11": 0.0377371683716774,
-                "12": 0.0400739423930645,
-                "13": 0.040771484375,
-                "14": 0.0393415205180645,
-                "15": 0.0369001142680645,
-                "16": 0.03857421875,
-                "17": 0.0387486070394516,
-                "18": 0.0403180830180645,
-                "19": 0.0396205373108387,
-                "20": 0.0375627800822258,
-                "21": 0.0407366082072258,
-                "22": 0.0432477705180645,
-                "23": 0.0377022884786129,
-                "24": 0.0399693101644516,
-                "25": 0.0374581478536129,
-                "26": 0.0413295216858387,
-                "27": 0.0442243330180645,
-                "28": 0.0424804724752903,
-                "29": 0.0456891767680645,
-                "30": 0.0409109964966774,
-                "31": 0.0482352152466774
-            }
-        }
-    }
-}
diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py
index 124d5d297a574..574a0f223ef0d 100644
--- a/tests/kernels/test_attention.py
+++ b/tests/kernels/test_attention.py
@@ -182,7 +182,7 @@ def test_paged_attention(
     key_cache, value_cache = key_caches[0], value_caches[0]
 
     # Using default kv_scale
-    k_scale = v_scale = 1.0
+    k_scale = v_scale = torch.tensor(1.0, dtype=torch.float32, device=device)
 
     # Call the paged attention kernel.
     output = torch.empty_like(query)
diff --git a/tests/kernels/test_blocksparse_attention.py b/tests/kernels/test_blocksparse_attention.py
index fad342d1b5923..08f31219e3574 100644
--- a/tests/kernels/test_blocksparse_attention.py
+++ b/tests/kernels/test_blocksparse_attention.py
@@ -210,7 +210,7 @@ def test_paged_attention(
     key_cache, value_cache = key_caches[0], value_caches[0]
 
     # Using default kv_scale
-    k_scale = v_scale = 1.0
+    k_scale = v_scale = torch.tensor(1.0, dtype=torch.float32, device=device)
     tp_rank = 0
 
     # Call the paged attention kernel.
diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py
index 40550ed51e2c7..c848be4f9d807 100644
--- a/tests/kernels/test_cache.py
+++ b/tests/kernels/test_cache.py
@@ -160,7 +160,7 @@ def test_reshape_and_cache(
         cloned_value_cache = value_cache.clone()
 
     # Using default kv_scale
-    k_scale = v_scale = 1.0
+    k_scale = v_scale = torch.tensor(1.0, dtype=torch.float32, device=device)
 
     # Call the reshape_and_cache kernel.
     opcheck(torch.ops._C_cache_ops.reshape_and_cache,
@@ -258,8 +258,8 @@ def test_reshape_and_cache_flash(
     del key_caches
     del value_caches
 
-    k_scale = key.amax().item() / 256
-    v_scale = value.amax().item() / 256
+    k_scale = (key.amax() / 256.0).to(torch.float32)
+    v_scale = (value.amax() / 256.0).to(torch.float32)
 
     # Clone the KV caches.
     if kv_cache_dtype == "fp8":
@@ -284,12 +284,12 @@ def test_reshape_and_cache_flash(
         result_key_cache = torch.empty_like(key_cache, dtype=torch.float16)
         ops.convert_fp8(result_key_cache,
                         key_cache,
-                        k_scale,
+                        k_scale.item(),
                         kv_dtype=kv_cache_dtype)
         result_value_cache = torch.empty_like(value_cache, dtype=torch.float16)
         ops.convert_fp8(result_value_cache,
                         value_cache,
-                        v_scale,
+                        v_scale.item(),
                         kv_dtype=kv_cache_dtype)
 
     # Run the reference implementation.
diff --git a/tests/kernels/test_prefix_prefill.py b/tests/kernels/test_prefix_prefill.py
index 3fdb7996ba4e0..10e73ab950b0e 100644
--- a/tests/kernels/test_prefix_prefill.py
+++ b/tests/kernels/test_prefix_prefill.py
@@ -138,6 +138,7 @@ def test_contexted_kv_attention(
     # to V_cache[num_blocks, num_kv_heads, head_size, block_size]
     v_cache = v_cache.view(-1, block_size, num_kv_heads,
                            head_size).permute(0, 2, 3, 1).contiguous()
+    k_scale = v_scale = torch.tensor(1.0, dtype=torch.float32, device=device)
 
     # Warm up the Triton kernel by calling it once before actually measuring
     # generation time
@@ -153,6 +154,8 @@ def test_contexted_kv_attention(
                           b_seq_len,
                           b_ctx_len,
                           max_input_len,
+                          k_scale,
+                          v_scale,
                           sliding_window=sliding_window)
     torch.cuda.synchronize()
     start_time = time.time()
@@ -168,6 +171,8 @@ def test_contexted_kv_attention(
                           b_seq_len,
                           b_ctx_len,
                           max_input_len,
+                          k_scale,
+                          v_scale,
                           sliding_window=sliding_window)
     torch.cuda.synchronize()
     end_time = time.time()
@@ -366,6 +371,7 @@ def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor:
     # to V_cache[num_blocks, num_kv_heads, head_size, block_size]
     v_cache = v_cache.view(-1, block_size, num_kv_heads,
                            head_size).permute(0, 2, 3, 1).contiguous()
+    k_scale = v_scale = torch.tensor(1.0, dtype=torch.float32, device=device)
 
     # Warm up the Triton kernel by calling it once before actually measuring
     # generation time
@@ -381,6 +387,8 @@ def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor:
                           b_seq_len,
                           b_ctx_len,
                           max_input_len,
+                          k_scale,
+                          v_scale,
                           alibi_slopes=alibi_slopes)
     torch.cuda.synchronize()
     start_time = time.time()
@@ -396,6 +404,8 @@ def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor:
                           b_seq_len,
                           b_ctx_len,
                           max_input_len,
+                          k_scale,
+                          v_scale,
                           alibi_slopes=alibi_slopes)
     torch.cuda.synchronize()
     end_time = time.time()
diff --git a/tests/kernels/utils.py b/tests/kernels/utils.py
index 848eea7f54cab..8011398551b9d 100644
--- a/tests/kernels/utils.py
+++ b/tests/kernels/utils.py
@@ -909,6 +909,7 @@ def make_test_metadata(
             num_prefills=num_prefills,
             slot_mapping=(None if kv_mmap is None else kv_mmap.slot_mapping),
             multi_modal_placeholder_index_maps=None,
+            enable_kv_scales_calculation=True,
             num_prefill_tokens=num_prefill_tokens,
             num_decode_tokens=num_decode_tokens,
             seq_lens=seq_lens,
@@ -958,6 +959,7 @@ def make_test_metadata(
             num_prefills=num_prefills,
             slot_mapping=kv_mmap.slot_mapping,
             multi_modal_placeholder_index_maps=None,
+            enable_kv_scales_calculation=True,
             num_prefill_tokens=num_prefill_tokens,
             num_decode_tokens=num_decode_tokens,
             seq_lens=seq_lens,
diff --git a/tests/models/decoder_only/language/test_fp8.py b/tests/models/decoder_only/language/test_fp8.py
index 53f23e24511b3..5f06f1e3a2fe9 100644
--- a/tests/models/decoder_only/language/test_fp8.py
+++ b/tests/models/decoder_only/language/test_fp8.py
@@ -19,18 +19,17 @@
 @pytest.mark.skipif(not is_quant_method_supported("fp8"),
                     reason="fp8 is not supported on this GPU type.")
 @pytest.mark.parametrize(
-    "kv_cache_dtype,base_model,test_model,scale_path",
+    "kv_cache_dtype,base_model,test_model",
     [
         # Test FP8 checkpoint w. fp8_e4m3 kv-cache scaling factors.
         ("fp8_e4m3", "meta-llama/Llama-3.2-1B-Instruct",
-         "nm-testing/Llama-3.2-1B-Instruct-FP8-KV", None),
+         "nm-testing/Llama-3.2-1B-Instruct-FP8-KV"),
         # Test FP16 checkpoint w. fp8_e5m2 kv-cache.
         ("fp8_e5m2", "meta-llama/Llama-3.2-1B-Instruct",
-         "meta-llama/Llama-3.2-1B-Instruct", None),
+         "meta-llama/Llama-3.2-1B-Instruct"),
         # Test FP16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json.
         ("fp8_e4m3", "meta-llama/Llama-2-7b-chat-hf",
-         "meta-llama/Llama-2-7b-chat-hf",
-         "./tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json")
+         "meta-llama/Llama-2-7b-chat-hf")
     ])
 # Due to low-precision numerical divergence, we only test logprob of 4 tokens
 @pytest.mark.parametrize("max_tokens", [4])
@@ -48,7 +47,6 @@ def test_models(
     kv_cache_dtype: str,
     base_model: str,
     test_model: str,
-    scale_path: Optional[str],
     max_tokens: int,
     enforce_eager: bool,
     backend: str,
@@ -76,10 +74,6 @@ def test_models(
         baseline_outputs = vllm_model.generate_greedy_logprobs(
             example_prompts, max_tokens, NUM_LOG_PROBS)
 
-    extra_kwargs = {}
-    if scale_path is not None:
-        extra_kwargs["quantization_param_path"] = scale_path
-
     with vllm_runner(
             test_model,
             max_model_len=MAX_MODEL_LEN,
@@ -87,7 +81,6 @@ def test_models(
             enforce_eager=enforce_eager,
             kv_cache_dtype=kv_cache_dtype,
             disable_async_output_proc=disable_async_output_proc,
-            **extra_kwargs,
     ) as vllm_model:
         test_outputs = vllm_model.generate_greedy_logprobs(
             example_prompts, max_tokens, NUM_LOG_PROBS)
diff --git a/tests/worker/test_model_input.py b/tests/worker/test_model_input.py
index 309854e6babf3..57f1fd47a600f 100644
--- a/tests/worker/test_model_input.py
+++ b/tests/worker/test_model_input.py
@@ -74,6 +74,7 @@ def test_model_runner_input():
         num_decode_tokens=3,
         slot_mapping=torch.zeros(1),
         multi_modal_placeholder_index_maps=None,
+        enable_kv_scales_calculation=True,
     )
     model_input = ModelInputForGPUWithSamplingMetadata(
         input_tokens=torch.ones(10),
@@ -126,6 +127,7 @@ def test_embedding_model_runner_input():
         num_decode_tokens=3,
         slot_mapping=torch.zeros(1),
         multi_modal_placeholder_index_maps=None,
+        enable_kv_scales_calculation=True,
     )
     model_input = ModelInputForGPUWithPoolingMetadata(
         input_tokens=torch.ones(10),
@@ -177,6 +179,7 @@ def test_multi_step_model_runner_input():
         num_decode_tokens=3,
         slot_mapping=torch.zeros(1),
         multi_modal_placeholder_index_maps=None,
+        enable_kv_scales_calculation=True,
     )
     frozen_model_input = ModelInputForGPUWithSamplingMetadata(
         input_tokens=torch.ones(10),
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index d04cbbc0a9eed..440bc52012ab7 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -48,8 +48,8 @@ def paged_attention_v1(
     max_seq_len: int,
     alibi_slopes: Optional[torch.Tensor],
     kv_cache_dtype: str,
-    k_scale: float,
-    v_scale: float,
+    k_scale: torch.Tensor,
+    v_scale: torch.Tensor,
     tp_rank: int = 0,
     blocksparse_local_blocks: int = 0,
     blocksparse_vert_stride: int = 0,
@@ -80,8 +80,8 @@ def paged_attention_v2(
     max_seq_len: int,
     alibi_slopes: Optional[torch.Tensor],
     kv_cache_dtype: str,
-    k_scale: float,
-    v_scale: float,
+    k_scale: torch.Tensor,
+    v_scale: torch.Tensor,
     tp_rank: int = 0,
     blocksparse_local_blocks: int = 0,
     blocksparse_vert_stride: int = 0,
@@ -112,8 +112,8 @@ def paged_attention_rocm(
     max_seq_len: int,
     alibi_slopes: Optional[torch.Tensor],
     kv_cache_dtype: str,
-    k_scale: float,
-    v_scale: float,
+    k_scale: torch.Tensor,
+    v_scale: torch.Tensor,
 ) -> None:
     torch.ops._rocm_C.paged_attention(out, exp_sum, max_logits, tmp_out, query,
                                       key_cache, value_cache, num_kv_heads,
@@ -956,8 +956,8 @@ def reshape_and_cache(
     value_cache: torch.Tensor,
     slot_mapping: torch.Tensor,
     kv_cache_dtype: str,
-    k_scale: float,
-    v_scale: float,
+    k_scale: torch.Tensor,
+    v_scale: torch.Tensor,
 ) -> None:
     torch.ops._C_cache_ops.reshape_and_cache(key, value, key_cache,
                                              value_cache, slot_mapping,
@@ -971,8 +971,8 @@ def reshape_and_cache_flash(
     value_cache: torch.Tensor,
     slot_mapping: torch.Tensor,
     kv_cache_dtype: str,
-    k_scale: float,
-    v_scale: float,
+    k_scale: torch.Tensor,
+    v_scale: torch.Tensor,
 ) -> None:
     torch.ops._C_cache_ops.reshape_and_cache_flash(key, value, key_cache,
                                                    value_cache, slot_mapping,
diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py
index 2efe142a17b69..8027a52b82ffc 100644
--- a/vllm/attention/backends/abstract.py
+++ b/vllm/attention/backends/abstract.py
@@ -123,6 +123,10 @@ class AttentionMetadata:
     multi_modal_placeholder_index_maps: Optional[Dict[
         str, MultiModalPlaceholderMap.IndexMap]]
 
+    # Enable/disable KV scales calculation. This is so that we can disable the
+    # calculation until after prefill and cuda graph capture.
+    enable_kv_scales_calculation: bool
+
     @property
     @abstractmethod
     def prefill_metadata(self) -> Optional["AttentionMetadata"]:
@@ -226,8 +230,10 @@ def build(self, seq_lens: List[int], query_lens: List[int],
 
 class AttentionLayer(Protocol):
 
-    _k_scale: float
-    _v_scale: float
+    _k_scale: torch.Tensor
+    _v_scale: torch.Tensor
+    _k_scale_float: float
+    _v_scale_float: float
 
     def forward(
         self,
diff --git a/vllm/attention/backends/blocksparse_attn.py b/vllm/attention/backends/blocksparse_attn.py
index 9089db1126c94..20e9a3f139de2 100644
--- a/vllm/attention/backends/blocksparse_attn.py
+++ b/vllm/attention/backends/blocksparse_attn.py
@@ -222,6 +222,7 @@ def prefill_metadata(
             slot_mapping=self.slot_mapping[:self.num_prefill_tokens],
             multi_modal_placeholder_index_maps=self.
             multi_modal_placeholder_index_maps,
+            enable_kv_scales_calculation=self.enable_kv_scales_calculation,
             seq_lens=self.seq_lens[:self.num_prefills],
             seq_lens_tensor=self.seq_lens_tensor[:self.num_prefills],
             max_query_len=self.max_query_len,
@@ -251,6 +252,7 @@ def decode_metadata(self) -> Optional["BlocksparseFlashAttentionMetadata"]:
             num_decode_tokens=self.num_decode_tokens,
             slot_mapping=self.slot_mapping[self.num_prefill_tokens:],
             multi_modal_placeholder_index_maps=None,
+            enable_kv_scales_calculation=False,
             seq_lens=None,
             seq_lens_tensor=self.seq_lens_tensor[self.num_prefills:],
             max_query_len=None,
diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py
index 18acfb82fac58..1be099283e472 100644
--- a/vllm/attention/backends/flash_attn.py
+++ b/vllm/attention/backends/flash_attn.py
@@ -230,6 +230,7 @@ def prefill_metadata(self) -> Optional["FlashAttentionMetadata"]:
             slot_mapping=slot_mapping,
             multi_modal_placeholder_index_maps=self.
             multi_modal_placeholder_index_maps,
+            enable_kv_scales_calculation=self.enable_kv_scales_calculation,
             seq_lens=seq_lens,
             seq_lens_tensor=seq_lens_tensor,
             max_query_len=self.max_query_len,
@@ -274,6 +275,7 @@ def decode_metadata(self) -> Optional["FlashAttentionMetadata"]:
             num_decode_tokens=self.num_decode_tokens,
             slot_mapping=slot_mapping,
             multi_modal_placeholder_index_maps=None,
+            enable_kv_scales_calculation=True,
             seq_lens=None,
             seq_lens_tensor=seq_lens_tensor,
             max_decode_query_len=self.max_decode_query_len,
@@ -557,6 +559,7 @@ def build(self, seq_lens: List[int], query_lens: List[int],
             num_decode_tokens=num_decode_tokens,
             seq_lens=seq_lens,
             multi_modal_placeholder_index_maps=placeholder_index_maps,
+            enable_kv_scales_calculation=True,
             seq_lens_tensor=seq_lens_tensor,
             max_query_len=max_query_len,
             max_decode_query_len=max_decode_query_len,
@@ -675,7 +678,7 @@ def forward(
         NOTE: It in-place updates the output tensor.
         """
         # NOTE(woosuk): FlashAttention does not support FP8 KV cache.
-        assert layer._k_scale == 1.0 and layer._v_scale == 1.0, (
+        assert layer._k_scale_float == 1.0 and layer._v_scale_float == 1.0, (
             "key/v_scale is not supported in FlashAttention.")
 
         assert output is not None, "Output tensor must be provided."
diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py
index b8ffbe6dd64dd..3135b0b405343 100644
--- a/vllm/attention/backends/flashinfer.py
+++ b/vllm/attention/backends/flashinfer.py
@@ -219,6 +219,7 @@ def graph_capture_get_metadata_for_batch(
             num_prefills=0,
             slot_mapping=self._graph_slot_mapping[:batch_size],
             multi_modal_placeholder_index_maps=None,
+            enable_kv_scales_calculation=False,
             num_prefill_tokens=0,
             num_decode_tokens=batch_size,
             max_prefill_seq_len=0,
@@ -733,6 +734,7 @@ def build(self, seq_lens: List[int], query_lens: List[int],
             num_prefills=self.num_prefills,
             slot_mapping=slot_mapping_tensor,
             multi_modal_placeholder_index_maps=placeholder_index_maps,
+            enable_kv_scales_calculation=False,
             num_prefill_tokens=self.num_prefill_tokens,
             num_decode_tokens=num_decode_tokens,
             max_prefill_seq_len=max_prefill_seq_len,
@@ -888,8 +890,8 @@ def forward(
                     kv_cache,
                     logits_soft_cap=logits_soft_cap,
                     causal=True,
-                    k_scale=layer._k_scale,
-                    v_scale=layer._v_scale,
+                    k_scale=layer._k_scale_float,
+                    v_scale=layer._v_scale_float,
                     window_left=window_left)
         if decode_meta := attn_metadata.decode_metadata:
             assert decode_meta is not None
@@ -899,8 +901,8 @@ def forward(
                 kv_cache,
                 sm_scale=softmax_scale,
                 logits_soft_cap=logits_soft_cap,
-                k_scale=layer._k_scale,
-                v_scale=layer._v_scale,
+                k_scale=layer._k_scale_float,
+                v_scale=layer._v_scale_float,
                 window_left=window_left)
 
         if prefill_output is None and decode_output is not None:
diff --git a/vllm/attention/backends/ipex_attn.py b/vllm/attention/backends/ipex_attn.py
index cd729a1c8b274..57916a3c6a34c 100644
--- a/vllm/attention/backends/ipex_attn.py
+++ b/vllm/attention/backends/ipex_attn.py
@@ -193,7 +193,7 @@ def forward(
         Returns:
             shape = [num_tokens, num_heads * head_size]
         """
-        assert layer._k_scale == 1.0 and layer._v_scale == 1.0
+        assert layer._k_scale_float == 1.0 and layer._v_scale_float == 1.0
         num_tokens, hidden_size = query.shape
         # Reshape the query, key, and value tensors.
         query = query.view(-1, self.num_heads, self.head_size)
diff --git a/vllm/attention/backends/pallas.py b/vllm/attention/backends/pallas.py
index f5bf390df6afb..facdee6b29e39 100644
--- a/vllm/attention/backends/pallas.py
+++ b/vllm/attention/backends/pallas.py
@@ -173,7 +173,7 @@ def forward(
         Returns:
             shape = [batch_size, seq_len, num_heads * head_size]
         """
-        assert layer._k_scale == 1.0 and layer._v_scale == 1.0
+        assert layer._k_scale_float == 1.0 and layer._v_scale_float == 1.0
         batch_size, seq_len, hidden_size = query.shape
         query = query.view(batch_size, seq_len, self.num_heads, self.head_size)
         key = key.view(batch_size, seq_len, self.num_kv_heads, self.head_size)
diff --git a/vllm/attention/backends/placeholder_attn.py b/vllm/attention/backends/placeholder_attn.py
index 37860494702cf..826311896d1d2 100644
--- a/vllm/attention/backends/placeholder_attn.py
+++ b/vllm/attention/backends/placeholder_attn.py
@@ -140,6 +140,7 @@ def prefill_metadata(self) -> Optional["PlaceholderAttentionMetadata"]:
             slot_mapping=slot_mapping,
             multi_modal_placeholder_index_maps=self.
             multi_modal_placeholder_index_maps,
+            enable_kv_scales_calculation=self.enable_kv_scales_calculation,
             seq_lens=self.seq_lens[:self.num_prefills],
             seq_lens_tensor=self.seq_lens_tensor[:self.num_prefills],
             max_decode_query_len=0,
@@ -173,6 +174,7 @@ def decode_metadata(self) -> Optional["PlaceholderAttentionMetadata"]:
             num_decode_tokens=self.num_decode_tokens,
             slot_mapping=slot_mapping,
             multi_modal_placeholder_index_maps=None,
+            enable_kv_scales_calculation=True,
             seq_lens=None,
             seq_lens_tensor=self.seq_lens_tensor[self.num_prefills:],
             max_decode_query_len=self.max_decode_query_len,
@@ -380,6 +382,7 @@ def build(self, seq_lens: List[int], query_lens: List[int],
             num_prefills=self.num_prefills,
             slot_mapping=slot_mapping,
             multi_modal_placeholder_index_maps=placeholder_index_maps,
+            enable_kv_scales_calculation=True,
             num_prefill_tokens=self.num_prefill_tokens,
             num_decode_tokens=num_decode_tokens,
             seq_lens=seq_lens,
diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py
index e9f2808ff1674..ca6fa9ca61b30 100644
--- a/vllm/attention/backends/rocm_flash_attn.py
+++ b/vllm/attention/backends/rocm_flash_attn.py
@@ -153,6 +153,7 @@ def prefill_metadata(self) -> Optional["ROCmFlashAttentionMetadata"]:
             slot_mapping=self.slot_mapping[:self.num_prefill_tokens],
             multi_modal_placeholder_index_maps=self.
             multi_modal_placeholder_index_maps,
+            enable_kv_scales_calculation=self.enable_kv_scales_calculation,
             seq_lens=self.seq_lens[:self.num_prefills],
             seq_lens_tensor=self.seq_lens_tensor[:self.num_prefills],
             max_query_len=self.max_query_len,
@@ -182,6 +183,7 @@ def decode_metadata(self) -> Optional["ROCmFlashAttentionMetadata"]:
             num_decode_tokens=self.num_decode_tokens,
             slot_mapping=self.slot_mapping[self.num_prefill_tokens:],
             multi_modal_placeholder_index_maps=None,
+            enable_kv_scales_calculation=True,
             seq_lens=None,
             seq_lens_tensor=self.seq_lens_tensor[self.num_prefills:],
             max_query_len=None,
diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py
index 8722d7376795a..c3b2398b4e632 100644
--- a/vllm/attention/backends/torch_sdpa.py
+++ b/vllm/attention/backends/torch_sdpa.py
@@ -379,6 +379,7 @@ def build(self, seq_lens: List[int], query_lens: List[int],
             prefill_block_tables=prefill_block_tables,
             slot_mapping=slot_mapping,
             multi_modal_placeholder_index_maps=placeholder_index_maps,
+            enable_kv_scales_calculation=False,
         )
 
         return attn_metadata
@@ -454,7 +455,6 @@ def forward(
         Returns:
             shape = [num_tokens, num_heads * head_size]
         """
-        assert layer._k_scale == 1.0 and layer._v_scale == 1.0
         attn_type = self.attn_type
         if (attn_type == AttentionType.ENCODER
                 and (not attn_metadata.is_all_encoder_attn_metadata_set)):
diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py
index 3df7f54cbd8d2..84fe89b7df360 100644
--- a/vllm/attention/backends/utils.py
+++ b/vllm/attention/backends/utils.py
@@ -265,6 +265,7 @@ def build(self, seq_lens: List[int], query_lens: List[int],
             num_prefills=self.num_prefills,
             slot_mapping=slot_mapping_tensor,
             multi_modal_placeholder_index_maps=placeholder_index_maps,
+            enable_kv_scales_calculation=True,
             num_prefill_tokens=self.num_prefill_tokens,
             num_decode_tokens=num_decode_tokens,
             seq_lens=seq_lens,
@@ -317,6 +318,7 @@ def graph_capture_get_metadata_for_batch(
             num_decode_tokens=batch_size,
             slot_mapping=self._graph_slot_mapping[:batch_size],
             multi_modal_placeholder_index_maps=None,
+            enable_kv_scales_calculation=True,
             seq_lens=None,
             seq_lens_tensor=self._graph_seq_lens[:batch_size],
             max_query_len=1,
diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py
index 38e27434dab2c..8c25dda7aad2c 100644
--- a/vllm/attention/backends/xformers.py
+++ b/vllm/attention/backends/xformers.py
@@ -218,6 +218,7 @@ def prefill_metadata(self) -> Optional["XFormersMetadata"]:
             slot_mapping=slot_mapping,
             multi_modal_placeholder_index_maps=self.
             multi_modal_placeholder_index_maps,
+            enable_kv_scales_calculation=self.enable_kv_scales_calculation,
             seq_lens=seq_lens,
             seq_lens_tensor=seq_lens_tensor,
             max_query_len=self.max_query_len,
@@ -262,6 +263,7 @@ def decode_metadata(self) -> Optional["XFormersMetadata"]:
             num_decode_tokens=self.num_decode_tokens,
             slot_mapping=slot_mapping,
             multi_modal_placeholder_index_maps=None,
+            enable_kv_scales_calculation=True,
             seq_lens_tensor=seq_lens_tensor,
             max_prefill_seq_len=0,
             max_decode_seq_len=self.max_decode_seq_len,
diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index c36f8d08eb4a7..79ea9b666c7e8 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -5,6 +5,7 @@
 import torch.nn as nn
 import torch.nn.functional as F
 
+import vllm.envs as envs
 from vllm.attention import AttentionMetadata, AttentionType
 from vllm.attention.selector import backend_name_to_enum, get_attn_backend
 from vllm.config import CacheConfig, get_current_vllm_config
@@ -57,10 +58,12 @@ def __init__(
             kv_cache_dtype = cache_config.cache_dtype
             block_size = cache_config.block_size
             is_attention_free = cache_config.is_attention_free
+            calculate_kv_scales = cache_config.calculate_kv_scales
         else:
             kv_cache_dtype = "auto"
             block_size = 16
             is_attention_free = False
+            calculate_kv_scales = False
         if num_kv_heads is None:
             num_kv_heads = num_heads
 
@@ -70,8 +73,15 @@ def __init__(
         # expect the pre-quantized k/v_scale to be loaded along
         # with the model weights.
         self.kv_cache_dtype = kv_cache_dtype
-        self._k_scale = 1.0
-        self._v_scale = 1.0
+        self.calculate_kv_scales = calculate_kv_scales
+        self._k_scale = torch.tensor(1.0, dtype=torch.float32)
+        self._v_scale = torch.tensor(1.0, dtype=torch.float32)
+
+        # We also keep the float32 versions of k/v_scale for attention
+        # backends that don't support tensors (Flashinfer)
+        self._k_scale_float = 1.0
+        self._v_scale_float = 1.0
+
         quant_method = quant_config.get_quant_method(
             self, prefix=prefix) if quant_config else None
         if quant_method is not None:
@@ -127,6 +137,9 @@ def __init__(
             ).parallel_config.pipeline_parallel_size)
         ]
 
+        self.k_range = torch.tensor(envs.K_SCALE_CONSTANT, dtype=torch.float32)
+        self.v_range = torch.tensor(envs.V_SCALE_CONSTANT, dtype=torch.float32)
+
     def forward(
         self,
         query: torch.Tensor,
@@ -135,6 +148,9 @@ def forward(
         kv_cache: torch.Tensor,
         attn_metadata: AttentionMetadata,
     ) -> torch.Tensor:
+        if self.calculate_kv_scales and \
+            attn_metadata.enable_kv_scales_calculation:
+            self.calc_kv_scales(key, value)
         if self.use_output:
             output = torch.empty_like(query)
             hidden_size = query.size(-1)
@@ -161,6 +177,14 @@ def forward(
                 return torch.ops.vllm.unified_attention(
                     query, key, value, self.layer_name)
 
+    def calc_kv_scales(self, key, value):
+        self._k_scale.copy_(torch.abs(key).max() / self.k_range)
+        self._v_scale.copy_(torch.abs(value).max() / self.v_range)
+        self._k_scale_float = self._k_scale.item()
+        self._v_scale_float = self._v_scale.item()
+        # We only calculate the scales once
+        self.calculate_kv_scales = False
+
     def extra_repr(self) -> str:
         s = f"head_size={self.impl.head_size}"  # type: ignore
         s += f", num_heads={self.impl.num_heads}"  # type: ignore
diff --git a/vllm/attention/ops/ipex_attn.py b/vllm/attention/ops/ipex_attn.py
index cbc6c74acf09a..3a07184ed31f0 100644
--- a/vllm/attention/ops/ipex_attn.py
+++ b/vllm/attention/ops/ipex_attn.py
@@ -52,8 +52,8 @@ def write_to_paged_cache(
         value_cache: torch.Tensor,
         slot_mapping: torch.Tensor,
         kv_cache_dtype: str,
-        k_scale: float,
-        v_scale: float,
+        k_scale: torch.Tensor,
+        v_scale: torch.Tensor,
         *args,
     ) -> None:
         ops.reshape_and_cache(
@@ -80,8 +80,8 @@ def forward_decode(
         num_kv_heads: int,
         scale: float,
         alibi_slopes: Optional[torch.Tensor],
-        k_scale: float,
-        v_scale: float,
+        k_scale: torch.Tensor,
+        v_scale: torch.Tensor,
         *args,
     ) -> None:
         tp_rank: int = 0
@@ -149,8 +149,8 @@ def write_to_paged_cache(
         value_cache: torch.Tensor,
         slot_mapping: torch.Tensor,
         kv_cache_dtype: str,
-        k_scale: float,
-        v_scale: float,
+        k_scale: torch.Tensor,
+        v_scale: torch.Tensor,
         *args,
     ) -> None:
         ipex_modules.PagedAttention.reshape_and_cache(
@@ -170,8 +170,8 @@ def forward_decode(
         num_kv_heads: int,
         scale: float,
         alibi_slopes: Optional[torch.Tensor],
-        k_scale: float,
-        v_scale: float,
+        k_scale: torch.Tensor,
+        v_scale: torch.Tensor,
         *args,
     ) -> None:
         block_size = value_cache.shape[2]
diff --git a/vllm/attention/ops/paged_attn.py b/vllm/attention/ops/paged_attn.py
index 076f151ffcb61..fd62329141f6f 100644
--- a/vllm/attention/ops/paged_attn.py
+++ b/vllm/attention/ops/paged_attn.py
@@ -69,8 +69,8 @@ def write_to_paged_cache(
         value_cache: torch.Tensor,
         slot_mapping: torch.Tensor,
         kv_cache_dtype: str,
-        k_scale: float,
-        v_scale: float,
+        k_scale: torch.Tensor,
+        v_scale: torch.Tensor,
     ) -> None:
         ops.reshape_and_cache(
             key,
@@ -95,8 +95,8 @@ def forward_decode(
         num_kv_heads: int,
         scale: float,
         alibi_slopes: Optional[torch.Tensor],
-        k_scale: float,
-        v_scale: float,
+        k_scale: torch.Tensor,
+        v_scale: torch.Tensor,
         tp_rank: int = 0,
         blocksparse_local_blocks: int = 0,
         blocksparse_vert_stride: int = 0,
@@ -204,8 +204,8 @@ def forward_prefix(
         max_query_len: int,
         alibi_slopes: Optional[torch.Tensor],
         sliding_window: Optional[int],
-        k_scale: float,
-        v_scale: float,
+        k_scale: torch.Tensor,
+        v_scale: torch.Tensor,
     ) -> torch.Tensor:
         output = torch.empty_like(query)
         context_attention_fwd(
diff --git a/vllm/attention/ops/prefix_prefill.py b/vllm/attention/ops/prefix_prefill.py
index 9c11a8df55278..e2f2b66dfc90c 100644
--- a/vllm/attention/ops/prefix_prefill.py
+++ b/vllm/attention/ops/prefix_prefill.py
@@ -133,7 +133,7 @@ def _fwd_kernel(
                              other=0.0)  # [D,N]
 
             if k_load.dtype.is_fp8():
-                k = (k_load.to(tl.float32) * k_scale).to(q.dtype)
+                k = (k_load.to(tl.float32) * tl.load(k_scale)).to(q.dtype)
             else:
                 k = k_load
 
@@ -181,7 +181,7 @@ def _fwd_kernel(
                              ((start_n + offs_n[:, None]) < cur_batch_ctx_len),
                              other=0.0)  # [N,D]
             if v_load.dtype.is_fp8():
-                v = (v_load.to(tl.float32) * v_scale).to(q.dtype)
+                v = (v_load.to(tl.float32) * tl.load(v_scale)).to(q.dtype)
             else:
                 v = v_load
             p = p.to(v.dtype)
@@ -564,7 +564,7 @@ def _fwd_kernel_alibi(
                              other=0.0)  # [D,N]
 
             if k_load.dtype.is_fp8():
-                k = (k_load.to(tl.float32) * k_scale).to(q.dtype)
+                k = (k_load.to(tl.float32) * tl.load(k_scale)).to(q.dtype)
             else:
                 k = k_load
 
@@ -604,7 +604,7 @@ def _fwd_kernel_alibi(
                              ((start_n + offs_n[:, None]) < cur_batch_ctx_len),
                              other=0.0)
             if v_load.dtype.is_fp8():
-                v = (v_load.to(tl.float32) * v_scale).to(q.dtype)
+                v = (v_load.to(tl.float32) * tl.load(v_scale)).to(q.dtype)
             else:
                 v = v_load
             p = p.to(v.dtype)
@@ -713,8 +713,8 @@ def context_attention_fwd(q,
                               b_seq_len,
                               b_ctx_len,
                               max_input_len,
-                              k_scale: float = 1.0,
-                              v_scale: float = 1.0,
+                              k_scale: torch.Tensor,
+                              v_scale: torch.Tensor,
                               alibi_slopes=None,
                               sliding_window=None):
 
diff --git a/vllm/config.py b/vllm/config.py
index f7547921a05ea..efd81ad3de3b4 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -120,11 +120,6 @@ class ModelConfig:
             decoding draft models.
         quantization: Quantization method that was used to quantize the model
             weights. If None, we assume the model weights are not quantized.
-        quantization_param_path: Path to JSON file containing scaling factors.
-            Used to load KV cache scaling factors into the model when KV cache
-            type is FP8_E4M3 on ROCm (AMD GPU). In the future these will also
-            be used to load activation and weight scaling factors when the
-            model dtype is FP8_E4M3 on ROCm.
         enforce_eager: Whether to enforce eager execution. If True, we will
             disable CUDA graph and always execute the model in eager mode.
             If False, we will use CUDA graph and eager execution in hybrid.
@@ -187,7 +182,6 @@ def compute_hash(self) -> str:
         factors.append(self.model)
         factors.append(self.dtype)
         factors.append(self.quantization)
-        factors.append(self.quantization_param_path)
         factors.append(self.revision)
         factors.append(self.code_revision)
         factors.append(self.trust_remote_code)
@@ -213,7 +207,6 @@ def __init__(
         max_model_len: Optional[int] = None,
         spec_target_max_model_len: Optional[int] = None,
         quantization: Optional[str] = None,
-        quantization_param_path: Optional[str] = None,
         enforce_eager: Optional[bool] = None,
         max_seq_len_to_capture: Optional[int] = None,
         max_logprobs: int = 20,
@@ -274,7 +267,6 @@ def __init__(
         else:
             self.tokenizer_revision = tokenizer_revision
         self.quantization = quantization
-        self.quantization_param_path = quantization_param_path
         self.enforce_eager = enforce_eager
         self.max_seq_len_to_capture = max_seq_len_to_capture
         self.max_logprobs = max_logprobs
@@ -1002,6 +994,7 @@ def __init__(
         sliding_window: Optional[int] = None,
         enable_prefix_caching: bool = False,
         cpu_offload_gb: float = 0,
+        calculate_kv_scales: Optional[bool] = None,
     ) -> None:
         self.block_size = block_size
         self.gpu_memory_utilization = gpu_memory_utilization
@@ -1012,7 +1005,7 @@ def __init__(
         self.sliding_window = sliding_window
         self.enable_prefix_caching = enable_prefix_caching
         self.cpu_offload_gb = cpu_offload_gb
-
+        self.calculate_kv_scales = calculate_kv_scales
         self._verify_args()
         self._verify_cache_dtype()
         self._verify_prefix_caching()
@@ -1021,6 +1014,10 @@ def __init__(
         self.num_gpu_blocks: Optional[int] = None
         self.num_cpu_blocks: Optional[int] = None
 
+        # Set calculate_kv_scales to False if the value is unset.
+        if self.calculate_kv_scales is None:
+            self.calculate_kv_scales = False
+
     def metrics_info(self):
         # convert cache_config to dict(key: str, value: str) for prometheus
         # metrics info
@@ -3297,7 +3294,6 @@ def __str__(self):
             f"quantization={self.model_config.quantization}, "
             f"enforce_eager={self.model_config.enforce_eager}, "
             f"kv_cache_dtype={self.cache_config.cache_dtype}, "
-            f"quantization_param_path={self.model_config.quantization_param_path},"
             f" device_config={self.device_config.device}, "
             f"decoding_config={self.decoding_config!r}, "
             f"observability_config={self.observability_config!r}, "
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index f58c1b55e0c70..5d3aeb68ebcfe 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -98,7 +98,6 @@ class EngineArgs:
     config_format: ConfigFormat = ConfigFormat.AUTO
     dtype: str = 'auto'
     kv_cache_dtype: str = 'auto'
-    quantization_param_path: Optional[str] = None
     seed: int = 0
     max_model_len: Optional[int] = None
     worker_use_ray: bool = False
@@ -199,6 +198,8 @@ class EngineArgs:
     generation_config: Optional[str] = None
     enable_sleep_mode: bool = False
 
+    calculate_kv_scales: Optional[bool] = None
+
     def __post_init__(self):
         if not self.tokenizer:
             self.tokenizer = self.model
@@ -350,17 +351,6 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             help='Data type for kv cache storage. If "auto", will use model '
             'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
             'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)')
-        parser.add_argument(
-            '--quantization-param-path',
-            type=nullable_str,
-            default=None,
-            help='Path to the JSON file containing the KV cache '
-            'scaling factors. This should generally be supplied, when '
-            'KV cache dtype is FP8. Otherwise, KV cache scaling factors '
-            'default to 1.0, which may cause accuracy issues. '
-            'FP8_E5M2 (without scaling) is only supported on cuda version '
-            'greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead '
-            'supported for common inference criteria.')
         parser.add_argument('--max-model-len',
                             type=int,
                             default=EngineArgs.max_model_len,
@@ -962,6 +952,15 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
                             help="Enable sleep mode for the engine. "
                             "(only cuda platform is supported)")
 
+        parser.add_argument(
+            '--calculate-kv-scales',
+            action='store_true',
+            help='This enables dynamic calculation of '
+            'k_scale and v_scale when kv-cache-dtype is fp8. '
+            'If calculate-kv-scales is false, the scales will '
+            'be loaded from the model checkpoint if available. '
+            'Otherwise, the scales will default to 1.0.')
+
         return parser
 
     @classmethod
@@ -991,7 +990,6 @@ def create_model_config(self) -> ModelConfig:
             tokenizer_revision=self.tokenizer_revision,
             max_model_len=self.max_model_len,
             quantization=self.quantization,
-            quantization_param_path=self.quantization_param_path,
             enforce_eager=self.enforce_eager,
             max_seq_len_to_capture=self.max_seq_len_to_capture,
             max_logprobs=self.max_logprobs,
@@ -1068,6 +1066,7 @@ def create_engine_config(self,
             sliding_window=model_config.get_sliding_window(),
             enable_prefix_caching=self.enable_prefix_caching,
             cpu_offload_gb=self.cpu_offload_gb,
+            calculate_kv_scales=self.calculate_kv_scales,
         )
         parallel_config = ParallelConfig(
             pipeline_parallel_size=self.pipeline_parallel_size,
diff --git a/vllm/envs.py b/vllm/envs.py
index b72e9141ac792..8627caec7790d 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -73,6 +73,8 @@
     VLLM_ENABLE_V1_MULTIPROCESSING: bool = True
     VLLM_LOG_BATCHSIZE_INTERVAL: float = -1
     VLLM_DISABLE_COMPILE_CACHE: bool = False
+    K_SCALE_CONSTANT: int = 200
+    V_SCALE_CONSTANT: int = 100
     VLLM_SERVER_DEV_MODE: bool = False
     VLLM_V1_OUTPUT_PROC_CHUNK_SIZE: int = 128
 
@@ -474,6 +476,13 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]:
     "VLLM_USE_V1":
     lambda: bool(int(os.getenv("VLLM_USE_V1", "0"))),
 
+    # Divisor for dynamic key scale factor calculation for FP8 KV Cache
+    "K_SCALE_CONSTANT":
+    lambda: int(os.getenv("K_SCALE_CONSTANT", "200")),
+
+    # Divisor for dynamic value scale factor calculation for FP8 KV Cache
+    "V_SCALE_CONSTANT":
+    lambda: int(os.getenv("V_SCALE_CONSTANT", "100")),
     # If set, enable multiprocessing in LLM for the V1 code path.
     "VLLM_ENABLE_V1_MULTIPROCESSING":
     lambda: bool(int(os.getenv("VLLM_ENABLE_V1_MULTIPROCESSING", "1"))),
diff --git a/vllm/model_executor/layers/quantization/kv_cache.py b/vllm/model_executor/layers/quantization/kv_cache.py
index a74f5415c8a51..e1870c73cc932 100644
--- a/vllm/model_executor/layers/quantization/kv_cache.py
+++ b/vllm/model_executor/layers/quantization/kv_cache.py
@@ -3,6 +3,7 @@
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig, QuantizeMethodBase)
+from vllm.platforms import current_platform
 
 logger = init_logger(__name__)
 
@@ -40,11 +41,16 @@ def apply(self, layer: torch.nn.Module) -> torch.Tensor:
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         # If the kv-cache dtype is auto, we enforce the k/v_scale to be 1.0
         # regardless whether the kv-scale is available in the checkpoint.
-        if layer.kv_cache_dtype != "auto":
+        # No need to process kv scales after loading if we are going to
+        # calculate them on the fly.
+        if layer.kv_cache_dtype != "auto" and not layer.calculate_kv_scales:
             if layer.k_scale > 0.0 and layer.v_scale > 0.0:
                 # We prefer to use separate k_scale and v_scale if present
                 k_scale = layer.k_scale.to("cpu").tolist()
                 v_scale = layer.v_scale.to("cpu").tolist()
+                if current_platform.is_rocm():
+                    k_scale *= 2
+                    v_scale *= 2
             elif layer.k_scale < 0.0 and layer.v_scale < 0.0:
                 # If no scales were loaded (both scales are invalid negative
                 # values), use the default value of 1.0
@@ -58,6 +64,9 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
                 scale_to_duplicate = max(layer.k_scale, layer.v_scale)
                 k_scale = scale_to_duplicate.to("cpu").tolist()
                 v_scale = scale_to_duplicate.to("cpu").tolist()
+                if current_platform.is_rocm():
+                    k_scale *= 2
+                    v_scale *= 2
 
             if not isinstance(k_scale, float) or not isinstance(
                     v_scale, float):
@@ -65,9 +74,11 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
                                  "for fp8 KV cache")
 
             # These are used in the final Attention.forward()
-            layer._k_scale = k_scale
-            layer._v_scale = v_scale
-            if (layer._k_scale == 1.0 and layer._v_scale == 1.0
+            layer._k_scale.copy_(k_scale)
+            layer._v_scale.copy_(v_scale)
+            layer._k_scale_float = k_scale
+            layer._v_scale_float = v_scale
+            if (k_scale == 1.0 and v_scale == 1.0
                     and "e5m2" not in layer.kv_cache_dtype):
                 logger.warning_once(
                     "Using KV cache scaling factor 1.0 for fp8_e4m3. This "
diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
index 9cfcdbf620d2b..b70407221312a 100644
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -6,8 +6,7 @@
 import os
 import tempfile
 from collections import defaultdict
-from typing import (Any, Callable, Dict, Generator, Iterable, List, Optional,
-                    Tuple, Union)
+from typing import Any, Callable, Dict, Generator, List, Optional, Tuple, Union
 
 import filelock
 import gguf
@@ -23,7 +22,6 @@
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization import (QuantizationConfig,
                                                      get_quantization_config)
-from vllm.model_executor.layers.quantization.schema import QuantParamSchema
 from vllm.platforms import current_platform
 from vllm.utils import PlaceholderModule
 
@@ -496,47 +494,6 @@ def gguf_quant_weights_iterator(
             yield name, param
 
 
-def kv_cache_scales_loader(
-        filename: str, tp_rank: int, tp_size: int, num_hidden_layers: int,
-        model_type: Optional[str]) -> Iterable[Tuple[int, float]]:
-    """
-    A simple utility to read in KV cache scaling factors that have been
-    previously serialized to disk. Used by the model to populate the appropriate
-    KV cache scaling factors. The serialization should represent a dictionary
-    whose keys are the TP ranks and values are another dictionary mapping layers
-    to their KV cache scaling factors.
-    Keep this function in sync with the output of
-    examples/other/fp8/extract_scales.py
-    """
-    try:
-        with open(filename) as f:
-            context = {
-                "model_type": model_type,
-                "num_hidden_layers": num_hidden_layers,
-                "tp_rank": tp_rank,
-                "tp_size": tp_size,
-            }
-            schema_dct = json.load(f)
-            schema = QuantParamSchema.model_validate(schema_dct,
-                                                     context=context)
-            layer_scales_map = schema.kv_cache.scaling_factor[tp_rank]
-            return layer_scales_map.items()
-
-    except FileNotFoundError:
-        logger.error("File or directory '%s' not found.", filename)
-    except json.JSONDecodeError:
-        logger.error("Error decoding JSON in file '%s'.", filename)
-    except Exception:
-        logger.exception("An error occurred while reading '%s'.", filename)
-    # This section is reached if and only if any of the excepts are hit
-    # Return an empty iterable (list) => no KV cache scales are loaded
-    # which ultimately defaults to 1.0 scales
-    logger.warning(
-        "Defaulting to KV cache scaling factors = 1.0 for all "
-        "layers in TP rank %d as an error occurred during loading.", tp_rank)
-    return []
-
-
 def convert_pyslice_to_tensor(x: Any) -> torch.Tensor:
     """convert PySafeSlice object from safetensors to torch.Tensor
 
diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py
index eab3bf0756fca..bc3295da7b60a 100644
--- a/vllm/model_executor/models/exaone.py
+++ b/vllm/model_executor/models/exaone.py
@@ -30,8 +30,7 @@
 from vllm.attention import Attention, AttentionMetadata
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
-from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank,
-                              get_tensor_model_parallel_world_size)
+from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
@@ -44,9 +43,8 @@
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import (
-    default_weight_loader, kv_cache_scales_loader, maybe_remap_kv_scale_name)
+    default_weight_loader, maybe_remap_kv_scale_name)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.platforms import current_platform
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.configs.exaone import ExaoneConfig
 
@@ -576,32 +574,3 @@ def load_weights(self, weights: Iterable[Tuple[str,
                 weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
-
-    # If this function is called, it should always initialize KV cache scale
-    # factors (or else raise an exception). Thus, handled exceptions should
-    # make sure to leave KV cache scale factors in a known good (dummy) state
-    def load_kv_cache_scales(self, quantization_param_path: str) -> None:
-        tp_size = get_tensor_model_parallel_world_size()
-        tp_rank = get_tensor_model_parallel_rank()
-        for layer_idx, scaling_factor in kv_cache_scales_loader(
-                quantization_param_path,
-                tp_rank,
-                tp_size,
-                self.config.num_hidden_layers,
-                self.config.__class__.model_type,
-        ):
-            if not isinstance(self.transformer.h[layer_idx], nn.Identity):
-                layer_self_attn = self.transformer.h[layer_idx].attn
-
-            if current_platform.is_rocm():
-                # The scaling factor convention we are assuming is
-                # quantized_value * scaling_factor ~= true_value
-                # which is consistent with the practice of setting
-                # scaling_factor = tensor_amax / FPtype_max
-                scaling_factor *= 2
-            if hasattr(layer_self_attn.attn, "_k_scale"):
-                layer_self_attn.attn._k_scale = scaling_factor
-                layer_self_attn.attn._v_scale = scaling_factor
-            else:
-                raise RuntimeError("Self attention has no KV cache scaling "
-                                   "factor attribute!")
diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py
index ddd2d7a16b242..543b4e2f5e286 100644
--- a/vllm/model_executor/models/granite.py
+++ b/vllm/model_executor/models/granite.py
@@ -29,8 +29,7 @@
 from vllm.attention import Attention, AttentionMetadata
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
-from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank,
-                              get_tensor_model_parallel_world_size)
+from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
@@ -44,9 +43,8 @@
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import (
-    default_weight_loader, kv_cache_scales_loader, maybe_remap_kv_scale_name)
+    default_weight_loader, maybe_remap_kv_scale_name)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.platforms import current_platform
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsLoRA, SupportsPP
@@ -518,29 +516,3 @@ def load_weights(self, weights: Iterable[Tuple[str,
                 weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
-
-    # If this function is called, it should always initialize KV cache scale
-    # factors (or else raise an exception). Thus, handled exceptions should
-    # make sure to leave KV cache scale factors in a known good (dummy) state
-    def load_kv_cache_scales(self, quantization_param_path: str) -> None:
-        tp_size = get_tensor_model_parallel_world_size()
-        tp_rank = get_tensor_model_parallel_rank()
-        for layer_idx, scaling_factor in kv_cache_scales_loader(
-                quantization_param_path, tp_rank, tp_size,
-                self.config.num_hidden_layers,
-                self.config.__class__.model_type):
-            if not isinstance(self.model.layers[layer_idx], nn.Identity):
-                layer_self_attn = self.model.layers[layer_idx].self_attn
-
-            if current_platform.is_rocm():
-                # The scaling factor convention we are assuming is
-                # quantized_value * scaling_factor ~= true_value
-                # which is consistent with the practice of setting
-                # scaling_factor = tensor_amax / FPtype_max
-                scaling_factor *= 2
-            if hasattr(layer_self_attn.attn, "_k_scale"):
-                layer_self_attn.attn._k_scale = scaling_factor
-                layer_self_attn.attn._v_scale = scaling_factor
-            else:
-                raise RuntimeError("Self attention has no KV cache scaling "
-                                   "factor attribute!")
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index a5bd418801f2c..e214c30f5d60b 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -29,8 +29,7 @@
 from vllm.attention import Attention, AttentionMetadata
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
-from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank,
-                              get_tensor_model_parallel_world_size)
+from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
@@ -43,9 +42,8 @@
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import (
-    default_weight_loader, kv_cache_scales_loader, maybe_remap_kv_scale_name)
+    default_weight_loader, maybe_remap_kv_scale_name)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.platforms import current_platform
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsLoRA, SupportsPP
@@ -440,32 +438,6 @@ def load_weights(self, weights: Iterable[Tuple[str,
             loaded_params.add(name)
         return loaded_params
 
-    # If this function is called, it should always initialize KV cache scale
-    # factors (or else raise an exception). Thus, handled exceptions should
-    # make sure to leave KV cache scale factors in a known good (dummy) state
-    def load_kv_cache_scales(self, quantization_param_path: str) -> None:
-        tp_size = get_tensor_model_parallel_world_size()
-        tp_rank = get_tensor_model_parallel_rank()
-        for layer_idx, scaling_factor in kv_cache_scales_loader(
-                quantization_param_path, tp_rank, tp_size,
-                self.config.num_hidden_layers,
-                self.config.__class__.model_type):
-            if not isinstance(self.layers[layer_idx], nn.Identity):
-                layer_self_attn = self.layers[layer_idx].self_attn
-
-            if current_platform.is_rocm():
-                # The scaling factor convention we are assuming is
-                # quantized_value * scaling_factor ~= true_value
-                # which is consistent with the practice of setting
-                # scaling_factor = tensor_amax / FPtype_max
-                scaling_factor *= 2
-            if hasattr(layer_self_attn.attn, "_k_scale"):
-                layer_self_attn.attn._k_scale = scaling_factor
-                layer_self_attn.attn._v_scale = scaling_factor
-            else:
-                raise RuntimeError("Self attention has no KV cache scaling "
-                                   "factor attribute!")
-
 
 class LlamaForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
     packed_modules_mapping = {
@@ -593,9 +565,6 @@ def load_weights(self, weights: Iterable[Tuple[str,
             self.maybe_remap_mistral(name, loaded_weight)
             for name, loaded_weight in weights)
 
-    def load_kv_cache_scales(self, quantization_param_path: str) -> None:
-        self.model.load_kv_cache_scales(quantization_param_path)
-
     # This function is used to remap the mistral format as
     # used by Mistral and Llama <=2
     def maybe_remap_mistral(
diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py
index 2554281610a30..61baa8e588d74 100644
--- a/vllm/model_executor/models/mllama.py
+++ b/vllm/model_executor/models/mllama.py
@@ -831,6 +831,7 @@ def _attention_with_mask(
     ) -> torch.Tensor:
         # Skip writing kv-cache for the initial profiling run.
         if len(kv_cache.shape) > 1:
+            i = torch.ones(1, dtype=torch.float32)
             if self.attn.backend in (_Backend.FLASH_ATTN,
                                      _Backend.FLASH_ATTN_VLLM_V1):
                 cached_k = torch.cat([k[s:e] for s, e in kv_range_for_decode])
@@ -843,8 +844,8 @@ def _attention_with_mask(
                     attn_metadata.
                     cross_slot_mapping,  # type: ignore[union-attr]
                     "auto",
-                    1.0,
-                    1.0,
+                    i,
+                    i,
                 )
             elif self.attn.backend in (_Backend.XFORMERS, _Backend.TORCH_SDPA):
                 key_cache, value_cache = PagedAttention.split_kv_cache(
@@ -853,7 +854,7 @@ def _attention_with_mask(
                 cached_v = torch.cat([v[s:e] for s, e in kv_range_for_decode])
                 PagedAttention.write_to_paged_cache(
                     cached_k, cached_v, key_cache, value_cache,
-                    attn_metadata.cross_slot_mapping, "auto", 1.0, 1.0)
+                    attn_metadata.cross_slot_mapping, "auto", i, i)
             else:
                 raise ValueError(
                     f"Unsupported Attention backend {self.attn.backend} "
diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py
index 37c5a4b5713b8..e6d919f23c85d 100644
--- a/vllm/model_executor/models/solar.py
+++ b/vllm/model_executor/models/solar.py
@@ -30,8 +30,7 @@
 from vllm.attention import Attention, AttentionMetadata
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
-from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank,
-                              get_tensor_model_parallel_world_size)
+from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
@@ -44,9 +43,8 @@
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import (
-    default_weight_loader, kv_cache_scales_loader, maybe_remap_kv_scale_name)
+    default_weight_loader, maybe_remap_kv_scale_name)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.platforms import current_platform
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsLoRA, SupportsPP
@@ -535,32 +533,3 @@ def load_weights(self, weights: Iterable[Tuple[str,
                 weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
-
-    # If this function is called, it should always initialize KV cache scale
-    # factors (or else raise an exception). Thus, handled exceptions should
-    # make sure to leave KV cache scale factors in a known good (dummy) state
-    def load_kv_cache_scales(self, quantization_param_path: str) -> None:
-        tp_size = get_tensor_model_parallel_world_size()
-        tp_rank = get_tensor_model_parallel_rank()
-        for layer_idx, scaling_factor in kv_cache_scales_loader(
-                quantization_param_path,
-                tp_rank,
-                tp_size,
-                self.config.num_hidden_layers,
-                self.config.__class__.model_type,
-        ):
-            if not isinstance(self.model.layers[layer_idx], nn.Identity):
-                layer_self_attn = self.model.layers[layer_idx].self_attn
-
-            if current_platform.is_rocm():
-                # The scaling factor convention we are assuming is
-                # quantized_value * scaling_factor ~= true_value
-                # which is consistent with the practice of setting
-                # scaling_factor = tensor_amax / FPtype_max
-                scaling_factor *= 2
-            if hasattr(layer_self_attn.attn, "_k_scale"):
-                layer_self_attn.attn._k_scale = scaling_factor
-                layer_self_attn.attn._v_scale = scaling_factor
-            else:
-                raise RuntimeError("Self attention has no KV cache scaling "
-                                   "factor attribute!")
diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py
index 1806fec8833a3..7fe9b3a8f595a 100644
--- a/vllm/v1/attention/backends/flash_attn.py
+++ b/vllm/v1/attention/backends/flash_attn.py
@@ -166,10 +166,6 @@ def forward(
         Returns:
             shape = [num_tokens, num_heads * head_size]
         """
-        # NOTE(woosuk): FlashAttention does not support FP8 KV cache.
-        assert layer._k_scale == 1.0 and layer._v_scale == 1.0, (
-            "key/v_scale is not supported in FlashAttention.")
-
         assert output is not None, "Output tensor must be provided."
 
         if attn_metadata is None:
diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index 4c8f69e449393..a339c97a8383c 100644
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -903,7 +903,8 @@ def _prepare_prompt(
             num_decode_tokens=0,
             slot_mapping=slot_mapping,
             multi_modal_placeholder_index_maps=
-            None  # FIXME(kzawora): mutli-modality will not work here
+            None,  # FIXME(kzawora): mutli-modality will not work here
+            enable_kv_scales_calculation=False,
         )
         multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list)
 
@@ -1057,7 +1058,9 @@ def _prepare_decode(
             num_prefill_tokens=0,
             num_decode_tokens=num_decode_tokens,
             slot_mapping=slot_mapping,
-            multi_modal_placeholder_index_maps=None)
+            multi_modal_placeholder_index_maps=None,
+            enable_kv_scales_calculation=False,
+        )
         return PrepareDecodeMetadata(input_tokens=input_tokens,
                                      input_positions=input_positions,
                                      attn_metadata=attn_metadata,
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index fe504821a0d96..cf2f1c6b3b877 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -3,7 +3,6 @@
 import inspect
 import itertools
 import time
-import warnings
 import weakref
 from contextlib import contextmanager
 from dataclasses import dataclass
@@ -41,7 +40,6 @@
 from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs,
                              MultiModalKwargs, MultiModalPlaceholderMap,
                              MultiModalRegistry)
-from vllm.platforms import current_platform
 from vllm.prompt_adapter.layers import PromptAdapterMapping
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.prompt_adapter.worker_manager import (
@@ -1151,34 +1149,6 @@ def load_model(self) -> None:
                 self.prompt_adapter_manager.create_prompt_adapter_manager(
                     self.model))
 
-        if self.kv_cache_dtype == "fp8" and (current_platform.is_rocm()
-                                             or current_platform.is_cuda()):
-            # Currently only ROCm accepts kv-cache scaling factors
-            # via quantization_param_path and this will be deprecated
-            # in the future.
-            if self.model_config.quantization_param_path is not None:
-                if callable(getattr(self.model, "load_kv_cache_scales", None)):
-                    warnings.warn(
-                        "Loading kv cache scaling factor from JSON is "
-                        "deprecated and will be removed. Please include "
-                        "kv cache scaling factors in the model checkpoint.",
-                        FutureWarning,
-                        stacklevel=2)
-                    self.model.load_kv_cache_scales(
-                        self.model_config.quantization_param_path)
-                    logger.info("Loaded KV cache scaling factors from %s",
-                                self.model_config.quantization_param_path)
-                else:
-                    raise RuntimeError(
-                        "Using FP8 KV cache and scaling factors provided but "
-                        "model %s does not support loading scaling factors.",
-                        self.model.__class__)
-            else:
-                logger.warning(
-                    "Using FP8 KV cache but no scaling factors "
-                    "provided. Defaulting to scaling factors of 1.0. "
-                    "This may lead to less accurate results!")
-
         if self.vllm_config.compilation_config.level ==\
             CompilationLevel.DYNAMO_AS_IS and supports_dynamo():
             backend = self.vllm_config.compilation_config.init_backend(
@@ -1366,6 +1336,10 @@ def _dummy_run(self,
                     dtype=self.model_config.dtype,
                     device=self.device)
 
+            # Disable KV Scale Calculation for dummy data during profile run
+            if model_input.attn_metadata is not None:
+                model_input.attn_metadata.enable_kv_scales_calculation = False
+
             self.execute_model(model_input, kv_caches, intermediate_tensors)
             torch.cuda.synchronize()
             return
@@ -1510,7 +1484,8 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None:
                             batch_size,
                             is_encoder_decoder_model=self.model_config.
                             is_encoder_decoder))
-
+                    # Disable KV Scale Calculation for graph capture
+                    attn_metadata.enable_kv_scales_calculation = False
                     if self.lora_config:
                         lora_mapping = LoRAMapping(
                             **dict(index_mapping=[0] * batch_size,
diff --git a/vllm/worker/openvino_model_runner.py b/vllm/worker/openvino_model_runner.py
index 9d0a759ca2f21..42fe2cf668ad8 100644
--- a/vllm/worker/openvino_model_runner.py
+++ b/vllm/worker/openvino_model_runner.py
@@ -282,6 +282,7 @@ def _prepare_model_input(
             block_indices_begins=block_indices_begins_tensor,
             max_context_len=max_context_len_tensor,
             multi_modal_placeholder_index_maps=placeholder_index_maps,
+            enable_kv_scales_calculation=False,
         )
 
         multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list)
diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py
index f5c7bc955a673..a3f648f4cc645 100644
--- a/vllm/worker/tpu_model_runner.py
+++ b/vllm/worker/tpu_model_runner.py
@@ -190,6 +190,7 @@ def _dummy_run(
                     num_decode_tokens=0,
                     slot_mapping=slot_mapping,
                     multi_modal_placeholder_index_maps=None,
+                    enable_kv_scales_calculation=False,
                     block_tables=None,
                     context_lens=None,
                     effective_query_lens=None,
@@ -208,6 +209,7 @@ def _dummy_run(
                     num_decode_tokens=0,
                     slot_mapping=slot_mapping,
                     multi_modal_placeholder_index_maps=None,
+                    enable_kv_scales_calculation=False,
                     block_tables=block_tables,
                     context_lens=context_lens,
                     effective_query_lens=effective_query_lens,
@@ -239,6 +241,7 @@ def _dummy_run(
                 num_decode_tokens=batch_size * seq_len,
                 slot_mapping=slot_mapping,
                 multi_modal_placeholder_index_maps=None,
+                enable_kv_scales_calculation=False,
                 block_tables=block_tables,
                 context_lens=context_lens,
             )
@@ -425,6 +428,7 @@ def _prepare_prompt(
             num_decode_tokens=0,
             slot_mapping=slot_mapping,
             multi_modal_placeholder_index_maps=None,
+            enable_kv_scales_calculation=False,
             block_tables=block_tables,
             context_lens=context_lens,
             effective_query_lens=prompt_lens,
@@ -496,6 +500,7 @@ def _prepare_decode(
             num_decode_tokens=batch_size,
             slot_mapping=slot_mapping,
             multi_modal_placeholder_index_maps=None,
+            enable_kv_scales_calculation=False,
             block_tables=block_tables,
             context_lens=context_lens,
         )
diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py
index 053658d047311..b7b7b7227b22c 100644
--- a/vllm/worker/xpu_model_runner.py
+++ b/vllm/worker/xpu_model_runner.py
@@ -261,6 +261,7 @@ def _prepare_prompt(
             is_prompt=True,
             slot_mapping=slot_mapping,
             multi_modal_placeholder_index_maps=placeholder_index_maps,
+            enable_kv_scales_calculation=False,
             seq_lens=seq_lens,
             seqlen_q=seqlen_q,
             max_seqlen=max_seqlen,
@@ -345,6 +346,7 @@ def _prepare_decode(
             is_prompt=False,
             slot_mapping=slot_mapping,
             multi_modal_placeholder_index_maps=None,
+            enable_kv_scales_calculation=False,
             seq_lens=seq_lens,
             seqlen_q=torch.tensor([]),
             max_seqlen=0,

From 2c85529bfc7bd0068f2395188bbf54c9adb01422 Mon Sep 17 00:00:00 2001
From: Siyuan Liu <lsiyuan@google.com>
Date: Thu, 23 Jan 2025 10:50:16 -0800
Subject: [PATCH 354/462] [TPU] Update TPU CI to use torchxla nightly on
 20250122 (#12334)

Signed-off-by: Siyuan Liu <lsiyuan@google.com>
---
 Dockerfile.tpu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile.tpu b/Dockerfile.tpu
index b617932a85b47..ee0d94d98e82b 100644
--- a/Dockerfile.tpu
+++ b/Dockerfile.tpu
@@ -1,4 +1,4 @@
-ARG NIGHTLY_DATE="20241017"
+ARG NIGHTLY_DATE="20250122"
 ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_$NIGHTLY_DATE"
 
 FROM $BASE_IMAGE

From 2cbeedad09157d9438bfe26a3ae06f1fe070118b Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Fri, 24 Jan 2025 03:18:51 +0800
Subject: [PATCH 355/462] [Docs] Document Phi-4 support (#12362)

Signed-off-by: Isotr0py <2037008807@qq.com>
---
 docs/source/models/supported_models.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md
index 3da5aaf713c1f..8cdc663a0320f 100644
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -302,8 +302,8 @@ See [this page](#generative-models) for more information on how to use generativ
   - ✅︎
   - ✅︎
 * - `Phi3ForCausalLM`
-  - Phi-3
-  - `microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, `microsoft/Phi-3-medium-128k-instruct`, etc.
+  - Phi-4, Phi-3
+  - `microsoft/Phi-4`, `microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, `microsoft/Phi-3-medium-128k-instruct`, etc.
   - ✅︎
   - ✅︎
 * - `Phi3SmallForCausalLM`

From eb5cb5e5280c83226408c3d1e30928364c6258db Mon Sep 17 00:00:00 2001
From: Dipika Sikka <dipikasikka1@gmail.com>
Date: Thu, 23 Jan 2025 16:40:33 -0500
Subject: [PATCH 356/462] [BugFix] Fix parameter names and
 `process_after_weight_loading` for W4A16 MoE Group Act Order  (#11528)

Signed-off-by: ElizaWszola <eliza@neuralmagic.com>
Co-authored-by: ElizaWszola <eliza@neuralmagic.com>
Co-authored-by: Michael Goin <michael@neuralmagic.com>
---
 vllm/model_executor/layers/fused_moe/layer.py |  94 +++++++----
 .../layers/quantization/awq_marlin.py         |  35 ++--
 .../compressed_tensors_moe.py                 | 153 ++++++++++++------
 .../schemes/compressed_tensors_w4a16_24.py    |   2 +-
 .../layers/quantization/experts_int8.py       |  27 ++--
 .../model_executor/layers/quantization/fp8.py |  41 ++---
 .../layers/quantization/gptq_marlin.py        |  19 ++-
 .../layers/quantization/quark/quark_moe.py    |  20 +--
 8 files changed, 243 insertions(+), 148 deletions(-)

diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index 3d822fc0c7f99..da0ce1885dbb2 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -38,7 +38,7 @@ class FusedMoEMethodBase(QuantizeMethodBase):
 
     @abstractmethod
     def create_weights(self, layer: torch.nn.Module, num_experts: int,
-                       hidden_size: int, intermediate_size: int,
+                       hidden_size: int, intermediate_size_per_partition: int,
                        params_dtype: torch.dtype, **extra_weight_attrs):
         raise NotImplementedError
 
@@ -65,22 +65,24 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
     """MoE method without quantization."""
 
     def create_weights(self, layer: torch.nn.Module, num_experts: int,
-                       hidden_size: int, intermediate_size: int,
+                       hidden_size: int, intermediate_size_per_partition: int,
                        params_dtype: torch.dtype, **extra_weight_attrs):
         # Fused gate_up_proj (column parallel)
-        w13_weight = torch.nn.Parameter(torch.empty(num_experts,
-                                                    2 * intermediate_size,
-                                                    hidden_size,
-                                                    dtype=params_dtype),
+        w13_weight = torch.nn.Parameter(torch.empty(
+            num_experts,
+            2 * intermediate_size_per_partition,
+            hidden_size,
+            dtype=params_dtype),
                                         requires_grad=False)
         layer.register_parameter("w13_weight", w13_weight)
         set_weight_attrs(w13_weight, extra_weight_attrs)
 
         # down_proj (row parallel)
-        w2_weight = torch.nn.Parameter(torch.empty(num_experts,
-                                                   hidden_size,
-                                                   intermediate_size,
-                                                   dtype=params_dtype),
+        w2_weight = torch.nn.Parameter(torch.empty(
+            num_experts,
+            hidden_size,
+            intermediate_size_per_partition,
+            dtype=params_dtype),
                                        requires_grad=False)
         layer.register_parameter("w2_weight", w2_weight)
         set_weight_attrs(w2_weight, extra_weight_attrs)
@@ -289,13 +291,20 @@ def __init__(
             self.quant_method = quant_config.get_quant_method(self, prefix)
         assert self.quant_method is not None
 
-        self.quant_method.create_weights(
-            layer=self,
-            num_experts=num_experts,
-            hidden_size=hidden_size,
-            intermediate_size=self.intermediate_size_per_partition,
-            params_dtype=params_dtype,
-            weight_loader=self.weight_loader)
+        moe_quant_params = {
+            "num_experts": num_experts,
+            "hidden_size": hidden_size,
+            "intermediate_size_per_partition":
+            self.intermediate_size_per_partition,
+            "params_dtype": params_dtype,
+            "weight_loader": self.weight_loader,
+        }
+        # need full intermediate size pre-sharding for WNA16 act order
+        if (self.quant_method.__class__.__name__ ==
+                "CompressedTensorsWNA16MoEMethod"):
+            moe_quant_params["intermediate_size_full"] = intermediate_size
+
+        self.quant_method.create_weights(layer=self, **moe_quant_params)
 
     def _load_per_tensor_weight_scale(self, shard_id: str,
                                       param: torch.nn.Parameter,
@@ -312,19 +321,30 @@ def _load_per_tensor_weight_scale(self, shard_id: str,
         elif shard_id == "w2":
             param_data[expert_id] = loaded_weight
 
-    def _load_model_weight_or_group_weight_scale(self, shard_dim: int,
+    def _load_model_weight_or_group_weight_scale(self,
+                                                 shard_dim: int,
                                                  expert_data: torch.Tensor,
                                                  shard_id: str,
                                                  loaded_weight: torch.Tensor,
-                                                 tp_rank: int):
-        # Load grouped weight scales for group quantization
-        # or model weights
+                                                 tp_rank: int,
+                                                 load_full_w2: bool = False):
+        """
+        Load grouped weight scales for group quantization or model weights
+            :param shard_dim: dimension to shard
+            :param expert_data: parameter for a particular expert
+            :param shard_id: either w1, w2, or w3
+            :param loaded_weight: checkpoint weight to load into the param
+            :param tp_rank: tensor parallel rank
+            :param load_full_w2: whether or not the w2 loaded should be sharded.
+        """
         if shard_id == "w2":
-            self._load_w2(shard_id=shard_id,
-                          shard_dim=shard_dim,
+            # In the case where we have actorder/g_idx, we do not partition the
+            # w2 scales, as indicated by `load_full` argument, for all tp cases
+            self._load_w2(shard_dim=shard_dim,
                           loaded_weight=loaded_weight,
                           expert_data=expert_data,
-                          tp_rank=tp_rank)
+                          tp_rank=tp_rank,
+                          load_full=load_full_w2)
         elif shard_id in ("w1", "w3"):
             self._load_w13(shard_id=shard_id,
                            shard_dim=shard_dim,
@@ -364,15 +384,21 @@ def _load_w13(self, expert_data: torch.Tensor, shard_dim: int,
             expert_data = expert_data.narrow(shard_dim, shard_size, shard_size)
         expert_data.copy_(loaded_weight)
 
-    def _load_w2(self, expert_data: torch.Tensor, shard_dim: int,
-                 shard_id: str, loaded_weight: torch.Tensor, tp_rank: int):
+    def _load_w2(self,
+                 expert_data: torch.Tensor,
+                 shard_dim: int,
+                 loaded_weight: torch.Tensor,
+                 tp_rank: int,
+                 load_full: bool = False):
 
         # Index the loaded weight for tp sharding.
         # down_proj: "RowParallel" so tp sharding on input_dim
         # Narrow parameter and load.
         shard_size = expert_data.shape[shard_dim]
-        loaded_weight = loaded_weight.narrow(shard_dim, shard_size * tp_rank,
-                                             shard_size)
+        if not load_full:
+            loaded_weight = loaded_weight.narrow(shard_dim,
+                                                 shard_size * tp_rank,
+                                                 shard_size)
         # w2, down_proj: Load into only logical weight of w2.
         expert_data.copy_(loaded_weight)
 
@@ -387,8 +413,7 @@ def _load_g_idx(self, shard_id: str, expert_data: torch.Tensor,
                     shard_dim: int, loaded_weight: torch.Tensor, tp_rank: int):
 
         if shard_id == "w2":
-            self._load_w2(shard_id=shard_id,
-                          shard_dim=shard_dim,
+            self._load_w2(shard_dim=shard_dim,
                           loaded_weight=loaded_weight,
                           expert_data=expert_data,
                           tp_rank=tp_rank)
@@ -416,7 +441,7 @@ def weight_loader(self, param: torch.nn.Parameter,
         ]
         # Fetch the dim to shard the parameter/loaded weight
         # based on the shard id. This will be whatever
-        # dimension intermediate_size is used.
+        # dimension intermediate_size_per_partition is used.
         SHARD_ID_TO_SHARDED_DIM = {"w1": 0, "w2": 1, "w3": 0}
 
         expert_data = param.data[expert_id]
@@ -424,11 +449,11 @@ def weight_loader(self, param: torch.nn.Parameter,
 
         # is_transposed: if the dim to shard the weight
         # should be flipped. Required by GPTQ, compressed-tensors
-        # should be whatever dimension intermediate_size is
+        # should be whatever dimension intermediate_size_per_partition is
         is_transposed = getattr(param, "is_transposed", False)
         shard_dim = SHARD_ID_TO_SHARDED_DIM[shard_id]
         if is_transposed:
-            shard_dim = ~shard_dim
+            shard_dim = int(not shard_dim)
 
         # Case input scale: input_scale loading is only supported for fp8
         if "input_scale" in weight_name:
@@ -480,7 +505,8 @@ def weight_loader(self, param: torch.nn.Parameter,
                     shard_dim=shard_dim,
                     loaded_weight=loaded_weight,
                     expert_data=expert_data,
-                    tp_rank=tp_rank)
+                    tp_rank=tp_rank,
+                    load_full_w2=getattr(param, "load_full_w2", False))
             elif quant_method == FusedMoeWeightScaleSupported.TENSOR.value:
                 self._load_per_tensor_weight_scale(shard_id=shard_id,
                                                    param=param,
diff --git a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py
index c28fd0c6737e0..0c3c9816878e9 100644
--- a/vllm/model_executor/layers/quantization/awq_marlin.py
+++ b/vllm/model_executor/layers/quantization/awq_marlin.py
@@ -303,7 +303,7 @@ def __init__(self, quant_config: AWQMarlinConfig):
         self.quant_config = quant_config
 
     def create_weights(self, layer: torch.nn.Module, num_experts: int,
-                       hidden_size: int, intermediate_size: int,
+                       hidden_size: int, intermediate_size_per_partition: int,
                        params_dtype: torch.dtype, **extra_weight_attrs):
         extra_weight_attrs.update({
             "is_transposed":
@@ -312,17 +312,18 @@ def create_weights(self, layer: torch.nn.Module, num_experts: int,
             FusedMoeWeightScaleSupported.GROUP.value,
         })
 
-        w13_qweight = Parameter(torch.empty(num_experts,
-                                            hidden_size,
-                                            2 * intermediate_size //
-                                            self.quant_config.pack_factor,
-                                            dtype=torch.int32),
-                                requires_grad=False)
+        w13_qweight = Parameter(
+            torch.empty(num_experts,
+                        hidden_size,
+                        2 * intermediate_size_per_partition //
+                        self.quant_config.pack_factor,
+                        dtype=torch.int32),
+            requires_grad=False)
         layer.register_parameter("w13_qweight", w13_qweight)
         set_weight_attrs(w13_qweight, extra_weight_attrs)
 
         w2_qweight = Parameter(torch.empty(num_experts,
-                                           intermediate_size,
+                                           intermediate_size_per_partition,
                                            hidden_size //
                                            self.quant_config.pack_factor,
                                            dtype=torch.int32),
@@ -331,13 +332,14 @@ def create_weights(self, layer: torch.nn.Module, num_experts: int,
         set_weight_attrs(w2_qweight, extra_weight_attrs)
 
         num_groups_w13 = hidden_size // self.quant_config.group_size
-        num_groups_w2 = intermediate_size // self.quant_config.group_size
+        num_groups_w2 = (intermediate_size_per_partition //
+                         self.quant_config.group_size)
 
         # WEIGHT_SCALES
         # Allocate 2 scales for w1 and w3 respectively.
         w13_scales = Parameter(torch.empty(num_experts,
                                            num_groups_w13,
-                                           intermediate_size * 2,
+                                           intermediate_size_per_partition * 2,
                                            dtype=params_dtype),
                                requires_grad=False)
         layer.register_parameter("w13_scales", w13_scales)
@@ -353,12 +355,13 @@ def create_weights(self, layer: torch.nn.Module, num_experts: int,
 
         # WEIGHT_ZERO_POINT
         # Allocate 2 zero points for w1 and w3 respectively.
-        w13_qzeros = Parameter(torch.empty(num_experts,
-                                           num_groups_w13,
-                                           2 * intermediate_size //
-                                           self.quant_config.pack_factor,
-                                           dtype=torch.int32),
-                               requires_grad=False)
+        w13_qzeros = Parameter(
+            torch.empty(num_experts,
+                        num_groups_w13,
+                        2 * intermediate_size_per_partition //
+                        self.quant_config.pack_factor,
+                        dtype=torch.int32),
+            requires_grad=False)
         layer.register_parameter("w13_qzeros", w13_qzeros)
         set_weight_attrs(w13_qzeros, extra_weight_attrs)
 
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index 4fb8fd84e92d4..e1c45f4e42e41 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -13,6 +13,7 @@
                                                   FusedMoeWeightScaleSupported)
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
     WNA16_SUPPORTED_BITS)
+from vllm.model_executor.layers.quantization.utils import replace_parameter
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
     all_close_1d, normalize_e4m3fn_to_e4m3fnuz, per_tensor_dequantize)
 from vllm.model_executor.utils import set_weight_attrs
@@ -75,24 +76,26 @@ def __init__(
         self.static_input_scales = not self.input_quant.dynamic
 
     def create_weights(self, layer: torch.nn.Module, num_experts: int,
-                       hidden_size: int, intermediate_size: int,
+                       hidden_size: int, intermediate_size_per_partition: int,
                        params_dtype: torch.dtype, **extra_weight_attrs):
 
         params_dtype = torch.float8_e4m3fn
 
         # WEIGHTS
-        w13_weight = torch.nn.Parameter(torch.empty(num_experts,
-                                                    2 * intermediate_size,
-                                                    hidden_size,
-                                                    dtype=params_dtype),
+        w13_weight = torch.nn.Parameter(torch.empty(
+            num_experts,
+            2 * intermediate_size_per_partition,
+            hidden_size,
+            dtype=params_dtype),
                                         requires_grad=False)
         layer.register_parameter("w13_weight", w13_weight)
         set_weight_attrs(w13_weight, extra_weight_attrs)
 
-        w2_weight = torch.nn.Parameter(torch.empty(num_experts,
-                                                   hidden_size,
-                                                   intermediate_size,
-                                                   dtype=params_dtype),
+        w2_weight = torch.nn.Parameter(torch.empty(
+            num_experts,
+            hidden_size,
+            intermediate_size_per_partition,
+            dtype=params_dtype),
                                        requires_grad=False)
         layer.register_parameter("w2_weight", w2_weight)
         set_weight_attrs(w2_weight, extra_weight_attrs)
@@ -254,6 +257,7 @@ def __init__(
         self.packed_factor = 32 // config.num_bits
         self.strategy = config.strategy
         self.group_size = config.group_size
+        self.actorder = config.actorder
         assert config.symmetric, (
             "Only symmetric quantization is supported for MoE")
 
@@ -266,9 +270,16 @@ def __init__(
                              f"{WNA16_SUPPORTED_BITS}")
 
     def create_weights(self, layer: torch.nn.Module, num_experts: int,
-                       hidden_size: int, intermediate_size: int,
+                       hidden_size: int, intermediate_size_per_partition: int,
                        params_dtype: torch.dtype, **extra_weight_attrs):
 
+        assert params_dtype == torch.float16, (
+            "float16 is required for MoE compressed models. Set dtype=torch.float16"  # noqa: E501
+        )
+
+        intermediate_size_full = extra_weight_attrs.pop(
+            "intermediate_size_full")
+
         # Will transpose the loaded weight along the
         # intermediate and hidden dim sizes. Will
         # shard for TP along the transposed dims
@@ -276,35 +287,45 @@ def create_weights(self, layer: torch.nn.Module, num_experts: int,
             "is_transposed": True,
             "quant_method": self.strategy
         })
-        w13_weight = torch.nn.Parameter(torch.empty(num_experts,
-                                                    hidden_size //
-                                                    self.packed_factor,
-                                                    2 * intermediate_size,
-                                                    dtype=torch.int32),
+        w13_weight = torch.nn.Parameter(torch.empty(
+            num_experts,
+            hidden_size // self.packed_factor,
+            2 * intermediate_size_per_partition,
+            dtype=torch.int32),
                                         requires_grad=False)
         layer.register_parameter("w13_weight_packed", w13_weight)
         set_weight_attrs(w13_weight, extra_weight_attrs)
 
-        w2_weight = torch.nn.Parameter(torch.empty(num_experts,
-                                                   intermediate_size //
-                                                   self.packed_factor,
-                                                   hidden_size,
-                                                   dtype=torch.int32),
+        w2_weight = torch.nn.Parameter(torch.empty(
+            num_experts,
+            intermediate_size_per_partition // self.packed_factor,
+            hidden_size,
+            dtype=torch.int32),
                                        requires_grad=False)
         layer.register_parameter("w2_weight_packed", w2_weight)
         set_weight_attrs(w2_weight, extra_weight_attrs)
 
+        # In the case where we have actorder/g_idx,
+        # we do not partition the w2 scales
+        load_full_w2 = self.actorder and self.group_size != -1
+        w2_scales_size = (intermediate_size_full
+                          if load_full_w2 else intermediate_size_per_partition)
+
+        self.is_k_full = (not self.actorder) or (
+            intermediate_size_per_partition == intermediate_size_full)
+
         if self.strategy == "channel":
             num_groups_w2 = num_groups_w13 = 1
             self.group_size = -1
         else:
-            num_groups_w2 = intermediate_size // self.group_size
+            num_groups_w2 = w2_scales_size // self.group_size
             num_groups_w13 = hidden_size // self.group_size
 
-        w13_scale = torch.nn.Parameter(torch.ones(num_experts,
-                                                  num_groups_w13,
-                                                  2 * intermediate_size,
-                                                  dtype=params_dtype),
+        w13_scale = torch.nn.Parameter(torch.ones(
+            num_experts,
+            num_groups_w13,
+            2 * intermediate_size_per_partition,
+            dtype=params_dtype),
                                        requires_grad=False)
         layer.register_parameter("w13_weight_scale", w13_scale)
         set_weight_attrs(w13_scale, extra_weight_attrs)
@@ -316,6 +337,7 @@ def create_weights(self, layer: torch.nn.Module, num_experts: int,
                                       requires_grad=False)
         layer.register_parameter("w2_weight_scale", w2_scale)
         set_weight_attrs(w2_scale, extra_weight_attrs)
+        set_weight_attrs(w2_scale, {"load_full_w2": load_full_w2})
 
         w2_weight_shape = torch.nn.Parameter(torch.empty(num_experts, 2),
                                              requires_grad=False)
@@ -335,18 +357,18 @@ def create_weights(self, layer: torch.nn.Module, num_experts: int,
             ),
             requires_grad=False,
         )
-        layer.register_parameter("w13_g_idx", w13_g_idx)
+        layer.register_parameter("w13_weight_g_idx", w13_g_idx)
         set_weight_attrs(w13_g_idx, extra_weight_attrs)
 
         w2_g_idx = torch.nn.Parameter(
             torch.empty(
                 num_experts,
-                intermediate_size,
+                intermediate_size_per_partition,
                 dtype=torch.int32,
             ),
             requires_grad=False,
         )
-        layer.register_parameter("w2_g_idx", w2_g_idx)
+        layer.register_parameter("w2_weight_g_idx", w2_g_idx)
         set_weight_attrs(w2_g_idx, extra_weight_attrs)
 
         w13_g_idx_sort_indices = torch.nn.Parameter(
@@ -364,7 +386,7 @@ def create_weights(self, layer: torch.nn.Module, num_experts: int,
         w2_g_idx_sort_indices = torch.nn.Parameter(
             torch.empty(
                 num_experts,
-                intermediate_size,
+                intermediate_size_per_partition,
                 dtype=torch.int32,
             ),
             requires_grad=False,
@@ -422,24 +444,55 @@ def marlin_moe_permute_scales(s: torch.Tensor, size_k: int,
         size_k2 = layer.w2_weight_packed.shape[2]
         size_k13 = layer.w13_weight_packed.shape[2]
 
-        num_experts = layer.w13_g_idx.shape[0]
-        device = layer.w13_g_idx.device
-        layer.w13_g_idx = torch.nn.Parameter(
-            torch.empty((num_experts, 0), dtype=torch.int32, device=device),
-            requires_grad=False,
-        )
-        layer.w2_g_idx = torch.nn.Parameter(
-            torch.empty((num_experts, 0), dtype=torch.int32, device=device),
-            requires_grad=False,
-        )
-        layer.w13_g_idx_sort_indices = torch.nn.Parameter(
-            torch.empty((num_experts, 0), dtype=torch.int32, device=device),
-            requires_grad=False,
-        )
-        layer.w2_g_idx_sort_indices = torch.nn.Parameter(
-            torch.empty((num_experts, 0), dtype=torch.int32, device=device),
-            requires_grad=False,
-        )
+        num_experts = layer.w13_weight_g_idx.shape[0]
+        device = layer.w13_weight_g_idx.device
+
+        # when running models with grouped act order,
+        # resort to g_idx values provided in checkpoint
+        if self.actorder == "group":
+            w13_g_idx_sort_indices = torch.empty_like(layer.w13_weight_g_idx)
+            w2_g_idx_sort_indices = torch.empty_like(layer.w2_weight_g_idx)
+            w13_sorted_g_idx = torch.empty_like(layer.w13_weight_g_idx)
+            w2_sorted_g_idx = torch.empty_like(layer.w2_weight_g_idx)
+
+            for e in range(num_experts):
+                w13_g_idx_sort_indices[e] = torch.argsort(
+                    layer.w13_weight_g_idx[e]).to(torch.int32)
+                w2_g_idx_sort_indices[e] = torch.argsort(
+                    layer.w2_weight_g_idx[e]).to(torch.int32)
+                w13_sorted_g_idx[e] = layer.w13_weight_g_idx[e][
+                    w13_g_idx_sort_indices[e]]
+                w2_sorted_g_idx[e] = layer.w2_weight_g_idx[e][
+                    w2_g_idx_sort_indices[e]]
+
+            replace_parameter(layer, "w13_weight_g_idx", w13_sorted_g_idx)
+            replace_parameter(layer, "w2_weight_g_idx", w2_sorted_g_idx)
+            replace_parameter(layer, "w13_g_idx_sort_indices",
+                              w13_g_idx_sort_indices)
+            replace_parameter(layer, "w2_g_idx_sort_indices",
+                              w2_g_idx_sort_indices)
+
+        else:
+            layer.w13_weight_g_idx = torch.nn.Parameter(
+                torch.empty((num_experts, 0), dtype=torch.int32,
+                            device=device),
+                requires_grad=False,
+            )
+            layer.w2_weight_g_idx = torch.nn.Parameter(
+                torch.empty((num_experts, 0), dtype=torch.int32,
+                            device=device),
+                requires_grad=False,
+            )
+            layer.w13_g_idx_sort_indices = torch.nn.Parameter(
+                torch.empty((num_experts, 0), dtype=torch.int32,
+                            device=device),
+                requires_grad=False,
+            )
+            layer.w2_g_idx_sort_indices = torch.nn.Parameter(
+                torch.empty((num_experts, 0), dtype=torch.int32,
+                            device=device),
+                requires_grad=False,
+            )
 
         marlin_w13_qweight = ops.gptq_marlin_moe_repack(
             layer.w13_weight_packed,
@@ -511,9 +564,9 @@ def apply(
             router_logits,
             topk_weights,
             topk_ids,
-            g_idx1=layer.w13_g_idx,
-            g_idx2=layer.w2_g_idx,
+            g_idx1=layer.w13_weight_g_idx,
+            g_idx2=layer.w2_weight_g_idx,
             sort_indices1=layer.w13_g_idx_sort_indices,
             sort_indices2=layer.w2_g_idx_sort_indices,
             num_bits=self.num_bits,
-        )
+            is_k_full=self.is_k_full)
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py
index 61d1c911cd1ad..2e1b5e3c2d3b1 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py
@@ -62,7 +62,7 @@ def create_weights(self, layer: torch.nn.Module, input_size: int,
                        **kwargs):
 
         assert params_dtype == torch.float16, (
-            "float16 is required for marlin24 compressd models. Set dtype=torch.float16"  # noqa: E501
+            "float16 is required for marlin24 compressed models. Set dtype=torch.float16"  # noqa: E501
         )
 
         pack_factor = 32 // self.quant_type.size_bits
diff --git a/vllm/model_executor/layers/quantization/experts_int8.py b/vllm/model_executor/layers/quantization/experts_int8.py
index 209f12c6dfec9..100cbfa4c9598 100644
--- a/vllm/model_executor/layers/quantization/experts_int8.py
+++ b/vllm/model_executor/layers/quantization/experts_int8.py
@@ -52,7 +52,7 @@ def __init__(self, quant_config: ExpertsInt8Config):
         self.quant_config = quant_config
 
     def create_weights(self, layer: torch.nn.Module, num_experts: int,
-                       hidden_size: int, intermediate_size: int,
+                       hidden_size: int, intermediate_size_per_partition: int,
                        params_dtype: torch.dtype, **extra_weight_attrs):
 
         int8_dtype = torch.int8
@@ -64,26 +64,29 @@ def create_weights(self, layer: torch.nn.Module, num_experts: int,
         extra_weight_attrs['weight_loader'] = wrapped_weight_loader
 
         # Fused gate_up_proj (column parallel)
-        w13_weight = torch.nn.Parameter(torch.empty(num_experts,
-                                                    2 * intermediate_size,
-                                                    hidden_size,
-                                                    dtype=int8_dtype),
+        w13_weight = torch.nn.Parameter(torch.empty(
+            num_experts,
+            2 * intermediate_size_per_partition,
+            hidden_size,
+            dtype=int8_dtype),
                                         requires_grad=False)
         layer.register_parameter("w13_weight", w13_weight)
         set_weight_attrs(w13_weight, extra_weight_attrs)
 
         # down_proj (row parallel)
-        w2_weight = torch.nn.Parameter(torch.empty(num_experts,
-                                                   hidden_size,
-                                                   intermediate_size,
-                                                   dtype=int8_dtype),
+        w2_weight = torch.nn.Parameter(torch.empty(
+            num_experts,
+            hidden_size,
+            intermediate_size_per_partition,
+            dtype=int8_dtype),
                                        requires_grad=False)
         layer.register_parameter("w2_weight", w2_weight)
         set_weight_attrs(w2_weight, extra_weight_attrs)
 
-        w13_scale = torch.nn.Parameter(torch.zeros(num_experts,
-                                                   2 * intermediate_size,
-                                                   dtype=torch.float32),
+        w13_scale = torch.nn.Parameter(torch.zeros(
+            num_experts,
+            2 * intermediate_size_per_partition,
+            dtype=torch.float32),
                                        requires_grad=False)
         layer.register_parameter("w13_scale", w13_scale)
 
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index 26dd5df4e55b2..21d4355b36ab0 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -386,8 +386,8 @@ def __init__(self, quant_config: Fp8Config):
         self.block_quant = self.quant_config.weight_block_size is not None
 
     def create_weights(self, layer: Module, num_experts: int, hidden_size: int,
-                       intermediate_size: int, params_dtype: torch.dtype,
-                       **extra_weight_attrs):
+                       intermediate_size_per_partition: int,
+                       params_dtype: torch.dtype, **extra_weight_attrs):
 
         if self.quant_config.is_checkpoint_fp8_serialized:
             params_dtype = torch.float8_e4m3fn
@@ -402,30 +402,34 @@ def create_weights(self, layer: Module, num_experts: int, hidden_size: int,
             # scales, the output_size of the weights for both the gate and up
             # layers must be divisible by block_n.
             # Required by column parallel or enabling merged weights
-            if intermediate_size % block_n != 0:
+            if intermediate_size_per_partition % block_n != 0:
                 raise ValueError(
                     f"The output_size of gate's and up's weight = "
-                    f"{intermediate_size} is not divisible by "
+                    f"{intermediate_size_per_partition} is not divisible by "
                     f"weight quantization block_n = {block_n}.")
-            if (tp_size > 1 and intermediate_size % block_k != 0):
+            if (tp_size > 1
+                    and intermediate_size_per_partition % block_k != 0):
                 # Required by row parallel
-                raise ValueError(f"The input_size of down's weight = "
-                                 f"{intermediate_size} is not divisible by "
-                                 f"weight quantization block_k = {block_k}.")
+                raise ValueError(
+                    f"The input_size of down's weight = "
+                    f"{intermediate_size_per_partition} is not divisible by "
+                    f"weight quantization block_k = {block_k}.")
 
         # WEIGHTS
-        w13_weight = torch.nn.Parameter(torch.empty(num_experts,
-                                                    2 * intermediate_size,
-                                                    hidden_size,
-                                                    dtype=params_dtype),
+        w13_weight = torch.nn.Parameter(torch.empty(
+            num_experts,
+            2 * intermediate_size_per_partition,
+            hidden_size,
+            dtype=params_dtype),
                                         requires_grad=False)
         layer.register_parameter("w13_weight", w13_weight)
         set_weight_attrs(w13_weight, extra_weight_attrs)
 
-        w2_weight = torch.nn.Parameter(torch.empty(num_experts,
-                                                   hidden_size,
-                                                   intermediate_size,
-                                                   dtype=params_dtype),
+        w2_weight = torch.nn.Parameter(torch.empty(
+            num_experts,
+            hidden_size,
+            intermediate_size_per_partition,
+            dtype=params_dtype),
                                        requires_grad=False)
         layer.register_parameter("w2_weight", w2_weight)
         set_weight_attrs(w2_weight, extra_weight_attrs)
@@ -446,7 +450,8 @@ def create_weights(self, layer: Module, num_experts: int, hidden_size: int,
             w13_weight_scale = torch.nn.Parameter(
                 torch.ones(
                     num_experts,
-                    2 * ((intermediate_size + block_n - 1) // block_n),
+                    2 * ((intermediate_size_per_partition + block_n - 1) //
+                         block_n),
                     (hidden_size + block_k - 1) // block_k,
                     dtype=torch.float32,
                 ),
@@ -456,7 +461,7 @@ def create_weights(self, layer: Module, num_experts: int, hidden_size: int,
                 torch.ones(
                     num_experts,
                     (hidden_size + block_n - 1) // block_n,
-                    (intermediate_size + block_k - 1) // block_k,
+                    (intermediate_size_per_partition + block_k - 1) // block_k,
                     dtype=torch.float32,
                 ),
                 requires_grad=False,
diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py
index 2dbfca9b07690..4dc4b052b0410 100644
--- a/vllm/model_executor/layers/quantization/gptq_marlin.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -317,7 +317,7 @@ def create_weights(
         layer: torch.nn.Module,
         num_experts: int,
         hidden_size: int,
-        intermediate_size: int,
+        intermediate_size_per_partition: int,
         params_dtype: torch.dtype,
         **extra_weight_attrs,
     ):
@@ -326,7 +326,8 @@ def create_weights(
         # Supports only sym for now (no zp)
         if self.quant_config.group_size != -1:
             scales_size13 = hidden_size // self.quant_config.group_size
-            scales_size2 = intermediate_size // self.quant_config.group_size
+            scales_size2 = (intermediate_size_per_partition //
+                            self.quant_config.group_size)
             strategy = FusedMoeWeightScaleSupported.GROUP.value
         else:
             scales_size13 = 1
@@ -342,7 +343,7 @@ def create_weights(
             torch.empty(
                 num_experts,
                 hidden_size // self.quant_config.pack_factor,
-                2 * intermediate_size,
+                2 * intermediate_size_per_partition,
                 dtype=torch.int32,
             ),
             requires_grad=False,
@@ -353,7 +354,8 @@ def create_weights(
         w2_qweight = torch.nn.Parameter(
             torch.empty(
                 num_experts,
-                intermediate_size // self.quant_config.pack_factor,
+                intermediate_size_per_partition //
+                self.quant_config.pack_factor,
                 hidden_size,
                 dtype=torch.int32,
             ),
@@ -365,7 +367,7 @@ def create_weights(
         w13_scales = torch.nn.Parameter(
             torch.empty(num_experts,
                         scales_size13,
-                        2 * intermediate_size,
+                        2 * intermediate_size_per_partition,
                         dtype=torch.half),
             requires_grad=False,
         )
@@ -385,7 +387,8 @@ def create_weights(
         w13_qzeros = torch.nn.Parameter(
             torch.empty(num_experts,
                         scales_size13,
-                        2 * intermediate_size // self.quant_config.pack_factor,
+                        2 * intermediate_size_per_partition //
+                        self.quant_config.pack_factor,
                         dtype=params_dtype),
             requires_grad=False,
         )
@@ -414,7 +417,7 @@ def create_weights(
         w2_g_idx = torch.nn.Parameter(
             torch.empty(
                 num_experts,
-                intermediate_size,
+                intermediate_size_per_partition,
                 dtype=torch.int32,
             ),
             requires_grad=False,
@@ -435,7 +438,7 @@ def create_weights(
         w2_g_idx_sort_indices = torch.nn.Parameter(
             torch.empty(
                 num_experts,
-                intermediate_size,
+                intermediate_size_per_partition,
                 dtype=torch.int32,
             ),
             requires_grad=False,
diff --git a/vllm/model_executor/layers/quantization/quark/quark_moe.py b/vllm/model_executor/layers/quantization/quark/quark_moe.py
index 3e19247300808..68a3954540763 100644
--- a/vllm/model_executor/layers/quantization/quark/quark_moe.py
+++ b/vllm/model_executor/layers/quantization/quark/quark_moe.py
@@ -60,24 +60,26 @@ def __init__(self, weight_config: Dict[str, Any], input_config: Dict[str,
         self.static_input_scales = not self.input_quant.get("is_dynamic")
 
     def create_weights(self, layer: torch.nn.Module, num_experts: int,
-                       hidden_size: int, intermediate_size: int,
+                       hidden_size: int, intermediate_size_per_partition: int,
                        params_dtype: torch.dtype, **extra_weight_attrs):
 
         params_dtype = torch.float8_e4m3fn
 
         # WEIGHTS
-        w13_weight = torch.nn.Parameter(torch.empty(num_experts,
-                                                    2 * intermediate_size,
-                                                    hidden_size,
-                                                    dtype=params_dtype),
+        w13_weight = torch.nn.Parameter(torch.empty(
+            num_experts,
+            2 * intermediate_size_per_partition,
+            hidden_size,
+            dtype=params_dtype),
                                         requires_grad=False)
         layer.register_parameter("w13_weight", w13_weight)
         set_weight_attrs(w13_weight, extra_weight_attrs)
 
-        w2_weight = torch.nn.Parameter(torch.empty(num_experts,
-                                                   hidden_size,
-                                                   intermediate_size,
-                                                   dtype=params_dtype),
+        w2_weight = torch.nn.Parameter(torch.empty(
+            num_experts,
+            hidden_size,
+            intermediate_size_per_partition,
+            dtype=params_dtype),
                                        requires_grad=False)
         layer.register_parameter("w2_weight", w2_weight)
         set_weight_attrs(w2_weight, extra_weight_attrs)

From 9726ad676d04d4b424d266212ac85000efdcd64d Mon Sep 17 00:00:00 2001
From: Junichi Sato <junichi.sato@sbintuitions.co.jp>
Date: Fri, 24 Jan 2025 07:02:13 +0900
Subject: [PATCH 357/462] [Misc] Fix OpenAI API Compatibility Issues in
 Benchmark Script (#12357)

Signed-off-by: Junichi Sato <junichi.sato@sbintuitions.co.jp>
---
 benchmarks/backend_request_func.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py
index f415d109bdfc8..d098c110cd921 100644
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -245,11 +245,12 @@ async def async_request_openai_completions(
             "max_tokens": request_func_input.output_len,
             "logprobs": request_func_input.logprobs,
             "stream": True,
-            "ignore_eos": request_func_input.ignore_eos,
             "stream_options": {
                 "include_usage": True,
             },
         }
+        if request_func_input.ignore_eos:
+            payload["ignore_eos"] = request_func_input.ignore_eos
         if request_func_input.extra_body:
             payload.update(request_func_input.extra_body)
         headers = {
@@ -297,7 +298,7 @@ async def async_request_openai_completions(
                                                       most_recent_timestamp)
 
                                 most_recent_timestamp = timestamp
-                                generated_text += text
+                                generated_text += text or ""
                             elif usage := data.get("usage"):
                                 output.output_tokens = usage.get(
                                     "completion_tokens")
@@ -348,11 +349,12 @@ async def async_request_openai_chat_completions(
             "temperature": 0.0,
             "max_completion_tokens": request_func_input.output_len,
             "stream": True,
-            "ignore_eos": request_func_input.ignore_eos,
             "stream_options": {
                 "include_usage": True,
             },
         }
+        if request_func_input.ignore_eos:
+            payload["ignore_eos"] = request_func_input.ignore_eos
         if request_func_input.extra_body:
             payload.update(request_func_input.extra_body)
         headers = {
@@ -394,7 +396,7 @@ async def async_request_openai_chat_completions(
                                     output.itl.append(timestamp -
                                                       most_recent_timestamp)
 
-                                generated_text += content
+                                generated_text += content or ""
                             elif usage := data.get("usage"):
                                 output.output_tokens = usage.get(
                                     "completion_tokens")

From 682b55bc0734a5b02ef572c361c66dade71bd44d Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Thu, 23 Jan 2025 14:10:03 -0800
Subject: [PATCH 358/462] [Docs] Add meetup slides (#12345)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
---
 README.md                        | 5 +----
 docs/source/community/meetups.md | 1 +
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 658b9fb6edd8c..4ed905bf7aa9d 100644
--- a/README.md
+++ b/README.md
@@ -15,11 +15,8 @@ Easy, fast, and cheap LLM serving for everyone
 
 ---
 
-The first vLLM meetup in 2025 is happening on January 22nd, Wednesday, with Google Cloud in San Francisco! We will talk about vLLM's performant V1 architecture, Q1 roadmap, Google Cloud's innovation around vLLM: networking, Cloud Run, Vertex, and TPU! [Register Now](https://lu.ma/zep56hui)
-
----
-
 *Latest News* 🔥
+- [2025/01] We hosted [the eighth vLLM meetup](https://lu.ma/zep56hui) with Google Cloud! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing).
 - [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone!
 - [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing), and Snowflake team [here](https://docs.google.com/presentation/d/1qF3RkDAbOULwz9WK5TOltt2fE9t6uIc_hVNLFAaQX6A/edit?usp=sharing).
 - [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there!
diff --git a/docs/source/community/meetups.md b/docs/source/community/meetups.md
index 43fa9ee616096..ab5ea147f4c6a 100644
--- a/docs/source/community/meetups.md
+++ b/docs/source/community/meetups.md
@@ -4,6 +4,7 @@
 
 We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below:
 
+- [The eighth vLLM meetup](https://lu.ma/zep56hui), with Google Cloud, January 22nd 2025. [[Slides]](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing)
 - [The seventh vLLM meetup](https://lu.ma/h0qvrajz), with Snowflake, November 14th 2024. [[Slides]](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing)
 - [The sixth vLLM meetup](https://lu.ma/87q3nvnh), with NVIDIA, September 9th 2024. [[Slides]](https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing)
 - [The fifth vLLM meetup](https://lu.ma/lp0gyjqr), with AWS, July 24th 2024. [[Slides]](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing)

From c5cffcd0cdbba9273954b4fd1317137208ce564c Mon Sep 17 00:00:00 2001
From: Russell Bryant <rbryant@redhat.com>
Date: Thu, 23 Jan 2025 20:15:52 -0500
Subject: [PATCH 359/462] [Docs] Update spec decode + structured output in
 compat matrix (#12373)

Signed-off-by: Russell Bryant <rbryant@redhat.com>
---
 docs/source/features/compatibility_matrix.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/features/compatibility_matrix.md b/docs/source/features/compatibility_matrix.md
index 86a82eb36df33..47ab616b30686 100644
--- a/docs/source/features/compatibility_matrix.md
+++ b/docs/source/features/compatibility_matrix.md
@@ -307,7 +307,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar
      - ✅
      - ?
      - ?
-     - ✅
+     - [✗](gh-issue:11484)
      - ✅
      - ✗
      - ?

From 24b0205f58c28e05ac060ec6f0e4defe8e9c32eb Mon Sep 17 00:00:00 2001
From: Nick Hill <nickhill@us.ibm.com>
Date: Thu, 23 Jan 2025 17:17:41 -0800
Subject: [PATCH 360/462] [V1][Frontend] Coalesce bunched `RequestOutput`s
 (#12298)

Signed-off-by: Nick Hill <nhill@redhat.com>
Co-authored-by: Robert Shaw <rshaw@neuralmagic.com>
---
 tests/v1/engine/test_async_llm.py | 51 +++++++++++++++++++++----------
 vllm/outputs.py                   | 22 ++++++++++++-
 vllm/v1/engine/async_llm.py       | 10 +++++-
 3 files changed, 65 insertions(+), 18 deletions(-)

diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py
index 2c805e18eebae..10f783b21a9ec 100644
--- a/tests/v1/engine/test_async_llm.py
+++ b/tests/v1/engine/test_async_llm.py
@@ -1,4 +1,5 @@
 import asyncio
+from contextlib import ExitStack
 from typing import List, Tuple
 
 import pytest
@@ -6,6 +7,7 @@
 from vllm import SamplingParams
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.platforms import current_platform
+from vllm.sampling_params import RequestOutputKind
 from vllm.v1.engine.async_llm import AsyncLLM
 
 if not current_platform.is_cuda():
@@ -18,28 +20,39 @@
 
 
 async def generate(engine: AsyncLLM, request_id: str,
+                   output_kind: RequestOutputKind,
                    max_tokens: int) -> Tuple[int, str]:
     count = 0
-    async for _ in engine.generate(request_id=request_id,
-                                   prompt="Hello my name is Robert and",
-                                   sampling_params=SamplingParams(
-                                       max_tokens=max_tokens, temperature=0)):
+    sampling_params = SamplingParams(max_tokens=max_tokens,
+                                     output_kind=output_kind,
+                                     temperature=0)
+    async for out in engine.generate(request_id=request_id,
+                                     prompt="Hello my name is Robert and",
+                                     sampling_params=sampling_params):
+
+        num_tokens = len(out.outputs[0].token_ids)
+        if output_kind == RequestOutputKind.DELTA:
+            count += num_tokens
+        else:
+            count = num_tokens
 
-        count += 1
         await asyncio.sleep(0.)
 
     return count, request_id
 
 
+@pytest.mark.parametrize(
+    "output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY])
 @pytest.mark.asyncio
-async def test_load(monkeypatch):
+async def test_load(monkeypatch, output_kind: RequestOutputKind):
     # TODO(rickyx): Remove monkeypatch once we have a better way to test V1
     # so that in the future when we switch, we don't have to change all the
     # tests.
-    with monkeypatch.context() as m:
+    with monkeypatch.context() as m, ExitStack() as after:
         m.setenv("VLLM_USE_V1", "1")
 
         engine = AsyncLLM.from_engine_args(ENGINE_ARGS)
+        after.callback(engine.shutdown)
 
         NUM_REQUESTS = 10000
         NUM_EXPECTED_TOKENS = 10
@@ -51,26 +64,33 @@ async def test_load(monkeypatch):
         for request_id in request_ids:
             tasks.append(
                 asyncio.create_task(
-                    generate(engine, request_id, NUM_EXPECTED_TOKENS)))
+                    generate(engine, request_id, output_kind,
+                             NUM_EXPECTED_TOKENS)))
 
         # Confirm that we got all the EXPECTED tokens from the requests.
-        for task in tasks:
+        done, pending = await asyncio.wait(tasks,
+                                           return_when=asyncio.FIRST_EXCEPTION)
+        for task in pending:
+            task.cancel()
+        for task in done:
             num_generated_tokens, request_id = await task
             assert num_generated_tokens == NUM_EXPECTED_TOKENS, (
                 f"{request_id} generated {num_generated_tokens} but "
                 f"expected {NUM_EXPECTED_TOKENS}")
 
         assert not engine.output_processor.has_unfinished_requests()
-        engine.shutdown()
 
 
+@pytest.mark.parametrize(
+    "output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY])
 @pytest.mark.asyncio
-async def test_abort(monkeypatch):
+async def test_abort(monkeypatch, output_kind: RequestOutputKind):
 
-    with monkeypatch.context() as m:
+    with monkeypatch.context() as m, ExitStack() as after:
         m.setenv("VLLM_USE_V1", "1")
 
         engine = AsyncLLM.from_engine_args(ENGINE_ARGS)
+        after.callback(engine.shutdown)
 
         NUM_REQUESTS = 100
         NUM_EXPECTED_TOKENS = 100
@@ -83,7 +103,8 @@ async def test_abort(monkeypatch):
         for request_id in request_ids:
             tasks.append(
                 asyncio.create_task(
-                    generate(engine, request_id, NUM_EXPECTED_TOKENS)))
+                    generate(engine, request_id, output_kind,
+                             NUM_EXPECTED_TOKENS)))
 
         # API server cancels requests when they disconnect.
         for idx in REQUEST_IDS_TO_ABORT:
@@ -108,9 +129,7 @@ async def test_abort(monkeypatch):
         # Confirm we can do another generation.
         request_id = f"request-{REQUEST_IDS_TO_ABORT[0]}"
         task = asyncio.create_task(
-            generate(engine, request_id, NUM_EXPECTED_TOKENS))
+            generate(engine, request_id, output_kind, NUM_EXPECTED_TOKENS))
         num_generated_tokens, request_id = await task
         assert num_generated_tokens == NUM_EXPECTED_TOKENS
         assert not engine.output_processor.has_unfinished_requests()
-
-        engine.shutdown()
diff --git a/vllm/outputs.py b/vllm/outputs.py
index 63df7dcf519b5..25b2265285d16 100644
--- a/vllm/outputs.py
+++ b/vllm/outputs.py
@@ -1,6 +1,6 @@
 import time
 from dataclasses import dataclass
-from typing import Dict, Generic, List, Optional
+from typing import Dict, Generic, List, MutableSequence, Optional
 from typing import Sequence as GenericSequence
 from typing import Union
 
@@ -162,6 +162,26 @@ def new(
             finished=finished,
         )
 
+    def add(self, next_output: "RequestOutput") -> None:
+        """Merge subsequent RequestOutput into this one"""
+
+        self.prompt = next_output.prompt
+        self.prompt_token_ids = next_output.prompt_token_ids
+        self.prompt_logprobs = next_output.prompt_logprobs
+        self.finished |= next_output.finished
+
+        #TODO assuming n == 1 for now
+        completion = self.outputs[0]
+        next_completion = next_output.outputs[0]
+        completion.text += next_completion.text
+        if not isinstance(completion.token_ids, MutableSequence):
+            completion.token_ids = list(completion.token_ids)
+        completion.token_ids.extend(next_completion.token_ids)
+        if next_completion.logprobs:
+            assert completion.logprobs is not None
+            completion.logprobs.extend(next_completion.logprobs)
+        completion.cumulative_logprob = next_completion.cumulative_logprob
+
     @classmethod
     def from_seq_group(
         cls, seq_group: SequenceGroup, use_cache: bool,
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index 1505b62504a2f..6dc68b3a16099 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -15,7 +15,7 @@
 from vllm.outputs import RequestOutput
 from vllm.pooling_params import PoolingParams
 from vllm.prompt_adapter.request import PromptAdapterRequest
-from vllm.sampling_params import SamplingParams
+from vllm.sampling_params import RequestOutputKind, SamplingParams
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
 from vllm.usage.usage_lib import UsageContext
@@ -214,6 +214,14 @@ async def generate(
                 # task switching under load which helps performance).
                 out = q.get_nowait() if not q.empty() else await q.get()
 
+                # Coalesce any additional queued outputs
+                while not q.empty():
+                    next_out = q.get_nowait()
+                    if sampling_params.output_kind == RequestOutputKind.DELTA:
+                        out.add(next_out)
+                    else:
+                        out = next_out
+
                 # Note: both OutputProcessor and EngineCore handle their
                 # own request cleanup based on finished.
                 finished = out.finished

From d3d6bb13fb62da3234addf6574922a4ec0513d04 Mon Sep 17 00:00:00 2001
From: Russell Bryant <rbryant@redhat.com>
Date: Thu, 23 Jan 2025 21:17:30 -0500
Subject: [PATCH 361/462] Set weights_only=True when using torch.load()
 (#12366)

Signed-off-by: Russell Bryant <rbryant@redhat.com>
---
 vllm/assets/image.py                             | 2 +-
 vllm/lora/models.py                              | 3 ++-
 vllm/model_executor/model_loader/weight_utils.py | 8 +++++---
 vllm/prompt_adapter/utils.py                     | 3 ++-
 4 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/vllm/assets/image.py b/vllm/assets/image.py
index cb831cb0b5bb4..0a55506f88255 100644
--- a/vllm/assets/image.py
+++ b/vllm/assets/image.py
@@ -26,4 +26,4 @@ def image_embeds(self) -> torch.Tensor:
         """
         image_path = get_vllm_public_assets(filename=f"{self.name}.pt",
                                             s3_prefix=VLM_IMAGES_DIR)
-        return torch.load(image_path, map_location="cpu")
+        return torch.load(image_path, map_location="cpu", weights_only=True)
diff --git a/vllm/lora/models.py b/vllm/lora/models.py
index 9809405ca9a61..b77b6b3d72ff4 100644
--- a/vllm/lora/models.py
+++ b/vllm/lora/models.py
@@ -273,7 +273,8 @@ def from_local_checkpoint(
                 new_embeddings_tensor_path)
         elif os.path.isfile(new_embeddings_bin_file_path):
             embeddings = torch.load(new_embeddings_bin_file_path,
-                                    map_location=device)
+                                    map_location=device,
+                                    weights_only=True)
 
         return cls.from_lora_tensors(
             lora_model_id=get_lora_id()
diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
index b70407221312a..b764a940b1742 100644
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -93,7 +93,7 @@ def convert_bin_to_safetensor_file(
     pt_filename: str,
     sf_filename: str,
 ) -> None:
-    loaded = torch.load(pt_filename, map_location="cpu")
+    loaded = torch.load(pt_filename, map_location="cpu", weights_only=True)
     if "state_dict" in loaded:
         loaded = loaded["state_dict"]
     shared = _shared_pointers(loaded)
@@ -381,7 +381,9 @@ def np_cache_weights_iterator(
                     disable=not enable_tqdm,
                     bar_format=_BAR_FORMAT,
             ):
-                state = torch.load(bin_file, map_location="cpu")
+                state = torch.load(bin_file,
+                                   map_location="cpu",
+                                   weights_only=True)
                 for name, param in state.items():
                     param_path = os.path.join(np_folder, name)
                     with open(param_path, "wb") as f:
@@ -447,7 +449,7 @@ def pt_weights_iterator(
             disable=not enable_tqdm,
             bar_format=_BAR_FORMAT,
     ):
-        state = torch.load(bin_file, map_location="cpu")
+        state = torch.load(bin_file, map_location="cpu", weights_only=True)
         yield from state.items()
         del state
         torch.cuda.empty_cache()
diff --git a/vllm/prompt_adapter/utils.py b/vllm/prompt_adapter/utils.py
index 473b87c89c21d..8b2732923c4e7 100644
--- a/vllm/prompt_adapter/utils.py
+++ b/vllm/prompt_adapter/utils.py
@@ -89,6 +89,7 @@ def load_peft_weights(model_id: str,
         adapters_weights = safe_load_file(filename, device=device)
     else:
         adapters_weights = torch.load(filename,
-                                      map_location=torch.device(device))
+                                      map_location=torch.device(device),
+                                      weights_only=True)
 
     return adapters_weights

From 5e5630a478fe75bc99e4ceea304f9ea68de5aaa6 Mon Sep 17 00:00:00 2001
From: omer-dayan <omer@run.ai>
Date: Fri, 24 Jan 2025 05:06:07 +0200
Subject: [PATCH 362/462] [Bugfix] Path join when building local path for S3
 clone (#12353)

Signed-off-by: Omer Dayan (SW-GPU) <omer@run.ai>
---
 vllm/transformers_utils/s3_utils.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/transformers_utils/s3_utils.py b/vllm/transformers_utils/s3_utils.py
index 6ae68161bbd97..74a56cbf57ec3 100644
--- a/vllm/transformers_utils/s3_utils.py
+++ b/vllm/transformers_utils/s3_utils.py
@@ -145,7 +145,8 @@ def pull_files(self,
             return
 
         for file in files:
-            destination_file = self.dir + file.removeprefix(base_dir)
+            destination_file = os.path.join(self.dir,
+                                            file.removeprefix(base_dir))
             local_dir = Path(destination_file).parent
             os.makedirs(local_dir, exist_ok=True)
             self.s3.download_file(bucket_name, file, destination_file)

From 55ef66edf48a15468a96cb34985319c57e3840ce Mon Sep 17 00:00:00 2001
From: Dipika Sikka <dipikasikka1@gmail.com>
Date: Thu, 23 Jan 2025 22:19:42 -0500
Subject: [PATCH 363/462] Update compressed-tensors version (#12367)

---
 requirements-common.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements-common.txt b/requirements-common.txt
index 777b2bb124323..7051ca8cb50cd 100644
--- a/requirements-common.txt
+++ b/requirements-common.txt
@@ -34,6 +34,6 @@ pyyaml
 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
 setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
 einops # Required for Qwen2-VL.
-compressed-tensors == 0.8.1 # required for compressed-tensors
+compressed-tensors == 0.9.0 # required for compressed-tensors
 depyf==0.18.0 # required for profiling and debugging with compilation config
 cloudpickle # allows pickling lambda functions in model_executor/models/registry.py

From 0e74d797ce8618fdb685126e0ff8576fb966e6ad Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Thu, 23 Jan 2025 19:19:55 -0800
Subject: [PATCH 364/462] [V1] Increase default batch size for H100/H200
 (#12369)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
---
 vllm/engine/arg_utils.py | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 5d3aeb68ebcfe..8f1b0bc5fd62e 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1279,11 +1279,22 @@ def _override_v1_engine_args(self, usage_context: UsageContext) -> None:
         self.enable_chunked_prefill = True
         # When no user override, set the default values based on the usage
         # context.
-        # TODO(woosuk): Tune the default values for different hardware.
-        default_max_num_batched_tokens = {
-            UsageContext.LLM_CLASS: 8192,
-            UsageContext.OPENAI_API_SERVER: 2048,
-        }
+        # Use different default values for different hardware.
+        from vllm.platforms import current_platform
+        device_name = current_platform.get_device_name().lower()
+        if "h100" in device_name or "h200" in device_name:
+            # For H100 and H200, we use larger default values.
+            default_max_num_batched_tokens = {
+                UsageContext.LLM_CLASS: 16384,
+                UsageContext.OPENAI_API_SERVER: 8192,
+            }
+        else:
+            # TODO(woosuk): Tune the default values for other hardware.
+            default_max_num_batched_tokens = {
+                UsageContext.LLM_CLASS: 8192,
+                UsageContext.OPENAI_API_SERVER: 2048,
+            }
+
         if (self.max_num_batched_tokens is None
                 and usage_context in default_max_num_batched_tokens):
             self.max_num_batched_tokens = default_max_num_batched_tokens[

From 6dd94dbe94c1820a1e224cba65efcf0befa97995 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Fri, 24 Jan 2025 11:34:27 +0800
Subject: [PATCH 365/462] [perf] fix perf regression from #12253 (#12380)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 vllm/worker/model_runner.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index cf2f1c6b3b877..bf1a40d48a789 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -455,7 +455,6 @@ def __init__(self,
         self.enable_prompt_adapter = (self.runner.prompt_adapter_config
                                       is not None)
         self.multi_modal_input_mapper = self.runner.multi_modal_input_mapper
-        self.decode_only = True
 
         # Attention metadata inputs.
         if self.attn_backend is not None:
@@ -477,6 +476,10 @@ def prepare(self,
                 finished_requests_ids: Optional[List[str]] = None) -> None:
         self.finished_requests_ids = finished_requests_ids
 
+        # if the current batch is decode-only.
+        # will be set to False if there is any non-decode request.
+        self.decode_only = True
+
         # Intermediate data (data in CPU before going to GPU) for
         # the current sequence group.
         self.inter_data_list: List[

From 3c818bdb42f2b0ed1250568abbabf45909ff308e Mon Sep 17 00:00:00 2001
From: Roger Wang <136131678+ywang96@users.noreply.github.com>
Date: Fri, 24 Jan 2025 00:22:04 -0800
Subject: [PATCH 366/462] [Misc] Use VisionArena Dataset for VLM Benchmarking
 (#12389)

Signed-off-by: Roger Wang <ywang@roblox.com>
---
 benchmarks/benchmark_serving.py | 32 ++++++++++++--------------------
 1 file changed, 12 insertions(+), 20 deletions(-)

diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index bc026b0ec1ca6..63d2c3f7c7dd9 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -200,7 +200,7 @@ def sample_sonnet_requests(
     return sampled_requests
 
 
-def sample_mmmu_pro_vision_requests(
+def sample_vision_arena_requests(
     dataset,
     num_requests: int,
     tokenizer: PreTrainedTokenizerBase,
@@ -212,13 +212,7 @@ def sample_mmmu_pro_vision_requests(
         if len(sampled_requests) == num_requests:
             break
 
-        # MMMU-Pro vision direct prompt
-        # Ref: https://github.com/MMMU-Benchmark/MMMU/blob/6ce42f4d8f70c1841c67867152648974415b5cac/mmmu-pro/prompts.yaml#L5
-        prompt = (
-            "Answer with the option letter from the given choices directly. "
-            "The last line of your response should be of the following "
-            "format: 'Answer: $LETTER' (without quotes) where LETTER is one of "
-            "options.")
+        prompt = data["turns"][0][0]['content']
 
         prompt_token_ids = tokenizer(prompt).input_ids
         if fixed_output_len is None:
@@ -230,10 +224,10 @@ def sample_mmmu_pro_vision_requests(
         output_len = fixed_output_len
 
         assert isinstance(
-            data["image"],
+            data["images"][0],
             Image), ("Input image format must be `PIL.Image.Image`, "
                      f"given {type(data['image'])}.")
-        image: Image = data["image"]
+        image: Image = data["images"][0]
         image = image.convert("RGB")
         image_data = io.BytesIO()
         image.save(image_data, format='JPEG')
@@ -252,7 +246,7 @@ def sample_mmmu_pro_vision_requests(
 
 def sample_hf_requests(
     dataset_path: str,
-    dataset_subset: str,
+    dataset_subset: Optional[str],
     dataset_split: str,
     num_requests: int,
     tokenizer: PreTrainedTokenizerBase,
@@ -260,19 +254,17 @@ def sample_hf_requests(
     fixed_output_len: Optional[int] = None,
 ) -> List[Tuple[str, str, int, Optional[Dict[str, Collection[str]]]]]:
 
-    # Special case for MMMU-Pro vision dataset
-    if dataset_path == 'MMMU/MMMU_Pro' and dataset_subset == 'vision':
-        assert dataset_split == "test"
+    # Special case for vision_arena dataset
+    if dataset_path == 'lmarena-ai/vision-arena-bench-v0.1' \
+        and dataset_subset is None:
+        assert dataset_split == "train"
         dataset = load_dataset(dataset_path,
                                name=dataset_subset,
                                split=dataset_split,
                                streaming=True)
-        assert "image" in dataset.features, (
-            "MMMU/MMMU_Pro vision dataset must have 'image' column.")
-        filter_func = lambda x: isinstance(x["image"], Image)
-        dataset = dataset.shuffle(seed=random_seed).filter(filter_func)
-        return sample_mmmu_pro_vision_requests(dataset, num_requests,
-                                               tokenizer, fixed_output_len)
+        dataset = dataset.shuffle(seed=random_seed)
+        return sample_vision_arena_requests(dataset, num_requests, tokenizer,
+                                            fixed_output_len)
 
     dataset = load_dataset(dataset_path,
                            name=dataset_subset,

From c7c98510360693be668b654c7b1c168d1e656f2c Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Fri, 24 Jan 2025 17:31:25 +0800
Subject: [PATCH 367/462] [ci/build] fix wheel size check (#12396)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 .buildkite/check-wheel-size.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/.buildkite/check-wheel-size.py b/.buildkite/check-wheel-size.py
index 0412c5f37952d..105d0b9e77a44 100644
--- a/.buildkite/check-wheel-size.py
+++ b/.buildkite/check-wheel-size.py
@@ -2,8 +2,10 @@
 import sys
 import zipfile
 
-# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 250 MB
-VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 250))
+# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 300 MiB
+# Note that we have 400 MiB quota, please use it wisely.
+# See https://github.com/pypi/support/issues/3792 .
+VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 300))
 
 
 def print_top_10_largest_files(zip_file):

From 9a0f3bdbe530f4d90e27cf9c6f5cc506e2b44c03 Mon Sep 17 00:00:00 2001
From: Mohit Deopujari <mdeopujari@habana.ai>
Date: Fri, 24 Jan 2025 01:43:49 -0800
Subject: [PATCH 368/462] [Hardware][Gaudi][Doc] Add missing step in setup
 instructions (#12382)

---
 .../installation/ai_accelerator/hpu-gaudi.inc.md                | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md b/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md
index b4695d504b601..ae42dd0c0d08f 100644
--- a/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md
+++ b/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md
@@ -59,6 +59,7 @@ To build and install vLLM from source, run:
 ```console
 git clone https://github.com/vllm-project/vllm.git
 cd vllm
+pip install -r requirements-hpu.txt
 python setup.py develop
 ```
 
@@ -68,6 +69,7 @@ Currently, the latest features and performance optimizations are developed in Ga
 git clone https://github.com/HabanaAI/vllm-fork.git
 cd vllm-fork
 git checkout habana_main
+pip install -r requirements-hpu.txt
 python setup.py develop
 ```
 

From e784c6b9984e8f8116f74000b863d941495acb0b Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Fri, 24 Jan 2025 17:54:29 +0800
Subject: [PATCH 369/462] [ci/build] sync default value for wheel size (#12398)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 .buildkite/check-wheel-size.py | 1 +
 Dockerfile                     | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/.buildkite/check-wheel-size.py b/.buildkite/check-wheel-size.py
index 105d0b9e77a44..e29eb78a9f945 100644
--- a/.buildkite/check-wheel-size.py
+++ b/.buildkite/check-wheel-size.py
@@ -5,6 +5,7 @@
 # Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 300 MiB
 # Note that we have 400 MiB quota, please use it wisely.
 # See https://github.com/pypi/support/issues/3792 .
+# Please also sync the value with the one in Dockerfile.
 VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 300))
 
 
diff --git a/Dockerfile b/Dockerfile
index 261f5440aee47..cb9cf0da5be65 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -126,8 +126,8 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
 
 # Check the size of the wheel if RUN_WHEEL_CHECK is true
 COPY .buildkite/check-wheel-size.py check-wheel-size.py
-# Default max size of the wheel is 250MB
-ARG VLLM_MAX_SIZE_MB=250
+# sync the default value with .buildkite/check-wheel-size.py
+ARG VLLM_MAX_SIZE_MB=300
 ENV VLLM_MAX_SIZE_MB=$VLLM_MAX_SIZE_MB
 ARG RUN_WHEEL_CHECK=true
 RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \

From 3bb8e2c9a2a2dadfe61f35bcde15f72c43f4635f Mon Sep 17 00:00:00 2001
From: Junichi Sato <junichi.sato@sbintuitions.co.jp>
Date: Fri, 24 Jan 2025 23:58:26 +0900
Subject: [PATCH 370/462] [Misc] Enable proxy support in benchmark script
 (#12356)

Signed-off-by: Junichi Sato <junichi.sato@sbintuitions.co.jp>
---
 benchmarks/backend_request_func.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py
index d098c110cd921..0612e8778aca5 100644
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -51,7 +51,8 @@ async def async_request_tgi(
     api_url = request_func_input.api_url
     assert api_url.endswith("generate_stream")
 
-    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
+    async with aiohttp.ClientSession(trust_env=True,
+                                     timeout=AIOHTTP_TIMEOUT) as session:
         params = {
             "best_of": request_func_input.best_of,
             "max_new_tokens": request_func_input.output_len,
@@ -123,7 +124,8 @@ async def async_request_trt_llm(
     api_url = request_func_input.api_url
     assert api_url.endswith("generate_stream")
 
-    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
+    async with aiohttp.ClientSession(trust_env=True,
+                                     timeout=AIOHTTP_TIMEOUT) as session:
         assert request_func_input.best_of == 1
         payload = {
             "accumulate_tokens": True,
@@ -187,7 +189,8 @@ async def async_request_deepspeed_mii(
     request_func_input: RequestFuncInput,
     pbar: Optional[tqdm] = None,
 ) -> RequestFuncOutput:
-    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
+    async with aiohttp.ClientSession(trust_env=True,
+                                     timeout=AIOHTTP_TIMEOUT) as session:
         assert request_func_input.best_of == 1
 
         payload = {
@@ -235,7 +238,8 @@ async def async_request_openai_completions(
         ("completions", "profile")
     ), "OpenAI Completions API URL must end with 'completions' or 'profile'."
 
-    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
+    async with aiohttp.ClientSession(trust_env=True,
+                                     timeout=AIOHTTP_TIMEOUT) as session:
         payload = {
             "model": request_func_input.model_name \
                 if request_func_input.model_name else request_func_input.model,
@@ -333,7 +337,8 @@ async def async_request_openai_chat_completions(
         "chat/completions"
     ), "OpenAI Chat Completions API URL must end with 'chat/completions'."
 
-    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
+    async with aiohttp.ClientSession(trust_env=True,
+                                     timeout=AIOHTTP_TIMEOUT) as session:
         content = [{"type": "text", "text": request_func_input.prompt}]
         if request_func_input.multi_modal_content:
             content.append(request_func_input.multi_modal_content)

From ab5bbf5ae32bc438803ced4c2a021bb161e00050 Mon Sep 17 00:00:00 2001
From: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Date: Fri, 24 Jan 2025 10:27:59 -0500
Subject: [PATCH 371/462] [Bugfix][Kernel] Fix CUDA 11.8 being broken by FA3
 build (#12375)

Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
---
 CMakeLists.txt                           |  2 +-
 setup.py                                 |  5 ++++-
 tests/kernels/test_cascade_flash_attn.py | 11 ++++++-----
 tests/kernels/test_flash_attn.py         | 21 ++++++++++-----------
 vllm/attention/backends/flash_attn.py    | 14 +++++++++++---
 vllm/v1/attention/backends/flash_attn.py | 11 ++++++++++-
 6 files changed, 42 insertions(+), 22 deletions(-)
 mode change 100644 => 100755 CMakeLists.txt
 mode change 100644 => 100755 setup.py
 mode change 100644 => 100755 tests/kernels/test_cascade_flash_attn.py
 mode change 100644 => 100755 vllm/attention/backends/flash_attn.py
 mode change 100644 => 100755 vllm/v1/attention/backends/flash_attn.py

diff --git a/CMakeLists.txt b/CMakeLists.txt
old mode 100644
new mode 100755
index 5039ac2448f83..2f9da6fa3e1d3
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -576,7 +576,7 @@ else()
   FetchContent_Declare(
           vllm-flash-attn
           GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
-          GIT_TAG 90eacc1af2a7c3de62ea249e929ed5faccf38954
+          GIT_TAG 0aff05f577e8a10086066a00618609199b25231d
           GIT_PROGRESS TRUE
           # Don't share the vllm-flash-attn build between build types
           BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
diff --git a/setup.py b/setup.py
old mode 100644
new mode 100755
index 36c89d435c7b7..ee193e4693806
--- a/setup.py
+++ b/setup.py
@@ -598,7 +598,10 @@ def _read_requirements(filename: str) -> List[str]:
 
 if _is_cuda():
     ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa2_C"))
-    ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa3_C"))
+    if envs.VLLM_USE_PRECOMPILED or get_nvcc_cuda_version() >= Version("12.0"):
+        # FA3 requires CUDA 12.0 or later
+        ext_modules.append(
+            CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa3_C"))
     ext_modules.append(CMakeExtension(name="vllm.cumem_allocator"))
 
 if _build_custom_ops():
diff --git a/tests/kernels/test_cascade_flash_attn.py b/tests/kernels/test_cascade_flash_attn.py
old mode 100644
new mode 100755
index 00eb927205d46..8edfde42ede74
--- a/tests/kernels/test_cascade_flash_attn.py
+++ b/tests/kernels/test_cascade_flash_attn.py
@@ -6,7 +6,9 @@
 from vllm.platforms import current_platform
 from vllm.v1.attention.backends.flash_attn import (cascade_attention,
                                                    merge_attn_states)
-from vllm.vllm_flash_attn import flash_attn_varlen_func
+from vllm.vllm_flash_attn import (fa_version_unsupported_reason,
+                                  flash_attn_varlen_func,
+                                  is_fa_version_supported)
 
 NUM_HEADS = [(4, 4), (8, 2), (16, 2)]
 HEAD_SIZES = [128, 192, 256]
@@ -91,10 +93,9 @@ def test_cascade(
     fa_version: int,
 ) -> None:
     torch.set_default_device("cuda")
-    if fa_version == 3 and (torch.cuda.get_device_capability() == (8, 6)
-                            or torch.cuda.get_device_capability() == (8, 9)):
-        pytest.skip("Flash attention version 3 fails on 8.6 and 8.9 due to "
-                    "insufficient shared memory for some shapes")
+    if not is_fa_version_supported(fa_version):
+        pytest.skip(f"Flash attention version {fa_version} not supported due "
+                    f"to: \"{fa_version_unsupported_reason(fa_version)}\"")
 
     current_platform.seed_everything(0)
 
diff --git a/tests/kernels/test_flash_attn.py b/tests/kernels/test_flash_attn.py
index b22153c86b25f..0ee0bf6c6a374 100644
--- a/tests/kernels/test_flash_attn.py
+++ b/tests/kernels/test_flash_attn.py
@@ -4,8 +4,10 @@
 import torch
 
 from vllm.platforms import current_platform
-from vllm.vllm_flash_attn import (flash_attn_varlen_func,
-                                  flash_attn_with_kvcache)
+from vllm.vllm_flash_attn import (fa_version_unsupported_reason,
+                                  flash_attn_varlen_func,
+                                  flash_attn_with_kvcache,
+                                  is_fa_version_supported)
 
 NUM_HEADS = [(4, 4), (8, 2), (16, 2)]
 HEAD_SIZES = [128, 256]
@@ -95,10 +97,9 @@ def test_flash_attn_with_paged_kv(
     fa_version: int,
 ) -> None:
     torch.set_default_device("cuda")
-    if fa_version == 3 and (torch.cuda.get_device_capability() == (8, 6)
-                            or torch.cuda.get_device_capability() == (8, 9)):
-        pytest.skip("Flash attention version 3 fails on 8.6 and 8.9 due to "
-                    "insufficient shared memory for some shapes")
+    if not is_fa_version_supported(fa_version):
+        pytest.skip(f"Flash attention version {fa_version} not supported due "
+                    f"to: \"{fa_version_unsupported_reason(fa_version)}\"")
 
     current_platform.seed_everything(0)
     num_seqs = len(kv_lens)
@@ -182,11 +183,9 @@ def test_varlen_with_paged_kv(
     fa_version: int,
 ) -> None:
     torch.set_default_device("cuda")
-    if fa_version == 3 and (torch.cuda.get_device_capability() == (8, 6)
-                            or torch.cuda.get_device_capability() == (8, 9)):
-        pytest.skip("Flash attention version 3 fails on 8.6 and 8.9 due to "
-                    "insufficient shared memory for some shapes")
-
+    if not is_fa_version_supported(fa_version):
+        pytest.skip(f"Flash attention version {fa_version} not supported due "
+                    f"to: \"{fa_version_unsupported_reason(fa_version)}\"")
     current_platform.seed_everything(0)
     num_seqs = len(seq_lens)
     query_lens = [x[0] for x in seq_lens]
diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py
old mode 100644
new mode 100755
index 1be099283e472..4a9aa1e217365
--- a/vllm/attention/backends/flash_attn.py
+++ b/vllm/attention/backends/flash_attn.py
@@ -18,17 +18,20 @@
     get_seq_len_block_table_args, is_all_cross_attn_metadata_set,
     is_all_encoder_attn_metadata_set, is_block_tables_empty)
 from vllm.envs import VLLM_FLASH_ATTN_VERSION
+from vllm.logger import init_logger
 from vllm.multimodal import MultiModalPlaceholderMap
 from vllm.platforms import current_platform
 from vllm.utils import async_tensor_h2d, make_tensor_with_pad
+from vllm.vllm_flash_attn import (fa_version_unsupported_reason,
+                                  flash_attn_varlen_func,
+                                  flash_attn_with_kvcache,
+                                  is_fa_version_supported)
 
 if TYPE_CHECKING:
     from vllm.worker.model_runner import (ModelInputForGPUBuilder,
                                           ModelInputForGPUWithSamplingMetadata)
 
-from vllm.vllm_flash_attn import (flash_attn_varlen_func,
-                                  flash_attn_with_kvcache,
-                                  is_fa_version_supported)
+logger = init_logger(__name__)
 
 
 class FlashAttentionBackend(AttentionBackend):
@@ -652,6 +655,11 @@ def __init__(
             assert VLLM_FLASH_ATTN_VERSION in [2, 3]
             self.fa_version = VLLM_FLASH_ATTN_VERSION
 
+        if not is_fa_version_supported(self.fa_version):
+            logger.error("Cannot use FA version %d is not supported due to %s",
+                         self.fa_version,
+                         fa_version_unsupported_reason(self.fa_version))
+
         assert is_fa_version_supported(self.fa_version)
 
     def forward(
diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py
old mode 100644
new mode 100755
index 7fe9b3a8f595a..ce83b1fac6c0b
--- a/vllm/v1/attention/backends/flash_attn.py
+++ b/vllm/v1/attention/backends/flash_attn.py
@@ -10,11 +10,15 @@
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
                                               AttentionMetadata, AttentionType)
 from vllm.envs import VLLM_FLASH_ATTN_VERSION
+from vllm.logger import init_logger
 from vllm.platforms import current_platform
 from vllm.utils import cdiv
-from vllm.vllm_flash_attn import (flash_attn_varlen_func,
+from vllm.vllm_flash_attn import (fa_version_unsupported_reason,
+                                  flash_attn_varlen_func,
                                   is_fa_version_supported)
 
+logger = init_logger(__name__)
+
 
 class FlashAttentionBackend(AttentionBackend):
 
@@ -143,6 +147,11 @@ def __init__(
             assert VLLM_FLASH_ATTN_VERSION in [2, 3]
             self.fa_version = VLLM_FLASH_ATTN_VERSION
 
+        if not is_fa_version_supported(self.fa_version):
+            logger.error("Cannot use FA version %d is not supported due to %s",
+                         self.fa_version,
+                         fa_version_unsupported_reason(self.fa_version))
+
         assert is_fa_version_supported(self.fa_version)
 
     def forward(

From df5dafaa5ba611f7179720958ba63e49615c927f Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Sat, 25 Jan 2025 03:45:20 +0800
Subject: [PATCH 372/462] [Misc] Remove deprecated code (#12383)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 tests/async_engine/test_api_server.py         | 23 ++++++----
 tests/basic_correctness/test_preemption.py    | 18 ++++----
 .../multi_step/test_correctness_async_llm.py  |  3 +-
 vllm/config.py                                | 10 -----
 vllm/engine/arg_utils.py                      |  6 ---
 vllm/engine/metrics.py                        | 43 -------------------
 6 files changed, 25 insertions(+), 78 deletions(-)

diff --git a/tests/async_engine/test_api_server.py b/tests/async_engine/test_api_server.py
index 83c71b5cf6eb7..91ac35dd67bbf 100644
--- a/tests/async_engine/test_api_server.py
+++ b/tests/async_engine/test_api_server.py
@@ -25,27 +25,32 @@ def _query_server_long(prompt: str) -> dict:
 
 
 @pytest.fixture
-def api_server(tokenizer_pool_size: int, worker_use_ray: bool):
+def api_server(tokenizer_pool_size: int, distributed_executor_backend: str):
     script_path = Path(__file__).parent.joinpath(
         "api_server_async_engine.py").absolute()
     commands = [
-        sys.executable, "-u",
-        str(script_path), "--model", "facebook/opt-125m", "--host",
-        "127.0.0.1", "--tokenizer-pool-size",
-        str(tokenizer_pool_size)
+        sys.executable,
+        "-u",
+        str(script_path),
+        "--model",
+        "facebook/opt-125m",
+        "--host",
+        "127.0.0.1",
+        "--tokenizer-pool-size",
+        str(tokenizer_pool_size),
+        "--distributed-executor-backend",
+        distributed_executor_backend,
     ]
 
-    if worker_use_ray:
-        commands.append("--worker-use-ray")
     uvicorn_process = subprocess.Popen(commands)
     yield
     uvicorn_process.terminate()
 
 
 @pytest.mark.parametrize("tokenizer_pool_size", [0, 2])
-@pytest.mark.parametrize("worker_use_ray", [False, True])
+@pytest.mark.parametrize("distributed_executor_backend", ["mp", "ray"])
 def test_api_server(api_server, tokenizer_pool_size: int,
-                    worker_use_ray: bool):
+                    distributed_executor_backend: str):
     """
     Run the API server and test it.
 
diff --git a/tests/basic_correctness/test_preemption.py b/tests/basic_correctness/test_preemption.py
index 4e502cfb5f4f8..4b27dcbc8609f 100644
--- a/tests/basic_correctness/test_preemption.py
+++ b/tests/basic_correctness/test_preemption.py
@@ -29,10 +29,10 @@ def check_settings():
 
 
 @pytest.fixture
-def worker_use_ray() -> bool:
-    # When SPMD worker is used, use ray_use_worker=True
+def distributed_executor_backend() -> str:
+    # When SPMD worker is used, use distributed_executor_backend="ray"
     # to test delta input optimization works with preemption.
-    return envs.VLLM_USE_RAY_SPMD_WORKER
+    return "ray" if envs.VLLM_USE_RAY_SPMD_WORKER else "mp"
 
 
 @pytest.mark.parametrize("model", MODELS)
@@ -47,7 +47,7 @@ def test_chunked_prefill_recompute(
     dtype: str,
     max_tokens: int,
     chunked_prefill_token_size: int,
-    worker_use_ray: bool,
+    distributed_executor_backend: str,
 ) -> None:
     """Ensure that chunked prefill works with preemption."""
     max_num_seqs = min(chunked_prefill_token_size, 256)
@@ -66,7 +66,7 @@ def test_chunked_prefill_recompute(
             max_num_batched_tokens=max_num_batched_tokens,
             enable_chunked_prefill=enable_chunked_prefill,
             max_num_seqs=max_num_seqs,
-            worker_use_ray=worker_use_ray,
+            distributed_executor_backend=distributed_executor_backend,
             disable_log_stats=False,
     ) as vllm_model:
         vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
@@ -93,7 +93,7 @@ def test_preemption(
     model: str,
     dtype: str,
     max_tokens: int,
-    worker_use_ray: bool,
+    distributed_executor_backend: str,
 ) -> None:
     """By default, recompute preemption is enabled"""
 
@@ -104,7 +104,7 @@ def test_preemption(
             model,
             dtype=dtype,
             disable_log_stats=False,
-            worker_use_ray=worker_use_ray,
+            distributed_executor_backend=distributed_executor_backend,
     ) as vllm_model:
         vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
         assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
@@ -144,7 +144,7 @@ def test_preemption_infeasible(
     model: str,
     dtype: str,
     max_tokens: int,
-    worker_use_ray: bool,
+    distributed_executor_backend: str,
 ) -> None:
     """Verify infeasible preemption request will be ignored."""
     BLOCK_SIZE = 16
@@ -159,7 +159,7 @@ def test_preemption_infeasible(
             # ignored instead of hanging forever.
             num_gpu_blocks_override=prefill_blocks + decode_blocks // 2,
             max_model_len=((prefill_blocks + decode_blocks // 2) * BLOCK_SIZE),
-            worker_use_ray=worker_use_ray,
+            distributed_executor_backend=distributed_executor_backend,
     ) as vllm_model:
         sampling_params = SamplingParams(max_tokens=max_tokens,
                                          ignore_eos=True)
diff --git a/tests/multi_step/test_correctness_async_llm.py b/tests/multi_step/test_correctness_async_llm.py
index 8456a463adeeb..b8524ed83026b 100644
--- a/tests/multi_step/test_correctness_async_llm.py
+++ b/tests/multi_step/test_correctness_async_llm.py
@@ -16,7 +16,8 @@
 NUM_PROMPTS = [10]
 
 DEFAULT_SERVER_ARGS: List[str] = [
-    "--worker-use-ray",
+    "--distributed-executor-backend",
+    "ray",
     "--gpu-memory-utilization",
     "0.85",
     "--swap-space",
diff --git a/vllm/config.py b/vllm/config.py
index efd81ad3de3b4..11c6f853b2b45 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1227,9 +1227,6 @@ class ParallelConfig:
     pipeline_parallel_size: int = 1  # Number of pipeline parallel groups.
     tensor_parallel_size: int = 1  # Number of tensor parallel groups.
 
-    # Deprecated, use distributed_executor_backend instead.
-    worker_use_ray: Optional[bool] = None
-
     # Maximum number of multiple batches
     # when load model sequentially. To avoid RAM OOM when using tensor
     # parallel and large models.
@@ -1283,13 +1280,6 @@ def __post_init__(self) -> None:
         self.world_size = self.pipeline_parallel_size * \
             self.tensor_parallel_size
 
-        if self.worker_use_ray:
-            if self.distributed_executor_backend is None:
-                self.distributed_executor_backend = "ray"
-            elif not self.use_ray:
-                raise ValueError(f"worker-use-ray can't be used with "
-                                 f"distributed executor backend "
-                                 f"'{self.distributed_executor_backend}'.")
         ray_only_devices = ["tpu"]
         from vllm.platforms import current_platform
         if (current_platform.device_type in ray_only_devices
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 8f1b0bc5fd62e..f16e8e6df76bd 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -100,7 +100,6 @@ class EngineArgs:
     kv_cache_dtype: str = 'auto'
     seed: int = 0
     max_model_len: Optional[int] = None
-    worker_use_ray: bool = False
     # Note: Specifying a custom executor backend by passing a class
     # is intended for expert use only. The API may change without
     # notice.
@@ -389,10 +388,6 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             'to "ray" if Ray is installed and fail otherwise. Note that tpu '
             'only supports Ray for distributed inference.')
 
-        parser.add_argument(
-            '--worker-use-ray',
-            action='store_true',
-            help='Deprecated, use ``--distributed-executor-backend=ray``.')
         parser.add_argument('--pipeline-parallel-size',
                             '-pp',
                             type=int,
@@ -1071,7 +1066,6 @@ def create_engine_config(self,
         parallel_config = ParallelConfig(
             pipeline_parallel_size=self.pipeline_parallel_size,
             tensor_parallel_size=self.tensor_parallel_size,
-            worker_use_ray=self.worker_use_ray,
             max_parallel_loading_workers=self.max_parallel_loading_workers,
             disable_custom_all_reduce=self.disable_custom_all_reduce,
             tokenizer_pool_config=TokenizerPoolConfig.create_config(
diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py
index f7ce21d0ae988..b771c190dd82a 100644
--- a/vllm/engine/metrics.py
+++ b/vllm/engine/metrics.py
@@ -259,21 +259,6 @@ def __init__(self, labelnames: List[str], vllm_config: VllmConfig):
             documentation="Number of emitted tokens.",
             labelnames=labelnames))
 
-        # Deprecated in favor of vllm:prompt_tokens_total
-        self.gauge_avg_prompt_throughput = self._gauge_cls(
-            name="vllm:avg_prompt_throughput_toks_per_s",
-            documentation="Average prefill throughput in tokens/s.",
-            labelnames=labelnames,
-            multiprocess_mode="sum",
-        )
-        # Deprecated in favor of vllm:generation_tokens_total
-        self.gauge_avg_generation_throughput = self._gauge_cls(
-            name="vllm:avg_generation_throughput_toks_per_s",
-            documentation="Average generation throughput in tokens/s.",
-            labelnames=labelnames,
-            multiprocess_mode="sum",
-        )
-
 
 # end-metrics-definitions
 
@@ -635,20 +620,6 @@ def _log_prometheus(self, stats: Stats) -> None:
         self._log_histogram(self.metrics.histogram_max_tokens_request,
                             stats.max_tokens_requests)
 
-    def _log_prometheus_interval(self, prompt_throughput: float,
-                                 generation_throughput: float) -> None:
-        # Logs metrics to prometheus that are computed every logging_interval.
-        # Support legacy gauge metrics that make throughput calculations on
-        # the vLLM side. Moving forward, we should use counters like
-        # counter_prompt_tokens, counter_generation_tokens
-        # Which log raw data and calculate summaries using rate() on the
-        # grafana/prometheus side. See
-        # https://github.com/vllm-project/vllm/pull/2316#discussion_r1464204666
-        self.metrics.gauge_avg_prompt_throughput.labels(
-            **self.labels).set(prompt_throughput)
-        self.metrics.gauge_avg_generation_throughput.labels(
-            **self.labels).set(generation_throughput)
-
     def log(self, stats: Stats):
         """Logs to prometheus and tracked stats every iteration."""
         # Log to prometheus.
@@ -664,20 +635,6 @@ def log(self, stats: Stats):
         # Log locally every local_interval seconds.
         if local_interval_elapsed(stats.now, self.last_local_log,
                                   self.local_interval):
-            # Compute summary metrics for tracked stats (and log them
-            # to promethus if applicable).
-            prompt_throughput = get_throughput(self.num_prompt_tokens,
-                                               now=stats.now,
-                                               last_log=self.last_local_log)
-            generation_throughput = get_throughput(
-                self.num_generation_tokens,
-                now=stats.now,
-                last_log=self.last_local_log)
-
-            self._log_prometheus_interval(
-                prompt_throughput=prompt_throughput,
-                generation_throughput=generation_throughput)
-
             if self.spec_decode_metrics is not None:
                 self._log_gauge(
                     self.metrics.gauge_spec_decode_draft_acceptance_rate,

From 3132a933b65d8ed3383e082264c682940d92d803 Mon Sep 17 00:00:00 2001
From: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Date: Fri, 24 Jan 2025 15:20:59 -0500
Subject: [PATCH 373/462] [Bugfix][Kernel] FA3 Fix - RuntimeError: This flash
 attention build only supports pack_gqa (for build size reasons). (#12405)

Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2f9da6fa3e1d3..c954731bf94ef 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -576,7 +576,7 @@ else()
   FetchContent_Declare(
           vllm-flash-attn
           GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
-          GIT_TAG 0aff05f577e8a10086066a00618609199b25231d
+          GIT_TAG 9732b0ce005d1e6216864788502d5570004678f5
           GIT_PROGRESS TRUE
           # Don't share the vllm-flash-attn build between build types
           BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn

From 221d388cc5a836fa189305785ed7e887cea8b510 Mon Sep 17 00:00:00 2001
From: ElizaWszola <eliza@neuralmagic.com>
Date: Fri, 24 Jan 2025 20:49:28 -0500
Subject: [PATCH 374/462] [Bugfix][Kernel] Fix moe align block issue for
 mixtral (#12413)

---
 csrc/moe/moe_align_sum_kernels.cu | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/csrc/moe/moe_align_sum_kernels.cu b/csrc/moe/moe_align_sum_kernels.cu
index d609ce1697df3..8b6fe72ad743b 100644
--- a/csrc/moe/moe_align_sum_kernels.cu
+++ b/csrc/moe/moe_align_sum_kernels.cu
@@ -33,7 +33,9 @@ __global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids,
 
   extern __shared__ int32_t shared_mem[];
   int32_t* cumsum = shared_mem;  // 1d tensor with shape (num_experts + 1)
-  token_cnts_t* tokens_cnts = (token_cnts_t*)(shared_mem + blockDim.x + 1);
+  token_cnts_t* tokens_cnts =
+      (token_cnts_t*)(shared_mem + num_experts +
+                      1);  // 2d tensor with shape (blockDim.x + 1, num_experts)
 
   for (int i = 0; i < num_experts; ++i) {
     tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0;

From fb30ee92eefec7eacc0d7483f9d07daa1206530f Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Sat, 25 Jan 2025 11:42:42 +0800
Subject: [PATCH 375/462] [Bugfix] Fix BLIP-2 processing (#12412)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 vllm/model_executor/models/blip2.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py
index 09c5087c2dc36..b559ac677a740 100644
--- a/vllm/model_executor/models/blip2.py
+++ b/vllm/model_executor/models/blip2.py
@@ -481,14 +481,14 @@ def _get_prompt_replacements(
         bos_token_id = tokenizer.bos_token_id
         assert isinstance(bos_token_id, int)
 
-        image_token_id = vocab["image"]
+        image_token_id = vocab["<image>"]
         num_image_tokens = self.info.get_num_image_tokens()
         image_tokens = [image_token_id] * num_image_tokens
 
         return [
             PromptReplacement(
                 modality="image",
-                target="</s>",
+                target=[bos_token_id],
                 replacement=PromptReplacementDetails(
                     full=image_tokens + [bos_token_id],
                     features=image_tokens,

From bf21481ddef2fa9bb96c13ba1f80072abdae3eb7 Mon Sep 17 00:00:00 2001
From: Divakar Verma <137818590+divakar-amd@users.noreply.github.com>
Date: Fri, 24 Jan 2025 22:17:19 -0600
Subject: [PATCH 376/462] [ROCm][MoE] MI300 tuned configs Mixtral-8x(7B,22B) |
 fp16, fp8 (#12408)

Signed-off-by: Divakar Verma <divakar.verma@amd.com>
---
 ...me=AMD_Instinct_MI300X,dtype=fp8_w8a8.json | 164 ++++++++++++++
 ...14336,device_name=AMD_Instinct_MI300X.json |  64 +++---
 ...me=AMD_Instinct_MI300X,dtype=fp8_w8a8.json | 164 ++++++++++++++
 ...16384,device_name=AMD_Instinct_MI300X.json | 200 ++++++++++++++++++
 ...me=AMD_Instinct_MI300X,dtype=fp8_w8a8.json | 164 ++++++++++++++
 ...=1792,device_name=AMD_Instinct_MI300X.json |  86 ++++----
 ...me=AMD_Instinct_MI300X,dtype=fp8_w8a8.json | 164 ++++++++++++++
 ...=2048,device_name=AMD_Instinct_MI300X.json | 200 ++++++++++++++++++
 ...me=AMD_Instinct_MI300X,dtype=fp8_w8a8.json | 164 ++++++++++++++
 ...=3584,device_name=AMD_Instinct_MI300X.json |  70 +++---
 ...me=AMD_Instinct_MI300X,dtype=fp8_w8a8.json | 164 ++++++++++++++
 ...=4096,device_name=AMD_Instinct_MI300X.json | 200 ++++++++++++++++++
 ...me=AMD_Instinct_MI300X,dtype=fp8_w8a8.json | 164 ++++++++++++++
 ...=7168,device_name=AMD_Instinct_MI300X.json |  76 +++----
 ...me=AMD_Instinct_MI300X,dtype=fp8_w8a8.json | 164 ++++++++++++++
 ...=8192,device_name=AMD_Instinct_MI300X.json | 200 ++++++++++++++++++
 16 files changed, 2260 insertions(+), 148 deletions(-)
 create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json
 create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json
 create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json
 create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json
 create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json
 create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json
 create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json
 create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json
 create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json
 create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json
 create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json
 create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json

diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000..b6f1d01f88652
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json
@@ -0,0 +1,164 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "512": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json
index 4d4b752fa5d64..66f9106bd1be3 100644
--- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json
@@ -1,21 +1,21 @@
 {
     "1": {
         "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 256,
         "GROUP_SIZE_M": 1,
         "num_warps": 2,
         "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
-        "kpack": 1
+        "kpack": 2
     },
     "2": {
         "BLOCK_SIZE_M": 16,
         "BLOCK_SIZE_N": 16,
-        "BLOCK_SIZE_K": 128,
+        "BLOCK_SIZE_K": 256,
         "GROUP_SIZE_M": 1,
-        "num_warps": 2,
+        "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
@@ -23,10 +23,10 @@
     },
     "4": {
         "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 256,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 1,
-        "num_warps": 2,
+        "num_warps": 1,
         "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
@@ -34,10 +34,10 @@
     },
     "8": {
         "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 16,
-        "BLOCK_SIZE_K": 256,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
         "GROUP_SIZE_M": 1,
-        "num_warps": 1,
+        "num_warps": 2,
         "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
@@ -48,7 +48,7 @@
         "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 256,
         "GROUP_SIZE_M": 1,
-        "num_warps": 4,
+        "num_warps": 2,
         "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
@@ -56,10 +56,10 @@
     },
     "24": {
         "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 256,
         "GROUP_SIZE_M": 1,
-        "num_warps": 1,
+        "num_warps": 2,
         "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
@@ -67,32 +67,32 @@
     },
     "32": {
         "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 16,
-        "BLOCK_SIZE_K": 128,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
         "GROUP_SIZE_M": 4,
         "num_warps": 2,
         "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
-        "kpack": 1
+        "kpack": 2
     },
     "48": {
         "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_N": 64,
         "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 4,
-        "num_warps": 2,
+        "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
-        "kpack": 2
+        "kpack": 1
     },
     "64": {
         "BLOCK_SIZE_M": 32,
         "BLOCK_SIZE_N": 64,
         "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 4,
-        "num_warps": 8,
+        "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
@@ -100,24 +100,24 @@
     },
     "96": {
         "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
         "GROUP_SIZE_M": 4,
-        "num_warps": 4,
+        "num_warps": 8,
         "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
-        "kpack": 2
+        "kpack": 1
     },
     "128": {
         "BLOCK_SIZE_M": 64,
         "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 64,
+        "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 4,
-        "num_warps": 8,
+        "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0,
-        "matrix_instr_nonkdim": 16,
+        "matrix_instr_nonkdim": 32,
         "kpack": 2
     },
     "256": {
@@ -129,7 +129,7 @@
         "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
-        "kpack": 1
+        "kpack": 2
     },
     "512": {
         "BLOCK_SIZE_M": 128,
@@ -150,7 +150,7 @@
         "num_warps": 8,
         "num_stages": 2,
         "waves_per_eu": 0,
-        "matrix_instr_nonkdim": 32,
+        "matrix_instr_nonkdim": 16,
         "kpack": 2
     },
     "1536": {
@@ -184,7 +184,7 @@
         "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
-        "kpack": 1
+        "kpack": 2
     },
     "4096": {
         "BLOCK_SIZE_M": 128,
@@ -195,6 +195,6 @@
         "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
-        "kpack": 1
+        "kpack": 2
     }
 }
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000..0e5fd1eec77d7
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json
@@ -0,0 +1,164 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json
new file mode 100644
index 0000000000000..d6ad63509f157
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json
@@ -0,0 +1,200 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 1,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "48": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000..8323f512db015
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json
@@ -0,0 +1,164 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json
index a218fc40642c1..1b46cb5716514 100644
--- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json
@@ -1,10 +1,10 @@
 {
     "1": {
         "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 256,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 1,
-        "num_warps": 2,
+        "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
@@ -19,14 +19,14 @@
         "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
-        "kpack": 1
+        "kpack": 2
     },
     "4": {
         "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 1,
-        "num_warps": 4,
+        "num_warps": 8,
         "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
@@ -34,76 +34,76 @@
     },
     "8": {
         "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 16,
-        "BLOCK_SIZE_K": 256,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 1,
-        "num_warps": 2,
+        "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
-        "kpack": 2
+        "kpack": 1
     },
     "16": {
-        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_M": 16,
         "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 64,
+        "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 1,
-        "num_warps": 8,
+        "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
-        "kpack": 1
+        "kpack": 2
     },
     "24": {
         "BLOCK_SIZE_M": 16,
         "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 64,
+        "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 1,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
-        "kpack": 1
+        "kpack": 2
     },
     "32": {
         "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 16,
-        "BLOCK_SIZE_K": 256,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 4,
-        "num_warps": 2,
+        "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
-        "kpack": 2
+        "kpack": 1
     },
     "48": {
         "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 2,
         "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
         "kpack": 2
     },
     "64": {
-        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_M": 32,
         "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 64,
+        "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 1,
-        "num_warps": 2,
+        "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
-        "kpack": 1
+        "kpack": 2
     },
     "96": {
         "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 4,
-        "num_warps": 4,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
         "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
@@ -112,24 +112,24 @@
     "128": {
         "BLOCK_SIZE_M": 64,
         "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 64,
+        "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 4,
         "num_warps": 8,
         "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
-        "kpack": 1
+        "kpack": 2
     },
     "256": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 4,
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
         "num_warps": 8,
         "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
-        "kpack": 1
+        "kpack": 2
     },
     "512": {
         "BLOCK_SIZE_M": 64,
@@ -151,7 +151,7 @@
         "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
-        "kpack": 1
+        "kpack": 2
     },
     "1536": {
         "BLOCK_SIZE_M": 128,
@@ -162,7 +162,7 @@
         "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
-        "kpack": 1
+        "kpack": 2
     },
     "2048": {
         "BLOCK_SIZE_M": 128,
@@ -184,7 +184,7 @@
         "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
-        "kpack": 1
+        "kpack": 2
     },
     "4096": {
         "BLOCK_SIZE_M": 128,
@@ -195,6 +195,6 @@
         "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
-        "kpack": 1
+        "kpack": 2
     }
 }
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000..81bb765d30031
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json
@@ -0,0 +1,164 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json
new file mode 100644
index 0000000000000..811c77ab41093
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json
@@ -0,0 +1,200 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "16": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 32,
+        "kpack": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000..379ca107a9469
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json
@@ -0,0 +1,164 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json
index 3682cc548f352..ed5b655d89937 100644
--- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json
@@ -1,21 +1,21 @@
 {
     "1": {
         "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 16,
-        "BLOCK_SIZE_K": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
         "GROUP_SIZE_M": 1,
-        "num_warps": 2,
+        "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
-        "kpack": 1
+        "kpack": 2
     },
     "2": {
         "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 16,
-        "BLOCK_SIZE_K": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 1,
-        "num_warps": 2,
+        "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
@@ -23,10 +23,10 @@
     },
     "4": {
         "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 256,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 1,
-        "num_warps": 2,
+        "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
@@ -34,7 +34,7 @@
     },
     "8": {
         "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 256,
         "GROUP_SIZE_M": 1,
         "num_warps": 2,
@@ -52,32 +52,32 @@
         "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
-        "kpack": 2
+        "kpack": 1
     },
     "24": {
         "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
         "GROUP_SIZE_M": 1,
-        "num_warps": 4,
+        "num_warps": 2,
         "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
-        "kpack": 1
+        "kpack": 2
     },
     "32": {
         "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_N": 32,
         "BLOCK_SIZE_K": 256,
         "GROUP_SIZE_M": 4,
         "num_warps": 2,
         "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
-        "kpack": 2
+        "kpack": 1
     },
     "48": {
-        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_M": 32,
         "BLOCK_SIZE_N": 32,
         "BLOCK_SIZE_K": 256,
         "GROUP_SIZE_M": 1,
@@ -85,7 +85,7 @@
         "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
-        "kpack": 2
+        "kpack": 1
     },
     "64": {
         "BLOCK_SIZE_M": 32,
@@ -101,40 +101,40 @@
     "96": {
         "BLOCK_SIZE_M": 32,
         "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 128,
+        "BLOCK_SIZE_K": 256,
         "GROUP_SIZE_M": 4,
-        "num_warps": 4,
+        "num_warps": 2,
         "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
-        "kpack": 1
+        "kpack": 2
     },
     "128": {
         "BLOCK_SIZE_M": 64,
         "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 4,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
         "num_warps": 8,
         "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
-        "kpack": 1
+        "kpack": 2
     },
     "256": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 4,
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
         "num_warps": 8,
         "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
-        "kpack": 1
+        "kpack": 2
     },
     "512": {
         "BLOCK_SIZE_M": 64,
         "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
+        "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 1,
         "num_warps": 8,
         "num_stages": 2,
@@ -151,7 +151,7 @@
         "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
-        "kpack": 1
+        "kpack": 2
     },
     "1536": {
         "BLOCK_SIZE_M": 128,
@@ -173,7 +173,7 @@
         "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
-        "kpack": 1
+        "kpack": 2
     },
     "3072": {
         "BLOCK_SIZE_M": 128,
@@ -195,6 +195,6 @@
         "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
-        "kpack": 1
+        "kpack": 2
     }
 }
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000..48bb5f2ccb8e3
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json
@@ -0,0 +1,164 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json
new file mode 100644
index 0000000000000..a64d06c6d1724
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json
@@ -0,0 +1,200 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "48": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 32,
+        "kpack": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000..bd2c6fbc1b941
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json
@@ -0,0 +1,164 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "16": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "48": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json
index 21742854c613f..822f04e33e879 100644
--- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json
@@ -1,7 +1,7 @@
 {
     "1": {
         "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_N": 32,
         "BLOCK_SIZE_K": 256,
         "GROUP_SIZE_M": 1,
         "num_warps": 2,
@@ -12,54 +12,54 @@
     },
     "2": {
         "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 256,
         "GROUP_SIZE_M": 1,
-        "num_warps": 4,
+        "num_warps": 2,
         "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
-        "kpack": 1
+        "kpack": 2
     },
     "4": {
         "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 128,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 256,
         "GROUP_SIZE_M": 1,
-        "num_warps": 4,
+        "num_warps": 2,
         "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
-        "kpack": 1
+        "kpack": 2
     },
     "8": {
         "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 256,
         "GROUP_SIZE_M": 1,
-        "num_warps": 2,
+        "num_warps": 1,
         "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
-        "kpack": 1
+        "kpack": 2
     },
     "16": {
         "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 16,
-        "BLOCK_SIZE_K": 256,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
         "GROUP_SIZE_M": 1,
-        "num_warps": 4,
+        "num_warps": 2,
         "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
-        "kpack": 2
+        "kpack": 1
     },
     "24": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 128,
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 256,
         "GROUP_SIZE_M": 1,
-        "num_warps": 8,
+        "num_warps": 1,
         "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
@@ -68,7 +68,7 @@
     "32": {
         "BLOCK_SIZE_M": 16,
         "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 128,
+        "BLOCK_SIZE_K": 256,
         "GROUP_SIZE_M": 4,
         "num_warps": 2,
         "num_stages": 2,
@@ -78,32 +78,32 @@
     },
     "48": {
         "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 4,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
         "num_warps": 2,
         "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
-        "kpack": 1
+        "kpack": 2
     },
     "64": {
         "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_N": 64,
         "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 4,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
-        "kpack": 2
+        "kpack": 1
     },
     "96": {
         "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
         "GROUP_SIZE_M": 4,
-        "num_warps": 4,
+        "num_warps": 8,
         "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
@@ -112,18 +112,18 @@
     "128": {
         "BLOCK_SIZE_M": 64,
         "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 64,
+        "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 4,
         "num_warps": 8,
         "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
-        "kpack": 1
+        "kpack": 2
     },
     "256": {
         "BLOCK_SIZE_M": 128,
         "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
+        "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 4,
         "num_warps": 8,
         "num_stages": 2,
@@ -140,7 +140,7 @@
         "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
-        "kpack": 1
+        "kpack": 2
     },
     "1024": {
         "BLOCK_SIZE_M": 128,
@@ -151,7 +151,7 @@
         "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
-        "kpack": 1
+        "kpack": 2
     },
     "1536": {
         "BLOCK_SIZE_M": 128,
@@ -173,7 +173,7 @@
         "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
-        "kpack": 1
+        "kpack": 2
     },
     "3072": {
         "BLOCK_SIZE_M": 128,
@@ -187,7 +187,7 @@
         "kpack": 2
     },
     "4096": {
-        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_M": 256,
         "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 64,
         "GROUP_SIZE_M": 1,
@@ -195,6 +195,6 @@
         "num_stages": 2,
         "waves_per_eu": 0,
         "matrix_instr_nonkdim": 16,
-        "kpack": 1
+        "kpack": 2
     }
 }
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000..cd4fb8f11b935
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json
@@ -0,0 +1,164 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 1,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 1,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json
new file mode 100644
index 0000000000000..cf66868e9d57a
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json
@@ -0,0 +1,200 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 1,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 1,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "48": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    }
+}

From f1fc0510dfbb11c98f41d02a44e092785c626314 Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Sat, 25 Jan 2025 15:07:35 +0800
Subject: [PATCH 377/462] [Misc] Add FA2 support to ViT MHA layer (#12355)

Signed-off-by: Isotr0py <2037008807@qq.com>
---
 tests/kernels/test_mha_attn.py | 126 +++++++++++++++++++++++++++++++++
 vllm/attention/layer.py        |  25 +++++--
 2 files changed, 146 insertions(+), 5 deletions(-)
 create mode 100644 tests/kernels/test_mha_attn.py

diff --git a/tests/kernels/test_mha_attn.py b/tests/kernels/test_mha_attn.py
new file mode 100644
index 0000000000000..22d434f5e40ef
--- /dev/null
+++ b/tests/kernels/test_mha_attn.py
@@ -0,0 +1,126 @@
+"""
+Test:
+
+* Tests for MultiHeadAttention layer
+"""
+from unittest.mock import patch
+
+import pytest
+import torch
+
+from vllm.attention.layer import MultiHeadAttention
+from vllm.attention.selector import _Backend, _cached_get_attn_backend
+from vllm.platforms import current_platform
+from vllm.platforms.cpu import CpuPlatform
+from vllm.platforms.cuda import CudaPlatform
+from vllm.platforms.rocm import RocmPlatform
+
+
+@pytest.fixture(autouse=True)
+def clear_cache():
+    """Clear lru cache to ensure each test case runs without caching.
+    """
+    _cached_get_attn_backend.cache_clear()
+
+
+@pytest.mark.parametrize("device", ["cpu", "hip", "cuda"])
+def test_mha_attn_platform(device: str):
+    """
+    Test that the attention selector between different platform and device.
+    """
+    torch.set_default_dtype(torch.float16)
+
+    if device == "cpu":
+        with patch("vllm.attention.selector.current_platform", CpuPlatform()):
+            attn = MultiHeadAttention(16, 64, scale=1)
+            assert attn.attn_backend == _Backend.TORCH_SDPA
+    elif device == "hip":
+        with patch("vllm.attention.selector.current_platform", RocmPlatform()):
+            attn = MultiHeadAttention(16, 64, scale=1)
+            assert attn.attn_backend == _Backend.TORCH_SDPA
+    else:
+        with patch("vllm.attention.selector.current_platform", CudaPlatform()):
+            attn = MultiHeadAttention(16, 64, scale=1)
+            assert attn.attn_backend == _Backend.FLASH_ATTN
+
+        with patch("vllm.attention.selector.current_platform", CudaPlatform()):
+            attn = MultiHeadAttention(16, 72, scale=1)
+            assert attn.attn_backend == _Backend.XFORMERS
+
+
+def ref_attention(
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    scale: float,
+) -> torch.Tensor:
+    """
+    Native implementation of scaled dot product attention without mask:
+    - query, key, value: [batch_size, seq_len, num_heads, head_size]
+    - attn_mask: [batch_size, seq_len, seq_len]
+    """
+    query, key, value = (x.transpose(1, 2) for x in (query, key, value))
+    attn_weights = scale * torch.matmul(query, key.transpose(2, 3))
+    attn_weights = torch.softmax(attn_weights, dim=-1).to(value.dtype)
+    out = torch.matmul(attn_weights, value).transpose(1, 2)
+    return out
+
+
+BATCH_SIZES = [1, 16]
+SEQ_LENS = [1]
+NUM_HEADS = [1, 16]
+NUM_KV_HEADS = [1]
+HEAD_SIZES = [64, 80]
+# flshattF and tritonflashattF supported: {torch.float16, torch.bfloat16}
+DTYPES = [
+    torch.half, torch.bfloat16, torch.float
+] if not current_platform.is_rocm() else [torch.half, torch.bfloat16]
+CUDA_DEVICES = ["cuda"]
+
+
+@pytest.mark.parametrize("batch_size", BATCH_SIZES)
+@pytest.mark.parametrize("seq_len", SEQ_LENS)
+@pytest.mark.parametrize("num_heads", NUM_HEADS)
+@pytest.mark.parametrize("num_kv_heads", NUM_KV_HEADS)
+@pytest.mark.parametrize("head_size", HEAD_SIZES)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+def test_mha_attn_forward(
+    batch_size: int,
+    seq_len: int,
+    num_heads: int,
+    num_kv_heads: int,
+    head_size: int,
+    dtype: torch.dtype,
+    device: str,
+):
+    current_platform.seed_everything(0)
+    torch.set_default_device(device)
+    torch.set_default_dtype(dtype)
+
+    q = torch.randn(batch_size, seq_len, num_heads * head_size)
+    k = torch.randn(batch_size, seq_len, num_kv_heads * head_size)
+    v = torch.randn(batch_size, seq_len, num_kv_heads * head_size)
+    scale = 1.0 / head_size**0.5
+    attn = MultiHeadAttention(num_heads,
+                              head_size,
+                              scale=scale,
+                              num_kv_heads=num_kv_heads)
+    output = attn(q, k, v)
+
+    assert num_heads % num_kv_heads == 0
+    num_queries_per_kv = num_heads // num_kv_heads
+    q = q.reshape(batch_size, seq_len, num_heads, head_size)
+    k = k.reshape(batch_size, seq_len, num_kv_heads, head_size)
+    v = v.reshape(batch_size, seq_len, num_kv_heads, head_size)
+    if num_queries_per_kv > 1:
+        k = torch.repeat_interleave(k, num_queries_per_kv, dim=2)
+        v = torch.repeat_interleave(v, num_queries_per_kv, dim=2)
+
+    ref_output = ref_attention(
+        q,
+        k,
+        v,
+        scale=scale,
+    ).reshape(batch_size, seq_len, num_heads * head_size)
+    torch.testing.assert_close(output, ref_output)
diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index 79ea9b666c7e8..a90bb4fbf5ab3 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -210,6 +210,9 @@ def __init__(
         self.scale = scale
         self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads
 
+        assert self.num_heads % self.num_kv_heads == 0
+        self.num_queries_per_kv = self.num_heads // self.num_kv_heads
+
         dtype = torch.get_default_dtype()
         attn_backend = get_attn_backend(head_size,
                                         dtype,
@@ -217,11 +220,12 @@ def __init__(
                                         block_size=16,
                                         is_attention_free=False)
         backend = backend_name_to_enum(attn_backend.get_name())
-        if backend in {_Backend.FLASH_ATTN, _Backend.FLASH_ATTN_VLLM_V1}:
-            backend = _Backend.XFORMERS
 
         self.attn_backend = backend if backend in {
-            _Backend.TORCH_SDPA, _Backend.XFORMERS
+            _Backend.TORCH_SDPA,
+            _Backend.XFORMERS,
+            _Backend.FLASH_ATTN,
+            _Backend.FLASH_ATTN_VLLM_V1,
         } else _Backend.TORCH_SDPA
 
     def forward(
@@ -231,7 +235,6 @@ def forward(
         value: torch.Tensor,
     ) -> torch.Tensor:
         """Input shape: batch_size x seq_len x hidden_size"""
-        # TODO(Isotr0py): Use existing backend implementations and support FA2
         bsz, q_len, _ = query.size()
         kv_len = key.size(1)
 
@@ -239,7 +242,19 @@ def forward(
         key = key.view(bsz, kv_len, self.num_kv_heads, self.head_size)
         value = value.view(bsz, kv_len, self.num_kv_heads, self.head_size)
 
-        if self.attn_backend == _Backend.XFORMERS:
+        if (num_repeat := self.num_queries_per_kv) > 1:
+            # Handle MQA and GQA
+            key = torch.repeat_interleave(key, num_repeat, dim=2)
+            value = torch.repeat_interleave(value, num_repeat, dim=2)
+
+        if self.attn_backend in {
+                _Backend.FLASH_ATTN,
+                _Backend.FLASH_ATTN_VLLM_V1,
+        }:
+            from vllm.vllm_flash_attn import flash_attn_func
+
+            out = flash_attn_func(query, key, value, softmax_scale=self.scale)
+        elif self.attn_backend == _Backend.XFORMERS:
             from xformers import ops as xops
 
             out = xops.memory_efficient_attention_forward(query,

From 324960a95c00112ce6b9b858d9311da1597cfb8b Mon Sep 17 00:00:00 2001
From: Siyuan Liu <lsiyuan@google.com>
Date: Fri, 24 Jan 2025 23:23:03 -0800
Subject: [PATCH 378/462] [TPU][CI] Update torchxla version in
 requirement-tpu.txt (#12422)

Signed-off-by: Siyuan Liu <lsiyuan@google.com>
---
 Dockerfile.tpu       |  2 +-
 requirements-tpu.txt | 21 +++++++++++----------
 2 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/Dockerfile.tpu b/Dockerfile.tpu
index ee0d94d98e82b..e268b39476665 100644
--- a/Dockerfile.tpu
+++ b/Dockerfile.tpu
@@ -1,4 +1,4 @@
-ARG NIGHTLY_DATE="20250122"
+ARG NIGHTLY_DATE="20250124"
 ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_$NIGHTLY_DATE"
 
 FROM $BASE_IMAGE
diff --git a/requirements-tpu.txt b/requirements-tpu.txt
index 8ab18b3770ae8..51a0c65eac5aa 100644
--- a/requirements-tpu.txt
+++ b/requirements-tpu.txt
@@ -10,16 +10,17 @@ wheel
 jinja2
 ray[default]
 
-# Install torch_xla
---pre
---extra-index-url https://download.pytorch.org/whl/nightly/cpu
+# Install torch, torch_xla
 --find-links https://storage.googleapis.com/libtpu-releases/index.html
 --find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html
 --find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
-torch==2.6.0.dev20241126+cpu
-torchvision==0.20.0.dev20241126+cpu
-torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev20241126-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
-torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev20241126-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
-torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev20241126-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
-jaxlib==0.4.36.dev20241122
-jax==0.4.36.dev20241122
+# Note: This torch whl can be slightly different from the official torch nightly whl
+# since they are not built on the same commit (but on the same day). This difference may cause C++ undefined symbol issue
+# if some change between the 2 commits introduce some C++ API change.
+# Here we install the exact torch whl from which torch_xla is built from, to avoid potential C++ undefined symbol issue.
+torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.7.0.dev20250124-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
+torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.7.0.dev20250124-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
+torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.7.0.dev20250124-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
+torch_xla[pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250124-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
+torch_xla[pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250124-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
+torch_xla[pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250124-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"

From 2a0309a646b1ed83a0c40974e08c8dc628726d3c Mon Sep 17 00:00:00 2001
From: Roger Wang <136131678+ywang96@users.noreply.github.com>
Date: Sat, 25 Jan 2025 21:00:31 -0800
Subject: [PATCH 379/462] [Misc][Bugfix] FA3 support to ViT MHA layer (#12435)

Signed-off-by: Roger Wang <ywang@roblox.com>
Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
---
 vllm/attention/layer.py | 25 ++++++++++++++++++++++---
 1 file changed, 22 insertions(+), 3 deletions(-)

diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index a90bb4fbf5ab3..db682b4ac63b0 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -251,9 +251,28 @@ def forward(
                 _Backend.FLASH_ATTN,
                 _Backend.FLASH_ATTN_VLLM_V1,
         }:
-            from vllm.vllm_flash_attn import flash_attn_func
-
-            out = flash_attn_func(query, key, value, softmax_scale=self.scale)
+            from vllm.vllm_flash_attn import flash_attn_varlen_func
+
+            cu_seqlens_q = torch.arange(0, (bsz + 1) * q_len,
+                                        step=q_len,
+                                        dtype=torch.int32,
+                                        device=query.device)
+            cu_seqlens_k = torch.arange(0, (bsz + 1) * kv_len,
+                                        step=kv_len,
+                                        dtype=torch.int32,
+                                        device=key.device)
+
+            out = flash_attn_varlen_func(
+                query.flatten(0, 1),
+                key.flatten(0, 1),
+                value.flatten(0, 1),
+                cu_seqlens_q=cu_seqlens_q,
+                cu_seqlens_k=cu_seqlens_k,
+                max_seqlen_q=q_len,
+                max_seqlen_k=kv_len,
+                softmax_scale=self.scale,
+            )
+            out = out.reshape(bsz, q_len, -1)
         elif self.attn_backend == _Backend.XFORMERS:
             from xformers import ops as xops
 

From fa63e710c7fbaae3a445f669d3b5ba6b9a4ef412 Mon Sep 17 00:00:00 2001
From: Keyun Tong <tongkeyun@gmail.com>
Date: Sun, 26 Jan 2025 00:42:37 -0800
Subject: [PATCH 380/462] [V1][Perf] Reduce scheduling overhead in model runner
 after cuda sync (#12094)

Signed-off-by: Keyun Tong <tongkeyun@gmail.com>
---
 vllm/v1/outputs.py                 |  2 +-
 vllm/v1/sample/sampler.py          |  3 +--
 vllm/v1/worker/gpu_model_runner.py | 29 +++++++++++++++++++----------
 3 files changed, 21 insertions(+), 13 deletions(-)

diff --git a/vllm/v1/outputs.py b/vllm/v1/outputs.py
index acc3a944e21b9..32aee44e3f374 100644
--- a/vllm/v1/outputs.py
+++ b/vllm/v1/outputs.py
@@ -8,7 +8,7 @@
 class SamplerOutput:
 
     # [num_reqs]
-    sampled_token_ids: List[int]
+    sampled_token_ids: torch.Tensor
 
     # [num_reqs, max_num_logprobs + 1]
     logprob_token_ids: Optional[torch.Tensor]
diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py
index 7cd42ca211a22..9ad665a64894c 100644
--- a/vllm/v1/sample/sampler.py
+++ b/vllm/v1/sample/sampler.py
@@ -50,9 +50,8 @@ def forward(
         # Use int32 to reduce the tensor size.
         sampled = sampled.to(torch.int32)
 
-        # NOTE: CPU-GPU synchronization happens here.
         sampler_output = SamplerOutput(
-            sampled_token_ids=sampled.tolist(),
+            sampled_token_ids=sampled,
             logprob_token_ids=topk_indices,
             logprobs=topk_logprobs,
             prompt_logprob_token_ids=None,
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 4b3c325ded906..6339f1f03f11d 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -775,10 +775,10 @@ def execute_model(
             sampling_metadata=sampling_metadata,
         )
 
-        sampled_token_ids = sampler_output.sampled_token_ids
         # TODO(woosuk): The following loop can be slow since it iterates over
         # the requests one by one. Optimize.
         num_reqs = self.input_batch.num_reqs
+        request_seq_lens: List[Tuple[int, CachedRequestState, int]] = []
         for i, req_id in enumerate(self.input_batch.req_ids[:num_reqs]):
             assert req_id is not None
             req_state = self.requests[req_id]
@@ -787,10 +787,10 @@ def execute_model(
             assert seq_len <= req_state.num_tokens
             if seq_len == req_state.num_tokens:
                 # Append the sampled token to the output token ids.
-                token_id = sampled_token_ids[i]
-                self.input_batch.token_ids_cpu[i, seq_len] = token_id
                 self.input_batch.num_tokens[i] += 1
-                req_state.output_token_ids.append(token_id)
+                # OPTIMIZATION: Priming the state updates for later updates.
+                req_state.output_token_ids.append(0)
+                request_seq_lens.append((i, req_state, seq_len))
             else:
                 # Ignore the sampled token from the partial request.
                 # Rewind the generator state as if the token was not sampled.
@@ -799,6 +799,21 @@ def execute_model(
                     # This relies on cuda-specific torch-internal impl details
                     generator.set_offset(generator.get_offset() - 4)
 
+        # num_reqs entries should be non-None
+        assert all(
+            req_id is not None for req_id in
+            self.input_batch.req_ids[:num_reqs]), "req_ids contains None"
+        req_ids = cast(List[str], self.input_batch.req_ids[:num_reqs])
+
+        # NOTE: GPU -> CPU Sync happens here.
+        # Move as many CPU operations as possible before this sync point.
+        sampled_token_ids = sampler_output.sampled_token_ids.tolist()
+        # Update with the actual token ids
+        for i, req_state, seq_len in request_seq_lens:
+            token_id = sampled_token_ids[i]
+            self.input_batch.token_ids_cpu[i, seq_len] = token_id
+            req_state.output_token_ids[-1] = token_id
+
         if sampler_output.logprob_token_ids is None:
             logprob_token_ids = None
         else:
@@ -808,12 +823,6 @@ def execute_model(
         else:
             logprobs = sampler_output.logprobs.cpu()
 
-        # num_reqs entries should be non-None
-        assert all(
-            req_id is not None for req_id in
-            self.input_batch.req_ids[:num_reqs]), "req_ids contains None"
-        req_ids = cast(List[str], self.input_batch.req_ids[:num_reqs])
-
         model_runner_output = ModelRunnerOutput(
             req_ids=req_ids,
             req_id_to_index=self.input_batch.req_id_to_index,

From 0ee349b5534e3d02b499b1126f2abde73b798fe9 Mon Sep 17 00:00:00 2001
From: Roger Wang <136131678+ywang96@users.noreply.github.com>
Date: Sun, 26 Jan 2025 00:47:42 -0800
Subject: [PATCH 381/462] [V1][Bugfix] Fix assertion when mm hashing is turned
 off (#12439)

Signed-off-by: Roger Wang <ywang@roblox.com>
---
 vllm/v1/request.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/v1/request.py b/vllm/v1/request.py
index eefcdaf29e753..2cfcd8b63ccb2 100644
--- a/vllm/v1/request.py
+++ b/vllm/v1/request.py
@@ -58,7 +58,8 @@ def __init__(
 
         # Sanity check
         assert len(self.mm_inputs) == len(self.mm_positions)
-        assert len(self.mm_inputs) == len(self.mm_hashes)
+        if self.mm_hashes:
+            assert len(self.mm_inputs) == len(self.mm_hashes)
 
         # Cache the computed kv block hashes of the request to avoid
         # recomputing.

From a5255270c3ad492b5def19fe38beb9b2df30e74f Mon Sep 17 00:00:00 2001
From: Roger Wang <136131678+ywang96@users.noreply.github.com>
Date: Sun, 26 Jan 2025 03:56:34 -0800
Subject: [PATCH 382/462] [Misc] Revert FA on ViT #12355 and #12435 (#12445)

---
 vllm/attention/layer.py | 41 ++++-------------------------------------
 1 file changed, 4 insertions(+), 37 deletions(-)

diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index db682b4ac63b0..da663d894aeb3 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -210,9 +210,6 @@ def __init__(
         self.scale = scale
         self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads
 
-        assert self.num_heads % self.num_kv_heads == 0
-        self.num_queries_per_kv = self.num_heads // self.num_kv_heads
-
         dtype = torch.get_default_dtype()
         attn_backend = get_attn_backend(head_size,
                                         dtype,
@@ -220,12 +217,12 @@ def __init__(
                                         block_size=16,
                                         is_attention_free=False)
         backend = backend_name_to_enum(attn_backend.get_name())
+        if backend in {_Backend.FLASH_ATTN, _Backend.FLASH_ATTN_VLLM_V1}:
+            backend = _Backend.XFORMERS
 
         self.attn_backend = backend if backend in {
             _Backend.TORCH_SDPA,
             _Backend.XFORMERS,
-            _Backend.FLASH_ATTN,
-            _Backend.FLASH_ATTN_VLLM_V1,
         } else _Backend.TORCH_SDPA
 
     def forward(
@@ -235,6 +232,7 @@ def forward(
         value: torch.Tensor,
     ) -> torch.Tensor:
         """Input shape: batch_size x seq_len x hidden_size"""
+        # TODO(Isotr0py): Use existing backend implementations and support FA3
         bsz, q_len, _ = query.size()
         kv_len = key.size(1)
 
@@ -242,38 +240,7 @@ def forward(
         key = key.view(bsz, kv_len, self.num_kv_heads, self.head_size)
         value = value.view(bsz, kv_len, self.num_kv_heads, self.head_size)
 
-        if (num_repeat := self.num_queries_per_kv) > 1:
-            # Handle MQA and GQA
-            key = torch.repeat_interleave(key, num_repeat, dim=2)
-            value = torch.repeat_interleave(value, num_repeat, dim=2)
-
-        if self.attn_backend in {
-                _Backend.FLASH_ATTN,
-                _Backend.FLASH_ATTN_VLLM_V1,
-        }:
-            from vllm.vllm_flash_attn import flash_attn_varlen_func
-
-            cu_seqlens_q = torch.arange(0, (bsz + 1) * q_len,
-                                        step=q_len,
-                                        dtype=torch.int32,
-                                        device=query.device)
-            cu_seqlens_k = torch.arange(0, (bsz + 1) * kv_len,
-                                        step=kv_len,
-                                        dtype=torch.int32,
-                                        device=key.device)
-
-            out = flash_attn_varlen_func(
-                query.flatten(0, 1),
-                key.flatten(0, 1),
-                value.flatten(0, 1),
-                cu_seqlens_q=cu_seqlens_q,
-                cu_seqlens_k=cu_seqlens_k,
-                max_seqlen_q=q_len,
-                max_seqlen_k=kv_len,
-                softmax_scale=self.scale,
-            )
-            out = out.reshape(bsz, q_len, -1)
-        elif self.attn_backend == _Backend.XFORMERS:
+        if self.attn_backend == _Backend.XFORMERS:
             from xformers import ops as xops
 
             out = xops.memory_efficient_attention_forward(query,

From 9ddc35220bee793eb445d9592a40bc4d3c081519 Mon Sep 17 00:00:00 2001
From: Matthew Hendrey <matthew.hendrey@gmail.com>
Date: Sun, 26 Jan 2025 06:59:25 -0500
Subject: [PATCH 383/462] [Frontend] generation_config.json for  maximum
 tokens(#12242)

Signed-off-by: Matthew Hendrey <matthew.hendrey@gmail.com>
Signed-off-by: Shangming Cai <caishangming@linux.alibaba.com>
Signed-off-by: youkaichao <youkaichao@gmail.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Signed-off-by: Yuan Tang <terrytangyuan@gmail.com>
Signed-off-by: Isotr0py <2037008807@qq.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Co-authored-by: shangmingc <caishangming@linux.alibaba.com>
Co-authored-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Yuan Tang <terrytangyuan@gmail.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
Co-authored-by: Chen Zhang <zhangch99@outlook.com>
Co-authored-by: wangxiyuan <wangxiyuan1007@gmail.com>
---
 tests/entrypoints/openai/test_serving_chat.py | 110 ++++++++++++++++++
 vllm/config.py                                |   6 +
 vllm/engine/arg_utils.py                      |   4 +-
 vllm/entrypoints/openai/protocol.py           |  34 ++++--
 4 files changed, 145 insertions(+), 9 deletions(-)

diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py
index 85f485364a411..e88d6c3c67829 100644
--- a/tests/entrypoints/openai/test_serving_chat.py
+++ b/tests/entrypoints/openai/test_serving_chat.py
@@ -103,6 +103,116 @@ def test_serving_chat_should_set_correct_max_tokens():
 
     assert mock_engine.generate.call_args.args[1].max_tokens == 10
 
+    # Setting server's max_tokens in the generation_config.json
+    # lower than context_window - prompt_tokens
+    mock_model_config = MockModelConfig()
+    mock_model_config.diff_sampling_param = {
+        "max_tokens": 10  # Setting server-side max_tokens limit
+    }
+
+    # Reinitialize the engine with new settings
+    mock_engine = MagicMock(spec=MQLLMEngineClient)
+    mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
+    mock_engine.errored = False
+
+    # Initialize the serving chat
+    models = OpenAIServingModels(engine_client=mock_engine,
+                                 base_model_paths=BASE_MODEL_PATHS,
+                                 model_config=mock_model_config)
+    serving_chat = OpenAIServingChat(mock_engine,
+                                     mock_model_config,
+                                     models,
+                                     response_role="assistant",
+                                     chat_template=CHAT_TEMPLATE,
+                                     chat_template_content_format="auto",
+                                     request_logger=None)
+
+    # Test Case 1: No max_tokens specified in request
+    req = ChatCompletionRequest(
+        model=MODEL_NAME,
+        messages=[{
+            "role": "user",
+            "content": "what is 1+1?"
+        }],
+        guided_decoding_backend="outlines",
+    )
+
+    with suppress(Exception):
+        asyncio.run(serving_chat.create_chat_completion(req))
+
+    assert mock_engine.generate.call_args.args[1].max_tokens == 10
+
+    # Test Case 2: Request's max_tokens set higher than server accepts
+    req.max_tokens = 15
+
+    with suppress(Exception):
+        asyncio.run(serving_chat.create_chat_completion(req))
+
+    assert mock_engine.generate.call_args.args[1].max_tokens == 10
+
+    # Test Case 3: Request's max_tokens set lower than server accepts
+    req.max_tokens = 5
+
+    with suppress(Exception):
+        asyncio.run(serving_chat.create_chat_completion(req))
+
+    assert mock_engine.generate.call_args.args[1].max_tokens == 5
+
+    # Setting server's max_tokens in the generation_config.json
+    # higher than context_window - prompt_tokens
+    mock_model_config = MockModelConfig()
+    mock_model_config.diff_sampling_param = {
+        "max_tokens": 200  # Setting server-side max_tokens limit
+    }
+
+    # Reinitialize the engine with new settings
+    mock_engine = MagicMock(spec=MQLLMEngineClient)
+    mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
+    mock_engine.errored = False
+
+    # Initialize the serving chat
+    models = OpenAIServingModels(engine_client=mock_engine,
+                                 base_model_paths=BASE_MODEL_PATHS,
+                                 model_config=mock_model_config)
+    serving_chat = OpenAIServingChat(mock_engine,
+                                     mock_model_config,
+                                     models,
+                                     response_role="assistant",
+                                     chat_template=CHAT_TEMPLATE,
+                                     chat_template_content_format="auto",
+                                     request_logger=None)
+
+    # Test case 1: No max_tokens specified, defaults to context_window
+    req = ChatCompletionRequest(
+        model=MODEL_NAME,
+        messages=[{
+            "role": "user",
+            "content": "what is 1+1?"
+        }],
+        guided_decoding_backend="outlines",
+    )
+
+    with suppress(Exception):
+        asyncio.run(serving_chat.create_chat_completion(req))
+
+    assert mock_engine.generate.call_args.args[1].max_tokens == 93
+
+    # Test Case 2: Request's max_tokens set higher than server accepts
+    req.max_tokens = 100
+
+    with suppress(Exception):
+        asyncio.run(serving_chat.create_chat_completion(req))
+
+    assert mock_engine.generate.call_args.args[1].max_tokens == 93
+
+    # Test Case 3: Request's max_tokens set lower than server accepts
+    req.max_tokens = 5
+
+    with suppress(Exception):
+        asyncio.run(serving_chat.create_chat_completion(req))
+
+    assert mock_engine.generate.call_args.args[1].max_tokens == 5
+
 
 def test_serving_chat_could_load_correct_generation_config():
 
diff --git a/vllm/config.py b/vllm/config.py
index 11c6f853b2b45..7a58d64bcc6e2 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -910,12 +910,18 @@ def get_diff_sampling_param(self) -> Dict[str, Any]:
             "top_k",
             "top_p",
             "min_p",
+            "max_new_tokens",
         ]
         if any(p in config for p in available_params):
             diff_sampling_param = {
                 p: config.get(p)
                 for p in available_params if config.get(p) is not None
             }
+            # Huggingface definition of max_new_tokens is equivalent
+            # to vLLM's max_tokens
+            if "max_new_tokens" in diff_sampling_param:
+                diff_sampling_param["max_tokens"] = diff_sampling_param.pop(
+                    "max_new_tokens")
         else:
             diff_sampling_param = {}
         return diff_sampling_param
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index f16e8e6df76bd..ba96484e3fce9 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -939,7 +939,9 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             "Defaults to None, will use the default generation config in vLLM. "
             "If set to 'auto', the generation config will be automatically "
             "loaded from model. If set to a folder path, the generation config "
-            "will be loaded from the specified folder path.")
+            "will be loaded from the specified folder path. If "
+            "`max_new_tokens` is specified, then it sets a server-wide limit "
+            "on the number of output tokens for all requests.")
 
         parser.add_argument("--enable-sleep-mode",
                             action="store_true",
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index 80403f77d5375..6f546aaec442a 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -380,13 +380,17 @@ def to_beam_search_params(
     ) -> BeamSearchParams:
         # TODO(#9845): remove max_tokens when field is removed from OpenAI API
         max_tokens = self.max_completion_tokens or self.max_tokens
-        if max_tokens is None:
-            max_tokens = default_max_tokens
 
         if default_sampling_params is None:
             default_sampling_params = {}
         n = self.n if self.n is not None else 1
 
+        # Use minimum of context window, user request & server limit.
+        max_tokens = min(
+            val for val in (default_max_tokens, max_tokens,
+                            default_sampling_params.get("max_tokens", None))
+            if val is not None)
+
         if (temperature := self.temperature) is None:
             temperature = default_sampling_params.get(
                 "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"])
@@ -406,11 +410,16 @@ def to_sampling_params(
             default_sampling_params: Optional[dict] = None) -> SamplingParams:
         # TODO(#9845): remove max_tokens when field is removed from OpenAI API
         max_tokens = self.max_completion_tokens or self.max_tokens
-        if max_tokens is None:
-            max_tokens = default_max_tokens
 
         if default_sampling_params is None:
             default_sampling_params = {}
+
+        # Use minimum of context window, user request & server limit.
+        max_tokens = min(
+            val for val in (default_max_tokens, max_tokens,
+                            default_sampling_params.get("max_tokens", None))
+            if val is not None)
+
         # Default parameters
         if (repetition_penalty := self.repetition_penalty) is None:
             repetition_penalty = default_sampling_params.get(
@@ -740,13 +749,17 @@ def to_beam_search_params(
             default_sampling_params: Optional[dict] = None
     ) -> BeamSearchParams:
         max_tokens = self.max_tokens
-        if max_tokens is None:
-            max_tokens = default_max_tokens
 
         if default_sampling_params is None:
             default_sampling_params = {}
         n = self.n if self.n is not None else 1
 
+        # Use minimum of context window, user request & server limit.
+        max_tokens = min(
+            val for val in (default_max_tokens, max_tokens,
+                            default_sampling_params.get("max_tokens", None))
+            if val is not None)
+
         if (temperature := self.temperature) is None:
             temperature = default_sampling_params.get("temperature", 1.0)
 
@@ -764,11 +777,16 @@ def to_sampling_params(
             logits_processor_pattern: Optional[str],
             default_sampling_params: Optional[dict] = None) -> SamplingParams:
         max_tokens = self.max_tokens
-        if max_tokens is None:
-            max_tokens = default_max_tokens
 
         if default_sampling_params is None:
             default_sampling_params = {}
+
+        # Use minimum of context window, user request & server limit.
+        max_tokens = min(
+            val for val in (default_max_tokens, max_tokens,
+                            default_sampling_params.get("max_tokens", None))
+            if val is not None)
+
         # Default parameters
         if (repetition_penalty := self.repetition_penalty) is None:
             repetition_penalty = default_sampling_params.get(

From aa2cd2c43d1d19ece0f3b36ad716c3a9b8a2def0 Mon Sep 17 00:00:00 2001
From: Tyler Michael Smith <tyler@neuralmagic.com>
Date: Sun, 26 Jan 2025 06:59:58 -0500
Subject: [PATCH 384/462] [Bugfix] Disable w16a16 2of4 sparse
 CompressedTensors24 (#12417)

Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: mgoin <michael@neuralmagic.com>
---
 tests/kernels/test_cutlass.py                 |  28 +--
 tests/kernels/test_cutlass_2of4_sparse.py     | 214 ++++++++++++++++++
 tests/kernels/test_semi_structured.py         | 134 -----------
 tests/kernels/utils.py                        |  27 ++-
 tests/quantization/test_compressed_tensors.py |   4 +-
 .../compressed_tensors/compressed_tensors.py  |  25 +-
 6 files changed, 263 insertions(+), 169 deletions(-)
 create mode 100644 tests/kernels/test_cutlass_2of4_sparse.py
 delete mode 100644 tests/kernels/test_semi_structured.py

diff --git a/tests/kernels/test_cutlass.py b/tests/kernels/test_cutlass.py
index afe53797322f9..c3eddacec2727 100644
--- a/tests/kernels/test_cutlass.py
+++ b/tests/kernels/test_cutlass.py
@@ -2,7 +2,7 @@
 
 Run `pytest tests/kernels/test_cutlass.py`.
 """
-from typing import Optional, Type
+from typing import Type
 
 import pytest
 import torch
@@ -11,6 +11,8 @@
 from vllm import _custom_ops as ops
 from vllm.platforms import current_platform
 
+from .utils import baseline_scaled_mm, to_fp8, to_int8
+
 MNK_FACTORS = [
     (1, 256, 128),
     (1, 16384, 1024),
@@ -41,34 +43,10 @@
 capability = capability[0] * 10 + capability[1]
 
 
-def to_fp8(tensor: torch.Tensor):
-    finfo = torch.finfo(torch.float8_e4m3fn)
-    return torch.round(tensor.clamp(
-        min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn)
-
-
-def to_int8(tensor: torch.Tensor):
-    return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8)
-
-
 def rand_int8(shape: tuple, device: str = "cuda"):
     return to_int8(torch.rand(shape, device=device) * 255 - 128)
 
 
-def baseline_scaled_mm(a: torch.Tensor,
-                       b: torch.Tensor,
-                       scale_a: torch.Tensor,
-                       scale_b: torch.Tensor,
-                       out_dtype: Type[torch.dtype],
-                       bias: Optional[torch.Tensor] = None) -> torch.Tensor:
-    output = (scale_a * (scale_b * (torch.mm(
-        a.to(dtype=torch.float32), b.to(dtype=torch.float32))))).to(out_dtype)
-    if bias is not None:
-        output = output + bias
-
-    return output
-
-
 def cutlass_fp8_gemm_helper(m: int,
                             n: int,
                             k: int,
diff --git a/tests/kernels/test_cutlass_2of4_sparse.py b/tests/kernels/test_cutlass_2of4_sparse.py
new file mode 100644
index 0000000000000..56495df34aa6c
--- /dev/null
+++ b/tests/kernels/test_cutlass_2of4_sparse.py
@@ -0,0 +1,214 @@
+"""Tests for sparse cutlass kernels
+
+Run `pytest tests/kernels/test_semi_structured.py`.
+"""
+from typing import Tuple, Type
+
+import pytest
+import torch
+import torch.nn.functional as F
+
+from vllm import _custom_ops as ops
+from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
+    sparse_cutlass_supported)
+from vllm.platforms import current_platform
+
+from .utils import baseline_scaled_mm, to_fp8, to_int8
+
+CUDA_DEVICES = [
+    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
+]
+
+capability = current_platform.get_device_capability()
+capability = capability[0] * 10 + capability[1]
+
+
+def to_bf16(tensor: torch.Tensor) -> torch.Tensor:
+    return tensor.to(dtype=torch.bfloat16)
+
+
+def to_fp16(tensor: torch.Tensor) -> torch.Tensor:
+    return tensor.to(dtype=torch.float16)
+
+
+def prune_to_2_4(tensor):
+    # Reshape tensor to [N, 4] where N is number of groups of 4
+    original_shape = tensor.shape
+    reshaped = tensor.reshape(-1, 4)
+
+    # Get indices of top 2 absolute values in each group of 4
+    _, indices = torch.topk(torch.abs(reshaped), k=2, dim=1)
+
+    # Create binary mask
+    mask = torch.zeros_like(reshaped)
+    mask.scatter_(dim=1,
+                  index=indices,
+                  src=torch.ones_like(indices, dtype=mask.dtype))
+
+    # Apply mask and reshape back
+    pruned = reshaped * mask
+
+    # Turn all -0.0 to 0.0
+    pruned[pruned == -0.0] = 0.0
+
+    return pruned.reshape(original_shape)
+
+
+def make_rand_sparse_tensors(
+        dtype: torch.dtype, m: int, n: int, k: int
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+    a = torch.randn((m, k), device='cuda') * 5
+    b = torch.randn((n, k), device='cuda').t() * 5
+
+    b = prune_to_2_4(b.t()).t()
+
+    if dtype == torch.int8:
+        a, b = to_int8(a), to_int8(b)
+    elif dtype == torch.float8_e4m3fn:
+        a, b = to_fp8(a), to_fp8(b)
+    elif dtype == torch.float16:
+        a, b = to_fp16(a), to_fp16(b)
+    elif dtype == torch.bfloat16:
+        a, b = to_bf16(a), to_bf16(b)
+    else:
+        raise ValueError("unsupported dtype")
+
+    b_compressed, e = ops.cutlass_sparse_compress(b.t())
+
+    # Compressed B, Metadata, Original A, B
+    return b_compressed, e, a, b
+
+
+@pytest.mark.skipif(not sparse_cutlass_supported(),
+                    reason="Sparse CUTLASS is not supported on this GPU type.")
+# Test working with a subset of A and B for sparse matmul
+def test_cutlass_sparse_subset():
+
+    big_m = 1024
+    m, n, k = 512, 512, 512
+
+    # Create tensors
+    b_comp, e, whole_a, b = make_rand_sparse_tensors(torch.float8_e4m3fn,
+                                                     big_m, n, k)
+    a = whole_a[0:m, 0:k]
+    scale_a = torch.randn((1, 1), device="cuda", dtype=torch.float32) / 10
+    scale_b = torch.randn((1, 1), device="cuda", dtype=torch.float32) / 10
+
+    out = ops.cutlass_scaled_sparse_mm(a,
+                                       b_comp,
+                                       e,
+                                       scale_a,
+                                       scale_b,
+                                       out_dtype=torch.bfloat16)
+    baseline = baseline_scaled_mm(a,
+                                  b,
+                                  scale_a,
+                                  scale_b,
+                                  out_dtype=torch.bfloat16)
+
+    torch.testing.assert_close(out, baseline, rtol=1e-1, atol=1e0)
+
+
+MNK_FACTORS = [
+    (1, 256, 128),
+    (1, 16384, 1024),
+    (1, 24576, 512),
+    (16, 256, 512),
+    (16, 16384, 128),
+    (16, 24576, 4096),
+    (32, 8192, 4096),
+    (32, 16384, 4096),
+    (33, 1024, 1024),
+    (33, 8192, 128),
+    (64, 2048, 512),
+    (64, 16384, 1024),
+    (100, 8192, 512),
+    (128, 32768, 4096),
+    (256, 4096, 4096),
+    (512, 256, 1024),
+    (512, 8192, 4096),
+    (512, 16384, 128),
+    (512, 24576, 128),
+]
+
+
+# Test working with a subset of A and B for sparse matmul
+@pytest.mark.skip(reason="2of4 sparse w16a16 CUTLASS produces bad output.")
+@pytest.mark.skipif(not sparse_cutlass_supported(),
+                    reason="Sparse CUTLASS is not supported on this GPU type.")
+@pytest.mark.parametrize("m, k, n", MNK_FACTORS)
+@pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16])
+def test_cutlass_sparse_gemm(m: int, k: int, n: int, dtype: Type[torch.dtype]):
+
+    # Create tensors
+    b_comp, e, a, b = make_rand_sparse_tensors(dtype, m, n, k)
+    scale_a = torch.ones((1, 1), device="cuda", dtype=torch.float32)
+    scale_b = torch.ones((1, 1), device="cuda", dtype=torch.float32)
+
+    out = ops.cutlass_scaled_sparse_mm(a,
+                                       b_comp,
+                                       e,
+                                       scale_a,
+                                       scale_b,
+                                       out_dtype=dtype)
+    baseline = F.linear(a, b.T)
+
+    torch.testing.assert_close(out, baseline, rtol=1e-2, atol=1e-2)
+
+
+@pytest.mark.skipif(not sparse_cutlass_supported(),
+                    reason="Sparse CUTLASS is not supported on this GPU type.")
+@pytest.mark.parametrize("m, k, n", MNK_FACTORS)
+@pytest.mark.skipif(not current_platform.has_device_capability(89),
+                    reason="FP8 is not supported on this GPU type.")
+def test_cutlass_sparse_fp8_gemm(m: int, n: int, k: int):
+
+    # Create tensors
+    b_comp, e, a, b = make_rand_sparse_tensors(torch.float8_e4m3fn, m, n, k)
+    scale_a = (torch.randn((1, 1), device="cuda", dtype=torch.float32))
+    scale_b = (torch.randn((1, 1), device="cuda", dtype=torch.float32))
+
+    out = ops.cutlass_scaled_sparse_mm(a,
+                                       b_comp,
+                                       e,
+                                       scale_a,
+                                       scale_b,
+                                       out_dtype=torch.bfloat16)
+
+    baseline = baseline_scaled_mm(a,
+                                  b,
+                                  scale_a,
+                                  scale_b,
+                                  out_dtype=torch.bfloat16)
+
+    torch.testing.assert_close(out, baseline, rtol=1e0, atol=2e0)
+
+
+@pytest.mark.skipif(not sparse_cutlass_supported(),
+                    reason="Sparse CUTLASS is not supported on this GPU type.")
+@pytest.mark.parametrize("m,k,n", MNK_FACTORS)
+@pytest.mark.parametrize("per_act_token", [True, False])
+@pytest.mark.parametrize("per_out_ch", [True, False])
+@pytest.mark.parametrize("use_bias", [True, False])
+def test_cutlass_sparse_int8_gemm(m: int, n: int, k: int, per_act_token: bool,
+                                  per_out_ch: bool, use_bias: bool):
+
+    # Create tensors
+    b_comp, e, a, b = make_rand_sparse_tensors(torch.int8, m, n, k)
+    scale_a = (torch.randn((1, 1), device="cuda", dtype=torch.float32))
+    scale_b = (torch.randn((1, 1), device="cuda", dtype=torch.float32))
+
+    out = ops.cutlass_scaled_sparse_mm(a,
+                                       b_comp,
+                                       e,
+                                       scale_a,
+                                       scale_b,
+                                       out_dtype=torch.bfloat16)
+
+    baseline = baseline_scaled_mm(a,
+                                  b,
+                                  scale_a,
+                                  scale_b,
+                                  out_dtype=torch.bfloat16)
+
+    torch.testing.assert_close(out, baseline, rtol=1e0, atol=2e0)
diff --git a/tests/kernels/test_semi_structured.py b/tests/kernels/test_semi_structured.py
deleted file mode 100644
index 4316d6ab30e33..0000000000000
--- a/tests/kernels/test_semi_structured.py
+++ /dev/null
@@ -1,134 +0,0 @@
-"""Tests for sparse cutlass kernels
-
-Run `pytest tests/kernels/test_semi_structured.py`.
-"""
-from typing import Optional, Tuple, Type
-
-import pytest
-import torch
-
-from vllm import _custom_ops as ops
-from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
-    sparse_cutlass_supported)
-from vllm.platforms import current_platform
-
-CUDA_DEVICES = [
-    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
-]
-
-capability = current_platform.get_device_capability()
-capability = capability[0] * 10 + capability[1]
-
-
-def to_fp8(tensor: torch.Tensor):
-    finfo = torch.finfo(torch.float8_e4m3fn)
-    return torch.round(tensor.clamp(
-        min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn)
-
-
-def to_int8(tensor: torch.Tensor):
-    return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8)
-
-
-def rand_int8(shape: tuple, device: str = "cuda"):
-    return to_int8(torch.rand(shape, device=device) * 255 - 128)
-
-
-def to_bf16(tensor: torch.Tensor) -> torch.Tensor:
-    return tensor.to(dtype=torch.bfloat16)
-
-
-def to_fp16(tensor: torch.Tensor) -> torch.Tensor:
-    return tensor.to(dtype=torch.float16)
-
-
-def prune_to_2_4(tensor):
-    # Reshape tensor to [N, 4] where N is number of groups of 4
-    original_shape = tensor.shape
-    reshaped = tensor.reshape(-1, 4)
-
-    # Get indices of top 2 absolute values in each group of 4
-    _, indices = torch.topk(torch.abs(reshaped), k=2, dim=1)
-
-    # Create binary mask
-    mask = torch.zeros_like(reshaped)
-    mask.scatter_(dim=1,
-                  index=indices,
-                  src=torch.ones_like(indices, dtype=mask.dtype))
-
-    # Apply mask and reshape back
-    pruned = reshaped * mask
-
-    # Turn all -0.0 to 0.0
-    pruned[pruned == -0.0] = 0.0
-
-    return pruned.reshape(original_shape)
-
-
-def make_rand_sparse_tensors(
-        dtype: torch.dtype, m: int, n: int, k: int
-) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
-    a = torch.randn((m, k), device='cuda') * 5
-    b = torch.randn((n, k), device='cuda').t() * 5
-
-    b = prune_to_2_4(b.t()).t()
-
-    if dtype == torch.int8:
-        a, b = to_int8(a), to_int8(b)
-    elif dtype == torch.float8_e4m3fn:
-        a, b = to_fp8(a), to_fp8(b)
-    elif dtype == torch.float16:
-        a, b = to_fp16(a), to_fp16(b)
-    elif dtype == torch.bfloat16:
-        a, b = to_bf16(a), to_bf16(b)
-    else:
-        raise ValueError("unsupported dtype")
-
-    b_compressed, e = ops.cutlass_sparse_compress(b.t())
-
-    # Compressed B, Metadata, Original A, B
-    return b_compressed, e, a, b
-
-
-def baseline_scaled_mm(a: torch.Tensor,
-                       b: torch.Tensor,
-                       scale_a: torch.Tensor,
-                       scale_b: torch.Tensor,
-                       out_dtype: Type[torch.dtype],
-                       bias: Optional[torch.Tensor] = None) -> torch.Tensor:
-    output = (scale_a * (scale_b * (torch.mm(
-        a.to(dtype=torch.float32), b.to(dtype=torch.float32))))).to(out_dtype)
-    if bias is not None:
-        output = output + bias
-
-    return output
-
-
-@pytest.mark.skipif(not sparse_cutlass_supported(),
-                    reason="Sparse FP8 is not yet supported on this GPU type.")
-# Test working with a subset of A and B for sparse matmul
-def test_cutlass_sparse_subset():
-
-    big_m = 1024
-    m, n, k = 512, 512, 512
-
-    # Create tensors
-    b_comp, e, whole_a, b = make_rand_sparse_tensors(torch.float8_e4m3fn,
-                                                     big_m, n, k)
-    a = whole_a[0:m, 0:k]
-    scale_a = torch.randn((1, 1), device="cuda", dtype=torch.float32) / 10
-    scale_b = torch.randn((1, 1), device="cuda", dtype=torch.float32) / 10
-
-    out = ops.cutlass_scaled_sparse_mm(a,
-                                       b_comp,
-                                       e,
-                                       scale_a,
-                                       scale_b,
-                                       out_dtype=torch.bfloat16)
-    baseline = baseline_scaled_mm(a,
-                                  b,
-                                  scale_a,
-                                  scale_b,
-                                  out_dtype=torch.bfloat16)
-
-    torch.testing.assert_close(out, baseline, rtol=1e-1, atol=1e0)
diff --git a/tests/kernels/utils.py b/tests/kernels/utils.py
index 8011398551b9d..fb2c9f5d30583 100644
--- a/tests/kernels/utils.py
+++ b/tests/kernels/utils.py
@@ -5,7 +5,7 @@
 import unittest
 from numbers import Number
 from typing import (Any, Dict, List, NamedTuple, Optional, Sequence, Tuple,
-                    Union)
+                    Type, Union)
 
 import pytest
 import torch
@@ -1100,3 +1100,28 @@ def opcheck(op: Union[torch._ops.OpOverload, torch._ops.OpOverloadPacket,
             kwargs,
             test_utils=test_utils,
             raise_exception=raise_exception) if cond else {}
+
+
+# For testing quantized linear kernels
+def to_fp8(tensor: torch.Tensor):
+    finfo = torch.finfo(torch.float8_e4m3fn)
+    return torch.round(tensor.clamp(
+        min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn)
+
+
+def to_int8(tensor: torch.Tensor):
+    return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8)
+
+
+def baseline_scaled_mm(a: torch.Tensor,
+                       b: torch.Tensor,
+                       scale_a: torch.Tensor,
+                       scale_b: torch.Tensor,
+                       out_dtype: Type[torch.dtype],
+                       bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+    output = (scale_a * (scale_b * (torch.mm(
+        a.to(dtype=torch.float32), b.to(dtype=torch.float32))))).to(out_dtype)
+    if bias is not None:
+        output = output + bias
+
+    return output
diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py
index 0cd86cef0a475..bf0d454ad511c 100644
--- a/tests/quantization/test_compressed_tensors.py
+++ b/tests/quantization/test_compressed_tensors.py
@@ -313,8 +313,10 @@ def check_model(model):
         assert output
 
 
+@pytest.mark.skip(reason="2of4 sparse w16a16 CUTLASS produces bad output.")
 @pytest.mark.skipif(not sparse_cutlass_supported(),
-                    reason="Sparse FP8 is not yet supported on this GPU type.")
+                    reason="2of4 Sparse is not yet supported on this GPU type."
+                    )
 @pytest.mark.parametrize(
     "args_2of4",
     [("nm-testing/TinyLlama-1.1B-Chat-v1.0-2of4-Sparse-Dense-Compressor")])
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index b2fc2360f47f1..dd2dd02eaf723 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -9,6 +9,7 @@
                                              QuantizationType)
 from pydantic import BaseModel
 
+from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
                                                UnquantizedLinearMethod)
@@ -27,6 +28,8 @@
 from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
 from vllm.platforms import current_platform
 
+logger = init_logger(__name__)
+
 __all__ = ["CompressedTensorsLinearMethod"]
 
 SPARSITY_CONFIG_NAME: Literal["sparsity_config"] = "sparsity_config"
@@ -79,6 +82,8 @@ def get_quant_method(
             return UnquantizedLinearMethod()
         if isinstance(layer, LinearBase):
             scheme = self.get_scheme(layer=layer, layer_name=prefix)
+            if scheme is None:
+                return UnquantizedLinearMethod()
             layer.scheme = scheme
             return CompressedTensorsLinearMethod(self)
         if isinstance(layer, Attention):
@@ -340,10 +345,10 @@ def _get_scheme_from_parts(
         raise NotImplementedError(
             "No compressed-tensors compatible scheme was found.")
 
-    def get_scheme(
-            self,
-            layer: torch.nn.Module,
-            layer_name: Optional[str] = None) -> "CompressedTensorsScheme":
+    def get_scheme(self,
+                   layer: torch.nn.Module,
+                   layer_name: Optional[str] = None
+                   ) -> Optional["CompressedTensorsScheme"]:
         """
         compressed-tensors supports non uniform in the following way:
 
@@ -353,10 +358,7 @@ def get_scheme(
             which can be a full layer_name, a regex for a layer_name, or
             an nn.Module name.
 
-        We first check whether a layer is in the ignore group and use
-        CompressedTensorsUnquantized (i.e. fp16/bf16) scheme for the layer
-
-        We then detect whether a layer_name is found in any target and
+        Detect whether a layer_name is found in any target and
         use the quantization scheme corresponding to the matched target
         to select the CompressedTensorsScheme used for infernece.
         """
@@ -394,6 +396,13 @@ def get_scheme(
         if self.supports_cutlass_24(weight_quant=weight_quant,
                                     input_quant=input_quant,
                                     sparsity_scheme=sparsity_scheme):
+            # FIXME(tlrmchlsmth): layers using W16A16 CUTLASS 2:4 sparse kernels
+            # currently produce bad output in some cases
+            if weight_quant is None:
+                logger.warning_once(
+                    "CompressedTensors24 scheme is disabled for the w16a16 "
+                    "case. Falling back to UnquantizedLinearMethod")
+                return None
             # Have a valid sparsity scheme
             # Validate layer is supported by Cutlass 2:4 Kernel
             scheme = CompressedTensors24(quantized=weight_quant is not None

From 72f4880425edf06f105863b2389f9c46025e08ee Mon Sep 17 00:00:00 2001
From: Tyler Michael Smith <tyler@neuralmagic.com>
Date: Sun, 26 Jan 2025 13:39:03 -0500
Subject: [PATCH 385/462] [Bugfix/CI] Fix broken kernels/test_mha.py (#12450)

---
 tests/kernels/test_mha_attn.py | 4 ++--
 vllm/attention/layer.py        | 8 ++++++++
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/tests/kernels/test_mha_attn.py b/tests/kernels/test_mha_attn.py
index 22d434f5e40ef..eab874e9e02bb 100644
--- a/tests/kernels/test_mha_attn.py
+++ b/tests/kernels/test_mha_attn.py
@@ -26,7 +26,7 @@ def clear_cache():
 @pytest.mark.parametrize("device", ["cpu", "hip", "cuda"])
 def test_mha_attn_platform(device: str):
     """
-    Test that the attention selector between different platform and device.
+    Test the attention selector between different platform and device.
     """
     torch.set_default_dtype(torch.float16)
 
@@ -41,7 +41,7 @@ def test_mha_attn_platform(device: str):
     else:
         with patch("vllm.attention.selector.current_platform", CudaPlatform()):
             attn = MultiHeadAttention(16, 64, scale=1)
-            assert attn.attn_backend == _Backend.FLASH_ATTN
+            assert attn.attn_backend == _Backend.XFORMERS
 
         with patch("vllm.attention.selector.current_platform", CudaPlatform()):
             attn = MultiHeadAttention(16, 72, scale=1)
diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index da663d894aeb3..962c45a65ae23 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -210,6 +210,9 @@ def __init__(
         self.scale = scale
         self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads
 
+        assert self.num_heads % self.num_kv_heads == 0
+        self.num_queries_per_kv = self.num_heads // self.num_kv_heads
+
         dtype = torch.get_default_dtype()
         attn_backend = get_attn_backend(head_size,
                                         dtype,
@@ -240,6 +243,11 @@ def forward(
         key = key.view(bsz, kv_len, self.num_kv_heads, self.head_size)
         value = value.view(bsz, kv_len, self.num_kv_heads, self.head_size)
 
+        if (num_repeat := self.num_queries_per_kv) > 1:
+            # Handle MQA and GQA
+            key = torch.repeat_interleave(key, num_repeat, dim=2)
+            value = torch.repeat_interleave(value, num_repeat, dim=2)
+
         if self.attn_backend == _Backend.XFORMERS:
             from xformers import ops as xops
 

From 68f11149d845a164c9bbf122ab3bee8c94290169 Mon Sep 17 00:00:00 2001
From: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Date: Sun, 26 Jan 2025 14:09:34 -0500
Subject: [PATCH 386/462] [Bugfix][Kernel] Fix perf regression caused by PR
 #12405 (#12434)

Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c954731bf94ef..921f5dc7de9cb 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -576,7 +576,7 @@ else()
   FetchContent_Declare(
           vllm-flash-attn
           GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
-          GIT_TAG 9732b0ce005d1e6216864788502d5570004678f5
+          GIT_TAG d4e09037abf588af1ec47d0e966b237ee376876c
           GIT_PROGRESS TRUE
           # Don't share the vllm-flash-attn build between build types
           BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn

From 72bac7306796b01c202d846da041f62ded3a26a9 Mon Sep 17 00:00:00 2001
From: Tyler Michael Smith <tyler@neuralmagic.com>
Date: Sun, 26 Jan 2025 16:18:19 -0500
Subject: [PATCH 387/462] [Build/CI] Fix libcuda.so linkage (#12424)

Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
---
 CMakeLists.txt | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 921f5dc7de9cb..ead539993d98c 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -446,6 +446,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 endif()
 
 message(STATUS "Enabling C extension.")
+if(VLLM_GPU_LANG STREQUAL "CUDA")
+  list(APPEND VLLM_C_LIBS cuda)
+endif()
 define_gpu_extension_target(
   _C
   DESTINATION vllm
@@ -454,6 +457,7 @@ define_gpu_extension_target(
   COMPILE_FLAGS ${VLLM_GPU_FLAGS}
   ARCHITECTURES ${VLLM_GPU_ARCHES}
   INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR};${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
+  LIBRARIES ${VLLM_C_LIBS}
   USE_SABI 3
   WITH_SOABI)
 

From 0034b09ceb7f578f2d097c7fb8c7042d17367c35 Mon Sep 17 00:00:00 2001
From: Kyle Mistele <kyle@mistele.com>
Date: Sun, 26 Jan 2025 20:58:45 -0600
Subject: [PATCH 388/462] [Frontend] Rerank API (Jina- and Cohere-compatible
 API)  (#12376)

Signed-off-by: Kyle Mistele <kyle@mistele.com>
---
 .../serving/openai_compatible_server.md       |  92 ++++++++
 .../online_serving/cohere_rerank_client.py    |  32 +++
 .../online_serving/jinaai_rerank_client.py    |  33 +++
 tests/entrypoints/openai/test_rerank.py       |  87 ++++++++
 tests/entrypoints/openai/test_score.py        |   7 +-
 vllm/entrypoints/openai/api_server.py         |  51 ++++-
 vllm/entrypoints/openai/protocol.py           |  46 ++++
 vllm/entrypoints/openai/serving_engine.py     |   9 +-
 vllm/entrypoints/openai/serving_rerank.py     | 206 ++++++++++++++++++
 9 files changed, 552 insertions(+), 11 deletions(-)
 create mode 100644 examples/online_serving/cohere_rerank_client.py
 create mode 100644 examples/online_serving/jinaai_rerank_client.py
 create mode 100644 tests/entrypoints/openai/test_rerank.py
 create mode 100644 vllm/entrypoints/openai/serving_rerank.py

diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md
index e49bbb06695f8..8bc234545befd 100644
--- a/docs/source/serving/openai_compatible_server.md
+++ b/docs/source/serving/openai_compatible_server.md
@@ -50,6 +50,11 @@ In addition, we have the following custom APIs:
   - Applicable to all [pooling models](../models/pooling_models.md).
 - [Score API](#score-api) (`/score`)
   - Only applicable to [cross-encoder models](../models/pooling_models.md) (`--task score`).
+- [Re-rank API](#rerank-api) (`/rerank`, `/v1/rerank`, `/v2/rerank`)
+  - Implements [Jina AI's v1 re-rank API](https://jina.ai/reranker/)
+  - Also compatible with [Cohere's v1 & v2 re-rank APIs](https://docs.cohere.com/v2/reference/rerank)
+  - Jina and Cohere's APIs are very similar; Jina's includes extra information in the rerank endpoint's response.
+  - Only applicable to [cross-encoder models](../models/pooling_models.md) (`--task score`).
 
 (chat-template)=
 
@@ -473,3 +478,90 @@ The following extra parameters are supported:
 :start-after: begin-score-extra-params
 :end-before: end-score-extra-params
 ```
+
+(rerank-api)=
+
+### Re-rank API
+
+Our Re-rank API applies a cross-encoder model to predict relevant scores between a single query, and
+each of a list of documents. Usually, the score for a sentence pair refers to the similarity between two sentences, on
+a scale of 0 to 1.
+
+You can find the documentation for these kind of models at [sbert.net](https://www.sbert.net/docs/package_reference/cross_encoder/cross_encoder.html).
+
+The rerank endpoints support popular re-rank models such as `BAAI/bge-reranker-base` and other models supporting the
+`score` task. Additionally, `/rerank`, `/v1/rerank`, and `/v2/rerank`
+endpoints are compatible with both [Jina AI's re-rank API interface](https://jina.ai/reranker/) and
+[Cohere's re-rank API interface](https://docs.cohere.com/v2/reference/rerank) to ensure compatibility with
+popular open-source tools.
+
+Code example: <gh-file:examples/online_serving/jinaai_rerank_client.py>
+
+#### Example Request
+
+Note that the `top_n` request parameter is optional and will default to the length of the `documents` field.
+Result documents will be sorted by relevance, and the `index` property can be used to determine original order.
+
+Request:
+
+```bash
+curl -X 'POST' \
+  'http://127.0.0.1:8000/v1/rerank' \
+  -H 'accept: application/json' \
+  -H 'Content-Type: application/json' \
+  -d '{
+  "model": "BAAI/bge-reranker-base",
+  "query": "What is the capital of France?",
+  "documents": [
+    "The capital of Brazil is Brasilia.",
+    "The capital of France is Paris.",
+    "Horses and cows are both animals"
+  ]
+}'
+```
+
+Response:
+
+```bash
+{
+  "id": "rerank-fae51b2b664d4ed38f5969b612edff77",
+  "model": "BAAI/bge-reranker-base",
+  "usage": {
+    "total_tokens": 56
+  },
+  "results": [
+    {
+      "index": 1,
+      "document": {
+        "text": "The capital of France is Paris."
+      },
+      "relevance_score": 0.99853515625
+    },
+    {
+      "index": 0,
+      "document": {
+        "text": "The capital of Brazil is Brasilia."
+      },
+      "relevance_score": 0.0005860328674316406
+    }
+  ]
+}
+```
+
+#### Extra parameters
+
+The following [pooling parameters](#pooling-params) are supported.
+
+```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
+:language: python
+:start-after: begin-rerank-pooling-params
+:end-before: end-rerank-pooling-params
+```
+
+The following extra parameters are supported:
+
+```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
+:language: python
+:start-after: begin-rerank-extra-params
+:end-before: end-rerank-extra-params
+```
diff --git a/examples/online_serving/cohere_rerank_client.py b/examples/online_serving/cohere_rerank_client.py
new file mode 100644
index 0000000000000..a07affe3351ce
--- /dev/null
+++ b/examples/online_serving/cohere_rerank_client.py
@@ -0,0 +1,32 @@
+"""
+Example of using the OpenAI entrypoint's rerank API which is compatible with
+the Cohere SDK: https://github.com/cohere-ai/cohere-python
+
+run: vllm serve BAAI/bge-reranker-base
+"""
+import cohere
+
+# cohere v1 client
+co = cohere.Client(base_url="http://localhost:8000", api_key="sk-fake-key")
+rerank_v1_result = co.rerank(
+    model="BAAI/bge-reranker-base",
+    query="What is the capital of France?",
+    documents=[
+        "The capital of France is Paris", "Reranking is fun!",
+        "vLLM is an open-source framework for fast AI serving"
+    ])
+
+print(rerank_v1_result)
+
+# or the v2
+co2 = cohere.ClientV2("sk-fake-key", base_url="http://localhost:8000")
+
+v2_rerank_result = co2.rerank(
+    model="BAAI/bge-reranker-base",
+    query="What is the capital of France?",
+    documents=[
+        "The capital of France is Paris", "Reranking is fun!",
+        "vLLM is an open-source framework for fast AI serving"
+    ])
+
+print(v2_rerank_result)
diff --git a/examples/online_serving/jinaai_rerank_client.py b/examples/online_serving/jinaai_rerank_client.py
new file mode 100644
index 0000000000000..bf4de76ddf362
--- /dev/null
+++ b/examples/online_serving/jinaai_rerank_client.py
@@ -0,0 +1,33 @@
+"""
+Example of using the OpenAI entrypoint's rerank API which is compatible with
+Jina and Cohere https://jina.ai/reranker
+
+run: vllm serve BAAI/bge-reranker-base
+"""
+import json
+
+import requests
+
+url = "http://127.0.0.1:8000/rerank"
+
+headers = {"accept": "application/json", "Content-Type": "application/json"}
+
+data = {
+    "model":
+    "BAAI/bge-reranker-base",
+    "query":
+    "What is the capital of France?",
+    "documents": [
+        "The capital of Brazil is Brasilia.",
+        "The capital of France is Paris.", "Horses and cows are both animals"
+    ]
+}
+response = requests.post(url, headers=headers, json=data)
+
+# Check the response
+if response.status_code == 200:
+    print("Request successful!")
+    print(json.dumps(response.json(), indent=2))
+else:
+    print(f"Request failed with status code: {response.status_code}")
+    print(response.text)
diff --git a/tests/entrypoints/openai/test_rerank.py b/tests/entrypoints/openai/test_rerank.py
new file mode 100644
index 0000000000000..cfd8f33133960
--- /dev/null
+++ b/tests/entrypoints/openai/test_rerank.py
@@ -0,0 +1,87 @@
+import pytest
+import requests
+
+from vllm.entrypoints.openai.protocol import RerankResponse
+
+from ...utils import RemoteOpenAIServer
+
+MODEL_NAME = "BAAI/bge-reranker-base"
+
+
+@pytest.fixture(scope="module")
+def server():
+    args = ["--enforce-eager", "--max-model-len", "100"]
+
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+        yield remote_server
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+def test_rerank_texts(server: RemoteOpenAIServer, model_name: str):
+    query = "What is the capital of France?"
+    documents = [
+        "The capital of Brazil is Brasilia.", "The capital of France is Paris."
+    ]
+
+    rerank_response = requests.post(server.url_for("rerank"),
+                                    json={
+                                        "model": model_name,
+                                        "query": query,
+                                        "documents": documents,
+                                    })
+    rerank_response.raise_for_status()
+    rerank = RerankResponse.model_validate(rerank_response.json())
+
+    assert rerank.id is not None
+    assert rerank.results is not None
+    assert len(rerank.results) == 2
+    assert rerank.results[0].relevance_score >= 0.9
+    assert rerank.results[1].relevance_score <= 0.01
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+def test_top_n(server: RemoteOpenAIServer, model_name: str):
+    query = "What is the capital of France?"
+    documents = [
+        "The capital of Brazil is Brasilia.",
+        "The capital of France is Paris.", "Cross-encoder models are neat"
+    ]
+
+    rerank_response = requests.post(server.url_for("rerank"),
+                                    json={
+                                        "model": model_name,
+                                        "query": query,
+                                        "documents": documents,
+                                        "top_n": 2
+                                    })
+    rerank_response.raise_for_status()
+    rerank = RerankResponse.model_validate(rerank_response.json())
+
+    assert rerank.id is not None
+    assert rerank.results is not None
+    assert len(rerank.results) == 2
+    assert rerank.results[0].relevance_score >= 0.9
+    assert rerank.results[1].relevance_score <= 0.01
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+def test_rerank_max_model_len(server: RemoteOpenAIServer, model_name: str):
+
+    query = "What is the capital of France?" * 100
+    documents = [
+        "The capital of Brazil is Brasilia.", "The capital of France is Paris."
+    ]
+
+    rerank_response = requests.post(server.url_for("rerank"),
+                                    json={
+                                        "model": model_name,
+                                        "query": query,
+                                        "documents": documents
+                                    })
+    assert rerank_response.status_code == 400
+    # Assert just a small fragments of the response
+    assert "Please reduce the length of the input." in \
+        rerank_response.text
\ No newline at end of file
diff --git a/tests/entrypoints/openai/test_score.py b/tests/entrypoints/openai/test_score.py
index 06e0f93dbe269..0d19615bc0d99 100644
--- a/tests/entrypoints/openai/test_score.py
+++ b/tests/entrypoints/openai/test_score.py
@@ -10,12 +10,7 @@
 
 @pytest.fixture(scope="module")
 def server():
-    args = [
-        "--enforce-eager",
-        # Will be used on tests to compare prompt input length
-        "--max-model-len",
-        "100"
-    ]
+    args = ["--enforce-eager", "--max-model-len", "100"]
 
     with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
         yield remote_server
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index f510c41503011..45cf06566faaa 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -56,6 +56,7 @@
                                               PoolingChatRequest,
                                               PoolingCompletionRequest,
                                               PoolingRequest, PoolingResponse,
+                                              RerankRequest, RerankResponse,
                                               ScoreRequest, ScoreResponse,
                                               TokenizeRequest,
                                               TokenizeResponse,
@@ -68,6 +69,7 @@
 from vllm.entrypoints.openai.serving_models import (BaseModelPath,
                                                     OpenAIServingModels)
 from vllm.entrypoints.openai.serving_pooling import OpenAIServingPooling
+from vllm.entrypoints.openai.serving_rerank import JinaAIServingRerank
 from vllm.entrypoints.openai.serving_score import OpenAIServingScores
 from vllm.entrypoints.openai.serving_tokenization import (
     OpenAIServingTokenization)
@@ -306,6 +308,10 @@ def score(request: Request) -> Optional[OpenAIServingScores]:
     return request.app.state.openai_serving_scores
 
 
+def rerank(request: Request) -> Optional[JinaAIServingRerank]:
+    return request.app.state.jinaai_serving_reranking
+
+
 def tokenization(request: Request) -> OpenAIServingTokenization:
     return request.app.state.openai_serving_tokenization
 
@@ -502,6 +508,40 @@ async def create_score_v1(request: ScoreRequest, raw_request: Request):
     return await create_score(request, raw_request)
 
 
+@router.post("/rerank")
+@with_cancellation
+async def do_rerank(request: RerankRequest, raw_request: Request):
+    handler = rerank(raw_request)
+    if handler is None:
+        return base(raw_request).create_error_response(
+            message="The model does not support Rerank (Score) API")
+    generator = await handler.do_rerank(request, raw_request)
+    if isinstance(generator, ErrorResponse):
+        return JSONResponse(content=generator.model_dump(),
+                            status_code=generator.code)
+    elif isinstance(generator, RerankResponse):
+        return JSONResponse(content=generator.model_dump())
+
+    assert_never(generator)
+
+
+@router.post("/v1/rerank")
+@with_cancellation
+async def do_rerank_v1(request: RerankRequest, raw_request: Request):
+    logger.warning(
+        "To indicate that the rerank API is not part of the standard OpenAI"
+        " API, we have located it at `/rerank`. Please update your client"
+        "accordingly. (Note: Conforms to JinaAI rerank API)")
+
+    return await do_rerank(request, raw_request)
+
+
+@router.post("/v2/rerank")
+@with_cancellation
+async def do_rerank_v2(request: RerankRequest, raw_request: Request):
+    return await do_rerank(request, raw_request)
+
+
 TASK_HANDLERS: Dict[str, Dict[str, tuple]] = {
     "generate": {
         "messages": (ChatCompletionRequest, create_chat_completion),
@@ -512,7 +552,10 @@ async def create_score_v1(request: ScoreRequest, raw_request: Request):
         "default": (EmbeddingCompletionRequest, create_embedding),
     },
     "score": {
-        "default": (ScoreRequest, create_score),
+        "default": (RerankRequest, do_rerank)
+    },
+    "rerank": {
+        "default": (RerankRequest, do_rerank)
     },
     "reward": {
         "messages": (PoolingChatRequest, create_pooling),
@@ -759,6 +802,12 @@ async def init_app_state(
         state.openai_serving_models,
         request_logger=request_logger
     ) if model_config.task == "score" else None
+    state.jinaai_serving_reranking = JinaAIServingRerank(
+        engine_client,
+        model_config,
+        state.openai_serving_models,
+        request_logger=request_logger
+    ) if model_config.task == "score" else None
     state.openai_serving_tokenization = OpenAIServingTokenization(
         engine_client,
         model_config,
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index 6f546aaec442a..311c16c538f34 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -1018,6 +1018,52 @@ def to_pooling_params(self):
         return PoolingParams(additional_data=self.additional_data)
 
 
+class RerankRequest(OpenAIBaseModel):
+    model: str
+    query: str
+    documents: List[str]
+    top_n: int = Field(default_factory=lambda: 0)
+    truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None
+
+    # doc: begin-rerank-pooling-params
+    additional_data: Optional[Any] = None
+    # doc: end-rerank-pooling-params
+
+    # doc: begin-rerank-extra-params
+    priority: int = Field(
+        default=0,
+        description=(
+            "The priority of the request (lower means earlier handling; "
+            "default: 0). Any priority other than 0 will raise an error "
+            "if the served model does not use priority scheduling."))
+
+    # doc: end-rerank-extra-params
+
+    def to_pooling_params(self):
+        return PoolingParams(additional_data=self.additional_data)
+
+
+class RerankDocument(BaseModel):
+    text: str
+
+
+class RerankResult(BaseModel):
+    index: int
+    document: RerankDocument
+    relevance_score: float
+
+
+class RerankUsage(BaseModel):
+    total_tokens: int
+
+
+class RerankResponse(OpenAIBaseModel):
+    id: str
+    model: str
+    usage: RerankUsage
+    results: List[RerankResult]
+
+
 class CompletionLogProbs(OpenAIBaseModel):
     text_offset: List[int] = Field(default_factory=list)
     token_logprobs: List[Optional[float]] = Field(default_factory=list)
diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py
index 3da447be06430..8d54164e500eb 100644
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -26,7 +26,8 @@
                                               DetokenizeRequest,
                                               EmbeddingChatRequest,
                                               EmbeddingCompletionRequest,
-                                              ErrorResponse, ScoreRequest,
+                                              ErrorResponse, RerankRequest,
+                                              ScoreRequest,
                                               TokenizeChatRequest,
                                               TokenizeCompletionRequest)
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels
@@ -204,9 +205,9 @@ def _validate_input(
         token_num = len(input_ids)
 
         # Note: EmbeddingRequest and ScoreRequest doesn't have max_tokens
-        if isinstance(
-                request,
-            (EmbeddingChatRequest, EmbeddingCompletionRequest, ScoreRequest)):
+        if isinstance(request,
+                      (EmbeddingChatRequest, EmbeddingCompletionRequest,
+                       ScoreRequest, RerankRequest)):
 
             operation = "score" if isinstance(request, ScoreRequest) \
                 else "embedding generation"
diff --git a/vllm/entrypoints/openai/serving_rerank.py b/vllm/entrypoints/openai/serving_rerank.py
new file mode 100644
index 0000000000000..be4420261afe3
--- /dev/null
+++ b/vllm/entrypoints/openai/serving_rerank.py
@@ -0,0 +1,206 @@
+import asyncio
+from typing import Any, AsyncGenerator, Dict, List, Optional, Union, cast
+
+from fastapi import Request
+
+from vllm.config import ModelConfig
+from vllm.engine.protocol import EngineClient
+from vllm.entrypoints.logger import RequestLogger
+from vllm.entrypoints.openai.protocol import (ErrorResponse, RerankDocument,
+                                              RerankRequest, RerankResponse,
+                                              RerankResult, RerankUsage)
+from vllm.entrypoints.openai.serving_engine import OpenAIServing
+from vllm.entrypoints.openai.serving_models import OpenAIServingModels
+from vllm.inputs.data import TokensPrompt
+from vllm.logger import init_logger
+from vllm.outputs import PoolingRequestOutput, ScoringRequestOutput
+from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer
+from vllm.utils import make_async, merge_async_iterators
+
+logger = init_logger(__name__)
+
+
+class JinaAIServingRerank(OpenAIServing):
+
+    def __init__(
+        self,
+        engine_client: EngineClient,
+        model_config: ModelConfig,
+        models: OpenAIServingModels,
+        *,
+        request_logger: Optional[RequestLogger],
+    ) -> None:
+        super().__init__(engine_client=engine_client,
+                         model_config=model_config,
+                         models=models,
+                         request_logger=request_logger)
+
+    async def do_rerank(
+        self,
+        request: RerankRequest,
+        raw_request: Optional[Request] = None
+    ) -> Union[RerankResponse, ErrorResponse]:
+        """
+        Rerank API based on JinaAI's rerank API; implements the same
+        API interface. Designed for compatibility with off-the-shelf
+        tooling, since this is a common standard for reranking APIs
+
+        See example client implementations at
+        https://github.com/infiniflow/ragflow/blob/main/rag/llm/rerank_model.py
+        numerous clients use this standard.
+        """
+        error_check_ret = await self._check_model(request)
+        if error_check_ret is not None:
+            return error_check_ret
+
+        model_name = request.model
+        request_id = f"rerank-{self._base_request_id(raw_request)}"
+        truncate_prompt_tokens = request.truncate_prompt_tokens
+        query = request.query
+        documents = request.documents
+        request_prompts = []
+        engine_prompts = []
+        top_n = request.top_n if request.top_n > 0 else len(documents)
+
+        try:
+            (
+                lora_request,
+                prompt_adapter_request,
+            ) = self._maybe_get_adapters(request)
+
+            tokenizer = await self.engine_client.get_tokenizer(lora_request)
+
+            if prompt_adapter_request is not None:
+                raise NotImplementedError("Prompt adapter is not supported "
+                                          "for scoring models")
+
+            if isinstance(tokenizer, MistralTokenizer):
+                raise ValueError(
+                    "MistralTokenizer not supported for cross-encoding")
+
+            if not self.model_config.is_cross_encoder:
+                raise ValueError("Model is not cross encoder.")
+
+            if truncate_prompt_tokens is not None and \
+                    truncate_prompt_tokens > self.max_model_len:
+                raise ValueError(
+                    f"truncate_prompt_tokens value ({truncate_prompt_tokens}) "
+                    f"is greater than max_model_len ({self.max_model_len})."
+                    f" Please, select a smaller truncation size.")
+            for doc in documents:
+                request_prompt = f"{query}{tokenizer.sep_token}{doc}"
+                tokenization_kwargs: Dict[str, Any] = {}
+                if truncate_prompt_tokens is not None:
+                    tokenization_kwargs["truncation"] = True
+                    tokenization_kwargs["max_length"] = truncate_prompt_tokens
+
+                tokenize_async = make_async(tokenizer.__call__,
+                                            executor=self._tokenizer_executor)
+                prompt_inputs = await tokenize_async(text=query,
+                                                     text_pair=doc,
+                                                     **tokenization_kwargs)
+
+                input_ids = prompt_inputs["input_ids"]
+                text_token_prompt = \
+                    self._validate_input(request, input_ids, request_prompt)
+                engine_prompt = TokensPrompt(
+                    prompt_token_ids=text_token_prompt["prompt_token_ids"],
+                    token_type_ids=prompt_inputs.get("token_type_ids"))
+
+                request_prompts.append(request_prompt)
+                engine_prompts.append(engine_prompt)
+
+        except ValueError as e:
+            logger.exception("Error in preprocessing prompt inputs")
+            return self.create_error_response(str(e))
+
+        # Schedule the request and get the result generator.
+        generators: List[AsyncGenerator[PoolingRequestOutput, None]] = []
+
+        try:
+            pooling_params = request.to_pooling_params()
+
+            for i, engine_prompt in enumerate(engine_prompts):
+                request_id_item = f"{request_id}-{i}"
+
+                self._log_inputs(request_id_item,
+                                 request_prompts[i],
+                                 params=pooling_params,
+                                 lora_request=lora_request,
+                                 prompt_adapter_request=prompt_adapter_request)
+
+                trace_headers = (None if raw_request is None else await
+                                 self._get_trace_headers(raw_request.headers))
+
+                generator = self.engine_client.encode(
+                    engine_prompt,
+                    pooling_params,
+                    request_id_item,
+                    lora_request=lora_request,
+                    trace_headers=trace_headers,
+                    priority=request.priority,
+                )
+
+                generators.append(generator)
+        except ValueError as e:
+            # TODO: Use a vllm-specific Validation Error
+            return self.create_error_response(str(e))
+        result_generator = merge_async_iterators(*generators)
+
+        num_prompts = len(engine_prompts)
+
+        # Non-streaming response
+        final_res_batch: List[Optional[PoolingRequestOutput]]
+        final_res_batch = [None] * num_prompts
+
+        try:
+            async for i, res in result_generator:
+                final_res_batch[i] = res
+
+            assert all(final_res is not None for final_res in final_res_batch)
+
+            final_res_batch_checked = cast(List[PoolingRequestOutput],
+                                           final_res_batch)
+
+            response = self.request_output_to_rerank_response(
+                final_res_batch_checked, request_id, model_name, documents,
+                top_n)
+        except asyncio.CancelledError:
+            return self.create_error_response("Client disconnected")
+        except ValueError as e:
+            # TODO: Use a vllm-specific Validation Error
+            return self.create_error_response(str(e))
+
+        return response
+
+    def request_output_to_rerank_response(
+            self, final_res_batch: List[PoolingRequestOutput], request_id: str,
+            model_name: str, documents: List[str],
+            top_n: int) -> RerankResponse:
+        """
+        Convert the output of do_rank to a RerankResponse
+        """
+        results: List[RerankResult] = []
+        num_prompt_tokens = 0
+        for idx, final_res in enumerate(final_res_batch):
+            classify_res = ScoringRequestOutput.from_base(final_res)
+
+            result = RerankResult(
+                index=idx,
+                document=RerankDocument(text=documents[idx]),
+                relevance_score=classify_res.outputs.score,
+            )
+            results.append(result)
+            prompt_token_ids = final_res.prompt_token_ids
+            num_prompt_tokens += len(prompt_token_ids)
+
+        # sort by relevance, then return the top n if set
+        results.sort(key=lambda x: x.relevance_score, reverse=True)
+        if top_n < len(documents):
+            results = results[:top_n]
+
+        return RerankResponse(
+            id=request_id,
+            model=model_name,
+            results=results,
+            usage=RerankUsage(total_tokens=num_prompt_tokens))

From 582cf78798a6fef9b69d0471df73d81e09a7d3d8 Mon Sep 17 00:00:00 2001
From: Yuan Tang <terrytangyuan@gmail.com>
Date: Sun, 26 Jan 2025 21:46:19 -0600
Subject: [PATCH 389/462] [DOC] Add link to vLLM blog (#12460)

Signed-off-by: Yuan Tang <terrytangyuan@gmail.com>
---
 docs/source/community/blog.md | 3 +++
 docs/source/index.md          | 1 +
 2 files changed, 4 insertions(+)
 create mode 100644 docs/source/community/blog.md

diff --git a/docs/source/community/blog.md b/docs/source/community/blog.md
new file mode 100644
index 0000000000000..e8030edfa02ee
--- /dev/null
+++ b/docs/source/community/blog.md
@@ -0,0 +1,3 @@
+# vLLM Blog
+
+vLLM blog posts are published [here](https://blog.vllm.ai/).
diff --git a/docs/source/index.md b/docs/source/index.md
index d7a1117df9c27..2c302d3f3e863 100644
--- a/docs/source/index.md
+++ b/docs/source/index.md
@@ -184,6 +184,7 @@ api/model/index
 :caption: Community
 :maxdepth: 1
 
+community/blog
 community/meetups
 community/sponsors
 ```

From 28e0750847ded93158a66efdcbc869d87463b38f Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Sun, 26 Jan 2025 19:57:56 -0800
Subject: [PATCH 390/462] [V1] Avoid list creation in input preparation
 (#12457)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
---
 vllm/v1/worker/gpu_model_runner.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 6339f1f03f11d..9d7e30079dfbb 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -171,7 +171,8 @@ def __init__(
 
         # OPTIMIZATION: Cache the tensors rather than creating them every step.
         self.arange_np = np.arange(max(self.max_num_reqs + 1,
-                                       self.max_model_len),
+                                       self.max_model_len,
+                                       self.max_num_tokens),
                                    dtype=np.int32)
         # NOTE(woosuk): These tensors are "stateless", i.e., they are literally
         # a faster version of creating a new tensor every time. Thus, we should
@@ -358,8 +359,15 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"):
 
         # Get batched arange.
         # E.g., [2, 5, 3] -> [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
-        arange = np.concatenate(
-            [self.arange_np[:n] for n in num_scheduled_tokens])
+        # Equivalent to but faster than:
+        # np.concatenate([np.arange(n) for n in num_scheduled_tokens])
+        # Step 1. [2, 5, 3] -> [2, 7, 10]
+        cu_num_tokens = np.cumsum(num_scheduled_tokens)
+        # Step 2. [2, 7, 10] -> [0, 0, 2, 2, 2, 2, 2, 7, 7, 7]
+        cumsums_offsets = np.repeat(cu_num_tokens - num_scheduled_tokens,
+                                    num_scheduled_tokens)
+        # Step 3. [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
+        arange = self.arange_np[:total_num_scheduled_tokens] - cumsums_offsets
 
         # Get positions.
         positions_np = self.positions_np[:total_num_scheduled_tokens]
@@ -406,8 +414,7 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"):
 
         # Prepare the attention metadata.
         self.query_start_loc_np[0] = 0
-        np.cumsum(num_scheduled_tokens,
-                  out=self.query_start_loc_np[1:num_reqs + 1])
+        self.query_start_loc_np[1:num_reqs + 1] = cu_num_tokens
 
         self.seq_lens_np[:num_reqs] = (
             self.input_batch.num_computed_tokens_cpu[:num_reqs] +

From 0cc6b383d73eb662dfeec671d3b47cda301b2f47 Mon Sep 17 00:00:00 2001
From: Pooya Davoodi <pooya.davoodi@parasail.io>
Date: Sun, 26 Jan 2025 20:30:17 -0800
Subject: [PATCH 391/462] [Frontend] Support scores endpoint in run_batch
 (#12430)

Signed-off-by: Pooya Davoodi <pooya.davoodi@parasail.io>
---
 .../offline_inference/openai/openai_batch.md  | 33 ++++++++++++++++-
 tests/entrypoints/openai/test_run_batch.py    | 37 +++++++++++++++++++
 vllm/entrypoints/openai/protocol.py           |  5 ++-
 vllm/entrypoints/openai/run_batch.py          | 31 ++++++++++++++--
 4 files changed, 99 insertions(+), 7 deletions(-)

diff --git a/examples/offline_inference/openai/openai_batch.md b/examples/offline_inference/openai/openai_batch.md
index a4774e57cd9a5..953e6ef130f18 100644
--- a/examples/offline_inference/openai/openai_batch.md
+++ b/examples/offline_inference/openai/openai_batch.md
@@ -13,7 +13,7 @@ The OpenAI batch file format consists of a series of json objects on new lines.
 Each line represents a separate request. See the [OpenAI package reference](https://platform.openai.com/docs/api-reference/batch/requestInput) for more details.
  
 ```{note}
-We currently only support `/v1/chat/completions` and `/v1/embeddings` endpoints (completions coming soon).
+We currently support `/v1/chat/completions`, `/v1/embeddings`, and `/v1/score` endpoints (completions coming soon).
 ```
  
 ## Pre-requisites
@@ -203,3 +203,34 @@ $ cat results.jsonl
 {"id":"vllm-db0f71f7dec244e6bce530e0b4ef908b","custom_id":"request-1","response":{"status_code":200,"request_id":"vllm-batch-3580bf4d4ae54d52b67eee266a6eab20","body":{"id":"embd-33ac2efa7996430184461f2e38529746","object":"list","created":444647,"model":"intfloat/e5-mistral-7b-instruct","data":[{"index":0,"object":"embedding","embedding":[0.016204833984375,0.0092010498046875,0.0018358230590820312,-0.0028228759765625,0.001422882080078125,-0.0031147003173828125,...]}],"usage":{"prompt_tokens":8,"total_tokens":8,"completion_tokens":0}}},"error":null}
 ...
 ```
+
+## Example 5: Using score endpoint
+
+### Additional prerequisites
+
+* Ensure you are using `vllm >= 0.7.0`.
+
+### Step 1: Create your batch file
+ 
+Add score requests to your batch file. The following is an example:
+ 
+```
+{"custom_id": "request-1", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}
+{"custom_id": "request-2", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}
+```
+
+You can mix chat completion, embedding, and score requests in the batch file, as long as the model you are using supports them all (note that all requests must use the same model).
+
+### Step 2: Run the batch
+
+You can run the batch using the same command as in earlier examples.
+
+### Step 3: Check your results
+
+You can check your results by running `cat results.jsonl`
+
+```
+$ cat results.jsonl
+{"id":"vllm-f87c5c4539184f618e555744a2965987","custom_id":"request-1","response":{"status_code":200,"request_id":"vllm-batch-806ab64512e44071b37d3f7ccd291413","body":{"id":"score-4ee45236897b4d29907d49b01298cdb1","object":"list","created":1737847944,"model":"BAAI/bge-reranker-v2-m3","data":[{"index":0,"object":"score","score":0.0010900497436523438},{"index":1,"object":"score","score":1.0}],"usage":{"prompt_tokens":37,"total_tokens":37,"completion_tokens":0,"prompt_tokens_details":null}}},"error":null}
+{"id":"vllm-41990c51a26d4fac8419077f12871099","custom_id":"request-2","response":{"status_code":200,"request_id":"vllm-batch-73ce66379026482699f81974e14e1e99","body":{"id":"score-13f2ffe6ba40460fbf9f7f00ad667d75","object":"list","created":1737847944,"model":"BAAI/bge-reranker-v2-m3","data":[{"index":0,"object":"score","score":0.001094818115234375},{"index":1,"object":"score","score":1.0}],"usage":{"prompt_tokens":37,"total_tokens":37,"completion_tokens":0,"prompt_tokens_details":null}}},"error":null}
+```
diff --git a/tests/entrypoints/openai/test_run_batch.py b/tests/entrypoints/openai/test_run_batch.py
index 097d6b1a32349..1f8a56bb43ac6 100644
--- a/tests/entrypoints/openai/test_run_batch.py
+++ b/tests/entrypoints/openai/test_run_batch.py
@@ -1,3 +1,4 @@
+import json
 import subprocess
 import sys
 import tempfile
@@ -21,6 +22,9 @@
 {"custom_id": "request-3", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "Hello world!"}}
 {"custom_id": "request-4", "method": "POST", "url": "/v1/embeddings", "body": {"model": "NonExistModel", "input": "Hello world!"}}"""
 
+INPUT_SCORE_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}
+{"custom_id": "request-2", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}"""
+
 
 def test_empty_file():
     with tempfile.NamedTemporaryFile(
@@ -102,3 +106,36 @@ def test_embeddings():
             # Ensure that the output format conforms to the openai api.
             # Validation should throw if the schema is wrong.
             BatchRequestOutput.model_validate_json(line)
+
+
+def test_score():
+    with tempfile.NamedTemporaryFile(
+            "w") as input_file, tempfile.NamedTemporaryFile(
+                "r") as output_file:
+        input_file.write(INPUT_SCORE_BATCH)
+        input_file.flush()
+        proc = subprocess.Popen([
+            sys.executable,
+            "-m",
+            "vllm.entrypoints.openai.run_batch",
+            "-i",
+            input_file.name,
+            "-o",
+            output_file.name,
+            "--model",
+            "BAAI/bge-reranker-v2-m3",
+        ], )
+        proc.communicate()
+        proc.wait()
+        assert proc.returncode == 0, f"{proc=}"
+
+        contents = output_file.read()
+        for line in contents.strip().split("\n"):
+            # Ensure that the output format conforms to the openai api.
+            # Validation should throw if the schema is wrong.
+            BatchRequestOutput.model_validate_json(line)
+
+            # Ensure that there is no error in the response.
+            line_dict = json.loads(line)
+            assert isinstance(line_dict, dict)
+            assert line_dict["error"] is None
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index 311c16c538f34..f89c3f42aab17 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -1283,7 +1283,7 @@ class BatchRequestInput(OpenAIBaseModel):
     url: str
 
     # The parameters of the request.
-    body: Union[ChatCompletionRequest, EmbeddingRequest]
+    body: Union[ChatCompletionRequest, EmbeddingRequest, ScoreRequest]
 
 
 class BatchResponseData(OpenAIBaseModel):
@@ -1294,7 +1294,8 @@ class BatchResponseData(OpenAIBaseModel):
     request_id: str
 
     # The body of the response.
-    body: Optional[Union[ChatCompletionResponse, EmbeddingResponse]] = None
+    body: Optional[Union[ChatCompletionResponse, EmbeddingResponse,
+                         ScoreResponse]] = None
 
 
 class BatchRequestOutput(OpenAIBaseModel):
diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py
index f8f136f9d5024..37ae23506acea 100644
--- a/vllm/entrypoints/openai/run_batch.py
+++ b/vllm/entrypoints/openai/run_batch.py
@@ -16,12 +16,14 @@
                                               BatchRequestOutput,
                                               BatchResponseData,
                                               ChatCompletionResponse,
-                                              EmbeddingResponse, ErrorResponse)
+                                              EmbeddingResponse, ErrorResponse,
+                                              ScoreResponse)
 # yapf: enable
 from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
 from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding
 from vllm.entrypoints.openai.serving_models import (BaseModelPath,
                                                     OpenAIServingModels)
+from vllm.entrypoints.openai.serving_score import OpenAIServingScores
 from vllm.usage.usage_lib import UsageContext
 from vllm.utils import FlexibleArgumentParser, random_uuid
 from vllm.version import __version__ as VLLM_VERSION
@@ -167,7 +169,8 @@ async def run_request(serving_engine_func: Callable,
                       tracker: BatchProgressTracker) -> BatchRequestOutput:
     response = await serving_engine_func(request.body)
 
-    if isinstance(response, (ChatCompletionResponse, EmbeddingResponse)):
+    if isinstance(response,
+                  (ChatCompletionResponse, EmbeddingResponse, ScoreResponse)):
         batch_output = BatchRequestOutput(
             id=f"vllm-{random_uuid()}",
             custom_id=request.custom_id,
@@ -239,6 +242,12 @@ async def main(args):
         chat_template=None,
         chat_template_content_format="auto",
     ) if model_config.task == "embed" else None
+    openai_serving_scores = (OpenAIServingScores(
+        engine,
+        model_config,
+        openai_serving_models,
+        request_logger=request_logger,
+    ) if model_config.task == "score" else None)
 
     tracker = BatchProgressTracker()
     logger.info("Reading batch from %s...", args.input_file)
@@ -279,14 +288,28 @@ async def main(args):
                     ))
                 continue
 
+            response_futures.append(run_request(handler_fn, request, tracker))
+            tracker.submitted()
+        elif request.url == "/v1/score":
+            handler_fn = (None if openai_serving_scores is None else
+                          openai_serving_scores.create_score)
+            if handler_fn is None:
+                response_futures.append(
+                    make_async_error_request_output(
+                        request,
+                        error_msg="The model does not support Scores API",
+                    ))
+                continue
+
             response_futures.append(run_request(handler_fn, request, tracker))
             tracker.submitted()
         else:
             response_futures.append(
                 make_async_error_request_output(
                     request,
-                    error_msg="Only /v1/chat/completions and "
-                    "/v1/embeddings are supported in the batch endpoint.",
+                    error_msg=
+                    "Only /v1/chat/completions, /v1/embeddings, and /v1/score "
+                    "are supported in the batch endpoint.",
                 ))
 
     with tracker.pbar():

From 5204ff5c3feeb96e8a6eea65dfcb78395f90d4d8 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Mon, 27 Jan 2025 13:26:44 +0800
Subject: [PATCH 392/462] [Bugfix] Fix Granite 3.0 MoE model loading (#12446)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 vllm/model_executor/models/granitemoe.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py
index 51296ef0cc08e..b518a0a6cbdee 100644
--- a/vllm/model_executor/models/granitemoe.py
+++ b/vllm/model_executor/models/granitemoe.py
@@ -348,6 +348,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
 
         self.config = config
         self.lora_config = lora_config
+        self.quant_config = quant_config  # Required by MixtralForCausalLM
 
         self.model = GraniteMoeModel(vllm_config=vllm_config,
                                      prefix=maybe_prefix(prefix, "model"))

From 372bf0890b19cc3c2992ce5c16eca3647e2a9e13 Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Mon, 27 Jan 2025 15:25:30 +0800
Subject: [PATCH 393/462] [Bugfix] Fix missing seq_start_loc in xformers
 prefill metadata (#12464)

Signed-off-by: Isotr0py <2037008807@qq.com>
---
 vllm/attention/backends/xformers.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py
index 8c25dda7aad2c..49f47f9c8ded3 100644
--- a/vllm/attention/backends/xformers.py
+++ b/vllm/attention/backends/xformers.py
@@ -199,6 +199,8 @@ def prefill_metadata(self) -> Optional["XFormersMetadata"]:
         # Compute some attn_metadata fields which default to None
         query_start_loc = (None if self.query_start_loc is None else
                            self.query_start_loc[:self.num_prefills + 1])
+        seq_start_loc = (None if self.seq_start_loc is None else
+                         self.seq_start_loc[:self.num_prefills + 1])
         slot_mapping = (None if self.slot_mapping is None else
                         self.slot_mapping[:self.num_prefill_tokens])
         seq_lens = (None if self.seq_lens is None else
@@ -225,6 +227,7 @@ def prefill_metadata(self) -> Optional["XFormersMetadata"]:
             max_prefill_seq_len=self.max_prefill_seq_len,
             max_decode_seq_len=0,
             query_start_loc=query_start_loc,
+            seq_start_loc=seq_start_loc,
             context_lens_tensor=context_lens_tensor,
             block_tables=block_tables,
             use_cuda_graph=False,

From 624a1e4711cb9cfdd7e336980668e64744a84863 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Mon, 27 Jan 2025 01:09:27 -0800
Subject: [PATCH 394/462] [V1][Minor] Minor optimizations for
 update_from_output (#12454)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
---
 vllm/v1/core/scheduler.py | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py
index 8ded5e5787133..de7fb1a698df6 100644
--- a/vllm/v1/core/scheduler.py
+++ b/vllm/v1/core/scheduler.py
@@ -411,6 +411,10 @@ def update_from_output(
         num_scheduled_tokens = scheduler_output.num_scheduled_tokens
         new_running: List[Request] = []
         outputs: List[EngineCoreOutput] = []
+
+        # NOTE(woosuk): As len(self.running) can be up to 1K or more, the below
+        # loop can be a performance bottleneck. We should do our best to avoid
+        # expensive operations inside the loop.
         for request in self.running:
             req_id = request.request_id
             request.num_computed_tokens += num_scheduled_tokens[req_id]
@@ -421,13 +425,15 @@ def update_from_output(
 
             cached_encoder_input_ids = (
                 self.encoder_cache_manager.get_cached_input_ids(request))
-            for input_id in list(cached_encoder_input_ids):
-                start_pos = request.mm_positions[input_id]["offset"]
-                num_tokens = request.mm_positions[input_id]["length"]
-                if start_pos + num_tokens <= request.num_computed_tokens:
-                    # The encoder output is already processed and stored
-                    # in the decoder's KV cache.
-                    self.encoder_cache_manager.free(request, input_id)
+            # OPTIMIZATION: Avoid list(set) if the set is empty.
+            if cached_encoder_input_ids:
+                for input_id in list(cached_encoder_input_ids):
+                    start_pos = request.mm_positions[input_id]["offset"]
+                    num_tokens = request.mm_positions[input_id]["length"]
+                    if start_pos + num_tokens <= request.num_computed_tokens:
+                        # The encoder output is already processed and stored
+                        # in the decoder's KV cache.
+                        self.encoder_cache_manager.free(request, input_id)
 
             if request.num_computed_tokens == request.num_tokens:
                 req_index = model_runner_output.req_id_to_index[req_id]

From ce69f7f7542bdb8b6e6302d112fb9fad212c1460 Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Mon, 27 Jan 2025 18:31:49 +0800
Subject: [PATCH 395/462] [Bugfix] Fix gpt2 GGUF inference (#12467)

Signed-off-by: Isotr0py <2037008807@qq.com>
---
 vllm/model_executor/models/gpt2.py | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py
index 1656a3cc9e46d..2f1aa2d68653c 100644
--- a/vllm/model_executor/models/gpt2.py
+++ b/vllm/model_executor/models/gpt2.py
@@ -258,13 +258,13 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self.transformer = GPT2Model(vllm_config=vllm_config,
                                      prefix=maybe_prefix(
                                          prefix, "transformer"))
+        self.lm_head = ParallelLMHead(self.config.vocab_size,
+                                      self.config.hidden_size,
+                                      quant_config=quant_config,
+                                      prefix=f"{prefix}.lm_head")
         if self.config.tie_word_embeddings:
-            self.lm_head = self.transformer.wte
-        else:
-            self.lm_head = ParallelLMHead(self.config.vocab_size,
-                                          self.config.hidden_size,
-                                          quant_config=quant_config,
-                                          prefix=f"{prefix}.lm_head")
+            self.lm_head = self.lm_head.tie_weights(self.transformer.wte)
+
         self.logits_processor = LogitsProcessor(config.vocab_size)
         self.sampler = get_sampler()
         self.make_empty_intermediate_tensors = (
@@ -309,15 +309,12 @@ def load_weights(self, weights: Iterable[Tuple[str,
         params_dict = dict(self.named_parameters(remove_duplicate=False))
         loaded_params: Set[str] = set()
         for name, loaded_weight in weights:
-            if name.startswith("lm_head"):
-                # GPT-2 ties the weights of the embedding layer and the final
-                # linear layer.
-                continue
             if ".attn.bias" in name or ".attn.masked_bias" in name:
                 # Skip attention mask.
                 # NOTE: "c_attn.bias" should not be skipped.
                 continue
-            if not name.startswith("transformer."):
+            if not name.startswith("transformer.") and not name.startswith(
+                    "lm_head"):
                 name = "transformer." + name
 
             if is_pp_missing_parameter(name, self):

From 103bd17ac585b44372a47f365d80f13446cf362d Mon Sep 17 00:00:00 2001
From: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Date: Mon, 27 Jan 2025 10:40:00 -0500
Subject: [PATCH 396/462] [Build] Only build 9.0a for scaled_mm and sparse
 kernels (#12339)

Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
---
 CMakeLists.txt    |  8 ++++----
 cmake/utils.cmake | 43 ++++++++++++++++++++++++++++---------------
 2 files changed, 32 insertions(+), 19 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index ead539993d98c..4dee9ec36895f 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -275,7 +275,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # Only build Marlin kernels if we are building for at least some compatible archs.
   # Keep building Marlin for 9.0 as there are some group sizes and shapes that
   # are not supported by Machete yet.
-  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0" ${CUDA_ARCHS})
+  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}")
   if (MARLIN_ARCHS)
     set(MARLIN_SRCS
        "csrc/quantization/fp8/fp8_marlin.cu"
@@ -296,8 +296,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   endif()
 
   # The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require
-  # CUDA 12.0 or later (and only work on Hopper, 9.0/9.0a for now).
-  cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0;9.0a" "${CUDA_ARCHS}")
+  # CUDA 12.0 or later (and only work on Hopper, 9.0a for now).
+  cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0a" "${CUDA_ARCHS}")
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
     set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu")
     set_gencode_flags_for_srcs(
@@ -351,7 +351,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # 2:4 Sparse Kernels
 
   # The 2:4 sparse kernels cutlass_scaled_sparse_mm and cutlass_compressor
-  # require CUDA 12.2 or later (and only work on Hopper, 9.0/9.0a for now).
+  # require CUDA 12.2 or later (and only work on Hopper, 9.0a for now).
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS)
     set(SRCS "csrc/sparse/cutlass/sparse_compressor_c3x.cu"
              "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu")
diff --git a/cmake/utils.cmake b/cmake/utils.cmake
index 15b09395a889f..1c1c539819d05 100644
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@@ -259,7 +259,7 @@ endmacro()
 #  in `SRC_CUDA_ARCHS` that is less or equal to the version in `TGT_CUDA_ARCHS`.
 # We have special handling for 9.0a, if 9.0a is in `SRC_CUDA_ARCHS` and 9.0 is
 #  in `TGT_CUDA_ARCHS` then we should remove 9.0a from `SRC_CUDA_ARCHS` and add
-#  9.0a to the result. 
+#  9.0a to the result (and remove 9.0 from TGT_CUDA_ARCHS). 
 # The result is stored in `OUT_CUDA_ARCHS`.
 #
 # Example:
@@ -270,34 +270,47 @@ endmacro()
 #
 function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_ARCHS)
   list(REMOVE_DUPLICATES SRC_CUDA_ARCHS)
+  set(TGT_CUDA_ARCHS_ ${TGT_CUDA_ARCHS})
 
   # if 9.0a is in SRC_CUDA_ARCHS and 9.0 is in CUDA_ARCHS then we should
   # remove 9.0a from SRC_CUDA_ARCHS and add 9.0a to _CUDA_ARCHS
   set(_CUDA_ARCHS)
   if ("9.0a" IN_LIST SRC_CUDA_ARCHS)
     list(REMOVE_ITEM SRC_CUDA_ARCHS "9.0a")
-    if ("9.0" IN_LIST TGT_CUDA_ARCHS)
+    if ("9.0" IN_LIST TGT_CUDA_ARCHS_)
+      list(REMOVE_ITEM TGT_CUDA_ARCHS_ "9.0")
       set(_CUDA_ARCHS "9.0a")
     endif()
   endif()
 
   list(SORT SRC_CUDA_ARCHS COMPARE NATURAL ORDER ASCENDING)
 
-  # for each ARCH in CUDA_ARCHS find the highest arch in SRC_CUDA_ARCHS that is 
-  # less or eqault to ARCH
-  foreach(_ARCH ${CUDA_ARCHS})
-  set(_TMP_ARCH)
-  foreach(_SRC_ARCH ${SRC_CUDA_ARCHS})
-    if (_SRC_ARCH VERSION_LESS_EQUAL _ARCH)
-      set(_TMP_ARCH ${_SRC_ARCH})
-    else()
-      break()
+  # for each ARCH in TGT_CUDA_ARCHS find the highest arch in SRC_CUDA_ARCHS that
+  # is less or equal to ARCH (but has the same major version since SASS binary
+  # compatibility is only forward compatible within the same major version).
+  foreach(_ARCH ${TGT_CUDA_ARCHS_})
+    set(_TMP_ARCH)
+    # Extract the major version of the target arch
+    string(REGEX REPLACE "^([0-9]+)\\..*$" "\\1" TGT_ARCH_MAJOR "${_ARCH}")
+    foreach(_SRC_ARCH ${SRC_CUDA_ARCHS})
+      # Extract the major version of the source arch
+      string(REGEX REPLACE "^([0-9]+)\\..*$" "\\1" SRC_ARCH_MAJOR "${_SRC_ARCH}")
+      # Check major-version match AND version-less-or-equal
+      if (_SRC_ARCH VERSION_LESS_EQUAL _ARCH)
+        if (SRC_ARCH_MAJOR STREQUAL TGT_ARCH_MAJOR)
+          set(_TMP_ARCH "${_SRC_ARCH}")
+        endif()
+      else()
+        # If we hit a version greater than the target, we can break
+        break()
+      endif()
+    endforeach()
+
+    # If we found a matching _TMP_ARCH, append it to _CUDA_ARCHS
+    if (_TMP_ARCH)
+      list(APPEND _CUDA_ARCHS "${_TMP_ARCH}")
     endif()
   endforeach()
-  if (_TMP_ARCH)
-    list(APPEND _CUDA_ARCHS ${_TMP_ARCH})
-  endif()
-  endforeach()
 
   list(REMOVE_DUPLICATES _CUDA_ARCHS)
   set(${OUT_CUDA_ARCHS} ${_CUDA_ARCHS} PARENT_SCOPE)

From 01ba927040d0b6f7d8daf6bfbf32fde562d2f8a6 Mon Sep 17 00:00:00 2001
From: Mark McLoughlin <markmc@redhat.com>
Date: Mon, 27 Jan 2025 17:26:28 +0000
Subject: [PATCH 397/462] [V1][Metrics] Add initial Prometheus logger (#12416)

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
---
 tests/entrypoints/openai/test_metrics.py | 41 ++++++++++++++++++++----
 vllm/v1/engine/async_llm.py              | 11 ++++---
 vllm/v1/metrics/loggers.py               | 36 +++++++++++++++++++++
 3 files changed, 78 insertions(+), 10 deletions(-)

diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py
index 6523c8b6297c6..469a5fb039fb6 100644
--- a/tests/entrypoints/openai/test_metrics.py
+++ b/tests/entrypoints/openai/test_metrics.py
@@ -16,6 +16,24 @@
 MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
 
 
+@pytest.fixture(scope="module", params=[True, False])
+def use_v1(request):
+    # Module-scoped variant of run_with_both_engines
+    #
+    # Use this fixture to run a test with both v0 and v1, and
+    # also to conditionalize the test logic e.g.
+    #
+    # def test_metrics_exist(use_v1, server, client):
+    #     ...
+    #     expected = EXPECTED_V1_METRICS if use_v1 else EXPECTED_METRICS
+    #     for metric in expected:
+    #         assert metric in response.text
+    #
+    # @skip_v1 wouldn't work here because this is a module-level
+    # fixture - per-function decorators would have no effect
+    yield request.param
+
+
 @pytest.fixture(scope="module")
 def default_server_args():
     return [
@@ -36,10 +54,12 @@ def default_server_args():
                     "--enable-chunked-prefill",
                     "--disable-frontend-multiprocessing",
                 ])
-def server(default_server_args, request):
+def server(use_v1, default_server_args, request):
     if request.param:
         default_server_args.append(request.param)
-    with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server:
+    env_dict = dict(VLLM_USE_V1='1' if use_v1 else '0')
+    with RemoteOpenAIServer(MODEL_NAME, default_server_args,
+                            env_dict=env_dict) as remote_server:
         yield remote_server
 
 
@@ -84,7 +104,9 @@ async def client(server):
 
 @pytest.mark.asyncio
 async def test_metrics_counts(server: RemoteOpenAIServer,
-                              client: openai.AsyncClient):
+                              client: openai.AsyncClient, use_v1: bool):
+    if use_v1:
+        pytest.skip("Skipping test on vllm V1")
     for _ in range(_NUM_REQUESTS):
         # sending a request triggers the metrics to be logged.
         await client.completions.create(
@@ -174,10 +196,15 @@ async def test_metrics_counts(server: RemoteOpenAIServer,
     "swap_space_bytes",
 ]
 
+EXPECTED_METRICS_V1 = [
+    "vllm:num_requests_running",
+    "vllm:num_requests_waiting",
+]
+
 
 @pytest.mark.asyncio
 async def test_metrics_exist(server: RemoteOpenAIServer,
-                             client: openai.AsyncClient):
+                             client: openai.AsyncClient, use_v1: bool):
     # sending a request triggers the metrics to be logged.
     await client.completions.create(model=MODEL_NAME,
                                     prompt="Hello, my name is",
@@ -187,11 +214,13 @@ async def test_metrics_exist(server: RemoteOpenAIServer,
     response = requests.get(server.url_for("metrics"))
     assert response.status_code == HTTPStatus.OK
 
-    for metric in EXPECTED_METRICS:
+    for metric in (EXPECTED_METRICS_V1 if use_v1 else EXPECTED_METRICS):
         assert metric in response.text
 
 
-def test_metrics_exist_run_batch():
+def test_metrics_exist_run_batch(use_v1: bool):
+    if use_v1:
+        pytest.skip("Skipping test on vllm V1")
     input_batch = """{"custom_id": "request-0", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "You are a helpful assistant."}}"""  # noqa: E501
 
     base_url = "0.0.0.0"
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index 6dc68b3a16099..917d52d3220b8 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -24,7 +24,8 @@
 from vllm.v1.engine.output_processor import OutputProcessor
 from vllm.v1.engine.processor import Processor
 from vllm.v1.executor.abstract import Executor
-from vllm.v1.metrics.loggers import LoggingStatLogger, StatLoggerBase
+from vllm.v1.metrics.loggers import (LoggingStatLogger, PrometheusStatLogger,
+                                     StatLoggerBase)
 from vllm.v1.metrics.stats import IterationStats, SchedulerStats
 
 logger = init_logger(__name__)
@@ -46,13 +47,15 @@ def __init__(
 
         assert start_engine_loop
 
+        self.model_config = vllm_config.model_config
+
         self.log_requests = log_requests
         self.log_stats = log_stats
         self.stat_loggers: List[StatLoggerBase] = [
             LoggingStatLogger(),
-            # TODO(rob): PrometheusStatLogger(),
+            PrometheusStatLogger(labels=dict(
+                model_name=self.model_config.served_model_name)),
         ]
-        self.model_config = vllm_config.model_config
 
         # Tokenizer (+ ensure liveness if running in another process).
         self.tokenizer = init_tokenizer_from_configs(
@@ -272,7 +275,7 @@ async def _run_output_handler(self):
 
                 # 4) Logging.
                 # TODO(rob): make into a coroutine and launch it in
-                # background thread once we add Prometheus.
+                # background thread once Prometheus overhead is non-trivial.
                 assert iteration_stats is not None
                 self._log_stats(
                     scheduler_stats=outputs.scheduler_stats,
diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py
index 8feeef17542e6..b84f03fa3267c 100644
--- a/vllm/v1/metrics/loggers.py
+++ b/vllm/v1/metrics/loggers.py
@@ -1,5 +1,8 @@
 import time
 from abc import ABC, abstractmethod
+from typing import Dict
+
+import prometheus_client
 
 from vllm.logger import init_logger
 from vllm.v1.metrics.stats import SchedulerStats
@@ -36,3 +39,36 @@ def log(self, scheduler_stats: SchedulerStats):
             scheduler_stats.num_running_reqs,
             scheduler_stats.num_waiting_reqs,
         )
+
+
+class PrometheusStatLogger(StatLoggerBase):
+
+    def __init__(self, labels: Dict[str, str]):
+        self.labels = labels
+
+        labelnames = self.labels.keys()
+        labelvalues = self.labels.values()
+
+        self._unregister_vllm_metrics()
+
+        self.gauge_scheduler_running = prometheus_client.Gauge(
+            name="vllm:num_requests_running",
+            documentation="Number of requests in model execution batches.",
+            labelnames=labelnames).labels(*labelvalues)
+
+        self.gauge_scheduler_waiting = prometheus_client.Gauge(
+            name="vllm:num_requests_waiting",
+            documentation="Number of requests waiting to be processed.",
+            labelnames=labelnames).labels(*labelvalues)
+
+    def log(self, scheduler_stats: SchedulerStats):
+        """Log to prometheus."""
+        self.gauge_scheduler_running.set(scheduler_stats.num_running_reqs)
+        self.gauge_scheduler_waiting.set(scheduler_stats.num_waiting_reqs)
+
+    @staticmethod
+    def _unregister_vllm_metrics():
+        # Unregister any existing vLLM collectors (for CI/CD
+        for collector in list(prometheus_client.REGISTRY._collector_to_names):
+            if hasattr(collector, "_name") and "vllm" in collector._name:
+                prometheus_client.REGISTRY.unregister(collector)

From 3f1fc7425a7db4d9722941075e43bb2ebfb90613 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Mon, 27 Jan 2025 09:40:04 -0800
Subject: [PATCH 398/462] [V1][CI/Test] Do basic test for top-p & top-k
 sampling (#12469)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
---
 tests/v1/engine/test_engine_core.py | 28 ++++++++++++++++++++--------
 1 file changed, 20 insertions(+), 8 deletions(-)

diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py
index cccfd305ac604..033bbcfce564e 100644
--- a/tests/v1/engine/test_engine_core.py
+++ b/tests/v1/engine/test_engine_core.py
@@ -144,7 +144,7 @@ def test_engine_core(monkeypatch):
 def test_engine_core_advanced_sampling(monkeypatch):
     """
     A basic end-to-end test to verify that the engine functions correctly 
-    when additional sampling parameters, such as min_tokens and 
+    when additional sampling parameters, such as top_p, min_tokens, and 
     presence_penalty, are set.
     """
     with monkeypatch.context() as m:
@@ -167,11 +167,23 @@ def test_engine_core_advanced_sampling(monkeypatch):
             stop_token_ids=[1001, 1002],
         )
         engine_core.add_request(request)
-        assert len(engine_core.scheduler.waiting) == 1
-        assert len(engine_core.scheduler.running) == 0
-        # Loop through until they are all done.
-        while len(engine_core.step().outputs) > 0:
-            pass
 
-        assert len(engine_core.scheduler.waiting) == 0
-        assert len(engine_core.scheduler.running) == 0
+        def _check_engine_state():
+            assert len(engine_core.scheduler.waiting) == 1
+            assert len(engine_core.scheduler.running) == 0
+            # Loop through until they are all done.
+            while len(engine_core.step().outputs) > 0:
+                pass
+            assert len(engine_core.scheduler.waiting) == 0
+            assert len(engine_core.scheduler.running) == 0
+
+        _check_engine_state()
+
+        # Second request.
+        request2 = make_request()
+        request2.sampling_params = SamplingParams(
+            top_p=0.99,
+            top_k=50,
+        )
+        engine_core.add_request(request2)
+        _check_engine_state()

From 2bc3fbba0cf5b07fabb798d41b153b895d30c7b4 Mon Sep 17 00:00:00 2001
From: Bowen Wang <abmfy@icloud.com>
Date: Tue, 28 Jan 2025 02:19:24 +0800
Subject: [PATCH 399/462] [FlashInfer] Upgrade to 0.2.0 (#11194)

Signed-off-by: Bowen Wang <abmfy@icloud.com>
Signed-off-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: youkaichao <youkaichao@gmail.com>
---
 .buildkite/test-pipeline.yaml                 |  11 +-
 Dockerfile                                    |  23 ++-
 .../test_basic_correctness.py                 |   5 +-
 tests/compile/test_basic_correctness.py       |   2 +-
 tests/kernels/test_flashinfer.py              |  74 +++----
 vllm/attention/backends/flashinfer.py         | 183 ++++++++++++++++--
 vllm/config.py                                |  10 +-
 vllm/model_executor/model_loader/loader.py    |   4 +-
 .../model_executor/model_loader/tensorizer.py |   3 +-
 vllm/worker/worker_base.py                    |  17 +-
 10 files changed, 257 insertions(+), 75 deletions(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index daec46760117d..d5d02fdeb7f4b 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -183,7 +183,16 @@ steps:
     - vllm/
     - tests/v1
   commands:
-    - VLLM_USE_V1=1 pytest -v -s v1
+    # split the test to avoid interference
+    - VLLM_USE_V1=1 pytest -v -s v1/core
+    - VLLM_USE_V1=1 pytest -v -s v1/engine
+    - VLLM_USE_V1=1 pytest -v -s v1/sample
+    - VLLM_USE_V1=1 pytest -v -s v1/worker
+    - VLLM_USE_V1=1 pytest -v -s v1/test_stats.py
+    - VLLM_USE_V1=1 pytest -v -s v1/test_utils.py
+    # TODO: accuracy does not match, whether setting
+    # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
+    - VLLM_USE_V1=1 pytest -v -s v1/e2e
 
 - label: Examples Test # 25min
   working_dir: "/vllm-workspace/examples"
diff --git a/Dockerfile b/Dockerfile
index cb9cf0da5be65..0b9f74e08dc68 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -149,7 +149,8 @@ RUN --mount=type=cache,target=/root/.cache/pip \
 
 #################### vLLM installation IMAGE ####################
 # image with vLLM installed
-FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu22.04 AS vllm-base
+# TODO: Restore to base image after FlashInfer AOT wheel fixed
+FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS vllm-base
 ARG CUDA_VERSION=12.4.1
 ARG PYTHON_VERSION=3.12
 WORKDIR /vllm-workspace
@@ -194,12 +195,30 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
     --mount=type=cache,target=/root/.cache/pip \
     python3 -m pip install dist/*.whl --verbose
 
+# How to build this FlashInfer wheel:
+# $ export FLASHINFER_ENABLE_AOT=1
+# $ # Note we remove 7.0 from the arch list compared to the list below, since FlashInfer only supports sm75+
+# $ export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.6 8.9 9.0+PTX'
+# $ git clone https://github.com/flashinfer-ai/flashinfer.git --recursive
+# $ cd flashinfer
+# $ git checkout 524304395bd1d8cd7d07db083859523fcaa246a4
+# $ python3 setup.py bdist_wheel --dist-dir=dist --verbose
+
 RUN --mount=type=cache,target=/root/.cache/pip \
 . /etc/environment && \
 if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
-    python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl; \
+    python3 -m pip install https://wheels.vllm.ai/flashinfer/524304395bd1d8cd7d07db083859523fcaa246a4/flashinfer_python-0.2.0.post1-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl; \
 fi
 COPY examples examples
+
+# Although we build Flashinfer with AOT mode, there's still
+# some issues w.r.t. JIT compilation. Therefore we need to
+# install build dependencies for JIT compilation.
+# TODO: Remove this once FlashInfer AOT wheel is fixed
+COPY requirements-build.txt requirements-build.txt
+RUN --mount=type=cache,target=/root/.cache/pip \
+    python3 -m pip install -r requirements-build.txt
+
 #################### vLLM installation IMAGE ####################
 
 #################### TEST IMAGE ####################
diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py
index 31a101e48e026..23285040642a8 100644
--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@@ -61,9 +61,10 @@ def test_models(
     if backend == "FLASHINFER" and current_platform.is_rocm():
         pytest.skip("Flashinfer does not support ROCm/HIP.")
 
-    if backend == "XFORMERS" and model == "google/gemma-2-2b-it":
+    if backend in ("XFORMERS",
+                   "FLASHINFER") and model == "google/gemma-2-2b-it":
         pytest.skip(
-            "XFORMERS does not support gemma2 with full context length.")
+            f"{backend} does not support gemma2 with full context length.")
 
     os.environ["VLLM_ATTENTION_BACKEND"] = backend
 
diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py
index 87d5aefea6cb4..1945479fc3031 100644
--- a/tests/compile/test_basic_correctness.py
+++ b/tests/compile/test_basic_correctness.py
@@ -58,7 +58,7 @@ class TestSetting:
         model_args=["--task", "embed"],
         pp_size=1,
         tp_size=1,
-        attn_backend="FLASHINFER",
+        attn_backend="FLASH_ATTN",
         method="encode",
         fullgraph=True,
     ),
diff --git a/tests/kernels/test_flashinfer.py b/tests/kernels/test_flashinfer.py
index a2c8f71665737..1645ef911d697 100644
--- a/tests/kernels/test_flashinfer.py
+++ b/tests/kernels/test_flashinfer.py
@@ -133,17 +133,19 @@ def test_flashinfer_decode_with_paged_kv(
                 use_tensor_cores=(
                     (num_query_heads//num_kv_heads) > 4)
                 )
-    wrapper.begin_forward(kv_indptr,
-                          kv_indices,
-                          kv_last_page_lens,
-                          num_query_heads,
-                          num_kv_heads,
-                          head_size,
-                          block_size,
-                          "NONE",
-                          data_type=dtype)
-
-    output = wrapper.forward(query, key_value_cache, logits_soft_cap=soft_cap)
+    wrapper.plan(kv_indptr,
+                 kv_indices,
+                 kv_last_page_lens,
+                 num_query_heads,
+                 num_kv_heads,
+                 head_size,
+                 block_size,
+                 "NONE",
+                 q_data_type=dtype,
+                 kv_data_type=dtype,
+                 logits_soft_cap=soft_cap)
+
+    output = wrapper.run(query, key_value_cache)
 
     ref_output = ref_paged_attn(query=query,
                                 key_cache=key_cache,
@@ -228,7 +230,7 @@ def test_flashinfer_prefill_with_paged_kv(seq_lens: List[Tuple[int, int]],
     workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8)
     wrapper = flashinfer.BatchPrefillWithPagedKVCacheWrapper(
         workspace_buffer, "NHD")
-    wrapper.begin_forward(
+    wrapper.plan(
         qo_indptr,
         kv_indptr,
         kv_indices,
@@ -237,12 +239,14 @@ def test_flashinfer_prefill_with_paged_kv(seq_lens: List[Tuple[int, int]],
         num_kv_heads,
         head_size,
         block_size,
+        q_data_type=dtype,
+        kv_data_type=dtype,
+        logits_soft_cap=soft_cap,
     )
 
-    output = wrapper.forward(
+    output = wrapper.run(
         query,
         key_value_cache,
-        logits_soft_cap=soft_cap,
     )
 
     ref_output = ref_paged_attn(query=query,
@@ -253,7 +257,7 @@ def test_flashinfer_prefill_with_paged_kv(seq_lens: List[Tuple[int, int]],
                                 block_tables=block_tables,
                                 scale=scale,
                                 soft_cap=soft_cap)
-    torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=1e-2), \
+    torch.testing.assert_close(output, ref_output, atol=5e-2, rtol=1e-2), \
         f"{torch.max(torch.abs(output - ref_output))}"
 
 
@@ -332,7 +336,7 @@ def test_flashinfer_prefill_with_paged_fp8_kv(
     workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8)
     wrapper = flashinfer.BatchPrefillWithPagedKVCacheWrapper(
         workspace_buffer, "NHD")
-    wrapper.begin_forward(
+    wrapper.plan(
         qo_indptr,
         kv_indptr,
         kv_indices,
@@ -341,13 +345,12 @@ def test_flashinfer_prefill_with_paged_fp8_kv(
         num_kv_heads,
         head_size,
         block_size,
+        q_data_type=dtype,
+        kv_data_type=kv_cache_dtype,
+        logits_soft_cap=soft_cap,
     )
 
-    output = wrapper.forward(query,
-                             kv_cache_fp8,
-                             logits_soft_cap=soft_cap,
-                             k_scale=k_scale,
-                             v_scale=v_scale)
+    output = wrapper.run(query, kv_cache_fp8, k_scale=k_scale, v_scale=v_scale)
 
     ref_output = ref_paged_attn(query=query,
                                 key_cache=key_cache.squeeze(1),
@@ -360,7 +363,7 @@ def test_flashinfer_prefill_with_paged_fp8_kv(
     del query
     del block_tables
     # verify prefill fp8
-    torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=1e-2), \
+    torch.testing.assert_close(output, ref_output, atol=5e-2, rtol=1e-2), \
         f"{torch.max(torch.abs(output - ref_output))}"
 
 
@@ -439,21 +442,18 @@ def test_flashinfer_decode_with_paged_fp8_kv(
     wrapper = flashinfer.\
         BatchDecodeWithPagedKVCacheWrapper(workspace_buffer, "NHD",
                     use_tensor_cores=use_tensor_cores)
-    wrapper.begin_forward(kv_indptr,
-                          kv_indices,
-                          kv_last_page_lens,
-                          num_query_heads,
-                          num_kv_heads,
-                          head_size,
-                          block_size,
-                          "NONE",
-                          data_type=dtype,
-                          q_data_type=dtype)
-    output = wrapper.forward(query,
-                             kv_cache_fp8,
-                             logits_soft_cap=soft_cap,
-                             k_scale=k_scale,
-                             v_scale=v_scale)
+    wrapper.plan(kv_indptr,
+                 kv_indices,
+                 kv_last_page_lens,
+                 num_query_heads,
+                 num_kv_heads,
+                 head_size,
+                 block_size,
+                 "NONE",
+                 q_data_type=dtype,
+                 kv_data_type=kv_cache_dtype,
+                 logits_soft_cap=soft_cap)
+    output = wrapper.run(query, kv_cache_fp8, k_scale=k_scale, v_scale=v_scale)
     key_cache = key_value_cache[:, 0, :, :, :].squeeze(1)
     value_cache = key_value_cache[:, 1, :, :, :].squeeze(1)
 
diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py
index 3135b0b405343..7cccef9608218 100644
--- a/vllm/attention/backends/flashinfer.py
+++ b/vllm/attention/backends/flashinfer.py
@@ -1,3 +1,4 @@
+import dataclasses
 from collections import defaultdict
 from contextlib import contextmanager
 from dataclasses import dataclass
@@ -13,9 +14,11 @@
     from vllm.vllm_flash_attn import flash_attn_varlen_func
     FLASHINFER_WORKSPACE_BUFFER_SIZE = 256 * 1024 * 1024
 except ImportError:
-    BatchDecodeWithPagedKVCacheWrapper = None
-    CUDAGraphBatchDecodeWithPagedKVCacheWrapper = None
-    BatchPrefillWithPagedKVCacheWrapper = None
+    # Avoid turning these types into variables during type checking
+    if not TYPE_CHECKING:
+        BatchDecodeWithPagedKVCacheWrapper = None
+        CUDAGraphBatchDecodeWithPagedKVCacheWrapper = None
+        BatchPrefillWithPagedKVCacheWrapper = None
     FLASHINFER_WORKSPACE_BUFFER_SIZE = 0
 
 import torch
@@ -30,7 +33,9 @@
 from vllm.attention.backends.utils import (PAD_SLOT_ID, compute_slot_mapping,
                                            compute_slot_mapping_start_idx,
                                            is_block_tables_empty)
+from vllm.attention.layer import Attention
 from vllm.attention.ops.paged_attn import PagedAttention
+from vllm.config import VllmConfig, get_current_vllm_config
 from vllm.utils import (async_tensor_h2d, get_kv_cache_torch_dtype,
                         make_tensor_with_pad)
 
@@ -99,6 +104,72 @@ def get_fp8_dtype_for_flashinfer(kv_cache_dtype: str) -> torch.dtype:
             raise ValueError(f"Unrecognized FP8 dtype: {kv_cache_dtype}")
 
 
+@dataclass
+class PerLayerParameters:
+    """
+    Currently, FlashInfer backend only support models in which all layers share
+    the same values for the following hyperparameters.
+    """
+
+    window_left: int
+    logits_soft_cap: Optional[float]
+    sm_scale: float
+
+
+def get_per_layer_parameters(
+        vllm_config: VllmConfig) -> Dict[str, PerLayerParameters]:
+    """
+    Scan all attention layers and determine some hyperparameters
+    to use during `plan`.
+    """
+
+    layers = vllm_config.compilation_config.static_forward_context
+    per_layer_params: Dict[str, PerLayerParameters] = {}
+
+    for key, layer in layers.items():
+        assert isinstance(layer, Attention)
+
+        impl = layer.impl
+        assert isinstance(impl, FlashInferImpl)
+
+        # Infer hyperparameters from the attention layer
+        window_size = impl.sliding_window
+        window_left = window_size[0] if window_size is not None else -1
+        logits_soft_cap = impl.logits_soft_cap
+        sm_scale = impl.scale
+
+        per_layer_params[key] = PerLayerParameters(window_left,
+                                                   logits_soft_cap, sm_scale)
+
+    return per_layer_params
+
+
+def infer_global_hyperparameters(
+        per_layer_params: Dict[str, PerLayerParameters]) -> PerLayerParameters:
+    """
+    Currently, FlashInfer backend only support models in which all layers share
+    the same values for the following hyperparameters:
+    - `window_left`
+    - `logits_soft_cap`
+    - `sm_scale`
+
+    So this function asserts that all layers share the same values for these
+    hyperparameters and returns the global values.
+    """
+
+    assert len(per_layer_params) > 0, "No attention layers found in the model."
+
+    param_sets = list(per_layer_params.values())
+    global_params = param_sets[0]
+    for params in param_sets:
+        assert params == global_params, (
+            "FlashInfer backend currently only supports models in which all "
+            "layers share the same values for the following hyperparameters: "
+            "`window_left`, `logits_soft_cap`, `sm_scale`.")
+
+    return global_params
+
+
 class FlashInferState(AttentionState):
 
     def __init__(self, runner):
@@ -108,6 +179,11 @@ def __init__(self, runner):
         self._decode_wrapper = None
         self._prefill_wrapper = None
 
+        # Global hyperparameters shared by all attention layers
+        self.global_hyperparameters: Optional[PerLayerParameters] = None
+
+        self.vllm_config = get_current_vllm_config()
+
     def _get_workspace_buffer(self):
         if self._workspace_buffer is None:
             self._workspace_buffer = torch.empty(
@@ -215,6 +291,9 @@ def graph_capture_get_metadata_for_batch(
                                             batch_size + 1,
                                             dtype=torch.int32)
 
+        global_params = infer_global_hyperparameters(
+            get_per_layer_parameters(self.vllm_config))
+
         attn_metadata = self.runner.attn_backend.make_metadata(
             num_prefills=0,
             slot_mapping=self._graph_slot_mapping[:batch_size],
@@ -238,7 +317,9 @@ def graph_capture_get_metadata_for_batch(
             q_data_type=self.runner.model_config.dtype,
             use_cuda_graph=True,
             decode_wrapper=self._graph_decode_wrapper,
-            prefill_wrapper=None)
+            prefill_wrapper=None,
+            **dataclasses.asdict(global_params),
+        )
         attn_metadata.begin_forward()
         return attn_metadata
 
@@ -325,9 +406,28 @@ class FlashInferMetadata(AttentionMetadata):
     data_type: torch.dtype = None
     # The data type of the query
     q_data_type: torch.dtype = None
-    device: torch.device = torch.device("cuda")
+    # FlashInfer 0.2 encourages passing host tensors
+    device: torch.device = torch.device("cpu")
     is_profile_run: bool = False
 
+    # The FlashInfer backend currently supports only models in which all layers
+    # share the same following hyperparameters:
+
+    # The left (inclusive) window size for the attention window, when
+    # set to `-1`, the window size will be set to the full length of
+    # the sequence. Defaults to `-1`.
+    window_left: int = -1
+    # The attention logits soft capping value (used in Gemini, Grok and
+    # Gemma-2, etc.), if not provided, will be set to `0`. If greater
+    # than 0, the logits will be capped according to formula:
+    # $$\texttt{logits\_soft\_cap} \times
+    # \mathrm{tanh}(x / \texttt{logits\_soft\_cap})$$,
+    # where $x$ is the input logits.
+    logits_soft_cap: Optional[float] = None
+    # The scale used in softmax, if not provided, will be set to
+    # `1.0 / sqrt(head_dim)`.
+    sm_scale: Optional[float] = None
+
     def __post_init__(self):
         # Refer to
         # https://github.com/flashinfer-ai/flashinfer/blob/3d55c71a62052c590c130897d3a3db49b14fcc34/include/flashinfer/utils.cuh#L157
@@ -363,14 +463,21 @@ def begin_forward(self):
                 self.block_table_bound = self.block_table_bound.to(self.device)
                 self.seq_lens_tensor = self.seq_lens_tensor.to(self.device)
                 self.paged_kv_indices = self.paged_kv_indices.to(self.device)
-                self.prefill_wrapper.end_forward()
-                self.prefill_wrapper.begin_forward(
+                self.prefill_wrapper.plan(
                     self.query_start_loc,
                     self.paged_kv_indptr[:self.num_prefills + 1],
                     self.paged_kv_indices,
                     self.paged_kv_last_page_len[:self.num_prefills],
-                    self.num_qo_heads, self.num_kv_heads, self.head_dim,
-                    self.page_size)
+                    self.num_qo_heads,
+                    self.num_kv_heads,
+                    self.head_dim,
+                    self.page_size,
+                    causal=True,
+                    sm_scale=self.sm_scale,
+                    window_left=self.window_left,
+                    logits_soft_cap=self.logits_soft_cap,
+                    q_data_type=self.q_data_type,
+                    kv_data_type=self.data_type)
         if self.num_decode_tokens > 0:
             assert self.paged_kv_indices is not None
             assert self.paged_kv_indptr is not None
@@ -386,8 +493,7 @@ def begin_forward(self):
                 self.seq_lens_tensor = self.seq_lens_tensor.to(self.device)
 
             assert self.decode_wrapper is not None
-            self.decode_wrapper.end_forward()
-            self.decode_wrapper.begin_forward(
+            self.decode_wrapper.plan(
                 self.paged_kv_indptr[self.num_prefills:],
                 self.paged_kv_indices,
                 self.paged_kv_last_page_len[self.num_prefills:],
@@ -397,8 +503,11 @@ def begin_forward(self):
                 self.page_size,
                 # Disable flashinfer's pos encoding and use vllm's rope.
                 pos_encoding_mode="NONE",
+                window_left=self.window_left,
+                logits_soft_cap=self.logits_soft_cap,
+                sm_scale=self.sm_scale,
                 # kv-cache data type.
-                data_type=self.data_type,
+                kv_data_type=self.data_type,
                 # query data type.
                 q_data_type=self.q_data_type)
 
@@ -496,6 +605,11 @@ def __init__(self, input_builder: "ModelInputForGPUBuilder"):
         self.sliding_window = input_builder.sliding_window
         self.block_size = input_builder.block_size
 
+        # Global hyperparameters shared by all attention layers
+        self.global_hyperparameters: Optional[PerLayerParameters] = None
+
+        self.vllm_config = get_current_vllm_config()
+
     def prepare(self):
         self.slot_mapping: List[int] = []
         self.prefill_seq_lens: List[int] = []
@@ -528,6 +642,20 @@ def prepare(self):
         self.total_blocks = 0
         self.is_profile_run: bool = False
 
+        if self.global_hyperparameters is None:
+            # Infer global hyperparameters, since currently we only support
+            # models in which all layers share the same values for the
+            # following hyperparameters:
+            # - `window_left`
+            # - `logits_soft_cap`
+            # - `sm_scale`
+            inferred_params = infer_global_hyperparameters(
+                get_per_layer_parameters(self.vllm_config))
+            self.global_hyperparameters = inferred_params
+            self.window_left = inferred_params.window_left
+            self.logits_soft_cap = inferred_params.logits_soft_cap
+            self.sm_scale = inferred_params.sm_scale
+
     def _add_seq_group(
             self, inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup",
             chunked_prefill_enabled: bool):
@@ -756,7 +884,11 @@ def build(self, seq_lens: List[int], query_lens: List[int],
             data_type=kv_cache_dtype,
             q_data_type=self.runner.model_config.dtype,
             use_cuda_graph=use_captured_graph,
-            is_profile_run=self.is_profile_run)
+            is_profile_run=self.is_profile_run,
+            window_left=self.window_left,
+            logits_soft_cap=self.logits_soft_cap,
+            sm_scale=self.sm_scale,
+        )
 
 
 class FlashInferImpl(AttentionImpl):
@@ -885,25 +1017,34 @@ def forward(
             else:
                 assert prefill_meta is not None
                 assert prefill_meta.prefill_wrapper is not None
-                prefill_output = prefill_meta.prefill_wrapper.forward(
+
+                assert prefill_meta.prefill_wrapper._causal
+                assert prefill_meta.prefill_wrapper._window_left == window_left
+                assert prefill_meta.prefill_wrapper._logits_soft_cap == (
+                    logits_soft_cap or 0.0)
+                assert prefill_meta.prefill_wrapper._sm_scale == softmax_scale
+
+                prefill_output = prefill_meta.prefill_wrapper.run(
                     query,
                     kv_cache,
-                    logits_soft_cap=logits_soft_cap,
-                    causal=True,
                     k_scale=layer._k_scale_float,
                     v_scale=layer._v_scale_float,
-                    window_left=window_left)
+                )
         if decode_meta := attn_metadata.decode_metadata:
             assert decode_meta is not None
             assert decode_meta.decode_wrapper is not None
-            decode_output = decode_meta.decode_wrapper.forward(
+
+            assert decode_meta.decode_wrapper._window_left == window_left
+            assert decode_meta.decode_wrapper._logits_soft_cap == (
+                logits_soft_cap or 0.0)
+            assert decode_meta.decode_wrapper._sm_scale == softmax_scale
+
+            decode_output = decode_meta.decode_wrapper.run(
                 decode_query,
                 kv_cache,
-                sm_scale=softmax_scale,
-                logits_soft_cap=logits_soft_cap,
                 k_scale=layer._k_scale_float,
                 v_scale=layer._v_scale_float,
-                window_left=window_left)
+            )
 
         if prefill_output is None and decode_output is not None:
             # Decode only batch.
diff --git a/vllm/config.py b/vllm/config.py
index 7a58d64bcc6e2..dc1d611115489 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -310,14 +310,15 @@ def __init__(
             (self.hf_text_config.model_type in ["gemma2", "cohere2"]))
 
         if (not self.disable_sliding_window and has_interleaved_attention):
-            if envs.VLLM_ATTENTION_BACKEND == "XFORMERS":
+            if (backend :=
+                    envs.VLLM_ATTENTION_BACKEND) in ("XFORMERS", "FLASHINFER"):
                 sliding_window_len_min = get_min_sliding_window(
                     self.hf_text_config.sliding_window)
 
                 logger.warning_once(
                     f"{self.hf_text_config.model_type} has interleaved "
                     "attention, which is currently not supported by the "
-                    "XFORMERS backend. Disabling sliding window and capping "
+                    f"{backend} backend. Disabling sliding window and capping "
                     "the max length to the sliding window size "
                     f"({sliding_window_len_min}).")
                 self.disable_sliding_window = True
@@ -3310,7 +3311,7 @@ def __str__(self):
 
 
 @contextmanager
-def set_current_vllm_config(vllm_config: VllmConfig):
+def set_current_vllm_config(vllm_config: VllmConfig, check_compile=False):
     """
     Temporarily set the current VLLM config.
     Used during model initialization.
@@ -3330,7 +3331,8 @@ def set_current_vllm_config(vllm_config: VllmConfig):
                      vllm_config.compilation_config.enabled_custom_ops)
         logger.debug("disabled custom ops: %s",
                      vllm_config.compilation_config.disabled_custom_ops)
-        if vllm_config.compilation_config.level == CompilationLevel.PIECEWISE \
+        if check_compile and \
+            vllm_config.compilation_config.level == CompilationLevel.PIECEWISE \
             and compilation_counter.num_models_seen == num_models_seen:
             # If the model supports compilation,
             # compilation_counter.num_models_seen should be increased
diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index e9779878710ee..527b4307f3670 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -114,7 +114,7 @@ def _initialize_model(
     all_params = [param.name for param in signatures.parameters.values()]
     if "vllm_config" in all_params and "prefix" in all_params:
         # new-style model class
-        with set_current_vllm_config(vllm_config):
+        with set_current_vllm_config(vllm_config, check_compile=True):
             return model_class(vllm_config=vllm_config, prefix=prefix)
 
     msg = ("vLLM model class should accept `vllm_config` and `prefix` as "
@@ -142,7 +142,7 @@ def _initialize_model(
         kwargs["lora_config"] = vllm_config.lora_config
     if "scheduler_config" in all_params:
         kwargs["scheduler_config"] = vllm_config.scheduler_config
-    with set_current_vllm_config(vllm_config):
+    with set_current_vllm_config(vllm_config, check_compile=True):
         return model_class(**kwargs)
 
 
diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py
index 5b4757072353f..e359aef9dcb7f 100644
--- a/vllm/model_executor/model_loader/tensorizer.py
+++ b/vllm/model_executor/model_loader/tensorizer.py
@@ -288,7 +288,8 @@ def _init_model(self):
         model_args.torch_dtype = self.tensorizer_config.dtype
         assert self.tensorizer_config.model_class is not None
         # TODO: Do we need to consider old-style model class?
-        with no_init_or_tensor(), set_current_vllm_config(self.vllm_config):
+        with no_init_or_tensor(), set_current_vllm_config(self.vllm_config,
+                                                          check_compile=True):
             return self.tensorizer_config.model_class(
                 vllm_config=self.vllm_config, )
 
diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py
index c6e6693c54f57..6eeb4aa17051f 100644
--- a/vllm/worker/worker_base.py
+++ b/vllm/worker/worker_base.py
@@ -8,7 +8,8 @@
 import torch
 import torch.nn as nn
 
-from vllm.config import ObservabilityConfig, VllmConfig
+from vllm.config import (ObservabilityConfig, VllmConfig,
+                         set_current_vllm_config)
 from vllm.distributed import broadcast_tensor_dict, get_pp_group, get_tp_group
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
@@ -498,8 +499,11 @@ def __init__(
         group.
         """
         self.rpc_rank = rpc_rank
-        self.vllm_config = vllm_config
         self.worker: Optional[WorkerBase] = None
+        # do not store this `vllm_config`, `init_worker` will set the final
+        # one. TODO: investigate if we can remove this field in
+        # `WorkerWrapperBase`, `init_cached_hf_modules` should be
+        # unnecessary now.
         if vllm_config.model_config is not None:
             # it can be None in tests
             trust_remote_code = vllm_config.model_config.trust_remote_code
@@ -533,6 +537,9 @@ def init_worker(self, all_kwargs: List[Dict[str, Any]]) -> None:
         Arguments are passed to the worker class constructor.
         """
         kwargs = all_kwargs[self.rpc_rank]
+        self.vllm_config = kwargs.get("vllm_config", None)
+        assert self.vllm_config is not None, (
+            "vllm_config is required to initialize the worker")
         enable_trace_function_call_for_thread(self.vllm_config)
 
         from vllm.plugins import load_general_plugins
@@ -546,8 +553,10 @@ def init_worker(self, all_kwargs: List[Dict[str, Any]]) -> None:
                               bytes)
             worker_class = cloudpickle.loads(
                 self.vllm_config.parallel_config.worker_cls)
-        self.worker = worker_class(**kwargs)
-        assert self.worker is not None
+        with set_current_vllm_config(self.vllm_config):
+            # To make vLLM config available during worker initialization
+            self.worker = worker_class(**kwargs)
+            assert self.worker is not None
 
     def execute_method(self, method: Union[str, bytes], *args, **kwargs):
         try:

From 6116ca8cd79b642c64f4ae6f050a6bc12b96d037 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= <nicolo.lucchesi@gmail.com>
Date: Mon, 27 Jan 2025 22:38:35 +0100
Subject: [PATCH 400/462] [Feature] [Spec decode]: Enable MLPSpeculator/Medusa
 and `prompt_logprobs` with ChunkedPrefill (#10132)

Signed-off-by: NickLucche <nlucches@redhat.com>
Signed-off-by: wallashss <wallashss@ibm.com>
Co-authored-by: wallashss <wallashss@ibm.com>
---
 tests/spec_decode/e2e/conftest.py             |  19 +-
 .../e2e/test_integration_dist_tp2.py          |  10 +-
 tests/spec_decode/e2e/test_logprobs.py        |  16 +-
 .../e2e/test_medusa_correctness.py            |  31 ++-
 tests/spec_decode/e2e/test_mlp_correctness.py |  53 ++++-
 .../e2e/test_multistep_correctness.py         |  31 +--
 .../spec_decode/e2e/test_ngram_correctness.py |  13 +-
 tests/spec_decode/test_scorer.py              |   1 +
 tests/spec_decode/test_spec_decode_worker.py  |   1 +
 tests/spec_decode/utils.py                    |  12 +
 vllm/config.py                                |   9 +-
 vllm/engine/llm_engine.py                     |  19 +-
 vllm/spec_decode/batch_expansion.py           | 133 +++++++----
 vllm/spec_decode/interfaces.py                |   8 +-
 vllm/spec_decode/mqa_scorer.py                |  68 +++++-
 vllm/spec_decode/spec_decode_worker.py        | 211 +++++++++++++-----
 16 files changed, 469 insertions(+), 166 deletions(-)

diff --git a/tests/spec_decode/e2e/conftest.py b/tests/spec_decode/e2e/conftest.py
index b9cb3858c0068..5cb982a0811c7 100644
--- a/tests/spec_decode/e2e/conftest.py
+++ b/tests/spec_decode/e2e/conftest.py
@@ -2,6 +2,7 @@
 from typing import List, Optional, Sequence, Tuple, Union
 
 import pytest
+import torch
 
 from vllm import LLM, SamplingParams
 from vllm.distributed import cleanup_dist_env_and_memory
@@ -154,6 +155,8 @@ def _check_logprobs_when_output_disabled(
          spec_pos_logprob) = next(iter(spec_pos_logprobs.items()))
         assert spec_pos_logprob.rank == -1
         assert spec_pos_logprob.logprob == 0.0
+        if isinstance(spec_pos_logprob_token_id, torch.Tensor):
+            spec_pos_logprob_token_id = spec_pos_logprob_token_id.item()
         assert spec_pos_logprob_token_id in baseline_pos_logprobs
 
 
@@ -244,7 +247,8 @@ def run_equality_correctness_test_tp(model,
                                      batch_size: int,
                                      max_output_len: int,
                                      seed: int = 0,
-                                     temperature: float = 0.0):
+                                     temperature: float = 0.0,
+                                     logprobs: Optional[int] = None):
     """Helper method that compares the outputs of both the baseline LLM and
     the test LLM. It asserts greedy equality, e.g. that the outputs are exactly
     the same when temperature is zero.
@@ -257,7 +261,6 @@ def run_equality_correctness_test_tp(model,
     results = []
 
     prompts = [prompt for prompt, _ in zip(cycle(PROMPTS), range(batch_size))]
-
     for args, env in ((arg1, env1), (arg2, env2)):
         with RemoteOpenAIServer(model,
                                 args,
@@ -269,12 +272,14 @@ def run_equality_correctness_test_tp(model,
                                                    prompt=prompts,
                                                    max_tokens=max_output_len,
                                                    seed=seed,
-                                                   temperature=temperature)
+                                                   temperature=temperature,
+                                                   logprobs=logprobs)
 
             results.append({
                 "test":
                 "seeded_sampling",
                 "text": [choice.text for choice in completion.choices],
+                "logprobs": [choice.logprobs for choice in completion.choices],
                 "finish_reason":
                 [choice.finish_reason for choice in completion.choices],
                 "usage":
@@ -284,7 +289,15 @@ def run_equality_correctness_test_tp(model,
     n = len(results) // 2
     arg1_results = results[:n]
     arg2_results = results[n:]
+    # Separate logprobs to avoid asserting exact equality.
+    arg1_logprobs = [r.pop("logprobs") for r in arg1_results]
+    arg2_logprobs = [r.pop("logprobs") for r in arg2_results]
+
     for arg1_result, arg2_result in zip(arg1_results, arg2_results):
         assert arg1_result == arg2_result, (
             f"Results for {model=} are not the same with {arg1=} and {arg2=}. "
             f"{arg1_result=} != {arg2_result=}")
+    if logprobs:
+        for logs1, logs2 in zip(arg1_logprobs, arg2_logprobs):
+            for l1, l2 in zip(logs1, logs2):
+                assert l1.tokens == l2.tokens
diff --git a/tests/spec_decode/e2e/test_integration_dist_tp2.py b/tests/spec_decode/e2e/test_integration_dist_tp2.py
index 02cba92795142..7001ee4c007fe 100644
--- a/tests/spec_decode/e2e/test_integration_dist_tp2.py
+++ b/tests/spec_decode/e2e/test_integration_dist_tp2.py
@@ -2,6 +2,8 @@
 tensor parallelism.
 """
 
+from typing import Optional
+
 import pytest
 import torch
 
@@ -154,15 +156,20 @@ def test_draft_model_tp_lt_target_model_tp2(model, common_llm_kwargs,
                               "--speculative-draft-tensor-parallel-size",
                               "1",
                           ])])
+@pytest.mark.parametrize("logprobs", [None, 2])
 @pytest.mark.parametrize("batch_size", [2])
 @pytest.mark.parametrize("seed", [1])
 def test_spec_decode_chunked_prefill_tp2(model, common_llm_kwargs,
                                          per_test_common_llm_kwargs,
                                          baseline_llm_kwargs, test_llm_kwargs,
+                                         logprobs: Optional[int],
                                          batch_size: int, seed: int):
     """Verify spec decode works well with same and different TP size for
     the draft model with chunked prefill.
     """
+    if logprobs:
+        test_llm_kwargs.extend(
+            ["--disable_logprobs_during_spec_decoding", "False"])
     run_equality_correctness_test_tp(model,
                                      common_llm_kwargs,
                                      per_test_common_llm_kwargs,
@@ -171,4 +178,5 @@ def test_spec_decode_chunked_prefill_tp2(model, common_llm_kwargs,
                                      batch_size,
                                      max_output_len=32,
                                      seed=seed,
-                                     temperature=0.0)
+                                     temperature=0.0,
+                                     logprobs=logprobs)
diff --git a/tests/spec_decode/e2e/test_logprobs.py b/tests/spec_decode/e2e/test_logprobs.py
index 4cfca8b78e79b..1a543606cb3f3 100644
--- a/tests/spec_decode/e2e/test_logprobs.py
+++ b/tests/spec_decode/e2e/test_logprobs.py
@@ -4,26 +4,27 @@
 
 from vllm import SamplingParams
 
+from ..utils import maybe_enable_chunked_prefill
 from .conftest import run_equality_correctness_test
 
 
 @pytest.mark.parametrize(
     "common_llm_kwargs",
     [{
-        "model_name": "JackFram/llama-68m",
+        "model_name": "JackFram/llama-160m",
 
         # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
+        "enforce_eager": True
     }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
 @pytest.mark.parametrize("test_llm_kwargs",
                          [{
-                             "speculative_model": "JackFram/llama-160m",
+                             "speculative_model": "JackFram/llama-68m",
                              "num_speculative_tokens": 3,
                              "disable_logprobs_during_spec_decoding": False,
                          }, {
-                             "speculative_model": "JackFram/llama-160m",
+                             "speculative_model": "JackFram/llama-68m",
                              "num_speculative_tokens": 3,
                              "disable_logprobs_during_spec_decoding": True,
                          }])
@@ -36,12 +37,15 @@
     ])
 @pytest.mark.parametrize("seed", [1])
 @pytest.mark.parametrize("logprobs", [1, 6])
+@pytest.mark.parametrize("prefill_chunk_size", [-1, 4, 12])
 def test_logprobs_equality(vllm_runner, common_llm_kwargs,
                            per_test_common_llm_kwargs, baseline_llm_kwargs,
                            test_llm_kwargs, batch_size: int, output_len: int,
-                           seed: int, logprobs: int):
-    """Verify output logprobs are equal with and without speculative decoding.
+                           seed: int, logprobs: int, prefill_chunk_size: int):
+    """Verify output logprobs are equal with and without speculative decoding,
+        as well as with and without chunked prefill.
     """
+    maybe_enable_chunked_prefill(prefill_chunk_size, common_llm_kwargs)
     run_equality_correctness_test(vllm_runner,
                                   common_llm_kwargs,
                                   per_test_common_llm_kwargs,
diff --git a/tests/spec_decode/e2e/test_medusa_correctness.py b/tests/spec_decode/e2e/test_medusa_correctness.py
index b8965606b3d0e..dbcbc0db10881 100644
--- a/tests/spec_decode/e2e/test_medusa_correctness.py
+++ b/tests/spec_decode/e2e/test_medusa_correctness.py
@@ -21,6 +21,7 @@
 
 import pytest
 
+from ..utils import maybe_enable_chunked_prefill
 from .conftest import run_equality_correctness_test
 
 # main model
@@ -67,12 +68,14 @@
 ])
 @pytest.mark.parametrize("batch_size", [1, 32])
 @pytest.mark.parametrize("seed", [1])
+@pytest.mark.parametrize("prefill_chunk_size", [-1, 32])
 def test_medusa_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
                                        per_test_common_llm_kwargs,
                                        baseline_llm_kwargs, test_llm_kwargs,
                                        batch_size: int, output_len: int,
-                                       seed: int):
+                                       seed: int, prefill_chunk_size: int):
     """Verify greedy equality with different batch size."""
+    maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
     run_equality_correctness_test(vllm_runner,
                                   common_llm_kwargs,
                                   per_test_common_llm_kwargs,
@@ -119,12 +122,15 @@ def test_medusa_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
 @pytest.mark.parametrize("batch_size", [8])
 @pytest.mark.parametrize("seed", [1])
 @pytest.mark.parametrize("logprobs", [1, 6])
+@pytest.mark.parametrize("prefill_chunk_size", [-1, 32])
 def test_medusa_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
                                     per_test_common_llm_kwargs,
                                     baseline_llm_kwargs, test_llm_kwargs,
                                     batch_size: int, output_len: int,
-                                    seed: int, logprobs: int):
+                                    seed: int, logprobs: int,
+                                    prefill_chunk_size: int):
     """Verify greedy equality with different batch size."""
+    maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
     run_equality_correctness_test(vllm_runner,
                                   common_llm_kwargs,
                                   per_test_common_llm_kwargs,
@@ -167,12 +173,14 @@ def test_medusa_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
 ])
 @pytest.mark.parametrize("batch_size", [1, 32])
 @pytest.mark.parametrize("seed", [1])
+@pytest.mark.parametrize("prefill_chunk_size", [-1, 32])
 def test_medusa_e2e_greedy_correctness_cuda_graph(
         vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
         baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
-        seed: int):
+        seed: int, prefill_chunk_size: int):
     """Verify greedy equality with cuda graph enabled and different 
     batch sizes."""
+    maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
     run_equality_correctness_test(vllm_runner,
                                   common_llm_kwargs,
                                   per_test_common_llm_kwargs,
@@ -217,13 +225,15 @@ def test_medusa_e2e_greedy_correctness_cuda_graph(
     ])
 @pytest.mark.parametrize("batch_size", [4])
 @pytest.mark.parametrize("seed", [1])
+@pytest.mark.parametrize("prefill_chunk_size", [-1, 32])
 def test_medusa_e2e_greedy_correctness_with_preemption(
         vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
         baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
-        seed: int):
+        seed: int, prefill_chunk_size: int):
     """Verify greedy equality, even when some sequences are preempted mid-
     generation.
     """
+    maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
     run_equality_correctness_test(vllm_runner,
                                   common_llm_kwargs,
                                   per_test_common_llm_kwargs,
@@ -267,13 +277,15 @@ def test_medusa_e2e_greedy_correctness_with_preemption(
         32,
     ])
 @pytest.mark.parametrize("seed", [1])
+@pytest.mark.parametrize("prefill_chunk_size", [-1, 32])
 def test_medusa_different_k(vllm_runner, common_llm_kwargs,
                             per_test_common_llm_kwargs, baseline_llm_kwargs,
                             test_llm_kwargs, batch_size: int, output_len: int,
-                            seed: int):
+                            seed: int, prefill_chunk_size: int):
     """Verify that medusa speculative decoding produces exact equality
     to without spec decode with different values of num_speculative_tokens.
     """
+    maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
     run_equality_correctness_test(vllm_runner,
                                   common_llm_kwargs,
                                   per_test_common_llm_kwargs,
@@ -313,14 +325,17 @@ def test_medusa_different_k(vllm_runner, common_llm_kwargs,
         32,
     ])
 @pytest.mark.parametrize("seed", [1])
+@pytest.mark.parametrize("prefill_chunk_size", [-1, 32])
 def test_medusa_disable_queue(vllm_runner, common_llm_kwargs,
                               per_test_common_llm_kwargs, baseline_llm_kwargs,
                               test_llm_kwargs, batch_size: int,
-                              output_len: int, seed: int):
+                              output_len: int, seed: int,
+                              prefill_chunk_size: int):
     """Verify that medusa speculative decoding produces exact equality
     to without spec decode when speculation is disabled for large
     batch sizes.
     """
+    maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
     run_equality_correctness_test(vllm_runner,
                                   common_llm_kwargs,
                                   per_test_common_llm_kwargs,
@@ -361,12 +376,14 @@ def test_medusa_disable_queue(vllm_runner, common_llm_kwargs,
         32,
     ])
 @pytest.mark.parametrize("seed", [1])
+@pytest.mark.parametrize("prefill_chunk_size", [-1, 32])
 def test_mqa_scorer(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
                     baseline_llm_kwargs, test_llm_kwargs, batch_size: int,
-                    output_len: int, seed: int):
+                    output_len: int, seed: int, prefill_chunk_size: int):
     """Verify that speculative decoding generates the same output 
     with batch expansion scorer and mqa scorer.
     """
+    maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
     run_equality_correctness_test(vllm_runner,
                                   common_llm_kwargs,
                                   per_test_common_llm_kwargs,
diff --git a/tests/spec_decode/e2e/test_mlp_correctness.py b/tests/spec_decode/e2e/test_mlp_correctness.py
index 183ff2f5db274..1fa1104f5d3a8 100644
--- a/tests/spec_decode/e2e/test_mlp_correctness.py
+++ b/tests/spec_decode/e2e/test_mlp_correctness.py
@@ -25,6 +25,7 @@
 
 from vllm.model_executor.layers.vocab_parallel_embedding import pad_vocab_size
 
+from ..utils import maybe_enable_chunked_prefill
 from .conftest import run_equality_correctness_test
 
 # main model
@@ -66,14 +67,16 @@
 @pytest.mark.parametrize("output_len", [
     128,
 ])
-@pytest.mark.parametrize("batch_size", [1, 32])
+@pytest.mark.parametrize("batch_size", [4, 32])
 @pytest.mark.parametrize("seed", [1])
+@pytest.mark.parametrize("prefill_chunk_size", [-1, 32])
 def test_mlp_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
                                     per_test_common_llm_kwargs,
                                     baseline_llm_kwargs, test_llm_kwargs,
                                     batch_size: int, output_len: int,
-                                    seed: int):
+                                    seed: int, prefill_chunk_size: int):
     """Verify greedy equality with different batch size."""
+    maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
     run_equality_correctness_test(vllm_runner,
                                   common_llm_kwargs,
                                   per_test_common_llm_kwargs,
@@ -116,12 +119,19 @@ def test_mlp_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
 @pytest.mark.parametrize("batch_size", [8])
 @pytest.mark.parametrize("seed", [1])
 @pytest.mark.parametrize("logprobs", [1, 6])
+@pytest.mark.parametrize("prefill_chunk_size", [-1, 4])
 def test_mlp_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
                                  per_test_common_llm_kwargs,
                                  baseline_llm_kwargs, test_llm_kwargs,
                                  batch_size: int, output_len: int, seed: int,
-                                 logprobs: int):
+                                 logprobs: int, prefill_chunk_size: int):
     """Verify greedy equality with different batch size."""
+    maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
+    # NOTE Test is sensitive enough st if we don't enable chunked prefill
+    # scheduling on baseline too, we get slightly different logprobs, ending
+    # up sampling different tokens at the tail (ie top tokens don't change).
+    # TL;DR: sd+cp == org+cp but sd+cp != org..is this expected?
+    maybe_enable_chunked_prefill(prefill_chunk_size, baseline_llm_kwargs)
     run_equality_correctness_test(vllm_runner,
                                   common_llm_kwargs,
                                   per_test_common_llm_kwargs,
@@ -162,12 +172,15 @@ def test_mlp_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
 @pytest.mark.parametrize("output_len", [2048])
 @pytest.mark.parametrize("batch_size", [1, 32])
 @pytest.mark.parametrize("seed", [1])
+@pytest.mark.parametrize("prefill_chunk_size", [-1, 4])
 def test_mlp_e2e_acceptance_rate(vllm_runner, common_llm_kwargs,
                                  per_test_common_llm_kwargs,
                                  baseline_llm_kwargs, test_llm_kwargs,
-                                 batch_size: int, output_len: int, seed: int):
+                                 batch_size: int, output_len: int,
+                                 prefill_chunk_size: int, seed: int):
     """Verify acceptance rate with different batch size and large output 
     length."""
+    maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
     run_equality_correctness_test(vllm_runner,
                                   common_llm_kwargs,
                                   per_test_common_llm_kwargs,
@@ -204,13 +217,17 @@ def test_mlp_e2e_acceptance_rate(vllm_runner, common_llm_kwargs,
 @pytest.mark.parametrize("output_len", [64])
 @pytest.mark.parametrize("batch_size", [1, 32])
 @pytest.mark.parametrize("temperature", [1.0])
+@pytest.mark.parametrize("prefill_chunk_size", [-1, 4])
 @pytest.mark.parametrize("seed", [1])
 def test_mlp_e2e_seeded_correctness(vllm_runner, common_llm_kwargs,
                                     per_test_common_llm_kwargs,
                                     baseline_llm_kwargs, test_llm_kwargs,
                                     batch_size: int, output_len: int,
-                                    temperature: float, seed: int):
+                                    temperature: float,
+                                    prefill_chunk_size: int, seed: int):
     """Verify seeded runs produce the same output."""
+    maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
+    maybe_enable_chunked_prefill(prefill_chunk_size, baseline_llm_kwargs)
     run_equality_correctness_test(vllm_runner,
                                   common_llm_kwargs,
                                   per_test_common_llm_kwargs,
@@ -266,14 +283,16 @@ def test_mlp_e2e_seeded_correctness(vllm_runner, common_llm_kwargs,
         128,
     ])
 @pytest.mark.parametrize("batch_size", [4])
+@pytest.mark.parametrize("prefill_chunk_size", [-1, 4])
 @pytest.mark.parametrize("seed", [1])
 def test_mlp_e2e_greedy_correctness_with_preemption(
         vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
         baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
-        seed: int):
+        prefill_chunk_size: int, seed: int):
     """Verify greedy equality, even when some sequences are preempted mid-
     generation.
     """
+    maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
     run_equality_correctness_test(vllm_runner,
                                   common_llm_kwargs,
                                   per_test_common_llm_kwargs,
@@ -317,12 +336,14 @@ def test_mlp_e2e_greedy_correctness_with_preemption(
     ])
 @pytest.mark.parametrize("batch_size", [4])
 @pytest.mark.parametrize("seed", [1])
+@pytest.mark.parametrize("prefill_chunk_size", [-1, 4])
 def test_mlp_e2e_greedy_correctness_with_padding(
         vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
         baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
-        seed: int):
+        prefill_chunk_size: int, seed: int):
     """Verify greedy equality when the vocab dimension is padded
     """
+    maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
 
     # Default pad_to is 64, test model has vocab_size of 32000
     def patched_pad_vocab_size(vocab_size, pad_to=None):
@@ -373,14 +394,16 @@ def patched_pad_vocab_size(vocab_size, pad_to=None):
         # Use smaller output len for fast test.
         32,
     ])
+@pytest.mark.parametrize("prefill_chunk_size", [-1, 4])
 @pytest.mark.parametrize("seed", [1])
 def test_mlp_different_k(vllm_runner, common_llm_kwargs,
                          per_test_common_llm_kwargs, baseline_llm_kwargs,
-                         test_llm_kwargs, batch_size: int, seed: int,
-                         output_len: int):
+                         test_llm_kwargs, batch_size: int,
+                         prefill_chunk_size: int, seed: int, output_len: int):
     """Verify that mlp speculative decoding produces exact equality
     to without spec decode with different values of num_speculative_tokens.
     """
+    maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
     run_equality_correctness_test(vllm_runner,
                                   common_llm_kwargs,
                                   per_test_common_llm_kwargs,
@@ -418,15 +441,21 @@ def test_mlp_different_k(vllm_runner, common_llm_kwargs,
         # Use smaller output len for fast test.
         32,
     ])
+# Speculative decoding is disabled when sequences reach decoding and the batch
+# consists of single-token requests. Hence we set `max_num_seqs`
+# >= `speculative_disable_by_batch_size` to test feature interaction.
+@pytest.mark.parametrize("prefill_chunk_size", [-1, 4])
 @pytest.mark.parametrize("seed", [1])
 def test_mlp_disable_queue(vllm_runner, common_llm_kwargs,
                            per_test_common_llm_kwargs, baseline_llm_kwargs,
-                           test_llm_kwargs, batch_size: int, seed: int,
+                           test_llm_kwargs, batch_size: int,
+                           prefill_chunk_size: int, seed: int,
                            output_len: int):
     """Verify that mlp speculative decoding produces exact equality
     to without spec decode when speculation is disabled for large
     batch sizes.
     """
+    maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
     run_equality_correctness_test(vllm_runner,
                                   common_llm_kwargs,
                                   per_test_common_llm_kwargs,
@@ -460,13 +489,15 @@ def test_mlp_disable_queue(vllm_runner, common_llm_kwargs,
         # Use smaller output len for fast test.
         32,
     ])
+@pytest.mark.parametrize("prefill_chunk_size", [-1, 4])
 @pytest.mark.parametrize("seed", [1])
 def test_mqa_scorer(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
                     baseline_llm_kwargs, test_llm_kwargs, batch_size: int,
-                    output_len: int, seed: int):
+                    output_len: int, prefill_chunk_size: int, seed: int):
     """Verify that speculative decoding generates the same output 
     with batch expansion scorer and mqa scorer.
     """
+    maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
     run_equality_correctness_test(vllm_runner,
                                   common_llm_kwargs,
                                   per_test_common_llm_kwargs,
diff --git a/tests/spec_decode/e2e/test_multistep_correctness.py b/tests/spec_decode/e2e/test_multistep_correctness.py
index a13cca41f99e5..05ad468dd8bc5 100644
--- a/tests/spec_decode/e2e/test_multistep_correctness.py
+++ b/tests/spec_decode/e2e/test_multistep_correctness.py
@@ -147,20 +147,20 @@ def test_spec_decode_e2e_with_detokenization(test_llm_generator,
         },
     ])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [
-    {
-        "speculative_model": "JackFram/llama-68m",
-        "num_speculative_tokens": 5,
-        "enable_chunked_prefill": False,
-    },
-    {
-        "speculative_model": "JackFram/llama-68m",
-        "num_speculative_tokens": 5,
-        "enable_chunked_prefill": True,
-        "max_num_batched_tokens": 4,
-        "max_num_seqs": 4,
-    },
-])
+@pytest.mark.parametrize("test_llm_kwargs",
+                         [{
+                             "speculative_model": "JackFram/llama-68m",
+                             "num_speculative_tokens": 5,
+                             "enable_chunked_prefill": False,
+                             "disable_logprobs_during_spec_decoding": False
+                         }, {
+                             "speculative_model": "JackFram/llama-68m",
+                             "num_speculative_tokens": 3,
+                             "enable_chunked_prefill": True,
+                             "max_num_batched_tokens": 4,
+                             "max_num_seqs": 4,
+                             "disable_logprobs_during_spec_decoding": False
+                         }])
 @pytest.mark.parametrize(
     "output_len",
     [
@@ -192,6 +192,9 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1(
                                   batch_size,
                                   max_output_len=output_len,
                                   seed=seed,
+                                  prompt_logprobs=2,
+                                  logprobs=2,
+                                  disable_logprobs=False,
                                   temperature=0.0,
                                   ensure_all_accepted=ensure_all_accepted)
 
diff --git a/tests/spec_decode/e2e/test_ngram_correctness.py b/tests/spec_decode/e2e/test_ngram_correctness.py
index e53d169a8fcc3..77f8b8998c8d3 100644
--- a/tests/spec_decode/e2e/test_ngram_correctness.py
+++ b/tests/spec_decode/e2e/test_ngram_correctness.py
@@ -26,6 +26,7 @@
 
 import pytest
 
+from ..utils import maybe_enable_chunked_prefill
 from .conftest import run_equality_correctness_test
 
 
@@ -49,11 +50,13 @@
         "speculative_model": "[ngram]",
         "num_speculative_tokens": 5,
         "ngram_prompt_lookup_max": 3,
+        "speculative_disable_mqa_scorer": False,
     },
     {
         "speculative_model": "[ngram]",
         "num_speculative_tokens": 5,
         "ngram_prompt_lookup_max": 3,
+        "speculative_disable_mqa_scorer": True,
     },
 ])
 @pytest.mark.parametrize("output_len", [
@@ -68,15 +71,7 @@ def test_ngram_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
                                       batch_size: int, output_len: int,
                                       prefill_chunk_size: int, seed: int):
     """Verify greedy equality on a tiny model with different batch size."""
-    if prefill_chunk_size > 0:
-        common_llm_kwargs.update(
-            **{
-                "enable_chunked_prefill": True,
-                "max_num_batched_tokens": prefill_chunk_size,
-                "max_num_seqs": prefill_chunk_size
-            })
-    else:
-        common_llm_kwargs["enable_chunked_prefill"] = False
+    maybe_enable_chunked_prefill(prefill_chunk_size, common_llm_kwargs)
     run_equality_correctness_test(vllm_runner,
                                   common_llm_kwargs,
                                   per_test_common_llm_kwargs,
diff --git a/tests/spec_decode/test_scorer.py b/tests/spec_decode/test_scorer.py
index 0b1509d8b7785..5a093dea16d40 100644
--- a/tests/spec_decode/test_scorer.py
+++ b/tests/spec_decode/test_scorer.py
@@ -60,6 +60,7 @@ def test_scorer(model_name: str, batch_size: int, max_propose_len: int,
     num_gpu_blocks = 2048 // block_size
     scorer_worker = create_worker(Worker, model_name, block_size,
                                   num_gpu_blocks, seed)
+    scorer_worker.model_runner.disable_logprobs = True  # accessed by mqa_scorer
     scorer_worker.model_runner.model.sampler.include_gpu_probs_tensor = True
     scorer_worker.model_runner.model.sampler.\
         should_modify_greedy_probs_inplace = True
diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py
index caf7a7e625b46..d8c3af4c1cd1e 100644
--- a/tests/spec_decode/test_spec_decode_worker.py
+++ b/tests/spec_decode/test_spec_decode_worker.py
@@ -754,6 +754,7 @@ def test_populate_seq_ids_with_bonus_tokens():
         seq_group_metadata_list=seq_group_metadata_list,
         accepted_token_ids=accepted_token_ids,
         target_logprobs=target_token_logprobs,
+        prompt_logprobs=None,
         k=k,
         stage_times=(0, 0, 0))
     # Verify that _seq_with_bonus_token_in_last_step contains the following:
diff --git a/tests/spec_decode/utils.py b/tests/spec_decode/utils.py
index a4bfa6b2f384b..2f883c2ff9b7a 100644
--- a/tests/spec_decode/utils.py
+++ b/tests/spec_decode/utils.py
@@ -274,3 +274,15 @@ def create_batch(batch_size,
             prompts, num_gpu_blocks, block_size, final_prompt_lens,
             prev_output_tokens, seq_ids)
     return seq_group_metadata_list, prompts, prev_output_tokens
+
+
+def maybe_enable_chunked_prefill(prefill_chunk_size, llm_kwargs):
+    if prefill_chunk_size > 0:
+        llm_kwargs.update(
+            **{
+                "enable_chunked_prefill": True,
+                "max_num_batched_tokens": prefill_chunk_size,
+                "max_num_seqs": prefill_chunk_size
+            })
+    else:
+        llm_kwargs["enable_chunked_prefill"] = False
diff --git a/vllm/config.py b/vllm/config.py
index dc1d611115489..7ab632d7e3667 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1685,7 +1685,8 @@ def maybe_create_spec_config(
             raise ValueError("Expect the batch size threshold of disabling "
                              "speculative decoding is > 1, but got "
                              f"{speculative_disable_by_batch_size=}")
-
+        if (enable_chunked_prefill and speculative_model == "eagle"):
+            raise ValueError("Chunked prefill and EAGLE are not compatible.")
         # TODO: The user should be able to specify revision/max model len
         # for the draft model. It is not currently supported.
         draft_revision = None
@@ -1752,12 +1753,6 @@ def maybe_create_spec_config(
                         f"num_speculative_tokens={n_predict}, but "
                         f"{num_speculative_tokens=} was provided.")
 
-            if enable_chunked_prefill and draft_hf_config.model_type in (
-                    "medusa", "mlp_speculator", "eagle"):
-                raise ValueError(
-                    "Chunked prefill and hidden-state based draft models are "
-                    "not compatible.")
-
             speculative_draft_tensor_parallel_size = \
                 SpeculativeConfig._verify_and_get_draft_model_tensor_parallel_size(
                     target_parallel_config,
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 7da18d5f7d2eb..ab67ae29723cd 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -1010,8 +1010,23 @@ def _process_model_outputs(self,
                      self.speculative_config
             # Organize outputs by [step][sequence group] instead of
             # [sequence group][step].
-            outputs_by_sequence_group = create_output_by_sequence_group(
-                outputs, num_seq_groups=len(seq_group_metadata_list))
+            if self.scheduler_config.is_multi_step:
+                outputs_by_sequence_group = create_output_by_sequence_group(
+                    outputs, len(seq_group_metadata_list))
+            elif self.speculative_config:
+                # Decodes are multi-steps while prefills are not, outputting at
+                # most 1 token. Separate them so that we can trigger chunk
+                # processing without having to pad or copy over prompts K times
+                # to match decodes structure (costly with prompt_logprobs).
+                num_prefills = sum(sg.is_prompt
+                                   for sg in seq_group_metadata_list)
+                prefills, decodes = outputs[:num_prefills], outputs[
+                    num_prefills:]
+                outputs_by_sequence_group = create_output_by_sequence_group(
+                    decodes,
+                    num_seq_groups=len(seq_group_metadata_list) - num_prefills)
+                outputs_by_sequence_group = [p.outputs for p in prefills
+                                             ] + outputs_by_sequence_group
             # We have outputs for multiple steps submitted in a single burst,
             # so invalidate is_first_step_output.
             is_first_step_output = None
diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py
index 01b9cdad963da..56fb9ba506a44 100644
--- a/vllm/spec_decode/batch_expansion.py
+++ b/vllm/spec_decode/batch_expansion.py
@@ -83,13 +83,13 @@ def score_proposals(
 
         if not non_spec_indices:
             # All sequence groups in batch have spec decoding enabled
-            contracted = self._contract_batch_all_spec(
+            return self._contract_batch_all_spec(
                 target_sampler_output=target_sampler_output,
                 proposals=proposals,
             )
         else:
             # Batch has a mix of spec decode enabled and disabled seq groups
-            contracted = self._contract_batch(
+            return self._contract_batch(
                 execute_model_req.seq_group_metadata_list,
                 target_sampler_output=target_sampler_output,
                 proposals=proposals,
@@ -99,14 +99,6 @@ def score_proposals(
                 k=execute_model_req.num_lookahead_slots,
             )
 
-        all_tokens, all_probs, spec_logprobs, all_hidden_states = contracted
-        return SpeculativeScores(
-            probs=all_probs,
-            token_ids=all_tokens,
-            logprobs=spec_logprobs,
-            hidden_states=all_hidden_states,
-        )
-
     def _expand_batch(
         self,
         seq_group_metadata_list: List[SequenceGroupMetadata],
@@ -143,13 +135,57 @@ def _expand_batch(
         return (spec_indices, non_spec_indices, target_seq_group_metadata_list,
                 num_scoring_tokens)
 
+    def _contract_non_speculative(
+            self, scores: SpeculativeScores,
+            seq_group_metadata_list: List[SequenceGroupMetadata],
+            non_spec_indices: List[int], non_spec_outputs: SpeculativeScores,
+            has_prompt_log: bool) -> SpeculativeScores:
+        """
+            Augment input `scores` with non-speculative requests outputs. 
+            This includes decode requests with speculation turned off, as well
+            as prefill requests when `enable_chunked_prefill` is set.
+            For the latter, prefills are further separated into terminal and 
+            non-terminal chunks (from which no token is sampled).
+        """
+        if not non_spec_indices:
+            return scores
+
+        if has_prompt_log:
+            # When prompt_logprobs is enabled, prefills yield output token
+            # (and respective prob) in the last entry (prompt|out):
+            # [.|.|.|prefill0_out|.|prefill1_out|decode0_out|..].
+            # With chunked prefill, non-terminal chunks have -1 on each
+            # position: they're still picked, but they're discarded later.
+            seq_meta = seq_group_metadata_list
+            nospec_sizes = torch.tensor([
+                seq_meta[i].token_chunk_size if seq_meta[i].is_prompt else 1
+                for i in non_spec_indices
+            ])
+            nospec_sampled_token_idxs = torch.cumsum(nospec_sizes, 0).add_(-1)
+        else:
+            # In this case only sampled tokens are returned, select all.
+            nospec_sampled_token_idxs = list(
+                range(len(non_spec_outputs.token_ids)))
+
+        scores.token_ids[non_spec_indices, :1] = \
+            non_spec_outputs.token_ids[nospec_sampled_token_idxs].unsqueeze(1)
+        scores.probs[non_spec_indices, :1, :] = \
+            non_spec_outputs.probs[nospec_sampled_token_idxs].unsqueeze(1)
+        scores.logprobs[non_spec_indices, :1, :] = \
+            non_spec_outputs.logprobs[nospec_sampled_token_idxs].unsqueeze(1)
+        if scores.hidden_states is not None:
+            assert non_spec_outputs.hidden_states is not None
+            scores.hidden_states[non_spec_indices, :1, :] = \
+                non_spec_outputs.hidden_states[nospec_sampled_token_idxs].unsqueeze(1)
+        return scores
+
     def _contract_batch(
-        self, contracted_seq_group_metadata_list: List[SequenceGroupMetadata],
-        target_sampler_output: SamplerOutput, proposals: SpeculativeProposals,
-        num_scoring_tokens: int, non_spec_indices: List[int],
-        spec_indices: List[int], k: int
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor,
-               Optional[torch.Tensor]]:
+            self,
+            contracted_seq_group_metadata_list: List[SequenceGroupMetadata],
+            target_sampler_output: SamplerOutput,
+            proposals: SpeculativeProposals, num_scoring_tokens: int,
+            non_spec_indices: List[int], spec_indices: List[int],
+            k: int) -> SpeculativeScores:
         """Contract the expanded batch back into its original size.
         This maps the scores of speculative tokens back to their original
         sequences.
@@ -195,23 +231,28 @@ def _contract_batch(
         else:
             all_hidden_states = None
 
-        # Rule out prefills that produce no tokens.
-        non_spec_indices = [
-            idx for idx in non_spec_indices
-            if contracted_seq_group_metadata_list[idx].do_sample
-        ]
-        if len(non_spec_indices):
-            all_tokens[non_spec_indices, :1] = \
-                non_spec_target_token_ids.unsqueeze(1)
-            all_probs[non_spec_indices, :1, :] = \
-                non_spec_target_probs.unsqueeze(1)
-            all_logprobs[non_spec_indices, :1, :] = \
-                non_spec_target_logprobs.unsqueeze(1)
-            if all_hidden_states is not None:
-                assert non_spec_target_hidden_states is not None
-                all_hidden_states[non_spec_indices, :1, :] = \
-                    non_spec_target_hidden_states.unsqueeze(1)
-
+        has_prompt_log = any((sg.sampling_params.prompt_logprobs
+                              and sg.sampling_params.prompt_logprobs > 0)
+                             for sg in contracted_seq_group_metadata_list)
+        # When prompt logprobs is enabled, lens of returned tensors go from
+        # n_sampled (requests with do_sample=True) to n_prompt+n_prefills.
+        # We adjust stride accordingly to get the generated tokens and
+        # their probs, but pass on prompt_logprobs as is.
+        prompt_logprobs = None
+        if (not self._scorer_worker.model_runner.disable_logprobs\
+            and has_prompt_log):
+            prompt_logprobs = [
+                o.prompt_logprobs for o in target_sampler_output.outputs
+            ]
+        elif not has_prompt_log:
+            # When prompt logprobs are not to be returned,
+            # we can ignore non-terminal chunks (no out token).
+            non_spec_indices = [
+                idx for idx in non_spec_indices
+                if contracted_seq_group_metadata_list[idx].do_sample
+            ]
+
+        # "Contract" speculative.
         if spec_indices:
             all_tokens[spec_indices] = target_token_ids
             all_probs[spec_indices] = target_probs
@@ -219,14 +260,27 @@ def _contract_batch(
             if all_hidden_states is not None:
                 all_hidden_states[spec_indices] = target_hidden_states
 
-        return all_tokens, all_probs, all_logprobs, all_hidden_states
+        spec_scores = SpeculativeScores(probs=all_probs,
+                                        token_ids=all_tokens,
+                                        logprobs=all_logprobs,
+                                        hidden_states=all_hidden_states,
+                                        prompt_logprobs=prompt_logprobs)
+
+        non_spec_outputs = SpeculativeScores(
+            probs=non_spec_target_probs,
+            token_ids=non_spec_target_token_ids,
+            logprobs=non_spec_target_logprobs,
+            hidden_states=non_spec_target_hidden_states)
+        # Contract remaining nonspec entries based on non_spec_indices, if any.
+        return self._contract_non_speculative(
+            spec_scores, contracted_seq_group_metadata_list, non_spec_indices,
+            non_spec_outputs, has_prompt_log)
 
     def _contract_batch_all_spec(
         self,
         target_sampler_output: SamplerOutput,
         proposals: SpeculativeProposals,
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor,
-               Optional[torch.Tensor]]:
+    ) -> SpeculativeScores:
         """Contract the expanded batch back into its original size.
         This maps the scores of speculative tokens back to their original
         sequences.
@@ -250,8 +304,11 @@ def _contract_batch_all_spec(
             target_hidden_states = target_hidden_states.reshape(
                 *target_token_ids.shape, target_hidden_states.shape[-1])
 
-        return (target_token_ids, target_probs, target_logprobs,
-                target_hidden_states)
+        return SpeculativeScores(probs=target_probs,
+                                 token_ids=target_token_ids,
+                                 logprobs=target_logprobs,
+                                 hidden_states=target_hidden_states,
+                                 prompt_logprobs=None)
 
     def _create_scoring_model_input(
         self,
diff --git a/vllm/spec_decode/interfaces.py b/vllm/spec_decode/interfaces.py
index a4fe0f13c8db1..c39e98b6cca12 100644
--- a/vllm/spec_decode/interfaces.py
+++ b/vllm/spec_decode/interfaces.py
@@ -1,10 +1,10 @@
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
-from typing import Optional, Set, Union
+from typing import List, Optional, Set, Union
 
 import torch
 
-from vllm.sequence import ExecuteModelRequest
+from vllm.sequence import ExecuteModelRequest, PromptLogprobs
 from vllm.worker.worker_base import WorkerBase
 
 
@@ -54,6 +54,10 @@ class SpeculativeScores:
     # Optional last hidden states from the scoring model.
     hidden_states: Optional[torch.Tensor] = None
 
+    # Scoring model may also return logprobs for prompt tokens
+    # for each request, when chunked prefill is enabled.
+    prompt_logprobs: Optional[List[PromptLogprobs]] = None
+
     def __repr__(self):
         return (f"SpeculativeScores("
                 f"probs={self.probs.shape}, "
diff --git a/vllm/spec_decode/mqa_scorer.py b/vllm/spec_decode/mqa_scorer.py
index cbf793e2043e3..3aea2eabb4144 100644
--- a/vllm/spec_decode/mqa_scorer.py
+++ b/vllm/spec_decode/mqa_scorer.py
@@ -72,9 +72,15 @@ def score_proposals(
         target_token_ids = target_sampler_output.sampled_token_ids
         target_probs = target_sampler_output.sampled_token_probs
         target_logprobs = target_sampler_output.logprobs
+        prompt_logprobs = None
+
         # If all requests have the same number of query tokens, we can avoid
         # the for loop to build output for better performance.
         if min(all_proposal_lengths) == k:
+            # Regular decodes only.
+            assert all(not sg.is_prompt
+                       for sg in target_seq_group_metadata_list
+                       if sg.is_prompt)
             bs, _ = proposals.proposal_token_ids.shape
             all_tokens = target_token_ids.reshape(bs, k + 1)
             all_probs = target_probs.reshape(bs, k + 1, self._vocab_size)
@@ -88,19 +94,56 @@ def score_proposals(
             all_logprobs = target_logprobs.new_full(size=all_probs.shape,
                                                     fill_value=-float("inf"))
             target_token_ids = target_token_ids.flatten()
-            start_loc = 0
-            for i, (proposed_len, seq_meta) in enumerate(
-                    zip(all_proposal_lengths, target_seq_group_metadata_list)):
+
+            # When prompt logprobs is enabled, lens of returned tensors go from
+            # n_sampled (requests with do_sample=True) to n_prompt+n_prefills.
+            # We adjust stride accordingly to get the generated tokens and
+            # their probs, but pass on prompt_logprobs as is, since it may be
+            # that n_prompts >> K.
+            has_prompt_log = any((sg.sampling_params.prompt_logprobs
+                                  and sg.sampling_params.prompt_logprobs > 0)
+                                 for sg in target_seq_group_metadata_list)
+            # TODO (NickLucche) we should surface `disable_logprobs` as to not
+            # break abstraction to get its value.
+            if (not self._scorer_worker.model_runner.disable_logprobs\
+                and has_prompt_log):
+                prompt_logprobs = [
+                    o.prompt_logprobs for o in target_sampler_output.outputs
+                ]
+
+            # Split loop into prefill|decode for readability.
+            start_loc, i = 0, 0
+            while i < len(target_seq_group_metadata_list
+                          ) and target_seq_group_metadata_list[i].is_prompt:
+                seq_meta = target_seq_group_metadata_list[i]
+                end_loc = start_loc
+                if has_prompt_log:
+                    end_loc += seq_meta.token_chunk_size
+                elif seq_meta.do_sample:
+                    end_loc += 1
+
                 # Skip chunks with no output tokens.
                 if seq_meta.do_sample:
-                    output_len = proposed_len + 1
-                    end_loc = start_loc + output_len
-                    all_tokens[
-                        i, :output_len] = target_token_ids[start_loc:end_loc]
-                    all_probs[i, :output_len] = target_probs[start_loc:end_loc]
-                    all_logprobs[
-                        i, :output_len] = target_logprobs[start_loc:end_loc]
-                    start_loc = end_loc
+                    # Get sampled token (last position in chunk) and its prob.
+                    all_tokens[i, 0] = target_token_ids[end_loc - 1]
+                    all_probs[i, 0] = target_probs[end_loc - 1]
+                    all_logprobs[i, 0] = target_logprobs[end_loc - 1]
+
+                i += 1
+                start_loc = end_loc
+            # Decodes.
+            while i < len(target_seq_group_metadata_list):
+                proposed_len, seq_meta = all_proposal_lengths[
+                    i], target_seq_group_metadata_list[i]
+                output_len = proposed_len + 1
+                end_loc = start_loc + output_len
+                all_tokens[
+                    i, :output_len] = target_token_ids[start_loc:end_loc]
+                all_probs[i, :output_len] = target_probs[start_loc:end_loc]
+                all_logprobs[
+                    i, :output_len] = target_logprobs[start_loc:end_loc]
+                start_loc = end_loc
+                i += 1
 
         hidden_states = None
         if target_sampler_output.hidden_states is not None:
@@ -110,4 +153,5 @@ def score_proposals(
         return SpeculativeScores(probs=all_probs,
                                  token_ids=all_tokens,
                                  logprobs=all_logprobs,
-                                 hidden_states=hidden_states)
+                                 hidden_states=hidden_states,
+                                 prompt_logprobs=prompt_logprobs)
diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py
index 0d66ede3d907a..8e9802c7d333c 100644
--- a/vllm/spec_decode/spec_decode_worker.py
+++ b/vllm/spec_decode/spec_decode_worker.py
@@ -563,50 +563,57 @@ def _serialize_sampler_output_no_logprobs(
             (seq_id, seq_data) for sg in \
             execute_model_req.seq_group_metadata_list \
             for seq_id, seq_data in sg.seq_data.items()
-            if sg.do_sample # ignore empty token sequences
         ]
         completion_seq_group_output_list: List[
             CompletionSequenceGroupOutput] = []
         output_index = 0
         # Make sure the non-terminal prefill chunks are still aligned with
         # their own empty output.
-        for seq_group_meta in execute_model_req.seq_group_metadata_list:
-            # Since we can get chunks here, we dont always have a sampled token
-            # (only on last chunk) but we still have to provide an output.
-            if not seq_group_meta.do_sample:
-                completion_seq_group_output_list.append(
-                    CompletionSequenceGroupOutput(samples=[],
-                                                  prompt_logprobs=None))
-            else:
-                # Sequence with output.
-                seq_id, seq_data = seq_data_entries[output_index]
-                needs_prompt_logprobs = seq_output_prompt_logprobs[
-                    output_index]
-                if needs_prompt_logprobs:
-                    prompt_token_ids = seq_data.get_prompt_token_ids()
-                    prompt_logprobs = [
-                        create_logprobs_output(
-                            token_id=p_token_id,
-                            token_id_logprob_rank=-1,
-                            token_id_logprob=0.0,
-                            topk_token_ids=[],
-                            topk_logprobs=[],
-                        )
-                        # no prompt logprobs for the first token
-                        for p_token_id in prompt_token_ids[1:]
-                    ]
-                else:
-                    prompt_logprobs = None
-                completion_seq_group_output_list.append(
-                    create_sequence_group_output(
-                        token_id=sampled_token_ids_list[output_index][0],
+        for idx, seq_group_meta in enumerate(
+                execute_model_req.seq_group_metadata_list):
+            needs_prompt_logprobs = seq_output_prompt_logprobs[idx]
+            seq_id, seq_data = seq_data_entries[idx]
+            if needs_prompt_logprobs:
+                prompt_token_ids = seq_data.get_prompt_token_ids()
+
+                # Some of these sequences may belong to non-terminal chunks,
+                # which may still have to report logprobs for prompts.
+                start = 1 if seq_data._num_computed_tokens == 0 \
+                    else seq_data._num_computed_tokens
+                end = (seq_data._num_computed_tokens + \
+                       seq_group_meta.token_chunk_size)
+                prompt_token_ids = prompt_token_ids[start:end]
+                prompt_logprobs = [
+                    create_logprobs_output(
+                        token_id=p_token_id,
                         token_id_logprob_rank=-1,
                         token_id_logprob=0.0,
-                        seq_id=seq_id,
                         topk_token_ids=[],
                         topk_logprobs=[],
-                        prompt_logprobs=prompt_logprobs))
-                output_index += 1
+                    ) for p_token_id in prompt_token_ids
+                ]
+            else:
+                prompt_logprobs = None
+
+            # Since we can get chunks here, we dont always have a sampled token
+            # (only on last chunk) but we still have to provide an output.
+            if not seq_group_meta.do_sample:
+                completion_seq_group_output_list.append(
+                    CompletionSequenceGroupOutput(
+                        samples=[], prompt_logprobs=prompt_logprobs))
+                continue
+
+            # Sequence with output.
+            completion_seq_group_output_list.append(
+                create_sequence_group_output(
+                    token_id=sampled_token_ids_list[output_index][0],
+                    token_id_logprob_rank=-1,
+                    token_id_logprob=0.0,
+                    seq_id=seq_id,
+                    topk_token_ids=[],
+                    topk_logprobs=[],
+                    prompt_logprobs=prompt_logprobs))
+            output_index += 1
 
         return [SamplerOutput(outputs=completion_seq_group_output_list)]
 
@@ -624,24 +631,27 @@ def _run_no_spec(self, execute_model_req: ExecuteModelRequest,
         assert len(sampler_output) == 1
         sampler_output = sampler_output[0]
 
-        # Store hidden states from target model execution.
+        # Store hidden states from target model execution, BxD.
         hidden_states = sampler_output.hidden_states
         if hidden_states is not None:
-            # remove hidden_states for prompt tokens
-            # TODO Enable `return_hidden_states`: prefill chunks hidden states
-            # are pruned by the logits processor. Also, they should be arranged
-            # back into full-prefill latent. Address it to enable MLPSpeculator.
-            if any(seq.is_prompt
-                   for seq in execute_model_req.seq_group_metadata_list):
+            # Only decodes and prefill terminal chunks need a hidden state.
+            seq_group_meta_with_hidden = [
+                sg for sg in execute_model_req.seq_group_metadata_list
+                if sg.do_sample
+            ]
+            if any(seq.is_prompt for seq in seq_group_meta_with_hidden):
+                # Drop hidden_states with no prediction (eg non-terminal chunks)
                 hidden_states = hidden_states[
                     torch.where(sampler_output.sampled_token_ids -
                                 VLLM_INVALID_TOKEN_ID)[0]]
-            if self.previous_hidden_states is None:
+            if self.previous_hidden_states is None and len(
+                    seq_group_meta_with_hidden):
                 self.previous_hidden_states = HiddenStates(
-                    hidden_states, execute_model_req.seq_group_metadata_list)
-            else:
-                self.previous_hidden_states.update(
-                    hidden_states, execute_model_req.seq_group_metadata_list)
+                    hidden_states, seq_group_meta_with_hidden)
+            elif self.previous_hidden_states and len(
+                    seq_group_meta_with_hidden):
+                self.previous_hidden_states.update(hidden_states,
+                                                   seq_group_meta_with_hidden)
 
         if not skip_proposer:
             # We prepare the prefill hidden states here so that there no
@@ -752,13 +762,13 @@ def _run_speculative_decoding_step(
         ]
         if len(non_spec_indices):
             all_hidden_states = proposal_scores.hidden_states
-            # TODO fix `return_hidden_states`, same as in `_run_no_spec`
             if all_hidden_states is not None:
                 prefill_hidden_states = all_hidden_states[non_spec_indices]
                 execute_model_req.previous_hidden_states = \
                     prepare_prefill_hidden_states(prefill_hidden_states)
             # Sync proposer KV cache for prefills.
             prefill_req = execute_model_req.clone(non_spec_seqs)
+            # TODO avoid sampling here?
             self.proposer_worker.execute_model(prefill_req)
 
         with Timer() as verification_timer:
@@ -774,6 +784,8 @@ def _run_speculative_decoding_step(
             execute_model_req.seq_group_metadata_list,
             accepted_token_ids,
             target_logprobs=target_logprobs,
+            prompt_logprobs=proposal_scores.prompt_logprobs
+            if not self._disable_logprobs else None,
             k=execute_model_req.num_lookahead_slots,
             stage_times=stage_times)
 
@@ -845,19 +857,32 @@ def _verify_tokens(
         # metadata.
         accepted_token_ids[original_indices] = accepted_token_ids.clone()
 
+        # B x K+1 x D
         hidden_states = proposal_scores.hidden_states
         if hidden_states is not None:
+            # Only get terminal hidden states for next step
+            terminal_metadata = [
+                sg for sg in seq_group_metadata_list if sg.do_sample
+            ]
+
             # Contract hidden states based on accepted tokens
             hs_size = hidden_states.shape[-1]
-
             accepted_index = accepted_token_ids + 1  # Convert -1 to 0
-            accepted_index = accepted_index.count_nonzero(dim=1).add_(-1)
-            index = accepted_index[:, None, None].expand(-1, 1, hs_size)
+            accepted_index = accepted_index.count_nonzero(dim=1).add_(-1)  # b
+            # Drop non-terminal prefill chunks hidden states.
+            hidden_states = hidden_states[
+                accepted_index != VLLM_INVALID_TOKEN_ID]
+            accepted_index = accepted_index[
+                accepted_index != VLLM_INVALID_TOKEN_ID]
+            assert len(accepted_index) == hidden_states.shape[0] == len(
+                terminal_metadata)
+            index = accepted_index[:, None, None].expand(-1, 1,
+                                                         hs_size)  # b x 1 x d
             second_last_token_hidden_states = hidden_states[:, -2]  # b x d
             hidden_states = hidden_states.gather(1, index).squeeze(1)  # b x d
             # Store hidden states from target model for subsequent decode step
             self.previous_hidden_states = HiddenStates(
-                hidden_states, seq_group_metadata_list,
+                hidden_states, terminal_metadata,
                 second_last_token_hidden_states)
         return accepted_token_ids, logprobs
 
@@ -866,6 +891,8 @@ def _create_output_sampler_list(
         seq_group_metadata_list: List[SequenceGroupMetadata],
         accepted_token_ids: torch.Tensor,  # shape: [batch_size, k+1]
         target_logprobs: torch.Tensor,  # shape: [batch_size, k+1, vocab_size]
+        prompt_logprobs: Optional[
+            torch.Tensor],  # shape: [nprompt_tokens, vocab_size]
         k: int,
         stage_times: Tuple[float, float, float],
     ) -> List[SamplerOutput]:
@@ -909,15 +936,89 @@ def _create_output_sampler_list(
 
         # Construct the output on a per-step, per-sequence basis.
         # Non-terminal prefill chunks will end up here as rows with just -1s
-        # i.e mixed-batch [[-1, 1576], [-1, 29884], [-1, -1], [-1, -1]]
+        # i.e mixed-batch [[-1, 1576], [-1, 29884], [-1, -1], [-1, -1]] while
+        # terminal chunks will only have one generated token at time 0.
         sampler_output_list: List[SamplerOutput] = []
+
+        # Prefills are not multi-step (return at most 1 token), in order to
+        # avoid padding or repetition to fit decodes, we separate them.
+        for i, sg in enumerate(seq_group_metadata_list):
+            if not sg.is_prompt:
+                # Requests are ordered as prefills|decodes=>no more prefills.
+                break
+            num_logprobs = num_logprobs_per_seq[i]
+            seq_kwargs = dict(token_id=-1,
+                              token_id_logprob_rank=0,
+                              token_id_logprob=-float('inf'),
+                              topk_token_ids=[-1] * num_logprobs,
+                              topk_logprobs=[-float('inf')] * num_logprobs,
+                              seq_id=seq_ids[i])
+            # Terminal chunk, has token.
+            if sg.do_sample:
+                seq_kwargs.update(
+                    dict(
+                        token_id=accepted_token_ids[i][0].item(),
+                        token_id_logprob_rank=accepted_token_id_ranks_by_step[
+                            0][i],
+                        token_id_logprob=accepted_token_id_logprobs_by_step[0]
+                        [i],
+                        topk_token_ids=topk_indices_by_step[0][i]
+                        [:num_logprobs],
+                        # output only so step is 0
+                        topk_logprobs=topk_logprobs_by_step[0][i]
+                        [:num_logprobs],
+                    ))
+            needs_plogs = (sg.sampling_params.prompt_logprobs
+                           and sg.sampling_params.prompt_logprobs > 0)
+            plogs = None
+            if prompt_logprobs is not None:
+                # Even non-terminal prompt chunks can have logprobs here.
+                plogs = prompt_logprobs[i]
+            elif needs_plogs:
+                # Prompt logprobs are requested but `_disable_logprobs` is set.
+                seq_data = next(iter(sg.seq_data.values()))
+                # Get only the tokens in this chunk!
+                prompt_token_ids = seq_data.get_prompt_token_ids()
+                prompt_token_ids = prompt_token_ids[
+                    seq_data.
+                    _num_computed_tokens:seq_data._num_computed_tokens +
+                    sg.token_chunk_size]
+
+                is_first_chunk = seq_data._num_computed_tokens == 0
+                # There's no prob generated for the first token in a sequence.
+                if is_first_chunk:
+                    prompt_token_ids = prompt_token_ids[1:]
+                plogs = [
+                    create_logprobs_output(
+                        token_id=p_token_id,
+                        token_id_logprob_rank=-1,
+                        token_id_logprob=0.0,
+                        topk_token_ids=[],
+                        topk_logprobs=[],
+                    ) for p_token_id in prompt_token_ids
+                ]
+            seq_kwargs.update(dict(prompt_logprobs=plogs))
+
+            sampler_output_list.append(
+                SamplerOutput(
+                    outputs=[create_sequence_group_output(
+                        **seq_kwargs)]))  # type: ignore
+
+        # Decodes, create one SamplerOutput per-step (at most K+1).
         for step_index in range(num_steps):
-            if all(token_id == -1
-                   for token_id in accepted_token_ids_by_step[step_index]):
+            if all(token_id == -1 for sg, token_id in zip(
+                    seq_group_metadata_list,
+                    accepted_token_ids_by_step[step_index])
+                   if not sg.is_prompt):
                 break
 
             step_output_token_ids: List[CompletionSequenceGroupOutput] = []
             for sequence_index in range(batch_size):
+                seq_meta = seq_group_metadata_list[sequence_index]
+                # Prompts already processed above.
+                if seq_meta.is_prompt:
+                    continue
+
                 # Each sequence may have a different num_logprobs; retrieve it.
                 num_logprobs = num_logprobs_per_seq[sequence_index]
                 step_output_token_ids.append(
@@ -952,6 +1053,8 @@ def _create_output_sampler_list(
             # This is periodic because the rejection sampler emits metrics
             # periodically.
             self._maybe_log_stage_times(*stage_times)
+        # First `n_prefills` entries will contain prefills SamplerOutput when
+        # chunked prefill is enabled, the rest is decodes in multi-step format.
         return sampler_output_list
 
     def _maybe_log_stage_times(self, average_time_per_proposal_tok_ms: float,

From 823ab796330825f4052d771e2c462ad3b55236eb Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Tue, 28 Jan 2025 00:23:08 +0000
Subject: [PATCH 401/462] Update `pre-commit` hooks (#12475)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 .pre-commit-config.yaml                       |  10 +-
 benchmarks/benchmark_serving.py               |   4 +-
 csrc/custom_all_reduce.cuh                    |   8 +-
 csrc/moe/marlin_kernels/marlin_moe_kernel.h   |   8 +-
 csrc/quantization/gptq_marlin/gptq_marlin.cu  |  16 +--
 .../marlin/dense/marlin_cuda_kernel.cu        |   4 +-
 .../marlin/qqq/marlin_qqq_gemm_kernel.cu      |   4 +-
 csrc/quantization/marlin/sparse/common/mma.h  |   4 +-
 csrc/rocm/attention.cu                        |   4 +-
 setup.py                                      |   2 +-
 tests/kernels/test_block_fp8.py               |  25 ++--
 tests/kv_transfer/test_lookup_buffer.py       |  10 +-
 tests/lora/test_qwen2vl.py                    |   6 +-
 .../vision_language/test_models.py            | 130 ++++++++++--------
 .../vision_language/test_pixtral.py           |  17 ++-
 tests/quantization/test_compressed_tensors.py |   6 +-
 tests/samplers/test_rejection_sampler.py      |  15 +-
 tools/report_build_time_ninja.py              |   5 +-
 vllm/_custom_ops.py                           |   4 +-
 vllm/attention/ops/prefix_prefill.py          |  28 ++--
 vllm/attention/ops/triton_flash_attention.py  |   4 +-
 vllm/attention/selector.py                    |   4 +-
 vllm/config.py                                |   7 +-
 vllm/core/block/common.py                     |   7 +-
 vllm/core/block_manager.py                    |   4 +-
 vllm/core/scheduler.py                        |  23 ++--
 .../device_communicators/shm_broadcast.py     |   8 +-
 vllm/distributed/parallel_state.py            |   8 +-
 vllm/entrypoints/chat_utils.py                |   4 +-
 vllm/entrypoints/openai/serving_completion.py |   9 +-
 .../granite_20b_fc_tool_parser.py             |   4 +-
 vllm/lora/layers.py                           |  12 +-
 vllm/lora/models.py                           |   5 +-
 vllm/lora/ops/triton_ops/sgmv_expand.py       |   5 +-
 vllm/lora/ops/triton_ops/sgmv_shrink.py       |   4 +-
 .../kernels/mixed_precision/MPLinearKernel.py |  12 +-
 .../kernels/scaled_mm/ScaledMMLinearKernel.py |  14 +-
 .../layers/quantization/utils/fp8_utils.py    |   7 +-
 .../layers/quantization/utils/w8a8_utils.py   |   4 +-
 vllm/model_executor/layers/sampler.py         |   7 +-
 .../layers/vocab_parallel_embedding.py        |  16 +--
 vllm/model_executor/model_loader/loader.py    |   5 +-
 .../model_executor/model_loader/tensorizer.py |   4 +-
 vllm/model_executor/models/gemma.py           |   4 +-
 vllm/model_executor/models/granitemoe.py      |   6 +-
 vllm/model_executor/models/mllama.py          |   4 +-
 vllm/model_executor/models/mlp_speculator.py  |   4 +-
 vllm/model_executor/models/phimoe.py          |   8 +-
 vllm/model_executor/models/registry.py        |   3 +-
 vllm/model_executor/models/ultravox.py        |   8 +-
 vllm/model_executor/models/utils.py           |   5 +-
 vllm/model_executor/sampling_metadata.py      |  11 +-
 vllm/platforms/neuron.py                      |   4 +-
 vllm/scalar_type.py                           |   4 +-
 vllm/spec_decode/spec_decode_worker.py        |   4 +-
 vllm/spec_decode/top1_proposer.py             |  10 +-
 vllm/spec_decode/util.py                      |  12 +-
 vllm/transformers_utils/configs/nemotron.py   |   4 +-
 vllm/utils.py                                 |  10 +-
 vllm/v1/core/scheduler.py                     |   4 +-
 vllm/v1/stats/common.py                       |   4 +-
 vllm/v1/worker/gpu_model_runner.py            |   2 +-
 vllm/worker/hpu_worker.py                     |   8 +-
 vllm/worker/tpu_model_runner.py               |   4 +-
 64 files changed, 322 insertions(+), 288 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 432bf5ed18dbc..7b32df90bfd8b 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -3,18 +3,18 @@ default_stages:
   - manual # Run in CI
 repos:
 - repo: https://github.com/google/yapf
-  rev: v0.32.0
+  rev: v0.43.0
   hooks:
   - id: yapf
     args: [--in-place, --verbose]
     additional_dependencies: [toml] # TODO: Remove when yapf is upgraded
 - repo: https://github.com/astral-sh/ruff-pre-commit
-  rev: v0.6.5
+  rev: v0.9.3
   hooks:
   - id: ruff
     args: [--output-format, github]
 - repo: https://github.com/codespell-project/codespell
-  rev: v2.3.0
+  rev: v2.4.0
   hooks:
   - id: codespell
     exclude: 'benchmarks/sonnet.txt|(build|tests/(lora/data|models/fixtures|prompts))/.*'
@@ -23,7 +23,7 @@ repos:
   hooks:
   - id: isort
 - repo: https://github.com/pre-commit/mirrors-clang-format
-  rev: v18.1.5
+  rev: v19.1.7
   hooks:
   - id: clang-format
     exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))'
@@ -35,7 +35,7 @@ repos:
   - id: pymarkdown
     files: docs/.*
 - repo: https://github.com/rhysd/actionlint
-  rev: v1.7.6
+  rev: v1.7.7
   hooks:
   - id: actionlint
 - repo: local
diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index 63d2c3f7c7dd9..8b3212831e7e0 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -926,8 +926,8 @@ def main(args: argparse.Namespace):
                     )
 
         # Traffic
-        result_json["request_rate"] = (
-            args.request_rate if args.request_rate < float("inf") else "inf")
+        result_json["request_rate"] = (args.request_rate if args.request_rate
+                                       < float("inf") else "inf")
         result_json["burstiness"] = args.burstiness
         result_json["max_concurrency"] = args.max_concurrency
 
diff --git a/csrc/custom_all_reduce.cuh b/csrc/custom_all_reduce.cuh
index 6be4d4f2b2eb8..b9df4ed160b03 100644
--- a/csrc/custom_all_reduce.cuh
+++ b/csrc/custom_all_reduce.cuh
@@ -38,9 +38,13 @@ struct Signal {
   alignas(128) FlagType peer_counter[2][kMaxBlocks][8];
 };
 
-struct __align__(16) RankData { const void* __restrict__ ptrs[8]; };
+struct __align__(16) RankData {
+  const void* __restrict__ ptrs[8];
+};
 
-struct __align__(16) RankSignals { Signal* signals[8]; };
+struct __align__(16) RankSignals {
+  Signal* signals[8];
+};
 
 // like std::array, but aligned
 template <typename T, int sz>
diff --git a/csrc/moe/marlin_kernels/marlin_moe_kernel.h b/csrc/moe/marlin_kernels/marlin_moe_kernel.h
index a217401b3d7c2..47ecf109d0f53 100644
--- a/csrc/moe/marlin_kernels/marlin_moe_kernel.h
+++ b/csrc/moe/marlin_kernels/marlin_moe_kernel.h
@@ -138,8 +138,8 @@ __device__ inline FragB dequant<vllm::kU4B8.id()>(int q) {
   const int HI = 0x00f000f0;
   const int EX = 0x64006400;
   // Guarantee that the `(a & b) | c` operations are LOP3s.
-  int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
-  int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
+  int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
+  int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
   // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
   // directly into `SUB` and `ADD`.
   const int SUB = 0x64086408;
@@ -182,8 +182,8 @@ __device__ inline FragB dequant<vllm::kU4.id()>(int q) {
   const int HI = 0x00f000f0;
   const int EX = 0x64006400;
   // Guarantee that the `(a & b) | c` operations are LOP3s.
-  int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
-  int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
+  int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
+  int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
 
   const int SUB = 0x64006400;
   const int MUL = 0x2c002c00;
diff --git a/csrc/quantization/gptq_marlin/gptq_marlin.cu b/csrc/quantization/gptq_marlin/gptq_marlin.cu
index 04ef842fbdf95..7c33fea93d6ae 100644
--- a/csrc/quantization/gptq_marlin/gptq_marlin.cu
+++ b/csrc/quantization/gptq_marlin/gptq_marlin.cu
@@ -173,8 +173,8 @@ dequant<half, vllm::kU4B8.id()>(int q) {
   const int HI = 0x00f000f0;
   const int EX = 0x64006400;
   // Guarantee that the `(a & b) | c` operations are LOP3s.
-  int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
-  int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
+  int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
+  int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
   // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
   // directly into `SUB` and `ADD`.
   const int SUB = 0x64086408;
@@ -197,9 +197,9 @@ dequant<nv_bfloat16, vllm::kU4B8.id()>(int q) {
 
   // Guarantee that the `(a & b) | c` operations are LOP3s.
 
-  int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
+  int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX);
   q >>= 4;
-  int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
+  int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX);
 
   typename ScalarType<nv_bfloat16>::FragB frag_b;
   static constexpr uint32_t MUL = 0x3F803F80;
@@ -221,8 +221,8 @@ dequant<half, vllm::kU4.id()>(int q) {
   const int HI = 0x00f000f0;
   const int EX = 0x64006400;
   // Guarantee that the `(a & b) | c` operations are LOP3s.
-  int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
-  int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
+  int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
+  int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
 
   const int SUB = 0x64006400;
   const int MUL = 0x2c002c00;
@@ -244,9 +244,9 @@ dequant<nv_bfloat16, vllm::kU4.id()>(int q) {
 
   // Guarantee that the `(a & b) | c` operations are LOP3s.
 
-  int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
+  int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX);
   q >>= 4;
-  int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
+  int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX);
 
   typename ScalarType<nv_bfloat16>::FragB frag_b;
   static constexpr uint32_t MUL = 0x3F803F80;
diff --git a/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu b/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu
index c03fef886e4db..4db8f5dcdabf6 100644
--- a/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu
+++ b/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu
@@ -96,8 +96,8 @@ __device__ inline FragB dequant(int q) {
   const int HI = 0x00f000f0;
   const int EX = 0x64006400;
   // Guarantee that the `(a & b) | c` operations are LOP3s.
-  int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
-  int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
+  int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
+  int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
   // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
   // directly into `SUB` and `ADD`.
   const int SUB = 0x64086408;
diff --git a/csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu b/csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu
index 103a6444f3a21..048a3f736fb71 100644
--- a/csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu
+++ b/csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu
@@ -141,8 +141,8 @@ __device__ inline FragB dequant_per_group(int q, FragS_GROUP& frag_s, int i) {
   static constexpr uint32_t HI = 0x00f000f0;
   static constexpr uint32_t EX = 0x64006400;
   // Guarantee that the `(a & b) | c` operations are LOP3s.
-  uint32_t t0 = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
-  uint32_t t1 = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
+  uint32_t t0 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
+  uint32_t t1 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
   // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
   // directly into `SUB` and `ADD`.
   static constexpr uint32_t SUB = 0x64086408;
diff --git a/csrc/quantization/marlin/sparse/common/mma.h b/csrc/quantization/marlin/sparse/common/mma.h
index b26505f771c8b..49eee4128ee7c 100644
--- a/csrc/quantization/marlin/sparse/common/mma.h
+++ b/csrc/quantization/marlin/sparse/common/mma.h
@@ -127,8 +127,8 @@ __device__ inline FragB dequant_4bit(int q) {
   const int HI = 0x00f000f0;
   const int EX = 0x64006400;
   // Guarantee that the `(a & b) | c` operations are LOP3s.
-  int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
-  int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
+  int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
+  int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
   // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
   // directly into `SUB` and `ADD`.
   const int SUB = 0x64086408;
diff --git a/csrc/rocm/attention.cu b/csrc/rocm/attention.cu
index 9477790629c9f..ffa9d44610a7f 100644
--- a/csrc/rocm/attention.cu
+++ b/csrc/rocm/attention.cu
@@ -907,7 +907,9 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
     const scalar_t* __restrict__ tmp_out,  // [num_seqs, num_heads,
                                            // max_num_partitions, head_size]
     const int* __restrict__ context_lens,  // [num_seqs]
-    const int max_num_partitions){UNREACHABLE_CODE}
+    const int max_num_partitions) {
+  UNREACHABLE_CODE
+}
 
 #endif  // defined(__HIP__MI300_MI250__) TODO: Add NAVI support
 
diff --git a/setup.py b/setup.py
index ee193e4693806..59ece870b5585 100755
--- a/setup.py
+++ b/setup.py
@@ -417,7 +417,7 @@ def get_rocm_version():
 
         if (get_rocm_core_version(ctypes.byref(major), ctypes.byref(minor),
                                   ctypes.byref(patch)) == 0):
-            return "%d.%d.%d" % (major.value, minor.value, patch.value)
+            return f"{major.value}.{minor.value}.{patch.value}"
         return None
     except Exception:
         return None
diff --git a/tests/kernels/test_block_fp8.py b/tests/kernels/test_block_fp8.py
index a16cc4582a180..f28fdf3feedbc 100644
--- a/tests/kernels/test_block_fp8.py
+++ b/tests/kernels/test_block_fp8.py
@@ -92,8 +92,10 @@ def native_w8a8_block_fp8_matmul(A,
         A[:, i * block_k:min((i + 1) * block_k, K)] for i in range(k_tiles)
     ]
     B_tiles = [[
-        B[j * block_n:min((j + 1) * block_n, N),
-          i * block_k:min((i + 1) * block_k, K), ] for i in range(k_tiles)
+        B[
+            j * block_n:min((j + 1) * block_n, N),
+            i * block_k:min((i + 1) * block_k, K),
+        ] for i in range(k_tiles)
     ] for j in range(n_tiles)]
     C_tiles = [
         C[:, j * block_n:min((j + 1) * block_n, N)] for j in range(n_tiles)
@@ -157,9 +159,9 @@ def setup_cuda():
     torch.set_default_device("cuda")
 
 
-@pytest.mark.parametrize("num_tokens,d,dtype,group_size,seed",
-                         itertools.product(NUM_TOKENS, D, DTYPES, GROUP_SIZE,
-                                           SEEDS))
+@pytest.mark.parametrize(
+    "num_tokens,d,dtype,group_size,seed",
+    itertools.product(NUM_TOKENS, D, DTYPES, GROUP_SIZE, SEEDS))
 @torch.inference_mode()
 def test_per_token_group_quant_fp8(num_tokens, d, dtype, group_size, seed):
     torch.manual_seed(seed)
@@ -174,9 +176,9 @@ def test_per_token_group_quant_fp8(num_tokens, d, dtype, group_size, seed):
     assert torch.allclose(scale, ref_scale)
 
 
-@pytest.mark.parametrize("M,N,K,block_size,out_dtype,seed",
-                         itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES,
-                                           SEEDS))
+@pytest.mark.parametrize(
+    "M,N,K,block_size,out_dtype,seed",
+    itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES, SEEDS))
 @torch.inference_mode()
 def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed):
     torch.manual_seed(seed)
@@ -207,9 +209,10 @@ def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed):
     assert rel_diff < 0.001
 
 
-@pytest.mark.parametrize("M,N,K,E,topk,block_size,dtype,seed",
-                         itertools.product(M_moe, N_moe, K_moe, E, TOP_KS,
-                                           BLOCK_SIZE, DTYPES, SEEDS))
+@pytest.mark.parametrize(
+    "M,N,K,E,topk,block_size,dtype,seed",
+    itertools.product(M_moe, N_moe, K_moe, E, TOP_KS, BLOCK_SIZE, DTYPES,
+                      SEEDS))
 @torch.inference_mode()
 def test_w8a8_block_fp8_fused_moe(M, N, K, E, topk, block_size, dtype, seed):
     torch.manual_seed(seed)
diff --git a/tests/kv_transfer/test_lookup_buffer.py b/tests/kv_transfer/test_lookup_buffer.py
index 718730bb8cbbe..4d6890305af73 100644
--- a/tests/kv_transfer/test_lookup_buffer.py
+++ b/tests/kv_transfer/test_lookup_buffer.py
@@ -20,7 +20,7 @@ def test_run(my_rank, buffer, device):
         assert buffer.buffer_size == 0
         assert len(buffer.buffer) == 0
 
-    print("My rank: %d, device: %s" % (my_rank, device))
+    print(f"My rank: {my_rank}, device: {device}")
 
     # insert
     tokens = torch.tensor([1, 2, 3]).to(device)
@@ -48,7 +48,7 @@ def test_run(my_rank, buffer, device):
         assert buffer.buffer_size == 0
         assert len(buffer.buffer) == 0
 
-    print("My rank: %d, Test run passed!" % (my_rank))
+    print(f"My rank: {my_rank}, Test run passed!")
 
 
 def stress_test(my_rank, buf, device):
@@ -94,7 +94,7 @@ def stress_test(my_rank, buf, device):
                 assert torch.allclose(k, k_)
                 assert torch.allclose(v, v_)
                 assert torch.allclose(h, h_)
-    print('Rank %d done' % my_rank)
+    print(f"Rank {my_rank} done")
     torch.distributed.barrier()
 
     if my_rank == 0:
@@ -108,7 +108,7 @@ def stress_test(my_rank, buf, device):
     else:
         torch.distributed.send(torch.tensor([n]), 0)
 
-    print("My rank: %d, Passed stress test!" % (my_rank))
+    print(f"My rank: {my_rank}, Passed stress test!")
 
 
 if __name__ == "__main__":
@@ -122,7 +122,7 @@ def stress_test(my_rank, buf, device):
         rank=my_rank,
     )
 
-    print("initialized! My rank is %d" % my_rank)
+    print(f"initialized! My rank is {my_rank}")
 
     config = KVTransferConfig(
         kv_connector='PyNcclConnector',
diff --git a/tests/lora/test_qwen2vl.py b/tests/lora/test_qwen2vl.py
index ebdd129db5f6a..570aa3861d0be 100644
--- a/tests/lora/test_qwen2vl.py
+++ b/tests/lora/test_qwen2vl.py
@@ -55,9 +55,9 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
     return generated_texts
 
 
-@pytest.mark.xfail(current_platform.is_rocm(),
-                   reason="Qwen2-VL dependency xformers incompatible with ROCm"
-                   )
+@pytest.mark.xfail(
+    current_platform.is_rocm(),
+    reason="Qwen2-VL dependency xformers incompatible with ROCm")
 def test_qwen2vl_lora(qwen2vl_lora_files):
     llm = vllm.LLM(
         MODEL_PATH,
diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py
index 14d9a739be318..d5f0d63288cc1 100644
--- a/tests/models/decoder_only/vision_language/test_models.py
+++ b/tests/models/decoder_only/vision_language/test_models.py
@@ -521,12 +521,13 @@ def _mark_splits(
 # - image embeddings
 # - video
 # - custom inputs
-@pytest.mark.parametrize("model_type,test_case",
-                         get_parametrized_options(
-                             VLM_TEST_SETTINGS,
-                             test_type=VLMTestType.IMAGE,
-                             fork_new_process_for_each_test=False,
-                         ))
+@pytest.mark.parametrize(
+    "model_type,test_case",
+    get_parametrized_options(
+        VLM_TEST_SETTINGS,
+        test_type=VLMTestType.IMAGE,
+        fork_new_process_for_each_test=False,
+    ))
 def test_single_image_models(tmp_path: PosixPath, model_type: str,
                              test_case: ExpandableVLMTestArgs,
                              hf_runner: Type[HfRunner],
@@ -543,12 +544,13 @@ def test_single_image_models(tmp_path: PosixPath, model_type: str,
     )
 
 
-@pytest.mark.parametrize("model_type,test_case",
-                         get_parametrized_options(
-                             VLM_TEST_SETTINGS,
-                             test_type=VLMTestType.MULTI_IMAGE,
-                             fork_new_process_for_each_test=False,
-                         ))
+@pytest.mark.parametrize(
+    "model_type,test_case",
+    get_parametrized_options(
+        VLM_TEST_SETTINGS,
+        test_type=VLMTestType.MULTI_IMAGE,
+        fork_new_process_for_each_test=False,
+    ))
 def test_multi_image_models(tmp_path: PosixPath, model_type: str,
                             test_case: ExpandableVLMTestArgs,
                             hf_runner: Type[HfRunner],
@@ -565,12 +567,13 @@ def test_multi_image_models(tmp_path: PosixPath, model_type: str,
     )
 
 
-@pytest.mark.parametrize("model_type,test_case",
-                         get_parametrized_options(
-                             VLM_TEST_SETTINGS,
-                             test_type=VLMTestType.EMBEDDING,
-                             fork_new_process_for_each_test=False,
-                         ))
+@pytest.mark.parametrize(
+    "model_type,test_case",
+    get_parametrized_options(
+        VLM_TEST_SETTINGS,
+        test_type=VLMTestType.EMBEDDING,
+        fork_new_process_for_each_test=False,
+    ))
 def test_image_embedding_models(model_type: str,
                                 test_case: ExpandableVLMTestArgs,
                                 hf_runner: Type[HfRunner],
@@ -586,12 +589,13 @@ def test_image_embedding_models(model_type: str,
     )
 
 
-@pytest.mark.parametrize("model_type,test_case",
-                         get_parametrized_options(
-                             VLM_TEST_SETTINGS,
-                             test_type=VLMTestType.VIDEO,
-                             fork_new_process_for_each_test=False,
-                         ))
+@pytest.mark.parametrize(
+    "model_type,test_case",
+    get_parametrized_options(
+        VLM_TEST_SETTINGS,
+        test_type=VLMTestType.VIDEO,
+        fork_new_process_for_each_test=False,
+    ))
 def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs,
                       hf_runner: Type[HfRunner], vllm_runner: Type[VllmRunner],
                       video_assets: _VideoAssets):
@@ -605,12 +609,13 @@ def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs,
     )
 
 
-@pytest.mark.parametrize("model_type,test_case",
-                         get_parametrized_options(
-                             VLM_TEST_SETTINGS,
-                             test_type=VLMTestType.CUSTOM_INPUTS,
-                             fork_new_process_for_each_test=False,
-                         ))
+@pytest.mark.parametrize(
+    "model_type,test_case",
+    get_parametrized_options(
+        VLM_TEST_SETTINGS,
+        test_type=VLMTestType.CUSTOM_INPUTS,
+        fork_new_process_for_each_test=False,
+    ))
 def test_custom_inputs_models(
     model_type: str,
     test_case: ExpandableVLMTestArgs,
@@ -627,12 +632,13 @@ def test_custom_inputs_models(
 
 
 #### Tests filtering for things running each test as a new process
-@pytest.mark.parametrize("model_type,test_case",
-                         get_parametrized_options(
-                             VLM_TEST_SETTINGS,
-                             test_type=VLMTestType.IMAGE,
-                             fork_new_process_for_each_test=True,
-                         ))
+@pytest.mark.parametrize(
+    "model_type,test_case",
+    get_parametrized_options(
+        VLM_TEST_SETTINGS,
+        test_type=VLMTestType.IMAGE,
+        fork_new_process_for_each_test=True,
+    ))
 @fork_new_process_for_each_test
 def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str,
                                    test_case: ExpandableVLMTestArgs,
@@ -650,12 +656,13 @@ def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str,
     )
 
 
-@pytest.mark.parametrize("model_type,test_case",
-                         get_parametrized_options(
-                             VLM_TEST_SETTINGS,
-                             test_type=VLMTestType.MULTI_IMAGE,
-                             fork_new_process_for_each_test=True,
-                         ))
+@pytest.mark.parametrize(
+    "model_type,test_case",
+    get_parametrized_options(
+        VLM_TEST_SETTINGS,
+        test_type=VLMTestType.MULTI_IMAGE,
+        fork_new_process_for_each_test=True,
+    ))
 @fork_new_process_for_each_test
 def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str,
                                   test_case: ExpandableVLMTestArgs,
@@ -673,12 +680,13 @@ def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str,
     )
 
 
-@pytest.mark.parametrize("model_type,test_case",
-                         get_parametrized_options(
-                             VLM_TEST_SETTINGS,
-                             test_type=VLMTestType.EMBEDDING,
-                             fork_new_process_for_each_test=True,
-                         ))
+@pytest.mark.parametrize(
+    "model_type,test_case",
+    get_parametrized_options(
+        VLM_TEST_SETTINGS,
+        test_type=VLMTestType.EMBEDDING,
+        fork_new_process_for_each_test=True,
+    ))
 @fork_new_process_for_each_test
 def test_image_embedding_models_heavy(model_type: str,
                                       test_case: ExpandableVLMTestArgs,
@@ -695,12 +703,13 @@ def test_image_embedding_models_heavy(model_type: str,
     )
 
 
-@pytest.mark.parametrize("model_type,test_case",
-                         get_parametrized_options(
-                             VLM_TEST_SETTINGS,
-                             test_type=VLMTestType.VIDEO,
-                             fork_new_process_for_each_test=True,
-                         ))
+@pytest.mark.parametrize(
+    "model_type,test_case",
+    get_parametrized_options(
+        VLM_TEST_SETTINGS,
+        test_type=VLMTestType.VIDEO,
+        fork_new_process_for_each_test=True,
+    ))
 def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs,
                             hf_runner: Type[HfRunner],
                             vllm_runner: Type[VllmRunner],
@@ -715,12 +724,13 @@ def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs,
     )
 
 
-@pytest.mark.parametrize("model_type,test_case",
-                         get_parametrized_options(
-                             VLM_TEST_SETTINGS,
-                             test_type=VLMTestType.CUSTOM_INPUTS,
-                             fork_new_process_for_each_test=True,
-                         ))
+@pytest.mark.parametrize(
+    "model_type,test_case",
+    get_parametrized_options(
+        VLM_TEST_SETTINGS,
+        test_type=VLMTestType.CUSTOM_INPUTS,
+        fork_new_process_for_each_test=True,
+    ))
 @fork_new_process_for_each_test
 def test_custom_inputs_models_heavy(
     model_type: str,
diff --git a/tests/models/decoder_only/vision_language/test_pixtral.py b/tests/models/decoder_only/vision_language/test_pixtral.py
index 90c0fab99054c..8103e5305b91b 100644
--- a/tests/models/decoder_only/vision_language/test_pixtral.py
+++ b/tests/models/decoder_only/vision_language/test_pixtral.py
@@ -135,10 +135,10 @@ def _dump_outputs_w_logprobs(
     outputs: OutputsLogprobs,
     filename: "StrPath",
 ) -> None:
-    json_data = [(tokens, text,
-                  [{k: asdict(v)
-                    for k, v in token_logprobs.items()}
-                   for token_logprobs in (logprobs or [])])
+    json_data = [(tokens, text, [{
+        k: asdict(v)
+        for k, v in token_logprobs.items()
+    } for token_logprobs in (logprobs or [])])
                  for tokens, text, logprobs in outputs]
 
     with open(filename, "w") as f:
@@ -149,11 +149,10 @@ def load_outputs_w_logprobs(filename: "StrPath") -> OutputsLogprobs:
     with open(filename, "rb") as f:
         json_data = json.load(f)
 
-    return [(tokens, text,
-             [{int(k): Logprob(**v)
-               for k, v in token_logprobs.items()}
-              for token_logprobs in logprobs])
-            for tokens, text, logprobs in json_data]
+    return [(tokens, text, [{
+        int(k): Logprob(**v)
+        for k, v in token_logprobs.items()
+    } for token_logprobs in logprobs]) for tokens, text, logprobs in json_data]
 
 
 @large_gpu_test(min_gb=80)
diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py
index bf0d454ad511c..1072697ecf5cc 100644
--- a/tests/quantization/test_compressed_tensors.py
+++ b/tests/quantization/test_compressed_tensors.py
@@ -314,9 +314,9 @@ def check_model(model):
 
 
 @pytest.mark.skip(reason="2of4 sparse w16a16 CUTLASS produces bad output.")
-@pytest.mark.skipif(not sparse_cutlass_supported(),
-                    reason="2of4 Sparse is not yet supported on this GPU type."
-                    )
+@pytest.mark.skipif(
+    not sparse_cutlass_supported(),
+    reason="2of4 Sparse is not yet supported on this GPU type.")
 @pytest.mark.parametrize(
     "args_2of4",
     [("nm-testing/TinyLlama-1.1B-Chat-v1.0-2of4-Sparse-Dense-Compressor")])
diff --git a/tests/samplers/test_rejection_sampler.py b/tests/samplers/test_rejection_sampler.py
index 397fa2cc85821..dcb1b27bff37f 100644
--- a/tests/samplers/test_rejection_sampler.py
+++ b/tests/samplers/test_rejection_sampler.py
@@ -23,16 +23,17 @@ def mock_causal_accepted_tensor(
     """
     batch_size = last_accepted_indices.shape[0]
 
-    accepted = (torch.arange(k).expand(batch_size, k) <=
-                last_accepted_indices.unsqueeze(-1).broadcast_to(
+    accepted = (torch.arange(k).expand(batch_size, k)
+                <= last_accepted_indices.unsqueeze(-1).broadcast_to(
                     batch_size, k))
 
     # Sprinkle accepted values after the contiguous initial accepted values.
     # This replicates the behavior of rejection sampling, which may "accept"
     # a token that cannot be accepted because of causality.
-    sprinkle_candidates = (
-        torch.arange(k).expand(batch_size, k) >
-        last_accepted_indices.unsqueeze(-1).broadcast_to(batch_size, k) + 1)
+    sprinkle_candidates = (torch.arange(k).expand(
+        batch_size,
+        k) > last_accepted_indices.unsqueeze(-1).broadcast_to(batch_size, k) +
+                           1)
     sprinkle = torch.rand(batch_size, k) > 0.5
     accepted[sprinkle_candidates] = sprinkle[sprinkle_candidates]
     return accepted
@@ -445,8 +446,8 @@ def test_rejection_sampling_approximates_target_distribution(
         distance_wrt_reference)
 
     expected_improvement_multiplier = 20
-    assert (relative_change_in_distance_wrt_target >
-            relative_change_in_distance_wrt_reference *
+    assert (relative_change_in_distance_wrt_target
+            > relative_change_in_distance_wrt_reference *
             expected_improvement_multiplier)
 
 
diff --git a/tools/report_build_time_ninja.py b/tools/report_build_time_ninja.py
index 51ad2adc74fe1..9dc19f5fd4cdd 100644
--- a/tools/report_build_time_ninja.py
+++ b/tools/report_build_time_ninja.py
@@ -274,8 +274,9 @@ def SummarizeEntries(entries, extra_step_types):
     print('    {:.1f} s weighted time ({:.1f} s elapsed time sum, {:1.1f}x '
           'parallelism)'.format(length, total_cpu_time,
                                 total_cpu_time * 1.0 / length))
-    print('    %d build steps completed, average of %1.2f/s' %
-          (len(entries), len(entries) / (length)))
+    print('    {} build steps completed, average of {:1.2f}/s'.format(
+        len(entries),
+        len(entries) / (length)))
 
 
 def main():
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index 440bc52012ab7..85c1121ed6ff8 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -820,8 +820,8 @@ def scaled_int8_quant(
     if scale is not None:
         # static-per-tensor quantization.
         assert symmetric == (
-            azp is
-            None), "azp must only be provided for asymmetric quantization."
+            azp
+            is None), "azp must only be provided for asymmetric quantization."
         torch.ops._C.static_scaled_int8_quant(output, input, scale, azp)
         return output, scale, azp
 
diff --git a/vllm/attention/ops/prefix_prefill.py b/vllm/attention/ops/prefix_prefill.py
index e2f2b66dfc90c..ec3c8459c43ef 100644
--- a/vllm/attention/ops/prefix_prefill.py
+++ b/vllm/attention/ops/prefix_prefill.py
@@ -219,8 +219,8 @@ def _fwd_kernel(
                           float("-inf"))
             if SLIDING_WINDOW > 0:
                 qk = tl.where(
-                    offs_m[:, None] -
-                    (start_n + offs_n[None, :]) < SLIDING_WINDOW, qk, -10000)
+                    offs_m[:, None] - (start_n + offs_n[None, :])
+                    < SLIDING_WINDOW, qk, -10000)
 
             # -- compute m_ij, p, l_ij
             m_ij = tl.max(qk, 1)
@@ -324,10 +324,10 @@ def _fwd_kernel_flash_attn_v2(
             (cur_batch_in_all_start_index + offs_m[:, None]) * stride_qbs +
             cur_head * stride_qh + offs_d[None, :] * stride_qd)
 
-        q = tl.load(
-            Q + off_q,
-            mask=offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len,
-            other=0.0)
+        q = tl.load(Q + off_q,
+                    mask=offs_m[:, None]
+                    < cur_batch_seq_len - cur_batch_ctx_len,
+                    other=0.0)
 
         # # initialize pointer to m and l
         m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
@@ -402,8 +402,8 @@ def _fwd_kernel_flash_attn_v2(
             # -- compute qk ----
             k = tl.load(k_ptrs +
                         (cur_batch_in_all_start_index + start_n) * stride_kbs,
-                        mask=(start_n + offs_n[None, :]) <
-                        cur_batch_seq_len - cur_batch_ctx_len,
+                        mask=(start_n + offs_n[None, :])
+                        < cur_batch_seq_len - cur_batch_ctx_len,
                         other=0.0)
 
             qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
@@ -430,8 +430,8 @@ def _fwd_kernel_flash_attn_v2(
             # update acc
             v = tl.load(v_ptrs +
                         (cur_batch_in_all_start_index + start_n) * stride_vbs,
-                        mask=(start_n + offs_n[:, None]) <
-                        cur_batch_seq_len - cur_batch_ctx_len,
+                        mask=(start_n + offs_n[:, None])
+                        < cur_batch_seq_len - cur_batch_ctx_len,
                         other=0.0)
 
             p = p.to(v.dtype)
@@ -639,8 +639,8 @@ def _fwd_kernel_alibi(
             k = tl.load(k_ptrs +
                         (cur_batch_in_all_start_index + start_n) * stride_kbs,
                         mask=dim_mask[:, None] &
-                        ((start_n + offs_n[None, :]) <
-                         cur_batch_seq_len - cur_batch_ctx_len),
+                        ((start_n + offs_n[None, :])
+                         < cur_batch_seq_len - cur_batch_ctx_len),
                         other=0.0)
 
             qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
@@ -677,8 +677,8 @@ def _fwd_kernel_alibi(
             v = tl.load(v_ptrs +
                         (cur_batch_in_all_start_index + start_n) * stride_vbs,
                         mask=dim_mask[None, :] &
-                        ((start_n + offs_n[:, None]) <
-                         cur_batch_seq_len - cur_batch_ctx_len),
+                        ((start_n + offs_n[:, None])
+                         < cur_batch_seq_len - cur_batch_ctx_len),
                         other=0.0)
             p = p.to(v.dtype)
 
diff --git a/vllm/attention/ops/triton_flash_attention.py b/vllm/attention/ops/triton_flash_attention.py
index f94211116a746..ef04603f22b6e 100644
--- a/vllm/attention/ops/triton_flash_attention.py
+++ b/vllm/attention/ops/triton_flash_attention.py
@@ -627,8 +627,8 @@ def attn_fwd(
                                         causal_start_idx,
                                         dtype=tl.int32)
             mask_m_offsets = start_m_idx + tl.arange(0, BLOCK_M)
-            out_ptrs_mask = (mask_m_offsets[:, None] >=
-                             out_mask_boundary[None, :])
+            out_ptrs_mask = (mask_m_offsets[:, None]
+                             >= out_mask_boundary[None, :])
             z = 0.0
             acc = tl.where(out_ptrs_mask, acc, z.to(acc.type.element_ty))
     # write back LSE
diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py
index 81ea6eefb5410..1376274d57777 100644
--- a/vllm/attention/selector.py
+++ b/vllm/attention/selector.py
@@ -1,6 +1,6 @@
 import os
 from contextlib import contextmanager
-from functools import lru_cache
+from functools import cache
 from typing import Generator, Optional, Type
 
 import torch
@@ -100,7 +100,7 @@ def get_attn_backend(
     )
 
 
-@lru_cache(maxsize=None)
+@cache
 def _cached_get_attn_backend(
     head_size: int,
     dtype: torch.dtype,
diff --git a/vllm/config.py b/vllm/config.py
index 7ab632d7e3667..d7c9311ae3cb0 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -67,7 +67,8 @@
 
 _TASK_RUNNER: Dict[_ResolvedTask, RunnerType] = {
     task: runner
-    for runner, tasks in _RUNNER_TASKS.items() for task in tasks
+    for runner, tasks in _RUNNER_TASKS.items()
+    for task in tasks
 }
 
 HfOverrides = Union[Dict[str, Any], Callable[[PretrainedConfig],
@@ -1976,8 +1977,8 @@ def _verify_args(self) -> None:
                              "typical_acceptance_sampler.")
 
         if (self.draft_token_acceptance_method != 'rejection_sampler'
-                and self.draft_token_acceptance_method !=
-                'typical_acceptance_sampler'):
+                and self.draft_token_acceptance_method
+                != 'typical_acceptance_sampler'):
             raise ValueError(
                 "Expected draft_token_acceptance_method to be either "
                 "rejection_sampler or typical_acceptance_sampler. Instead it "
diff --git a/vllm/core/block/common.py b/vllm/core/block/common.py
index c03b5932eafb6..115f663e4ad34 100644
--- a/vllm/core/block/common.py
+++ b/vllm/core/block/common.py
@@ -34,9 +34,10 @@ class RefCounter(RefCounterProtocol):
 
     def __init__(self, all_block_indices: Iterable[BlockId]):
         deduped = set(all_block_indices)
-        self._refcounts: Dict[BlockId,
-                              RefCount] = {index: 0
-                                           for index in deduped}
+        self._refcounts: Dict[BlockId, RefCount] = {
+            index: 0
+            for index in deduped
+        }
 
     def incr(self, block_id: BlockId) -> RefCount:
         assert block_id in self._refcounts
diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py
index 62a5f0bda061a..2d6a132ed555b 100644
--- a/vllm/core/block_manager.py
+++ b/vllm/core/block_manager.py
@@ -136,8 +136,8 @@ def can_allocate(self,
             device=Device.GPU)
 
         # Use watermark to avoid frequent cache eviction.
-        if (self.num_total_gpu_blocks - num_required_blocks <
-                self.watermark_blocks):
+        if (self.num_total_gpu_blocks - num_required_blocks
+                < self.watermark_blocks):
             return AllocStatus.NEVER
         if num_free_gpu_blocks - num_required_blocks >= self.watermark_blocks:
             return AllocStatus.OK
diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py
index b1630b34947bd..2bb961481e5fe 100644
--- a/vllm/core/scheduler.py
+++ b/vllm/core/scheduler.py
@@ -988,8 +988,8 @@ def _schedule_prefills(
                     waiting_queue.popleft()
                     continue
 
-            if (budget.num_batched_tokens >=
-                    self.scheduler_config.max_num_batched_tokens):
+            if (budget.num_batched_tokens
+                    >= self.scheduler_config.max_num_batched_tokens):
                 # We've reached the budget limit - since there might be
                 # continuous prefills in the running queue, we should break
                 # to avoid scheduling any new prefills.
@@ -1096,8 +1096,8 @@ def _schedule_default(self) -> SchedulerOutputs:
                     running_scheduled.swapped_out) == 0:
                 swapped_in = self._schedule_swapped(budget, curr_loras)
 
-        assert (budget.num_batched_tokens <=
-                self.scheduler_config.max_num_batched_tokens)
+        assert (budget.num_batched_tokens
+                <= self.scheduler_config.max_num_batched_tokens)
         assert budget.num_curr_seqs <= self.scheduler_config.max_num_seqs
 
         # Update waiting requests.
@@ -1189,8 +1189,8 @@ def _schedule_chunked_prefill(self) -> SchedulerOutputs:
                                            curr_loras,
                                            enable_chunking=True)
 
-        assert (budget.num_batched_tokens <=
-                self.scheduler_config.max_num_batched_tokens)
+        assert (budget.num_batched_tokens
+                <= self.scheduler_config.max_num_batched_tokens)
         assert budget.num_curr_seqs <= self.scheduler_config.max_num_seqs
 
         # Update waiting requests.
@@ -1358,8 +1358,8 @@ def schedule(
                 # NOTE: We use get_len instead of get_prompt_len because when
                 # a sequence is preempted, prefill includes previous generated
                 # output tokens.
-                if (token_chunk_size + num_computed_tokens <
-                        seqs[0].data.get_len()):
+                if (token_chunk_size + num_computed_tokens
+                        < seqs[0].data.get_len()):
                     do_sample = False
 
             # It assumes the scheduled_seq_groups is ordered by
@@ -1625,10 +1625,9 @@ def _passed_delay(self, now: float) -> bool:
         if self.scheduler_config.delay_factor > 0 and self.waiting:
             earliest_arrival_time = min(
                 [e.metrics.arrival_time for e in self.waiting])
-            passed_delay = (
-                (now - earliest_arrival_time) >
-                (self.scheduler_config.delay_factor * self.last_prompt_latency)
-                or not self.running)
+            passed_delay = ((now - earliest_arrival_time)
+                            > (self.scheduler_config.delay_factor *
+                               self.last_prompt_latency) or not self.running)
         else:
             passed_delay = True
         return passed_delay
diff --git a/vllm/distributed/device_communicators/shm_broadcast.py b/vllm/distributed/device_communicators/shm_broadcast.py
index 4ced991f62f66..268edc0925fe8 100644
--- a/vllm/distributed/device_communicators/shm_broadcast.py
+++ b/vllm/distributed/device_communicators/shm_broadcast.py
@@ -352,8 +352,8 @@ def acquire_write(self, timeout: Optional[float] = None):
                     sched_yield()
 
                     # if we wait for a long time, log a message
-                    if (time.monotonic() - start_time >
-                            VLLM_RINGBUFFER_WARNING_INTERVAL * n_warning):
+                    if (time.monotonic() - start_time
+                            > VLLM_RINGBUFFER_WARNING_INTERVAL * n_warning):
                         logger.debug("No available block found in %s second. ",
                                      VLLM_RINGBUFFER_WARNING_INTERVAL)
                         n_warning += 1
@@ -410,8 +410,8 @@ def acquire_read(self, timeout: Optional[float] = None):
                     sched_yield()
 
                     # if we wait for a long time, log a message
-                    if (time.monotonic() - start_time >
-                            VLLM_RINGBUFFER_WARNING_INTERVAL * n_warning):
+                    if (time.monotonic() - start_time
+                            > VLLM_RINGBUFFER_WARNING_INTERVAL * n_warning):
                         logger.debug("No available block found in %s second. ",
                                      VLLM_RINGBUFFER_WARNING_INTERVAL)
                         n_warning += 1
diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index ffdf8b0f48087..7fe9b68d4b9e8 100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -1014,8 +1014,8 @@ def initialize_model_parallel(
     backend = backend or torch.distributed.get_backend(
         get_world_group().device_group)
 
-    if (world_size !=
-            tensor_model_parallel_size * pipeline_model_parallel_size):
+    if (world_size
+            != tensor_model_parallel_size * pipeline_model_parallel_size):
         raise RuntimeError(
             f"world_size ({world_size}) is not equal to "
             f"tensor_model_parallel_size ({tensor_model_parallel_size}) x "
@@ -1069,8 +1069,8 @@ def ensure_kv_transfer_initialized(vllm_config: "VllmConfig") -> None:
         return
 
     if all([
-            vllm_config.kv_transfer_config.need_kv_parallel_group,
-            _KV_TRANSFER is None
+            vllm_config.kv_transfer_config.need_kv_parallel_group, _KV_TRANSFER
+            is None
     ]):
         _KV_TRANSFER = kv_transfer.KVTransferAgent(
             rank=get_world_group().rank,
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index beedf5d16ab86..723d6e9085806 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -3,7 +3,7 @@
 import json
 from abc import ABC, abstractmethod
 from collections import defaultdict, deque
-from functools import lru_cache, partial
+from functools import cache, lru_cache, partial
 from pathlib import Path
 from typing import (Any, Awaitable, Callable, Dict, Generic, Iterable, List,
                     Literal, Optional, Tuple, TypeVar, Union, cast)
@@ -377,7 +377,7 @@ def allowed_local_media_path(self):
         return self._model_config.allowed_local_media_path
 
     @staticmethod
-    @lru_cache(maxsize=None)
+    @cache
     def _cached_token_str(tokenizer: AnyTokenizer, token_index: int) -> str:
         return tokenizer.decode(token_index)
 
diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py
index 2c9c20caf8119..b0179f78bd635 100644
--- a/vllm/entrypoints/openai/serving_completion.py
+++ b/vllm/entrypoints/openai/serving_completion.py
@@ -522,11 +522,10 @@ def _create_completion_logprobs(
                 out_top_logprobs.append({
                     # Convert float("-inf") to the
                     # JSON-serializable float that OpenAI uses
-                    self._get_decoded_token(
-                        top_lp[1],
-                        top_lp[0],
-                        tokenizer,
-                        return_as_token_id=self.return_tokens_as_token_ids):
+                    self._get_decoded_token(top_lp[1],
+                                            top_lp[0],
+                                            tokenizer,
+                                            return_as_token_id=self.return_tokens_as_token_ids):
                     max(top_lp[1].logprob, -9999.0)
                     for i, top_lp in enumerate(step_top_logprobs.items())
                     if num_output_top_logprobs >= i
diff --git a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py
index 94db8f379e33a..93e357e8b9f21 100644
--- a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py
@@ -62,8 +62,8 @@ def extract_tool_calls(
                 start_of_json = match.end()
                 # end_index == the start of the next function call
                 # (if exists)
-                next_function_call_start = (matches[i + 1].start()
-                                            if i + 1 < len(matches) else None)
+                next_function_call_start = (matches[i + 1].start() if i +
+                                            1 < len(matches) else None)
 
                 raw_function_calls.append(
                     dec.raw_decode(
diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py
index e6f26d2b74b2f..cdd439d0385b6 100644
--- a/vllm/lora/layers.py
+++ b/vllm/lora/layers.py
@@ -220,8 +220,10 @@ def set_lora(
                                 lora_b.T, non_blocking=True)
         if embeddings_tensor is not None:
             self.embeddings_tensors[
-                index, :embeddings_tensor.shape[0], :embeddings_tensor.
-                shape[1], ].copy_(embeddings_tensor, non_blocking=True)
+                index,
+                :embeddings_tensor.shape[0],
+                :embeddings_tensor.shape[1],
+            ].copy_(embeddings_tensor, non_blocking=True)
             if self.embeddings_slice is not None:
                 # TODO(yard1): Optimize this copy, we don't need to copy
                 # everything, just the modified part
@@ -1024,8 +1026,10 @@ def set_lora(
                                 lora_b.T, non_blocking=True)
         if embeddings_tensor is not None:
             self.embeddings_tensors[
-                index, :embeddings_tensor.shape[0], :embeddings_tensor.
-                shape[1], ] = embeddings_tensor
+                index,
+                :embeddings_tensor.shape[0],
+                :embeddings_tensor.shape[1],
+            ] = embeddings_tensor
 
     def _get_logits(
         self,
diff --git a/vllm/lora/models.py b/vllm/lora/models.py
index b77b6b3d72ff4..2e04cb902d009 100644
--- a/vllm/lora/models.py
+++ b/vllm/lora/models.py
@@ -75,8 +75,9 @@ def __init__(
         # Scaling factor for long context lora model. None if it is not
         # fine tuned for the long context.
         self.scaling_factor = scaling_factor
-        assert (lora_model_id >
-                0), f"a valid lora id should be greater than 0, got {self.id}"
+        assert (
+            lora_model_id
+            > 0), f"a valid lora id should be greater than 0, got {self.id}"
         self.rank = rank
         self.loras: Dict[str, LoRALayerWeights] = loras
 
diff --git a/vllm/lora/ops/triton_ops/sgmv_expand.py b/vllm/lora/ops/triton_ops/sgmv_expand.py
index 8af44b703810b..48fa5cd63741f 100644
--- a/vllm/lora/ops/triton_ops/sgmv_expand.py
+++ b/vllm/lora/ops/triton_ops/sgmv_expand.py
@@ -136,9 +136,8 @@ def _sgmv_expand_kernel(
     c_ptr = (out_ptr + offset_cm[:, None] * output_d0_stride +
              offset_cn[None, :] * output_d1_stride)
     M = tl.load(seq_lens + cur_batch)
-    c_mask = (offset_cm[:, None] <
-              (cur_seq_start + M)) & (offset_cn[None, :] <
-                                      (cur_slice_start + curr_N))
+    c_mask = (offset_cm[:, None] < (cur_seq_start + M)) & (
+        offset_cn[None, :] < (cur_slice_start + curr_N))
     if ADD_INPUTS:
         tiled_out = tl.load(c_ptr, mask=c_mask)
         tiled_c += tiled_out
diff --git a/vllm/lora/ops/triton_ops/sgmv_shrink.py b/vllm/lora/ops/triton_ops/sgmv_shrink.py
index 3d2ebe8286f56..9bb35e8ffd323 100644
--- a/vllm/lora/ops/triton_ops/sgmv_shrink.py
+++ b/vllm/lora/ops/triton_ops/sgmv_shrink.py
@@ -114,8 +114,8 @@ def _sgmv_shrink_kernel(
                    slice_id * output_d0_stride)
     c_ptr = cur_out_ptr + offset_cm[:, None] * output_d1_stride + offset_cn[
         None, :] * output_d2_stride
-    c_mask = (offset_cm[:, None] <
-              (cur_seq_start + M)) & (offset_cn[None, :] < N)
+    c_mask = (offset_cm[:, None] < (cur_seq_start + M)) & (offset_cn[None, :]
+                                                           < N)
     accumulator *= scaling
     # handles write-back with reduction-splitting
     if SPLIT_K == 1:
diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py
index b04612a9b00d9..915bdc4778929 100644
--- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py
+++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py
@@ -73,12 +73,12 @@ def _transform_param(self, layer: torch.nn.Module, name: Optional[str],
                 torch.nn.Parameter(new_param.data, requires_grad=False))
 
     def _get_weight_params(
-            self, layer: torch.nn.Module
-    ) -> Tuple[torch.Tensor,  # w_q
-               torch.Tensor,  # w_s
-               Optional[torch.Tensor],  # w_zp, 
-               Optional[torch.Tensor]  # w_gidx
-               ]:
+            self, layer: torch.nn.Module) -> Tuple[
+                torch.Tensor,  # w_q
+                torch.Tensor,  # w_s
+                Optional[torch.Tensor],  # w_zp, 
+                Optional[torch.Tensor]  # w_gidx
+            ]:
         return (
             getattr(layer, self.w_q_name),
             getattr(layer, self.w_s_name),
diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py
index 75cf91f191136..c4a83b4faafe6 100644
--- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py
+++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py
@@ -48,13 +48,13 @@ def apply_weights(self,
         raise NotImplementedError
 
     def _get_weight_params(
-            self, layer: torch.nn.Module
-    ) -> Tuple[torch.Tensor,  # weight
-               torch.Tensor,  # weight_scale
-               Optional[torch.Tensor],  # input_scale, 
-               Optional[torch.Tensor],  # input_zp
-               Optional[torch.Tensor],  # azp_adj
-               ]:
+            self, layer: torch.nn.Module) -> Tuple[
+                torch.Tensor,  # weight
+                torch.Tensor,  # weight_scale
+                Optional[torch.Tensor],  # input_scale, 
+                Optional[torch.Tensor],  # input_zp
+                Optional[torch.Tensor],  # azp_adj
+            ]:
         return (
             getattr(layer, self.w_q_name),
             getattr(layer, self.w_s_name),
diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
index b6882cc7c837c..43b1997019107 100644
--- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
@@ -72,9 +72,10 @@ def block_quant_to_tensor_quant(
     x_dq_block = x_q_block.to(torch.float32)
 
     x_dq_block_tiles = [[
-        x_dq_block[j * block_n:min((j + 1) * block_n, n),
-                   i * block_k:min((i + 1) * block_k, k), ]
-        for i in range(k_tiles)
+        x_dq_block[
+            j * block_n:min((j + 1) * block_n, n),
+            i * block_k:min((i + 1) * block_k, k),
+        ] for i in range(k_tiles)
     ] for j in range(n_tiles)]
 
     for i in range(k_tiles):
diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
index 7cdce67cf1677..9977804188a50 100644
--- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
@@ -73,8 +73,8 @@ def requantize_with_max_scale(
     # from disk in this case. Skip requantization in this case (since)
     # we already are quantized with the single scale.
     # * Sample Model: nm-testing/Phi-3-mini-128k-instruct-FP8
-    unfused_module_in_checkpoint = (weight_scale[-1] > torch.finfo(
-        torch.float8_e4m3fn).min)
+    unfused_module_in_checkpoint = (weight_scale[-1]
+                                    > torch.finfo(torch.float8_e4m3fn).min)
 
     # If unfused checkpoint, need requanize with the single scale.
     if unfused_module_in_checkpoint:
diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py
index c2d12c466ba45..8dc26309d754e 100644
--- a/vllm/model_executor/layers/sampler.py
+++ b/vllm/model_executor/layers/sampler.py
@@ -716,9 +716,10 @@ def _sample_with_torch(
       tensors required for Pythonization
     '''
 
-    categorized_seq_group_ids: Dict[SamplingType,
-                                    List[int]] = {t: []
-                                                  for t in SamplingType}
+    categorized_seq_group_ids: Dict[SamplingType, List[int]] = {
+        t: []
+        for t in SamplingType
+    }
     categorized_sample_indices = sampling_metadata.categorized_sample_indices
     for i, seq_group in enumerate(sampling_metadata.seq_groups):
         sampling_params = seq_group.sampling_params
diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py
index 3eb5c39ccf580..f230efacacdbb 100644
--- a/vllm/model_executor/layers/vocab_parallel_embedding.py
+++ b/vllm/model_executor/layers/vocab_parallel_embedding.py
@@ -115,17 +115,17 @@ def num_elements_padded(self) -> int:
 
     def __post_init__(self):
         # sanity checks
-        assert (self.padded_org_vocab_start_index <=
-                self.padded_org_vocab_end_index)
-        assert (self.padded_added_vocab_start_index <=
-                self.padded_added_vocab_end_index)
+        assert (self.padded_org_vocab_start_index
+                <= self.padded_org_vocab_end_index)
+        assert (self.padded_added_vocab_start_index
+                <= self.padded_added_vocab_end_index)
 
         assert self.org_vocab_start_index <= self.org_vocab_end_index
         assert self.added_vocab_start_index <= self.added_vocab_end_index
 
         assert self.org_vocab_start_index <= self.padded_org_vocab_start_index
-        assert (self.added_vocab_start_index <=
-                self.padded_added_vocab_start_index)
+        assert (self.added_vocab_start_index
+                <= self.padded_added_vocab_start_index)
         assert self.org_vocab_end_index <= self.padded_org_vocab_end_index
         assert self.added_vocab_end_index <= self.padded_added_vocab_end_index
 
@@ -141,8 +141,8 @@ def get_masked_input_and_mask(
         added_vocab_end_index: int) -> Tuple[torch.Tensor, torch.Tensor]:
     # torch.compile will fuse all of the pointwise ops below
     # into a single kernel, making it very fast
-    org_vocab_mask = (input_ >= org_vocab_start_index) & (input_ <
-                                                          org_vocab_end_index)
+    org_vocab_mask = (input_ >= org_vocab_start_index) & (
+        input_ < org_vocab_end_index)
     added_vocab_mask = (input_ >= added_vocab_start_index) & (
         input_ < added_vocab_end_index)
     added_offset = added_vocab_start_index - (
diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index 527b4307f3670..712266ee42639 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -1121,8 +1121,9 @@ def _load_weights(self, model_config: ModelConfig,
                 # from being incorrectly identified as being present in
                 # 'vpm.encoder.layers.0.self_attn.qkv_proj.weight
                 shard_pos = quant_param_name.find(shard_name)
-                can_correct_rename = (shard_pos > 0) and (
-                    quant_param_name[shard_pos - 1] == ".")
+                can_correct_rename = (shard_pos
+                                      > 0) and (quant_param_name[shard_pos - 1]
+                                                == ".")
                 # If the quant_param_name is packed, it won't occur in the
                 # param_dict before renaming.
                 new_quant_param_name = quant_param_name.replace(
diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py
index e359aef9dcb7f..9266ca75ddaac 100644
--- a/vllm/model_executor/model_loader/tensorizer.py
+++ b/vllm/model_executor/model_loader/tensorizer.py
@@ -298,8 +298,8 @@ def _resize_lora_embeddings(self):
         to allow for adapter added tokens."""
         for child in self.model.modules():
             if (isinstance(child, VocabParallelEmbedding)
-                    and child.weight.shape[0] <
-                    child.num_embeddings_per_partition):
+                    and child.weight.shape[0]
+                    < child.num_embeddings_per_partition):
                 new_weight = torch.empty(child.num_embeddings_per_partition,
                                          child.embedding_dim,
                                          dtype=child.weight.dtype,
diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py
index 6de0c866bc2f0..b23aba829c549 100644
--- a/vllm/model_executor/models/gemma.py
+++ b/vllm/model_executor/models/gemma.py
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only Gemma model compatible with HuggingFace weights."""
-from functools import lru_cache
+from functools import cache
 from typing import Iterable, List, Optional, Set, Tuple, Union
 
 import torch
@@ -48,7 +48,7 @@
 logger = init_logger(__name__)
 
 
-@lru_cache(maxsize=None)
+@cache
 def _get_gemma_act_fn(
     hidden_act: Optional[str],
     hidden_activation: Optional[str],
diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py
index b518a0a6cbdee..cdf9414d5949c 100644
--- a/vllm/model_executor/models/granitemoe.py
+++ b/vllm/model_executor/models/granitemoe.py
@@ -429,10 +429,10 @@ def load_weights(self, weights: Iterable[Tuple[str,
                 for e in range(p.size(0)):
                     w1_name = n.replace(
                         '.block_sparse_moe.input_linear.weight',
-                        ".block_sparse_moe.experts.%d.w1.weight" % e)
+                        f".block_sparse_moe.experts.{e}.w1.weight")
                     w3_name = n.replace(
                         '.block_sparse_moe.input_linear.weight',
-                        ".block_sparse_moe.experts.%d.w3.weight" % e)
+                        f".block_sparse_moe.experts.{e}.w3.weight")
                     w1_param, w3_param = p[e].chunk(2, dim=0)
                     assert w1_name not in new_weights
                     assert w3_name not in new_weights
@@ -442,7 +442,7 @@ def load_weights(self, weights: Iterable[Tuple[str,
                 for e in range(p.size(0)):
                     w2_name = n.replace(
                         '.block_sparse_moe.output_linear.weight',
-                        ".block_sparse_moe.experts.%d.w2.weight" % e)
+                        f".block_sparse_moe.experts.{e}.w2.weight")
                     w2_param = p[e]
                     assert w2_name not in new_weights
                     new_weights[w2_name] = w2_param
diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py
index 61baa8e588d74..e15ac84a6049b 100644
--- a/vllm/model_executor/models/mllama.py
+++ b/vllm/model_executor/models/mllama.py
@@ -1365,8 +1365,8 @@ def forward(
         # For 1) text-only prefill and decode, 2) image-present decode.
         if image_inputs is None:
             full_text_row_masked_out_mask = (
-                attn_metadata.encoder_seq_lens_tensor != 0).reshape(-1, 1).to(
-                    input_ids.device)
+                attn_metadata.encoder_seq_lens_tensor
+                != 0).reshape(-1, 1).to(input_ids.device)
             skip_cross_attention = max(attn_metadata.encoder_seq_lens) == 0
 
         # For image-present prefill.
diff --git a/vllm/model_executor/models/mlp_speculator.py b/vllm/model_executor/models/mlp_speculator.py
index d49da5f29aa14..f1d796ca26a16 100644
--- a/vllm/model_executor/models/mlp_speculator.py
+++ b/vllm/model_executor/models/mlp_speculator.py
@@ -81,8 +81,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
 
         if self.tie_weights:
             assert (
-                self.n_predict >
-                1), "You cannot tie weights between stages when only 1 exists"
+                self.n_predict > 1
+            ), "You cannot tie weights between stages when only 1 exists"
             embedding = VocabParallelEmbedding(
                 config.vocab_size,
                 self.inner_dim,
diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py
index 881c09ea9db99..6367b770a0aff 100644
--- a/vllm/model_executor/models/phimoe.py
+++ b/vllm/model_executor/models/phimoe.py
@@ -167,8 +167,8 @@ def sparsemixer(scores, jitter_eps=0.01):
         # compute mask for sparsity
         mask_logits_threshold, max_ind = scores.max(dim=-1, keepdim=True)
         factor = scores.abs().clamp(min=mask_logits_threshold)
-        mask_logits_threshold = (
-            (mask_logits_threshold - scores) / factor) > (2 * jitter_eps)
+        mask_logits_threshold = ((mask_logits_threshold - scores) /
+                                 factor) > (2 * jitter_eps)
 
     # apply mask
     masked_gates = scores.masked_fill(mask_logits_threshold, float("-inf"))
@@ -192,8 +192,8 @@ def sparsemixer(scores, jitter_eps=0.01):
         mask_logits_threshold, max_ind = masked_scores.max(dim=-1,
                                                            keepdim=True)
         factor = scores.abs().clamp(min=mask_logits_threshold)
-        mask_logits_threshold = (
-            (mask_logits_threshold - scores) / factor) > (2 * jitter_eps)
+        mask_logits_threshold = ((mask_logits_threshold - scores) /
+                                 factor) > (2 * jitter_eps)
 
     # apply mask
     masked_gates_top2 = masked_scores.masked_fill(mask_logits_threshold,
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index 8d2719ca2d00d..8d71b19060bf4 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -462,7 +462,8 @@ def is_hybrid_model(
 
 
 ModelRegistry = _ModelRegistry({
-    model_arch: _LazyRegisteredModel(
+    model_arch:
+    _LazyRegisteredModel(
         module_name=f"vllm.model_executor.models.{mod_relname}",
         class_name=cls_name,
     )
diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py
index d577e545a473b..605a0ecf4e0a9 100644
--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -333,10 +333,10 @@ def forward(
         return hidden_states
 
 
-@MULTIMODAL_REGISTRY.register_processor(UltravoxMultiModalProcessor,
-                                        info=UltravoxProcessingInfo,
-                                        dummy_inputs=UltravoxDummyInputsBuilder
-                                        )
+@MULTIMODAL_REGISTRY.register_processor(
+    UltravoxMultiModalProcessor,
+    info=UltravoxProcessingInfo,
+    dummy_inputs=UltravoxDummyInputsBuilder)
 class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP):
 
     hf_to_vllm_mapper = WeightsMapper(
diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py
index 43b3c973c97b8..01a232fdc76de 100644
--- a/vllm/model_executor/models/utils.py
+++ b/vllm/model_executor/models/utils.py
@@ -599,9 +599,8 @@ def make_empty_intermediate_tensors(
         device: torch.device,
     ) -> IntermediateTensors:
         return IntermediateTensors({
-            key: torch.zeros((batch_size, hidden_size),
-                             dtype=dtype,
-                             device=device)
+            key:
+            torch.zeros((batch_size, hidden_size), dtype=dtype, device=device)
             for key in keys
         })
 
diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py
index 1df8f84ed4093..61e8881b64f5d 100644
--- a/vllm/model_executor/sampling_metadata.py
+++ b/vllm/model_executor/sampling_metadata.py
@@ -166,7 +166,8 @@ def prepare(
             pin_memory=pin_memory,
         )
         categorized_sample_indices = {
-            t: async_tensor_h2d(
+            t:
+            async_tensor_h2d(
                 seq_ids,
                 dtype=torch.int,
                 target_device=device,
@@ -198,8 +199,12 @@ def _prepare_seq_groups(
     device: str,
     generators: Optional[Dict[str, torch.Generator]] = None,
     cache: Optional[SamplingMetadataCache] = None,
-) -> Tuple[List[SequenceGroupToSample], List[int], Dict[SamplingType,
-                                                        List[int]], int, ]:
+) -> Tuple[
+        List[SequenceGroupToSample],
+        List[int],
+        Dict[SamplingType, List[int]],
+        int,
+]:
     """Prepare sequence groups and indices for sampling.
 
     Args:
diff --git a/vllm/platforms/neuron.py b/vllm/platforms/neuron.py
index ead3dab05a6b1..23a7126fb05cf 100644
--- a/vllm/platforms/neuron.py
+++ b/vllm/platforms/neuron.py
@@ -38,8 +38,8 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         if parallel_config.world_size > 1:
             parallel_config.distributed_executor_backend = "uni"
 
-        assert (vllm_config.lora_config is
-                None), "LoRA is not supported for Neuron backend."
+        assert (vllm_config.lora_config
+                is None), "LoRA is not supported for Neuron backend."
         assert (not vllm_config.speculative_config
                 ), "Speculative decoding not yet supported for Neuron backend."
 
diff --git a/vllm/scalar_type.py b/vllm/scalar_type.py
index 9d711b0debcd8..20063a5b4b085 100644
--- a/vllm/scalar_type.py
+++ b/vllm/scalar_type.py
@@ -121,8 +121,8 @@ def _raw_min(self) -> Union[int, float]:
             min_raw = max_raw | sign_bit_double
             return struct.unpack('!d', struct.pack('!Q', min_raw))[0]
         else:
-            assert (not self.is_signed() or
-                    self.size_bits <= 64), "Cannot represent min as a int64_t"
+            assert (not self.is_signed() or self.size_bits
+                    <= 64), "Cannot represent min as a int64_t"
 
             if self.is_signed():
                 return -(1 << (self.size_bits - 1))
diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py
index 8e9802c7d333c..af1c4dfcebbc0 100644
--- a/vllm/spec_decode/spec_decode_worker.py
+++ b/vllm/spec_decode/spec_decode_worker.py
@@ -510,8 +510,8 @@ def _should_disable_all_speculation(
             self, execute_model_req: ExecuteModelRequest) -> bool:
         # When the batch size is too large, disable speculative decoding
         # to stop trading off throughput for latency.
-        return (execute_model_req.running_queue_size >=
-                self.disable_by_batch_size)
+        return (execute_model_req.running_queue_size
+                >= self.disable_by_batch_size)
 
     def _maybe_disable_speculative_tokens(
             self, disable_all_speculation: bool,
diff --git a/vllm/spec_decode/top1_proposer.py b/vllm/spec_decode/top1_proposer.py
index 5a7999a258b2d..6bf7587cdda19 100644
--- a/vllm/spec_decode/top1_proposer.py
+++ b/vllm/spec_decode/top1_proposer.py
@@ -104,11 +104,11 @@ def get_spec_proposals(
             sampler_transposed=transposed,
         )
 
-        proposals = SpeculativeProposals(
-            proposal_token_ids=proposal_tokens,
-            proposal_probs=proposal_probs,
-            proposal_lens=proposal_lens,
-            no_proposals=maybe_sampler_output is None)
+        proposals = SpeculativeProposals(proposal_token_ids=proposal_tokens,
+                                         proposal_probs=proposal_probs,
+                                         proposal_lens=proposal_lens,
+                                         no_proposals=maybe_sampler_output
+                                         is None)
         return proposals
 
     def _split_by_proposal_len(
diff --git a/vllm/spec_decode/util.py b/vllm/spec_decode/util.py
index da8706658d09a..c88820ab27b69 100644
--- a/vllm/spec_decode/util.py
+++ b/vllm/spec_decode/util.py
@@ -40,13 +40,15 @@ def get_sampled_token_logprobs(
     """
     num_steps, batch_size, vocab_size = logprob_tensor.shape
 
-    selected_logprobs = logprob_tensor[torch.arange(num_steps).unsqueeze(1),
-                                       torch.arange(batch_size),
-                                       sampled_token_ids, ]
+    selected_logprobs = logprob_tensor[
+        torch.arange(num_steps).unsqueeze(1),
+        torch.arange(batch_size),
+        sampled_token_ids,
+    ]
     expanded_selected_logprobs = selected_logprobs.unsqueeze(-1).expand(
         -1, -1, vocab_size)
-    sampled_token_ids_ranks = (logprob_tensor >
-                               expanded_selected_logprobs).sum(-1).add_(1)
+    sampled_token_ids_ranks = (logprob_tensor
+                               > expanded_selected_logprobs).sum(-1).add_(1)
 
     return sampled_token_ids_ranks, selected_logprobs
 
diff --git a/vllm/transformers_utils/configs/nemotron.py b/vllm/transformers_utils/configs/nemotron.py
index 93fec667d1cf3..1edf36329d83b 100644
--- a/vllm/transformers_utils/configs/nemotron.py
+++ b/vllm/transformers_utils/configs/nemotron.py
@@ -182,8 +182,8 @@ def _rope_scaling_validation(self):
         if self.rope_scaling is None:
             return
 
-        if not isinstance(self.rope_scaling,
-                          dict) or len(self.rope_scaling) != 2:
+        if not isinstance(self.rope_scaling, dict) or len(
+                self.rope_scaling) != 2:
             raise ValueError(
                 "`rope_scaling` must be a dictionary with two fields, "
                 f"`type` and `factor`, got {self.rope_scaling}")
diff --git a/vllm/utils.py b/vllm/utils.py
index 17bffd2846b46..15481fb06e08e 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -29,7 +29,7 @@
 from collections import OrderedDict, UserDict, defaultdict
 from collections.abc import Hashable, Iterable, Mapping
 from dataclasses import dataclass, field
-from functools import lru_cache, partial, wraps
+from functools import cache, lru_cache, partial, wraps
 from typing import (TYPE_CHECKING, Any, AsyncGenerator, Awaitable, Callable,
                     Dict, Generator, Generic, Iterator, List, Literal,
                     NamedTuple, Optional, Tuple, Type, TypeVar, Union,
@@ -352,7 +352,7 @@ def reset(self):
         self._index = 0
 
 
-@lru_cache(maxsize=None)
+@cache
 def get_max_shared_memory_bytes(gpu: int = 0) -> int:
     """Returns the maximum shared memory per thread block in bytes."""
     from vllm import _custom_ops as ops
@@ -697,7 +697,7 @@ def create_kv_caches_with_random(
     return key_caches, value_caches
 
 
-@lru_cache(maxsize=None)
+@cache
 def is_pin_memory_available() -> bool:
     from vllm.platforms import current_platform
     return current_platform.is_pin_memory_available()
@@ -886,7 +886,7 @@ def init_cached_hf_modules() -> None:
     init_hf_modules()
 
 
-@lru_cache(maxsize=None)
+@cache
 def find_library(lib_name: str) -> str:
     """
     Find the library file in the system.
@@ -1607,7 +1607,7 @@ def import_from_path(module_name: str, file_path: Union[str, os.PathLike]):
     return module
 
 
-@lru_cache(maxsize=None)
+@cache
 def get_vllm_optional_dependencies():
     metadata = importlib.metadata.metadata("vllm")
     requirements = metadata.get_all("Requires-Dist", [])
diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py
index de7fb1a698df6..7a88cc9433b32 100644
--- a/vllm/v1/core/scheduler.py
+++ b/vllm/v1/core/scheduler.py
@@ -247,8 +247,8 @@ def schedule(self) -> "SchedulerOutput":
                 token_budget -= num_new_tokens
                 request.status = RequestStatus.RUNNING
                 request.num_computed_tokens = num_computed_tokens
-                has_partial_request = (num_computed_tokens + num_new_tokens <
-                                       request.num_tokens)
+                has_partial_request = (num_computed_tokens + num_new_tokens
+                                       < request.num_tokens)
 
                 # Encoder-related.
                 if encoder_inputs_to_schedule:
diff --git a/vllm/v1/stats/common.py b/vllm/v1/stats/common.py
index 500bc356fc179..902800e0573bf 100644
--- a/vllm/v1/stats/common.py
+++ b/vllm/v1/stats/common.py
@@ -311,8 +311,8 @@ def output_token_latency_s_lst(self) -> List[float]:
             return []
         latency_s_lst = []
         for i in range(1, len(self.output_token_ts_s_lst)):
-            assert (self.output_token_ts_s_lst[i] >=
-                    self.output_token_ts_s_lst[i - 1])
+            assert (self.output_token_ts_s_lst[i]
+                    >= self.output_token_ts_s_lst[i - 1])
             latency_s = (self.output_token_ts_s_lst[i] -
                          self.output_token_ts_s_lst[i - 1])
             latency_s_lst.append(latency_s)
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 9d7e30079dfbb..a00c00c307335 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -205,7 +205,7 @@ def __init__(
 
     def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
         # Remove stopped requests from the cached states.
-        # Keep the states of the pre-empted requests.
+        # Keep the states of the preempted requests.
         for req_id in scheduler_output.finished_req_ids:
             self.requests.pop(req_id, None)
             self.encoder_cache.pop(req_id, None)
diff --git a/vllm/worker/hpu_worker.py b/vllm/worker/hpu_worker.py
index 3c570212625c4..aaf9cb40bf2aa 100644
--- a/vllm/worker/hpu_worker.py
+++ b/vllm/worker/hpu_worker.py
@@ -173,13 +173,13 @@ def execute_model(
                 cpu_fallback_ctx as cpu_fallback_local_metric:
                 output = LocalOrDistributedWorkerBase.execute_model(
                     self, execute_model_req)
-            if (log_graph_compilation and gc_local_metric.stats()[0][1] > 0
-                ) or log_graph_compilation_all:
+            if (log_graph_compilation and gc_local_metric.stats()[0][1]
+                    > 0) or log_graph_compilation_all:
                 msg = ("VLLM_HPU_STEP_GRAPH_COMPILATION: "
                        f"{gc_local_metric.stats()}, {input_stats}")
                 logger.warning(msg)
-            if (log_cpu_fallbacks and cpu_fallback_local_metric.stats()[0][1] >
-                    0) or log_cpu_fallbacks_all:
+            if (log_cpu_fallbacks and cpu_fallback_local_metric.stats()[0][1]
+                    > 0) or log_cpu_fallbacks_all:
                 msg = ("VLLM_HPU_STEP_CPU_FALLBACK: "
                        f"{cpu_fallback_local_metric.stats()}, {input_stats}")
                 logger.warning(msg)
diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py
index a3f648f4cc645..8749518284288 100644
--- a/vllm/worker/tpu_model_runner.py
+++ b/vllm/worker/tpu_model_runner.py
@@ -316,8 +316,8 @@ def warmup_model(
                     logger.info("batch_size: %d, seq_len: %d", batch_size,
                                 seq_len)
                     num_tokens = batch_size * seq_len
-                    if (num_tokens >=
-                            self.scheduler_config.max_num_batched_tokens):
+                    if (num_tokens
+                            >= self.scheduler_config.max_num_batched_tokens):
                         break
                     seq_len = seq_len * 2
             end = time.time()

From ddee88d0ff2757bdef98a83a9c78af1ea4559758 Mon Sep 17 00:00:00 2001
From: Liangfu Chen <liangfc@amazon.com>
Date: Mon, 27 Jan 2025 17:31:16 -0800
Subject: [PATCH 402/462] [Neuron][Kernel] NKI-based flash-attention kernel
 with paged KV cache (#11277)

Signed-off-by: Liangfu Chen <liangfc@amazon.com>
Co-authored-by: Jiangfei Duan <jfduan@outlook.com>
---
 .buildkite/run-neuron-test.sh        |   2 +-
 tests/neuron/test_prefix_prefill.py  | 456 ++++++++++++++++++
 vllm/attention/ops/nki_flash_attn.py | 669 +++++++++++++++++++++++++++
 3 files changed, 1126 insertions(+), 1 deletion(-)
 create mode 100644 tests/neuron/test_prefix_prefill.py
 create mode 100644 vllm/attention/ops/nki_flash_attn.py

diff --git a/.buildkite/run-neuron-test.sh b/.buildkite/run-neuron-test.sh
index 0590dad4f311f..1ad77cf50f612 100644
--- a/.buildkite/run-neuron-test.sh
+++ b/.buildkite/run-neuron-test.sh
@@ -54,4 +54,4 @@ docker run --rm -it --device=/dev/neuron0 --device=/dev/neuron1 --network host \
        -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
        --name "${container_name}" \
        ${image_name} \
-       /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py"
+       /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py && python3 -m pytest /workspace/vllm/tests/neuron/ -v --capture=tee-sys"
diff --git a/tests/neuron/test_prefix_prefill.py b/tests/neuron/test_prefix_prefill.py
new file mode 100644
index 0000000000000..77b707a737118
--- /dev/null
+++ b/tests/neuron/test_prefix_prefill.py
@@ -0,0 +1,456 @@
+import random
+from typing import Optional
+
+import pytest
+import torch
+import torch.nn.functional as F
+
+
+class BlockDiagonalCausalFromBottomRightMask:
+
+    @staticmethod
+    def _from_seqlens(query_lens, seq_lens, block_size=None):
+        from torch import logical_and, logical_or
+
+        contexted = block_size is None
+        context_lens = torch.tensor(seq_lens) - torch.tensor(query_lens)
+        n_queries = sum(query_lens)
+        num_seqs = len(query_lens)
+        if contexted:
+            key_lens_blockaligned = seq_lens
+        else:
+            n_blocks_per_seq = (context_lens + block_size - 1) // block_size
+            offset_per_seq = n_blocks_per_seq * block_size
+            key_lens_blockaligned = offset_per_seq[:num_seqs].tolist()
+        n_keys = sum(key_lens_blockaligned)
+
+        a = (torch.arange(n_queries).reshape(n_queries,
+                                             1).expand(n_queries, n_keys))
+        b = torch.arange(n_keys).reshape(1, n_keys).expand(n_queries, n_keys)
+        q_cumsum = torch.tensor([0] + query_lens).cumsum(dim=0)
+        k_cumsum = torch.tensor([0] + key_lens_blockaligned).cumsum(dim=0)
+
+        prior_mask = torch.zeros(n_queries, n_keys)
+        new_masks: list[torch.Tensor] = []
+        for seq_id in range(num_seqs):
+            ri = q_cumsum[seq_id]
+            ci = k_cumsum[seq_id]
+            nr = query_lens[seq_id]
+
+            if contexted:
+                nc = seq_lens[seq_id]
+                a_offset = ci + nc - ri - nr
+                new_mask = (a + a_offset) >= b
+            else:
+                nc = context_lens[seq_id]
+                a_offset = ci + nc - 1
+                new_mask = a_offset >= b
+
+            left_mask = b >= ci
+            top_mask = a >= ri
+            bottom_mask = a < (ri + nr)
+
+            new_mask = logical_and(
+                logical_and(logical_and(new_mask, left_mask), top_mask),
+                bottom_mask,
+            )
+            prior_mask = logical_or(prior_mask, new_mask)
+            new_masks = new_masks + [new_mask]
+        return prior_mask
+
+    @staticmethod
+    def from_seqlens(query_lens, seq_lens, block_size=None):
+        contexted = block_size is None
+        if contexted:
+            prior_mask = BlockDiagonalCausalFromBottomRightMask._from_seqlens(
+                query_lens, seq_lens)
+            active_mask = None
+        else:
+            prior_mask = BlockDiagonalCausalFromBottomRightMask._from_seqlens(
+                query_lens, seq_lens, block_size)
+            active_mask = BlockDiagonalCausalFromBottomRightMask._from_seqlens(
+                query_lens, query_lens)
+        return prior_mask, active_mask
+
+
+def ref_softmax(x: torch.Tensor,
+                dim: int,
+                mixed_precision=False,
+                return_max_reduce=False):
+    max_value = torch.amax(x, dim=dim, keepdims=True)
+    exp = torch.exp(x - max_value)
+    if mixed_precision:
+        sum_value = torch.sum(exp.astype(torch.float32),
+                              dim=dim,
+                              keepdims=True).astype(x.dtype)
+    else:
+        sum_value = torch.sum(exp, dim=dim, keepdims=True)
+    if return_max_reduce:
+        return exp / sum_value, max_value, torch.reciprocal(sum_value)
+    return exp / sum_value
+
+
+def ref_masked_attention(
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    scale: float,
+    attn_mask: Optional[torch.Tensor] = None,
+    return_max_reduce: Optional[bool] = False,
+) -> torch.Tensor:
+    scaled_qk = scale * torch.einsum("qhd,khd->hqk", query, key).float()
+    if attn_mask is not None:
+        masked_score = scaled_qk + attn_mask.float()
+    if return_max_reduce:
+        norm_score, cached_max, cached_sum_reciprocal = ref_softmax(
+            masked_score, dim=-1, return_max_reduce=True)
+    else:
+        norm_score = ref_softmax(masked_score, dim=-1)
+    out = torch.einsum("hqk,khd->qhd", norm_score, value)
+    if return_max_reduce:
+        return (
+            out,
+            cached_max,
+            cached_sum_reciprocal,
+            norm_score,
+            masked_score,
+            scaled_qk,
+        )
+    else:
+        return out
+
+
+def ref_context_attention(
+    query,
+    key,
+    value,
+    query_lens,
+    seq_lens,
+    head_size,
+    num_kv_heads,
+    num_heads,
+    num_queries_per_kv,
+    return_max_reduce=False,
+):
+    scale = float(1.0 / (head_size**0.5))
+    if num_queries_per_kv > 1:
+        # Handle MQA and GQA
+        key = torch.repeat_interleave(key, num_queries_per_kv, dim=1)
+        value = torch.repeat_interleave(value, num_queries_per_kv, dim=1)
+
+    attn_mask, _ = BlockDiagonalCausalFromBottomRightMask.from_seqlens(
+        query_lens, seq_lens)
+
+    # convert binary mask to -inf values
+    attn_mask = torch.logical_not(attn_mask)
+    attn_mask = attn_mask.float() * -30000
+
+    output, cached_max, cached_sum_reciprocal, lse, masked_score, scaled_qk = (
+        ref_masked_attention(
+            query,
+            key,
+            value,
+            scale,
+            attn_mask,
+            return_max_reduce=return_max_reduce,
+        ))
+
+    output = output.unsqueeze(1)
+    if return_max_reduce:
+        return (
+            output,
+            cached_max,
+            cached_sum_reciprocal,
+            lse,
+            masked_score,
+            scaled_qk,
+        )
+    else:
+        return output
+
+
+@pytest.mark.parametrize(
+    "num_heads,num_queries_per_kv,head_size,mixed_precision",
+    [
+        (4, 2, 8, False),
+        (4, 2, 8, True),
+        (32, 8, 64, True),
+    ],
+)
+@torch.inference_mode()
+def test_contexted_kv_attention(
+    num_heads: int,
+    num_queries_per_kv: int,
+    head_size: int,
+    mixed_precision: bool,
+) -> None:
+    import os
+
+    import torch_xla.core.xla_model as xm
+
+    from vllm.attention.ops.nki_flash_attn import flash_attn_varlen_nkifunc
+
+    device = xm.xla_device()
+
+    os.environ["NEURON_CC_FLAGS"] = (
+        " --model-type=transformer -O1 "
+        " --internal-hlo2tensorizer-options='--verify-hlo' ")
+
+    random.seed(0)
+    torch.manual_seed(0)
+    torch.set_printoptions(sci_mode=False)
+
+    min_ctx_len = 2
+    max_ctx_len = 64
+    min_query_len = 2
+    max_query_len = 64
+    prefill_batch_size = 2
+    decode_batch_size = 6
+    batch_size = prefill_batch_size + decode_batch_size
+    block_size = 32
+    max_model_len = (max_query_len + max_ctx_len) * 4
+
+    max_block_per_request = max_model_len // block_size
+    dtype = torch.float32
+    cache_size = (batch_size * max_block_per_request) + 2
+    ctx_lens = [
+        random.randint(min_ctx_len, max_ctx_len)
+        for _ in range(prefill_batch_size)
+    ] + [
+        random.randint(min_ctx_len, max_ctx_len)
+        for _ in range(decode_batch_size)
+    ]
+    query_lens = [
+        random.randint(min_query_len, max_query_len)
+        for _ in range(prefill_batch_size)
+    ] + [1 for _ in range(decode_batch_size)]
+    seq_lens = [a + b for a, b in zip(query_lens, ctx_lens)]
+    num_kv_heads = num_heads // num_queries_per_kv
+
+    num_tokens = sum(query_lens)
+    query = torch.empty(num_tokens, num_heads, head_size, dtype=dtype)
+    query.uniform_(-1, 1)
+    torch.empty(num_tokens, num_heads, head_size, dtype=dtype)
+
+    kv = torch.empty(sum(seq_lens), 2, num_kv_heads, head_size, dtype=dtype)
+    kv.uniform_(-1, 1)
+    key, value = kv.unbind(dim=1)
+
+    k_cache = torch.zeros(cache_size,
+                          block_size,
+                          num_kv_heads,
+                          head_size,
+                          dtype=dtype)
+    v_cache = torch.zeros(cache_size,
+                          block_size,
+                          num_kv_heads,
+                          head_size,
+                          dtype=dtype)
+    k = torch.zeros(sum(query_lens), num_kv_heads, head_size, dtype=dtype)
+    v = torch.zeros(sum(query_lens), num_kv_heads, head_size, dtype=dtype)
+    values = torch.arange(0, cache_size, dtype=torch.long)
+    values = values[torch.randperm(cache_size)]
+    block_table = values[:batch_size * max_block_per_request].view(
+        batch_size, max_block_per_request)
+    torch.tensor(seq_lens, dtype=torch.long)
+    b_ctx_len = torch.tensor(ctx_lens, dtype=torch.long)
+    b_start_loc = torch.cumsum(torch.tensor([0] + query_lens[:-1],
+                                            dtype=torch.long),
+                               dim=0)
+    # copy kv to cache
+    b_seq_start_loc = torch.cumsum(torch.tensor([0] + seq_lens[:-1],
+                                                dtype=torch.long),
+                                   dim=0)
+    for i in range(batch_size):
+        for j in range(query_lens[i]):
+            k[b_start_loc[i] + j].copy_(key[b_seq_start_loc[i] + b_ctx_len[i] +
+                                            j])
+            v[b_start_loc[i] + j].copy_(value[b_seq_start_loc[i] +
+                                              b_ctx_len[i] + j])
+        cur_ctx = 0
+        block_id = 0
+        while cur_ctx < b_ctx_len[i]:
+            start_loc = b_seq_start_loc[i] + cur_ctx
+            if cur_ctx + block_size > b_ctx_len[i]:
+                end_loc = b_seq_start_loc[i] + b_ctx_len[i]
+            else:
+                end_loc = start_loc + block_size
+            start_slot = block_table[i, block_id] * block_size
+            end_slot = start_slot + end_loc - start_loc
+            k_cache.view(-1, num_kv_heads,
+                         head_size)[start_slot:end_slot].copy_(
+                             key[start_loc:end_loc])
+            v_cache.view(-1, num_kv_heads,
+                         head_size)[start_slot:end_slot].copy_(
+                             value[start_loc:end_loc])
+            cur_ctx += block_size
+            block_id += 1
+
+    (
+        output_ref,
+        cached_max,
+        cached_sum_reciprocal,
+        lse,
+        masked_score,
+        scaled_qk,
+    ) = ref_context_attention(
+        query,
+        key,
+        value,
+        query_lens,
+        seq_lens,
+        head_size,
+        num_kv_heads,
+        num_heads,
+        num_queries_per_kv,
+        return_max_reduce=True,
+    )
+
+    # build neuron program
+    return_debug_tensors = False
+    B_P_SIZE = 128
+    LARGE_TILE_SZ = 2048
+    max_num_queries = (
+        (sum(query_lens) + block_size - 1) // block_size) * block_size
+
+    def get_active_block_tables(block_tables, query_lens, seq_lens, block_size,
+                                num_blocks):
+        context_lens = seq_lens - query_lens
+        blocks_per_seq = (context_lens + block_size - 1) // block_size
+        num_seqs = len(seq_lens)
+        active_blocks: list[int] = []
+        for seq_id in range(num_seqs):
+            active_blocks = (
+                active_blocks +
+                block_tables[seq_id, :blocks_per_seq[seq_id]].tolist())
+        return F.pad(
+            torch.tensor(active_blocks),
+            (0, num_blocks - len(active_blocks)),
+            "constant",
+            0,
+        )
+
+    def shift_bit_length(x):
+        return 1 << (x - 1).bit_length()
+
+    # calculate input shapes
+    max_num_queries_shifted = shift_bit_length(max_num_queries)
+    max_num_queries_factor = B_P_SIZE // max_num_queries_shifted
+    max_num_queries_padded = max_num_queries_shifted * max_num_queries_factor
+    assert (max_num_queries_padded == B_P_SIZE
+            ), "invalid {max_num_queries_padded=}"
+    head_size_padded = B_P_SIZE
+    context_lens = torch.tensor(seq_lens) - torch.tensor(query_lens)
+    num_active_blocks_shifted = shift_bit_length(
+        ((context_lens + block_size - 1) // block_size).sum().item())
+    num_active_blocks_factor = (LARGE_TILE_SZ // block_size //
+                                num_active_blocks_shifted)
+    num_active_blocks = num_active_blocks_shifted * num_active_blocks_factor
+    assert (num_active_blocks *
+            block_size) == LARGE_TILE_SZ, "invalid {num_active_blocks=}"
+    context_kv_len = num_active_blocks * block_size
+    assert context_kv_len == LARGE_TILE_SZ, f"invalid {context_kv_len=}"
+
+    # pad QKV tensors
+    pad_dims = (
+        0,
+        head_size_padded - query.shape[2],
+        0,
+        0,
+        0,
+        max_num_queries_padded - query.shape[0],
+    )
+    query = F.pad(query, pad_dims, "constant", 0)
+    k = F.pad(k, pad_dims, "constant", 0)
+    v = F.pad(v, pad_dims, "constant", 0)
+    k_cache = F.pad(k_cache, (0, head_size_padded - head_size), "constant", 0)
+    v_cache = F.pad(v_cache, (0, head_size_padded - head_size), "constant", 0)
+
+    # permute QKV tensors
+    # query: (1, n_heads, d, seq_q)
+    # key:   (1, n_kv_heads, d, seq_k)
+    # value: (1, n_kv_heads, seq_v, d)
+    query = query.unsqueeze(0).permute(0, 2, 3, 1).contiguous()
+    k = k.unsqueeze(0).permute(0, 2, 3, 1).contiguous()
+    v = v.unsqueeze(0).permute(0, 2, 1, 3).contiguous()
+
+    # transform block table
+    active_block_table = get_active_block_tables(
+        block_table,
+        torch.tensor(query_lens),
+        torch.tensor(seq_lens),
+        block_size,
+        num_active_blocks,
+    )
+
+    # Build attention masks
+    prior_mask, active_mask = (
+        BlockDiagonalCausalFromBottomRightMask.from_seqlens(
+            query_lens, seq_lens, block_size=block_size))
+    attn_mask = torch.concat(
+        [
+            F.pad(
+                prior_mask,
+                (
+                    0,
+                    context_kv_len - prior_mask.shape[1],
+                    0,
+                    B_P_SIZE - prior_mask.shape[0],
+                ),
+                "constant",
+                0,
+            ).bool(),
+            F.pad(
+                active_mask,
+                (
+                    0,
+                    B_P_SIZE - active_mask.shape[1],
+                    0,
+                    B_P_SIZE - active_mask.shape[0],
+                ),
+                "constant",
+                0,
+            ).bool(),
+        ],
+        dim=1,
+    )
+
+    input_args = (
+        query.to(device=device),
+        k.to(device=device),
+        v.to(device=device),
+        k_cache.to(device=device),
+        v_cache.to(device=device),
+        active_block_table.to(torch.int32).to(device=device),
+        attn_mask.to(device=device),
+    )
+    input_kwargs = dict(
+        n_kv_head=num_kv_heads,
+        head_size=head_size,
+        mixed_precision=mixed_precision,
+    )
+
+    if return_debug_tensors:
+        output_nki, *debug_tensors = flash_attn_varlen_nkifunc(
+            *input_args, **input_kwargs)
+    else:
+        output_nki = flash_attn_varlen_nkifunc(*input_args, **input_kwargs)
+        debug_tensors = []
+
+    output_nki = torch.tensor(output_nki).cpu()
+    debug_tensors = [torch.tensor(dt).cpu() for dt in debug_tensors]
+
+    num_actual_tokens = sum(query_lens)
+    print(f"{num_actual_tokens=}")
+    # - o: shape (bs, n_heads, seq_q, d) -> (bs, seq_q, n_heads, d)
+    output_nki = output_nki.permute(
+        0, 2, 1, 3)[:, :, :, :head_size].cpu()[0, :num_actual_tokens, :, :]
+    output_ref_padded = F.pad(
+        output_ref,
+        (0, 0, 0, 0, 0, 0, 0, max_num_queries_padded - output_ref.shape[0]),
+        "constant",
+        0,
+    )
+    output_ref = output_ref_padded.transpose(0, 1)[0, :num_actual_tokens, :, :]
+
+    torch.testing.assert_close(output_nki, output_ref, atol=1e-2, rtol=0)
diff --git a/vllm/attention/ops/nki_flash_attn.py b/vllm/attention/ops/nki_flash_attn.py
new file mode 100644
index 0000000000000..b9765b0f0283d
--- /dev/null
+++ b/vllm/attention/ops/nki_flash_attn.py
@@ -0,0 +1,669 @@
+from dataclasses import dataclass
+
+import neuronxcc.nki.isa as nisa
+import neuronxcc.nki.language as nl
+import numpy as np
+from neuronxcc import nki
+from neuronxcc.nki.language import par_dim
+
+
+@dataclass(frozen=True)
+class FlashConfig:
+    """
+    Config class for flash attention with default values
+    """
+
+    seq_tile_size: int = 2048
+    should_transpose_v: bool = False
+
+    __annotations__ = {
+        "seq_tile_size": int,
+        "should_transpose_v": bool,
+    }
+
+
+@nki.jit
+def transpose_p_local(p_local_transposed,
+                      p_local,
+                      LARGE_TILE_SZ,
+                      forward_mask,
+                      B_F_SIZE=512):
+    for i in nl.affine_range(LARGE_TILE_SZ // B_F_SIZE):
+        if nisa.get_nc_version() == nisa.nc_version.gen3:
+            p_local_t_tmp = nl.ndarray((par_dim(128), B_F_SIZE),
+                                       buffer=nl.sbuf,
+                                       dtype=p_local.dtype)
+        else:
+            p_local_t_tmp = nl.ndarray((par_dim(128), B_F_SIZE),
+                                       buffer=nl.psum,
+                                       dtype=np.float32)
+
+        for j in nl.affine_range(B_F_SIZE // 128):
+            j_128_slice = nl.ds(j * 128, 128)
+            i_j_128_slice = nl.ds(i * B_F_SIZE + j * 128, 128)
+
+            if nisa.get_nc_version() == nisa.nc_version.gen3:
+                p_local_t_tmp[:, j_128_slice] = nisa.dma_transpose(
+                    p_local[:, i_j_128_slice], mask=forward_mask)
+            else:
+                p_local_t_tmp[:, j_128_slice] = nisa.nc_transpose(
+                    p_local[:, i_j_128_slice], mask=forward_mask)
+
+        p_local_transposed[:, nl.ds(i * B_F_SIZE, B_F_SIZE)] = nl.copy(
+            p_local_t_tmp, dtype=p_local_transposed.dtype, mask=forward_mask)
+
+
+@nki.jit
+def _flash_attention_core(
+    q_local_tile,
+    k,
+    v,
+    q_h_per_k_h,
+    seqlen_q,
+    nheads,
+    o_buffer,
+    l_buffer,
+    m_buffer,
+    batch_id,
+    head_id,
+    gqa_head_idx,
+    q_tile_idx,
+    local_k_large_tile_idx,
+    kernel_dtype,
+    acc_type,
+    flash_config: FlashConfig,
+    use_causal_mask=False,
+    continuous_batching_mask=None,
+    initialize=False,
+    B_P_SIZE=128,
+    B_F_SIZE=512,
+    B_D_SIZE=128,
+    dropout_p=0.0,
+    dropout_p_tensor=None,
+    seed_tensor=None,
+    logit_bias_tile=None,
+    qk_res_buffer=None,
+):
+    """
+    The flash attention core function to calculate self attention between a tile
+    of q and a block of K and V.
+    The q_local_tile has (B_P_SIZE, B_F_SIZE), which is loaded into the SBUF 
+    already. The block size of K and V
+    is defined in the seq_tile_size of the flash_config. The results are stored
+    in the following three buffers
+    o_buffer: (B_P_SIZE, d)
+    l_buffer: (B_P_SIZE, 1)
+    m_buffer: (B_P_SIZE, 1)
+    """
+    LARGE_TILE_SZ = flash_config.seq_tile_size
+    num_k_tile_per_large_tile = LARGE_TILE_SZ // B_F_SIZE
+    seqlen_k = k.shape[-1]
+    seqlen_q // B_P_SIZE
+    seqlen_k // B_F_SIZE
+
+    # TODO : support logit_bias with continuous_batching_mask
+    assert not use_causal_mask, "causal mask is not supported."
+    assert (continuous_batching_mask
+            is not None), "continuous_batching_mask input is required."
+    if continuous_batching_mask is not None:
+        assert (logit_bias_tile is
+                None), "continuous_batching_mask does not support logit_bias!"
+
+    # mask are used to only apply computation to the lower half of the matrix,
+    # which reduce the arthimetic intensity by half
+    forward_mask = (q_tile_idx * B_P_SIZE >= local_k_large_tile_idx *
+                    LARGE_TILE_SZ if use_causal_mask else None)
+
+    qk_res_buf = nl.ndarray((par_dim(B_P_SIZE), LARGE_TILE_SZ),
+                            buffer=nl.sbuf,
+                            dtype=acc_type)
+    max_local = nl.ndarray((par_dim(B_P_SIZE), num_k_tile_per_large_tile),
+                           dtype=acc_type)
+    for k_i in nl.affine_range(num_k_tile_per_large_tile):
+        k_i_b_f_slice = nl.ds(k_i * B_F_SIZE, B_F_SIZE)
+
+        qk_psum = nl.zeros((par_dim(B_P_SIZE), B_F_SIZE),
+                           dtype=np.float32,
+                           buffer=nl.psum)  # (128, 512)
+        qk_psum[:, :] = nl.matmul(q_local_tile,
+                                  k[:, k_i_b_f_slice],
+                                  transpose_x=True,
+                                  mask=None)  # (p(128), 512)
+
+        qk_res_buf[:, k_i_b_f_slice] = nl.where(
+            continuous_batching_mask[:, k_i_b_f_slice],
+            qk_psum[:, nl.ds(0, B_F_SIZE)],
+            -9984.0,
+            dtype=acc_type,
+        )
+
+        # Calculate max of the current tile
+        max_local[:, k_i] = nisa.tensor_reduce(
+            np.max,
+            qk_res_buf[:, k_i_b_f_slice],
+            axis=(1, ),
+            dtype=acc_type,
+            negate=False,
+            mask=forward_mask,
+        )
+
+    if qk_res_buffer is not None:
+        qk_res_buffer[:, :] = nl.copy(qk_res_buf[:, :])
+
+    max_ = nisa.tensor_reduce(
+        np.max,
+        max_local[:, :],
+        axis=(1, ),
+        dtype=acc_type,
+        negate=False,
+        mask=forward_mask,
+    )
+
+    o_previous_scaled = nl.ndarray((par_dim(B_P_SIZE), B_D_SIZE),
+                                   dtype=o_buffer.dtype)
+
+    if initialize:
+        m_buffer[:, 0] = nl.copy(max_)
+        m_current = max_
+    else:
+        m_previous = nl.copy(m_buffer[:, 0])
+        m_buffer[:, 0] = nl.maximum(m_previous, max_,
+                                    mask=forward_mask)  # (128,1)
+
+        m_current = m_buffer[:, 0]
+        # Compute scaling factor
+        alpha = nisa.activation(
+            np.exp,
+            m_previous,
+            bias=-1 * m_current,
+            scale=1.0,
+            mask=forward_mask,
+        )
+        o_previous_scaled[...] = nl.multiply(o_buffer[:, :],
+                                             alpha,
+                                             mask=forward_mask)
+
+    p_local = nl.ndarray((par_dim(B_P_SIZE), LARGE_TILE_SZ),
+                         dtype=kernel_dtype)
+    REDUCTION_TILE = min(2048, LARGE_TILE_SZ // 2)
+
+    p_partial_sum = nl.ndarray(
+        (par_dim(B_P_SIZE), LARGE_TILE_SZ // REDUCTION_TILE), dtype=acc_type)
+
+    for k_r_i in nl.affine_range(LARGE_TILE_SZ // REDUCTION_TILE):
+        k_r_i_reduce_slice = nl.ds(k_r_i * REDUCTION_TILE, REDUCTION_TILE)
+
+        # compute exp(qk - max)
+        # Compute partial row - tile sum of exp(qk - max))
+        # FIXME : Use activation accumulate to accumulate over k_r_i loop ?
+        p_local[:, k_r_i_reduce_slice] = nisa.activation_reduce(
+            np.exp,
+            qk_res_buf[:, k_r_i_reduce_slice],
+            bias=-1 * m_current,
+            scale=1.0,
+            reduce_op=nl.add,
+            reduce_res=p_partial_sum[:, k_r_i],
+            dtype=kernel_dtype,
+            mask=forward_mask,
+        )
+
+    ps = nl.sum(p_partial_sum, axis=1, dtype=acc_type, mask=forward_mask)
+
+    p_local_transposed = nl.ndarray((par_dim(B_P_SIZE), LARGE_TILE_SZ),
+                                    dtype=kernel_dtype)
+    transpose_p_local(
+        p_local_transposed=p_local_transposed,
+        p_local=p_local,
+        LARGE_TILE_SZ=LARGE_TILE_SZ,
+        forward_mask=forward_mask,
+        B_F_SIZE=B_F_SIZE,
+    )
+
+    pv_psum = nl.zeros((par_dim(B_P_SIZE), B_D_SIZE),
+                       dtype=np.float32,
+                       buffer=nl.psum)
+    for k_i in nl.affine_range(LARGE_TILE_SZ // B_P_SIZE):
+        pv_psum[:, :] += nl.matmul(
+            p_local_transposed[:, nl.ds(k_i * B_P_SIZE, B_P_SIZE)],
+            v[k_i, :, :],
+            transpose_x=True,
+            mask=forward_mask,
+        )  # (128, 128) (p(Br), d)
+
+    if initialize:
+        o_buffer[:, :] = nl.copy(pv_psum[:, :])
+        l_buffer[:, 0] = nl.add(nl.log(ps), max_)
+    else:
+        o_buffer[:, :] = nl.add(o_previous_scaled, pv_psum, mask=forward_mask)
+
+        l_prev = l_buffer[:, 0]
+        l_exp = nl.add(
+            nl.exp(
+                nl.subtract(l_prev, m_current, mask=forward_mask),
+                mask=forward_mask,
+            ),
+            ps,
+            mask=forward_mask,
+        )
+        l_buffer[:, 0] = nl.add(m_current,
+                                nl.log(l_exp, mask=forward_mask),
+                                mask=forward_mask)
+
+
+@nki.jit
+def load_v_tile(v_hbm_tile, cur_v_tile, j, v_i, config):
+    LARGE_TILE_SZ = config.seq_tile_size
+    B_P_SIZE = 128
+
+    if not config.should_transpose_v:
+        cur_v_tile[v_i, :, :] = nl.load(
+            v_hbm_tile[nl.ds(j * LARGE_TILE_SZ + B_P_SIZE * v_i, B_P_SIZE), :],
+            dtype=cur_v_tile.dtype,
+        )
+        return
+
+    if nisa.get_nc_version() == nisa.nc_version.gen3:
+        cur_v_tile_transposed = nisa.dma_transpose(
+            v_hbm_tile[:,
+                       nl.ds(j * LARGE_TILE_SZ + B_P_SIZE * v_i, B_P_SIZE)])
+        cur_v_tile[v_i, :, :] = nisa.tensor_copy(cur_v_tile_transposed,
+                                                 dtype=cur_v_tile.dtype)
+        return
+
+    cur_v_tile[v_i, :, :] = nl.load_transpose2d(
+        v_hbm_tile[:, nl.ds(j * LARGE_TILE_SZ + B_P_SIZE * v_i, B_P_SIZE)],
+        dtype=cur_v_tile.dtype,
+    )
+
+
+@nki.jit
+def flash_paged_attention(
+    query,
+    key,
+    value,
+    key_cache,
+    value_cache,
+    block_tables,
+    mask,
+    softmax_scale=None,
+    mixed_precision=True,
+    config=None,
+    return_debug_tensors=False,
+):
+    """
+    Flash PagedAttention Forward Kernel.
+      - PagedAttention Paper: https://arxiv.org/abs/2309.06180
+      - Chunked Prefill Paper: https://arxiv.org/abs/2403.02310
+
+    IO tensor layouts:
+      - query: shape   (1, n_heads, d, seq_q)
+      - key:   shape   (1, n_kv_heads, d, seq_k)
+      - value: shape   (1, n_kv_heads, seq_v, d)
+      - key_cache: (num_blocks, block_size, n_kv_heads, d)
+      - value_cache: (num_blocks, block_size, n_kv_heads, d)
+      - block_tables: (num_active_blocks, )
+      - mask: (seq_q, num_active_blocks * block_size)
+      - o: shape (1, n_heads, seq_q, d)
+      - l_m: shape (1, n_heads, seq_q, 2)
+
+      - This kernel requires seq_k == seq_v
+      - We use continuous batching by default, so the batch dimension is
+        always 1, and different requests are concatenated along sequence
+        dimension.
+      - We use paged cache blocks (key_cache, value_cache) to store KV cache.
+
+    IO tensor dtypes:
+      - This kernel assumes all IO tensors have the same dtype except for 
+        block_tables (int32) and mask (int32)
+      - If mixed_percision is True, then all Tensor Engine operation will be 
+        performed in bfloat16 and accumulation will be performed in float32. 
+        Otherwise the intermediates will be in the same type as the inputs.
+
+    Compile-time Constants:
+      - softmax_scale: scaling for softmax, is None, default is `1.0/(d**0.5)`
+      - mixed_precision: flag to set non-matmul ops in fp32 precision, default
+        is set to `true`, if false, we use same precision as input types 
+      - config: Instance of dataclass :class:`nki.kernels.attention.FlashConfig`
+          with Performance config parameters for flash attention with default
+          values
+        seq_tile_size: `default=2048`, size of the kv tile size for attention 
+          computation reduction
+
+    GQA support Notes:
+      the spmd kernel for launching kernel should be on kv_heads instead of 
+      nheads
+
+    Example usage:
+      MHA: q: [b, h, d, s], k: [b, h, d, s], v: [b, h, s, d]
+        usage: `flash_fwd[b, h](q, k, v, ...)`
+      GQA: q: [b, h, d, s], k: [b, kv_h, d, s], v: [b, kv_h, s, d]
+        usage: `flash_fwd[b, kv_h](q, k, v, ...)`
+    """
+    config = config or FlashConfig()
+    B_F_SIZE = 512
+    B_P_SIZE = 128
+    b, h, d, seqlen_q = query.shape
+    B_D_SIZE = d
+    LARGE_TILE_SZ = config.seq_tile_size
+    n_tile_q = seqlen_q // B_P_SIZE  # since q will be loaded on tensor engine
+    num_blocks, block_size, k_h, _ = key_cache.shape
+    q_h_per_k_h = h // k_h
+    assert tuple(key_cache.shape) == (
+        num_blocks,
+        block_size,
+        k_h,
+        d,
+    ), "Input shape mismatch!"
+    assert tuple(value_cache.shape) == (
+        num_blocks,
+        block_size,
+        k_h,
+        d,
+    ), "Input shape mismatch!"
+    assert b == 1, f"invalid batch size {b=}"
+    assert d <= 128, f" we do not support head_dim > 128, got head dim {d}"
+    kernel_dtype = nl.bfloat16 if mixed_precision else query.dtype
+    acc_type = np.dtype(np.float32) if mixed_precision else kernel_dtype
+
+    o = nl.ndarray((b, h, seqlen_q, d),
+                   dtype=query.dtype,
+                   buffer=nl.shared_hbm)
+    hbm_l_buffer, hbm_m_buffer, hbm_qk_res, qk_res_buffer = (
+        None,
+        None,
+        None,
+        None,
+    )
+    if return_debug_tensors:
+        hbm_l_buffer = nl.ndarray((b, h, seqlen_q),
+                                  dtype=acc_type,
+                                  buffer=nl.shared_hbm)
+        hbm_m_buffer = nl.ndarray((b, h, seqlen_q),
+                                  dtype=acc_type,
+                                  buffer=nl.shared_hbm)
+        hbm_qk_res = nl.ndarray((b, h, B_P_SIZE, seqlen_q),
+                                dtype=acc_type,
+                                buffer=nl.shared_hbm)
+        qk_res_buffer = nl.zeros(
+            (n_tile_q, q_h_per_k_h, par_dim(B_P_SIZE), seqlen_q),
+            dtype=acc_type,
+            buffer=nl.sbuf,
+            lazy_initialization=True,
+        )
+
+    assert (
+        nl.program_ndim() == 2
+    ), f"Expect spmd grid with 2 dimensions, got {nl.program_ndim()} instead!"
+    batch_id = nl.program_id(axis=0)
+    head_id = nl.program_id(axis=1)
+
+    softmax_scale = softmax_scale or (1.0 / (d**0.5))
+
+    (num_active_blocks, ) = block_tables.shape
+    context_kv_len = num_active_blocks * block_size
+    assert (config.seq_tile_size >= 512
+            ), f" seq tile_size {config.seq_tile_size} cannot be less than 512"
+    assert (context_kv_len % LARGE_TILE_SZ == 0
+            ), f"Need {context_kv_len=} to be divisible by {LARGE_TILE_SZ=}"
+    assert (
+        LARGE_TILE_SZ % B_P_SIZE == 0
+    ), f"Need LARGE_TILE_SZ ({LARGE_TILE_SZ}) to be divisible by {B_P_SIZE=}"
+    assert (B_P_SIZE % block_size == 0
+            ), f"Need B_P_SIZE ({B_P_SIZE}) to be divisible by {block_size=}"
+    num_large_k_tile = context_kv_len // LARGE_TILE_SZ
+    num_blocks_per_large_tile = LARGE_TILE_SZ // block_size
+    assert (num_blocks_per_large_tile <= B_P_SIZE
+    ), f"The number of blocks in each large tile " \
+    f"({num_blocks_per_large_tile}) shouldn't exceed partition size {B_P_SIZE}"
+
+    block_tables_sbuf = nl.full((par_dim(B_P_SIZE), num_large_k_tile),
+                                0,
+                                dtype=np.int32,
+                                buffer=nl.sbuf)
+    for j in nl.affine_range(num_large_k_tile):
+        i_p = nl.arange(num_blocks_per_large_tile)[:, None]
+        block_tables_sbuf[i_p, j] = nl.load(
+            block_tables[j * num_blocks_per_large_tile + i_p], dtype=np.int32)
+
+    # Global Flash Attention accumulators
+    o_buffer = nl.zeros(
+        (n_tile_q, q_h_per_k_h, par_dim(B_P_SIZE), d),
+        dtype=acc_type,
+        buffer=nl.sbuf,
+        lazy_initialization=True,
+    )
+    l_buffer = nl.zeros(
+        (par_dim(B_P_SIZE), n_tile_q, q_h_per_k_h),
+        dtype=acc_type,
+        buffer=nl.sbuf,
+        lazy_initialization=True,
+    )
+    m_buffer = nl.zeros(
+        (n_tile_q, q_h_per_k_h, par_dim(B_P_SIZE), 1),
+        dtype=acc_type,
+        buffer=nl.sbuf,
+        lazy_initialization=True,
+    )
+
+    for j in nl.sequential_range(0, num_large_k_tile):
+        cur_k_tile = nl.ndarray((par_dim(B_D_SIZE), LARGE_TILE_SZ),
+                                dtype=kernel_dtype)
+        cur_v_tile = nl.ndarray(
+            (LARGE_TILE_SZ // B_P_SIZE, par_dim(B_P_SIZE), B_D_SIZE),
+            dtype=kernel_dtype,
+        )
+
+        for k_i in nl.affine_range(num_blocks_per_large_tile):
+            loaded = nl.load(key_cache[block_tables_sbuf[k_i, j], :,
+                                       head_id, :])
+            cur_k_tile[:, nl.ds(k_i *
+                                block_size, block_size)] = nl.transpose(loaded)
+
+        load_tile_size = B_P_SIZE
+        num_blocks_per_partition = load_tile_size // block_size
+        for partition_idx in nl.affine_range(LARGE_TILE_SZ // load_tile_size):
+            for block_in_partition in nl.affine_range(
+                    num_blocks_per_partition):
+                v_i = (partition_idx * num_blocks_per_partition +
+                       block_in_partition)
+                loaded_v = nl.load(value_cache[block_tables_sbuf[v_i, j], :,
+                                               head_id, :])
+                cur_v_tile[partition_idx,
+                           nl.ds(block_in_partition *
+                                 block_size, block_size), :, ] = loaded_v
+
+        cur_mask = nl.ndarray((par_dim(B_P_SIZE), LARGE_TILE_SZ),
+                              dtype=mask.dtype)
+        for m_i in nl.affine_range(LARGE_TILE_SZ // B_F_SIZE):
+            cur_mask[:, nl.ds(m_i * B_F_SIZE, B_F_SIZE)] = nl.load(
+                mask[:, nl.ds(j * LARGE_TILE_SZ + m_i * B_F_SIZE, B_F_SIZE)])
+
+        for i_q_h in nl.affine_range(q_h_per_k_h):
+            for i in nl.affine_range(n_tile_q):
+                q_tile = nl.ndarray((B_D_SIZE, B_P_SIZE), dtype=kernel_dtype)
+                q_hbm_tile = query[batch_id, head_id * q_h_per_k_h + i_q_h]
+                q_sbuf_tile = nl.load(
+                    q_hbm_tile[:, nl.ds(i * B_P_SIZE, B_P_SIZE)],
+                    dtype=kernel_dtype,
+                )  # load (d, 128) tile in SBUF
+                q_tile[:, :] = q_sbuf_tile * softmax_scale
+
+                _flash_attention_core(
+                    q_local_tile=q_tile,
+                    k=cur_k_tile,
+                    v=cur_v_tile,
+                    q_h_per_k_h=q_h_per_k_h,
+                    seqlen_q=seqlen_q,
+                    nheads=h,
+                    o_buffer=o_buffer[i, i_q_h],
+                    l_buffer=l_buffer[:, i, i_q_h],
+                    m_buffer=m_buffer[i, i_q_h],
+                    batch_id=batch_id,
+                    head_id=head_id,
+                    gqa_head_idx=i_q_h,
+                    q_tile_idx=i,
+                    local_k_large_tile_idx=j,
+                    kernel_dtype=kernel_dtype,
+                    acc_type=acc_type,
+                    flash_config=config,
+                    use_causal_mask=False,
+                    continuous_batching_mask=cur_mask,
+                    initialize=j == 0,
+                    B_P_SIZE=B_P_SIZE,
+                    B_F_SIZE=B_F_SIZE,
+                    B_D_SIZE=B_D_SIZE,
+                    dropout_p=0.0,
+                    dropout_p_tensor=None,
+                    seed_tensor=None,
+                    logit_bias_tile=None,
+                )
+
+    # compute attention between input query, key and value
+    if key is not None and value is not None:
+        B_F_SIZE = seqlen_q
+        LARGE_TILE_SZ = seqlen_q
+        active_config = FlashConfig(
+            seq_tile_size=LARGE_TILE_SZ,
+            should_transpose_v=config.should_transpose_v,
+        )
+
+        cur_k_tile = nl.ndarray((par_dim(B_D_SIZE), LARGE_TILE_SZ),
+                                dtype=kernel_dtype)
+        cur_v_tile = nl.ndarray(
+            (LARGE_TILE_SZ // B_P_SIZE, par_dim(B_P_SIZE), B_D_SIZE),
+            dtype=kernel_dtype,
+        )
+
+        cur_k_tile[:, :] = nl.load(key[batch_id, head_id, :, :])
+
+        load_tile_size = B_P_SIZE
+        v_hbm_tile = value[batch_id, head_id]
+        for v_i in nl.affine_range(LARGE_TILE_SZ // load_tile_size):
+            load_v_tile(
+                v_hbm_tile=v_hbm_tile,
+                cur_v_tile=cur_v_tile,
+                j=0,
+                v_i=v_i,
+                config=active_config,
+            )
+
+        cur_mask = nl.ndarray((par_dim(B_P_SIZE), B_F_SIZE), dtype=mask.dtype)
+        cur_mask[:, :] = nl.load(mask[:, nl.ds(context_kv_len, B_F_SIZE)])
+
+        for i_q_h in nl.affine_range(q_h_per_k_h):
+            for i in nl.affine_range(n_tile_q):
+                q_tile = nl.ndarray((B_D_SIZE, B_P_SIZE), dtype=kernel_dtype)
+                q_hbm_tile = query[batch_id, head_id * q_h_per_k_h + i_q_h]
+                q_sbuf_tile = nl.load(
+                    q_hbm_tile[:, nl.ds(i * B_P_SIZE, B_P_SIZE)],
+                    dtype=kernel_dtype,
+                )  # load (d, 128) tile in SBUF
+                q_tile[:, :] = q_sbuf_tile * softmax_scale
+                _flash_attention_core(
+                    q_local_tile=q_tile,
+                    k=cur_k_tile,
+                    v=cur_v_tile,
+                    q_h_per_k_h=q_h_per_k_h,
+                    seqlen_q=seqlen_q,
+                    nheads=h,
+                    o_buffer=o_buffer[i, i_q_h],
+                    l_buffer=l_buffer[:, i, i_q_h],
+                    m_buffer=m_buffer[i, i_q_h],
+                    batch_id=batch_id,
+                    head_id=head_id,
+                    gqa_head_idx=i_q_h,
+                    q_tile_idx=i,
+                    local_k_large_tile_idx=0,
+                    kernel_dtype=kernel_dtype,
+                    acc_type=acc_type,
+                    flash_config=active_config,
+                    use_causal_mask=False,
+                    continuous_batching_mask=cur_mask,
+                    initialize=False,
+                    B_P_SIZE=B_P_SIZE,
+                    B_F_SIZE=B_F_SIZE,
+                    B_D_SIZE=B_D_SIZE,
+                    dropout_p=0.0,
+                    dropout_p_tensor=None,
+                    seed_tensor=None,
+                    logit_bias_tile=None,
+                    qk_res_buffer=qk_res_buffer[i, i_q_h]
+                    if qk_res_buffer is not None else None,
+                )
+
+    # -- -- -- -- write output to buffer on HBM -- -- -- -- -- -- #
+    for i_q_h in nl.affine_range(q_h_per_k_h):
+        for i in nl.affine_range(n_tile_q):
+            out = nl.multiply(
+                o_buffer[i, i_q_h, :, :],
+                nl.exp(m_buffer[i, i_q_h, :, :] - l_buffer[:, i, i_q_h]),
+                dtype=kernel_dtype,
+            )
+
+            nl.store(
+                o[batch_id, head_id * q_h_per_k_h + i_q_h,
+                  nl.ds(i * B_P_SIZE, B_P_SIZE), :, ],
+                out,
+            )
+            # maximum and summation statistics
+            if return_debug_tensors:
+                nl.store(
+                    hbm_m_buffer[batch_id, head_id * q_h_per_k_h + i_q_h,
+                                 nl.ds(i * B_P_SIZE, B_P_SIZE), ],
+                    m_buffer[i, i_q_h, :, :],
+                )
+                nl.store(
+                    hbm_l_buffer[batch_id, head_id * q_h_per_k_h + i_q_h,
+                                 nl.ds(i * B_P_SIZE, B_P_SIZE), ],
+                    l_buffer[:, i, i_q_h],
+                )
+                nl.store(
+                    hbm_qk_res[batch_id, head_id * q_h_per_k_h + i_q_h, :, :],
+                    qk_res_buffer[batch_id, i_q_h, :, :],
+                )
+
+    if return_debug_tensors:
+        return o, hbm_m_buffer, hbm_l_buffer, hbm_qk_res
+    return o
+
+
+def flash_attn_varlen_nkifunc(
+    query,
+    key,
+    value,
+    key_cache,
+    value_cache,
+    block_table,
+    attn_mask,
+    n_kv_head=None,
+    head_size=None,
+    B_P_SIZE=128,
+    LARGE_TILE_SZ=2048,
+    return_debug_tensors=False,
+    mixed_precision=True,
+):
+    config = FlashConfig(
+        seq_tile_size=LARGE_TILE_SZ,
+        should_transpose_v=False,
+    )
+    kwargs = dict(
+        query=query,
+        key=key,
+        value=value,
+        key_cache=key_cache,
+        value_cache=value_cache,
+        block_tables=block_table,
+        mask=attn_mask,
+        softmax_scale=1.0 / (head_size**0.5),
+        config=config,
+        mixed_precision=mixed_precision,
+        return_debug_tensors=return_debug_tensors,
+    )
+    _, n_kv_head, _, _ = key.shape
+
+    if return_debug_tensors:
+        o, *debug_tensors = flash_paged_attention[1, n_kv_head](**kwargs)
+        return o, *debug_tensors
+    else:
+        o = flash_paged_attention[1, n_kv_head](**kwargs)
+        return o

From 426a5c362557c6df4604ed084660b8915fbca30c Mon Sep 17 00:00:00 2001
From: Michael Goin <michael@neuralmagic.com>
Date: Mon, 27 Jan 2025 20:56:31 -0500
Subject: [PATCH 403/462] Fix bad path in prometheus example (#12481)

Signed-off-by: mgoin <michael@neuralmagic.com>
---
 examples/online_serving/prometheus_grafana/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/online_serving/prometheus_grafana/README.md b/examples/online_serving/prometheus_grafana/README.md
index c49e5306a1cb4..4a85f953b0b4c 100644
--- a/examples/online_serving/prometheus_grafana/README.md
+++ b/examples/online_serving/prometheus_grafana/README.md
@@ -24,7 +24,7 @@ Submit some sample requests to the server:
 ```bash
 wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
 
-python3 ../../benchmarks/benchmark_serving.py \
+python3 ../../../benchmarks/benchmark_serving.py \
     --model mistralai/Mistral-7B-v0.1 \
     --tokenizer mistralai/Mistral-7B-v0.1 \
     --endpoint /v1/completions \

From 23a7cbc88b5a17499766d1cbc0de283c9f980509 Mon Sep 17 00:00:00 2001
From: Hossein Sarshar <hossein.sarshar@gmail.com>
Date: Mon, 27 Jan 2025 22:18:07 -0500
Subject: [PATCH 404/462] [CI/Build] Fixed the xla nightly issue report in
 #12451 (#12453)

---
 requirements-tpu.txt | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/requirements-tpu.txt b/requirements-tpu.txt
index 51a0c65eac5aa..1abde714af7c9 100644
--- a/requirements-tpu.txt
+++ b/requirements-tpu.txt
@@ -10,17 +10,14 @@ wheel
 jinja2
 ray[default]
 
-# Install torch, torch_xla
+# Install torch_xla
+--pre
+--extra-index-url https://download.pytorch.org/whl/nightly/cpu
+--find-links https://storage.googleapis.com/libtpu-wheels/index.html
 --find-links https://storage.googleapis.com/libtpu-releases/index.html
 --find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html
 --find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
-# Note: This torch whl can be slightly different from the official torch nightly whl
-# since they are not built on the same commit (but on the same day). This difference may cause C++ undefined symbol issue
-# if some change between the 2 commits introduce some C++ API change.
-# Here we install the exact torch whl from which torch_xla is built from, to avoid potential C++ undefined symbol issue.
-torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.7.0.dev20250124-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
-torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.7.0.dev20250124-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
-torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.7.0.dev20250124-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
-torch_xla[pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250124-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
-torch_xla[pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250124-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
-torch_xla[pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250124-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
+torch==2.6.0.dev20241216+cpu
+torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250124-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
+torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250124-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
+torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250124-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"

From 0f465ab53303fbd3c8ad32163db161cdb0cf8dad Mon Sep 17 00:00:00 2001
From: Gabriel Marinho <104592062+gmarinho2@users.noreply.github.com>
Date: Tue, 28 Jan 2025 00:30:13 -0300
Subject: [PATCH 405/462] [FEATURE] Enables offline /score for embedding models
 (#12021)

Signed-off-by: Gabriel Marinho <gmarinho@ibm.com>
---
 .../models/embedding/language/test_scoring.py | 100 +++++++++++
 vllm/entrypoints/llm.py                       | 160 +++++++++++++-----
 2 files changed, 216 insertions(+), 44 deletions(-)

diff --git a/tests/models/embedding/language/test_scoring.py b/tests/models/embedding/language/test_scoring.py
index be6e3842821e2..3db27d942ac8c 100644
--- a/tests/models/embedding/language/test_scoring.py
+++ b/tests/models/embedding/language/test_scoring.py
@@ -5,12 +5,18 @@
 import math
 
 import pytest
+import torch
+import torch.nn.functional as F
 
 MODELS = [
     "cross-encoder/ms-marco-MiniLM-L-6-v2",  # Bert
     "BAAI/bge-reranker-v2-m3",  # Roberta
 ]
 
+EMBEDDING_MODELS = [
+    "sentence-transformers/all-MiniLM-L12-v2",
+]
+
 TEXTS_1 = [
     "What is the capital of France?",
     "What is the capital of Germany?",
@@ -87,3 +93,97 @@ def test_llm_N_to_N(vllm_runner, hf_runner, model_name, dtype: str):
 
     assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01)
     assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01)
+
+
+@pytest.fixture(scope="module", params=EMBEDDING_MODELS)
+def emb_model_name(request):
+    yield request.param
+
+
+@pytest.mark.parametrize("dtype", ["half"])
+def test_llm_1_to_1_embedding(vllm_runner, hf_runner, emb_model_name,
+                              dtype: str):
+
+    text_pair = [TEXTS_1[0], TEXTS_2[0]]
+
+    with hf_runner(emb_model_name, dtype=dtype,
+                   is_sentence_transformer=True) as hf_model:
+        hf_embeddings = hf_model.encode(text_pair)
+        hf_outputs = [
+            F.cosine_similarity(*map(torch.tensor, hf_embeddings), dim=0)
+        ]
+
+    with vllm_runner(emb_model_name,
+                     task="embed",
+                     dtype=dtype,
+                     max_model_len=None) as vllm_model:
+        vllm_outputs = vllm_model.score(text_pair[0], text_pair[1])
+
+    assert len(vllm_outputs) == 1
+    assert len(hf_outputs) == 1
+
+    assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01)
+
+
+@pytest.mark.parametrize("dtype", ["half"])
+def test_llm_1_to_N_embedding(vllm_runner, hf_runner, emb_model_name,
+                              dtype: str):
+
+    text_pairs = [
+        [TEXTS_1[0], TEXTS_2[0]],
+        [TEXTS_1[0], TEXTS_2[1]],
+    ]
+
+    with hf_runner(emb_model_name, dtype=dtype,
+                   is_sentence_transformer=True) as hf_model:
+        hf_embeddings = [
+            hf_model.encode(text_pair) for text_pair in text_pairs
+        ]
+        hf_outputs = [
+            F.cosine_similarity(*map(torch.tensor, pair), dim=0)
+            for pair in hf_embeddings
+        ]
+
+    with vllm_runner(emb_model_name,
+                     task="embed",
+                     dtype=dtype,
+                     max_model_len=None) as vllm_model:
+        vllm_outputs = vllm_model.score(TEXTS_1[0], TEXTS_2)
+
+    assert len(vllm_outputs) == 2
+    assert len(hf_outputs) == 2
+
+    assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01)
+    assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01)
+
+
+@pytest.mark.parametrize("dtype", ["half"])
+def test_llm_N_to_N_embedding(vllm_runner, hf_runner, emb_model_name,
+                              dtype: str):
+
+    text_pairs = [
+        [TEXTS_1[0], TEXTS_2[0]],
+        [TEXTS_1[1], TEXTS_2[1]],
+    ]
+
+    with hf_runner(emb_model_name, dtype=dtype,
+                   is_sentence_transformer=True) as hf_model:
+        hf_embeddings = [
+            hf_model.encode(text_pair) for text_pair in text_pairs
+        ]
+        hf_outputs = [
+            F.cosine_similarity(*map(torch.tensor, pair), dim=0)
+            for pair in hf_embeddings
+        ]
+
+    with vllm_runner(emb_model_name,
+                     task="embed",
+                     dtype=dtype,
+                     max_model_len=None) as vllm_model:
+        vllm_outputs = vllm_model.score(TEXTS_1, TEXTS_2)
+
+    assert len(vllm_outputs) == 2
+    assert len(hf_outputs) == 2
+
+    assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01)
+    assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01)
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 1860ed3d7db5a..46b595b0da73c 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -5,6 +5,7 @@
                     Tuple, Type, Union, cast, overload)
 
 import cloudpickle
+import torch
 import torch.nn as nn
 from tqdm import tqdm
 from typing_extensions import TypeVar, deprecated
@@ -996,6 +997,107 @@ def classify(
 
         return [ClassificationRequestOutput.from_base(item) for item in items]
 
+    def _embedding_score(
+        self,
+        tokenizer: AnyTokenizer,
+        text_1: List[Union[str, TextPrompt, TokensPrompt]],
+        text_2: List[Union[str, TextPrompt, TokensPrompt]],
+        truncate_prompt_tokens: Optional[int] = None,
+        use_tqdm: bool = True,
+        lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None,
+        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
+    ) -> List[ScoringRequestOutput]:
+
+        encoded_output = self.encode(
+            text_1 + text_2,
+            use_tqdm=use_tqdm,
+            lora_request=lora_request,
+            prompt_adapter_request=prompt_adapter_request)
+        encoded_output_1 = encoded_output[0:len(text_1)]
+        encoded_output_2 = encoded_output[len(text_1):]
+
+        if len(encoded_output_1) == 1:
+            encoded_output_1 = encoded_output_1 * len(encoded_output_2)
+
+        output_pairs = [(t1, t2)
+                        for t1, t2 in zip(encoded_output_1, encoded_output_2)]
+
+        scores = []
+        scorer = torch.nn.CosineSimilarity(0)
+
+        for embed_1, embed_2 in output_pairs:
+            pair_score = scorer(embed_1.outputs.data, embed_2.outputs.data)
+
+            if (pad_token_id := getattr(tokenizer, "pad_token_id",
+                                        None)) is not None:
+                tokens = embed_1.prompt_token_ids + [
+                    pad_token_id
+                ] + embed_2.prompt_token_ids
+            else:
+                tokens = embed_1.prompt_token_ids + embed_2.prompt_token_ids
+
+            scores.append(
+                PoolingRequestOutput(
+                    request_id=f"{embed_1.request_id}_{embed_2.request_id}",
+                    outputs=pair_score,
+                    prompt_token_ids=tokens,
+                    finished=True))
+
+        items = self.engine_class.validate_outputs(scores,
+                                                   PoolingRequestOutput)
+        return [ScoringRequestOutput.from_base(item) for item in items]
+
+    def _cross_encoding_score(
+        self,
+        tokenizer: Union[AnyTokenizer],
+        text_1: List[Union[str, TextPrompt, TokensPrompt]],
+        text_2: List[Union[str, TextPrompt, TokensPrompt]],
+        truncate_prompt_tokens: Optional[int] = None,
+        use_tqdm: bool = True,
+        lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None,
+        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
+    ) -> List[ScoringRequestOutput]:
+
+        if isinstance(tokenizer, MistralTokenizer):
+            raise ValueError(
+                "Score API is only enabled for `--task embed or score`")
+
+        if len(text_1) == 1:
+            text_1 = text_1 * len(text_2)
+
+        input_pairs = [(t1, t2) for t1, t2 in zip(text_1, text_2)]
+
+        pooling_params = PoolingParams()
+
+        tokenization_kwargs: Dict[str, Any] = {}
+        if truncate_prompt_tokens is not None:
+            tokenization_kwargs["truncation"] = True
+            tokenization_kwargs["max_length"] = truncate_prompt_tokens
+
+        parsed_prompts = []
+
+        for q, t in input_pairs:
+            prompt_inputs = tokenizer(text=q,
+                                      text_pair=t,
+                                      **tokenization_kwargs)
+            engine_prompt = TokensPrompt(
+                prompt_token_ids=prompt_inputs["input_ids"],
+                token_type_ids=prompt_inputs.get("token_type_ids"))
+            parsed_prompts.append(engine_prompt)
+
+        self._validate_and_add_requests(
+            prompts=parsed_prompts,
+            params=pooling_params,
+            lora_request=lora_request,
+            prompt_adapter_request=prompt_adapter_request,
+        )
+
+        outputs = self._run_engine(use_tqdm=use_tqdm)
+        items = self.engine_class.validate_outputs(outputs,
+                                                   PoolingRequestOutput)
+
+        return [ScoringRequestOutput.from_base(item) for item in items]
+
     def score(
         self,
         text_1: Union[SingletonPrompt, Sequence[SingletonPrompt]],
@@ -1047,25 +1149,20 @@ def score(
 
             raise ValueError(" ".join(messages))
 
-        if not self.llm_engine.model_config.is_cross_encoder:
-            raise ValueError("Your model does not support cross encoding")
-        if self.llm_engine.model_config.task != "score":
-            raise ValueError("Score API is only enabled for `--task score`")
-
-        tokenizer = self.llm_engine.get_tokenizer()
-
-        if isinstance(tokenizer, MistralTokenizer):
+        if self.llm_engine.model_config.task not in ("embed", "score"):
             raise ValueError(
-                "MistralTokenizer not supported for cross-encoding")
+                "Score API is only enabled for `--task embed or --task score`")
 
         # the tokenizer for models such as
         # "cross-encoder/ms-marco-MiniLM-L-6-v2" doesn't support passing
         # lists of tokens to the `text` and `text_pair` kwargs
+        tokenizer = self.llm_engine.get_tokenizer()
+
         def ensure_str(prompt: SingletonPrompt):
             if isinstance(prompt, dict):
                 if "multi_modal_data" in prompt:
                     raise ValueError("Multi-modal prompt is not "
-                                     "supported for cross encoding")
+                                     "supported for scoring")
                 elif "prompt_token_ids" in prompt:
                     prompt = tokenizer.decode(
                         cast(TokensPrompt, prompt)["prompt_token_ids"])
@@ -1091,40 +1188,15 @@ def ensure_str(prompt: SingletonPrompt):
         if len(text_2) == 0:
             raise ValueError("At least one text_pair element must be given")
 
-        if len(text_1) == 1:
-            text_1 = text_1 * len(text_2)
-
-        input_pairs = [(t1, t2) for t1, t2 in zip(text_1, text_2)]
-        pooling_params = PoolingParams()
-
-        tokenization_kwargs: Dict[str, Any] = {}
-        if truncate_prompt_tokens is not None:
-            tokenization_kwargs["truncation"] = True
-            tokenization_kwargs["max_length"] = truncate_prompt_tokens
-
-        parsed_prompts = []
-
-        for q, t in input_pairs:
-            prompt_inputs = tokenizer(text=q,
-                                      text_pair=t,
-                                      **tokenization_kwargs)
-            engine_prompt = TokensPrompt(
-                prompt_token_ids=prompt_inputs["input_ids"],
-                token_type_ids=prompt_inputs.get("token_type_ids"))
-            parsed_prompts.append(engine_prompt)
-
-        self._validate_and_add_requests(
-            prompts=parsed_prompts,
-            params=pooling_params,
-            lora_request=lora_request,
-            prompt_adapter_request=prompt_adapter_request,
-        )
-
-        outputs = self._run_engine(use_tqdm=use_tqdm)
-        items = self.engine_class.validate_outputs(outputs,
-                                                   PoolingRequestOutput)
-
-        return [ScoringRequestOutput.from_base(item) for item in items]
+        if self.llm_engine.model_config.is_cross_encoder:
+            return self._cross_encoding_score(tokenizer, text_1, text_2,
+                                              truncate_prompt_tokens, use_tqdm,
+                                              lora_request,
+                                              prompt_adapter_request)
+        else:
+            return self._embedding_score(tokenizer, text_1, text_2,
+                                         truncate_prompt_tokens, use_tqdm,
+                                         lora_request, prompt_adapter_request)
 
     def start_profile(self) -> None:
         self.llm_engine.start_profile()

From dd66fd2b01e1195b7ccc8ffcd4b5d49ff1946a56 Mon Sep 17 00:00:00 2001
From: Mengqing Cao <cmq0113@163.com>
Date: Tue, 28 Jan 2025 14:11:05 +0800
Subject: [PATCH 406/462] [CI] fix pre-commit error (#12494)

Signed-off-by: Mengqing Cao <cmq0113@163.com>
---
 vllm/attention/ops/nki_flash_attn.py   | 37 +++++++++++++++++---------
 vllm/spec_decode/spec_decode_worker.py |  8 +++---
 2 files changed, 29 insertions(+), 16 deletions(-)

diff --git a/vllm/attention/ops/nki_flash_attn.py b/vllm/attention/ops/nki_flash_attn.py
index b9765b0f0283d..9de4ef7f5a140 100644
--- a/vllm/attention/ops/nki_flash_attn.py
+++ b/vllm/attention/ops/nki_flash_attn.py
@@ -106,11 +106,12 @@ def _flash_attention_core(
     assert (continuous_batching_mask
             is not None), "continuous_batching_mask input is required."
     if continuous_batching_mask is not None:
-        assert (logit_bias_tile is
-                None), "continuous_batching_mask does not support logit_bias!"
+        assert (
+            logit_bias_tile
+            is None), "continuous_batching_mask does not support logit_bias!"
 
     # mask are used to only apply computation to the lower half of the matrix,
-    # which reduce the arthimetic intensity by half
+    # which reduce the arithmetic intensity by half
     forward_mask = (q_tile_idx * B_P_SIZE >= local_k_large_tile_idx *
                     LARGE_TILE_SZ if use_causal_mask else None)
 
@@ -468,9 +469,11 @@ def flash_paged_attention(
                        block_in_partition)
                 loaded_v = nl.load(value_cache[block_tables_sbuf[v_i, j], :,
                                                head_id, :])
-                cur_v_tile[partition_idx,
-                           nl.ds(block_in_partition *
-                                 block_size, block_size), :, ] = loaded_v
+                cur_v_tile[
+                    partition_idx,
+                    nl.ds(block_in_partition * block_size, block_size),
+                    :,
+                ] = loaded_v
 
         cur_mask = nl.ndarray((par_dim(B_P_SIZE), LARGE_TILE_SZ),
                               dtype=mask.dtype)
@@ -601,20 +604,30 @@ def flash_paged_attention(
             )
 
             nl.store(
-                o[batch_id, head_id * q_h_per_k_h + i_q_h,
-                  nl.ds(i * B_P_SIZE, B_P_SIZE), :, ],
+                o[
+                    batch_id,
+                    head_id * q_h_per_k_h + i_q_h,
+                    nl.ds(i * B_P_SIZE, B_P_SIZE),
+                    :,
+                ],
                 out,
             )
             # maximum and summation statistics
             if return_debug_tensors:
                 nl.store(
-                    hbm_m_buffer[batch_id, head_id * q_h_per_k_h + i_q_h,
-                                 nl.ds(i * B_P_SIZE, B_P_SIZE), ],
+                    hbm_m_buffer[
+                        batch_id,
+                        head_id * q_h_per_k_h + i_q_h,
+                        nl.ds(i * B_P_SIZE, B_P_SIZE),
+                    ],
                     m_buffer[i, i_q_h, :, :],
                 )
                 nl.store(
-                    hbm_l_buffer[batch_id, head_id * q_h_per_k_h + i_q_h,
-                                 nl.ds(i * B_P_SIZE, B_P_SIZE), ],
+                    hbm_l_buffer[
+                        batch_id,
+                        head_id * q_h_per_k_h + i_q_h,
+                        nl.ds(i * B_P_SIZE, B_P_SIZE),
+                    ],
                     l_buffer[:, i, i_q_h],
                 )
                 nl.store(
diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py
index af1c4dfcebbc0..8d6d05cbaea75 100644
--- a/vllm/spec_decode/spec_decode_worker.py
+++ b/vllm/spec_decode/spec_decode_worker.py
@@ -870,10 +870,10 @@ def _verify_tokens(
             accepted_index = accepted_token_ids + 1  # Convert -1 to 0
             accepted_index = accepted_index.count_nonzero(dim=1).add_(-1)  # b
             # Drop non-terminal prefill chunks hidden states.
-            hidden_states = hidden_states[
-                accepted_index != VLLM_INVALID_TOKEN_ID]
-            accepted_index = accepted_index[
-                accepted_index != VLLM_INVALID_TOKEN_ID]
+            hidden_states = hidden_states[accepted_index !=
+                                          VLLM_INVALID_TOKEN_ID]
+            accepted_index = accepted_index[accepted_index !=
+                                            VLLM_INVALID_TOKEN_ID]
             assert len(accepted_index) == hidden_states.shape[0] == len(
                 terminal_metadata)
             index = accepted_index[:, None, None].expand(-1, 1,

From 8cbc4249758d399c0606ef4a1241e01176d0160b Mon Sep 17 00:00:00 2001
From: Roger Wang <136131678+ywang96@users.noreply.github.com>
Date: Tue, 28 Jan 2025 00:22:41 -0800
Subject: [PATCH 407/462] Update README.md with V1 alpha release (#12495)

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 4ed905bf7aa9d..5fd30f2b1b9d7 100644
--- a/README.md
+++ b/README.md
@@ -16,6 +16,7 @@ Easy, fast, and cheap LLM serving for everyone
 ---
 
 *Latest News* 🔥
+- [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
 - [2025/01] We hosted [the eighth vLLM meetup](https://lu.ma/zep56hui) with Google Cloud! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing).
 - [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone!
 - [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing), and Snowflake team [here](https://docs.google.com/presentation/d/1qF3RkDAbOULwz9WK5TOltt2fE9t6uIc_hVNLFAaQX6A/edit?usp=sharing).

From e29d4358ef054163b80dfb7e53ce3eb0e08d1328 Mon Sep 17 00:00:00 2001
From: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
Date: Tue, 28 Jan 2025 03:27:41 -0500
Subject: [PATCH 408/462] [V1] Include Engine Version in Logs (#12496)

Signed-off-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com>
---
 vllm/engine/llm_engine.py | 2 +-
 vllm/v1/engine/core.py    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index ab67ae29723cd..dd677300fc66a 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -230,7 +230,7 @@ def __init__(
         )
 
         logger.info(
-            "Initializing an LLM engine (v%s) with config: %s, "
+            "Initializing a V0 LLM engine (v%s) with config: %s, "
             "use_cached_outputs=%s, ",
             VLLM_VERSION,
             vllm_config,
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index cf94033a38d96..f50303bda58fd 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -42,7 +42,7 @@ def __init__(
     ):
         assert vllm_config.model_config.runner_type != "pooling"
 
-        logger.info("Initializing an LLM engine (v%s) with config: %s",
+        logger.info("Initializing a V1 LLM engine (v%s) with config: %s",
                     VLLM_VERSION, vllm_config)
 
         # Setup Model.

From 2079e43beecc486a607c9d79ab691e0e4563aa11 Mon Sep 17 00:00:00 2001
From: Sebastian Schoennenbeck <sebastian.schoennenbeck@comma-soft.com>
Date: Tue, 28 Jan 2025 11:56:45 +0100
Subject: [PATCH 409/462] [Core] Make raw_request optional in ServingCompletion
 (#12503)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Sebastian Schönnenbeck <sebastian.schoennenbeck@comma-soft.com>
---
 vllm/entrypoints/openai/serving_completion.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py
index b0179f78bd635..13c3926368890 100644
--- a/vllm/entrypoints/openai/serving_completion.py
+++ b/vllm/entrypoints/openai/serving_completion.py
@@ -58,7 +58,7 @@ def __init__(
     async def create_completion(
         self,
         request: CompletionRequest,
-        raw_request: Request,
+        raw_request: Optional[Request] = None,
     ) -> Union[AsyncGenerator[str, None], CompletionResponse, ErrorResponse]:
         """Completion API similar to OpenAI's API.
 
@@ -137,7 +137,7 @@ async def create_completion(
                                  lora_request=lora_request,
                                  prompt_adapter_request=prompt_adapter_request)
 
-                trace_headers = (await
+                trace_headers = (None if raw_request is None else await
                                  self._get_trace_headers(raw_request.headers))
 
                 if isinstance(sampling_params, BeamSearchParams):

From 335a4343bc36cb14068f783c7329a2711f6d70c7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= <dtrifiro@redhat.com>
Date: Tue, 28 Jan 2025 14:06:21 +0000
Subject: [PATCH 410/462] Dockerfile.ubi: fix build

the inclusion of the cumem allocator requires linking to the CUDA
driver, which isn't in the library paths, so we need to create a
symlink to it.

https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#common-instructions-for-rhel-rocky
---
 Dockerfile.ubi | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/Dockerfile.ubi b/Dockerfile.ubi
index 2e5ad2967c534..54ba9ceb8a78b 100644
--- a/Dockerfile.ubi
+++ b/Dockerfile.ubi
@@ -42,13 +42,15 @@ FROM python-install as cuda-base
 RUN curl -Lo /etc/yum.repos.d/cuda-rhel9.repo \
         https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo
 
+ENV CUDA_HOME="/usr/local/cuda" \
+    PATH="${CUDA_HOME}/bin:${PATH}"
+ENV LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${CUDA_HOME}/lib64/stubs/:${CUDA_HOME}/extras/CUPTI/lib64:${LD_LIBRARY_PATH}"
 RUN microdnf install -y \
         cuda-nvcc-12-4 cuda-nvtx-12-4 cuda-libraries-devel-12-4 && \
-    microdnf clean all
+    microdnf clean all && \
+    ln -s ${CUDA_HOME}/lib64/stubs/libcuda.so /usr/lib64/
+
 
-ENV CUDA_HOME="/usr/local/cuda" \
-    PATH="${CUDA_HOME}/bin:${PATH}" \
-    LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${CUDA_HOME}/extras/CUPTI/lib64:${LD_LIBRARY_PATH}"
 
 ## Python cuda base #################################################################
 FROM cuda-base AS python-cuda-base

From 8f58a5135874770ac8429f4772d7f92fe33094e5 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Wed, 29 Jan 2025 00:25:05 +0800
Subject: [PATCH 411/462] [VLM] Merged multi-modal processor and V1 support for
 Qwen-VL (#12504)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 docs/source/models/supported_models.md        |   2 +-
 .../multimodal/processing/test_common.py      |  64 +-
 .../models/multimodal/processing/test_qwen.py | 144 ----
 vllm/model_executor/models/qwen.py            | 654 ++++++++++--------
 4 files changed, 387 insertions(+), 477 deletions(-)
 delete mode 100644 tests/models/multimodal/processing/test_qwen.py

diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md
index 8cdc663a0320f..e59150cdd3b83 100644
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -745,7 +745,7 @@ See [this page](#generative-models) for more information on how to use generativ
   - `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc.
   - ✅︎
   - ✅︎
-  -
+  - ✅︎
 * - `Qwen2AudioForConditionalGeneration`
   - Qwen2-Audio
   - T + A<sup>+</sup>
diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py
index fe5b733c750a8..b575ec6acbef3 100644
--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
@@ -16,7 +16,6 @@
 
 def _test_processing_correctness(
     model_id: str,
-    modalities: dict[str, bool],
     hit_rate: float,
     num_batches: int,
     simplify_rate: float,
@@ -25,11 +24,6 @@ def _test_processing_correctness(
     model_info.check_available_online(on_fail="skip")
     model_info.check_transformers_version(on_fail="skip")
 
-    limit_mm_per_prompt = {
-        modality: 3 if supports_multi else 1
-        for modality, supports_multi in modalities.items()
-    }
-
     model_config = ModelConfig(
         model_id,
         task="auto",
@@ -40,18 +34,29 @@ def _test_processing_correctness(
         dtype="float16",
         revision=None,
         hf_overrides=model_info.hf_overrides,
-        limit_mm_per_prompt=limit_mm_per_prompt,
     )
 
     model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
     factories = MULTIMODAL_REGISTRY._processor_factories[model_cls]
     ctx = InputProcessingContext(
         model_config,
-        tokenizer=cached_get_tokenizer(model_config.tokenizer),
+        tokenizer=cached_get_tokenizer(
+            model_config.tokenizer,
+            trust_remote_code=model_info.trust_remote_code,
+        ),
     )
     # Ensure that it can fit all of the data
     cache = ProcessingCache(capacity=1 << 30)
 
+    processing_info = factories.info(ctx)
+    supported_mm_limits = processing_info.get_supported_mm_limits()
+    limit_mm_per_prompt = {
+        modality: 3 if limit is None else limit
+        for modality, limit in supported_mm_limits.items()
+    }
+
+    model_config.get_multimodal_config().limit_per_prompt = limit_mm_per_prompt
+
     baseline_processor = factories.build_processor(ctx, cache=None)
     cached_processor = factories.build_processor(ctx, cache=cache)
     dummy_inputs = baseline_processor.dummy_inputs
@@ -82,8 +87,8 @@ def _test_processing_correctness(
         mm_data = {
             k:
             [(input_to_hit[k] if rng.rand() < hit_rate else input_factory[k]())
-             for _ in range(rng.randint(limit_mm_per_prompt[k]))]
-            for k in modalities
+             for _ in range(rng.randint(limit))]
+            for k, limit in limit_mm_per_prompt.items()
         }
 
         mm_counts = {k: len(vs) for k, vs in mm_data.items()}
@@ -135,21 +140,22 @@ def _test_processing_correctness(
 
 # yapf: disable
 # True if the model supports multiple data items of the modality per request
-@pytest.mark.parametrize(("model_id", "modalities"), [
-    ("rhymes-ai/Aria", {"image": True}),
-    ("Salesforce/blip2-opt-2.7b", {"image": False}),
-    ("facebook/chameleon-7b", {"image": False}),
-    ("deepseek-ai/deepseek-vl2-tiny", {"image": True}),
-    ("adept/fuyu-8b", {"image": False}),
-    ("llava-hf/llava-1.5-7b-hf", {"image": True}),
-    ("llava-hf/llava-v1.6-mistral-7b-hf", {"image": True}),
-    ("llava-hf/LLaVA-NeXT-Video-7B-hf", {"video": False}),
-    ("llava-hf/llava-onevision-qwen2-0.5b-ov-hf", {"image": True, "video": True}),  # noqa: E501
-    ("TIGER-Lab/Mantis-8B-siglip-llama3", {"image": True}),
-    ("mistral-community/pixtral-12b", {"image": True}),
-    ("Qwen/Qwen2-VL-2B-Instruct", {"image": True, "video": True}),
-    ("Qwen/Qwen2-Audio-7B-Instruct", {"audio": True}),
-    ("fixie-ai/ultravox-v0_3", {"audio": True}),
+@pytest.mark.parametrize("model_id", [
+    "rhymes-ai/Aria",
+    "Salesforce/blip2-opt-2.7b",
+    "facebook/chameleon-7b",
+    "deepseek-ai/deepseek-vl2-tiny",
+    "adept/fuyu-8b",
+    "llava-hf/llava-1.5-7b-hf",
+    "llava-hf/llava-v1.6-mistral-7b-hf",
+    "llava-hf/LLaVA-NeXT-Video-7B-hf",
+    "llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
+    "TIGER-Lab/Mantis-8B-siglip-llama3",
+    "mistral-community/pixtral-12b",
+    "Qwen/Qwen-VL-Chat",
+    "Qwen/Qwen2-VL-2B-Instruct",
+    "Qwen/Qwen2-Audio-7B-Instruct",
+    "fixie-ai/ultravox-v0_3",
 ])
 @pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
 @pytest.mark.parametrize("num_batches", [32])
@@ -157,14 +163,12 @@ def _test_processing_correctness(
 # yapf: enable
 def test_processing_correctness(
     model_id: str,
-    modalities: dict[str, bool],
     hit_rate: float,
     num_batches: int,
     simplify_rate: float,
 ):
     _test_processing_correctness(
         model_id,
-        modalities,
         hit_rate=hit_rate,
         num_batches=num_batches,
         simplify_rate=simplify_rate,
@@ -172,16 +176,13 @@ def test_processing_correctness(
 
 
 # yapf: disable
-@pytest.mark.parametrize(("model_id", "modalities"), [
-    ("microsoft/Phi-3-vision-128k-instruct", {"image": True}),
-])
+@pytest.mark.parametrize("model_id", ["microsoft/Phi-3-vision-128k-instruct"])
 @pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
 @pytest.mark.parametrize("num_batches", [32])
 @pytest.mark.parametrize("simplify_rate", [1.0])
 # yapf: enable
 def test_processing_correctness_phi3v(
     model_id: str,
-    modalities: dict[str, bool],
     hit_rate: float,
     num_batches: int,
     simplify_rate: float,
@@ -195,7 +196,6 @@ def test_processing_correctness_phi3v(
 
     _test_processing_correctness(
         model_id,
-        modalities,
         hit_rate=hit_rate,
         num_batches=num_batches,
         simplify_rate=simplify_rate,
diff --git a/tests/models/multimodal/processing/test_qwen.py b/tests/models/multimodal/processing/test_qwen.py
deleted file mode 100644
index af0ace711ba3e..0000000000000
--- a/tests/models/multimodal/processing/test_qwen.py
+++ /dev/null
@@ -1,144 +0,0 @@
-"""Tests for Qwen's multimodal preprocessing kwargs."""
-from typing import Dict, List, Union
-
-import pytest
-import torch
-from PIL.Image import Image
-
-from vllm.inputs import InputContext, token_inputs
-from vllm.multimodal import MultiModalKwargs
-from vllm.multimodal.utils import cached_get_tokenizer
-
-from ....conftest import IMAGE_ASSETS
-from ...utils import build_model_context
-
-### Multimodal preprocessing tests
-SAMPLE_IMAGE = IMAGE_ASSETS[0].pil_image
-# These values are specific to Qwen-VL/Chat; we can get these from the model
-# config also, but they are hardcoded here to keep the parameterize/fixtures
-# easy to read.
-IMG_START_ID = 151857
-IMG_END_ID = 151858
-IMG_PAD_ID = 151859
-TOKS_PER_IMG = 256
-VIS_ENC_DIM = 4096
-IMG_SIZE = 448
-
-
-@pytest.fixture()
-def input_mapper_for_qwen():
-    # Lazy import to avoid initializing CUDA during test collection
-    from vllm.model_executor.models.qwen import input_mapper_for_qwen
-    return input_mapper_for_qwen
-
-
-@pytest.fixture()
-def input_processor_for_qwen():
-    # Lazy import to avoid initializing CUDA during test collection
-    from vllm.model_executor.models.qwen import input_processor_for_qwen
-    return input_processor_for_qwen
-
-
-@pytest.fixture()
-def qwen_vl_context() -> InputContext:
-    """Get an InputContext for Qwen-VL."""
-    return build_model_context(model_name="Qwen/Qwen-VL",
-                               trust_remote_code=True)
-
-
-# Happy path tests for single/multi-image scenarios for the multimodal
-# input processor and mapper, respectively
-@pytest.mark.parametrize("num_images", [1, 2])
-def test_input_processor_valid_mm_data(input_processor_for_qwen,
-                                       qwen_vl_context: InputContext,
-                                       num_images: int):
-    """Happy cases for image inputs to Qwen's multimodal input processor."""
-    prompt = "".join(
-        [f"Picture {num}: <img></img>\n" for num in range(1, num_images + 1)])
-    inputs = token_inputs(
-        prompt=prompt,
-        # When processing multimodal data for a multimodal model, the qwen
-        # input processor will overwrite the provided prompt_token_ids with
-        # the image prompts
-        prompt_token_ids=[],
-        multi_modal_data={"image": torch.rand(num_images, TOKS_PER_IMG, 4096)},
-    )
-    proc_inputs = input_processor_for_qwen(qwen_vl_context, inputs)
-    assert isinstance(proc_inputs, dict)
-
-    # Each image should have one start / stop and a fixed context of 256
-    proc_tokens = proc_inputs["prompt_token_ids"]
-    assert proc_tokens.count(IMG_START_ID) == num_images
-    assert proc_tokens.count(IMG_END_ID) == num_images
-    assert proc_tokens.count(IMG_PAD_ID) == num_images * TOKS_PER_IMG
-
-
-@pytest.mark.parametrize(
-    "img_data,expected_shape",
-    [
-        # single / multi-image
-        (SAMPLE_IMAGE, (1, 3, IMG_SIZE, IMG_SIZE)),
-        (2 * [SAMPLE_IMAGE], (2, 3, IMG_SIZE, IMG_SIZE)),
-        # single / multi-image embeddings
-        (torch.rand(
-            (TOKS_PER_IMG, VIS_ENC_DIM)), (1, TOKS_PER_IMG, VIS_ENC_DIM)),
-        (torch.rand(
-            (1, TOKS_PER_IMG, VIS_ENC_DIM)), (1, TOKS_PER_IMG, VIS_ENC_DIM)),
-        (torch.rand(
-            (2, TOKS_PER_IMG, VIS_ENC_DIM)), (2, TOKS_PER_IMG, VIS_ENC_DIM)),
-    ])
-def test_input_mapper_valid_mm_data(input_mapper_for_qwen,
-                                    qwen_vl_context: InputContext,
-                                    img_data: Union[torch.Tensor, List[Image],
-                                                    Image],
-                                    expected_shape: List[int]):
-    """Happy cases for image inputs to Qwen's multimodal input mapper."""
-    mapped_img_data = input_mapper_for_qwen(qwen_vl_context, img_data)
-    # Ensure that we get the appropriately shaped pixel_values
-    # for images and image embeddings, respectively.
-    assert isinstance(mapped_img_data, MultiModalKwargs)
-    assert "pixel_values" in mapped_img_data
-    assert mapped_img_data["pixel_values"].shape == expected_shape
-
-
-# Sad path tests for the multimodal input processor and mapper, respectively
-@pytest.mark.parametrize("mm_data", [
-    {
-        "image": torch.rand(5)
-    },
-    {
-        "image": torch.rand((5, 5, 5, 5, 5))
-    },
-])
-def test_input_processor_invalid_mm_data(input_processor_for_qwen,
-                                         qwen_vl_context: InputContext,
-                                         mm_data: Dict[str, torch.Tensor]):
-    """Test sad cases validated in Qwen's multimodal input processor."""
-    tokenizer = cached_get_tokenizer(qwen_vl_context.model_config.tokenizer,
-                                     trust_remote_code=True)
-    prompt = "Picture 1: <img></img>\n"
-    prompt_token_ids = tokenizer.encode(prompt)
-    inputs = token_inputs(prompt=prompt,
-                          prompt_token_ids=prompt_token_ids,
-                          multi_modal_data=mm_data)
-    # Should fail since we have too many or too few dimensions for embeddings
-    with pytest.raises(ValueError):
-        input_processor_for_qwen(qwen_vl_context, inputs)
-
-
-@pytest.mark.parametrize(
-    "img_data",
-    [
-        # Wrong context length
-        torch.rand((1, TOKS_PER_IMG + 10, VIS_ENC_DIM)),
-        # Wrong visual encoder output size
-        torch.rand((1, TOKS_PER_IMG, VIS_ENC_DIM + 10)),
-    ])
-def test_input_mapper_invalid_mm_data(
-    input_mapper_for_qwen,
-    qwen_vl_context: InputContext,
-    img_data: Union[torch.Tensor, List[Image], Image],
-):
-    """Sad cases validated in Qwen VL's multimodal input mapper."""
-    with pytest.raises(ValueError):
-        input_mapper_for_qwen(qwen_vl_context, img_data)
diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py
index 1345b381f0a99..86a9d3089c3ee 100644
--- a/vllm/model_executor/models/qwen.py
+++ b/vllm/model_executor/models/qwen.py
@@ -4,26 +4,28 @@
 # LICENSE: https://huggingface.co/Qwen/Qwen-7B/blob/main/LICENSE
 """Inference-only QWen model compatible with HuggingFace weights."""
 
+import copy
 import math
 import re
-from functools import partial
-from typing import (Any, Callable, Dict, Iterable, List, Literal, Mapping,
-                    Optional, Set, Tuple, TypedDict, Union)
+import unicodedata
+from functools import lru_cache, partial
+from typing import (AbstractSet, Any, Callable, Collection, Dict, Iterable,
+                    List, Literal, Mapping, Optional, Set, Tuple, TypedDict,
+                    Union)
 
-import numpy as np
 import torch
-from PIL import Image
 from torch import nn
 from torchvision import transforms
 from torchvision.transforms import InterpolationMode
-from transformers import PretrainedConfig
+from transformers import (BatchFeature, PretrainedConfig, PreTrainedTokenizer,
+                          TensorType)
+from transformers.image_utils import ImageInput
+from transformers.tokenization_utils_base import TextInput
 
 from vllm.attention import Attention, AttentionMetadata
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
-from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
-                         InputContext, token_inputs)
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import SiluAndMul, get_act_fn
 from vllm.model_executor.layers.layernorm import RMSNorm
@@ -42,15 +44,20 @@
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
-from vllm.multimodal.utils import cached_get_tokenizer
-from vllm.sequence import IntermediateTensors, SequenceData
-from vllm.utils import is_list_of
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
+                                    NestedTensors)
+from vllm.multimodal.parse import MultiModalDataItems
+from vllm.multimodal.processing import (BaseMultiModalProcessor,
+                                        BaseProcessingInfo, PromptReplacement,
+                                        PromptReplacementDetails)
+from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
+from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP
 from .utils import (flatten_bn, is_pp_missing_parameter,
                     make_empty_intermediate_tensors_factory, make_layers,
-                    maybe_prefix)
+                    maybe_prefix, merge_multimodal_embeddings)
 
 logger = init_logger(__name__)
 
@@ -353,8 +360,10 @@ def __init__(self,
         self.ln_post = norm_layer(output_dim)
         self.proj = nn.Parameter(
             (output_dim**-0.5) * torch.randn(output_dim, output_dim))
+
         self.image_start_id = image_start_id
         self.image_end_id = image_start_id + 1
+        self.image_pad_id = image_start_id + 2
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         x = x.to(
@@ -383,21 +392,6 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
 
         return x
 
-    def get_image_positions(self,
-                            input_ids: torch.Tensor) -> Optional[torch.Tensor]:
-        """Given the input IDs, extracts start/stop points corresponding to
-        images.
-
-        args:
-        Returns:
-            Optional torch tensor corresponding to start/stop pairs of images.
-        """
-        if torch.any(input_ids == self.image_start_id):
-            bos_pos = torch.where(input_ids == self.image_start_id)
-            eos_pos = torch.where(input_ids == self.image_end_id)
-            return torch.stack((bos_pos[0], eos_pos[0]), dim=1)
-        return None
-
 
 class QWenMLP(nn.Module):
     """MLP for the language component of the Qwen model, which contains a
@@ -579,9 +573,12 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self.make_empty_intermediate_tensors = (
             make_empty_intermediate_tensors_factory(
                 ["hidden_states", "residual"], config.hidden_size))
-        self.visual = VisionTransformer(**config.visual,
-                                        quant_config=quant_config) if hasattr(
-                                            config, "visual") else None
+
+        if (vision_config := getattr(config, "visual", None)):
+            self.visual = VisionTransformer(**vision_config,
+                                            quant_config=quant_config)
+        else:
+            self.visual = None
 
     def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.wte(input_ids)
@@ -593,38 +590,13 @@ def forward(
         kv_caches: List[torch.Tensor],
         attn_metadata: AttentionMetadata,
         intermediate_tensors: Optional[IntermediateTensors],
-        pixel_values: Optional[QwenImageInputs],
         inputs_embeds: Optional[torch.Tensor] = None,
     ) -> Union[torch.Tensor, IntermediateTensors]:
-        img_pos = None
-        # If pixel / visual embeddings are provided, this is a visual model
-        if pixel_values is not None and self.visual is not None:
-            if pixel_values["type"] != "image_embeds":
-                image_embeds = self.visual(pixel_values["data"])
-            else:
-                image_embeds = pixel_values["data"]
-
-            # features should be of shape (# images, 256, hidden_dim)
-            img_pos = self.visual.get_image_positions(input_ids)
-            if isinstance(
-                    img_pos,
-                    np.ndarray) and img_pos.shape[0] != image_embeds.shape[0]:
-                raise ValueError(
-                    f"Number of placeholders: {img_pos.shape[0]} "
-                    f"does not match number of images {image_embeds.shape[0]}."
-                )
-
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
             else:
                 hidden_states = self.get_input_embeddings(input_ids)
-            hidden_states = self.wte(input_ids)
-            # Merge the image embeddings into the hidden states if actually have
-            # visual features and the corresponding image tokens
-            if img_pos is not None:
-                for idx, (img_bos, img_eos) in enumerate(img_pos):
-                    hidden_states[img_bos + 1:img_eos] = image_embeds[idx]
             residual = None
         else:
             assert intermediate_tensors is not None
@@ -648,159 +620,9 @@ def forward(
         return hidden_states
 
 
-def get_image_text(image_num: int, padding: bool) -> str:
-    """Retrieves a placeholder text that when tokenized, will be expanded with
-    image pads.
-
-    Args:
-        image_num: The number of the image that we want a text prompt for.
-            Images should be indexed starting at 1.
-        padding: Whether or not padding should be manually added.
-
-    Returns:
-        Text placeholder prompt for the image being considered.
-    """
-    image_start = f"Picture {image_num}: {IMG_START}"
-    image_end = f"{IMG_END}\n"
-    if not padding:
-        return f"{image_start}{image_end}"
-    return f"{image_start}{MAX_QWEN_IMG_TOKENS * IMG_PAD}{image_end}"
-
-
-def input_processor_for_qwen(ctx: InputContext,
-                             inputs: DecoderOnlyInputs) -> DecoderOnlyInputs:
-    """Processes the inputs, which may or may not be multimodal.
-    Multimodal inputs will only be processed if the model has a "visual"
-    component in its model config, otherwise they'll be ignored.
-
-    Args:
-        ctx: Context of the loaded model.
-        inputs: LLM inputs which may have a multi_modal_data attribute.
-
-    Returns:
-        If the model is language only or not multimodal inputs were provided,
-        returns inputs unmodified. Otherwise, processes the multimodal
-        images / image embeddings and adds the fixed-length image placeholders.
-    """
-    multi_modal_data = inputs.get("multi_modal_data")
-
-    # Only process images if we have multimodal data and a visual config
-    hf_config = ctx.get_hf_config()
-    if (multi_modal_data is None or "image" not in multi_modal_data
-            or not hasattr(hf_config, "visual")):
-        return inputs
-
-    prompt = inputs.get("prompt")
-    prompt_token_ids = inputs["prompt_token_ids"]
-    model_config = ctx.model_config
-    tokenizer = cached_get_tokenizer(
-        model_config.tokenizer,
-        trust_remote_code=model_config.trust_remote_code)
-    image_data = multi_modal_data["image"]
-    if isinstance(image_data, torch.Tensor):
-        num_dims = len(image_data.shape)
-        if num_dims < 2 or num_dims > 3:
-            raise ValueError(
-                f"Expected img embeds to be have 3 dimensions, got {num_dims}")
-        num_images = 1 if num_dims == 2 else image_data.shape[0]
-    elif isinstance(image_data, Image.Image):
-        num_images = 1
-    elif is_list_of(image_data, Image.Image):
-        num_images = len(image_data)
-    else:
-        raise TypeError(f"Invalid image type: {type(image_data)}")
-
-    if prompt is None:
-        prompt = tokenizer.decode(prompt_token_ids)
-
-    # Drops anything between <img>/</img> tags; encoding with the tokenizer
-    # will automatically add the image pads for the context.
-    new_prompt, num_matched_images = re.subn(
-        r"(Picture \d*: <img>).*?(<\/img>\n)",
-        r"\1\2",
-        prompt,
-    )
-
-    if num_matched_images != num_images:
-        logger.warning(
-            "Number of matched image placeholders %s doesn't match the number "
-            "of expected images %s; check your placeholder formatting.",
-            num_matched_images, num_images)
-
-    new_prompt_token_ids = tokenizer.encode(new_prompt)
-
-    return token_inputs(prompt=new_prompt,
-                        prompt_token_ids=new_prompt_token_ids,
-                        multi_modal_data=multi_modal_data)
-
-
-def input_mapper_for_qwen(ctx: InputContext, data: object) -> MultiModalKwargs:
-    """Maps the input data to its MultiModalKwargs (if any).
-
-    Args:
-        ctx: Context of the loaded model.
-        data: data potentially containing image/image embeddings to be mapped
-            to pixel_values in .forward() for a visual QWenLMHeadModel model.
-
-    Returns:
-        MultiModalKwargs containing the stacked normalized images tensor or
-        image embeddings.
-    """
-    # Early exit if we have provided an image to a language only Qwen model
-    hf_config = ctx.get_hf_config()
-    if not hasattr(hf_config, "visual"):
-        logger.warning(
-            "Images were provided but this model has no visual config; "
-            "multimodal inputs will not be forwarded to the model.")
-        return MultiModalKwargs()
-
-    model_config = ctx.model_config
-    tokenizer = cached_get_tokenizer(
-        model_config.tokenizer,
-        trust_remote_code=model_config.trust_remote_code)
-
-    image_pair_tok = tokenizer.encode(IMG_START + IMG_END,
-                                      add_special_tokens=False,
-                                      return_tensors="pt").squeeze()
-    image_start_id = image_pair_tok[0]
-    image_end_id = image_pair_tok[-1]
-    if (image_start_id + 1) != image_end_id:
-        raise ValueError(
-            f"Found image end ID {image_end_id}, but expected {IMG_START} + 1")
-    if len(image_pair_tok) != (MAX_QWEN_IMG_TOKENS + 2):
-        raise ValueError(
-            f"Expected image context length of {MAX_QWEN_IMG_TOKENS}, "
-            f"but got {image_pair_tok - 2}")
-
-    hf_config = ctx.get_hf_config()
-    image_size = hf_config.visual["image_size"]
-    img_emb_size = hf_config.visual["output_dim"]
-
-    if isinstance(data, torch.Tensor):
-        # It's expected that our values have already been processed
-        # by the visual transformer; shape is expected to be:
-        # (# images, 256, hidden_size)
-        if len(data.shape) == 2:
-            # Assume only one image embed was provided; unsqueeze the extra dim
-            data = data.unsqueeze(0)
-        if len(data.shape) != 3 or data.shape[
-                1] != MAX_QWEN_IMG_TOKENS or data.shape[2] != img_emb_size:
-            raise ValueError(
-                "Expected image embeds to be a tensor of shape"
-                f"[# images, {MAX_QWEN_IMG_TOKENS}, {img_emb_size}], but "
-                f"received shape [{data.shape}]")
-        pixel_values = data
-    else:
-        transform = build_normalization_transform(image_size)
-        if not isinstance(data, (list, tuple)):
-            data = [data]
-        transformed_images = [transform(datum) for datum in data]
-        pixel_values = torch.stack(transformed_images, dim=0)
-    return MultiModalKwargs({"pixel_values": pixel_values})
-
-
 def build_normalization_transform(image_size: int) -> transforms.Compose:
-    """Builds a normalization transform which can be applied to one or
+    """
+    Build a normalization transform which can be applied to one or
     more input images from which we want to extract visual features.
 
     Args:
@@ -817,62 +639,251 @@ def build_normalization_transform(image_size: int) -> transforms.Compose:
     ])
 
 
-def dummy_data_for_qwen(
-    ctx: InputContext,
-    seq_len: int,
-    mm_counts: Mapping[str, int],
-) -> DummyData:
-    """Build dummy data for warming up Qwen models; this will only contain text
-    matching the defaults for VLLM unless the model has a visual config.
+@lru_cache(maxsize=1)
+def _get_tokenizer_without_image_pad(
+        tokenizer: PreTrainedTokenizer) -> PreTrainedTokenizer:
+    """
+    The logic of adding image pad tokens should only be applied in
+    :class:`QWenVLProcessor`, so they are patched out here.
 
-    Args:
-        ctx: Context of the loaded model.
-        seq_len: Number of tokens in the text sequence.
-        mm_counts: multimodal data counts.
-    
-    Returns:
-        Tuple containing sequential and multimodal data.
+    The definition of the wrapped tokenizer can be found here:
+    https://huggingface.co/Qwen/Qwen-VL/blob/main/tokenization_qwen.py
+    """
+    new_tokenizer = copy.deepcopy(tokenizer)
+
+    class TokenizerWithoutImagePad(tokenizer.__class__):  # type: ignore
+
+        def tokenize(
+            self,
+            text: str,
+            allowed_special: Union[AbstractSet[str], str] = "all",
+            disallowed_special: Union[Collection[str], str] = (),
+            **kwargs,
+        ) -> list[Union[bytes, str]]:
+            text = unicodedata.normalize("NFC", text)
+
+            return [
+                self.decoder[t] for t in self.tokenizer.encode(
+                    text,
+                    allowed_special=allowed_special,
+                    disallowed_special=disallowed_special,
+                )
+            ]
+
+        def _decode(
+            self,
+            token_ids: Union[int, List[int]],
+            skip_special_tokens: bool = False,
+            errors: Optional[str] = None,
+            **kwargs,
+        ) -> str:
+            if isinstance(token_ids, int):
+                token_ids = [token_ids]
+
+            return self.tokenizer.decode(
+                token_ids,
+                errors=errors or self.errors,
+            )
+
+    TokenizerWithoutImagePad.__name__ = \
+        f"{tokenizer.__class__.__name__}WithoutImagePad"
+
+    new_tokenizer.__class__ = TokenizerWithoutImagePad
+    return new_tokenizer
+
+
+class QWenVLProcessor:
+    """
+    This model doesn't define its own HF processor,
+    so we implement our own one here.
+
+    We call the wrapped tokenizer to automatically insert image pad tokens:
+    https://huggingface.co/Qwen/Qwen-VL/blob/main/tokenization_qwen.py#L245
+
+    The image processor is defined here:
+    https://huggingface.co/Qwen/Qwen-VL/blob/main/visual.py#L354
     """
-    hf_config = ctx.get_hf_config()
-
-    # The presence of a visual config indicates this is a multimodal model.
-    # If we don't have it, the model is considered an LLM for warmup purposes.
-    if not hasattr(hf_config, "visual"):
-        seq_data = SequenceData.from_prompt_token_counts((0, seq_len))
-        mm_data = None
-        return DummyData(seq_data, mm_data)
-
-    # We have a visual component - use images to warm up
-    num_images = mm_counts["image"]
-    model_config = ctx.model_config
-    tokenizer = cached_get_tokenizer(
-        model_config.tokenizer,
-        trust_remote_code=model_config.trust_remote_code)
-
-    # Build the image prompts with no imgpads; the tokenizer will add img pads
-    image_prompt = ''.join(
-        [get_image_text(idx, False) for idx in range(1, num_images + 1)])
-    toks = tokenizer.encode(image_prompt, add_special_tokens=False)
-
-    # Make sure we actually get the fixed context size per tok padding
-    num_pads = toks.count(tokenizer.encode(IMG_PAD)[0])
-    if num_pads != (num_images * MAX_QWEN_IMG_TOKENS):
-        raise ValueError(
-            f"Tokenized dummy data should encode {MAX_QWEN_IMG_TOKENS} pads"
-            f" per image, but got {num_pads} pads for {num_images} image(s)"
-            " in total. Are you using a qwen tokenizer?")
-
-    # Ensure the number of tokens is at minimum the sequence length provided
-    if len(toks) < seq_len:
-        toks += [0] * (seq_len - len(toks))
-
-    seq_data = SequenceData.from_seqs(toks)
-
-    # Build the input images; width/height doesn't actually matter here since
-    # the data will get resized and the # of tokens per image is constant
-    image = Image.new("RGB", (224, 224), color=0)
-    mm_data = {"image": image if num_images == 1 else [image] * num_images}
-    return DummyData(seq_data, mm_data)
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        tokenizer: PreTrainedTokenizer,
+    ) -> None:
+        super().__init__()
+
+        self.config = config
+        self.tokenizer = tokenizer
+
+        if hasattr(self.config, "visual"):
+            self.image_transform = build_normalization_transform(
+                config.visual["image_size"])
+        else:
+            self.image_transform = None
+
+        special_tokens: dict[str,
+                             int] = tokenizer.special_tokens  # type: ignore
+        self.img_start_id = special_tokens[IMG_START]
+        self.img_end_id = special_tokens[IMG_END]
+
+    def __call__(
+        self,
+        text: Optional[Union[TextInput, list[TextInput]]] = None,
+        images: Optional[Union[ImageInput, list[ImageInput]]] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+    ) -> BatchFeature:
+        if text is None:
+            text = []
+        if not isinstance(text, list):
+            text = [text]
+        if images is None:
+            images = []
+        if not isinstance(images, list):
+            images = [images]
+
+        text_inputs = self.tokenizer(text)
+
+        if len(images) == 0:
+            image_inputs = {}
+        else:
+            if self.image_transform is None:
+                raise ValueError("This model does not support image inputs")
+
+            pixel_values = [self.image_transform(image) for image in images]
+            image_inputs = {"pixel_values": torch.stack(pixel_values)}
+
+        return BatchFeature(
+            {
+                **text_inputs,
+                **image_inputs,
+            },
+            tensor_type=return_tensors,
+        )
+
+
+class QWenVLProcessingInfo(BaseProcessingInfo):
+
+    def get_tokenizer(self) -> PreTrainedTokenizer:
+        tokenizer = self.ctx.tokenizer
+        assert isinstance(tokenizer, PreTrainedTokenizer)
+
+        return _get_tokenizer_without_image_pad(tokenizer)
+
+    def get_hf_processor(self) -> QWenVLProcessor:
+        tokenizer = self.ctx.tokenizer
+        assert isinstance(tokenizer, PreTrainedTokenizer)
+
+        return QWenVLProcessor(self.get_hf_config(), tokenizer)
+
+    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+        return {"image": None}
+
+    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
+        return {"image": self.get_num_image_tokens()}
+
+    def get_num_image_tokens(self) -> int:
+        return MAX_QWEN_IMG_TOKENS
+
+
+class QWenVLDummyInputsBuilder(BaseDummyInputsBuilder[QWenVLProcessingInfo]):
+
+    def get_dummy_processor_inputs(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> ProcessorInputs:
+        hf_config = self.info.get_hf_config()
+        if not hasattr(hf_config, "visual"):
+            return ProcessorInputs(prompt_text="", mm_data={})
+
+        vision_config = hf_config.visual
+
+        max_image_size = vision_config["image_size"]
+        num_images = mm_counts.get("image", 0)
+
+        mm_data = {
+            "image":
+            self._get_dummy_images(width=max_image_size,
+                                   height=max_image_size,
+                                   num_images=num_images)
+        }
+
+        return ProcessorInputs(
+            prompt_text="".join(f"Picture {i}: {IMG_START}{IMG_END}\n"
+                                for i in range(1, num_images + 1)),
+            mm_data=mm_data,
+        )
+
+
+class QWenVLMultiModalProcessor(BaseMultiModalProcessor[QWenVLProcessingInfo]):
+
+    def _call_hf_processor(
+        self,
+        prompt: str,
+        mm_data: Mapping[str, object],
+        mm_kwargs: Mapping[str, object],
+    ) -> BatchFeature:
+        # Drops anything between <img>/</img> tags; encoding with the tokenizer
+        # will automatically add the image pads for the context.
+        prompt, num_matched_images = re.subn(
+            r"(Picture \d*: <img>).*?(<\/img>\n)",
+            r"\1\2",
+            prompt,
+        )
+
+        image_data = mm_data.get("images")
+        if image_data is not None:
+            assert isinstance(image_data, list)
+
+            num_images = len(image_data)
+            if num_matched_images != num_images:
+                logger.warning(
+                    "Number of matched image placeholders %s doesn't match "
+                    "the number of expected images %s; check your placeholder "
+                    "formatting.", num_matched_images, num_images)
+
+        return super()._call_hf_processor(
+            prompt=prompt,
+            mm_data=mm_data,
+            mm_kwargs=mm_kwargs,
+        )
+
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        return dict(
+            pixel_values=MultiModalFieldConfig.batched("image"),
+            image_embeds=MultiModalFieldConfig.batched("image"),
+        )
+
+    def _get_prompt_replacements(
+        self,
+        mm_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, object],
+        out_mm_kwargs: MultiModalKwargs,
+    ) -> list[PromptReplacement]:
+        tokenizer = self.info.get_tokenizer()
+        special_tokens: dict[str,
+                             int] = tokenizer.special_tokens  # type: ignore
+
+        img_start_id = special_tokens[IMG_START]
+        img_end_id = special_tokens[IMG_END]
+        img_pad_id = special_tokens[IMG_PAD]
+
+        num_image_tokens = self.info.get_num_image_tokens()
+        image_tokens = [img_pad_id] * num_image_tokens
+
+        return [
+            PromptReplacement(
+                modality="image",
+                target=[img_start_id, img_end_id],
+                replacement=PromptReplacementDetails(
+                    full=[img_start_id] + image_tokens + [img_end_id],
+                    features=image_tokens,
+                ),
+            )
+        ]
 
 
 class QWenBaseModel(nn.Module, SupportsPP, SupportsLoRA):
@@ -898,38 +909,77 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self.make_empty_intermediate_tensors = (
             self.transformer.make_empty_intermediate_tensors)
 
-    def _get_image_input_type(
-            self,
-            pixel_values: Optional[torch.Tensor]) -> Optional[QwenImageInputs]:
-        """Determines if the provided pixel_values are normalized pixel values
-        or image embeddings.
+    def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor:
+        h = w = self.config.visual["image_size"]
+        expected_dims = (3, h, w)
+        actual_dims = tuple(data.shape[1:])
 
-        Args:
-            pixel_values: Optional data to processed into visual embeddings.
+        if actual_dims != expected_dims:
+            expected_expr = ("batch_size", *map(str, expected_dims))
+            raise ValueError(
+                f"The expected shape of pixel values is {expected_expr}. "
+                f"You supplied {tuple(data.shape)}.")
+
+        return data
+
+    def _parse_and_validate_image_input(
+            self, **kwargs: object) -> Optional[QwenImageInputs]:
+        pixel_values = kwargs.pop("pixel_values", None)
+        image_embeds = kwargs.pop("image_embeds", None)
+
+        if pixel_values is not None:
+            if not isinstance(pixel_values, torch.Tensor):
+                raise ValueError("Incorrect type of pixel values. "
+                                 f"Got type: {type(pixel_values)}")
+
+            return QwenImagePixelInputs(
+                type="pixel_values",
+                data=self._validate_pixel_values(
+                    flatten_bn(pixel_values, concat=True)),
+            )
+
+        if image_embeds is not None:
+            if not isinstance(image_embeds, torch.Tensor):
+                raise ValueError("Incorrect type of image embeddings. "
+                                 f"Got type: {type(image_embeds)}")
+
+            return QwenImageEmbeddingInputs(
+                type="image_embeds",
+                data=flatten_bn(image_embeds),
+            )
 
-        Returns:
-            None of the QwenImageInputs type used to determine whether or not
-            the visual transformer needs to process the pixel_values.
-        """
-        if pixel_values is not None and self.transformer.visual is not None:
-            pixel_values = flatten_bn(pixel_values)
-            if len(pixel_values.shape) == 3 and pixel_values.shape[
-                    1] == MAX_QWEN_IMG_TOKENS and pixel_values.shape[
-                        2] == self.config.visual["output_dim"]:
-                return QwenImageEmbeddingInputs(
-                    type="image_embeds",
-                    data=pixel_values,
-                )
-            else:
-                # If we have the wrong shape, assume we still need to process
-                return QwenImagePixelInputs(
-                    type="pixel_values",
-                    data=pixel_values,
-                )
         return None
 
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.transformer.get_input_embeddings(input_ids)
+    def _process_image_input(self,
+                             image_input: QwenImageInputs) -> torch.Tensor:
+        if image_input["type"] == "image_embeds":
+            return image_input["data"]
+
+        assert self.transformer.visual is not None
+        return self.transformer.visual(image_input["data"])
+
+    def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
+        image_input = self._parse_and_validate_image_input(**kwargs)
+        if image_input is None:
+            return None
+
+        vision_embeddings = self._process_image_input(image_input)
+        return vision_embeddings
+
+    def get_input_embeddings(
+        self,
+        input_ids: torch.Tensor,
+        multimodal_embeddings: Optional[NestedTensors] = None,
+    ) -> torch.Tensor:
+        inputs_embeds = self.transformer.get_input_embeddings(input_ids)
+
+        if multimodal_embeddings is not None:
+            assert self.transformer.visual is not None
+            inputs_embeds = merge_multimodal_embeddings(
+                input_ids, inputs_embeds, multimodal_embeddings,
+                self.transformer.visual.image_pad_id)
+
+        return inputs_embeds
 
     def forward(
         self,
@@ -938,18 +988,23 @@ def forward(
         kv_caches: List[torch.Tensor],
         attn_metadata: AttentionMetadata,
         intermediate_tensors: Optional[IntermediateTensors] = None,
-        pixel_values: Optional[torch.Tensor] = None,
         inputs_embeds: Optional[torch.Tensor] = None,
+        **kwargs: object,
     ) -> Union[torch.Tensor, IntermediateTensors]:
         if intermediate_tensors is not None:
+            inputs_embeds = None
+
+        # NOTE: In v1, inputs_embeds is always generated at model runner, this
+        # condition is for v0 compatibility.
+        elif inputs_embeds is None:
+            vision_embeddings = self.get_multimodal_embeddings(**kwargs)
+            inputs_embeds = self.get_input_embeddings(input_ids,
+                                                      vision_embeddings)
             input_ids = None
-            pixel_values = None
-        else:
-            pixel_values = self._get_image_input_type(pixel_values)
 
         hidden_states = self.transformer(input_ids, positions, kv_caches,
                                          attn_metadata, intermediate_tensors,
-                                         pixel_values, inputs_embeds)
+                                         inputs_embeds)
         return hidden_states
 
     def compute_logits(
@@ -1063,10 +1118,9 @@ def get_mm_mapping(self) -> MultiModelKeys:
             tower_model="transformer.visual.transformer")
 
 
-@MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_qwen)
-@MULTIMODAL_REGISTRY.register_max_image_tokens(MAX_QWEN_IMG_TOKENS)
-@INPUT_REGISTRY.register_dummy_data(dummy_data_for_qwen)
-@INPUT_REGISTRY.register_input_processor(input_processor_for_qwen)
+@MULTIMODAL_REGISTRY.register_processor(QWenVLMultiModalProcessor,
+                                        info=QWenVLProcessingInfo,
+                                        dummy_inputs=QWenVLDummyInputsBuilder)
 class QWenLMHeadModel(QWenBaseModel, SupportsMultiModal, SupportsLoRA):
     """
     QWenLMHeadModel is not only applicable to LLM  but also to VL, which is not 
@@ -1084,7 +1138,7 @@ def __new__(
         cls,
         vllm_config: VllmConfig,
         prefix: str = "",
-    ) -> None:
+    ) -> QWenBaseModel:
         config = vllm_config.model_config.hf_config
         # Initialize VL
         if hasattr(config, "visual"):

From 925d2f19089b50736ce5e0f2ba0c9b7f3da6fb15 Mon Sep 17 00:00:00 2001
From: Jun Duan <jun.duan.phd@outlook.com>
Date: Tue, 28 Jan 2025 11:37:10 -0500
Subject: [PATCH 412/462] [Doc] Fix typo for x86 CPU installation (#12514)

Signed-off-by: Jun Duan <jun.duan.phd@outlook.com>
---
 docs/source/getting_started/installation/cpu/x86.inc.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/getting_started/installation/cpu/x86.inc.md b/docs/source/getting_started/installation/cpu/x86.inc.md
index e4f99d3cebdf2..e0eaac5099305 100644
--- a/docs/source/getting_started/installation/cpu/x86.inc.md
+++ b/docs/source/getting_started/installation/cpu/x86.inc.md
@@ -18,7 +18,7 @@ vLLM initially supports basic model inferencing and serving on x86 CPU platform,
 :::
 
 ```{note}
-- AVX512_BF16 is an extension ISA provides native BF16 data type conversion and vector product instructions, will brings some performance improvement compared with pure AVX512. The CPU backend build script will check the host CPU flags to determine whether to enable AVX512_BF16.
+- AVX512_BF16 is an extension ISA provides native BF16 data type conversion and vector product instructions, which brings some performance improvement compared with pure AVX512. The CPU backend build script will check the host CPU flags to determine whether to enable AVX512_BF16.
 - If you want to force enable AVX512_BF16 for the cross-compilation, please set environment variable `VLLM_CPU_AVX512BF16=1` before the building.
 ```
 

From 3fd1fb63efb6c96f30237b12e2816b4f2c5323d0 Mon Sep 17 00:00:00 2001
From: Mark McLoughlin <markmc@redhat.com>
Date: Tue, 28 Jan 2025 16:38:38 +0000
Subject: [PATCH 413/462] [V1][Metrics] Hook up IterationStats for Prometheus
 metrics (#12478)

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
---
 tests/entrypoints/openai/test_metrics.py |  7 ++-
 vllm/v1/engine/async_llm.py              |  3 +-
 vllm/v1/metrics/loggers.py               | 68 ++++++++++++++++++++----
 3 files changed, 66 insertions(+), 12 deletions(-)

diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py
index 469a5fb039fb6..64deaedf0f2c1 100644
--- a/tests/entrypoints/openai/test_metrics.py
+++ b/tests/entrypoints/openai/test_metrics.py
@@ -105,8 +105,6 @@ async def client(server):
 @pytest.mark.asyncio
 async def test_metrics_counts(server: RemoteOpenAIServer,
                               client: openai.AsyncClient, use_v1: bool):
-    if use_v1:
-        pytest.skip("Skipping test on vllm V1")
     for _ in range(_NUM_REQUESTS):
         # sending a request triggers the metrics to be logged.
         await client.completions.create(
@@ -120,6 +118,9 @@ async def test_metrics_counts(server: RemoteOpenAIServer,
 
     # Loop over all expected metric_families
     for metric_family, suffix_values_list in EXPECTED_VALUES.items():
+        if use_v1 and metric_family not in EXPECTED_METRICS_V1:
+            continue
+
         found_metric = False
 
         # Check to see if the metric_family is found in the prom endpoint.
@@ -199,6 +200,8 @@ async def test_metrics_counts(server: RemoteOpenAIServer,
 EXPECTED_METRICS_V1 = [
     "vllm:num_requests_running",
     "vllm:num_requests_waiting",
+    "vllm:prompt_tokens_total",
+    "vllm:generation_tokens_total",
 ]
 
 
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index 917d52d3220b8..022b6d0668e99 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -305,7 +305,8 @@ def _log_stats(
             return
 
         for logger in self.stat_loggers:
-            logger.log(scheduler_stats=scheduler_stats)
+            logger.log(scheduler_stats=scheduler_stats,
+                       iteration_stats=iteration_stats)
 
     def encode(
         self,
diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py
index b84f03fa3267c..6a7bb423749e1 100644
--- a/vllm/v1/metrics/loggers.py
+++ b/vllm/v1/metrics/loggers.py
@@ -1,11 +1,12 @@
 import time
 from abc import ABC, abstractmethod
-from typing import Dict
+from typing import Dict, List
 
+import numpy as np
 import prometheus_client
 
 from vllm.logger import init_logger
-from vllm.v1.metrics.stats import SchedulerStats
+from vllm.v1.metrics.stats import IterationStats, SchedulerStats
 
 logger = init_logger(__name__)
 
@@ -15,27 +16,61 @@
 class StatLoggerBase(ABC):
 
     @abstractmethod
-    def log(self, scheduler_stats: SchedulerStats):
+    def log(self, scheduler_stats: SchedulerStats,
+            iteration_stats: IterationStats):
         ...
 
 
 class LoggingStatLogger(StatLoggerBase):
 
     def __init__(self):
-        self.last_log_time = time.monotonic()
+        self._reset(time.monotonic())
 
-    def log(self, scheduler_stats: SchedulerStats):
-        """Log Stats to standard output."""
+    def _reset(self, now):
+        self.last_log_time = now
+
+        # Tracked stats over current local logging interval.
+        self.num_prompt_tokens: List[int] = []
+        self.num_generation_tokens: List[int] = []
 
+    def _local_interval_elapsed(self, now: float) -> bool:
         # Log every _LOCAL_LOGGING_INTERVAL_SEC.
+        elapsed_time = now - self.last_log_time
+        return elapsed_time > _LOCAL_LOGGING_INTERVAL_SEC
+
+    def _track_iteration_stats(self, iteration_stats: IterationStats):
+        # Save tracked stats for token counters.
+        self.num_prompt_tokens.append(iteration_stats.num_prompt_tokens)
+        self.num_generation_tokens.append(
+            iteration_stats.num_generation_tokens)
+
+    def _get_throughput(self, tracked_stats: List[int], now: float) -> float:
+        # Compute summary metrics for tracked stats
+        return float(np.sum(tracked_stats) / (now - self.last_log_time))
+
+    def log(self, scheduler_stats: SchedulerStats,
+            iteration_stats: IterationStats):
+        """Log Stats to standard output."""
+
+        self._track_iteration_stats(iteration_stats)
+
         now = time.monotonic()
-        if now - self.last_log_time < _LOCAL_LOGGING_INTERVAL_SEC:
+        if not self._local_interval_elapsed(now):
             return
-        self.last_log_time = now
+
+        prompt_throughput = self._get_throughput(self.num_prompt_tokens, now)
+        generation_throughput = self._get_throughput(
+            self.num_generation_tokens, now)
+
+        self._reset(now)
 
         # Format and print output.
         logger.info(
+            "Avg prompt throughput: %.1f tokens/s, "
+            "Avg generation throughput: %.1f tokens/s, "
             "Running: %d reqs, Waiting: %d reqs ",
+            prompt_throughput,
+            generation_throughput,
             scheduler_stats.num_running_reqs,
             scheduler_stats.num_waiting_reqs,
         )
@@ -61,11 +96,26 @@ def __init__(self, labels: Dict[str, str]):
             documentation="Number of requests waiting to be processed.",
             labelnames=labelnames).labels(*labelvalues)
 
-    def log(self, scheduler_stats: SchedulerStats):
+        self.counter_prompt_tokens = prometheus_client.Counter(
+            name="vllm:prompt_tokens_total",
+            documentation="Number of prefill tokens processed.",
+            labelnames=labelnames).labels(*labelvalues)
+
+        self.counter_generation_tokens = prometheus_client.Counter(
+            name="vllm:generation_tokens_total",
+            documentation="Number of generation tokens processed.",
+            labelnames=labelnames).labels(*labelvalues)
+
+    def log(self, scheduler_stats: SchedulerStats,
+            iteration_stats: IterationStats):
         """Log to prometheus."""
         self.gauge_scheduler_running.set(scheduler_stats.num_running_reqs)
         self.gauge_scheduler_waiting.set(scheduler_stats.num_waiting_reqs)
 
+        self.counter_prompt_tokens.inc(iteration_stats.num_prompt_tokens)
+        self.counter_generation_tokens.inc(
+            iteration_stats.num_generation_tokens)
+
     @staticmethod
     def _unregister_vllm_metrics():
         # Unregister any existing vLLM collectors (for CI/CD

From 0f657bdc52d4ad1d079beddf8e7556c419aca7b4 Mon Sep 17 00:00:00 2001
From: Michael Goin <michael@neuralmagic.com>
Date: Tue, 28 Jan 2025 14:06:32 -0500
Subject: [PATCH 414/462] Replace missed warning_once for rerank API (#12472)

Signed-off-by: mgoin <michael@neuralmagic.com>
---
 vllm/entrypoints/openai/api_server.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 45cf06566faaa..077bc993726ae 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -528,7 +528,7 @@ async def do_rerank(request: RerankRequest, raw_request: Request):
 @router.post("/v1/rerank")
 @with_cancellation
 async def do_rerank_v1(request: RerankRequest, raw_request: Request):
-    logger.warning(
+    logger.warning_once(
         "To indicate that the rerank API is not part of the standard OpenAI"
         " API, we have located it at `/rerank`. Please update your client"
         "accordingly. (Note: Conforms to JinaAI rerank API)")

From f26d790718b8e50a11a366f3301b6a9300377797 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Tue, 28 Jan 2025 20:05:27 +0000
Subject: [PATCH 415/462] Do not run `suggestion` `pre-commit` hook multiple
 times (#12521)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 .pre-commit-config.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 7b32df90bfd8b..77010090965d4 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -90,3 +90,4 @@ repos:
     entry: bash -c 'echo "To bypass pre-commit hooks, add --no-verify to git commit."'
     language: system
     verbose: true
+    pass_filenames: false

From c386c43ca3a7156a953e0ca4d8f2c2f36ccf1423 Mon Sep 17 00:00:00 2001
From: Mark McLoughlin <markmc@redhat.com>
Date: Tue, 28 Jan 2025 22:07:22 +0000
Subject: [PATCH 416/462] [V1][Metrics] Add per-request
 prompt/generation_tokens histograms (#12516)

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
---
 tests/entrypoints/openai/test_metrics.py |  6 +++
 vllm/v1/engine/async_llm.py              |  3 +-
 vllm/v1/engine/output_processor.py       | 11 ++++-
 vllm/v1/metrics/loggers.py               | 60 +++++++++++++++++++++---
 vllm/v1/metrics/stats.py                 | 36 ++++++++++++--
 5 files changed, 102 insertions(+), 14 deletions(-)

diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py
index 64deaedf0f2c1..9a84c82b62fdf 100644
--- a/tests/entrypoints/openai/test_metrics.py
+++ b/tests/entrypoints/openai/test_metrics.py
@@ -202,6 +202,12 @@ async def test_metrics_counts(server: RemoteOpenAIServer,
     "vllm:num_requests_waiting",
     "vllm:prompt_tokens_total",
     "vllm:generation_tokens_total",
+    "vllm:request_prompt_tokens_sum",
+    "vllm:request_prompt_tokens_bucket",
+    "vllm:request_prompt_tokens_count",
+    "vllm:request_generation_tokens_sum",
+    "vllm:request_generation_tokens_bucket",
+    "vllm:request_generation_tokens_count",
 ]
 
 
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index 022b6d0668e99..b9dc3561d1750 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -53,8 +53,7 @@ def __init__(
         self.log_stats = log_stats
         self.stat_loggers: List[StatLoggerBase] = [
             LoggingStatLogger(),
-            PrometheusStatLogger(labels=dict(
-                model_name=self.model_config.served_model_name)),
+            PrometheusStatLogger(vllm_config.model_config),
         ]
 
         # Tokenizer (+ ensure liveness if running in another process).
diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py
index 564eab51bd3a8..39217b8090140 100644
--- a/vllm/v1/engine/output_processor.py
+++ b/vllm/v1/engine/output_processor.py
@@ -8,7 +8,7 @@
 from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest
 from vllm.v1.engine.detokenizer import (DetokenizerOutput,
                                         IncrementalDetokenizer)
-from vllm.v1.metrics.stats import IterationStats
+from vllm.v1.metrics.stats import IterationStats, RequestStateStats
 
 
 @dataclass
@@ -37,6 +37,8 @@ def __init__(
         self.is_prefilling = True
         self.queue = queue
 
+        self.stats = RequestStateStats()
+
     @classmethod
     def from_new_request(
         cls,
@@ -146,7 +148,8 @@ def process_outputs(
             # 1) Compute stats for this iteration.
             iteration_stats.update_from_output(engine_core_output,
                                                req_state.is_prefilling,
-                                               req_state.prompt_len)
+                                               req_state.prompt_len,
+                                               req_state.stats)
             req_state.is_prefilling = False
 
             # 2) Detokenize the token ids into text.
@@ -171,6 +174,10 @@ def process_outputs(
                         # detected stop string, abort needed in EngineCore.
                         reqs_to_abort.append(req_id)
 
+                    # Track per-request stats
+                    iteration_stats.update_from_finished_request(
+                        request_output, req_state.stats)
+
         return OutputProcessorOutput(
             request_outputs=request_outputs,
             reqs_to_abort=reqs_to_abort,
diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py
index 6a7bb423749e1..87d9d63652c05 100644
--- a/vllm/v1/metrics/loggers.py
+++ b/vllm/v1/metrics/loggers.py
@@ -1,10 +1,11 @@
 import time
 from abc import ABC, abstractmethod
-from typing import Dict, List
+from typing import List
 
 import numpy as np
 import prometheus_client
 
+from vllm.config import ModelConfig
 from vllm.logger import init_logger
 from vllm.v1.metrics.stats import IterationStats, SchedulerStats
 
@@ -78,13 +79,13 @@ def log(self, scheduler_stats: SchedulerStats,
 
 class PrometheusStatLogger(StatLoggerBase):
 
-    def __init__(self, labels: Dict[str, str]):
-        self.labels = labels
+    def __init__(self, model_config: ModelConfig):
+        self._unregister_vllm_metrics()
 
-        labelnames = self.labels.keys()
-        labelvalues = self.labels.values()
+        labelnames = ["model_name"]
+        labelvalues = [model_config.served_model_name]
 
-        self._unregister_vllm_metrics()
+        max_model_len = model_config.max_model_len
 
         self.gauge_scheduler_running = prometheus_client.Gauge(
             name="vllm:num_requests_running",
@@ -106,6 +107,20 @@ def __init__(self, labels: Dict[str, str]):
             documentation="Number of generation tokens processed.",
             labelnames=labelnames).labels(*labelvalues)
 
+        self.histogram_num_prompt_tokens_request = \
+            prometheus_client.Histogram(
+                name="vllm:request_prompt_tokens",
+                documentation="Number of prefill tokens processed.",
+                buckets=build_1_2_5_buckets(max_model_len),
+                labelnames=labelnames).labels(*labelvalues)
+
+        self.histogram_num_generation_tokens_request = \
+            prometheus_client.Histogram(
+                name="vllm:request_generation_tokens",
+                documentation="Number of generation tokens processed.",
+                buckets=build_1_2_5_buckets(max_model_len),
+                labelnames=labelnames).labels(*labelvalues)
+
     def log(self, scheduler_stats: SchedulerStats,
             iteration_stats: IterationStats):
         """Log to prometheus."""
@@ -116,9 +131,42 @@ def log(self, scheduler_stats: SchedulerStats,
         self.counter_generation_tokens.inc(
             iteration_stats.num_generation_tokens)
 
+        for finished_request in iteration_stats.finished_requests:
+            self.histogram_num_prompt_tokens_request.observe(
+                finished_request.num_prompt_tokens)
+            self.histogram_num_generation_tokens_request.observe(
+                finished_request.num_generation_tokens)
+
     @staticmethod
     def _unregister_vllm_metrics():
         # Unregister any existing vLLM collectors (for CI/CD
         for collector in list(prometheus_client.REGISTRY._collector_to_names):
             if hasattr(collector, "_name") and "vllm" in collector._name:
                 prometheus_client.REGISTRY.unregister(collector)
+
+
+def build_buckets(mantissa_lst: List[int], max_value: int) -> List[int]:
+    """
+    Builds a list of buckets with increasing powers of 10 multiplied by
+    mantissa values until the value exceeds the specified maximum.
+
+    """
+    exponent = 0
+    buckets: List[int] = []
+    while True:
+        for m in mantissa_lst:
+            value = m * 10**exponent
+            if value <= max_value:
+                buckets.append(value)
+            else:
+                return buckets
+        exponent += 1
+
+
+def build_1_2_5_buckets(max_value: int) -> List[int]:
+    """
+    Example:
+    >>> build_1_2_5_buckets(100)
+    [1, 2, 5, 10, 20, 50, 100]
+    """
+    return build_buckets([1, 2, 5], max_value)
diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py
index 60cb986f8bbce..55d85a7992cc5 100644
--- a/vllm/v1/metrics/stats.py
+++ b/vllm/v1/metrics/stats.py
@@ -1,7 +1,8 @@
 from dataclasses import dataclass
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, List
 
 if TYPE_CHECKING:
+    from vllm.outputs import RequestOutput
     from vllm.v1.engine import EngineCoreOutput
 
 
@@ -16,6 +17,21 @@ class SchedulerStats:
     # gpu_prefix_cache_hit_rate: float = 0.0
 
 
+@dataclass
+class RequestStateStats:
+    """Stats that need to be tracked across delta updates."""
+
+    num_generation_tokens: int = 0
+
+
+@dataclass
+class FinishedRequestStats:
+    """Stats associated with a finished request."""
+
+    num_prompt_tokens: int = 0
+    num_generation_tokens: int = 0
+
+
 class IterationStats:
     """Stats associated with a single set of EngineCoreOutputs."""
 
@@ -23,17 +39,29 @@ def __init__(self, log_stats: bool):
         self.log_stats = log_stats
         self.num_generation_tokens = 0
         self.num_prompt_tokens = 0
+        self.finished_requests: List[FinishedRequestStats] = []
 
     def update_from_output(self, output: "EngineCoreOutput",
-                           is_prefilling: bool, prompt_len: int):
+                           is_prefilling: bool, prompt_len: int,
+                           request_state_stats: RequestStateStats):
         if not self.log_stats:
             return
 
-        self.num_generation_tokens += len(output.new_token_ids)
+        num_new_generation_tokens = len(output.new_token_ids)
+
+        self.num_generation_tokens += num_new_generation_tokens
         if is_prefilling:
             # This relies on the invariant that EngineCore does
             # not stream outputs for partially completed prefills
             # (scheduler.update_from_output makes EngineCoreOutput
             # iff num_computed_tokens == num_tokens).
-            assert (len(output.new_token_ids) > 0)
+            assert (num_new_generation_tokens > 0)
             self.num_prompt_tokens += prompt_len
+
+        request_state_stats.num_generation_tokens += num_new_generation_tokens
+
+    def update_from_finished_request(self, request_output: "RequestOutput",
+                                     request_state_stats: RequestStateStats):
+        self.finished_requests.append(
+            FinishedRequestStats(len(request_output.prompt_token_ids),
+                                 request_state_stats.num_generation_tokens))

From 80fcc3ed1c940ea43e1b495bbdf8b9765f837128 Mon Sep 17 00:00:00 2001
From: fenghuizhang <159459388+fenghuizhang@users.noreply.github.com>
Date: Tue, 28 Jan 2025 14:36:44 -0800
Subject: [PATCH 417/462] [Kernel] Pipe attn_logits_soft_cap through paged
 attention TPU kernels (#12482)

Signed-off-by: Fenghui Zhang <fhzhang@google.com>
---
 .buildkite/run-tpu-test.sh        |  0
 vllm/attention/backends/pallas.py | 42 ++++++++++++-------------------
 2 files changed, 16 insertions(+), 26 deletions(-)
 mode change 100644 => 100755 .buildkite/run-tpu-test.sh

diff --git a/.buildkite/run-tpu-test.sh b/.buildkite/run-tpu-test.sh
old mode 100644
new mode 100755
diff --git a/vllm/attention/backends/pallas.py b/vllm/attention/backends/pallas.py
index facdee6b29e39..209a623ba441c 100644
--- a/vllm/attention/backends/pallas.py
+++ b/vllm/attention/backends/pallas.py
@@ -110,6 +110,7 @@ def __init__(
 
         assert self.num_heads % self.num_kv_heads == 0
         self.num_queries_per_kv = self.num_heads // self.num_kv_heads
+        self.logits_soft_cap = logits_soft_cap
         if head_size % 128 != 0:
             raise NotImplementedError("Head size must be a multiple of 128.")
         if alibi_slopes is not None:
@@ -120,9 +121,6 @@ def __init__(
             raise NotImplementedError("FP8 KV cache dtype is not supported.")
         if blocksparse_params is not None:
             raise NotImplementedError("Blocksparse is not supported.")
-        if logits_soft_cap is not None:
-            raise NotImplementedError(
-                "Attention logits soft-capping is not supported.")
 
         if torch_xla.tpu.version() < 4:
             raise NotImplementedError("TPU version must be 4 or higher.")
@@ -230,6 +228,7 @@ def forward(
                     num_kv_pages_per_compute_block,
                     num_queries_per_compute_block,
                     use_kernel=True,
+                    attn_logits_soft_cap=self.logits_soft_cap,
                 )
         else:
             # Decoding run.
@@ -257,6 +256,7 @@ def forward(
                     attn_metadata.block_tables,
                     pages_per_compute_block,
                     self.megacore_mode,
+                    attn_logits_soft_cap=self.logits_soft_cap,
                 )
             else:
                 chunk_size = max_num_seq
@@ -280,6 +280,7 @@ def forward(
                         attn_metadata.block_tables[chunk_start:chunk_end],
                         pages_per_compute_block,
                         self.megacore_mode,
+                        attn_logits_soft_cap=self.logits_soft_cap,
                     )
                     output[chunk_start:chunk_end] = chunk_output
 
@@ -313,6 +314,8 @@ def paged_attention(
     block_tables: torch.Tensor,
     pages_per_compute_block: int,
     megacore_mode: Optional[str],
+    *,
+    attn_logits_soft_cap: Optional[float],
 ) -> torch.Tensor:
     batch_size = query.shape[0]
     if megacore_mode == "batch" and batch_size % 2 != 0:
@@ -320,26 +323,13 @@ def paged_attention(
     else:
         megacore_mode = megacore_mode
 
-    # NOTE(woosuk): A temporary workaround to avoid the error:
-    # "xla::paged_attention() Expected a value of type 'str' for
-    # argument 'megacore_mode' but instead found type 'NoneType'."
-    if megacore_mode is not None:
-        output = torch.ops.xla.paged_attention(
-            query,
-            key_cache,
-            value_cache,
-            context_lens,
-            block_tables,
-            pages_per_compute_block,
-            megacore_mode=megacore_mode,
-        )
-    else:
-        output = torch.ops.xla.paged_attention(
-            query,
-            key_cache,
-            value_cache,
-            context_lens,
-            block_tables,
-            pages_per_compute_block,
-        )
-    return output
+    return torch.ops.xla.paged_attention(
+        query,
+        key_cache,
+        value_cache,
+        context_lens,
+        block_tables,
+        pages_per_compute_block,
+        megacore_mode=megacore_mode,
+        attn_logits_soft_cap=attn_logits_soft_cap,
+    )

From fbb5bd4cefd62e3e389e2b873d5859eb8e07cbfa Mon Sep 17 00:00:00 2001
From: Michael Goin <michael@neuralmagic.com>
Date: Tue, 28 Jan 2025 22:16:47 -0500
Subject: [PATCH 418/462] [TPU] Add example for profiling TPU inference
 (#12531)

Signed-off-by: mgoin <mgoin@redhat.com>
---
 .../offline_inference/profiling_tpu/README.md |  67 ++++++++++++
 .../profiling_tpu/profiling.py                | 101 ++++++++++++++++++
 2 files changed, 168 insertions(+)
 create mode 100644 examples/offline_inference/profiling_tpu/README.md
 create mode 100644 examples/offline_inference/profiling_tpu/profiling.py

diff --git a/examples/offline_inference/profiling_tpu/README.md b/examples/offline_inference/profiling_tpu/README.md
new file mode 100644
index 0000000000000..08efa63dc1021
--- /dev/null
+++ b/examples/offline_inference/profiling_tpu/README.md
@@ -0,0 +1,67 @@
+# vLLM TPU Profiling
+
+This script is used to profile the TPU performance of vLLM for specific prefill or decode token shapes.
+
+Note: an actual running server is a mix of both prefill of many shapes and decode of many shapes.
+
+We assume you are on a TPU already (this was tested on TPU v6e) and have installed vLLM according to the [installation guide](https://docs.vllm.ai/en/latest/getting_started/installation/ai_accelerator/index.html).
+
+> In all examples below, we run several warmups before (so `--enforce-eager` is okay)
+
+## Profile Examples
+
+### Generate Prefill Trace
+
+This example runs Qwen/Qwen2.5-7B-Instruct with a single request of 1024 input tokens. This is set up in attempt to profile just the prefill time and operations.
+
+```bash
+export XLA_HLO_DEBUG=1
+export MODEL=Qwen/Qwen2.5-7B-Instruct
+export VLLM_TPU_PROFILE_DURATION_MS=3000
+export VLLM_TPU_PROFILE_DELAY_MS=0
+
+python3 profiling.py \
+    --model $MODEL \
+    --input-len 1024 --output-len 1 \
+    --batch-size 1 --enforce-eager \
+    --max-model-len 2048 \
+    --tensor-parallel-size 1 \
+    --profile-result-dir profiles
+```
+
+
+### Generate Decode Trace
+
+This example runs Llama 3.1 70B with a batch of 32 requests where each has 1 input token and 128 output tokens. This is set up in attempt to profile just the 32 decodes running in parallel by having an extremely small prefill of 1 token and setting `VLLM_TPU_PROFILE_DELAY_MS=1000` to skip the first second of inference (hopefully prefill).
+
+```bash
+export XLA_HLO_DEBUG=1
+export MODEL=meta-llama/Llama-3.1-70B-Instruct
+export VLLM_TPU_PROFILE_DURATION_MS=2000
+export VLLM_TPU_PROFILE_DELAY_MS=1000
+
+rm -rf ~/.cache/vllm/xla_cache
+python3 profiling.py \
+    --model $MODEL \
+    --input-len 1 \
+    --output-len 128 \
+    --batch-size 32 \
+    --enforce-eager \
+    --profile-result-dir profiles \
+    --max-model-len 2048 --tensor-parallel-size 8
+```
+
+
+## Visualizing the profiles
+
+Once you have collected your profiles with this script, you can visualize them using [TensorBoard](https://cloud.google.com/tpu/docs/pytorch-xla-performance-profiling-tpu-vm).
+
+Here are most likely the dependencies you need to install:
+```bash
+pip install tensorflow-cpu tensorboard-plugin-profile etils importlib_resources
+```
+
+Then you just need to point TensorBoard to the directory where you saved the profiles and visit `http://localhost:6006/` in your browser:
+```bash
+tensorboard --logdir profiles/ --port 6006
+```
\ No newline at end of file
diff --git a/examples/offline_inference/profiling_tpu/profiling.py b/examples/offline_inference/profiling_tpu/profiling.py
new file mode 100644
index 0000000000000..d7423e6c6da93
--- /dev/null
+++ b/examples/offline_inference/profiling_tpu/profiling.py
@@ -0,0 +1,101 @@
+import argparse
+import dataclasses
+import os
+import time
+from typing import List
+
+import numpy as np
+import torch_xla.debug.profiler as xp
+from tqdm import tqdm
+
+from vllm import LLM, SamplingParams
+from vllm.engine.arg_utils import EngineArgs
+from vllm.inputs import PromptType
+from vllm.utils import FlexibleArgumentParser
+
+DURATION_MS = int(os.getenv("VLLM_TPU_PROFILE_DURATION_MS", 3000))
+DELAY_MS = int(os.getenv("VLLM_TPU_PROFILE_DELAY_MS", 0))
+
+
+def main(args: argparse.Namespace):
+    print(args)
+
+    engine_args = EngineArgs.from_cli_args(args)
+    llm = LLM(**dataclasses.asdict(engine_args))
+    _ = xp.start_server(9012)
+
+    sampling_params = SamplingParams(
+        temperature=0.0,
+        ignore_eos=True,
+        max_tokens=args.output_len,
+    )
+    print(sampling_params)
+    dummy_prompt_token_ids = np.random.randint(10000,
+                                               size=(args.batch_size,
+                                                     args.input_len))
+    dummy_prompts: List[PromptType] = [{
+        "prompt_token_ids": batch
+    } for batch in dummy_prompt_token_ids.tolist()]
+
+    def run_to_completion():
+        start_time = time.perf_counter()
+        llm.generate(dummy_prompts,
+                     sampling_params=sampling_params,
+                     use_tqdm=False)
+        end_time = time.perf_counter()
+        latency = end_time - start_time
+        return latency
+
+    # Warmup
+    print("Warming up...")
+    warmup_latencies = []
+    for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
+        warmup_latencies.append(run_to_completion())
+    print(f"Average warmup latency: {np.mean(warmup_latencies):.4f}s")
+
+    # Profile
+    profile_dir = args.profile_result_dir
+    print(f"Profiling (results will be saved to '{profile_dir}')...")
+    # Enable tracing on server
+    xp.trace_detached("localhost:9012",
+                      profile_dir,
+                      delay_ms=DELAY_MS,
+                      duration_ms=DURATION_MS)
+    if DELAY_MS == 0:
+        time.sleep(1.0)
+    profile_latencies = []
+    for _ in tqdm(range(args.num_iters), desc="Profile iterations"):
+        profile_latencies.append(run_to_completion())
+    print(f"Average profile latency: {np.mean(profile_latencies):.4f}s")
+
+    return
+
+
+if __name__ == '__main__':
+    parser = FlexibleArgumentParser(
+        description='Benchmark the latency of processing a single batch of '
+        'requests till completion.')
+    parser.add_argument('--input-len', type=int, default=32)
+    parser.add_argument('--output-len', type=int, default=128)
+    parser.add_argument('--batch-size', type=int, default=8)
+    parser.add_argument('--num-iters-warmup',
+                        type=int,
+                        default=5,
+                        help='Number of iterations to run for warmup.')
+    parser.add_argument('--num-iters',
+                        type=int,
+                        default=1,
+                        help='Number of iterations to run for profiling.')
+    parser.add_argument(
+        '--profile-result-dir',
+        type=str,
+        default="profiles",
+        help=
+        ('path to save the pytorch profiler output. Can be visualized '
+         'with ui.perfetto.dev or Tensorboard '
+         '(https://cloud.google.com/tpu/docs/pytorch-xla-performance-profiling-tpu-vm).'
+         ))
+
+    parser = EngineArgs.add_cli_args(parser)
+    args = parser.parse_args()
+    main(args)

From a7e3eba66fff82f7e12bb2354c4b26635f0f7761 Mon Sep 17 00:00:00 2001
From: Ce Gao <gaocegege@hotmail.com>
Date: Wed, 29 Jan 2025 11:38:08 +0800
Subject: [PATCH 419/462] [Frontend] Support reasoning content for deepseek r1
 (#12473)

Signed-off-by: Ce Gao <cegao@tensorchord.ai>
Co-authored-by: Rafael Vasquez <rafvasq21@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Michael Goin <mgoin@redhat.com>
---
 docs/source/features/reasoning_outputs.md     | 151 +++++++++++++++++
 docs/source/index.md                          |   1 +
 .../openai_chat_completion_with_reasoning.py  |  53 ++++++
 ...hat_completion_with_reasoning_streaming.py |  90 ++++++++++
 .../openai/reasoning_parsers/__init__.py      |   0
 .../test_deepseekr1_reasoning_parser.py       | 120 +++++++++++++
 .../openai/reasoning_parsers/utils.py         |  93 +++++++++++
 tests/entrypoints/openai/test_cli_args.py     |  29 ++++
 vllm/entrypoints/openai/api_server.py         |  10 ++
 vllm/entrypoints/openai/cli_args.py           |  30 ++++
 vllm/entrypoints/openai/protocol.py           |   2 +
 .../openai/reasoning_parsers/__init__.py      |   6 +
 .../abs_reasoning_parsers.py                  | 158 ++++++++++++++++++
 .../deepseek_r1_reasoning_parser.py           | 133 +++++++++++++++
 vllm/entrypoints/openai/serving_chat.py       | 105 +++++++++++-
 vllm/scripts.py                               |   1 +
 16 files changed, 977 insertions(+), 5 deletions(-)
 create mode 100644 docs/source/features/reasoning_outputs.md
 create mode 100644 examples/online_serving/openai_chat_completion_with_reasoning.py
 create mode 100644 examples/online_serving/openai_chat_completion_with_reasoning_streaming.py
 create mode 100644 tests/entrypoints/openai/reasoning_parsers/__init__.py
 create mode 100644 tests/entrypoints/openai/reasoning_parsers/test_deepseekr1_reasoning_parser.py
 create mode 100644 tests/entrypoints/openai/reasoning_parsers/utils.py
 create mode 100644 vllm/entrypoints/openai/reasoning_parsers/__init__.py
 create mode 100644 vllm/entrypoints/openai/reasoning_parsers/abs_reasoning_parsers.py
 create mode 100644 vllm/entrypoints/openai/reasoning_parsers/deepseek_r1_reasoning_parser.py

diff --git a/docs/source/features/reasoning_outputs.md b/docs/source/features/reasoning_outputs.md
new file mode 100644
index 0000000000000..e39bbacf1138d
--- /dev/null
+++ b/docs/source/features/reasoning_outputs.md
@@ -0,0 +1,151 @@
+(reasoning-outputs)=
+
+# Reasoning Outputs
+
+vLLM offers support for reasoning models like [DeepSeek R1](https://huggingface.co/deepseek-ai/DeepSeek-R1), which are designed to generate outputs containing both reasoning steps and final conclusions.
+
+Reasoning models return a additional `reasoning_content` field in their outputs, which contains the reasoning steps that led to the final conclusion. This field is not present in the outputs of other models.
+
+## Supported Models
+
+vLLM currently supports the following reasoning models:
+
+- [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) (`deepseek_r1`, which looks for `<think> ... </think>`)
+
+## Quickstart
+
+To use reasoning models, you need to specify the `--enable-reasoning` and `--reasoning-parser` flags when making a request to the chat completion endpoint. The `--reasoning-parser` flag specifies the reasoning parser to use for extracting reasoning content from the model output.
+
+```bash
+vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
+    --enable-reasoning --reasoning-parser deepseek_r1
+```
+
+Next, make a request to the model that should return the reasoning content in the response.
+
+```python
+from openai import OpenAI
+
+# Modify OpenAI's API key and API base to use vLLM's API server.
+openai_api_key = "EMPTY"
+openai_api_base = "http://localhost:8000/v1"
+
+client = OpenAI(
+    api_key=openai_api_key,
+    base_url=openai_api_base,
+)
+
+models = client.models.list()
+model = models.data[0].id
+
+# Round 1
+messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
+response = client.chat.completions.create(model=model, messages=messages)
+
+reasoning_content = response.choices[0].message.reasoning_content
+content = response.choices[0].message.content
+
+print("reasoning_content:", reasoning_content)
+print("content:", content)
+```
+
+The `reasoning_content` field contains the reasoning steps that led to the final conclusion, while the `content` field contains the final conclusion.
+
+## Streaming chat completions
+
+Streaming chat completions are also supported for reasoning models. The `reasoning_content` field is available in the `delta` field in [chat completion response chunks](https://platform.openai.com/docs/api-reference/chat/streaming).
+
+```json
+{
+    "id": "chatcmpl-123",
+    "object": "chat.completion.chunk",
+    "created": 1694268190,
+    "model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
+    "system_fingerprint": "fp_44709d6fcb",
+    "choices": [
+        {
+            "index": 0,
+            "delta": {
+                "role": "assistant",
+                "reasoning_content": "is",
+            },
+            "logprobs": null,
+            "finish_reason": null
+        }
+    ]
+}
+```
+
+Please note that it is not compatible with the OpenAI Python client library. You can use the `requests` library to make streaming requests.
+
+## How to support a new reasoning model
+
+You can add a new `ReasoningParser` similar to `vllm/entrypoints/openai/reasoning_parsers/deepseek_r1_reasoning_parser.py`.
+
+```python
+# import the required packages
+
+from vllm.entrypoints.openai.reasoning_parsers.abs_reasoning_parsers import (
+    ReasoningParser, ReasoningParserManager)
+from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
+                                              DeltaMessage)
+
+# define a reasoning parser and register it to vllm
+# the name list in register_module can be used
+# in --reasoning-parser.
+@ReasoningParserManager.register_module(["example"])
+class ExampleParser(ReasoningParser):
+    def __init__(self, tokenizer: AnyTokenizer):
+        super().__init__(tokenizer)
+
+    def extract_reasoning_content_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+    ) -> Union[DeltaMessage, None]:
+        """
+        Instance method that should be implemented for extracting reasoning
+        from an incomplete response; for use when handling reasoning calls and
+        streaming. Has to be an instance method because  it requires state -
+        the current tokens/diffs, but also the information about what has
+        previously been parsed and extracted (see constructor)
+        """
+
+    def extract_reasoning_content(
+            self, model_output: str, request: ChatCompletionRequest
+    ) -> Tuple[Optional[str], Optional[str]]:
+        """
+        Extract reasoning content from a complete model-generated string.
+
+        Used for non-streaming responses where we have the entire model response
+        available before sending to the client.
+
+        Parameters:
+        model_output: str
+            The model-generated string to extract reasoning content from.
+
+        request: ChatCompletionRequest
+            The request object that was used to generate the model_output.
+
+        Returns:
+        Tuple[Optional[str], Optional[str]]
+            A tuple containing the reasoning content and the content.
+        """
+```
+
+After defining the reasoning parser, you can use it by specifying the `--reasoning-parser` flag when making a request to the chat completion endpoint.
+
+```bash
+vllm serve <model_tag> \
+    --enable-reasoning --reasoning-parser example
+```
+
+## Limitations
+
+- The reasoning content is only available for online serving's chat completion endpoint (`/v1/chat/completions`).
+- It is not compatible with the [`structured_outputs`](#structured_outputs) and [`tool_calling`](#tool_calling) features.
+- The reasoning content is not available for all models. Check the model's documentation to see if it supports reasoning.
diff --git a/docs/source/index.md b/docs/source/index.md
index 2c302d3f3e863..6957d5dd0f2e7 100644
--- a/docs/source/index.md
+++ b/docs/source/index.md
@@ -90,6 +90,7 @@ models/extensions/index
 features/quantization/index
 features/lora
 features/tool_calling
+features/reasoning_outputs
 features/structured_outputs
 features/automatic_prefix_caching
 features/disagg_prefill
diff --git a/examples/online_serving/openai_chat_completion_with_reasoning.py b/examples/online_serving/openai_chat_completion_with_reasoning.py
new file mode 100644
index 0000000000000..83e51a48bcc6b
--- /dev/null
+++ b/examples/online_serving/openai_chat_completion_with_reasoning.py
@@ -0,0 +1,53 @@
+"""
+An example shows how to generate chat completions from reasoning models
+like DeepSeekR1.
+
+To run this example, you need to start the vLLM server with the reasoning 
+parser:
+
+```bash
+vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
+     --enable-reasoning --reasoning-parser deepseek_r1
+```
+
+This example demonstrates how to generate chat completions from reasoning models
+using the OpenAI Python client library.
+"""
+
+from openai import OpenAI
+
+# Modify OpenAI's API key and API base to use vLLM's API server.
+openai_api_key = "EMPTY"
+openai_api_base = "http://localhost:8000/v1"
+
+client = OpenAI(
+    api_key=openai_api_key,
+    base_url=openai_api_base,
+)
+
+models = client.models.list()
+model = models.data[0].id
+
+# Round 1
+messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
+response = client.chat.completions.create(model=model, messages=messages)
+
+reasoning_content = response.choices[0].message.reasoning_content
+content = response.choices[0].message.content
+
+print("reasoning_content:", reasoning_content)
+print("content:", content)
+
+# Round 2
+messages.append({"role": "assistant", "content": content})
+messages.append({
+    "role": "user",
+    "content": "How many Rs are there in the word 'strawberry'?",
+})
+response = client.chat.completions.create(model=model, messages=messages)
+
+reasoning_content = response.choices[0].message.reasoning_content
+content = response.choices[0].message.content
+
+print("reasoning_content:", reasoning_content)
+print("content:", content)
diff --git a/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py b/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py
new file mode 100644
index 0000000000000..8c14aac6b4ecb
--- /dev/null
+++ b/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py
@@ -0,0 +1,90 @@
+"""
+An example shows how to generate chat completions from reasoning models
+like DeepSeekR1.
+
+To run this example, you need to start the vLLM server with the reasoning 
+parser:
+
+```bash
+vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
+     --enable-reasoning --reasoning-parser deepseek_r1
+```
+
+Unlike openai_chat_completion_with_reasoning.py, this example demonstrates the
+streaming chat completions feature.
+
+The streaming chat completions feature allows you to receive chat completions
+in real-time as they are generated by the model. This is useful for scenarios
+where you want to display chat completions to the user as they are generated
+by the model.
+
+Here we do not use the OpenAI Python client library, because it does not support
+`reasoning_content` fields in the response.
+"""
+
+import json
+
+import requests
+
+# Modify OpenAI's API key and API base to use vLLM's API server.
+openai_api_key = "EMPTY"
+openai_api_base = "http://localhost:8000/v1"
+
+models = requests.get(
+    f"{openai_api_base}/models",
+    headers={
+        "Authorization": f"Bearer {openai_api_key}"
+    },
+).json()
+model = models["data"][0]["id"]
+
+# Streaming chat completions
+messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
+
+response = requests.post(
+    f"{openai_api_base}/chat/completions",
+    headers={"Authorization": f"Bearer {openai_api_key}"},
+    json={
+        "model": model,
+        "messages": messages,
+        "stream": True
+    },
+)
+
+print("client: Start streaming chat completions...")
+printed_reasoning_content = False
+printed_content = False
+# Make the streaming request
+if response.status_code == 200:
+    # Process the streaming response
+    for line in response.iter_lines():
+        if line:  # Filter out keep-alive new lines
+            # Decode the line and parse the JSON
+            decoded_line = line.decode("utf-8")
+            if decoded_line.startswith("data:"):
+                data = decoded_line[5:].strip()  # Remove "data:" prefix
+                if data == "[DONE]":  # End of stream
+                    print("\nclient: Stream completed.")
+                    break
+                try:
+                    # Parse the JSON data
+                    chunk = json.loads(data)
+                    reasoning_content = chunk["choices"][0]["delta"].get(
+                        "reasoning_content", "")
+                    content = chunk["choices"][0]["delta"].get("content", "")
+
+                    if reasoning_content:
+                        if not printed_reasoning_content:
+                            printed_reasoning_content = True
+                            print("reasoning_content:", end="", flush=True)
+                        print(reasoning_content, end="", flush=True)
+                    elif content:
+                        if not printed_content:
+                            printed_content = True
+                            print("\ncontent:", end="", flush=True)
+                        # Extract and print the content
+                        print(content, end="", flush=True)
+                except json.JSONDecodeError:
+                    print("Error decoding JSON:", decoded_line)
+else:
+    print(f"Error: {response.status_code} - {response.text}")
diff --git a/tests/entrypoints/openai/reasoning_parsers/__init__.py b/tests/entrypoints/openai/reasoning_parsers/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/tests/entrypoints/openai/reasoning_parsers/test_deepseekr1_reasoning_parser.py b/tests/entrypoints/openai/reasoning_parsers/test_deepseekr1_reasoning_parser.py
new file mode 100644
index 0000000000000..4607e4dfe4d0b
--- /dev/null
+++ b/tests/entrypoints/openai/reasoning_parsers/test_deepseekr1_reasoning_parser.py
@@ -0,0 +1,120 @@
+from typing import List
+
+import pytest
+from transformers import AutoTokenizer
+
+from tests.entrypoints.openai.reasoning_parsers.utils import (
+    run_reasoning_extraction)
+from vllm.entrypoints.openai.reasoning_parsers import (ReasoningParser,
+                                                       ReasoningParserManager)
+
+parser_name = "deepseek_r1"
+start_token = "<think>"
+end_token = "</think>"
+
+SIMPLE_REASONING = {
+    "output": "<think>This is a reasoning section</think>This is the rest",
+    "reasoning_content": "This is a reasoning section",
+    "content": "This is the rest",
+}
+COMPLETE_REASONING = {
+    "output": "<think>This is a reasoning section</think>",
+    "reasoning_content": "This is a reasoning section",
+    "content": None,
+}
+NO_REASONING = {
+    "output": "This is a reasoning section",
+    "reasoning_content": None,
+    "content": "This is a reasoning section",
+}
+MULTIPLE_LINES = {
+    "output": "<think>This\nThat</think>This is the rest\nThat",
+    "reasoning_content": "This\nThat",
+    "content": "This is the rest\nThat",
+}
+SHORTEST_REASONING_NO_STREAMING = {
+    "output": "<think></think>This is the rest",
+    "reasoning_content": "",
+    "content": "This is the rest",
+}
+SHORTEST_REASONING = {
+    "output": "<think></think>This is the rest",
+    "reasoning_content": None,
+    "content": "This is the rest",
+}
+
+TEST_CASES = [
+    pytest.param(
+        False,
+        SIMPLE_REASONING,
+        id="simple_streaming",
+    ),
+    pytest.param(
+        True,
+        SIMPLE_REASONING,
+        id="simple_streaming",
+    ),
+    pytest.param(
+        False,
+        COMPLETE_REASONING,
+        id="complete_streaming",
+    ),
+    pytest.param(
+        True,
+        COMPLETE_REASONING,
+        id="complete_streaming",
+    ),
+    pytest.param(
+        False,
+        NO_REASONING,
+        id="no_streaming",
+    ),
+    pytest.param(
+        True,
+        NO_REASONING,
+        id="no_streaming",
+    ),
+    pytest.param(
+        False,
+        MULTIPLE_LINES,
+        id="multiple_lines_streaming",
+    ),
+    pytest.param(
+        True,
+        MULTIPLE_LINES,
+        id="multiple_lines_streaming",
+    ),
+    pytest.param(
+        True,
+        SHORTEST_REASONING,
+        id="shortest_streaming",
+    ),
+    pytest.param(
+        False,
+        SHORTEST_REASONING_NO_STREAMING,
+        id="shortest_streaming",
+    ),
+]
+
+
+@pytest.mark.parametrize("streaming, param_dict", TEST_CASES)
+def test_reasoning(
+    streaming: bool,
+    param_dict: dict,
+):
+    tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
+    tokenizer.add_tokens([start_token, end_token])
+    output = tokenizer.tokenize(param_dict["output"])
+    # decode everything to tokens
+    output_tokens: List[str] = [
+        tokenizer.convert_tokens_to_string([token]) for token in output
+    ]
+    parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(
+        parser_name)(tokenizer)
+
+    reasoning, content = run_reasoning_extraction(parser,
+                                                  output_tokens,
+                                                  streaming=streaming)
+
+    assert reasoning == param_dict["reasoning_content"]
+    assert content == param_dict["content"]
diff --git a/tests/entrypoints/openai/reasoning_parsers/utils.py b/tests/entrypoints/openai/reasoning_parsers/utils.py
new file mode 100644
index 0000000000000..ac73ad50a7395
--- /dev/null
+++ b/tests/entrypoints/openai/reasoning_parsers/utils.py
@@ -0,0 +1,93 @@
+from typing import List, Optional, Tuple, Union
+
+from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
+                                              DeltaMessage)
+from vllm.entrypoints.openai.reasoning_parsers import ReasoningParser
+
+
+class StreamingReasoningReconstructor:
+
+    def __init__(self):
+        self.reasoning_content = None
+        self.other_content = None
+
+    def append_delta(self, delta: DeltaMessage):
+        # content and the reasoning content should not be present
+        # at the same time
+        assert delta.content is None or delta.reasoning_content is None, (
+            "Both content and reasoning content are present in the "
+            "delta message")
+        if delta.content is not None:
+            if self.other_content is None:
+                self.other_content = delta.content
+            else:
+                self.other_content += delta.content
+        else:
+            if self.reasoning_content is None:
+                self.reasoning_content = delta.reasoning_content
+            else:
+                self.reasoning_content += delta.reasoning_content
+
+
+def run_reasoning_extraction(
+    reasoning_parser: ReasoningParser,
+    model_output: List[str],
+    request: Union[ChatCompletionRequest, None] = None,
+    streaming: bool = False,
+) -> Tuple[Optional[str], Optional[str]]:
+    if streaming:
+        reconstructor = run_reasoning_extraction_streaming(
+            reasoning_parser,
+            model_output,
+            request,
+        )
+        return (
+            reconstructor.reasoning_content,
+            reconstructor.other_content or None,
+        )
+    else:
+        reasoning, content = run_reasoning_extraction_nonstreaming(
+            reasoning_parser, model_output, request)
+        return reasoning, content
+
+
+def run_reasoning_extraction_nonstreaming(
+    reasoning_parser: ReasoningParser,
+    model_output: List[str],
+    request: Union[ChatCompletionRequest, None] = None,
+) -> Tuple[Optional[str], Optional[str]]:
+    request = request or ChatCompletionRequest(messages=[], model="test-model")
+    return reasoning_parser.extract_reasoning_content(
+        model_output=''.join(model_output), request=request)
+
+
+def run_reasoning_extraction_streaming(
+    reasoning_parser: ReasoningParser,
+    model_deltas: List[str],
+    request: Union[ChatCompletionRequest, None] = None,
+) -> StreamingReasoningReconstructor:
+    request = request or ChatCompletionRequest(messages=[], model="test-model")
+    reconstructor = StreamingReasoningReconstructor()
+    previous_text = ""
+    previous_tokens: List[int] = []
+    for delta in model_deltas:
+        token_delta = [
+            reasoning_parser.vocab.get(token)
+            for token in reasoning_parser.model_tokenizer.tokenize(delta)
+            if token in reasoning_parser.vocab
+        ]
+        current_text = previous_text + delta
+        current_tokens = previous_tokens + token_delta
+        delta_message = reasoning_parser.extract_reasoning_content_streaming(
+            previous_text,
+            current_text,
+            delta,
+            previous_tokens,
+            current_tokens,
+            token_delta,
+        )
+        if delta_message is not None:
+            reconstructor.append_delta(delta_message)
+        previous_text = current_text
+        previous_tokens = current_tokens
+    return reconstructor
diff --git a/tests/entrypoints/openai/test_cli_args.py b/tests/entrypoints/openai/test_cli_args.py
index e49562ad6a21f..01bcd78aa91a8 100644
--- a/tests/entrypoints/openai/test_cli_args.py
+++ b/tests/entrypoints/openai/test_cli_args.py
@@ -116,6 +116,35 @@ def test_enable_auto_choice_passes_with_tool_call_parser(serve_parser):
     validate_parsed_serve_args(args)
 
 
+def test_enable_auto_choice_fails_with_enable_reasoning(serve_parser):
+    """Ensure validation fails if reasoning is enabled with auto tool choice"""
+    args = serve_parser.parse_args(args=[
+        "--enable-auto-tool-choice",
+        "--enable-reasoning",
+    ])
+    with pytest.raises(TypeError):
+        validate_parsed_serve_args(args)
+
+
+def test_enable_reasoning_passes_with_reasoning_parser(serve_parser):
+    """Ensure validation passes if reasoning is enabled 
+    with a reasoning parser"""
+    args = serve_parser.parse_args(args=[
+        "--enable-reasoning",
+        "--reasoning-parser",
+        "deepseek_r1",
+    ])
+    validate_parsed_serve_args(args)
+
+
+def test_enable_reasoning_fails_without_reasoning_parser(serve_parser):
+    """Ensure validation fails if reasoning is enabled 
+    without a reasoning parser"""
+    args = serve_parser.parse_args(args=["--enable-reasoning"])
+    with pytest.raises(TypeError):
+        validate_parsed_serve_args(args)
+
+
 def test_chat_template_validation_for_happy_paths(serve_parser):
     """Ensure validation passes if the chat template exists"""
     args = serve_parser.parse_args(
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 077bc993726ae..9e5cf4ba2e490 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -61,6 +61,7 @@
                                               TokenizeRequest,
                                               TokenizeResponse,
                                               UnloadLoraAdapterRequest)
+from vllm.entrypoints.openai.reasoning_parsers import ReasoningParserManager
 # yapf: enable
 from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
 from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
@@ -771,6 +772,8 @@ async def init_app_state(
         return_tokens_as_token_ids=args.return_tokens_as_token_ids,
         enable_auto_tools=args.enable_auto_tool_choice,
         tool_parser=args.tool_call_parser,
+        enable_reasoning=args.enable_reasoning,
+        reasoning_parser=args.reasoning_parser,
         enable_prompt_tokens_details=args.enable_prompt_tokens_details,
     ) if model_config.runner_type == "generate" else None
     state.openai_serving_completion = OpenAIServingCompletion(
@@ -844,6 +847,13 @@ async def run_server(args, **uvicorn_kwargs) -> None:
         raise KeyError(f"invalid tool call parser: {args.tool_call_parser} "
                        f"(chose from {{ {','.join(valid_tool_parses)} }})")
 
+    valid_reasoning_parses = ReasoningParserManager.reasoning_parsers.keys()
+    if args.enable_reasoning \
+        and args.reasoning_parser not in valid_reasoning_parses:
+        raise KeyError(
+            f"invalid reasoning parser: {args.reasoning_parser} "
+            f"(chose from {{ {','.join(valid_reasoning_parses)} }})")
+
     # workaround to make sure that we bind the port before the engine is set up.
     # This avoids race conditions with ray.
     # see https://github.com/vllm-project/vllm/issues/8204
diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py
index 4df75a665bab9..9cfe07c65d55e 100644
--- a/vllm/entrypoints/openai/cli_args.py
+++ b/vllm/entrypoints/openai/cli_args.py
@@ -12,6 +12,7 @@
 from vllm.engine.arg_utils import AsyncEngineArgs, nullable_str
 from vllm.entrypoints.chat_utils import (ChatTemplateContentFormatOption,
                                          validate_chat_template)
+from vllm.entrypoints.openai.reasoning_parsers import ReasoningParserManager
 from vllm.entrypoints.openai.serving_models import (LoRAModulePath,
                                                     PromptAdapterPath)
 from vllm.entrypoints.openai.tool_parsers import ToolParserManager
@@ -208,6 +209,23 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
         default=False,
         help="Enable auto tool choice for supported models. Use "
         "``--tool-call-parser`` to specify which parser to use.")
+    parser.add_argument(
+        "--enable-reasoning",
+        action="store_true",
+        default=False,
+        help="Whether to enable reasoning_content for the model. "
+        "If enabled, the model will be able to generate reasoning content.")
+
+    valid_reasoning_parsers = ReasoningParserManager.reasoning_parsers.keys()
+    parser.add_argument(
+        "--reasoning-parser",
+        type=str,
+        metavar="{" + ",".join(valid_reasoning_parsers) + "}",
+        default=None,
+        help=
+        "Select the reasoning parser depending on the model that you're using."
+        " This is used to parse the reasoning content into OpenAI API "
+        "format. Required for ``--enable-reasoning``.")
 
     valid_tool_parsers = ToolParserManager.tool_parsers.keys()
     parser.add_argument(
@@ -267,6 +285,18 @@ def validate_parsed_serve_args(args: argparse.Namespace):
         raise TypeError("Error: --enable-auto-tool-choice requires "
                         "--tool-call-parser")
 
+    # Enable reasoning needs a reasoning parser to be valid
+    if args.enable_reasoning and not args.reasoning_parser:
+        raise TypeError("Error: --enable-reasoning requires "
+                        "--reasoning-parser")
+
+    # Ref https://api-docs.deepseek.com/guides/reasoning_model
+    # tool call and reasoning cannot be enabled at the same time.
+    if args.enable_auto_tool_choice and args.enable_reasoning:
+        raise TypeError(
+            "Error: --enable-auto-tool-choice and "
+            "--enable-reasoning cannot be enabled at the same time")
+
 
 def create_parser_for_docs() -> FlexibleArgumentParser:
     parser_for_docs = FlexibleArgumentParser(
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index f89c3f42aab17..2bc136cc48038 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -1202,6 +1202,7 @@ class ExtractedToolCallInformation(BaseModel):
 
 class ChatMessage(OpenAIBaseModel):
     role: str
+    reasoning_content: Optional[str] = None
     content: Optional[str] = None
     tool_calls: List[ToolCall] = Field(default_factory=list)
 
@@ -1243,6 +1244,7 @@ class ChatCompletionResponse(OpenAIBaseModel):
 class DeltaMessage(OpenAIBaseModel):
     role: Optional[str] = None
     content: Optional[str] = None
+    reasoning_content: Optional[str] = None
     tool_calls: List[DeltaToolCall] = Field(default_factory=list)
 
 
diff --git a/vllm/entrypoints/openai/reasoning_parsers/__init__.py b/vllm/entrypoints/openai/reasoning_parsers/__init__.py
new file mode 100644
index 0000000000000..a21bff52f61fa
--- /dev/null
+++ b/vllm/entrypoints/openai/reasoning_parsers/__init__.py
@@ -0,0 +1,6 @@
+from .abs_reasoning_parsers import ReasoningParser, ReasoningParserManager
+from .deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser
+
+__all__ = [
+    "ReasoningParser", "ReasoningParserManager", "DeepSeekR1ReasoningParser"
+]
diff --git a/vllm/entrypoints/openai/reasoning_parsers/abs_reasoning_parsers.py b/vllm/entrypoints/openai/reasoning_parsers/abs_reasoning_parsers.py
new file mode 100644
index 0000000000000..e5d10ee0bc3a8
--- /dev/null
+++ b/vllm/entrypoints/openai/reasoning_parsers/abs_reasoning_parsers.py
@@ -0,0 +1,158 @@
+import os
+from functools import cached_property
+from typing import Callable, Dict, List, Optional, Sequence, Tuple, Type, Union
+
+from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
+                                              DeltaMessage)
+from vllm.logger import init_logger
+from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.utils import import_from_path, is_list_of
+
+logger = init_logger(__name__)
+
+
+class ReasoningParser:
+    """
+    Abstract reasoning parser class that should not be used directly. 
+    Provided and methods should be used in derived classes.
+
+    It is used to extract reasoning content from the model output.
+    """
+
+    def __init__(self, tokenizer: AnyTokenizer):
+        self.model_tokenizer = tokenizer
+
+    @cached_property
+    def vocab(self) -> Dict[str, int]:
+        # NOTE: Only PreTrainedTokenizerFast is guaranteed to have .vocab
+        # whereas all tokenizers have .get_vocab()
+        return self.model_tokenizer.get_vocab()
+
+    def extract_reasoning_content(
+            self, model_output: str, request: ChatCompletionRequest
+    ) -> Tuple[Optional[str], Optional[str]]:
+        """
+        Extract reasoning content from a complete model-generated string.
+
+        Used for non-streaming responses where we have the entire model response
+        available before sending to the client.
+
+        Parameters:
+        model_output: str
+            The model-generated string to extract reasoning content from.
+
+        request: ChatCompletionRequest
+            The request object that was used to generate the model_output.
+
+        Returns:
+        Tuple[Optional[str], Optional[str]]
+            A tuple containing the reasoning content and the content.
+        """
+
+        raise NotImplementedError(
+            "AbstractReasoningParser.extract_reasoning_calls "
+            "has not been implemented!")
+
+    def extract_reasoning_content_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+    ) -> Union[DeltaMessage, None]:
+        """
+        Instance method that should be implemented for extracting reasoning
+        from an incomplete response; for use when handling reasoning calls and
+        streaming. Has to be an instance method because  it requires state -
+        the current tokens/diffs, but also the information about what has
+        previously been parsed and extracted (see constructor)
+        """
+        raise NotImplementedError(
+            "AbstractReasoningParser.extract_reasoning_content_streaming "
+            "has not been implemented!")
+
+
+class ReasoningParserManager:
+    reasoning_parsers: Dict[str, Type] = {}
+
+    @classmethod
+    def get_reasoning_parser(cls, name) -> Type:
+        """
+        Get reasoning parser by name which is registered by `register_module`.
+
+        Raise a KeyError exception if the name is not registered.
+        """
+        if name in cls.reasoning_parsers:
+            return cls.reasoning_parsers[name]
+
+        raise KeyError(f"reasoning helper: '{name}' not found in "
+                       "reasoning_parsers")
+
+    @classmethod
+    def _register_module(cls,
+                         module: Type,
+                         module_name: Optional[Union[str, List[str]]] = None,
+                         force: bool = True) -> None:
+        if not issubclass(module, ReasoningParser):
+            raise TypeError("module must be subclass of ReasoningParser, "
+                            f"but got {type(module)}")
+        if module_name is None:
+            module_name = module.__name__
+        if isinstance(module_name, str):
+            module_name = [module_name]
+        for name in module_name:
+            if not force and name in cls.reasoning_parsers:
+                existed_module = cls.reasoning_parsers[name]
+                raise KeyError(f"{name} is already registered "
+                               f"at {existed_module.__module__}")
+            cls.reasoning_parsers[name] = module
+
+    @classmethod
+    def register_module(
+            cls,
+            name: Optional[Union[str, List[str]]] = None,
+            force: bool = True,
+            module: Union[Type, None] = None) -> Union[type, Callable]:
+        """
+        Register module with the given name or name list. it can be used as a
+        decoder(with module as None) or normal function(with module as not 
+        None).
+        """
+        if not isinstance(force, bool):
+            raise TypeError(f"force must be a boolean, but got {type(force)}")
+
+        # raise the error ahead of time
+        if not (name is None or isinstance(name, str)
+                or is_list_of(name, str)):
+            raise TypeError(
+                "name must be None, an instance of str, or a sequence of str, "
+                f"but got {type(name)}")
+
+        # use it as a normal method: x.register_module(module=SomeClass)
+        if module is not None:
+            cls._register_module(module=module, module_name=name, force=force)
+            return module
+
+        # use it as a decorator: @x.register_module()
+        def _register(module):
+            cls._register_module(module=module, module_name=name, force=force)
+            return module
+
+        return _register
+
+    @classmethod
+    def import_reasoning_parser(cls, plugin_path: str) -> None:
+        """
+        Import a user-defined reasoning parser by the path 
+        of the reasoning parser define file.
+        """
+        module_name = os.path.splitext(os.path.basename(plugin_path))[0]
+
+        try:
+            import_from_path(module_name, plugin_path)
+        except Exception:
+            logger.exception("Failed to load module '%s' from %s.",
+                             module_name, plugin_path)
+            return
diff --git a/vllm/entrypoints/openai/reasoning_parsers/deepseek_r1_reasoning_parser.py b/vllm/entrypoints/openai/reasoning_parsers/deepseek_r1_reasoning_parser.py
new file mode 100644
index 0000000000000..a440ddc8d3b5d
--- /dev/null
+++ b/vllm/entrypoints/openai/reasoning_parsers/deepseek_r1_reasoning_parser.py
@@ -0,0 +1,133 @@
+import re
+from typing import Optional, Sequence, Tuple, Union
+
+from transformers import PreTrainedTokenizerBase
+
+from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
+                                              DeltaMessage)
+from vllm.entrypoints.openai.reasoning_parsers.abs_reasoning_parsers import (
+    ReasoningParser, ReasoningParserManager)
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+
+@ReasoningParserManager.register_module("deepseek_r1")
+class DeepSeekR1ReasoningParser(ReasoningParser):
+    """
+    Reasoning parser for DeepSeek R1 model.
+
+    The DeepSeek R1 model uses <think>...</think> tokens to denote reasoning 
+    text. This parser extracts the reasoning content from the model output.
+    """
+
+    def __init__(self, tokenizer: PreTrainedTokenizerBase):
+        super().__init__(tokenizer)
+        self.think_start_token = "<think>"
+        self.think_end_token = "</think>"
+
+        self.reasoning_regex = re.compile(
+            rf"{self.think_start_token}(.*?){self.think_end_token}", re.DOTALL)
+
+        if not self.model_tokenizer:
+            raise ValueError(
+                "The model tokenizer must be passed to the ReasoningParser "
+                "constructor during construction.")
+
+        self.think_start_token_id = self.vocab.get(self.think_start_token)
+        self.think_end_token_id = self.vocab.get(self.think_end_token)
+        if (self.think_start_token_id is None
+                or self.think_end_token_id is None):
+            raise RuntimeError(
+                "DeepSeek R1 reasoning parser could not locate think start/end "
+                "tokens in the tokenizer!")
+
+    def extract_reasoning_content_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+    ) -> Union[DeltaMessage, None]:
+        """
+        Extract reasoning content from a delta message.
+        Handles streaming output where previous + delta = current.
+        Uses token IDs for faster processing.
+        For text <think>abc</think>xyz:
+        - 'abc' goes to reasoning_content
+        - 'xyz' goes to content
+        """
+        # Skip single special tokens
+        if len(delta_token_ids) == 1 and (delta_token_ids[0] in [
+                self.think_start_token_id, self.think_end_token_id
+        ]):
+            return None
+
+        if self.think_start_token_id in previous_token_ids:
+            if self.think_end_token_id in delta_token_ids:
+                # <think> in previous, </think> in delta,
+                # extract reasoning content
+                end_index = delta_text.find(self.think_end_token)
+                reasoning_content = delta_text[:end_index]
+                content = delta_text[end_index + len(self.think_end_token):]
+                return DeltaMessage(reasoning_content=reasoning_content,
+                                    content=content if content else None)
+            elif self.think_end_token_id in previous_token_ids:
+                # <think> in previous, </think> in previous,
+                # reasoning content continues
+                return DeltaMessage(content=delta_text)
+            else:
+                # <think> in previous, no </think> in previous or delta,
+                # reasoning content continues
+                return DeltaMessage(reasoning_content=delta_text)
+        elif self.think_start_token_id in delta_token_ids:
+            logger.info(delta_text)
+            if self.think_end_token_id in delta_token_ids:
+                # <think> in delta, </think> in delta, extract reasoning content
+                start_index = delta_text.find(self.think_start_token)
+                end_index = delta_text.find(self.think_end_token)
+                reasoning_content = delta_text[start_index +
+                                               len(self.think_start_token
+                                                   ):end_index]
+                content = delta_text[end_index + len(self.think_end_token):]
+                return DeltaMessage(reasoning_content=reasoning_content,
+                                    content=content if content else None)
+            else:
+                # <think> in delta, no </think> in delta,
+                # reasoning content continues
+                return DeltaMessage(reasoning_content=delta_text)
+        else:
+            # No <think> in previous or delta, reasoning content continues.
+            return DeltaMessage(content=delta_text)
+
+    def extract_reasoning_content(
+            self, model_output: str, request: ChatCompletionRequest
+    ) -> Tuple[Optional[str], Optional[str]]:
+
+        # Check if the model output contains the <think> tokens.
+        if (self.think_start_token not in model_output
+                or self.think_end_token not in model_output):
+            return None, model_output
+        else:
+            # Use a regex to find the reasoning content
+            reasoning_content = self.reasoning_regex.findall(model_output)[0]
+
+            # Remove the reasoning content from the model output
+            # Although deepseek's <think> token is always at the
+            # beginning of the line, we cannot guarantee that the
+            # other models will follow this convention.
+            # Therefore, we need to add :start_index.
+            start_index = model_output.find(self.think_start_token)
+            if start_index != -1:
+                end_index = start_index + len(
+                    f"{self.think_start_token}{reasoning_content}{self.think_end_token}"
+                )
+                model_output = model_output[:start_index] + \
+                                model_output[end_index:]
+
+                if len(model_output) == 0:
+                    return reasoning_content, None
+
+            return reasoning_content, model_output
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 89a119ac65695..dc97f0eb059d7 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -21,6 +21,8 @@
     ChatCompletionStreamResponse, ChatMessage, DeltaFunctionCall, DeltaMessage,
     DeltaToolCall, ErrorResponse, FunctionCall, PromptTokenUsageInfo,
     RequestResponseMetadata, ToolCall, UsageInfo)
+from vllm.entrypoints.openai.reasoning_parsers import (ReasoningParser,
+                                                       ReasoningParserManager)
 from vllm.entrypoints.openai.serving_engine import OpenAIServing
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels
 from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager
@@ -47,6 +49,8 @@ def __init__(
         chat_template: Optional[str],
         chat_template_content_format: ChatTemplateContentFormatOption,
         return_tokens_as_token_ids: bool = False,
+        enable_reasoning: bool = False,
+        reasoning_parser: Optional[str] = None,
         enable_auto_tools: bool = False,
         tool_parser: Optional[str] = None,
         enable_prompt_tokens_details: bool = False,
@@ -69,6 +73,18 @@ def __init__(
                 " the parallel_tool_calls client option is preset for "
                 "compatibility reasons, it will be ignored.")
 
+        self.enable_reasoning: bool = enable_reasoning
+        self.reasoning_parser: Optional[Callable[[AnyTokenizer],
+                                                 ReasoningParser]] = None
+        if self.enable_reasoning:
+            try:
+                self.reasoning_parser = (
+                    ReasoningParserManager.get_reasoning_parser(
+                        reasoning_parser))
+            except Exception as e:
+                raise TypeError("Error: --enable-reasoning requires "
+                                f"reasoning_parser:'{reasoning_parser}' "
+                                "which has not been registered") from e
         self.tool_parser: Optional[Callable[[AnyTokenizer], ToolParser]] = None
         if self.enable_auto_tools:
             try:
@@ -285,14 +301,35 @@ async def chat_completion_stream_generator(
             not tool_choice_function_name
             and self._should_stream_with_auto_tool_parsing(request))
 
+        should_stream_with_reasoning_parsing = (
+            self._should_stream_with_reasoning_parsing(request))
+
         all_previous_token_ids: Optional[List[List[int]]]
-        if tool_choice_auto:
+
+        # Only one of these will be used, thus previous_texts and
+        # all_previous_token_ids will not be used twice in the same iteration.
+        if tool_choice_auto or should_stream_with_reasoning_parsing:
             # These are only required in "auto" tool choice case
             previous_texts = [""] * num_choices
             all_previous_token_ids = [[]] * num_choices
         else:
             previous_texts, all_previous_token_ids = None, None
 
+        try:
+            # There is no need to check if the reasoning_parser is None
+            # because the should_stream_with_reasoning_parsing check
+            # already ensures that the reasoning_parser is not None.
+            # but the pre-commit hook requires it.
+            if should_stream_with_reasoning_parsing and \
+                self.reasoning_parser is not None:
+                reasoning_parser = self.reasoning_parser(tokenizer)
+        except RuntimeError as e:
+            logger.exception("Error in reasoning parser creation.")
+            data = self.create_streaming_error_response(str(e))
+            yield f"data: {data}\n\n"
+            yield "data: [DONE]\n\n"
+            return
+
         # Prepare the tool parser if it's needed
         try:
             if tool_choice_auto and self.tool_parser:
@@ -456,6 +493,32 @@ async def chat_completion_stream_generator(
                         # update the previous values for the next iteration
                         previous_texts[i] = current_text
                         all_previous_token_ids[i] = current_token_ids
+                    # reasoning_content cannot be enabled with tool_choice.
+                    # If it is, the tool_choice will be used instead.
+                    elif self.enable_reasoning:
+                        # handle reasoning_content delta
+                        assert reasoning_parser is not None
+                        assert previous_texts is not None
+                        assert all_previous_token_ids is not None
+                        previous_text = previous_texts[i]
+                        previous_token_ids = all_previous_token_ids[i]
+                        current_text = previous_text + delta_text
+                        current_token_ids = previous_token_ids + list(
+                            output.token_ids)
+
+                        delta_message = (reasoning_parser.
+                                         extract_reasoning_content_streaming(
+                                             previous_text,
+                                             current_text,
+                                             delta_text,
+                                             previous_token_ids,
+                                             current_token_ids,
+                                             output.token_ids,
+                                         ))
+
+                        # update the previous values for the next iteration
+                        previous_texts[i] = current_text
+                        all_previous_token_ids[i] = current_token_ids
 
                     # handle streaming just a content delta
                     else:
@@ -642,17 +705,38 @@ async def chat_completion_full_generator(
             else:
                 logprobs = None
 
+            should_stream_with_reasoning_parsing = (
+                self._should_stream_with_reasoning_parsing(request))
+
             # In the OpenAI API the finish_reason is "tools_called"
             # if the tool choice is auto and the model produced a tool
             # call. The same is not true for named function calls
             auto_tools_called = False
 
+            if should_stream_with_reasoning_parsing and \
+                self.reasoning_parser is not None:
+                try:
+                    reasoning_parser = self.reasoning_parser(tokenizer)
+                except RuntimeError as e:
+                    logger.exception("Error in reasoning parser creation.")
+                    return self.create_error_response(str(e))
+
+                reasoning_content, content = (
+                    reasoning_parser.extract_reasoning_content(
+                        output.text, request=request))
+
+                if reasoning_content:
+                    message = ChatMessage(role=role,
+                                          content=content,
+                                          reasoning_content=reasoning_content)
+                else:
+                    message = ChatMessage(role=role, content=output.text)
+
             # if auto tools are not enabled, and a named tool choice using
             #   outlines is not being used
-            if (not self.enable_auto_tools
-                    or not self.tool_parser) and not isinstance(
-                        request.tool_choice,
-                        ChatCompletionNamedToolChoiceParam):
+            elif (not self.enable_auto_tools
+                  or not self.tool_parser) and not isinstance(
+                      request.tool_choice, ChatCompletionNamedToolChoiceParam):
                 message = ChatMessage(role=role, content=output.text)
 
             # if the request uses tools and specified a tool choice
@@ -835,6 +919,17 @@ def _should_stream_with_auto_tool_parsing(self,
         return (request.tools and self.tool_parser and self.enable_auto_tools
                 and request.tool_choice in ['auto', None])
 
+    def _should_stream_with_reasoning_parsing(self,
+                                              request: ChatCompletionRequest):
+        """
+            Utility function to check if streamed tokens should go through the
+            reasoning parser that was configured.
+    
+            We only want to do this IF reasoning is enabled and a reasoning 
+            parser is configured.
+            """
+        return self.enable_reasoning and self.reasoning_parser is not None
+
     def _should_check_for_unstreamed_tool_arg_tokens(
         self,
         delta_message: Optional[DeltaMessage],
diff --git a/vllm/scripts.py b/vllm/scripts.py
index 42e1c639eda10..8101e6b3af7ee 100644
--- a/vllm/scripts.py
+++ b/vllm/scripts.py
@@ -167,6 +167,7 @@ def main():
         "Must be a YAML with the following options:"
         "https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#cli-reference"
     )
+
     serve_parser = make_arg_parser(serve_parser)
     serve_parser.set_defaults(dispatch_function=serve)
 

From dd6a3a02cb3bf2a7bc6cb84c85dcd57c6eaf2bf9 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Wed, 29 Jan 2025 03:38:29 +0000
Subject: [PATCH 420/462] [Doc] Convert docs to use colon fences (#12471)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 docs/requirements-docs.txt                    |    4 +-
 docs/source/api/engine/index.md               |    4 +-
 docs/source/api/model/index.md                |    4 +-
 docs/source/api/multimodal/index.md           |    4 +-
 docs/source/api/offline_inference/index.md    |    4 +-
 .../contributing/dockerfile/dockerfile.md     |    4 +-
 docs/source/contributing/model/basic.md       |    8 +-
 docs/source/contributing/model/index.md       |   12 +-
 docs/source/contributing/model/multimodal.md  |   32 +-
 .../source/contributing/model/registration.md |   16 +-
 docs/source/contributing/model/tests.md       |    8 +-
 docs/source/contributing/overview.md          |   12 +-
 .../contributing/profiling/profiling_index.md |   12 +-
 docs/source/deployment/docker.md              |   16 +-
 .../source/deployment/frameworks/cerebrium.md |    4 +-
 docs/source/deployment/frameworks/dstack.md   |    8 +-
 docs/source/deployment/frameworks/helm.md     |  408 +++---
 docs/source/deployment/frameworks/index.md    |    4 +-
 docs/source/deployment/frameworks/skypilot.md |   36 +-
 docs/source/deployment/integrations/index.md  |    4 +-
 docs/source/deployment/nginx.md               |    4 +-
 docs/source/design/arch_overview.md           |   20 +-
 docs/source/design/kernel/paged_attention.md  |   32 +-
 docs/source/design/multiprocessing.md         |    4 +-
 .../features/automatic_prefix_caching.md      |    4 +-
 docs/source/features/compatibility_matrix.md  |  879 ++++++------
 docs/source/features/disagg_prefill.md        |   20 +-
 docs/source/features/lora.md                  |    4 +-
 docs/source/features/quantization/auto_awq.md |    4 +-
 docs/source/features/quantization/fp8.md      |   16 +-
 docs/source/features/quantization/gguf.md     |   12 +-
 docs/source/features/quantization/index.md    |    4 +-
 docs/source/features/quantization/int8.md     |    8 +-
 .../quantization/supported_hardware.md        |  229 +--
 docs/source/features/spec_decode.md           |    8 +-
 docs/source/features/structured_outputs.md    |    4 +-
 docs/source/generate_examples.py              |    4 +-
 .../ai_accelerator/hpu-gaudi.inc.md           |   76 +-
 .../installation/ai_accelerator/index.md      |  272 ++--
 .../installation/ai_accelerator/neuron.inc.md |    4 +-
 .../installation/ai_accelerator/tpu.inc.md    |   51 +-
 .../installation/cpu/apple.inc.md             |    4 +-
 .../getting_started/installation/cpu/index.md |   86 +-
 .../installation/cpu/x86.inc.md               |    4 +-
 .../installation/gpu/cuda.inc.md              |   12 +-
 .../getting_started/installation/gpu/index.md |  206 +--
 .../installation/gpu/rocm.inc.md              |   25 +-
 .../installation/gpu/xpu.inc.md               |    4 +-
 .../getting_started/installation/index.md     |    4 +-
 .../installation/python_env_setup.inc.md      |    4 +-
 docs/source/getting_started/quickstart.md     |   12 +-
 .../source/getting_started/troubleshooting.md |   12 +-
 docs/source/index.md                          |   48 +-
 docs/source/models/extensions/index.md        |    4 +-
 .../models/extensions/runai_model_streamer.md |    4 +-
 docs/source/models/extensions/tensorizer.md   |    4 +-
 docs/source/models/generative_models.md       |    4 +-
 docs/source/models/pooling_models.md          |   60 +-
 docs/source/models/supported_models.md        | 1263 +++++++++--------
 docs/source/serving/distributed_serving.md    |   12 +-
 docs/source/serving/engine_args.md            |    2 +
 docs/source/serving/env_vars.md               |    8 +-
 docs/source/serving/integrations/index.md     |    4 +-
 docs/source/serving/metrics.md                |    4 +-
 docs/source/serving/multimodal_inputs.md      |   53 +-
 docs/source/serving/offline_inference.md      |    8 +-
 .../serving/openai_compatible_server.md       |   56 +-
 pyproject.toml                                |    1 +
 68 files changed, 2091 insertions(+), 2080 deletions(-)

diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt
index 8217bc3ba3ded..1d669699f4b2a 100644
--- a/docs/requirements-docs.txt
+++ b/docs/requirements-docs.txt
@@ -1,10 +1,10 @@
 sphinx==6.2.1
+sphinx-argparse==0.4.0
 sphinx-book-theme==1.0.1
 sphinx-copybutton==0.5.2
-myst-parser==3.0.1
-sphinx-argparse==0.4.0
 sphinx-design==0.6.1
 sphinx-togglebutton==0.3.2
+myst-parser==3.0.1
 msgspec
 cloudpickle
 
diff --git a/docs/source/api/engine/index.md b/docs/source/api/engine/index.md
index 701cb95d3be33..b6544d94afdf8 100644
--- a/docs/source/api/engine/index.md
+++ b/docs/source/api/engine/index.md
@@ -8,10 +8,10 @@
 .. currentmodule:: vllm.engine
 ```
 
-```{toctree}
+:::{toctree}
 :caption: Engines
 :maxdepth: 2
 
 llm_engine
 async_llm_engine
-```
+:::
diff --git a/docs/source/api/model/index.md b/docs/source/api/model/index.md
index 113792147be7c..8fee3a55c93de 100644
--- a/docs/source/api/model/index.md
+++ b/docs/source/api/model/index.md
@@ -2,10 +2,10 @@
 
 ## Submodules
 
-```{toctree}
+:::{toctree}
 :maxdepth: 1
 
 interfaces_base
 interfaces
 adapters
-```
+:::
diff --git a/docs/source/api/multimodal/index.md b/docs/source/api/multimodal/index.md
index 14efdb506d76f..069ed53e545c5 100644
--- a/docs/source/api/multimodal/index.md
+++ b/docs/source/api/multimodal/index.md
@@ -17,7 +17,7 @@ Looking to add your own multi-modal model? Please follow the instructions listed
 
 ## Submodules
 
-```{toctree}
+:::{toctree}
 :maxdepth: 1
 
 inputs
@@ -25,4 +25,4 @@ parse
 processing
 profiling
 registry
-```
+:::
diff --git a/docs/source/api/offline_inference/index.md b/docs/source/api/offline_inference/index.md
index c32f99d59e3db..ec2cc599d923c 100644
--- a/docs/source/api/offline_inference/index.md
+++ b/docs/source/api/offline_inference/index.md
@@ -1,9 +1,9 @@
 # Offline Inference
 
-```{toctree}
+:::{toctree}
 :caption: Contents
 :maxdepth: 1
 
 llm
 llm_inputs
-```
+:::
diff --git a/docs/source/contributing/dockerfile/dockerfile.md b/docs/source/contributing/dockerfile/dockerfile.md
index cb142318b8724..96674805df534 100644
--- a/docs/source/contributing/dockerfile/dockerfile.md
+++ b/docs/source/contributing/dockerfile/dockerfile.md
@@ -17,11 +17,11 @@ The edges of the build graph represent:
 
 - `RUN --mount=(.\*)from=...` dependencies (with a dotted line and an empty diamond arrow head)
 
-  > ```{figure} /assets/contributing/dockerfile-stages-dependency.png
+  > :::{figure} /assets/contributing/dockerfile-stages-dependency.png
   > :align: center
   > :alt: query
   > :width: 100%
-  > ```
+  > :::
   >
   > Made using: <https://github.com/patrickhoefler/dockerfilegraph>
   >
diff --git a/docs/source/contributing/model/basic.md b/docs/source/contributing/model/basic.md
index b9b92fd027f6e..180fdd59e9a64 100644
--- a/docs/source/contributing/model/basic.md
+++ b/docs/source/contributing/model/basic.md
@@ -10,9 +10,9 @@ First, clone the PyTorch model code from the source repository.
 For instance, vLLM's [OPT model](gh-file:vllm/model_executor/models/opt.py) was adapted from
 HuggingFace's [modeling_opt.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py) file.
 
-```{warning}
+:::{warning}
 Make sure to review and adhere to the original code's copyright and licensing terms!
-```
+:::
 
 ## 2. Make your code compatible with vLLM
 
@@ -80,10 +80,10 @@ def forward(
     ...
 ```
 
-```{note}
+:::{note}
 Currently, vLLM supports the basic multi-head attention mechanism and its variant with rotary positional embeddings.
 If your model employs a different attention mechanism, you will need to implement a new attention layer in vLLM.
-```
+:::
 
 For reference, check out our [Llama implementation](gh-file:vllm/model_executor/models/llama.py). vLLM already supports a large number of models. It is recommended to find a model similar to yours and adapt it to your model's architecture. Check out <gh-dir:vllm/model_executor/models> for more examples.
 
diff --git a/docs/source/contributing/model/index.md b/docs/source/contributing/model/index.md
index fe018b61b08cf..721ee3cd2047c 100644
--- a/docs/source/contributing/model/index.md
+++ b/docs/source/contributing/model/index.md
@@ -4,7 +4,7 @@
 
 This section provides more information on how to integrate a [PyTorch](https://pytorch.org/) model into vLLM.
 
-```{toctree}
+:::{toctree}
 :caption: Contents
 :maxdepth: 1
 
@@ -12,16 +12,16 @@ basic
 registration
 tests
 multimodal
-```
+:::
 
-```{note}
+:::{note}
 The complexity of adding a new model depends heavily on the model's architecture.
 The process is considerably straightforward if the model shares a similar architecture with an existing model in vLLM.
 However, for models that include new operators (e.g., a new attention mechanism), the process can be a bit more complex.
-```
+:::
 
-```{tip}
+:::{tip}
 If you are encountering issues while integrating your model into vLLM, feel free to open a [GitHub issue](https://github.com/vllm-project/vllm/issues)
 or ask on our [developer slack](https://slack.vllm.ai).
 We will be happy to help you out!
-```
+:::
diff --git a/docs/source/contributing/model/multimodal.md b/docs/source/contributing/model/multimodal.md
index e5fd9a2877ceb..6c6f3b701cd28 100644
--- a/docs/source/contributing/model/multimodal.md
+++ b/docs/source/contributing/model/multimodal.md
@@ -48,9 +48,9 @@ Further update the model as follows:
             return vision_embeddings
     ```
 
-    ```{important}
+    :::{important}
     The returned `multimodal_embeddings` must be either a **3D {class}`torch.Tensor`** of shape `(num_items, feature_size, hidden_size)`, or a **list / tuple of 2D {class}`torch.Tensor`'s** of shape `(feature_size, hidden_size)`, so that `multimodal_embeddings[i]` retrieves the embeddings generated from the `i`-th multimodal data item (e.g, image) of the request.
-    ```
+    :::
 
 - Implement {meth}`~vllm.model_executor.models.interfaces.SupportsMultiModal.get_input_embeddings` to merge `multimodal_embeddings` with text embeddings from the `input_ids`. If input processing for the model is implemented correctly (see sections below), then you can leverage the utility function we provide to easily merge the embeddings.
 
@@ -89,10 +89,10 @@ Further update the model as follows:
   + class YourModelForImage2Seq(nn.Module, SupportsMultiModal):
   ```
 
-  ```{note}
+  :::{note}
   The model class does not have to be named {code}`*ForCausalLM`.
   Check out [the HuggingFace Transformers documentation](https://huggingface.co/docs/transformers/model_doc/auto#multimodal) for some examples.
-  ```
+  :::
 
 ## 2. Specify processing information
 
@@ -120,8 +120,8 @@ When calling the model, the output embeddings from the visual encoder are assign
 containing placeholder feature tokens. Therefore, the number of placeholder feature tokens should be equal
 to the size of the output embeddings.
 
-::::{tab-set}
-:::{tab-item} Basic example: LLaVA
+:::::{tab-set}
+::::{tab-item} Basic example: LLaVA
 :sync: llava
 
 Looking at the code of HF's `LlavaForConditionalGeneration`:
@@ -254,12 +254,12 @@ def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
     return {"image": self.get_max_image_tokens()}
 ```
 
-```{note}
+:::{note}
 Our [actual code](gh-file:vllm/model_executor/models/llava.py) is more abstracted to support vision encoders other than CLIP.
-```
-
 :::
+
 ::::
+:::::
 
 ## 3. Specify dummy inputs
 
@@ -315,17 +315,17 @@ def get_dummy_processor_inputs(
 Afterwards, create a subclass of {class}`~vllm.multimodal.processing.BaseMultiModalProcessor`
 to fill in the missing details about HF processing.
 
-```{seealso}
+:::{seealso}
 [Multi-Modal Data Processing](#mm-processing)
-```
+:::
 
 ### Multi-modal fields
 
 Override {class}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_mm_fields_config` to
 return a schema of the tensors outputted by the HF processor that are related to the input multi-modal items.
 
-::::{tab-set}
-:::{tab-item} Basic example: LLaVA
+:::::{tab-set}
+::::{tab-item} Basic example: LLaVA
 :sync: llava
 
 Looking at the model's `forward` method:
@@ -367,13 +367,13 @@ def _get_mm_fields_config(
     )
 ```
 
-```{note}
+:::{note}
 Our [actual code](gh-file:vllm/model_executor/models/llava.py) additionally supports
 pre-computed image embeddings, which can be passed to be model via the `image_embeds` argument.
-```
-
 :::
+
 ::::
+:::::
 
 ### Prompt replacements
 
diff --git a/docs/source/contributing/model/registration.md b/docs/source/contributing/model/registration.md
index d6c9e4181dfee..64cd25b53807e 100644
--- a/docs/source/contributing/model/registration.md
+++ b/docs/source/contributing/model/registration.md
@@ -17,17 +17,17 @@ After you have implemented your model (see [tutorial](#new-model-basic)), put it
 Then, add your model class to `_VLLM_MODELS` in <gh-file:vllm/model_executor/models/registry.py> so that it is automatically registered upon importing vLLM.
 Finally, update our [list of supported models](#supported-models) to promote your model!
 
-```{important}
+:::{important}
 The list of models in each section should be maintained in alphabetical order.
-```
+:::
 
 ## Out-of-tree models
 
 You can load an external model using a plugin without modifying the vLLM codebase.
 
-```{seealso}
+:::{seealso}
 [vLLM's Plugin System](#plugin-system)
-```
+:::
 
 To register the model, use the following code:
 
@@ -45,11 +45,11 @@ from vllm import ModelRegistry
 ModelRegistry.register_model("YourModelForCausalLM", "your_code:YourModelForCausalLM")
 ```
 
-```{important}
+:::{important}
 If your model is a multimodal model, ensure the model class implements the {class}`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface.
 Read more about that [here](#supports-multimodal).
-```
+:::
 
-```{note}
+:::{note}
 Although you can directly put these code snippets in your script using `vllm.LLM`, the recommended way is to place these snippets in a vLLM plugin. This ensures compatibility with various vLLM features like distributed inference and the API server.
-```
+:::
diff --git a/docs/source/contributing/model/tests.md b/docs/source/contributing/model/tests.md
index 74c933b2f45da..68d51d89f7cff 100644
--- a/docs/source/contributing/model/tests.md
+++ b/docs/source/contributing/model/tests.md
@@ -14,14 +14,14 @@ Without them, the CI for your PR will fail.
 Include an example HuggingFace repository for your model in <gh-file:tests/models/registry.py>.
 This enables a unit test that loads dummy weights to ensure that the model can be initialized in vLLM.
 
-```{important}
+:::{important}
 The list of models in each section should be maintained in alphabetical order.
-```
+:::
 
-```{tip}
+:::{tip}
 If your model requires a development version of HF Transformers, you can set
 `min_transformers_version` to skip the test in CI until the model is released.
-```
+:::
 
 ## Optional Tests
 
diff --git a/docs/source/contributing/overview.md b/docs/source/contributing/overview.md
index 36cf8e7440eca..908c7cb4d38ee 100644
--- a/docs/source/contributing/overview.md
+++ b/docs/source/contributing/overview.md
@@ -35,17 +35,17 @@ pre-commit run --all-files
 pytest tests/
 ```
 
-```{note}
+:::{note}
 Currently, the repository is not fully checked by `mypy`.
-```
+:::
 
 ## Issues
 
 If you encounter a bug or have a feature request, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible.
 
-```{important}
+:::{important}
 If you discover a security vulnerability, please follow the instructions [here](gh-file:SECURITY.md#reporting-a-vulnerability).
-```
+:::
 
 ## Pull Requests & Code Reviews
 
@@ -81,9 +81,9 @@ appropriately to indicate the type of change. Please use one of the following:
 - `[Misc]` for PRs that do not fit the above categories. Please use this
   sparingly.
 
-```{note}
+:::{note}
 If the PR spans more than one category, please include all relevant prefixes.
-```
+:::
 
 ### Code Quality
 
diff --git a/docs/source/contributing/profiling/profiling_index.md b/docs/source/contributing/profiling/profiling_index.md
index 001db86bdf555..79aeb292a9b73 100644
--- a/docs/source/contributing/profiling/profiling_index.md
+++ b/docs/source/contributing/profiling/profiling_index.md
@@ -6,21 +6,21 @@ The OpenAI server also needs to be started with the `VLLM_TORCH_PROFILER_DIR` en
 
 When using `benchmarks/benchmark_serving.py`, you can enable profiling by passing the `--profile` flag.
 
-```{warning}
+:::{warning}
 Only enable profiling in a development environment.
-```
+:::
 
 Traces can be visualized using <https://ui.perfetto.dev/>.
 
-```{tip}
+:::{tip}
 Only send a few requests through vLLM when profiling, as the traces can get quite large. Also, no need to untar the traces, they can be viewed directly.
-```
+:::
 
-```{tip}
+:::{tip}
 To stop the profiler - it flushes out all the profile trace files to the directory. This takes time, for example for about 100 requests worth of data for a llama 70b, it takes about 10 minutes to flush out on a H100.
 Set the env variable VLLM_RPC_TIMEOUT to a big number before you start the server. Say something like 30 minutes.
 `export VLLM_RPC_TIMEOUT=1800000`
-```
+:::
 
 ## Example commands and usage
 
diff --git a/docs/source/deployment/docker.md b/docs/source/deployment/docker.md
index 438be47316f3b..334c02225bd6b 100644
--- a/docs/source/deployment/docker.md
+++ b/docs/source/deployment/docker.md
@@ -21,11 +21,11 @@ $ docker run --runtime nvidia --gpus all \
 
 You can add any other <project:#engine-args> you need after the image tag (`vllm/vllm-openai:latest`).
 
-```{note}
+:::{note}
 You can either use the `ipc=host` flag or `--shm-size` flag to allow the
 container to access the host's shared memory. vLLM uses PyTorch, which uses shared
 memory to share data between processes under the hood, particularly for tensor parallel inference.
-```
+:::
 
 (deployment-docker-build-image-from-source)=
 
@@ -38,25 +38,25 @@ You can build and run vLLM from source via the provided <gh-file:Dockerfile>. To
 DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai
 ```
 
-```{note}
+:::{note}
 By default vLLM will build for all GPU types for widest distribution. If you are just building for the
 current GPU type the machine is running on, you can add the argument `--build-arg torch_cuda_arch_list=""`
 for vLLM to find the current GPU type and build for that.
 
 If you are using Podman instead of Docker, you might need to disable SELinux labeling by
 adding `--security-opt label=disable` when running `podman build` command to avoid certain [existing issues](https://github.com/containers/buildah/discussions/4184).
-```
+:::
 
 ## Building for Arm64/aarch64
 
 A docker container can be built for aarch64 systems such as the Nvidia Grace-Hopper. At time of this writing, this requires the use
 of PyTorch Nightly and should be considered **experimental**. Using the flag `--platform "linux/arm64"` will attempt to build for arm64.
 
-```{note}
+:::{note}
 Multiple modules must be compiled, so this process can take a while. Recommend using `--build-arg max_jobs=` & `--build-arg nvcc_threads=`
 flags to speed up build process. However, ensure your `max_jobs` is substantially larger than `nvcc_threads` to get the most benefits.
 Keep an eye on memory usage with parallel jobs as it can be substantial (see example below).
-```
+:::
 
 ```console
 # Example of building on Nvidia GH200 server. (Memory usage: ~15GB, Build time: ~1475s / ~25 min, Image size: 6.93GB)
@@ -85,6 +85,6 @@ $ docker run --runtime nvidia --gpus all \
 
 The argument `vllm/vllm-openai` specifies the image to run, and should be replaced with the name of the custom-built image (the `-t` tag from the build command).
 
-```{note}
+:::{note}
 **For version 0.4.1 and 0.4.2 only** - the vLLM docker images under these versions are supposed to be run under the root user since a library under the root user's home directory, i.e. `/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1` is required to be loaded during runtime. If you are running the container under a different user, you may need to first change the permissions of the library (and all the parent directories) to allow the user to access it, then run vLLM with environment variable `VLLM_NCCL_SO_PATH=/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1` .
-```
+:::
diff --git a/docs/source/deployment/frameworks/cerebrium.md b/docs/source/deployment/frameworks/cerebrium.md
index 5787c4a407bfb..b20c95137b6e7 100644
--- a/docs/source/deployment/frameworks/cerebrium.md
+++ b/docs/source/deployment/frameworks/cerebrium.md
@@ -2,11 +2,11 @@
 
 # Cerebrium
 
-```{raw} html
+:::{raw} html
 <p align="center">
     <img src="https://i.ibb.co/hHcScTT/Screenshot-2024-06-13-at-10-14-54.png" alt="vLLM_plus_cerebrium"/>
 </p>
-```
+:::
 
 vLLM can be run on a cloud based GPU machine with [Cerebrium](https://www.cerebrium.ai/), a serverless AI infrastructure platform that makes it easier for companies to build and deploy AI based applications.
 
diff --git a/docs/source/deployment/frameworks/dstack.md b/docs/source/deployment/frameworks/dstack.md
index b42a34125c6d7..a16e28f2d8983 100644
--- a/docs/source/deployment/frameworks/dstack.md
+++ b/docs/source/deployment/frameworks/dstack.md
@@ -2,11 +2,11 @@
 
 # dstack
 
-```{raw} html
+:::{raw} html
 <p align="center">
     <img src="https://i.ibb.co/71kx6hW/vllm-dstack.png" alt="vLLM_plus_dstack"/>
 </p>
-```
+:::
 
 vLLM can be run on a cloud based GPU machine with [dstack](https://dstack.ai/), an open-source framework for running LLMs on any cloud. This tutorial assumes that you have already configured credentials, gateway, and GPU quotas on your cloud environment.
 
@@ -97,6 +97,6 @@ completion = client.chat.completions.create(
 print(completion.choices[0].message.content)
 ```
 
-```{note}
+:::{note}
 dstack automatically handles authentication on the gateway using dstack's tokens. Meanwhile, if you don't want to configure a gateway, you can provision dstack `Task` instead of `Service`. The `Task` is for development purpose only. If you want to know more about hands-on materials how to serve vLLM using dstack, check out [this repository](https://github.com/dstackai/dstack-examples/tree/main/deployment/vllm)
-```
+:::
diff --git a/docs/source/deployment/frameworks/helm.md b/docs/source/deployment/frameworks/helm.md
index 18ed293191468..e4fc5e1313079 100644
--- a/docs/source/deployment/frameworks/helm.md
+++ b/docs/source/deployment/frameworks/helm.md
@@ -38,213 +38,213 @@ chart **including persistent volumes** and deletes the release.
 
 ## Architecture
 
-```{image} /assets/deployment/architecture_helm_deployment.png
-```
+:::{image} /assets/deployment/architecture_helm_deployment.png
+:::
 
 ## Values
 
-```{list-table}
+:::{list-table}
 :widths: 25 25 25 25
 :header-rows: 1
 
-* - Key
-  - Type
-  - Default
-  - Description
-* - autoscaling
-  - object
-  - {"enabled":false,"maxReplicas":100,"minReplicas":1,"targetCPUUtilizationPercentage":80}
-  - Autoscaling configuration
-* - autoscaling.enabled
-  - bool
-  - false
-  - Enable autoscaling
-* - autoscaling.maxReplicas
-  - int
-  - 100
-  - Maximum replicas
-* - autoscaling.minReplicas
-  - int
-  - 1
-  - Minimum replicas
-* - autoscaling.targetCPUUtilizationPercentage
-  - int
-  - 80
-  - Target CPU utilization for autoscaling
-* - configs
-  - object
-  - {}
-  - Configmap
-* - containerPort
-  - int
-  - 8000
-  - Container port
-* - customObjects
-  - list
-  - []
-  - Custom Objects configuration
-* - deploymentStrategy
-  - object
-  - {}
-  - Deployment strategy configuration
-* - externalConfigs
-  - list
-  - []
-  - External configuration
-* - extraContainers
-  - list
-  - []
-  - Additional containers configuration
-* - extraInit
-  - object
-  - {"pvcStorage":"1Gi","s3modelpath":"relative_s3_model_path/opt-125m", "awsEc2MetadataDisabled": true}
-  - Additional configuration for the init container
-* - extraInit.pvcStorage
-  - string
-  - "50Gi"
-  - Storage size of the s3
-* - extraInit.s3modelpath
-  - string
-  - "relative_s3_model_path/opt-125m"
-  - Path of the model on the s3 which hosts model weights and config files
-* - extraInit.awsEc2MetadataDisabled
-  - boolean
-  - true
-  - Disables the use of the Amazon EC2 instance metadata service
-* - extraPorts
-  - list
-  - []
-  - Additional ports configuration
-* - gpuModels
-  - list
-  - ["TYPE_GPU_USED"]
-  - Type of gpu used
-* - image
-  - object
-  - {"command":["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"],"repository":"vllm/vllm-openai","tag":"latest"}
-  - Image configuration
-* - image.command
-  - list
-  - ["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"]
-  - Container launch command
-* - image.repository
-  - string
-  - "vllm/vllm-openai"
-  - Image repository
-* - image.tag
-  - string
-  - "latest"
-  - Image tag
-* - livenessProbe
-  - object
-  - {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":15,"periodSeconds":10}
-  - Liveness probe configuration
-* - livenessProbe.failureThreshold
-  - int
-  - 3
-  - Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not alive
-* - livenessProbe.httpGet
-  - object
-  - {"path":"/health","port":8000}
-  - Configuration of the Kubelet http request on the server
-* - livenessProbe.httpGet.path
-  - string
-  - "/health"
-  - Path to access on the HTTP server
-* - livenessProbe.httpGet.port
-  - int
-  - 8000
-  - Name or number of the port to access on the container, on which the server is listening
-* - livenessProbe.initialDelaySeconds
-  - int
-  - 15
-  - Number of seconds after the container has started before liveness probe is initiated
-* - livenessProbe.periodSeconds
-  - int
-  - 10
-  - How often (in seconds) to perform the liveness probe
-* - maxUnavailablePodDisruptionBudget
-  - string
-  - ""
-  - Disruption Budget Configuration
-* - readinessProbe
-  - object
-  - {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":5,"periodSeconds":5}
-  - Readiness probe configuration
-* - readinessProbe.failureThreshold
-  - int
-  - 3
-  - Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not ready
-* - readinessProbe.httpGet
-  - object
-  - {"path":"/health","port":8000}
-  - Configuration of the Kubelet http request on the server
-* - readinessProbe.httpGet.path
-  - string
-  - "/health"
-  - Path to access on the HTTP server
-* - readinessProbe.httpGet.port
-  - int
-  - 8000
-  - Name or number of the port to access on the container, on which the server is listening
-* - readinessProbe.initialDelaySeconds
-  - int
-  - 5
-  - Number of seconds after the container has started before readiness probe is initiated
-* - readinessProbe.periodSeconds
-  - int
-  - 5
-  - How often (in seconds) to perform the readiness probe
-* - replicaCount
-  - int
-  - 1
-  - Number of replicas
-* - resources
-  - object
-  - {"limits":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1},"requests":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1}}
-  - Resource configuration
-* - resources.limits."nvidia.com/gpu"
-  - int
-  - 1
-  - Number of gpus used
-* - resources.limits.cpu
-  - int
-  - 4
-  - Number of CPUs
-* - resources.limits.memory
-  - string
-  - "16Gi"
-  - CPU memory configuration
-* - resources.requests."nvidia.com/gpu"
-  - int
-  - 1
-  - Number of gpus used
-* - resources.requests.cpu
-  - int
-  - 4
-  - Number of CPUs
-* - resources.requests.memory
-  - string
-  - "16Gi"
-  - CPU memory configuration
-* - secrets
-  - object
-  - {}
-  - Secrets configuration
-* - serviceName
-  - string
-  -
-  - Service name
-* - servicePort
-  - int
-  - 80
-  - Service port
-* - labels.environment
-  - string
-  - test
-  - Environment name
-* - labels.release
-  - string
-  - test
-  - Release name
-```
+- * Key
+  * Type
+  * Default
+  * Description
+- * autoscaling
+  * object
+  * {"enabled":false,"maxReplicas":100,"minReplicas":1,"targetCPUUtilizationPercentage":80}
+  * Autoscaling configuration
+- * autoscaling.enabled
+  * bool
+  * false
+  * Enable autoscaling
+- * autoscaling.maxReplicas
+  * int
+  * 100
+  * Maximum replicas
+- * autoscaling.minReplicas
+  * int
+  * 1
+  * Minimum replicas
+- * autoscaling.targetCPUUtilizationPercentage
+  * int
+  * 80
+  * Target CPU utilization for autoscaling
+- * configs
+  * object
+  * {}
+  * Configmap
+- * containerPort
+  * int
+  * 8000
+  * Container port
+- * customObjects
+  * list
+  * []
+  * Custom Objects configuration
+- * deploymentStrategy
+  * object
+  * {}
+  * Deployment strategy configuration
+- * externalConfigs
+  * list
+  * []
+  * External configuration
+- * extraContainers
+  * list
+  * []
+  * Additional containers configuration
+- * extraInit
+  * object
+  * {"pvcStorage":"1Gi","s3modelpath":"relative_s3_model_path/opt-125m", "awsEc2MetadataDisabled": true}
+  * Additional configuration for the init container
+- * extraInit.pvcStorage
+  * string
+  * "50Gi"
+  * Storage size of the s3
+- * extraInit.s3modelpath
+  * string
+  * "relative_s3_model_path/opt-125m"
+  * Path of the model on the s3 which hosts model weights and config files
+- * extraInit.awsEc2MetadataDisabled
+  * boolean
+  * true
+  * Disables the use of the Amazon EC2 instance metadata service
+- * extraPorts
+  * list
+  * []
+  * Additional ports configuration
+- * gpuModels
+  * list
+  * ["TYPE_GPU_USED"]
+  * Type of gpu used
+- * image
+  * object
+  * {"command":["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"],"repository":"vllm/vllm-openai","tag":"latest"}
+  * Image configuration
+- * image.command
+  * list
+  * ["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"]
+  * Container launch command
+- * image.repository
+  * string
+  * "vllm/vllm-openai"
+  * Image repository
+- * image.tag
+  * string
+  * "latest"
+  * Image tag
+- * livenessProbe
+  * object
+  * {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":15,"periodSeconds":10}
+  * Liveness probe configuration
+- * livenessProbe.failureThreshold
+  * int
+  * 3
+  * Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not alive
+- * livenessProbe.httpGet
+  * object
+  * {"path":"/health","port":8000}
+  * Configuration of the Kubelet http request on the server
+- * livenessProbe.httpGet.path
+  * string
+  * "/health"
+  * Path to access on the HTTP server
+- * livenessProbe.httpGet.port
+  * int
+  * 8000
+  * Name or number of the port to access on the container, on which the server is listening
+- * livenessProbe.initialDelaySeconds
+  * int
+  * 15
+  * Number of seconds after the container has started before liveness probe is initiated
+- * livenessProbe.periodSeconds
+  * int
+  * 10
+  * How often (in seconds) to perform the liveness probe
+- * maxUnavailablePodDisruptionBudget
+  * string
+  * ""
+  * Disruption Budget Configuration
+- * readinessProbe
+  * object
+  * {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":5,"periodSeconds":5}
+  * Readiness probe configuration
+- * readinessProbe.failureThreshold
+  * int
+  * 3
+  * Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not ready
+- * readinessProbe.httpGet
+  * object
+  * {"path":"/health","port":8000}
+  * Configuration of the Kubelet http request on the server
+- * readinessProbe.httpGet.path
+  * string
+  * "/health"
+  * Path to access on the HTTP server
+- * readinessProbe.httpGet.port
+  * int
+  * 8000
+  * Name or number of the port to access on the container, on which the server is listening
+- * readinessProbe.initialDelaySeconds
+  * int
+  * 5
+  * Number of seconds after the container has started before readiness probe is initiated
+- * readinessProbe.periodSeconds
+  * int
+  * 5
+  * How often (in seconds) to perform the readiness probe
+- * replicaCount
+  * int
+  * 1
+  * Number of replicas
+- * resources
+  * object
+  * {"limits":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1},"requests":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1}}
+  * Resource configuration
+- * resources.limits."nvidia.com/gpu"
+  * int
+  * 1
+  * Number of gpus used
+- * resources.limits.cpu
+  * int
+  * 4
+  * Number of CPUs
+- * resources.limits.memory
+  * string
+  * "16Gi"
+  * CPU memory configuration
+- * resources.requests."nvidia.com/gpu"
+  * int
+  * 1
+  * Number of gpus used
+- * resources.requests.cpu
+  * int
+  * 4
+  * Number of CPUs
+- * resources.requests.memory
+  * string
+  * "16Gi"
+  * CPU memory configuration
+- * secrets
+  * object
+  * {}
+  * Secrets configuration
+- * serviceName
+  * string
+  *
+  * Service name
+- * servicePort
+  * int
+  * 80
+  * Service port
+- * labels.environment
+  * string
+  * test
+  * Environment name
+- * labels.release
+  * string
+  * test
+  * Release name
+:::
diff --git a/docs/source/deployment/frameworks/index.md b/docs/source/deployment/frameworks/index.md
index 964782763f6b3..cb758d3e6d2e4 100644
--- a/docs/source/deployment/frameworks/index.md
+++ b/docs/source/deployment/frameworks/index.md
@@ -1,6 +1,6 @@
 # Using other frameworks
 
-```{toctree}
+:::{toctree}
 :maxdepth: 1
 
 bentoml
@@ -11,4 +11,4 @@ lws
 modal
 skypilot
 triton
-```
+:::
diff --git a/docs/source/deployment/frameworks/skypilot.md b/docs/source/deployment/frameworks/skypilot.md
index 051fc2f2a8d4e..5e101b9001033 100644
--- a/docs/source/deployment/frameworks/skypilot.md
+++ b/docs/source/deployment/frameworks/skypilot.md
@@ -2,11 +2,11 @@
 
 # SkyPilot
 
-```{raw} html
+:::{raw} html
 <p align="center">
   <img src="https://imgur.com/yxtzPEu.png" alt="vLLM"/>
 </p>
-```
+:::
 
 vLLM can be **run and scaled to multiple service replicas on clouds and Kubernetes** with [SkyPilot](https://github.com/skypilot-org/skypilot), an open-source framework for running LLMs on any cloud. More examples for various open models, such as Llama-3, Mixtral, etc, can be found in [SkyPilot AI gallery](https://skypilot.readthedocs.io/en/latest/gallery/index.html).
 
@@ -104,10 +104,10 @@ service:
   max_completion_tokens: 1
 ```
 
-```{raw} html
+:::{raw} html
 <details>
 <summary>Click to see the full recipe YAML</summary>
-```
+:::
 
 ```yaml
 service:
@@ -153,9 +153,9 @@ run: |
     2>&1 | tee api_server.log
 ```
 
-```{raw} html
+:::{raw} html
 </details>
-```
+:::
 
 Start the serving the Llama-3 8B model on multiple replicas:
 
@@ -169,10 +169,10 @@ Wait until the service is ready:
 watch -n10 sky serve status vllm
 ```
 
-```{raw} html
+:::{raw} html
 <details>
 <summary>Example outputs:</summary>
-```
+:::
 
 ```console
 Services
@@ -185,9 +185,9 @@ vllm          1   1        xx.yy.zz.121  18 mins ago  1x GCP([Spot]{'L4': 1})  R
 vllm          2   1        xx.yy.zz.245  18 mins ago  1x GCP([Spot]{'L4': 1})  READY   us-east4
 ```
 
-```{raw} html
+:::{raw} html
 </details>
-```
+:::
 
 After the service is READY, you can find a single endpoint for the service and access the service with the endpoint:
 
@@ -223,10 +223,10 @@ service:
 
 This will scale the service up to when the QPS exceeds 2 for each replica.
 
-```{raw} html
+:::{raw} html
 <details>
 <summary>Click to see the full recipe YAML</summary>
-```
+:::
 
 ```yaml
 service:
@@ -275,9 +275,9 @@ run: |
     2>&1 | tee api_server.log
 ```
 
-```{raw} html
+:::{raw} html
 </details>
-```
+:::
 
 To update the service with the new config:
 
@@ -295,10 +295,10 @@ sky serve down vllm
 
 It is also possible to access the Llama-3 service with a separate GUI frontend, so the user requests send to the GUI will be load-balanced across replicas.
 
-```{raw} html
+:::{raw} html
 <details>
 <summary>Click to see the full GUI YAML</summary>
-```
+:::
 
 ```yaml
 envs:
@@ -328,9 +328,9 @@ run: |
     --stop-token-ids 128009,128001 | tee ~/gradio.log
 ```
 
-```{raw} html
+:::{raw} html
 </details>
-```
+:::
 
 1. Start the chat web UI:
 
diff --git a/docs/source/deployment/integrations/index.md b/docs/source/deployment/integrations/index.md
index d47ede8967547..c286edb4d7bc1 100644
--- a/docs/source/deployment/integrations/index.md
+++ b/docs/source/deployment/integrations/index.md
@@ -1,9 +1,9 @@
 # External Integrations
 
-```{toctree}
+:::{toctree}
 :maxdepth: 1
 
 kserve
 kubeai
 llamastack
-```
+:::
diff --git a/docs/source/deployment/nginx.md b/docs/source/deployment/nginx.md
index a58f791c2997b..87feb48856853 100644
--- a/docs/source/deployment/nginx.md
+++ b/docs/source/deployment/nginx.md
@@ -105,9 +105,9 @@ docker run -itd --ipc host --privileged --network vllm_nginx --gpus all --shm-si
 docker run -itd --ipc host --privileged --network vllm_nginx --gpus all --shm-size=10.24gb -v $hf_cache_dir:/root/.cache/huggingface/ -p 8082:8000 --name vllm1 vllm --model meta-llama/Llama-2-7b-chat-hf
 ```
 
-```{note}
+:::{note}
 If you are behind proxy, you can pass the proxy settings to the docker run command via `-e http_proxy=$http_proxy -e https_proxy=$https_proxy`.
-```
+:::
 
 (nginxloadbalancer-nginx-launch-nginx)=
 
diff --git a/docs/source/design/arch_overview.md b/docs/source/design/arch_overview.md
index cec503ef2f77d..04886e5981eef 100644
--- a/docs/source/design/arch_overview.md
+++ b/docs/source/design/arch_overview.md
@@ -4,19 +4,19 @@
 
 This document provides an overview of the vLLM architecture.
 
-```{contents} Table of Contents
+:::{contents} Table of Contents
 :depth: 2
 :local: true
-```
+:::
 
 ## Entrypoints
 
 vLLM provides a number of entrypoints for interacting with the system. The
 following diagram shows the relationship between them.
 
-```{image} /assets/design/arch_overview/entrypoints.excalidraw.png
+:::{image} /assets/design/arch_overview/entrypoints.excalidraw.png
 :alt: Entrypoints Diagram
-```
+:::
 
 ### LLM Class
 
@@ -84,9 +84,9 @@ More details on the API server can be found in the [OpenAI-Compatible Server](#o
 The `LLMEngine` and `AsyncLLMEngine` classes are central to the functioning of
 the vLLM system, handling model inference and asynchronous request processing.
 
-```{image} /assets/design/arch_overview/llm_engine.excalidraw.png
+:::{image} /assets/design/arch_overview/llm_engine.excalidraw.png
 :alt: LLMEngine Diagram
-```
+:::
 
 ### LLMEngine
 
@@ -144,11 +144,11 @@ configurations affect the class we ultimately get.
 
 The following figure shows the class hierarchy of vLLM:
 
-> ```{figure} /assets/design/hierarchy.png
+> :::{figure} /assets/design/hierarchy.png
 > :align: center
 > :alt: query
 > :width: 100%
-> ```
+> :::
 
 There are several important design choices behind this class hierarchy:
 
@@ -178,7 +178,7 @@ of a vision model and a language model. By making the constructor uniform, we
 can easily create a vision model and a language model and compose them into a
 vision-language model.
 
-````{note}
+:::{note}
 To support this change, all vLLM models' signatures have been updated to:
 
 ```python
@@ -215,7 +215,7 @@ else:
 ```
 
 This way, the model can work with both old and new versions of vLLM.
-````
+:::
 
 3\. **Sharding and Quantization at Initialization**: Certain features require
 changing the model weights. For example, tensor parallelism needs to shard the
diff --git a/docs/source/design/kernel/paged_attention.md b/docs/source/design/kernel/paged_attention.md
index f896f903c78f5..5f2582877260a 100644
--- a/docs/source/design/kernel/paged_attention.md
+++ b/docs/source/design/kernel/paged_attention.md
@@ -139,26 +139,26 @@
   const scalar_t* q_ptr = q + seq_idx * q_stride + head_idx * HEAD_SIZE;
   ```
 
-  ```{figure} ../../assets/kernel/query.png
+  :::{figure} ../../assets/kernel/query.png
   :align: center
   :alt: query
   :width: 70%
 
   Query data of one token at one head
-  ```
+  :::
 
 - Each thread defines its own `q_ptr` which points to the assigned
   query token data on global memory. For example, if `VEC_SIZE` is 4
   and `HEAD_SIZE` is 128, the `q_ptr` points to data that contains
   total of 128 elements divided into 128 / 4 = 32 vecs.
 
-  ```{figure} ../../assets/kernel/q_vecs.png
+  :::{figure} ../../assets/kernel/q_vecs.png
   :align: center
   :alt: q_vecs
   :width: 70%
 
   `q_vecs` for one thread group
-  ```
+  :::
 
   ```cpp
   __shared__ Q_vec q_vecs[THREAD_GROUP_SIZE][NUM_VECS_PER_THREAD];
@@ -195,13 +195,13 @@
   points to key token data based on `k_cache` at assigned block,
   assigned head and assigned token.
 
-  ```{figure} ../../assets/kernel/key.png
+  :::{figure} ../../assets/kernel/key.png
   :align: center
   :alt: key
   :width: 70%
 
   Key data of all context tokens at one head
-  ```
+  :::
 
 - The diagram above illustrates the memory layout for key data. It
   assumes that the `BLOCK_SIZE` is 16, `HEAD_SIZE` is 128, `x` is
@@ -214,13 +214,13 @@
   elements for one token) that will be processed by 2 threads (one
   thread group) separately.
 
-  ```{figure} ../../assets/kernel/k_vecs.png
+  :::{figure} ../../assets/kernel/k_vecs.png
   :align: center
   :alt: k_vecs
   :width: 70%
 
   `k_vecs` for one thread
-  ```
+  :::
 
   ```cpp
   K_vec k_vecs[NUM_VECS_PER_THREAD]
@@ -289,14 +289,14 @@
   should be performed across the entire thread block, encompassing
   results between the query token and all context key tokens.
 
-  ```{math}
+  :::{math}
   :nowrap: true
 
   \begin{gather*}
   m(x):=\max _i \quad x_i \\ \quad f(x):=\left[\begin{array}{lll}e^{x_1-m(x)} & \ldots & e^{x_B-m(x)}\end{array}\right]\\ \quad \ell(x):=\sum_i f(x)_i \\
   \quad \operatorname{softmax}(x):=\frac{f(x)}{\ell(x)}
   \end{gather*}
-  ```
+  :::
 
 ### `qk_max` and `logits`
 
@@ -379,29 +379,29 @@
 
 ## Value
 
-```{figure} ../../assets/kernel/value.png
+:::{figure} ../../assets/kernel/value.png
 :align: center
 :alt: value
 :width: 70%
 
 Value data of all context tokens at one head
-```
+:::
 
-```{figure} ../../assets/kernel/logits_vec.png
+:::{figure} ../../assets/kernel/logits_vec.png
 :align: center
 :alt: logits_vec
 :width: 50%
 
 `logits_vec` for one thread
-```
+:::
 
-```{figure} ../../assets/kernel/v_vec.png
+:::{figure} ../../assets/kernel/v_vec.png
 :align: center
 :alt: v_vec
 :width: 70%
 
 List of `v_vec` for one thread
-```
+:::
 
 - Now we need to retrieve the value data and perform dot multiplication
   with `logits`. Unlike query and key, there is no thread group
diff --git a/docs/source/design/multiprocessing.md b/docs/source/design/multiprocessing.md
index c2cdb75ea08a7..55dae0bb92d4e 100644
--- a/docs/source/design/multiprocessing.md
+++ b/docs/source/design/multiprocessing.md
@@ -7,9 +7,9 @@ page for information on known issues and how to solve them.
 
 ## Introduction
 
-```{important}
+:::{important}
 The source code references are to the state of the code at the time of writing in December, 2024.
-```
+:::
 
 The use of Python multiprocessing in vLLM is complicated by:
 
diff --git a/docs/source/features/automatic_prefix_caching.md b/docs/source/features/automatic_prefix_caching.md
index 3d70cbb29c385..59016d7fcf6b3 100644
--- a/docs/source/features/automatic_prefix_caching.md
+++ b/docs/source/features/automatic_prefix_caching.md
@@ -6,9 +6,9 @@
 
 Automatic Prefix Caching (APC in short) caches the KV cache of existing queries, so that a new query can directly reuse the KV cache if it shares the same prefix with one of the existing queries, allowing the new query to skip the computation of the shared part.
 
-```{note}
+:::{note}
 Technical details on how vLLM implements APC can be found [here](#design-automatic-prefix-caching).
-```
+:::
 
 ## Enabling APC in vLLM
 
diff --git a/docs/source/features/compatibility_matrix.md b/docs/source/features/compatibility_matrix.md
index 47ab616b30686..b0018ebccf5ba 100644
--- a/docs/source/features/compatibility_matrix.md
+++ b/docs/source/features/compatibility_matrix.md
@@ -4,13 +4,13 @@
 
 The tables below show mutually exclusive features and the support on some hardware.
 
-```{note}
+:::{note}
 Check the '✗' with links to see tracking issue for unsupported feature/hardware combination.
-```
+:::
 
 ## Feature x Feature
 
-```{raw} html
+:::{raw} html
 <style>
   /* Make smaller to try to improve readability  */
   td {
@@ -23,448 +23,447 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar
     font-size: 0.8rem;
   }
 </style>
-```
+:::
 
-```{list-table}
-   :header-rows: 1
-   :stub-columns: 1
-   :widths: auto
+:::{list-table}
+:header-rows: 1
+:stub-columns: 1
+:widths: auto
 
-   * - Feature
-     - [CP](#chunked-prefill)
-     - [APC](#automatic-prefix-caching)
-     - [LoRA](#lora-adapter)
-     - <abbr title="Prompt Adapter">prmpt adptr</abbr>
-     - [SD](#spec_decode)
-     - CUDA graph
-     - <abbr title="Pooling Models">pooling</abbr>
-     - <abbr title="Encoder-Decoder Models">enc-dec</abbr>
-     - <abbr title="Logprobs">logP</abbr>
-     - <abbr title="Prompt Logprobs">prmpt logP</abbr>
-     - <abbr title="Async Output Processing">async output</abbr>
-     - multi-step
-     - <abbr title="Multimodal Inputs">mm</abbr>
-     - best-of
-     - beam-search
-     - <abbr title="Guided Decoding">guided dec</abbr>
-   * - [CP](#chunked-prefill)
-     -
-     -
-     -
-     -
-     -
-     -
-     -
-     -
-     -
-     -
-     -
-     -
-     -
-     -
-     -
-     -
-   * - [APC](#automatic-prefix-caching)
-     - ✅
-     -
-     -
-     -
-     -
-     -
-     -
-     -
-     -
-     -
-     -
-     -
-     -
-     -
-     -
-     -
-   * - [LoRA](#lora-adapter)
-     - [✗](gh-pr:9057)
-     - ✅
-     -
-     -
-     -
-     -
-     -
-     -
-     -
-     -
-     -
-     -
-     -
-     -
-     -
-     -
-   * - <abbr title="Prompt Adapter">prmpt adptr</abbr>
-     - ✅
-     - ✅
-     - ✅
-     -
-     -
-     -
-     -
-     -
-     -
-     -
-     -
-     -
-     -
-     -
-     -
-     -
-   * - [SD](#spec_decode)
-     - ✅
-     - ✅
-     - ✗
-     - ✅
-     -
-     -
-     -
-     -
-     -
-     -
-     -
-     -
-     -
-     -
-     -
-     -
-   * - CUDA graph
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     -
-     -
-     -
-     -
-     -
-     -
-     -
-     -
-     -
-     -
-     -
-   * - <abbr title="Pooling Models">pooling</abbr>
-     - ✗
-     - ✗
-     - ✗
-     - ✗
-     - ✗
-     - ✗
-     -
-     -
-     -
-     -
-     -
-     -
-     -
-     -
-     -
-     -
-   * - <abbr title="Encoder-Decoder Models">enc-dec</abbr>
-     - ✗
-     - [✗](gh-issue:7366)
-     - ✗
-     - ✗
-     - [✗](gh-issue:7366)
-     - ✅
-     - ✅
-     -
-     -
-     -
-     -
-     -
-     -
-     -
-     -
-     -
-   * - <abbr title="Logprobs">logP</abbr>
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✗
-     - ✅
-     -
-     -
-     -
-     -
-     -
-     -
-     -
-     -
-   * - <abbr title="Prompt Logprobs">prmpt logP</abbr>
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - [✗](gh-pr:8199)
-     - ✅
-     - ✗
-     - ✅
-     - ✅
-     -
-     -
-     -
-     -
-     -
-     -
-     -
-   * - <abbr title="Async Output Processing">async output</abbr>
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✗
-     - ✅
-     - ✗
-     - ✗
-     - ✅
-     - ✅
-     -
-     -
-     -
-     -
-     -
-     -
-   * - multi-step
-     - ✗
-     - ✅
-     - ✗
-     - ✅
-     - ✗
-     - ✅
-     - ✗
-     - ✗
-     - ✅
-     - [✗](gh-issue:8198)
-     - ✅
-     -
-     -
-     -
-     -
-     -
-   * - <abbr title="Multimodal Inputs">mm</abbr>
-     - ✅
-     -  [✗](gh-pr:8348)
-     -  [✗](gh-pr:7199)
-     - ?
-     - ?
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ?
-     -
-     -
-     -
-     -
-   * - best-of
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - [✗](gh-issue:6137)
-     - ✅
-     - ✗
-     - ✅
-     - ✅
-     - ✅
-     - ?
-     - [✗](gh-issue:7968)
-     - ✅
-     -
-     -
-     -
-   * - beam-search
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - [✗](gh-issue:6137)
-     - ✅
-     - ✗
-     - ✅
-     - ✅
-     - ✅
-     - ?
-     - [✗](gh-issue:7968>)
-     - ?
-     - ✅
-     -
-     -
-   * - <abbr title="Guided Decoding">guided dec</abbr>
-     - ✅
-     - ✅
-     - ?
-     - ?
-     - [✗](gh-issue:11484)
-     - ✅
-     - ✗
-     - ?
-     - ✅
-     - ✅
-     - ✅
-     - [✗](gh-issue:9893)
-     - ?
-     - ✅
-     - ✅
-     -
-
-```
+- * Feature
+  * [CP](#chunked-prefill)
+  * [APC](#automatic-prefix-caching)
+  * [LoRA](#lora-adapter)
+  * <abbr title="Prompt Adapter">prmpt adptr</abbr>
+  * [SD](#spec_decode)
+  * CUDA graph
+  * <abbr title="Pooling Models">pooling</abbr>
+  * <abbr title="Encoder-Decoder Models">enc-dec</abbr>
+  * <abbr title="Logprobs">logP</abbr>
+  * <abbr title="Prompt Logprobs">prmpt logP</abbr>
+  * <abbr title="Async Output Processing">async output</abbr>
+  * multi-step
+  * <abbr title="Multimodal Inputs">mm</abbr>
+  * best-of
+  * beam-search
+  * <abbr title="Guided Decoding">guided dec</abbr>
+- * [CP](#chunked-prefill)
+  *
+  *
+  *
+  *
+  *
+  *
+  *
+  *
+  *
+  *
+  *
+  *
+  *
+  *
+  *
+  *
+- * [APC](#automatic-prefix-caching)
+  * ✅
+  *
+  *
+  *
+  *
+  *
+  *
+  *
+  *
+  *
+  *
+  *
+  *
+  *
+  *
+  *
+- * [LoRA](#lora-adapter)
+  * [✗](gh-pr:9057)
+  * ✅
+  *
+  *
+  *
+  *
+  *
+  *
+  *
+  *
+  *
+  *
+  *
+  *
+  *
+  *
+- * <abbr title="Prompt Adapter">prmpt adptr</abbr>
+  * ✅
+  * ✅
+  * ✅
+  *
+  *
+  *
+  *
+  *
+  *
+  *
+  *
+  *
+  *
+  *
+  *
+  *
+- * [SD](#spec_decode)
+  * ✅
+  * ✅
+  * ✗
+  * ✅
+  *
+  *
+  *
+  *
+  *
+  *
+  *
+  *
+  *
+  *
+  *
+  *
+- * CUDA graph
+  * ✅
+  * ✅
+  * ✅
+  * ✅
+  * ✅
+  *
+  *
+  *
+  *
+  *
+  *
+  *
+  *
+  *
+  *
+  *
+- * <abbr title="Pooling Models">pooling</abbr>
+  * ✗
+  * ✗
+  * ✗
+  * ✗
+  * ✗
+  * ✗
+  *
+  *
+  *
+  *
+  *
+  *
+  *
+  *
+  *
+  *
+- * <abbr title="Encoder-Decoder Models">enc-dec</abbr>
+  * ✗
+  * [✗](gh-issue:7366)
+  * ✗
+  * ✗
+  * [✗](gh-issue:7366)
+  * ✅
+  * ✅
+  *
+  *
+  *
+  *
+  *
+  *
+  *
+  *
+  *
+- * <abbr title="Logprobs">logP</abbr>
+  * ✅
+  * ✅
+  * ✅
+  * ✅
+  * ✅
+  * ✅
+  * ✗
+  * ✅
+  *
+  *
+  *
+  *
+  *
+  *
+  *
+  *
+- * <abbr title="Prompt Logprobs">prmpt logP</abbr>
+  * ✅
+  * ✅
+  * ✅
+  * ✅
+  * [✗](gh-pr:8199)
+  * ✅
+  * ✗
+  * ✅
+  * ✅
+  *
+  *
+  *
+  *
+  *
+  *
+  *
+- * <abbr title="Async Output Processing">async output</abbr>
+  * ✅
+  * ✅
+  * ✅
+  * ✅
+  * ✗
+  * ✅
+  * ✗
+  * ✗
+  * ✅
+  * ✅
+  *
+  *
+  *
+  *
+  *
+  *
+- * multi-step
+  * ✗
+  * ✅
+  * ✗
+  * ✅
+  * ✗
+  * ✅
+  * ✗
+  * ✗
+  * ✅
+  * [✗](gh-issue:8198)
+  * ✅
+  *
+  *
+  *
+  *
+  *
+- * <abbr title="Multimodal Inputs">mm</abbr>
+  * ✅
+  * [✗](gh-pr:8348)
+  * [✗](gh-pr:7199)
+  * ?
+  * ?
+  * ✅
+  * ✅
+  * ✅
+  * ✅
+  * ✅
+  * ✅
+  * ?
+  *
+  *
+  *
+  *
+- * best-of
+  * ✅
+  * ✅
+  * ✅
+  * ✅
+  * [✗](gh-issue:6137)
+  * ✅
+  * ✗
+  * ✅
+  * ✅
+  * ✅
+  * ?
+  * [✗](gh-issue:7968)
+  * ✅
+  *
+  *
+  *
+- * beam-search
+  * ✅
+  * ✅
+  * ✅
+  * ✅
+  * [✗](gh-issue:6137)
+  * ✅
+  * ✗
+  * ✅
+  * ✅
+  * ✅
+  * ?
+  * [✗](gh-issue:7968>)
+  * ?
+  * ✅
+  *
+  *
+- * <abbr title="Guided Decoding">guided dec</abbr>
+  * ✅
+  * ✅
+  * ?
+  * ?
+  * [✗](gh-issue:11484)
+  * ✅
+  * ✗
+  * ?
+  * ✅
+  * ✅
+  * ✅
+  * [✗](gh-issue:9893)
+  * ?
+  * ✅
+  * ✅
+  *
+:::
 
 (feature-x-hardware)=
 
 ## Feature x Hardware
 
-```{list-table}
-   :header-rows: 1
-   :stub-columns: 1
-   :widths: auto
+:::{list-table}
+:header-rows: 1
+:stub-columns: 1
+:widths: auto
 
-   * - Feature
-     - Volta
-     - Turing
-     - Ampere
-     - Ada
-     - Hopper
-     - CPU
-     - AMD
-   * - [CP](#chunked-prefill)
-     - [✗](gh-issue:2729)
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-   * - [APC](#automatic-prefix-caching)
-     - [✗](gh-issue:3687)
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-   * - [LoRA](#lora-adapter)
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-   * - <abbr title="Prompt Adapter">prmpt adptr</abbr>
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - [✗](gh-issue:8475)
-     - ✅
-   * - [SD](#spec_decode)
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-   * - CUDA graph
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✗
-     - ✅
-   * - <abbr title="Pooling Models">pooling</abbr>
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ?
-   * - <abbr title="Encoder-Decoder Models">enc-dec</abbr>
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✗
-   * - <abbr title="Multimodal Inputs">mm</abbr>
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-   * - <abbr title="Logprobs">logP</abbr>
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-   * - <abbr title="Prompt Logprobs">prmpt logP</abbr>
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-   * - <abbr title="Async Output Processing">async output</abbr>
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✗
-     - ✗
-   * - multi-step
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - [✗](gh-issue:8477)
-     - ✅
-   * - best-of
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-   * - beam-search
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-   * - <abbr title="Guided Decoding">guided dec</abbr>
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-     - ✅
-```
+- * Feature
+  * Volta
+  * Turing
+  * Ampere
+  * Ada
+  * Hopper
+  * CPU
+  * AMD
+- * [CP](#chunked-prefill)
+  * [✗](gh-issue:2729)
+  * ✅
+  * ✅
+  * ✅
+  * ✅
+  * ✅
+  * ✅
+- * [APC](#automatic-prefix-caching)
+  * [✗](gh-issue:3687)
+  * ✅
+  * ✅
+  * ✅
+  * ✅
+  * ✅
+  * ✅
+- * [LoRA](#lora-adapter)
+  * ✅
+  * ✅
+  * ✅
+  * ✅
+  * ✅
+  * ✅
+  * ✅
+- * <abbr title="Prompt Adapter">prmpt adptr</abbr>
+  * ✅
+  * ✅
+  * ✅
+  * ✅
+  * ✅
+  * [✗](gh-issue:8475)
+  * ✅
+- * [SD](#spec_decode)
+  * ✅
+  * ✅
+  * ✅
+  * ✅
+  * ✅
+  * ✅
+  * ✅
+- * CUDA graph
+  * ✅
+  * ✅
+  * ✅
+  * ✅
+  * ✅
+  * ✗
+  * ✅
+- * <abbr title="Pooling Models">pooling</abbr>
+  * ✅
+  * ✅
+  * ✅
+  * ✅
+  * ✅
+  * ✅
+  * ?
+- * <abbr title="Encoder-Decoder Models">enc-dec</abbr>
+  * ✅
+  * ✅
+  * ✅
+  * ✅
+  * ✅
+  * ✅
+  * ✗
+- * <abbr title="Multimodal Inputs">mm</abbr>
+  * ✅
+  * ✅
+  * ✅
+  * ✅
+  * ✅
+  * ✅
+  * ✅
+- * <abbr title="Logprobs">logP</abbr>
+  * ✅
+  * ✅
+  * ✅
+  * ✅
+  * ✅
+  * ✅
+  * ✅
+- * <abbr title="Prompt Logprobs">prmpt logP</abbr>
+  * ✅
+  * ✅
+  * ✅
+  * ✅
+  * ✅
+  * ✅
+  * ✅
+- * <abbr title="Async Output Processing">async output</abbr>
+  * ✅
+  * ✅
+  * ✅
+  * ✅
+  * ✅
+  * ✗
+  * ✗
+- * multi-step
+  * ✅
+  * ✅
+  * ✅
+  * ✅
+  * ✅
+  * [✗](gh-issue:8477)
+  * ✅
+- * best-of
+  * ✅
+  * ✅
+  * ✅
+  * ✅
+  * ✅
+  * ✅
+  * ✅
+- * beam-search
+  * ✅
+  * ✅
+  * ✅
+  * ✅
+  * ✅
+  * ✅
+  * ✅
+- * <abbr title="Guided Decoding">guided dec</abbr>
+  * ✅
+  * ✅
+  * ✅
+  * ✅
+  * ✅
+  * ✅
+  * ✅
+:::
diff --git a/docs/source/features/disagg_prefill.md b/docs/source/features/disagg_prefill.md
index efa2efc66192e..52d253b9c2b18 100644
--- a/docs/source/features/disagg_prefill.md
+++ b/docs/source/features/disagg_prefill.md
@@ -4,9 +4,9 @@
 
 This page introduces you the disaggregated prefilling feature in vLLM.
 
-```{note}
+:::{note}
 This feature is experimental and subject to change.
-```
+:::
 
 ## Why disaggregated prefilling?
 
@@ -15,9 +15,9 @@ Two main reasons:
 - **Tuning time-to-first-token (TTFT) and inter-token-latency (ITL) separately**. Disaggregated prefilling put prefill and decode phase of LLM inference inside different vLLM instances. This gives you the flexibility to assign different parallel strategies (e.g. `tp` and `pp`) to tune TTFT without affecting ITL, or to tune ITL without affecting TTFT.
 - **Controlling tail ITL**. Without disaggregated prefilling, vLLM may insert some prefill jobs during the decoding of one request. This results in higher tail latency. Disaggregated prefilling helps you solve this issue and control tail ITL. Chunked prefill with a proper chunk size also can achieve the same goal, but in practice it's hard to figure out the correct chunk size value. So disaggregated prefilling is a much more reliable way to control tail ITL.
 
-```{note}
+:::{note}
 Disaggregated prefill DOES NOT improve throughput.
-```
+:::
 
 ## Usage example
 
@@ -39,21 +39,21 @@ Key abstractions for disaggregated prefilling:
 - **LookupBuffer**: LookupBuffer provides two API: `insert` KV cache and `drop_select` KV cache. The semantics of `insert` and `drop_select` are similar to SQL, where `insert` inserts a KV cache into the buffer, and `drop_select` returns the KV cache that matches the given condition and drop it from the buffer.
 - **Pipe**: A single-direction FIFO pipe for tensor transmission. It supports `send_tensor` and `recv_tensor`.
 
-```{note}
+:::{note}
 `insert` is non-blocking operation but `drop_select` is blocking operation.
-```
+:::
 
 Here is a figure illustrating how the above 3 abstractions are organized:
 
-```{image} /assets/features/disagg_prefill/abstraction.jpg
+:::{image} /assets/features/disagg_prefill/abstraction.jpg
 :alt: Disaggregated prefilling abstractions
-```
+:::
 
 The workflow of disaggregated prefilling is as follows:
 
-```{image} /assets/features/disagg_prefill/overview.jpg
+:::{image} /assets/features/disagg_prefill/overview.jpg
 :alt: Disaggregated prefilling workflow
-```
+:::
 
 The `buffer` corresponds to `insert` API in LookupBuffer, and the `drop_select` corresponds to `drop_select` API in LookupBuffer.
 
diff --git a/docs/source/features/lora.md b/docs/source/features/lora.md
index b00d05147bb32..fb5a7a0d519cb 100644
--- a/docs/source/features/lora.md
+++ b/docs/source/features/lora.md
@@ -60,9 +60,9 @@ vllm serve meta-llama/Llama-2-7b-hf \
     --lora-modules sql-lora=$HOME/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/
 ```
 
-```{note}
+:::{note}
 The commit ID `0dfa347e8877a4d4ed19ee56c140fa518470028c` may change over time. Please check the latest commit ID in your environment to ensure you are using the correct one.
-```
+:::
 
 The server entrypoint accepts all other LoRA configuration parameters (`max_loras`, `max_lora_rank`, `max_cpu_loras`,
 etc.), which will apply to all forthcoming requests. Upon querying the `/models` endpoint, we should see our LoRA along
diff --git a/docs/source/features/quantization/auto_awq.md b/docs/source/features/quantization/auto_awq.md
index 404505eb3890e..30735b1161ff3 100644
--- a/docs/source/features/quantization/auto_awq.md
+++ b/docs/source/features/quantization/auto_awq.md
@@ -2,11 +2,11 @@
 
 # AutoAWQ
 
-```{warning}
+:::{warning}
 Please note that AWQ support in vLLM is under-optimized at the moment. We would recommend using the unquantized version of the model for better
 accuracy and higher throughput. Currently, you can use AWQ as a way to reduce memory footprint. As of now, it is more suitable for low latency
 inference with small number of concurrent requests. vLLM's AWQ implementation have lower throughput than unquantized version.
-```
+:::
 
 To create a new 4-bit quantized model, you can leverage [AutoAWQ](https://github.com/casper-hansen/AutoAWQ).
 Quantizing reduces the model's precision from FP16 to INT4 which effectively reduces the file size by ~70%.
diff --git a/docs/source/features/quantization/fp8.md b/docs/source/features/quantization/fp8.md
index 1398e8a324201..a62e0124b7706 100644
--- a/docs/source/features/quantization/fp8.md
+++ b/docs/source/features/quantization/fp8.md
@@ -14,10 +14,10 @@ The FP8 types typically supported in hardware have two distinct representations,
 - **E4M3**: Consists of 1 sign bit, 4 exponent bits, and 3 bits of mantissa. It can store values up to +/-448 and `nan`.
 - **E5M2**: Consists of 1 sign bit, 5 exponent bits, and 2 bits of mantissa. It can store values up to +/-57344, +/- `inf`, and `nan`. The tradeoff for the increased dynamic range is lower precision of the stored values.
 
-```{note}
+:::{note}
 FP8 computation is supported on NVIDIA GPUs with compute capability > 8.9 (Ada Lovelace, Hopper).
 FP8 models will run on compute capability > 8.0 (Ampere) as weight-only W8A16, utilizing FP8 Marlin.
-```
+:::
 
 ## Quick Start with Online Dynamic Quantization
 
@@ -32,9 +32,9 @@ model = LLM("facebook/opt-125m", quantization="fp8")
 result = model.generate("Hello, my name is")
 ```
 
-```{warning}
+:::{warning}
 Currently, we load the model at original precision before quantizing down to 8-bits, so you need enough memory to load the whole model.
-```
+:::
 
 ## Installation
 
@@ -110,9 +110,9 @@ model.generate("Hello my name is")
 
 Evaluate accuracy with `lm_eval` (for example on 250 samples of `gsm8k`):
 
-```{note}
+:::{note}
 Quantized models can be sensitive to the presence of the `bos` token. `lm_eval` does not add a `bos` token by default, so make sure to include the `add_bos_token=True` argument when running your evaluations.
-```
+:::
 
 ```console
 $ MODEL=$PWD/Meta-Llama-3-8B-Instruct-FP8-Dynamic
@@ -137,10 +137,10 @@ If you encounter any issues or have feature requests, please open an issue on th
 
 ## Deprecated Flow
 
-```{note}
+:::{note}
 The following information is preserved for reference and search purposes.
 The quantization method described below is deprecated in favor of the `llmcompressor` method described above.
-```
+:::
 
 For static per-tensor offline quantization to FP8, please install the [AutoFP8 library](https://github.com/neuralmagic/autofp8).
 
diff --git a/docs/source/features/quantization/gguf.md b/docs/source/features/quantization/gguf.md
index 640997cf4bc39..65c181900f9be 100644
--- a/docs/source/features/quantization/gguf.md
+++ b/docs/source/features/quantization/gguf.md
@@ -2,13 +2,13 @@
 
 # GGUF
 
-```{warning}
+:::{warning}
 Please note that GGUF support in vLLM is highly experimental and under-optimized at the moment, it might be incompatible with other features. Currently, you can use GGUF as a way to reduce memory footprint. If you encounter any issues, please report them to the vLLM team.
-```
+:::
 
-```{warning}
+:::{warning}
 Currently, vllm only supports loading single-file GGUF models. If you have a multi-files GGUF model, you can use [gguf-split](https://github.com/ggerganov/llama.cpp/pull/6135) tool to merge them to a single-file model.
-```
+:::
 
 To run a GGUF model with vLLM, you can download and use the local GGUF model from [TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF](https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF) with the following command:
 
@@ -25,9 +25,9 @@ You can also add `--tensor-parallel-size 2` to enable tensor parallelism inferen
 vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 --tensor-parallel-size 2
 ```
 
-```{warning}
+:::{warning}
 We recommend using the tokenizer from base model instead of GGUF model. Because the tokenizer conversion from GGUF is time-consuming and unstable, especially for some models with large vocab size.
-```
+:::
 
 You can also use the GGUF model directly through the LLM entrypoint:
 
diff --git a/docs/source/features/quantization/index.md b/docs/source/features/quantization/index.md
index 56ccdb5f00c34..d972dc85fc23c 100644
--- a/docs/source/features/quantization/index.md
+++ b/docs/source/features/quantization/index.md
@@ -4,7 +4,7 @@
 
 Quantization trades off model precision for smaller memory footprint, allowing large models to be run on a wider range of devices.
 
-```{toctree}
+:::{toctree}
 :caption: Contents
 :maxdepth: 1
 
@@ -15,4 +15,4 @@ gguf
 int8
 fp8
 quantized_kvcache
-```
+:::
diff --git a/docs/source/features/quantization/int8.md b/docs/source/features/quantization/int8.md
index 592a60d3988b2..fedb16f4350e5 100644
--- a/docs/source/features/quantization/int8.md
+++ b/docs/source/features/quantization/int8.md
@@ -7,9 +7,9 @@ This quantization method is particularly useful for reducing model size while ma
 
 Please visit the HF collection of [quantized INT8 checkpoints of popular LLMs ready to use with vLLM](https://huggingface.co/collections/neuralmagic/int8-llms-for-vllm-668ec32c049dca0369816415).
 
-```{note}
+:::{note}
 INT8 computation is supported on NVIDIA GPUs with compute capability > 7.5 (Turing, Ampere, Ada Lovelace, Hopper).
-```
+:::
 
 ## Prerequisites
 
@@ -119,9 +119,9 @@ $ lm_eval --model vllm \
   --batch_size 'auto'
 ```
 
-```{note}
+:::{note}
 Quantized models can be sensitive to the presence of the `bos` token. Make sure to include the `add_bos_token=True` argument when running evaluations.
-```
+:::
 
 ## Best Practices
 
diff --git a/docs/source/features/quantization/supported_hardware.md b/docs/source/features/quantization/supported_hardware.md
index f5c0a95ea426e..555ed4ce4c8db 100644
--- a/docs/source/features/quantization/supported_hardware.md
+++ b/docs/source/features/quantization/supported_hardware.md
@@ -4,128 +4,129 @@
 
 The table below shows the compatibility of various quantization implementations with different hardware platforms in vLLM:
 
-```{list-table}
+:::{list-table}
 :header-rows: 1
 :widths: 20 8 8 8 8 8 8 8 8 8 8
 
-* - Implementation
-  - Volta
-  - Turing
-  - Ampere
-  - Ada
-  - Hopper
-  - AMD GPU
-  - Intel GPU
-  - x86 CPU
-  - AWS Inferentia
-  - Google TPU
-* - AWQ
-  - ✗
-  - ✅︎
-  - ✅︎
-  - ✅︎
-  - ✅︎
-  - ✗
-  - ✅︎
-  - ✅︎
-  - ✗
-  - ✗
-* - GPTQ
-  - ✅︎
-  - ✅︎
-  - ✅︎
-  - ✅︎
-  - ✅︎
-  - ✗
-  - ✅︎
-  - ✅︎
-  - ✗
-  - ✗
-* - Marlin (GPTQ/AWQ/FP8)
-  - ✗
-  - ✗
-  - ✅︎
-  - ✅︎
-  - ✅︎
-  - ✗
-  - ✗
-  - ✗
-  - ✗
-  - ✗
-* - INT8 (W8A8)
-  - ✗
-  - ✅︎
-  - ✅︎
-  - ✅︎
-  - ✅︎
-  - ✗
-  - ✗
-  - ✅︎
-  - ✗
-  - ✗
-* - FP8 (W8A8)
-  - ✗
-  - ✗
-  - ✗
-  - ✅︎
-  - ✅︎
-  - ✅︎
-  - ✗
-  - ✗
-  - ✗
-  - ✗
-* - AQLM
-  - ✅︎
-  - ✅︎
-  - ✅︎
-  - ✅︎
-  - ✅︎
-  - ✗
-  - ✗
-  - ✗
-  - ✗
-  - ✗
-* - bitsandbytes
-  - ✅︎
-  - ✅︎
-  - ✅︎
-  - ✅︎
-  - ✅︎
-  - ✗
-  - ✗
-  - ✗
-  - ✗
-  - ✗
-* - DeepSpeedFP
-  - ✅︎
-  - ✅︎
-  - ✅︎
-  - ✅︎
-  - ✅︎
-  - ✗
-  - ✗
-  - ✗
-  - ✗
-  - ✗
-* - GGUF
-  - ✅︎
-  - ✅︎
-  - ✅︎
-  - ✅︎
-  - ✅︎
-  - ✅︎
-  - ✗
-  - ✗
-  - ✗
-  - ✗
-```
+- * Implementation
+  * Volta
+  * Turing
+  * Ampere
+  * Ada
+  * Hopper
+  * AMD GPU
+  * Intel GPU
+  * x86 CPU
+  * AWS Inferentia
+  * Google TPU
+- * AWQ
+  * ✗
+  * ✅︎
+  * ✅︎
+  * ✅︎
+  * ✅︎
+  * ✗
+  * ✅︎
+  * ✅︎
+  * ✗
+  * ✗
+- * GPTQ
+  * ✅︎
+  * ✅︎
+  * ✅︎
+  * ✅︎
+  * ✅︎
+  * ✗
+  * ✅︎
+  * ✅︎
+  * ✗
+  * ✗
+- * Marlin (GPTQ/AWQ/FP8)
+  * ✗
+  * ✗
+  * ✅︎
+  * ✅︎
+  * ✅︎
+  * ✗
+  * ✗
+  * ✗
+  * ✗
+  * ✗
+- * INT8 (W8A8)
+  * ✗
+  * ✅︎
+  * ✅︎
+  * ✅︎
+  * ✅︎
+  * ✗
+  * ✗
+  * ✅︎
+  * ✗
+  * ✗
+- * FP8 (W8A8)
+  * ✗
+  * ✗
+  * ✗
+  * ✅︎
+  * ✅︎
+  * ✅︎
+  * ✗
+  * ✗
+  * ✗
+  * ✗
+- * AQLM
+  * ✅︎
+  * ✅︎
+  * ✅︎
+  * ✅︎
+  * ✅︎
+  * ✗
+  * ✗
+  * ✗
+  * ✗
+  * ✗
+- * bitsandbytes
+  * ✅︎
+  * ✅︎
+  * ✅︎
+  * ✅︎
+  * ✅︎
+  * ✗
+  * ✗
+  * ✗
+  * ✗
+  * ✗
+- * DeepSpeedFP
+  * ✅︎
+  * ✅︎
+  * ✅︎
+  * ✅︎
+  * ✅︎
+  * ✗
+  * ✗
+  * ✗
+  * ✗
+  * ✗
+- * GGUF
+  * ✅︎
+  * ✅︎
+  * ✅︎
+  * ✅︎
+  * ✅︎
+  * ✅︎
+  * ✗
+  * ✗
+  * ✗
+  * ✗
+
+:::
 
 - Volta refers to SM 7.0, Turing to SM 7.5, Ampere to SM 8.0/8.6, Ada to SM 8.9, and Hopper to SM 9.0.
 - "✅︎" indicates that the quantization method is supported on the specified hardware.
 - "✗" indicates that the quantization method is not supported on the specified hardware.
 
-```{note}
+:::{note}
 This compatibility chart is subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods.
 
 For the most up-to-date information on hardware support and quantization methods, please refer to <gh-dir:vllm/model_executor/layers/quantization> or consult with the vLLM development team.
-```
+:::
diff --git a/docs/source/features/spec_decode.md b/docs/source/features/spec_decode.md
index ab7b2f302bd13..da87127057dc5 100644
--- a/docs/source/features/spec_decode.md
+++ b/docs/source/features/spec_decode.md
@@ -2,15 +2,15 @@
 
 # Speculative Decoding
 
-```{warning}
+:::{warning}
 Please note that speculative decoding in vLLM is not yet optimized and does
 not usually yield inter-token latency reductions for all prompt datasets or sampling parameters.
 The work to optimize it is ongoing and can be followed here: <gh-issue:4630>
-```
+:::
 
-```{warning}
+:::{warning}
 Currently, speculative decoding in vLLM is not compatible with pipeline parallelism.
-```
+:::
 
 This document shows how to use [Speculative Decoding](https://x.com/karpathy/status/1697318534555336961) with vLLM.
 Speculative decoding is a technique which improves inter-token latency in memory-bound LLM inference.
diff --git a/docs/source/features/structured_outputs.md b/docs/source/features/structured_outputs.md
index 1d77c7339a33f..90c880e8cfa46 100644
--- a/docs/source/features/structured_outputs.md
+++ b/docs/source/features/structured_outputs.md
@@ -95,10 +95,10 @@ completion = client.chat.completions.create(
 print(completion.choices[0].message.content)
 ```
 
-```{tip}
+:::{tip}
 While not strictly necessary, normally it´s better to indicate in the prompt that a JSON needs to be generated and which fields and how should the LLM fill them.
 This can improve the results notably in most cases.
-```
+:::
 
 Finally we have the `guided_grammar`, which probably is the most difficult one to use but it´s really powerful, as it allows us to define complete languages like SQL queries.
 It works by using a context free EBNF grammar, which for example we can use to define a specific format of simplified SQL queries, like in the example below:
diff --git a/docs/source/generate_examples.py b/docs/source/generate_examples.py
index aaa13d0fb6d3f..ac592e22328da 100644
--- a/docs/source/generate_examples.py
+++ b/docs/source/generate_examples.py
@@ -57,9 +57,9 @@ class Index:
 
     def generate(self) -> str:
         content = f"# {self.title}\n\n{self.description}\n\n"
-        content += "```{toctree}\n"
+        content += ":::{toctree}\n"
         content += f":caption: {self.caption}\n:maxdepth: {self.maxdepth}\n"
-        content += "\n".join(self.documents) + "\n```\n"
+        content += "\n".join(self.documents) + "\n:::\n"
         return content
 
 
diff --git a/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md b/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md
index ae42dd0c0d08f..704a16233981f 100644
--- a/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md
+++ b/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md
@@ -86,9 +86,9 @@ docker build -f Dockerfile.hpu -t vllm-hpu-env  .
 docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --rm vllm-hpu-env
 ```
 
-```{tip}
+:::{tip}
 If you're observing the following error: `docker: Error response from daemon: Unknown runtime specified habana.`, please refer to "Install Using Containers" section of [Intel Gaudi Software Stack and Driver Installation](https://docs.habana.ai/en/v1.18.0/Installation_Guide/Bare_Metal_Fresh_OS.html). Make sure you have `habana-container-runtime` package installed and that `habana` container runtime is registered.
-```
+:::
 
 ## Extra information
 
@@ -155,30 +155,30 @@ Gaudi2 devices. Configurations that are not listed may or may not work.
 
 Currently in vLLM for HPU we support four execution modes, depending on selected HPU PyTorch Bridge backend (via `PT_HPU_LAZY_MODE` environment variable), and `--enforce-eager` flag.
 
-```{list-table} vLLM execution modes
+:::{list-table} vLLM execution modes
 :widths: 25 25 50
 :header-rows: 1
 
-* - `PT_HPU_LAZY_MODE`
-  - `enforce_eager`
-  - execution mode
-* - 0
-  - 0
-  - torch.compile
-* - 0
-  - 1
-  - PyTorch eager mode
-* - 1
-  - 0
-  - HPU Graphs
-* - 1
-  - 1
-  - PyTorch lazy mode
-```
-
-```{warning}
+- * `PT_HPU_LAZY_MODE`
+  * `enforce_eager`
+  * execution mode
+- * 0
+  * 0
+  * torch.compile
+- * 0
+  * 1
+  * PyTorch eager mode
+- * 1
+  * 0
+  * HPU Graphs
+- * 1
+  * 1
+  * PyTorch lazy mode
+:::
+
+:::{warning}
 In 1.18.0, all modes utilizing `PT_HPU_LAZY_MODE=0` are highly experimental and should be only used for validating functional correctness. Their performance will be improved in the next releases. For obtaining the best performance in 1.18.0, please use HPU Graphs, or PyTorch lazy mode.
-```
+:::
 
 (gaudi-bucketing-mechanism)=
 
@@ -187,9 +187,9 @@ In 1.18.0, all modes utilizing `PT_HPU_LAZY_MODE=0` are highly experimental and
 Intel Gaudi accelerators work best when operating on models with fixed tensor shapes. [Intel Gaudi Graph Compiler](https://docs.habana.ai/en/latest/Gaudi_Overview/Intel_Gaudi_Software_Suite.html#graph-compiler-and-runtime) is responsible for generating optimized binary code that implements the given model topology on Gaudi. In its default configuration, the produced binary code may be heavily dependent on input and output tensor shapes, and can require graph recompilation when encountering differently shaped tensors within the same topology. While the resulting binaries utilize Gaudi efficiently, the compilation itself may introduce a noticeable overhead in end-to-end execution.
 In a dynamic inference serving scenario, there is a need to minimize the number of graph compilations and reduce the risk of graph compilation occurring during server runtime. Currently it is achieved by "bucketing" model's forward pass across two dimensions - `batch_size` and `sequence_length`.
 
-```{note}
+:::{note}
 Bucketing allows us to reduce the number of required graphs significantly, but it does not handle any graph compilation and device code generation - this is done in warmup and HPUGraph capture phase.
-```
+:::
 
 Bucketing ranges are determined with 3 parameters - `min`, `step` and `max`. They can be set separately for prompt and decode phase, and for batch size and sequence length dimension. These parameters can be observed in logs during vLLM startup:
 
@@ -222,15 +222,15 @@ min = 128, step = 128, max = 512
 
 In the logged scenario, 24 buckets were generated for prompt (prefill) runs, and 48 buckets for decode runs. Each bucket corresponds to a separate optimized device binary for a given model with specified tensor shapes. Whenever a batch of requests is processed, it is padded across batch and sequence length dimension to the smallest possible bucket.
 
-```{warning}
+:::{warning}
 If a request exceeds maximum bucket size in any dimension, it will be processed without padding, and its processing may require a graph compilation, potentially significantly increasing end-to-end latency. The boundaries of the buckets are user-configurable via environment variables, and upper bucket boundaries can be increased to avoid such scenario.
-```
+:::
 
 As an example, if a request of 3 sequences, with max sequence length of 412 comes in to an idle vLLM server, it will be padded executed as `(4, 512)` prefill bucket, as `batch_size` (number of sequences) will be padded to 4 (closest batch_size dimension higher than 3), and max sequence length will be padded to 512 (closest sequence length dimension higher than 412). After prefill stage, it will be executed as `(4, 512)` decode bucket and will continue as that bucket until either batch dimension changes (due to request being finished) - in which case it will become a `(2, 512)` bucket, or context length increases above 512 tokens, in which case it will become `(4, 640)` bucket.
 
-```{note}
+:::{note}
 Bucketing is transparent to a client -- padding in sequence length dimension is never returned to the client, and padding in batch dimension does not create new requests.
-```
+:::
 
 ### Warmup
 
@@ -252,9 +252,9 @@ INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size
 
 This example uses the same buckets as in the [Bucketing Mechanism](#gaudi-bucketing-mechanism) section. Each output line corresponds to execution of a single bucket. When bucket is executed for the first time, its graph is compiled and can be reused later on, skipping further graph compilations.
 
-```{tip}
+:::{tip}
 Compiling all the buckets might take some time and can be turned off with `VLLM_SKIP_WARMUP=true` environment variable. Keep in mind that if you do that, you may face graph compilations once executing a given bucket for the first time. It is fine to disable warmup for development, but it's highly recommended to enable it in deployment.
-```
+:::
 
 ### HPU Graph capture
 
@@ -269,9 +269,9 @@ With its default value (`VLLM_GRAPH_RESERVED_MEM=0.1`), 10% of usable memory wil
 Environment variable `VLLM_GRAPH_PROMPT_RATIO` determines the ratio of usable graph memory reserved for prefill and decode graphs. By default (`VLLM_GRAPH_PROMPT_RATIO=0.3`), both stages have equal memory constraints.
 Lower value corresponds to less usable graph memory reserved for prefill stage, e.g. `VLLM_GRAPH_PROMPT_RATIO=0.2` will reserve 20% of usable graph memory for prefill graphs, and 80% of usable graph memory for decode graphs.
 
-```{note}
+:::{note}
 `gpu_memory_utilization` does not correspond to the absolute memory usage across HPU. It specifies the memory margin after loading the model and performing a profile run. If device has 100 GiB of total memory, and 50 GiB of free memory after loading model weights and executing profiling run, `gpu_memory_utilization` at its default value will mark 90% of 50 GiB as usable, leaving 5 GiB of margin, regardless of total device memory.
-```
+:::
 
 User can also configure the strategy for capturing HPU Graphs for prompt and decode stages separately. Strategy affects the order of capturing graphs. There are two strategies implemented:
 \- `max_bs` - graph capture queue will sorted in descending order by their batch sizes. Buckets with equal batch sizes are sorted by sequence length in ascending order (e.g. `(64, 128)`, `(64, 256)`, `(32, 128)`, `(32, 256)`, `(1, 128)`, `(1,256)`), default strategy for decode
@@ -279,9 +279,9 @@ User can also configure the strategy for capturing HPU Graphs for prompt and dec
 
 When there's large amount of requests pending, vLLM scheduler will attempt to fill the maximum batch size for decode as soon as possible. When a request is finished, decode batch size decreases. When that happens, vLLM will attempt to schedule a prefill iteration for requests in the waiting queue, to fill the decode batch size to its previous state. This means that in a full load scenario, decode batch size is often at its maximum, which makes large batch size HPU Graphs crucial to capture, as reflected by `max_bs` strategy. On the other hand, prefills will be executed most frequently with very low batch sizes (1-4), which is reflected in `min_tokens` strategy.
 
-```{note}
+:::{note}
 `VLLM_GRAPH_PROMPT_RATIO` does not set a hard limit on memory taken by graphs for each stage (prefill and decode). vLLM will first attempt to use up entirety of usable prefill graph memory (usable graph memory * `VLLM_GRAPH_PROMPT_RATIO`) for capturing prefill HPU Graphs, next it will attempt do the same for decode graphs and usable decode graph memory pool. If one stage is fully captured, and there is unused memory left within usable graph memory pool, vLLM will attempt further graph capture for the other stage, until no more HPU Graphs can be captured without exceeding reserved memory pool. The behavior on that mechanism can be observed in the example below.
-```
+:::
 
 Each described step is logged by vLLM server, as follows (negative values correspond to memory being released):
 
@@ -352,13 +352,13 @@ INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of devi
 
 - `VLLM_{phase}_{dim}_BUCKET_{param}` - collection of 12 environment variables configuring ranges of bucketing mechanism
 
-  - `{phase}` is either `PROMPT` or `DECODE`
+  * `{phase}` is either `PROMPT` or `DECODE`
 
-  - `{dim}` is either `BS`, `SEQ` or `BLOCK`
+  * `{dim}` is either `BS`, `SEQ` or `BLOCK`
 
-  - `{param}` is either `MIN`, `STEP` or `MAX`
+  * `{param}` is either `MIN`, `STEP` or `MAX`
 
-  - Default values:
+  * Default values:
 
     - Prompt:
       - batch size min (`VLLM_PROMPT_BS_BUCKET_MIN`): `1`
diff --git a/docs/source/getting_started/installation/ai_accelerator/index.md b/docs/source/getting_started/installation/ai_accelerator/index.md
index a6c4c44305a4c..88352f639567b 100644
--- a/docs/source/getting_started/installation/ai_accelerator/index.md
+++ b/docs/source/getting_started/installation/ai_accelerator/index.md
@@ -2,374 +2,374 @@
 
 vLLM is a Python library that supports the following AI accelerators. Select your AI accelerator type to see vendor specific instructions:
 
-::::{tab-set}
+:::::{tab-set}
 :sync-group: device
 
-:::{tab-item} TPU
+::::{tab-item} TPU
 :sync: tpu
 
-```{include} tpu.inc.md
+:::{include} tpu.inc.md
 :start-after: "# Installation"
 :end-before: "## Requirements"
-```
-
 :::
 
-:::{tab-item} Intel Gaudi
+::::
+
+::::{tab-item} Intel Gaudi
 :sync: hpu-gaudi
 
-```{include} hpu-gaudi.inc.md
+:::{include} hpu-gaudi.inc.md
 :start-after: "# Installation"
 :end-before: "## Requirements"
-```
-
 :::
 
-:::{tab-item} Neuron
+::::
+
+::::{tab-item} Neuron
 :sync: neuron
 
-```{include} neuron.inc.md
+:::{include} neuron.inc.md
 :start-after: "# Installation"
 :end-before: "## Requirements"
-```
-
 :::
 
-:::{tab-item} OpenVINO
+::::
+
+::::{tab-item} OpenVINO
 :sync: openvino
 
-```{include} openvino.inc.md
+:::{include} openvino.inc.md
 :start-after: "# Installation"
 :end-before: "## Requirements"
-```
-
 :::
 
 ::::
 
+:::::
+
 ## Requirements
 
-::::{tab-set}
+:::::{tab-set}
 :sync-group: device
 
-:::{tab-item} TPU
+::::{tab-item} TPU
 :sync: tpu
 
-```{include} tpu.inc.md
+:::{include} tpu.inc.md
 :start-after: "## Requirements"
 :end-before: "## Configure a new environment"
-```
-
 :::
 
-:::{tab-item} Intel Gaudi
+::::
+
+::::{tab-item} Intel Gaudi
 :sync: hpu-gaudi
 
-```{include} hpu-gaudi.inc.md
+:::{include} hpu-gaudi.inc.md
 :start-after: "## Requirements"
 :end-before: "## Configure a new environment"
-```
-
 :::
 
-:::{tab-item} Neuron
+::::
+
+::::{tab-item} Neuron
 :sync: neuron
 
-```{include} neuron.inc.md
+:::{include} neuron.inc.md
 :start-after: "## Requirements"
 :end-before: "## Configure a new environment"
-```
-
 :::
 
-:::{tab-item} OpenVINO
+::::
+
+::::{tab-item} OpenVINO
 :sync: openvino
 
-```{include} openvino.inc.md
+:::{include} openvino.inc.md
 :start-after: "## Requirements"
 :end-before: "## Set up using Python"
-```
-
 :::
 
 ::::
 
+:::::
+
 ## Configure a new environment
 
-::::{tab-set}
+:::::{tab-set}
 :sync-group: device
 
-:::{tab-item} TPU
+::::{tab-item} TPU
 :sync: tpu
 
-```{include} tpu.inc.md
+:::{include} tpu.inc.md
 :start-after: "## Configure a new environment"
 :end-before: "## Set up using Python"
-```
-
 :::
 
-:::{tab-item} Intel Gaudi
+::::
+
+::::{tab-item} Intel Gaudi
 :sync: hpu-gaudi
 
-```{include} hpu-gaudi.inc.md
+:::{include} hpu-gaudi.inc.md
 :start-after: "## Configure a new environment"
 :end-before: "## Set up using Python"
-```
-
 :::
 
-:::{tab-item} Neuron
+::::
+
+::::{tab-item} Neuron
 :sync: neuron
 
-```{include} neuron.inc.md
+:::{include} neuron.inc.md
 :start-after: "## Configure a new environment"
 :end-before: "## Set up using Python"
-```
-
 :::
 
-:::{tab-item} OpenVINO
-:sync: openvino
+::::
 
-```{include} ../python_env_setup.inc.md
-```
+::::{tab-item} OpenVINO
+:sync: openvino
 
+:::{include} ../python_env_setup.inc.md
 :::
 
 ::::
 
+:::::
+
 ## Set up using Python
 
 ### Pre-built wheels
 
-::::{tab-set}
+:::::{tab-set}
 :sync-group: device
 
-:::{tab-item} TPU
+::::{tab-item} TPU
 :sync: tpu
 
-```{include} tpu.inc.md
+:::{include} tpu.inc.md
 :start-after: "### Pre-built wheels"
 :end-before: "### Build wheel from source"
-```
-
 :::
 
-:::{tab-item} Intel Gaudi
+::::
+
+::::{tab-item} Intel Gaudi
 :sync: hpu-gaudi
 
-```{include} hpu-gaudi.inc.md
+:::{include} hpu-gaudi.inc.md
 :start-after: "### Pre-built wheels"
 :end-before: "### Build wheel from source"
-```
-
 :::
 
-:::{tab-item} Neuron
+::::
+
+::::{tab-item} Neuron
 :sync: neuron
 
-```{include} neuron.inc.md
+:::{include} neuron.inc.md
 :start-after: "### Pre-built wheels"
 :end-before: "### Build wheel from source"
-```
-
 :::
 
-:::{tab-item} OpenVINO
+::::
+
+::::{tab-item} OpenVINO
 :sync: openvino
 
-```{include} openvino.inc.md
+:::{include} openvino.inc.md
 :start-after: "### Pre-built wheels"
 :end-before: "### Build wheel from source"
-```
-
 :::
 
 ::::
 
+:::::
+
 ### Build wheel from source
 
-::::{tab-set}
+:::::{tab-set}
 :sync-group: device
 
-:::{tab-item} TPU
+::::{tab-item} TPU
 :sync: tpu
 
-```{include} tpu.inc.md
+:::{include} tpu.inc.md
 :start-after: "### Build wheel from source"
 :end-before: "## Set up using Docker"
-```
-
 :::
 
-:::{tab-item} Intel Gaudi
+::::
+
+::::{tab-item} Intel Gaudi
 :sync: hpu-gaudi
 
-```{include} hpu-gaudi.inc.md
+:::{include} hpu-gaudi.inc.md
 :start-after: "### Build wheel from source"
 :end-before: "## Set up using Docker"
-```
-
 :::
 
-:::{tab-item} Neuron
+::::
+
+::::{tab-item} Neuron
 :sync: neuron
 
-```{include} neuron.inc.md
+:::{include} neuron.inc.md
 :start-after: "### Build wheel from source"
 :end-before: "## Set up using Docker"
-```
-
 :::
 
-:::{tab-item} OpenVINO
+::::
+
+::::{tab-item} OpenVINO
 :sync: openvino
 
-```{include} openvino.inc.md
+:::{include} openvino.inc.md
 :start-after: "### Build wheel from source"
 :end-before: "## Set up using Docker"
-```
-
 :::
 
 ::::
 
+:::::
+
 ## Set up using Docker
 
 ### Pre-built images
 
-::::{tab-set}
+:::::{tab-set}
 :sync-group: device
 
-:::{tab-item} TPU
+::::{tab-item} TPU
 :sync: tpu
 
-```{include} tpu.inc.md
+:::{include} tpu.inc.md
 :start-after: "### Pre-built images"
 :end-before: "### Build image from source"
-```
-
 :::
 
-:::{tab-item} Intel Gaudi
+::::
+
+::::{tab-item} Intel Gaudi
 :sync: hpu-gaudi
 
-```{include} hpu-gaudi.inc.md
+:::{include} hpu-gaudi.inc.md
 :start-after: "### Pre-built images"
 :end-before: "### Build image from source"
-```
-
 :::
 
-:::{tab-item} Neuron
+::::
+
+::::{tab-item} Neuron
 :sync: neuron
 
-```{include} neuron.inc.md
+:::{include} neuron.inc.md
 :start-after: "### Pre-built images"
 :end-before: "### Build image from source"
-```
-
 :::
 
-:::{tab-item} OpenVINO
+::::
+
+::::{tab-item} OpenVINO
 :sync: openvino
 
-```{include} openvino.inc.md
+:::{include} openvino.inc.md
 :start-after: "### Pre-built images"
 :end-before: "### Build image from source"
-```
-
 :::
 
 ::::
 
+:::::
+
 ### Build image from source
 
-::::{tab-set}
+:::::{tab-set}
 :sync-group: device
 
-:::{tab-item} TPU
+::::{tab-item} TPU
 :sync: tpu
 
-```{include} tpu.inc.md
+:::{include} tpu.inc.md
 :start-after: "### Build image from source"
 :end-before: "## Extra information"
-```
-
 :::
 
-:::{tab-item} Intel Gaudi
+::::
+
+::::{tab-item} Intel Gaudi
 :sync: hpu-gaudi
 
-```{include} hpu-gaudi.inc.md
+:::{include} hpu-gaudi.inc.md
 :start-after: "### Build image from source"
 :end-before: "## Extra information"
-```
-
 :::
 
-:::{tab-item} Neuron
+::::
+
+::::{tab-item} Neuron
 :sync: neuron
 
-```{include} neuron.inc.md
+:::{include} neuron.inc.md
 :start-after: "### Build image from source"
 :end-before: "## Extra information"
-```
-
 :::
 
-:::{tab-item} OpenVINO
+::::
+
+::::{tab-item} OpenVINO
 :sync: openvino
 
-```{include} openvino.inc.md
+:::{include} openvino.inc.md
 :start-after: "### Build image from source"
 :end-before: "## Extra information"
-```
-
 :::
 
 ::::
 
+:::::
+
 ## Extra information
 
-::::{tab-set}
+:::::{tab-set}
 :sync-group: device
 
-:::{tab-item} TPU
+::::{tab-item} TPU
 :sync: tpu
 
-```{include} tpu.inc.md
+:::{include} tpu.inc.md
 :start-after: "## Extra information"
-```
-
 :::
 
-:::{tab-item} Intel Gaudi
+::::
+
+::::{tab-item} Intel Gaudi
 :sync: hpu-gaudi
 
-```{include} hpu-gaudi.inc.md
+:::{include} hpu-gaudi.inc.md
 :start-after: "## Extra information"
-```
-
 :::
 
-:::{tab-item} Neuron
+::::
+
+::::{tab-item} Neuron
 :sync: neuron
 
-```{include} neuron.inc.md
+:::{include} neuron.inc.md
 :start-after: "## Extra information"
-```
-
 :::
 
-:::{tab-item} OpenVINO
+::::
+
+::::{tab-item} OpenVINO
 :sync: openvino
 
-```{include} openvino.inc.md
+:::{include} openvino.inc.md
 :start-after: "## Extra information"
-```
-
 :::
 
 ::::
+
+:::::
diff --git a/docs/source/getting_started/installation/ai_accelerator/neuron.inc.md b/docs/source/getting_started/installation/ai_accelerator/neuron.inc.md
index 575a9f9c2e2f0..145cc9d668efd 100644
--- a/docs/source/getting_started/installation/ai_accelerator/neuron.inc.md
+++ b/docs/source/getting_started/installation/ai_accelerator/neuron.inc.md
@@ -67,9 +67,9 @@ Currently, there are no pre-built Neuron wheels.
 
 ### Build wheel from source
 
-```{note}
+:::{note}
 The currently supported version of Pytorch for Neuron installs `triton` version `2.1.0`. This is incompatible with `vllm >= 0.5.3`. You may see an error `cannot import name 'default_dump_dir...`. To work around this, run a `pip install --upgrade triton==3.0.0` after installing the vLLM wheel.
-```
+:::
 
 Following instructions are applicable to Neuron SDK 2.16 and beyond.
 
diff --git a/docs/source/getting_started/installation/ai_accelerator/tpu.inc.md b/docs/source/getting_started/installation/ai_accelerator/tpu.inc.md
index 6a911cc6b9eba..6827afc805fd8 100644
--- a/docs/source/getting_started/installation/ai_accelerator/tpu.inc.md
+++ b/docs/source/getting_started/installation/ai_accelerator/tpu.inc.md
@@ -47,10 +47,10 @@ When you request queued resources, the request is added to a queue maintained by
 the Cloud TPU service. When the requested resource becomes available, it's
 assigned to your Google Cloud project for your immediate exclusive use.
 
-```{note}
+:::{note}
 In all of the following commands, replace the ALL CAPS parameter names with
 appropriate values. See the parameter descriptions table for more information.
-```
+:::
 
 ### Provision Cloud TPUs with GKE
 
@@ -75,33 +75,33 @@ gcloud alpha compute tpus queued-resources create QUEUED_RESOURCE_ID \
 --service-account SERVICE_ACCOUNT
 ```
 
-```{list-table} Parameter descriptions
+:::{list-table} Parameter descriptions
 :header-rows: 1
 
-* - Parameter name
-  - Description
-* - QUEUED_RESOURCE_ID
-  - The user-assigned ID of the queued resource request.
-* - TPU_NAME
-  - The user-assigned name of the TPU which is created when the queued
+- * Parameter name
+  * Description
+- * QUEUED_RESOURCE_ID
+  * The user-assigned ID of the queued resource request.
+- * TPU_NAME
+  * The user-assigned name of the TPU which is created when the queued
     resource request is allocated.
-* - PROJECT_ID
-  - Your Google Cloud project
-* - ZONE
-  - The GCP zone where you want to create your Cloud TPU. The value you use
+- * PROJECT_ID
+  * Your Google Cloud project
+- * ZONE
+  * The GCP zone where you want to create your Cloud TPU. The value you use
     depends on the version of TPUs you are using. For more information, see
     `TPU regions and zones <https://cloud.google.com/tpu/docs/regions-zones>`_
-* - ACCELERATOR_TYPE
-  - The TPU version you want to use. Specify the TPU version, for example
+- * ACCELERATOR_TYPE
+  * The TPU version you want to use. Specify the TPU version, for example
     `v5litepod-4` specifies a v5e TPU with 4 cores. For more information,
     see `TPU versions <https://cloud.devsite.corp.google.com/tpu/docs/system-architecture-tpu-vm#versions>`_.
-* - RUNTIME_VERSION
-  - The TPU VM runtime version to use. For more information see `TPU VM images <https://cloud.google.com/tpu/docs/runtimes>`_.
-* - SERVICE_ACCOUNT
-  - The email address for your service account. You can find it in the IAM
+- * RUNTIME_VERSION
+  * The TPU VM runtime version to use. For more information see `TPU VM images <https://cloud.google.com/tpu/docs/runtimes>`_.
+- * SERVICE_ACCOUNT
+  * The email address for your service account. You can find it in the IAM
     Cloud Console under *Service Accounts*. For example:
     `tpu-service-account@<your_project_ID>.iam.gserviceaccount.com`
-```
+:::
 
 Connect to your TPU using SSH:
 
@@ -178,15 +178,15 @@ Run the Docker image with the following command:
 docker run --privileged --net host --shm-size=16G -it vllm-tpu
 ```
 
-```{note}
+:::{note}
 Since TPU relies on XLA which requires static shapes, vLLM bucketizes the
 possible input shapes and compiles an XLA graph for each shape. The
 compilation time may take 20~30 minutes in the first run. However, the
 compilation time reduces to ~5 minutes afterwards because the XLA graphs are
 cached in the disk (in {code}`VLLM_XLA_CACHE_PATH` or {code}`~/.cache/vllm/xla_cache` by default).
-```
+:::
 
-````{tip}
+:::{tip}
 If you encounter the following error:
 
 ```console
@@ -198,9 +198,10 @@ file or directory
 Install OpenBLAS with the following command:
 
 ```console
-$ sudo apt-get install libopenblas-base libopenmpi-dev libomp-dev
+sudo apt-get install libopenblas-base libopenmpi-dev libomp-dev
 ```
-````
+
+:::
 
 ## Extra information
 
diff --git a/docs/source/getting_started/installation/cpu/apple.inc.md b/docs/source/getting_started/installation/cpu/apple.inc.md
index 56545253b1ef7..0808b869fdb7b 100644
--- a/docs/source/getting_started/installation/cpu/apple.inc.md
+++ b/docs/source/getting_started/installation/cpu/apple.inc.md
@@ -25,9 +25,9 @@ pip install -r requirements-cpu.txt
 pip install -e . 
 ```
 
-```{note}
+:::{note}
 On macOS the `VLLM_TARGET_DEVICE` is automatically set to `cpu`, which currently is the only supported device.
-```
+:::
 
 #### Troubleshooting
 
diff --git a/docs/source/getting_started/installation/cpu/index.md b/docs/source/getting_started/installation/cpu/index.md
index 4ec907c0e9fda..2f549ede0cf48 100644
--- a/docs/source/getting_started/installation/cpu/index.md
+++ b/docs/source/getting_started/installation/cpu/index.md
@@ -2,86 +2,86 @@
 
 vLLM is a Python library that supports the following CPU variants. Select your CPU type to see vendor specific instructions:
 
-::::{tab-set}
+:::::{tab-set}
 :sync-group: device
 
-:::{tab-item} x86
+::::{tab-item} x86
 :sync: x86
 
-```{include} x86.inc.md
+:::{include} x86.inc.md
 :start-after: "# Installation"
 :end-before: "## Requirements"
-```
-
 :::
 
-:::{tab-item} ARM
+::::
+
+::::{tab-item} ARM
 :sync: arm
 
-```{include} arm.inc.md
+:::{include} arm.inc.md
 :start-after: "# Installation"
 :end-before: "## Requirements"
-```
-
 :::
 
-:::{tab-item} Apple silicon
+::::
+
+::::{tab-item} Apple silicon
 :sync: apple
 
-```{include} apple.inc.md
+:::{include} apple.inc.md
 :start-after: "# Installation"
 :end-before: "## Requirements"
-```
-
 :::
 
 ::::
 
+:::::
+
 ## Requirements
 
 - Python: 3.9 -- 3.12
 
-::::{tab-set}
+:::::{tab-set}
 :sync-group: device
 
-:::{tab-item} x86
+::::{tab-item} x86
 :sync: x86
 
-```{include} x86.inc.md
+:::{include} x86.inc.md
 :start-after: "## Requirements"
 :end-before: "## Set up using Python"
-```
-
 :::
 
-:::{tab-item} ARM
+::::
+
+::::{tab-item} ARM
 :sync: arm
 
-```{include} arm.inc.md
+:::{include} arm.inc.md
 :start-after: "## Requirements"
 :end-before: "## Set up using Python"
-```
-
 :::
 
-:::{tab-item} Apple silicon
+::::
+
+::::{tab-item} Apple silicon
 :sync: apple
 
-```{include} apple.inc.md
+:::{include} apple.inc.md
 :start-after: "## Requirements"
 :end-before: "## Set up using Python"
-```
-
 :::
 
 ::::
 
+:::::
+
 ## Set up using Python
 
 ### Create a new Python environment
 
-```{include} ../python_env_setup.inc.md
-```
+:::{include} ../python_env_setup.inc.md
+:::
 
 ### Pre-built wheels
 
@@ -89,41 +89,41 @@ Currently, there are no pre-built CPU wheels.
 
 ### Build wheel from source
 
-::::{tab-set}
+:::::{tab-set}
 :sync-group: device
 
-:::{tab-item} x86
+::::{tab-item} x86
 :sync: x86
 
-```{include} x86.inc.md
+:::{include} x86.inc.md
 :start-after: "### Build wheel from source"
 :end-before: "## Set up using Docker"
-```
-
 :::
 
-:::{tab-item} ARM
+::::
+
+::::{tab-item} ARM
 :sync: arm
 
-```{include} arm.inc.md
+:::{include} arm.inc.md
 :start-after: "### Build wheel from source"
 :end-before: "## Set up using Docker"
-```
-
 :::
 
-:::{tab-item} Apple silicon
+::::
+
+::::{tab-item} Apple silicon
 :sync: apple
 
-```{include} apple.inc.md
+:::{include} apple.inc.md
 :start-after: "### Build wheel from source"
 :end-before: "## Set up using Docker"
-```
-
 :::
 
 ::::
 
+:::::
+
 ## Set up using Docker
 
 ### Pre-built images
@@ -142,9 +142,9 @@ $ docker run -it \
              vllm-cpu-env
 ```
 
-:::{tip}
+::::{tip}
 For ARM or Apple silicon, use `Dockerfile.arm`
-:::
+::::
 
 ## Supported features
 
diff --git a/docs/source/getting_started/installation/cpu/x86.inc.md b/docs/source/getting_started/installation/cpu/x86.inc.md
index e0eaac5099305..f146ae0918b44 100644
--- a/docs/source/getting_started/installation/cpu/x86.inc.md
+++ b/docs/source/getting_started/installation/cpu/x86.inc.md
@@ -17,10 +17,10 @@ vLLM initially supports basic model inferencing and serving on x86 CPU platform,
 :::{include} build.inc.md
 :::
 
-```{note}
+:::{note}
 - AVX512_BF16 is an extension ISA provides native BF16 data type conversion and vector product instructions, which brings some performance improvement compared with pure AVX512. The CPU backend build script will check the host CPU flags to determine whether to enable AVX512_BF16.
 - If you want to force enable AVX512_BF16 for the cross-compilation, please set environment variable `VLLM_CPU_AVX512BF16=1` before the building.
-```
+:::
 
 ## Set up using Docker
 
diff --git a/docs/source/getting_started/installation/gpu/cuda.inc.md b/docs/source/getting_started/installation/gpu/cuda.inc.md
index 4cce65278c069..5c2ea30dbfde1 100644
--- a/docs/source/getting_started/installation/gpu/cuda.inc.md
+++ b/docs/source/getting_started/installation/gpu/cuda.inc.md
@@ -10,9 +10,9 @@ vLLM contains pre-compiled C++ and CUDA (12.1) binaries.
 
 ### Create a new Python environment
 
-```{note}
+:::{note}
 PyTorch installed via `conda` will statically link `NCCL` library, which can cause issues when vLLM tries to use `NCCL`. See <gh-issue:8420> for more details.
-```
+:::
 
 In order to be performant, vLLM has to compile many cuda kernels. The compilation unfortunately introduces binary incompatibility with other CUDA versions and PyTorch versions, even for the same PyTorch version with different building configurations.
 
@@ -100,10 +100,10 @@ pip install --editable .
 
 You can find more information about vLLM's wheels in <project:#install-the-latest-code>.
 
-```{note}
+:::{note}
 There is a possibility that your source code may have a different commit ID compared to the latest vLLM wheel, which could potentially lead to unknown errors.
 It is recommended to use the same commit ID for the source code as the vLLM wheel you have installed. Please refer to <project:#install-the-latest-code> for instructions on how to install a specified wheel.
-```
+:::
 
 #### Full build (with compilation)
 
@@ -115,7 +115,7 @@ cd vllm
 pip install -e .
 ```
 
-```{tip}
+:::{tip}
 Building from source requires a lot of compilation. If you are building from source repeatedly, it's more efficient to cache the compilation results.
 
 For example, you can install [ccache](https://github.com/ccache/ccache) using `conda install ccache` or `apt install ccache` .
@@ -123,7 +123,7 @@ As long as `which ccache` command can find the `ccache` binary, it will be used
 
 [sccache](https://github.com/mozilla/sccache) works similarly to `ccache`, but has the capability to utilize caching in remote storage environments.
 The following environment variables can be set to configure the vLLM `sccache` remote: `SCCACHE_BUCKET=vllm-build-sccache SCCACHE_REGION=us-west-2 SCCACHE_S3_NO_CREDENTIALS=1`. We also recommend setting `SCCACHE_IDLE_TIMEOUT=0`.
-```
+:::
 
 ##### Use an existing PyTorch installation
 
diff --git a/docs/source/getting_started/installation/gpu/index.md b/docs/source/getting_started/installation/gpu/index.md
index 6c007382b2c3d..0a61f889753a3 100644
--- a/docs/source/getting_started/installation/gpu/index.md
+++ b/docs/source/getting_started/installation/gpu/index.md
@@ -2,299 +2,299 @@
 
 vLLM is a Python library that supports the following GPU variants. Select your GPU type to see vendor specific instructions:
 
-::::{tab-set}
+:::::{tab-set}
 :sync-group: device
 
-:::{tab-item} CUDA
+::::{tab-item} CUDA
 :sync: cuda
 
-```{include} cuda.inc.md
+:::{include} cuda.inc.md
 :start-after: "# Installation"
 :end-before: "## Requirements"
-```
-
 :::
 
-:::{tab-item} ROCm
+::::
+
+::::{tab-item} ROCm
 :sync: rocm
 
-```{include} rocm.inc.md
+:::{include} rocm.inc.md
 :start-after: "# Installation"
 :end-before: "## Requirements"
-```
-
 :::
 
-:::{tab-item} XPU
+::::
+
+::::{tab-item} XPU
 :sync: xpu
 
-```{include} xpu.inc.md
+:::{include} xpu.inc.md
 :start-after: "# Installation"
 :end-before: "## Requirements"
-```
-
 :::
 
 ::::
 
+:::::
+
 ## Requirements
 
 - OS: Linux
 - Python: 3.9 -- 3.12
 
-::::{tab-set}
+:::::{tab-set}
 :sync-group: device
 
-:::{tab-item} CUDA
+::::{tab-item} CUDA
 :sync: cuda
 
-```{include} cuda.inc.md
+:::{include} cuda.inc.md
 :start-after: "## Requirements"
 :end-before: "## Set up using Python"
-```
-
 :::
 
-:::{tab-item} ROCm
+::::
+
+::::{tab-item} ROCm
 :sync: rocm
 
-```{include} rocm.inc.md
+:::{include} rocm.inc.md
 :start-after: "## Requirements"
 :end-before: "## Set up using Python"
-```
-
 :::
 
-:::{tab-item} XPU
+::::
+
+::::{tab-item} XPU
 :sync: xpu
 
-```{include} xpu.inc.md
+:::{include} xpu.inc.md
 :start-after: "## Requirements"
 :end-before: "## Set up using Python"
-```
-
 :::
 
 ::::
 
+:::::
+
 ## Set up using Python
 
 ### Create a new Python environment
 
-```{include} ../python_env_setup.inc.md
-```
+:::{include} ../python_env_setup.inc.md
+:::
 
-::::{tab-set}
+:::::{tab-set}
 :sync-group: device
 
-:::{tab-item} CUDA
+::::{tab-item} CUDA
 :sync: cuda
 
-```{include} cuda.inc.md
+:::{include} cuda.inc.md
 :start-after: "## Create a new Python environment"
 :end-before: "### Pre-built wheels"
-```
-
 :::
 
-:::{tab-item} ROCm
+::::
+
+::::{tab-item} ROCm
 :sync: rocm
 
 There is no extra information on creating a new Python environment for this device.
 
-:::
+::::
 
-:::{tab-item} XPU
+::::{tab-item} XPU
 :sync: xpu
 
 There is no extra information on creating a new Python environment for this device.
 
-:::
-
 ::::
 
+:::::
+
 ### Pre-built wheels
 
-::::{tab-set}
+:::::{tab-set}
 :sync-group: device
 
-:::{tab-item} CUDA
+::::{tab-item} CUDA
 :sync: cuda
 
-```{include} cuda.inc.md
+:::{include} cuda.inc.md
 :start-after: "### Pre-built wheels"
 :end-before: "### Build wheel from source"
-```
-
 :::
 
-:::{tab-item} ROCm
+::::
+
+::::{tab-item} ROCm
 :sync: rocm
 
-```{include} rocm.inc.md
+:::{include} rocm.inc.md
 :start-after: "### Pre-built wheels"
 :end-before: "### Build wheel from source"
-```
-
 :::
 
-:::{tab-item} XPU
+::::
+
+::::{tab-item} XPU
 :sync: xpu
 
-```{include} xpu.inc.md
+:::{include} xpu.inc.md
 :start-after: "### Pre-built wheels"
 :end-before: "### Build wheel from source"
-```
-
 :::
 
 ::::
 
+:::::
+
 (build-from-source)=
 
 ### Build wheel from source
 
-::::{tab-set}
+:::::{tab-set}
 :sync-group: device
 
-:::{tab-item} CUDA
+::::{tab-item} CUDA
 :sync: cuda
 
-```{include} cuda.inc.md
+:::{include} cuda.inc.md
 :start-after: "### Build wheel from source"
 :end-before: "## Set up using Docker"
-```
-
 :::
 
-:::{tab-item} ROCm
+::::
+
+::::{tab-item} ROCm
 :sync: rocm
 
-```{include} rocm.inc.md
+:::{include} rocm.inc.md
 :start-after: "### Build wheel from source"
 :end-before: "## Set up using Docker"
-```
-
 :::
 
-:::{tab-item} XPU
+::::
+
+::::{tab-item} XPU
 :sync: xpu
 
-```{include} xpu.inc.md
+:::{include} xpu.inc.md
 :start-after: "### Build wheel from source"
 :end-before: "## Set up using Docker"
-```
-
 :::
 
 ::::
 
+:::::
+
 ## Set up using Docker
 
 ### Pre-built images
 
-::::{tab-set}
+:::::{tab-set}
 :sync-group: device
 
-:::{tab-item} CUDA
+::::{tab-item} CUDA
 :sync: cuda
 
-```{include} cuda.inc.md
+:::{include} cuda.inc.md
 :start-after: "### Pre-built images"
 :end-before: "### Build image from source"
-```
-
 :::
 
-:::{tab-item} ROCm
+::::
+
+::::{tab-item} ROCm
 :sync: rocm
 
-```{include} rocm.inc.md
+:::{include} rocm.inc.md
 :start-after: "### Pre-built images"
 :end-before: "### Build image from source"
-```
-
 :::
 
-:::{tab-item} XPU
+::::
+
+::::{tab-item} XPU
 :sync: xpu
 
-```{include} xpu.inc.md
+:::{include} xpu.inc.md
 :start-after: "### Pre-built images"
 :end-before: "### Build image from source"
-```
-
 :::
 
 ::::
 
+:::::
+
 ### Build image from source
 
-::::{tab-set}
+:::::{tab-set}
 :sync-group: device
 
-:::{tab-item} CUDA
+::::{tab-item} CUDA
 :sync: cuda
 
-```{include} cuda.inc.md
+:::{include} cuda.inc.md
 :start-after: "### Build image from source"
 :end-before: "## Supported features"
-```
-
 :::
 
-:::{tab-item} ROCm
+::::
+
+::::{tab-item} ROCm
 :sync: rocm
 
-```{include} rocm.inc.md
+:::{include} rocm.inc.md
 :start-after: "### Build image from source"
 :end-before: "## Supported features"
-```
-
 :::
 
-:::{tab-item} XPU
+::::
+
+::::{tab-item} XPU
 :sync: xpu
 
-```{include} xpu.inc.md
+:::{include} xpu.inc.md
 :start-after: "### Build image from source"
 :end-before: "## Supported features"
-```
-
 :::
 
 ::::
 
+:::::
+
 ## Supported features
 
-::::{tab-set}
+:::::{tab-set}
 :sync-group: device
 
-:::{tab-item} CUDA
+::::{tab-item} CUDA
 :sync: cuda
 
-```{include} cuda.inc.md
+:::{include} cuda.inc.md
 :start-after: "## Supported features"
-```
-
 :::
 
-:::{tab-item} ROCm
+::::
+
+::::{tab-item} ROCm
 :sync: rocm
 
-```{include} rocm.inc.md
+:::{include} rocm.inc.md
 :start-after: "## Supported features"
-```
-
 :::
 
-:::{tab-item} XPU
+::::
+
+::::{tab-item} XPU
 :sync: xpu
 
-```{include} xpu.inc.md
+:::{include} xpu.inc.md
 :start-after: "## Supported features"
-```
-
 :::
 
 ::::
+
+:::::
diff --git a/docs/source/getting_started/installation/gpu/rocm.inc.md b/docs/source/getting_started/installation/gpu/rocm.inc.md
index 69238f6e36fb2..131ad1704ea11 100644
--- a/docs/source/getting_started/installation/gpu/rocm.inc.md
+++ b/docs/source/getting_started/installation/gpu/rocm.inc.md
@@ -16,10 +16,10 @@ Currently, there are no pre-built ROCm wheels.
 However, the [AMD Infinity hub for vLLM](https://hub.docker.com/r/rocm/vllm/tags) offers a prebuilt, optimized
 docker image designed for validating inference performance on the AMD Instinct™ MI300X accelerator.
 
-```{tip}
+:::{tip}
 Please check [LLM inference performance validation on AMD Instinct MI300X](https://rocm.docs.amd.com/en/latest/how-to/performance-validation/mi300x/vllm-benchmark.html)
 for instructions on how to use this prebuilt docker image.
-```
+:::
 
 ### Build wheel from source
 
@@ -47,9 +47,9 @@ for instructions on how to use this prebuilt docker image.
     cd ../..
     ```
 
-    ```{note}
-    - If you see HTTP issue related to downloading packages during building triton, please try again as the HTTP error is intermittent.
-    ```
+    :::{note}
+    If you see HTTP issue related to downloading packages during building triton, please try again as the HTTP error is intermittent.
+    :::
 
 2. Optionally, if you choose to use CK flash attention, you can install [flash attention for ROCm](https://github.com/ROCm/flash-attention/tree/ck_tile)
 
@@ -67,9 +67,9 @@ for instructions on how to use this prebuilt docker image.
     cd ..
     ```
 
-    ```{note}
-    - You might need to downgrade the "ninja" version to 1.10 it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`)
-    ```
+    :::{note}
+    You might need to downgrade the "ninja" version to 1.10 it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`)
+    :::
 
 3. Build vLLM. For example, vLLM on ROCM 6.2 can be built with the following steps:
 
@@ -95,17 +95,18 @@ for instructions on how to use this prebuilt docker image.
 
     This may take 5-10 minutes. Currently, `pip install .` does not work for ROCm installation.
 
-    ```{tip}
+<!--- pyml disable-num-lines 5 ul-indent-->
+    :::{tip}
     - Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers.
     - Triton flash attention does not currently support sliding window attention. If using half precision, please use CK flash-attention for sliding window support.
     - To use CK flash-attention or PyTorch naive attention, please use this flag `export VLLM_USE_TRITON_FLASH_ATTN=0` to turn off triton flash attention.
     - The ROCm version of PyTorch, ideally, should match the ROCm driver version.
-    ```
+    :::
 
-```{tip}
+:::{tip}
 - For MI300x (gfx942) users, to achieve optimal performance, please refer to [MI300x tuning guide](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/index.html) for performance optimization and tuning tips on system and workflow level.
   For vLLM, please refer to [vLLM performance optimization](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/workload.html#vllm-performance-optimization).
-```
+:::
 
 ## Set up using Docker
 
diff --git a/docs/source/getting_started/installation/gpu/xpu.inc.md b/docs/source/getting_started/installation/gpu/xpu.inc.md
index 577986eba74fd..bc01c6000bc07 100644
--- a/docs/source/getting_started/installation/gpu/xpu.inc.md
+++ b/docs/source/getting_started/installation/gpu/xpu.inc.md
@@ -30,10 +30,10 @@ pip install -v -r requirements-xpu.txt
 VLLM_TARGET_DEVICE=xpu python setup.py install
 ```
 
-```{note}
+:::{note}
 - FP16 is the default data type in the current XPU backend. The BF16 data
   type will be supported in the future.
-```
+:::
 
 ## Set up using Docker
 
diff --git a/docs/source/getting_started/installation/index.md b/docs/source/getting_started/installation/index.md
index bc1d268bf0c7e..0f5e013ce071a 100644
--- a/docs/source/getting_started/installation/index.md
+++ b/docs/source/getting_started/installation/index.md
@@ -4,10 +4,10 @@
 
 vLLM supports the following hardware platforms:
 
-```{toctree}
+:::{toctree}
 :maxdepth: 1
 
 gpu/index
 cpu/index
 ai_accelerator/index
-```
+:::
diff --git a/docs/source/getting_started/installation/python_env_setup.inc.md b/docs/source/getting_started/installation/python_env_setup.inc.md
index 25cfac5f58aa7..cb73914c9c75e 100644
--- a/docs/source/getting_started/installation/python_env_setup.inc.md
+++ b/docs/source/getting_started/installation/python_env_setup.inc.md
@@ -6,9 +6,9 @@ conda create -n myenv python=3.12 -y
 conda activate myenv
 ```
 
-```{note}
+:::{note}
 [PyTorch has deprecated the conda release channel](https://github.com/pytorch/pytorch/issues/138506). If you use `conda`, please only use it to create Python environment rather than installing packages.
-```
+:::
 
 Or you can create a new Python environment using [uv](https://docs.astral.sh/uv/), a very fast Python environment manager. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment using the following command:
 
diff --git a/docs/source/getting_started/quickstart.md b/docs/source/getting_started/quickstart.md
index 8ac80e5e5c553..f4682ee45a48e 100644
--- a/docs/source/getting_started/quickstart.md
+++ b/docs/source/getting_started/quickstart.md
@@ -32,9 +32,9 @@ conda activate myenv
 pip install vllm
 ```
 
-```{note}
+:::{note}
 For non-CUDA platforms, please refer [here](#installation-index) for specific instructions on how to install vLLM.
-```
+:::
 
 (quickstart-offline)=
 
@@ -69,9 +69,9 @@ The {class}`~vllm.LLM` class initializes vLLM's engine and the [OPT-125M model](
 llm = LLM(model="facebook/opt-125m")
 ```
 
-```{note}
+:::{note}
 By default, vLLM downloads models from [HuggingFace](https://huggingface.co/). If you would like to use models from [ModelScope](https://www.modelscope.cn), set the environment variable `VLLM_USE_MODELSCOPE` before initializing the engine.
-```
+:::
 
 Now, the fun part! The outputs are generated using `llm.generate`. It adds the input prompts to the vLLM engine's waiting queue and executes the vLLM engine to generate the outputs with high throughput. The outputs are returned as a list of `RequestOutput` objects, which include all of the output tokens.
 
@@ -97,10 +97,10 @@ Run the following command to start the vLLM server with the [Qwen2.5-1.5B-Instru
 vllm serve Qwen/Qwen2.5-1.5B-Instruct
 ```
 
-```{note}
+:::{note}
 By default, the server uses a predefined chat template stored in the tokenizer.
 You can learn about overriding it [here](#chat-template).
-```
+:::
 
 This server can be queried in the same format as OpenAI API. For example, to list the models:
 
diff --git a/docs/source/getting_started/troubleshooting.md b/docs/source/getting_started/troubleshooting.md
index 7bfe9b4036adf..2f41fa3b6b19e 100644
--- a/docs/source/getting_started/troubleshooting.md
+++ b/docs/source/getting_started/troubleshooting.md
@@ -4,9 +4,9 @@
 
 This document outlines some troubleshooting strategies you can consider. If you think you've discovered a bug, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible.
 
-```{note}
+:::{note}
 Once you've debugged a problem, remember to turn off any debugging environment variables defined, or simply start a new shell to avoid being affected by lingering debugging settings. Otherwise, the system might be slow with debugging functionalities left activated.
-```
+:::
 
 ## Hangs downloading a model
 
@@ -18,9 +18,9 @@ It's recommended to download the model first using the [huggingface-cli](https:/
 If the model is large, it can take a long time to load it from disk. Pay attention to where you store the model. Some clusters have shared filesystems across nodes, e.g. a distributed filesystem or a network filesystem, which can be slow.
 It'd be better to store the model in a local disk. Additionally, have a look at the CPU memory usage, when the model is too large it might take a lot of CPU memory, slowing down the operating system because it needs to frequently swap between disk and memory.
 
-```{note}
+:::{note}
 To isolate the model downloading and loading issue, you can use the `--load-format dummy` argument to skip loading the model weights. This way, you can check if the model downloading and loading is the bottleneck.
-```
+:::
 
 ## Out of memory
 
@@ -132,14 +132,14 @@ If the script runs successfully, you should see the message `sanity check is suc
 
 If the test script hangs or crashes, usually it means the hardware/drivers are broken in some sense. You should try to contact your system administrator or hardware vendor for further assistance. As a common workaround, you can try to tune some NCCL environment variables, such as `export NCCL_P2P_DISABLE=1` to see if it helps. Please check [their documentation](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html) for more information. Please only use these environment variables as a temporary workaround, as they might affect the performance of the system. The best solution is still to fix the hardware/drivers so that the test script can run successfully.
 
-```{note}
+:::{note}
 A multi-node environment is more complicated than a single-node one. If you see errors such as `torch.distributed.DistNetworkError`, it is likely that the network/DNS setup is incorrect. In that case, you can manually assign node rank and specify the IP via command line arguments:
 
 - In the first node, run `NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --node-rank 0 --master_addr $MASTER_ADDR test.py`.
 - In the second node, run `NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --node-rank 1 --master_addr $MASTER_ADDR test.py`.
 
 Adjust `--nproc-per-node`, `--nnodes`, and `--node-rank` according to your setup, being sure to execute different commands (with different `--node-rank`) on different nodes.
-```
+:::
 
 (troubleshooting-python-multiprocessing)=
 
diff --git a/docs/source/index.md b/docs/source/index.md
index 6957d5dd0f2e7..e90e81c72860a 100644
--- a/docs/source/index.md
+++ b/docs/source/index.md
@@ -1,13 +1,13 @@
 # Welcome to vLLM
 
-```{figure} ./assets/logos/vllm-logo-text-light.png
+:::{figure} ./assets/logos/vllm-logo-text-light.png
 :align: center
 :alt: vLLM
 :class: no-scaled-link
 :width: 60%
-```
+:::
 
-```{raw} html
+:::{raw} html
 <p style="text-align:center">
 <strong>Easy, fast, and cheap LLM serving for everyone
 </strong>
@@ -19,7 +19,7 @@
 <a class="github-button" href="https://github.com/vllm-project/vllm/subscription" data-icon="octicon-eye" data-size="large" aria-label="Watch">Watch</a>
 <a class="github-button" href="https://github.com/vllm-project/vllm/fork" data-icon="octicon-repo-forked" data-size="large" aria-label="Fork">Fork</a>
 </p>
-```
+:::
 
 vLLM is a fast and easy-to-use library for LLM inference and serving.
 
@@ -58,7 +58,7 @@ For more information, check out the following:
 
 % How to start using vLLM?
 
-```{toctree}
+:::{toctree}
 :caption: Getting Started
 :maxdepth: 1
 
@@ -67,11 +67,11 @@ getting_started/quickstart
 getting_started/examples/examples_index
 getting_started/troubleshooting
 getting_started/faq
-```
+:::
 
 % What does vLLM support?
 
-```{toctree}
+:::{toctree}
 :caption: Models
 :maxdepth: 1
 
@@ -79,11 +79,11 @@ models/generative_models
 models/pooling_models
 models/supported_models
 models/extensions/index
-```
+:::
 
 % Additional capabilities
 
-```{toctree}
+:::{toctree}
 :caption: Features
 :maxdepth: 1
 
@@ -96,11 +96,11 @@ features/automatic_prefix_caching
 features/disagg_prefill
 features/spec_decode
 features/compatibility_matrix
-```
+:::
 
 % Details about running vLLM
 
-```{toctree}
+:::{toctree}
 :caption: Inference and Serving
 :maxdepth: 1
 
@@ -113,11 +113,11 @@ serving/engine_args
 serving/env_vars
 serving/usage_stats
 serving/integrations/index
-```
+:::
 
 % Scaling up vLLM for production
 
-```{toctree}
+:::{toctree}
 :caption: Deployment
 :maxdepth: 1
 
@@ -126,21 +126,21 @@ deployment/k8s
 deployment/nginx
 deployment/frameworks/index
 deployment/integrations/index
-```
+:::
 
 % Making the most out of vLLM
 
-```{toctree}
+:::{toctree}
 :caption: Performance
 :maxdepth: 1
 
 performance/optimization
 performance/benchmarks
-```
+:::
 
 % Explanation of vLLM internals
 
-```{toctree}
+:::{toctree}
 :caption: Design Documents
 :maxdepth: 2
 
@@ -151,11 +151,11 @@ design/kernel/paged_attention
 design/mm_processing
 design/automatic_prefix_caching
 design/multiprocessing
-```
+:::
 
 % How to contribute to the vLLM project
 
-```{toctree}
+:::{toctree}
 :caption: Developer Guide
 :maxdepth: 2
 
@@ -164,11 +164,11 @@ contributing/profiling/profiling_index
 contributing/dockerfile/dockerfile
 contributing/model/index
 contributing/vulnerability_management
-```
+:::
 
 % Technical API specifications
 
-```{toctree}
+:::{toctree}
 :caption: API Reference
 :maxdepth: 2
 
@@ -177,18 +177,18 @@ api/engine/index
 api/inference_params
 api/multimodal/index
 api/model/index
-```
+:::
 
 % Latest news and acknowledgements
 
-```{toctree}
+:::{toctree}
 :caption: Community
 :maxdepth: 1
 
 community/blog
 community/meetups
 community/sponsors
-```
+:::
 
 ## Indices and tables
 
diff --git a/docs/source/models/extensions/index.md b/docs/source/models/extensions/index.md
index cff09d12eba47..69faf472e5300 100644
--- a/docs/source/models/extensions/index.md
+++ b/docs/source/models/extensions/index.md
@@ -1,8 +1,8 @@
 # Built-in Extensions
 
-```{toctree}
+:::{toctree}
 :maxdepth: 1
 
 runai_model_streamer
 tensorizer
-```
+:::
diff --git a/docs/source/models/extensions/runai_model_streamer.md b/docs/source/models/extensions/runai_model_streamer.md
index 75f7a9fcad416..99c37876a01b3 100644
--- a/docs/source/models/extensions/runai_model_streamer.md
+++ b/docs/source/models/extensions/runai_model_streamer.md
@@ -48,6 +48,6 @@ You can read further about CPU buffer memory limiting [here](https://github.com/
 vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer --model-loader-extra-config '{"memory_limit":5368709120}'
 ```
 
-```{note}
+:::{note}
 For further instructions about tunable parameters and additional parameters configurable through environment variables, read the [Environment Variables Documentation](https://github.com/run-ai/runai-model-streamer/blob/master/docs/src/env-vars.md).
-```
+:::
diff --git a/docs/source/models/extensions/tensorizer.md b/docs/source/models/extensions/tensorizer.md
index ae17e3437bca6..830c579d91bae 100644
--- a/docs/source/models/extensions/tensorizer.md
+++ b/docs/source/models/extensions/tensorizer.md
@@ -11,6 +11,6 @@ For more information on CoreWeave's Tensorizer, please refer to
 [CoreWeave's Tensorizer documentation](https://github.com/coreweave/tensorizer). For more information on serializing a vLLM model, as well a general usage guide to using Tensorizer with vLLM, see
 the [vLLM example script](https://docs.vllm.ai/en/stable/getting_started/examples/offline_inference/tensorize_vllm_model.html).
 
-```{note}
+:::{note}
 Note that to use this feature you will need to install `tensorizer` by running `pip install vllm[tensorizer]`.
-```
+:::
diff --git a/docs/source/models/generative_models.md b/docs/source/models/generative_models.md
index e4b4cd03a90d2..4abe6b776eea3 100644
--- a/docs/source/models/generative_models.md
+++ b/docs/source/models/generative_models.md
@@ -70,10 +70,10 @@ The {class}`~vllm.LLM.chat` method implements chat functionality on top of {clas
 In particular, it accepts input similar to [OpenAI Chat Completions API](https://platform.openai.com/docs/api-reference/chat)
 and automatically applies the model's [chat template](https://huggingface.co/docs/transformers/en/chat_templating) to format the prompt.
 
-```{important}
+:::{important}
 In general, only instruction-tuned models have a chat template.
 Base models may perform poorly as they are not trained to respond to the chat conversation.
-```
+:::
 
 ```python
 llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct")
diff --git a/docs/source/models/pooling_models.md b/docs/source/models/pooling_models.md
index 91db694be29a4..9704ccee745c4 100644
--- a/docs/source/models/pooling_models.md
+++ b/docs/source/models/pooling_models.md
@@ -8,54 +8,54 @@ In vLLM, pooling models implement the {class}`~vllm.model_executor.models.VllmMo
 These models use a {class}`~vllm.model_executor.layers.Pooler` to extract the final hidden states of the input
 before returning them.
 
-```{note}
+:::{note}
 We currently support pooling models primarily as a matter of convenience.
 As shown in the [Compatibility Matrix](#compatibility-matrix), most vLLM features are not applicable to
 pooling models as they only work on the generation or decode stage, so performance may not improve as much.
-```
+:::
 
 For pooling models, we support the following `--task` options.
 The selected option sets the default pooler used to extract the final hidden states:
 
-```{list-table}
+:::{list-table}
 :widths: 50 25 25 25
 :header-rows: 1
 
-* - Task
-  - Pooling Type
-  - Normalization
-  - Softmax
-* - Embedding (`embed`)
-  - `LAST`
-  - ✅︎
-  - ✗
-* - Classification (`classify`)
-  - `LAST`
-  - ✗
-  - ✅︎
-* - Sentence Pair Scoring (`score`)
-  - \*
-  - \*
-  - \*
-* - Reward Modeling (`reward`)
-  - `ALL`
-  - ✗
-  - ✗
-```
+- * Task
+  * Pooling Type
+  * Normalization
+  * Softmax
+- * Embedding (`embed`)
+  * `LAST`
+  * ✅︎
+  * ✗
+- * Classification (`classify`)
+  * `LAST`
+  * ✗
+  * ✅︎
+- * Sentence Pair Scoring (`score`)
+  * \*
+  * \*
+  * \*
+- * Reward Modeling (`reward`)
+  * `ALL`
+  * ✗
+  * ✗
+:::
 
 \*The default pooler is always defined by the model.
 
-```{note}
+:::{note}
 If the model's implementation in vLLM defines its own pooler, the default pooler is set to that instead of the one specified in this table.
-```
+:::
 
 When loading [Sentence Transformers](https://huggingface.co/sentence-transformers) models,
 we attempt to override the default pooler based on its Sentence Transformers configuration file (`modules.json`).
 
-```{tip}
+:::{tip}
 You can customize the model's pooling method via the `--override-pooler-config` option,
 which takes priority over both the model's and Sentence Transformers's defaults.
-```
+:::
 
 ## Offline Inference
 
@@ -111,10 +111,10 @@ The {class}`~vllm.LLM.score` method outputs similarity scores between sentence p
 It is primarily designed for [cross-encoder models](https://www.sbert.net/examples/applications/cross-encoder/README.html).
 These types of models serve as rerankers between candidate query-document pairs in RAG systems.
 
-```{note}
+:::{note}
 vLLM can only perform the model inference component (e.g. embedding, reranking) of RAG.
 To handle RAG at a higher level, you should use integration frameworks such as [LangChain](https://github.com/langchain-ai/langchain).
-```
+:::
 
 ```python
 llm = LLM(model="BAAI/bge-reranker-v2-m3", task="score")
diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md
index e59150cdd3b83..94f4bd6cadabd 100644
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -17,7 +17,7 @@ By default, vLLM loads models from [HuggingFace (HF) Hub](https://huggingface.co
 To determine whether a given model is supported, you can check the `config.json` file inside the HF repository.
 If the `"architectures"` field contains a model architecture listed below, then it should be supported in theory.
 
-````{tip}
+:::{tip}
 The easiest way to check if your model is really supported at runtime is to run the program below:
 
 ```python
@@ -35,7 +35,7 @@ print(output)
 ```
 
 If vLLM successfully returns text (for generative models) or hidden states (for pooling models), it indicates that your model is supported.
-````
+:::
 
 Otherwise, please refer to [Adding a New Model](#new-model) for instructions on how to implement your model in vLLM.
 Alternatively, you can [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) to request vLLM support.
@@ -72,364 +72,364 @@ See [this page](#generative-models) for more information on how to use generativ
 
 #### Text Generation (`--task generate`)
 
-```{list-table}
+:::{list-table}
 :widths: 25 25 50 5 5
 :header-rows: 1
 
-* - Architecture
-  - Models
-  - Example HF Models
-  - [LoRA](#lora-adapter)
-  - [PP](#distributed-serving)
-* - `AquilaForCausalLM`
-  - Aquila, Aquila2
-  - `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc.
-  - ✅︎
-  - ✅︎
-* - `ArcticForCausalLM`
-  - Arctic
-  - `Snowflake/snowflake-arctic-base`, `Snowflake/snowflake-arctic-instruct`, etc.
-  -
-  - ✅︎
-* - `BaiChuanForCausalLM`
-  - Baichuan2, Baichuan
-  - `baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc.
-  - ✅︎
-  - ✅︎
-* - `BloomForCausalLM`
-  - BLOOM, BLOOMZ, BLOOMChat
-  - `bigscience/bloom`, `bigscience/bloomz`, etc.
-  -
-  - ✅︎
-* - `BartForConditionalGeneration`
-  - BART
-  - `facebook/bart-base`, `facebook/bart-large-cnn`, etc.
-  -
-  -
-* - `ChatGLMModel`
-  - ChatGLM
-  - `THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, etc.
-  - ✅︎
-  - ✅︎
-* - `CohereForCausalLM`, `Cohere2ForCausalLM`
-  - Command-R
-  - `CohereForAI/c4ai-command-r-v01`, `CohereForAI/c4ai-command-r7b-12-2024`, etc.
-  - ✅︎
-  - ✅︎
-* - `DbrxForCausalLM`
-  - DBRX
-  - `databricks/dbrx-base`, `databricks/dbrx-instruct`, etc.
-  -
-  - ✅︎
-* - `DeciLMForCausalLM`
-  - DeciLM
-  - `Deci/DeciLM-7B`, `Deci/DeciLM-7B-instruct`, etc.
-  -
-  - ✅︎
-* - `DeepseekForCausalLM`
-  - DeepSeek
-  - `deepseek-ai/deepseek-llm-67b-base`, `deepseek-ai/deepseek-llm-7b-chat` etc.
-  -
-  - ✅︎
-* - `DeepseekV2ForCausalLM`
-  - DeepSeek-V2
-  - `deepseek-ai/DeepSeek-V2`, `deepseek-ai/DeepSeek-V2-Chat` etc.
-  -
-  - ✅︎
-* - `DeepseekV3ForCausalLM`
-  - DeepSeek-V3
-  - `deepseek-ai/DeepSeek-V3-Base`, `deepseek-ai/DeepSeek-V3` etc.
-  -
-  - ✅︎
-* - `ExaoneForCausalLM`
-  - EXAONE-3
-  - `LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct`, etc.
-  - ✅︎
-  - ✅︎
-* - `FalconForCausalLM`
-  - Falcon
-  - `tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc.
-  -
-  - ✅︎
-* - `FalconMambaForCausalLM`
-  - FalconMamba
-  - `tiiuae/falcon-mamba-7b`, `tiiuae/falcon-mamba-7b-instruct`, etc.
-  - ✅︎
-  - ✅︎
-* - `GemmaForCausalLM`
-  - Gemma
-  - `google/gemma-2b`, `google/gemma-7b`, etc.
-  - ✅︎
-  - ✅︎
-* - `Gemma2ForCausalLM`
-  - Gemma2
-  - `google/gemma-2-9b`, `google/gemma-2-27b`, etc.
-  - ✅︎
-  - ✅︎
-* - `GlmForCausalLM`
-  - GLM-4
-  - `THUDM/glm-4-9b-chat-hf`, etc.
-  - ✅︎
-  - ✅︎
-* - `GPT2LMHeadModel`
-  - GPT-2
-  - `gpt2`, `gpt2-xl`, etc.
-  -
-  - ✅︎
-* - `GPTBigCodeForCausalLM`
-  - StarCoder, SantaCoder, WizardCoder
-  - `bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, `WizardLM/WizardCoder-15B-V1.0`, etc.
-  - ✅︎
-  - ✅︎
-* - `GPTJForCausalLM`
-  - GPT-J
-  - `EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc.
-  -
-  - ✅︎
-* - `GPTNeoXForCausalLM`
-  - GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM
-  - `EleutherAI/gpt-neox-20b`, `EleutherAI/pythia-12b`, `OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc.
-  -
-  - ✅︎
-* - `GraniteForCausalLM`
-  - Granite 3.0, Granite 3.1, PowerLM
-  - `ibm-granite/granite-3.0-2b-base`, `ibm-granite/granite-3.1-8b-instruct`, `ibm/PowerLM-3b`, etc.
-  - ✅︎
-  - ✅︎
-* - `GraniteMoeForCausalLM`
-  - Granite 3.0 MoE, PowerMoE
-  - `ibm-granite/granite-3.0-1b-a400m-base`, `ibm-granite/granite-3.0-3b-a800m-instruct`, `ibm/PowerMoE-3b`, etc.
-  - ✅︎
-  - ✅︎
-* - `GritLM`
-  - GritLM
-  - `parasail-ai/GritLM-7B-vllm`.
-  - ✅︎
-  - ✅︎
-* - `InternLMForCausalLM`
-  - InternLM
-  - `internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc.
-  - ✅︎
-  - ✅︎
-* - `InternLM2ForCausalLM`
-  - InternLM2
-  - `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc.
-  - ✅︎
-  - ✅︎
-* - `InternLM3ForCausalLM`
-  - InternLM3
-  - `internlm/internlm3-8b-instruct`, etc.
-  - ✅︎
-  - ✅︎
-* - `JAISLMHeadModel`
-  - Jais
-  - `inceptionai/jais-13b`, `inceptionai/jais-13b-chat`, `inceptionai/jais-30b-v3`, `inceptionai/jais-30b-chat-v3`, etc.
-  -
-  - ✅︎
-* - `JambaForCausalLM`
-  - Jamba
-  - `ai21labs/AI21-Jamba-1.5-Large`, `ai21labs/AI21-Jamba-1.5-Mini`, `ai21labs/Jamba-v0.1`, etc.
-  - ✅︎
-  - ✅︎
-* - `LlamaForCausalLM`
-  - Llama 3.1, Llama 3, Llama 2, LLaMA, Yi
-  - `meta-llama/Meta-Llama-3.1-405B-Instruct`, `meta-llama/Meta-Llama-3.1-70B`, `meta-llama/Meta-Llama-3-70B-Instruct`, `meta-llama/Llama-2-70b-hf`, `01-ai/Yi-34B`, etc.
-  - ✅︎
-  - ✅︎
-* - `MambaForCausalLM`
-  - Mamba
-  - `state-spaces/mamba-130m-hf`, `state-spaces/mamba-790m-hf`, `state-spaces/mamba-2.8b-hf`, etc.
-  -
-  - ✅︎
-* - `MiniCPMForCausalLM`
-  - MiniCPM
-  - `openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, `openbmb/MiniCPM-S-1B-sft`, etc.
-  - ✅︎
-  - ✅︎
-* - `MiniCPM3ForCausalLM`
-  - MiniCPM3
-  - `openbmb/MiniCPM3-4B`, etc.
-  - ✅︎
-  - ✅︎
-* - `MistralForCausalLM`
-  - Mistral, Mistral-Instruct
-  - `mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc.
-  - ✅︎
-  - ✅︎
-* - `MixtralForCausalLM`
-  - Mixtral-8x7B, Mixtral-8x7B-Instruct
-  - `mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc.
-  - ✅︎
-  - ✅︎
-* - `MPTForCausalLM`
-  - MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter
-  - `mosaicml/mpt-7b`, `mosaicml/mpt-7b-storywriter`, `mosaicml/mpt-30b`, etc.
-  -
-  - ✅︎
-* - `NemotronForCausalLM`
-  - Nemotron-3, Nemotron-4, Minitron
-  - `nvidia/Minitron-8B-Base`, `mgoin/Nemotron-4-340B-Base-hf-FP8`, etc.
-  - ✅︎
-  - ✅︎
-* - `OLMoForCausalLM`
-  - OLMo
-  - `allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc.
-  -
-  - ✅︎
-* - `OLMo2ForCausalLM`
-  - OLMo2
-  - `allenai/OLMo2-7B-1124`, etc.
-  -
-  - ✅︎
-* - `OLMoEForCausalLM`
-  - OLMoE
-  - `allenai/OLMoE-1B-7B-0924`, `allenai/OLMoE-1B-7B-0924-Instruct`, etc.
-  - ✅︎
-  - ✅︎
-* - `OPTForCausalLM`
-  - OPT, OPT-IML
-  - `facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc.
-  -
-  - ✅︎
-* - `OrionForCausalLM`
-  - Orion
-  - `OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc.
-  -
-  - ✅︎
-* - `PhiForCausalLM`
-  - Phi
-  - `microsoft/phi-1_5`, `microsoft/phi-2`, etc.
-  - ✅︎
-  - ✅︎
-* - `Phi3ForCausalLM`
-  - Phi-4, Phi-3
-  - `microsoft/Phi-4`, `microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, `microsoft/Phi-3-medium-128k-instruct`, etc.
-  - ✅︎
-  - ✅︎
-* - `Phi3SmallForCausalLM`
-  - Phi-3-Small
-  - `microsoft/Phi-3-small-8k-instruct`, `microsoft/Phi-3-small-128k-instruct`, etc.
-  -
-  - ✅︎
-* - `PhiMoEForCausalLM`
-  - Phi-3.5-MoE
-  - `microsoft/Phi-3.5-MoE-instruct`, etc.
-  - ✅︎
-  - ✅︎
-* - `PersimmonForCausalLM`
-  - Persimmon
-  - `adept/persimmon-8b-base`, `adept/persimmon-8b-chat`, etc.
-  -
-  - ✅︎
-* - `QWenLMHeadModel`
-  - Qwen
-  - `Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc.
-  - ✅︎
-  - ✅︎
-* - `Qwen2ForCausalLM`
-  - QwQ, Qwen2
-  - `Qwen/QwQ-32B-Preview`, `Qwen/Qwen2-7B-Instruct`, `Qwen/Qwen2-7B`, etc.
-  - ✅︎
-  - ✅︎
-* - `Qwen2MoeForCausalLM`
-  - Qwen2MoE
-  - `Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc.
-  -
-  - ✅︎
-* - `StableLmForCausalLM`
-  - StableLM
-  - `stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc.
-  -
-  - ✅︎
-* - `Starcoder2ForCausalLM`
-  - Starcoder2
-  - `bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc.
-  -
-  - ✅︎
-* - `SolarForCausalLM`
-  - Solar Pro
-  - `upstage/solar-pro-preview-instruct`, etc.
-  - ✅︎
-  - ✅︎
-* - `TeleChat2ForCausalLM`
-  - TeleChat2
-  - `TeleAI/TeleChat2-3B`, `TeleAI/TeleChat2-7B`, `TeleAI/TeleChat2-35B`, etc.
-  - ✅︎
-  - ✅︎
-* - `XverseForCausalLM`
-  - XVERSE
-  - `xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc.
-  - ✅︎
-  - ✅︎
-```
-
-```{note}
+- * Architecture
+  * Models
+  * Example HF Models
+  * [LoRA](#lora-adapter)
+  * [PP](#distributed-serving)
+- * `AquilaForCausalLM`
+  * Aquila, Aquila2
+  * `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc.
+  * ✅︎
+  * ✅︎
+- * `ArcticForCausalLM`
+  * Arctic
+  * `Snowflake/snowflake-arctic-base`, `Snowflake/snowflake-arctic-instruct`, etc.
+  *
+  * ✅︎
+- * `BaiChuanForCausalLM`
+  * Baichuan2, Baichuan
+  * `baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc.
+  * ✅︎
+  * ✅︎
+- * `BloomForCausalLM`
+  * BLOOM, BLOOMZ, BLOOMChat
+  * `bigscience/bloom`, `bigscience/bloomz`, etc.
+  *
+  * ✅︎
+- * `BartForConditionalGeneration`
+  * BART
+  * `facebook/bart-base`, `facebook/bart-large-cnn`, etc.
+  *
+  *
+- * `ChatGLMModel`
+  * ChatGLM
+  * `THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, etc.
+  * ✅︎
+  * ✅︎
+- * `CohereForCausalLM`, `Cohere2ForCausalLM`
+  * Command-R
+  * `CohereForAI/c4ai-command-r-v01`, `CohereForAI/c4ai-command-r7b-12-2024`, etc.
+  * ✅︎
+  * ✅︎
+- * `DbrxForCausalLM`
+  * DBRX
+  * `databricks/dbrx-base`, `databricks/dbrx-instruct`, etc.
+  *
+  * ✅︎
+- * `DeciLMForCausalLM`
+  * DeciLM
+  * `Deci/DeciLM-7B`, `Deci/DeciLM-7B-instruct`, etc.
+  *
+  * ✅︎
+- * `DeepseekForCausalLM`
+  * DeepSeek
+  * `deepseek-ai/deepseek-llm-67b-base`, `deepseek-ai/deepseek-llm-7b-chat` etc.
+  *
+  * ✅︎
+- * `DeepseekV2ForCausalLM`
+  * DeepSeek-V2
+  * `deepseek-ai/DeepSeek-V2`, `deepseek-ai/DeepSeek-V2-Chat` etc.
+  *
+  * ✅︎
+- * `DeepseekV3ForCausalLM`
+  * DeepSeek-V3
+  * `deepseek-ai/DeepSeek-V3-Base`, `deepseek-ai/DeepSeek-V3` etc.
+  *
+  * ✅︎
+- * `ExaoneForCausalLM`
+  * EXAONE-3
+  * `LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct`, etc.
+  * ✅︎
+  * ✅︎
+- * `FalconForCausalLM`
+  * Falcon
+  * `tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc.
+  *
+  * ✅︎
+- * `FalconMambaForCausalLM`
+  * FalconMamba
+  * `tiiuae/falcon-mamba-7b`, `tiiuae/falcon-mamba-7b-instruct`, etc.
+  * ✅︎
+  * ✅︎
+- * `GemmaForCausalLM`
+  * Gemma
+  * `google/gemma-2b`, `google/gemma-7b`, etc.
+  * ✅︎
+  * ✅︎
+- * `Gemma2ForCausalLM`
+  * Gemma2
+  * `google/gemma-2-9b`, `google/gemma-2-27b`, etc.
+  * ✅︎
+  * ✅︎
+- * `GlmForCausalLM`
+  * GLM-4
+  * `THUDM/glm-4-9b-chat-hf`, etc.
+  * ✅︎
+  * ✅︎
+- * `GPT2LMHeadModel`
+  * GPT-2
+  * `gpt2`, `gpt2-xl`, etc.
+  *
+  * ✅︎
+- * `GPTBigCodeForCausalLM`
+  * StarCoder, SantaCoder, WizardCoder
+  * `bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, `WizardLM/WizardCoder-15B-V1.0`, etc.
+  * ✅︎
+  * ✅︎
+- * `GPTJForCausalLM`
+  * GPT-J
+  * `EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc.
+  *
+  * ✅︎
+- * `GPTNeoXForCausalLM`
+  * GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM
+  * `EleutherAI/gpt-neox-20b`, `EleutherAI/pythia-12b`, `OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc.
+  *
+  * ✅︎
+- * `GraniteForCausalLM`
+  * Granite 3.0, Granite 3.1, PowerLM
+  * `ibm-granite/granite-3.0-2b-base`, `ibm-granite/granite-3.1-8b-instruct`, `ibm/PowerLM-3b`, etc.
+  * ✅︎
+  * ✅︎
+- * `GraniteMoeForCausalLM`
+  * Granite 3.0 MoE, PowerMoE
+  * `ibm-granite/granite-3.0-1b-a400m-base`, `ibm-granite/granite-3.0-3b-a800m-instruct`, `ibm/PowerMoE-3b`, etc.
+  * ✅︎
+  * ✅︎
+- * `GritLM`
+  * GritLM
+  * `parasail-ai/GritLM-7B-vllm`.
+  * ✅︎
+  * ✅︎
+- * `InternLMForCausalLM`
+  * InternLM
+  * `internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc.
+  * ✅︎
+  * ✅︎
+- * `InternLM2ForCausalLM`
+  * InternLM2
+  * `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc.
+  * ✅︎
+  * ✅︎
+- * `InternLM3ForCausalLM`
+  * InternLM3
+  * `internlm/internlm3-8b-instruct`, etc.
+  * ✅︎
+  * ✅︎
+- * `JAISLMHeadModel`
+  * Jais
+  * `inceptionai/jais-13b`, `inceptionai/jais-13b-chat`, `inceptionai/jais-30b-v3`, `inceptionai/jais-30b-chat-v3`, etc.
+  *
+  * ✅︎
+- * `JambaForCausalLM`
+  * Jamba
+  * `ai21labs/AI21-Jamba-1.5-Large`, `ai21labs/AI21-Jamba-1.5-Mini`, `ai21labs/Jamba-v0.1`, etc.
+  * ✅︎
+  * ✅︎
+- * `LlamaForCausalLM`
+  * Llama 3.1, Llama 3, Llama 2, LLaMA, Yi
+  * `meta-llama/Meta-Llama-3.1-405B-Instruct`, `meta-llama/Meta-Llama-3.1-70B`, `meta-llama/Meta-Llama-3-70B-Instruct`, `meta-llama/Llama-2-70b-hf`, `01-ai/Yi-34B`, etc.
+  * ✅︎
+  * ✅︎
+- * `MambaForCausalLM`
+  * Mamba
+  * `state-spaces/mamba-130m-hf`, `state-spaces/mamba-790m-hf`, `state-spaces/mamba-2.8b-hf`, etc.
+  *
+  * ✅︎
+- * `MiniCPMForCausalLM`
+  * MiniCPM
+  * `openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, `openbmb/MiniCPM-S-1B-sft`, etc.
+  * ✅︎
+  * ✅︎
+- * `MiniCPM3ForCausalLM`
+  * MiniCPM3
+  * `openbmb/MiniCPM3-4B`, etc.
+  * ✅︎
+  * ✅︎
+- * `MistralForCausalLM`
+  * Mistral, Mistral-Instruct
+  * `mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc.
+  * ✅︎
+  * ✅︎
+- * `MixtralForCausalLM`
+  * Mixtral-8x7B, Mixtral-8x7B-Instruct
+  * `mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc.
+  * ✅︎
+  * ✅︎
+- * `MPTForCausalLM`
+  * MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter
+  * `mosaicml/mpt-7b`, `mosaicml/mpt-7b-storywriter`, `mosaicml/mpt-30b`, etc.
+  *
+  * ✅︎
+- * `NemotronForCausalLM`
+  * Nemotron-3, Nemotron-4, Minitron
+  * `nvidia/Minitron-8B-Base`, `mgoin/Nemotron-4-340B-Base-hf-FP8`, etc.
+  * ✅︎
+  * ✅︎
+- * `OLMoForCausalLM`
+  * OLMo
+  * `allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc.
+  *
+  * ✅︎
+- * `OLMo2ForCausalLM`
+  * OLMo2
+  * `allenai/OLMo2-7B-1124`, etc.
+  *
+  * ✅︎
+- * `OLMoEForCausalLM`
+  * OLMoE
+  * `allenai/OLMoE-1B-7B-0924`, `allenai/OLMoE-1B-7B-0924-Instruct`, etc.
+  * ✅︎
+  * ✅︎
+- * `OPTForCausalLM`
+  * OPT, OPT-IML
+  * `facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc.
+  *
+  * ✅︎
+- * `OrionForCausalLM`
+  * Orion
+  * `OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc.
+  *
+  * ✅︎
+- * `PhiForCausalLM`
+  * Phi
+  * `microsoft/phi-1_5`, `microsoft/phi-2`, etc.
+  * ✅︎
+  * ✅︎
+- * `Phi3ForCausalLM`
+  * Phi-4, Phi-3
+  * `microsoft/Phi-4`, `microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, `microsoft/Phi-3-medium-128k-instruct`, etc.
+  * ✅︎
+  * ✅︎
+- * `Phi3SmallForCausalLM`
+  * Phi-3-Small
+  * `microsoft/Phi-3-small-8k-instruct`, `microsoft/Phi-3-small-128k-instruct`, etc.
+  *
+  * ✅︎
+- * `PhiMoEForCausalLM`
+  * Phi-3.5-MoE
+  * `microsoft/Phi-3.5-MoE-instruct`, etc.
+  * ✅︎
+  * ✅︎
+- * `PersimmonForCausalLM`
+  * Persimmon
+  * `adept/persimmon-8b-base`, `adept/persimmon-8b-chat`, etc.
+  *
+  * ✅︎
+- * `QWenLMHeadModel`
+  * Qwen
+  * `Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc.
+  * ✅︎
+  * ✅︎
+- * `Qwen2ForCausalLM`
+  * QwQ, Qwen2
+  * `Qwen/QwQ-32B-Preview`, `Qwen/Qwen2-7B-Instruct`, `Qwen/Qwen2-7B`, etc.
+  * ✅︎
+  * ✅︎
+- * `Qwen2MoeForCausalLM`
+  * Qwen2MoE
+  * `Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc.
+  *
+  * ✅︎
+- * `StableLmForCausalLM`
+  * StableLM
+  * `stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc.
+  *
+  * ✅︎
+- * `Starcoder2ForCausalLM`
+  * Starcoder2
+  * `bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc.
+  *
+  * ✅︎
+- * `SolarForCausalLM`
+  * Solar Pro
+  * `upstage/solar-pro-preview-instruct`, etc.
+  * ✅︎
+  * ✅︎
+- * `TeleChat2ForCausalLM`
+  * TeleChat2
+  * `TeleAI/TeleChat2-3B`, `TeleAI/TeleChat2-7B`, `TeleAI/TeleChat2-35B`, etc.
+  * ✅︎
+  * ✅︎
+- * `XverseForCausalLM`
+  * XVERSE
+  * `xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc.
+  * ✅︎
+  * ✅︎
+:::
+
+:::{note}
 Currently, the ROCm version of vLLM supports Mistral and Mixtral only for context lengths up to 4096.
-```
+:::
 
 ### Pooling Models
 
 See [this page](pooling-models) for more information on how to use pooling models.
 
-```{important}
+:::{important}
 Since some model architectures support both generative and pooling tasks,
 you should explicitly specify the task type to ensure that the model is used in pooling mode instead of generative mode.
-```
+:::
 
 #### Text Embedding (`--task embed`)
 
-```{list-table}
+:::{list-table}
 :widths: 25 25 50 5 5
 :header-rows: 1
 
-* - Architecture
-  - Models
-  - Example HF Models
-  - [LoRA](#lora-adapter)
-  - [PP](#distributed-serving)
-* - `BertModel`
-  - BERT-based
-  - `BAAI/bge-base-en-v1.5`, etc.
-  -
-  -
-* - `Gemma2Model`
-  - Gemma2-based
-  - `BAAI/bge-multilingual-gemma2`, etc.
-  -
-  - ✅︎
-* - `GritLM`
-  - GritLM
-  - `parasail-ai/GritLM-7B-vllm`.
-  - ✅︎
-  - ✅︎
-* - `LlamaModel`, `LlamaForCausalLM`, `MistralModel`, etc.
-  - Llama-based
-  - `intfloat/e5-mistral-7b-instruct`, etc.
-  - ✅︎
-  - ✅︎
-* - `Qwen2Model`, `Qwen2ForCausalLM`
-  - Qwen2-based
-  - `ssmits/Qwen2-7B-Instruct-embed-base` (see note), `Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc.
-  - ✅︎
-  - ✅︎
-* - `RobertaModel`, `RobertaForMaskedLM`
-  - RoBERTa-based
-  - `sentence-transformers/all-roberta-large-v1`, `sentence-transformers/all-roberta-large-v1`, etc.
-  -
-  -
-* - `XLMRobertaModel`
-  - XLM-RoBERTa-based
-  - `intfloat/multilingual-e5-large`, etc.
-  -
-  -
-```
-
-```{note}
+- * Architecture
+  * Models
+  * Example HF Models
+  * [LoRA](#lora-adapter)
+  * [PP](#distributed-serving)
+- * `BertModel`
+  * BERT-based
+  * `BAAI/bge-base-en-v1.5`, etc.
+  *
+  *
+- * `Gemma2Model`
+  * Gemma2-based
+  * `BAAI/bge-multilingual-gemma2`, etc.
+  *
+  * ✅︎
+- * `GritLM`
+  * GritLM
+  * `parasail-ai/GritLM-7B-vllm`.
+  * ✅︎
+  * ✅︎
+- * `LlamaModel`, `LlamaForCausalLM`, `MistralModel`, etc.
+  * Llama-based
+  * `intfloat/e5-mistral-7b-instruct`, etc.
+  * ✅︎
+  * ✅︎
+- * `Qwen2Model`, `Qwen2ForCausalLM`
+  * Qwen2-based
+  * `ssmits/Qwen2-7B-Instruct-embed-base` (see note), `Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc.
+  * ✅︎
+  * ✅︎
+- * `RobertaModel`, `RobertaForMaskedLM`
+  * RoBERTa-based
+  * `sentence-transformers/all-roberta-large-v1`, `sentence-transformers/all-roberta-large-v1`, etc.
+  *
+  *
+- * `XLMRobertaModel`
+  * XLM-RoBERTa-based
+  * `intfloat/multilingual-e5-large`, etc.
+  *
+  *
+:::
+
+:::{note}
 `ssmits/Qwen2-7B-Instruct-embed-base` has an improperly defined Sentence Transformers config.
 You should manually set mean pooling by passing `--override-pooler-config '{"pooling_type": "MEAN"}'`.
-```
+:::
 
-```{note}
+:::{note}
 Unlike base Qwen2, `Alibaba-NLP/gte-Qwen2-7B-instruct` uses bi-directional attention.
 You can set `--hf-overrides '{"is_causal": false}'` to change the attention mask accordingly.
 
@@ -438,7 +438,7 @@ despite being described otherwise on its model card.
 
 Regardless of the variant, you need to enable `--trust-remote-code` for the correct tokenizer to be
 loaded. See [relevant issue on HF Transformers](https://github.com/huggingface/transformers/issues/34882).
-```
+:::
 
 If your model is not in the above list, we will try to automatically convert the model using
 {func}`~vllm.model_executor.models.adapters.as_embedding_model`. By default, the embeddings
@@ -446,98 +446,98 @@ of the whole prompt are extracted from the normalized hidden state corresponding
 
 #### Reward Modeling (`--task reward`)
 
-```{list-table}
+:::{list-table}
 :widths: 25 25 50 5 5
 :header-rows: 1
 
-* - Architecture
-  - Models
-  - Example HF Models
-  - [LoRA](#lora-adapter)
-  - [PP](#distributed-serving)
-* - `InternLM2ForRewardModel`
-  - InternLM2-based
-  - `internlm/internlm2-1_8b-reward`, `internlm/internlm2-7b-reward`, etc.
-  - ✅︎
-  - ✅︎
-* - `LlamaForCausalLM`
-  - Llama-based
-  - `peiyi9979/math-shepherd-mistral-7b-prm`, etc.
-  - ✅︎
-  - ✅︎
-* - `Qwen2ForRewardModel`
-  - Qwen2-based
-  - `Qwen/Qwen2.5-Math-RM-72B`, etc.
-  - ✅︎
-  - ✅︎
-* - `Qwen2ForProcessRewardModel`
-  - Qwen2-based
-  - `Qwen/Qwen2.5-Math-PRM-7B`, `Qwen/Qwen2.5-Math-PRM-72B`, etc.
-  - ✅︎
-  - ✅︎
-```
+- * Architecture
+  * Models
+  * Example HF Models
+  * [LoRA](#lora-adapter)
+  * [PP](#distributed-serving)
+- * `InternLM2ForRewardModel`
+  * InternLM2-based
+  * `internlm/internlm2-1_8b-reward`, `internlm/internlm2-7b-reward`, etc.
+  * ✅︎
+  * ✅︎
+- * `LlamaForCausalLM`
+  * Llama-based
+  * `peiyi9979/math-shepherd-mistral-7b-prm`, etc.
+  * ✅︎
+  * ✅︎
+- * `Qwen2ForRewardModel`
+  * Qwen2-based
+  * `Qwen/Qwen2.5-Math-RM-72B`, etc.
+  * ✅︎
+  * ✅︎
+- * `Qwen2ForProcessRewardModel`
+  * Qwen2-based
+  * `Qwen/Qwen2.5-Math-PRM-7B`, `Qwen/Qwen2.5-Math-PRM-72B`, etc.
+  * ✅︎
+  * ✅︎
+:::
 
 If your model is not in the above list, we will try to automatically convert the model using
 {func}`~vllm.model_executor.models.adapters.as_reward_model`. By default, we return the hidden states of each token directly.
 
-```{important}
+:::{important}
 For process-supervised reward models such as `peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly,
 e.g.: `--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`.
-```
+:::
 
 #### Classification (`--task classify`)
 
-```{list-table}
+:::{list-table}
 :widths: 25 25 50 5 5
 :header-rows: 1
 
-* - Architecture
-  - Models
-  - Example HF Models
-  - [LoRA](#lora-adapter)
-  - [PP](#distributed-serving)
-* - `JambaForSequenceClassification`
-  - Jamba
-  - `ai21labs/Jamba-tiny-reward-dev`, etc.
-  - ✅︎
-  - ✅︎
-* - `Qwen2ForSequenceClassification`
-  - Qwen2-based
-  - `jason9693/Qwen2.5-1.5B-apeach`, etc.
-  - ✅︎
-  - ✅︎
-```
+- * Architecture
+  * Models
+  * Example HF Models
+  * [LoRA](#lora-adapter)
+  * [PP](#distributed-serving)
+- * `JambaForSequenceClassification`
+  * Jamba
+  * `ai21labs/Jamba-tiny-reward-dev`, etc.
+  * ✅︎
+  * ✅︎
+- * `Qwen2ForSequenceClassification`
+  * Qwen2-based
+  * `jason9693/Qwen2.5-1.5B-apeach`, etc.
+  * ✅︎
+  * ✅︎
+:::
 
 If your model is not in the above list, we will try to automatically convert the model using
 {func}`~vllm.model_executor.models.adapters.as_classification_model`. By default, the class probabilities are extracted from the softmaxed hidden state corresponding to the last token.
 
 #### Sentence Pair Scoring (`--task score`)
 
-```{list-table}
+:::{list-table}
 :widths: 25 25 50 5 5
 :header-rows: 1
 
-* - Architecture
-  - Models
-  - Example HF Models
-  - [LoRA](#lora-adapter)
-  - [PP](#distributed-serving)
-* - `BertForSequenceClassification`
-  - BERT-based
-  - `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc.
-  -
-  -
-* - `RobertaForSequenceClassification`
-  - RoBERTa-based
-  - `cross-encoder/quora-roberta-base`, etc.
-  -
-  -
-* - `XLMRobertaForSequenceClassification`
-  - XLM-RoBERTa-based
-  - `BAAI/bge-reranker-v2-m3`, etc.
-  -
-  -
-```
+- * Architecture
+  * Models
+  * Example HF Models
+  * [LoRA](#lora-adapter)
+  * [PP](#distributed-serving)
+- * `BertForSequenceClassification`
+  * BERT-based
+  * `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc.
+  *
+  *
+- * `RobertaForSequenceClassification`
+  * RoBERTa-based
+  * `cross-encoder/quora-roberta-base`, etc.
+  *
+  *
+- * `XLMRobertaForSequenceClassification`
+  * XLM-RoBERTa-based
+  * `BAAI/bge-reranker-v2-m3`, etc.
+  *
+  *
+:::
 
 (supported-mm-models)=
 
@@ -560,11 +560,12 @@ On the other hand, modalities separated by `/` are mutually exclusive.
 
 See [this page](#multimodal-inputs) on how to pass multi-modal inputs to the model.
 
-````{important}
+:::{important}
 To enable multiple multi-modal items per text prompt, you have to set `limit_mm_per_prompt` (offline inference)
 or `--limit-mm-per-prompt` (online serving). For example, to enable passing up to 4 images per text prompt:
 
 Offline inference:
+
 ```python
 llm = LLM(
     model="Qwen/Qwen2-VL-7B-Instruct",
@@ -573,14 +574,16 @@ llm = LLM(
 ```
 
 Online serving:
+
 ```bash
 vllm serve Qwen/Qwen2-VL-7B-Instruct --limit-mm-per-prompt image=4
 ```
-````
 
-```{note}
+:::
+
+:::{note}
 vLLM currently only supports adding LoRA to the language backbone of multimodal models.
-```
+:::
 
 ### Generative Models
 
@@ -588,256 +591,256 @@ See [this page](#generative-models) for more information on how to use generativ
 
 #### Text Generation (`--task generate`)
 
-```{list-table}
+:::{list-table}
 :widths: 25 25 15 20 5 5 5
 :header-rows: 1
 
-* - Architecture
-  - Models
-  - Inputs
-  - Example HF Models
-  - [LoRA](#lora-adapter)
-  - [PP](#distributed-serving)
-  - [V1](gh-issue:8779)
-* - `AriaForConditionalGeneration`
-  - Aria
-  - T + I<sup>+</sup>
-  - `rhymes-ai/Aria`
-  -
-  - ✅︎
-  - ✅︎
-* - `Blip2ForConditionalGeneration`
-  - BLIP-2
-  - T + I<sup>E</sup>
-  - `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc.
-  -
-  - ✅︎
-  - ✅︎
-* - `ChameleonForConditionalGeneration`
-  - Chameleon
-  - T + I
-  - `facebook/chameleon-7b` etc.
-  -
-  - ✅︎
-  - ✅︎
-* - `DeepseekVLV2ForCausalLM`
-  - DeepSeek-VL2
-  - T + I<sup>+</sup>
-  - `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2` etc. (see note)
-  -
-  - ✅︎
-  - ✅︎
-* - `FuyuForCausalLM`
-  - Fuyu
-  - T + I
-  - `adept/fuyu-8b` etc.
-  -
-  - ✅︎
-  - ✅︎
-* - `ChatGLMModel`
-  - GLM-4V
-  - T + I
-  - `THUDM/glm-4v-9b` etc.
-  - ✅︎
-  - ✅︎
-  -
-* - `H2OVLChatModel`
-  - H2OVL
-  - T + I<sup>E+</sup>
-  - `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc.
-  -
-  - ✅︎
-  -
-* - `Idefics3ForConditionalGeneration`
-  - Idefics3
-  - T + I
-  - `HuggingFaceM4/Idefics3-8B-Llama3` etc.
-  - ✅︎
-  -
-  -
-* - `InternVLChatModel`
-  - InternVL 2.5, Mono-InternVL, InternVL 2.0
-  - T + I<sup>E+</sup>
-  - `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc.
-  -
-  - ✅︎
-  - ✅︎
-* - `LlavaForConditionalGeneration`
-  - LLaVA-1.5
-  - T + I<sup>E+</sup>
-  - `llava-hf/llava-1.5-7b-hf`, `TIGER-Lab/Mantis-8B-siglip-llama3` (see note), etc.
-  -
-  - ✅︎
-  - ✅︎
-* - `LlavaNextForConditionalGeneration`
-  - LLaVA-NeXT
-  - T + I<sup>E+</sup>
-  - `llava-hf/llava-v1.6-mistral-7b-hf`, `llava-hf/llava-v1.6-vicuna-7b-hf`, etc.
-  -
-  - ✅︎
-  - ✅︎
-* - `LlavaNextVideoForConditionalGeneration`
-  - LLaVA-NeXT-Video
-  - T + V
-  - `llava-hf/LLaVA-NeXT-Video-7B-hf`, etc.
-  -
-  - ✅︎
-  - ✅︎
-* - `LlavaOnevisionForConditionalGeneration`
-  - LLaVA-Onevision
-  - T + I<sup>+</sup> + V<sup>+</sup>
-  - `llava-hf/llava-onevision-qwen2-7b-ov-hf`, `llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc.
-  -
-  - ✅︎
-  - ✅︎
-* - `MiniCPMV`
-  - MiniCPM-V
-  - T + I<sup>E+</sup>
-  - `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, etc.
-  - ✅︎
-  - ✅︎
-  -
-* - `MllamaForConditionalGeneration`
-  - Llama 3.2
-  - T + I<sup>+</sup>
-  - `meta-llama/Llama-3.2-90B-Vision-Instruct`, `meta-llama/Llama-3.2-11B-Vision`, etc.
-  -
-  -
-  -
-* - `MolmoForCausalLM`
-  - Molmo
-  - T + I
-  - `allenai/Molmo-7B-D-0924`, `allenai/Molmo-72B-0924`, etc.
-  - ✅︎
-  - ✅︎
-  - ✅︎
-* - `NVLM_D_Model`
-  - NVLM-D 1.0
-  - T + I<sup>E+</sup>
-  - `nvidia/NVLM-D-72B`, etc.
-  -
-  - ✅︎
-  - ✅︎
-* - `PaliGemmaForConditionalGeneration`
-  - PaliGemma, PaliGemma 2
-  - T + I<sup>E</sup>
-  - `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc.
-  -
-  - ✅︎
-  -
-* - `Phi3VForCausalLM`
-  - Phi-3-Vision, Phi-3.5-Vision
-  - T + I<sup>E+</sup>
-  - `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc.
-  -
-  - ✅︎
-  - ✅︎
-* - `PixtralForConditionalGeneration`
-  - Pixtral
-  - T + I<sup>+</sup>
-  - `mistralai/Pixtral-12B-2409`, `mistral-community/pixtral-12b` (see note), etc.
-  -
-  - ✅︎
-  - ✅︎
-* - `QWenLMHeadModel`
-  - Qwen-VL
-  - T + I<sup>E+</sup>
-  - `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc.
-  - ✅︎
-  - ✅︎
-  - ✅︎
-* - `Qwen2AudioForConditionalGeneration`
-  - Qwen2-Audio
-  - T + A<sup>+</sup>
-  - `Qwen/Qwen2-Audio-7B-Instruct`
-  -
-  - ✅︎
-  - ✅︎
-* - `Qwen2VLForConditionalGeneration`
-  - QVQ, Qwen2-VL
-  - T + I<sup>E+</sup> + V<sup>E+</sup>
-  - `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc.
-  - ✅︎
-  - ✅︎
-  - ✅︎
-* - `UltravoxModel`
-  - Ultravox
-  - T + A<sup>E+</sup>
-  - `fixie-ai/ultravox-v0_3`
-  -
-  - ✅︎
-  - ✅︎
-```
+- * Architecture
+  * Models
+  * Inputs
+  * Example HF Models
+  * [LoRA](#lora-adapter)
+  * [PP](#distributed-serving)
+  * [V1](gh-issue:8779)
+- * `AriaForConditionalGeneration`
+  * Aria
+  * T + I<sup>+</sup>
+  * `rhymes-ai/Aria`
+  *
+  * ✅︎
+  * ✅︎
+- * `Blip2ForConditionalGeneration`
+  * BLIP-2
+  * T + I<sup>E</sup>
+  * `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc.
+  *
+  * ✅︎
+  * ✅︎
+- * `ChameleonForConditionalGeneration`
+  * Chameleon
+  * T + I
+  * `facebook/chameleon-7b` etc.
+  *
+  * ✅︎
+  * ✅︎
+- * `DeepseekVLV2ForCausalLM`
+  * DeepSeek-VL2
+  * T + I<sup>+</sup>
+  * `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2` etc. (see note)
+  *
+  * ✅︎
+  * ✅︎
+- * `FuyuForCausalLM`
+  * Fuyu
+  * T + I
+  * `adept/fuyu-8b` etc.
+  *
+  * ✅︎
+  * ✅︎
+- * `ChatGLMModel`
+  * GLM-4V
+  * T + I
+  * `THUDM/glm-4v-9b` etc.
+  * ✅︎
+  * ✅︎
+  *
+- * `H2OVLChatModel`
+  * H2OVL
+  * T + I<sup>E+</sup>
+  * `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc.
+  *
+  * ✅︎
+  *
+- * `Idefics3ForConditionalGeneration`
+  * Idefics3
+  * T + I
+  * `HuggingFaceM4/Idefics3-8B-Llama3` etc.
+  * ✅︎
+  *
+  *
+- * `InternVLChatModel`
+  * InternVL 2.5, Mono-InternVL, InternVL 2.0
+  * T + I<sup>E+</sup>
+  * `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc.
+  *
+  * ✅︎
+  * ✅︎
+- * `LlavaForConditionalGeneration`
+  * LLaVA-1.5
+  * T + I<sup>E+</sup>
+  * `llava-hf/llava-1.5-7b-hf`, `TIGER-Lab/Mantis-8B-siglip-llama3` (see note), etc.
+  *
+  * ✅︎
+  * ✅︎
+- * `LlavaNextForConditionalGeneration`
+  * LLaVA-NeXT
+  * T + I<sup>E+</sup>
+  * `llava-hf/llava-v1.6-mistral-7b-hf`, `llava-hf/llava-v1.6-vicuna-7b-hf`, etc.
+  *
+  * ✅︎
+  * ✅︎
+- * `LlavaNextVideoForConditionalGeneration`
+  * LLaVA-NeXT-Video
+  * T + V
+  * `llava-hf/LLaVA-NeXT-Video-7B-hf`, etc.
+  *
+  * ✅︎
+  * ✅︎
+- * `LlavaOnevisionForConditionalGeneration`
+  * LLaVA-Onevision
+  * T + I<sup>+</sup> + V<sup>+</sup>
+  * `llava-hf/llava-onevision-qwen2-7b-ov-hf`, `llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc.
+  *
+  * ✅︎
+  * ✅︎
+- * `MiniCPMV`
+  * MiniCPM-V
+  * T + I<sup>E+</sup>
+  * `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, etc.
+  * ✅︎
+  * ✅︎
+  *
+- * `MllamaForConditionalGeneration`
+  * Llama 3.2
+  * T + I<sup>+</sup>
+  * `meta-llama/Llama-3.2-90B-Vision-Instruct`, `meta-llama/Llama-3.2-11B-Vision`, etc.
+  *
+  *
+  *
+- * `MolmoForCausalLM`
+  * Molmo
+  * T + I
+  * `allenai/Molmo-7B-D-0924`, `allenai/Molmo-72B-0924`, etc.
+  * ✅︎
+  * ✅︎
+  * ✅︎
+- * `NVLM_D_Model`
+  * NVLM-D 1.0
+  * T + I<sup>E+</sup>
+  * `nvidia/NVLM-D-72B`, etc.
+  *
+  * ✅︎
+  * ✅︎
+- * `PaliGemmaForConditionalGeneration`
+  * PaliGemma, PaliGemma 2
+  * T + I<sup>E</sup>
+  * `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc.
+  *
+  * ✅︎
+  *
+- * `Phi3VForCausalLM`
+  * Phi-3-Vision, Phi-3.5-Vision
+  * T + I<sup>E+</sup>
+  * `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc.
+  *
+  * ✅︎
+  * ✅︎
+- * `PixtralForConditionalGeneration`
+  * Pixtral
+  * T + I<sup>+</sup>
+  * `mistralai/Pixtral-12B-2409`, `mistral-community/pixtral-12b` (see note), etc.
+  *
+  * ✅︎
+  * ✅︎
+- * `QWenLMHeadModel`
+  * Qwen-VL
+  * T + I<sup>E+</sup>
+  * `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc.
+  * ✅︎
+  * ✅︎
+  * ✅︎
+- * `Qwen2AudioForConditionalGeneration`
+  * Qwen2-Audio
+  * T + A<sup>+</sup>
+  * `Qwen/Qwen2-Audio-7B-Instruct`
+  *
+  * ✅︎
+  * ✅︎
+- * `Qwen2VLForConditionalGeneration`
+  * QVQ, Qwen2-VL
+  * T + I<sup>E+</sup> + V<sup>E+</sup>
+  * `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc.
+  * ✅︎
+  * ✅︎
+  * ✅︎
+- * `UltravoxModel`
+  * Ultravox
+  * T + A<sup>E+</sup>
+  * `fixie-ai/ultravox-v0_3`
+  *
+  * ✅︎
+  * ✅︎
+:::
 
 <sup>E</sup> Pre-computed embeddings can be inputted for this modality.  
 <sup>+</sup> Multiple items can be inputted per text prompt for this modality.
 
-```{note}
+:::{note}
 To use `DeepSeek-VL2` series models, you have to pass `--hf_overrides '{"architectures": ["DeepseekVLV2ForCausalLM"]}'` when running vLLM.
-```
+:::
 
-```{note}
+:::{note}
 To use `TIGER-Lab/Mantis-8B-siglip-llama3`, you have to pass `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM.
-```
+:::
 
-```{note}
+:::{note}
 The official `openbmb/MiniCPM-V-2` doesn't work yet, so we need to use a fork (`HwwwH/MiniCPM-V-2`) for now.
 For more details, please see: <gh-pr:4087#issuecomment-2250397630>
-```
+:::
 
-```{note}
+:::{note}
 The chat template for Pixtral-HF is incorrect (see [discussion](https://huggingface.co/mistral-community/pixtral-12b/discussions/22)).
 A corrected version is available at <gh-file:examples/template_pixtral_hf.jinja>.
-```
+:::
 
 ### Pooling Models
 
 See [this page](pooling-models) for more information on how to use pooling models.
 
-```{important}
+:::{important}
 Since some model architectures support both generative and pooling tasks,
 you should explicitly specify the task type to ensure that the model is used in pooling mode instead of generative mode.
-```
+:::
 
 #### Text Embedding (`--task embed`)
 
 Any text generation model can be converted into an embedding model by passing `--task embed`.
 
-```{note}
+:::{note}
 To get the best results, you should use pooling models that are specifically trained as such.
-```
+:::
 
 The following table lists those that are tested in vLLM.
 
-```{list-table}
+:::{list-table}
 :widths: 25 25 15 25 5 5
 :header-rows: 1
 
-* - Architecture
-  - Models
-  - Inputs
-  - Example HF Models
-  - [LoRA](#lora-adapter)
-  - [PP](#distributed-serving)
-* - `LlavaNextForConditionalGeneration`
-  - LLaVA-NeXT-based
-  - T / I
-  - `royokong/e5-v`
-  -
-  - ✅︎
-* - `Phi3VForCausalLM`
-  - Phi-3-Vision-based
-  - T + I
-  - `TIGER-Lab/VLM2Vec-Full`
-  - 🚧
-  - ✅︎
-* - `Qwen2VLForConditionalGeneration`
-  - Qwen2-VL-based
-  - T + I
-  - `MrLight/dse-qwen2-2b-mrl-v1`
-  -
-  - ✅︎
-```
+- * Architecture
+  * Models
+  * Inputs
+  * Example HF Models
+  * [LoRA](#lora-adapter)
+  * [PP](#distributed-serving)
+- * `LlavaNextForConditionalGeneration`
+  * LLaVA-NeXT-based
+  * T / I
+  * `royokong/e5-v`
+  *
+  * ✅︎
+- * `Phi3VForCausalLM`
+  * Phi-3-Vision-based
+  * T + I
+  * `TIGER-Lab/VLM2Vec-Full`
+  * 🚧
+  * ✅︎
+- * `Qwen2VLForConditionalGeneration`
+  * Qwen2-VL-based
+  * T + I
+  * `MrLight/dse-qwen2-2b-mrl-v1`
+  *
+  * ✅︎
+:::
 
 _________________
 
@@ -849,9 +852,9 @@ At vLLM, we are committed to facilitating the integration and support of third-p
 
 2. **Best-Effort Consistency**: While we aim to maintain a level of consistency between the models implemented in vLLM and other frameworks like transformers, complete alignment is not always feasible. Factors like acceleration techniques and the use of low-precision computations can introduce discrepancies. Our commitment is to ensure that the implemented models are functional and produce sensible results.
 
-    ```{tip}
+    :::{tip}
     When comparing the output of `model.generate` from HuggingFace Transformers with the output of `llm.generate` from vLLM, note that the former reads the model's generation config file (i.e., [generation_config.json](https://github.com/huggingface/transformers/blob/19dabe96362803fb0a9ae7073d03533966598b17/src/transformers/generation/utils.py#L1945)) and applies the default parameters for generation, while the latter only uses the parameters passed to the function. Ensure all sampling parameters are identical when comparing outputs.
-    ```
+    :::
 
 3. **Issue Resolution and Model Updates**: Users are encouraged to report any bugs or issues they encounter with third-party models. Proposed fixes should be submitted via PRs, with a clear explanation of the problem and the rationale behind the proposed solution. If a fix for one model impacts another, we rely on the community to highlight and address these cross-model dependencies. Note: for bugfix PRs, it is good etiquette to inform the original author to seek their feedback.
 
diff --git a/docs/source/serving/distributed_serving.md b/docs/source/serving/distributed_serving.md
index daf6e2f250416..3f9ca27eb438e 100644
--- a/docs/source/serving/distributed_serving.md
+++ b/docs/source/serving/distributed_serving.md
@@ -14,9 +14,9 @@ In short, you should increase the number of GPUs and the number of nodes until y
 
 After adding enough GPUs and nodes to hold the model, you can run vLLM first, which will print some logs like `# GPU blocks: 790`. Multiply the number by `16` (the block size), and you can get roughly the maximum number of tokens that can be served on the current configuration. If this number is not satisfying, e.g. you want higher throughput, you can further increase the number of GPUs or nodes, until the number of blocks is enough.
 
-```{note}
+:::{note}
 There is one edge case: if the model fits in a single node with multiple GPUs, but the number of GPUs cannot divide the model size evenly, you can use pipeline parallelism, which splits the model along layers and supports uneven splits. In this case, the tensor parallel size should be 1 and the pipeline parallel size should be the number of GPUs.
-```
+:::
 
 ## Running vLLM on a single node
 
@@ -94,12 +94,12 @@ vllm serve /path/to/the/model/in/the/container \
 
 To make tensor parallel performant, you should make sure the communication between nodes is efficient, e.g. using high-speed network cards like Infiniband. To correctly set up the cluster to use Infiniband, append additional arguments like `--privileged -e NCCL_IB_HCA=mlx5` to the `run_cluster.sh` script. Please contact your system administrator for more information on how to set up the flags. One way to confirm if the Infiniband is working is to run vLLM with `NCCL_DEBUG=TRACE` environment variable set, e.g. `NCCL_DEBUG=TRACE vllm serve ...` and check the logs for the NCCL version and the network used. If you find `[send] via NET/Socket` in the logs, it means NCCL uses raw TCP Socket, which is not efficient for cross-node tensor parallel. If you find `[send] via NET/IB/GDRDMA` in the logs, it means NCCL uses Infiniband with GPU-Direct RDMA, which is efficient.
 
-```{warning}
+:::{warning}
 After you start the Ray cluster, you'd better also check the GPU-GPU communication between nodes. It can be non-trivial to set up. Please refer to the [sanity check script](#troubleshooting-incorrect-hardware-driver) for more information. If you need to set some environment variables for the communication configuration, you can append them to the `run_cluster.sh` script, e.g. `-e NCCL_SOCKET_IFNAME=eth0`. Note that setting environment variables in the shell (e.g. `NCCL_SOCKET_IFNAME=eth0 vllm serve ...`) only works for the processes in the same node, not for the processes in the other nodes. Setting environment variables when you create the cluster is the recommended way. See <gh-issue:6803> for more information.
-```
+:::
 
-```{warning}
+:::{warning}
 Please make sure you downloaded the model to all the nodes (with the same path), or the model is downloaded to some distributed file system that is accessible by all nodes.
 
 When you use huggingface repo id to refer to the model, you should append your huggingface token to the `run_cluster.sh` script, e.g. `-e HF_TOKEN=`. The recommended way is to download the model first, and then use the path to refer to the model.
-```
+:::
diff --git a/docs/source/serving/engine_args.md b/docs/source/serving/engine_args.md
index cd3c6a430b7fa..827c25b50522f 100644
--- a/docs/source/serving/engine_args.md
+++ b/docs/source/serving/engine_args.md
@@ -4,6 +4,7 @@
 
 Below, you can find an explanation of every engine argument for vLLM:
 
+<!--- pyml disable-num-lines 7 no-space-in-emphasis-->
 ```{eval-rst}
 .. argparse::
     :module: vllm.engine.arg_utils
@@ -16,6 +17,7 @@ Below, you can find an explanation of every engine argument for vLLM:
 
 Below are the additional arguments related to the asynchronous engine:
 
+<!--- pyml disable-num-lines 7 no-space-in-emphasis-->
 ```{eval-rst}
 .. argparse::
     :module: vllm.engine.arg_utils
diff --git a/docs/source/serving/env_vars.md b/docs/source/serving/env_vars.md
index f9b08077a03b4..9845241930a40 100644
--- a/docs/source/serving/env_vars.md
+++ b/docs/source/serving/env_vars.md
@@ -2,14 +2,14 @@
 
 vLLM uses the following environment variables to configure the system:
 
-```{warning}
+:::{warning}
 Please note that `VLLM_PORT` and `VLLM_HOST_IP` set the port and ip for vLLM's **internal usage**. It is not the port and ip for the API server. If you use `--host $VLLM_HOST_IP` and `--port $VLLM_PORT` to start the API server, it will not work.
 
 All environment variables used by vLLM are prefixed with `VLLM_`. **Special care should be taken for Kubernetes users**: please do not name the service as `vllm`, otherwise environment variables set by Kubernetes might conflict with vLLM's environment variables, because [Kubernetes sets environment variables for each service with the capitalized service name as the prefix](https://kubernetes.io/docs/concepts/services-networking/service/#environment-variables).
-```
+:::
 
-```{literalinclude} ../../../vllm/envs.py
+:::{literalinclude} ../../../vllm/envs.py
 :end-before: end-env-vars-definition
 :language: python
 :start-after: begin-env-vars-definition
-```
+:::
diff --git a/docs/source/serving/integrations/index.md b/docs/source/serving/integrations/index.md
index 371c284981ce9..e2b4c0814605b 100644
--- a/docs/source/serving/integrations/index.md
+++ b/docs/source/serving/integrations/index.md
@@ -1,8 +1,8 @@
 # External Integrations
 
-```{toctree}
+:::{toctree}
 :maxdepth: 1
 
 langchain
 llamaindex
-```
+:::
diff --git a/docs/source/serving/metrics.md b/docs/source/serving/metrics.md
index 6c84f6d1350a6..6c0dc8880a90d 100644
--- a/docs/source/serving/metrics.md
+++ b/docs/source/serving/metrics.md
@@ -31,8 +31,8 @@ vllm:iteration_tokens_total_bucket{le="512.0",model_name="unsloth/Llama-3.2-1B-I
 
 The following metrics are exposed:
 
-```{literalinclude} ../../../vllm/engine/metrics.py
+:::{literalinclude} ../../../vllm/engine/metrics.py
 :end-before: end-metrics-definitions
 :language: python
 :start-after: begin-metrics-definitions
-```
+:::
diff --git a/docs/source/serving/multimodal_inputs.md b/docs/source/serving/multimodal_inputs.md
index 0213b0a3388ea..217b531e83788 100644
--- a/docs/source/serving/multimodal_inputs.md
+++ b/docs/source/serving/multimodal_inputs.md
@@ -4,10 +4,10 @@
 
 This page teaches you how to pass multi-modal inputs to [multi-modal models](#supported-mm-models) in vLLM.
 
-```{note}
+:::{note}
 We are actively iterating on multi-modal support. See [this RFC](gh-issue:4194) for upcoming changes,
 and [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) if you have any feedback or feature requests.
-```
+:::
 
 ## Offline Inference
 
@@ -203,13 +203,13 @@ for o in outputs:
 
 Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions API](https://platform.openai.com/docs/api-reference/chat).
 
-```{important}
+:::{important}
 A chat template is **required** to use Chat Completions API.
 
 Although most models come with a chat template, for others you have to define one yourself.
 The chat template can be inferred based on the documentation on the model's HuggingFace repo.
 For example, LLaVA-1.5 (`llava-hf/llava-1.5-7b-hf`) requires a chat template that can be found here: <gh-file:examples/template_llava.jinja>
-```
+:::
 
 ### Image
 
@@ -273,24 +273,25 @@ print("Chat completion output:", chat_response.choices[0].message.content)
 
 Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>
 
-```{tip}
+:::{tip}
 Loading from local file paths is also supported on vLLM: You can specify the allowed local media path via `--allowed-local-media-path` when launching the API server/engine,
 and pass the file path as `url` in the API request.
-```
+:::
 
-```{tip}
+:::{tip}
 There is no need to place image placeholders in the text content of the API request - they are already represented by the image content.
 In fact, you can place image placeholders in the middle of the text by interleaving text and image content.
-```
+:::
 
-````{note}
+:::{note}
 By default, the timeout for fetching images through HTTP URL is `5` seconds.
 You can override this by setting the environment variable:
 
 ```console
-$ export VLLM_IMAGE_FETCH_TIMEOUT=<timeout>
+export VLLM_IMAGE_FETCH_TIMEOUT=<timeout>
 ```
-````
+
+:::
 
 ### Video
 
@@ -345,14 +346,15 @@ print("Chat completion output from image url:", result)
 
 Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>
 
-````{note}
+:::{note}
 By default, the timeout for fetching videos through HTTP URL is `30` seconds.
 You can override this by setting the environment variable:
 
 ```console
-$ export VLLM_VIDEO_FETCH_TIMEOUT=<timeout>
+export VLLM_VIDEO_FETCH_TIMEOUT=<timeout>
 ```
-````
+
+:::
 
 ### Audio
 
@@ -448,24 +450,25 @@ print("Chat completion output from audio url:", result)
 
 Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>
 
-````{note}
+:::{note}
 By default, the timeout for fetching audios through HTTP URL is `10` seconds.
 You can override this by setting the environment variable:
 
 ```console
-$ export VLLM_AUDIO_FETCH_TIMEOUT=<timeout>
+export VLLM_AUDIO_FETCH_TIMEOUT=<timeout>
 ```
-````
+
+:::
 
 ### Embedding
 
 vLLM's Embeddings API is a superset of OpenAI's [Embeddings API](https://platform.openai.com/docs/api-reference/embeddings),
 where a list of chat `messages` can be passed instead of batched `inputs`. This enables multi-modal inputs to be passed to embedding models.
 
-```{tip}
+:::{tip}
 The schema of `messages` is exactly the same as in Chat Completions API.
 You can refer to the above tutorials for more details on how to pass each type of multi-modal data.
-```
+:::
 
 Usually, embedding models do not expect chat-based input, so we need to use a custom chat template to format the text and images.
 Refer to the examples below for illustration.
@@ -477,13 +480,13 @@ vllm serve TIGER-Lab/VLM2Vec-Full --task embed \
   --trust-remote-code --max-model-len 4096 --chat-template examples/template_vlm2vec.jinja
 ```
 
-```{important}
+:::{important}
 Since VLM2Vec has the same model architecture as Phi-3.5-Vision, we have to explicitly pass `--task embed`
 to run this model in embedding mode instead of text generation mode.
 
 The custom chat template is completely different from the original one for this model,
 and can be found here: <gh-file:examples/template_vlm2vec.jinja>
-```
+:::
 
 Since the request schema is not defined by OpenAI client, we post a request to the server using the lower-level `requests` library:
 
@@ -518,16 +521,16 @@ vllm serve MrLight/dse-qwen2-2b-mrl-v1 --task embed \
   --trust-remote-code --max-model-len 8192 --chat-template examples/template_dse_qwen2_vl.jinja
 ```
 
-```{important}
+:::{important}
 Like with VLM2Vec, we have to explicitly pass `--task embed`.
 
 Additionally, `MrLight/dse-qwen2-2b-mrl-v1` requires an EOS token for embeddings, which is handled
 by a custom chat template: <gh-file:examples/template_dse_qwen2_vl.jinja>
-```
+:::
 
-```{important}
+:::{important}
 Also important, `MrLight/dse-qwen2-2b-mrl-v1` requires a placeholder image of the minimum image size for text query embeddings. See the full code
 example below for details.
-```
+:::
 
 Full example: <gh-file:examples/online_serving/openai_chat_embedding_client_for_multimodal.py>
diff --git a/docs/source/serving/offline_inference.md b/docs/source/serving/offline_inference.md
index 8a18598665a70..ded57500c5d0d 100644
--- a/docs/source/serving/offline_inference.md
+++ b/docs/source/serving/offline_inference.md
@@ -22,9 +22,9 @@ The available APIs depend on the type of model that is being run:
 
 Please refer to the above pages for more details about each API.
 
-```{seealso}
+:::{seealso}
 [API Reference](/api/offline_inference/index)
-```
+:::
 
 ## Configuration Options
 
@@ -70,12 +70,12 @@ llm = LLM(model="ibm-granite/granite-3.1-8b-instruct",
           tensor_parallel_size=2)
 ```
 
-```{important}
+:::{important}
 To ensure that vLLM initializes CUDA correctly, you should avoid calling related functions (e.g. {func}`torch.cuda.set_device`)
 before initializing vLLM. Otherwise, you may run into an error like `RuntimeError: Cannot re-initialize CUDA in forked subprocess`.
 
 To control which devices are used, please instead set the `CUDA_VISIBLE_DEVICES` environment variable.
-```
+:::
 
 #### Quantization
 
diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md
index 8bc234545befd..82ef54c16dafb 100644
--- a/docs/source/serving/openai_compatible_server.md
+++ b/docs/source/serving/openai_compatible_server.md
@@ -161,11 +161,11 @@ print(completion._request_id)
 
 The `vllm serve` command is used to launch the OpenAI-compatible server.
 
-```{argparse}
+:::{argparse}
 :module: vllm.entrypoints.openai.cli_args
 :func: create_parser_for_docs
 :prog: vllm serve
-```
+:::
 
 #### Configuration file
 
@@ -188,10 +188,10 @@ To use the above config file:
 vllm serve SOME_MODEL --config config.yaml
 ```
 
-```{note}
+:::{note}
 In case an argument is supplied simultaneously using command line and the config file, the value from the command line will take precedence.
 The order of priorities is `command line > config file values > defaults`.
-```
+:::
 
 ## API Reference
 
@@ -208,19 +208,19 @@ Code example: <gh-file:examples/online_serving/openai_completion_client.py>
 
 The following [sampling parameters](#sampling-params) are supported.
 
-```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
+:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
 :language: python
 :start-after: begin-completion-sampling-params
 :end-before: end-completion-sampling-params
-```
+:::
 
 The following extra parameters are supported:
 
-```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
+:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
 :language: python
 :start-after: begin-completion-extra-params
 :end-before: end-completion-extra-params
-```
+:::
 
 (chat-api)=
 
@@ -240,19 +240,19 @@ Code example: <gh-file:examples/online_serving/openai_chat_completion_client.py>
 
 The following [sampling parameters](#sampling-params) are supported.
 
-```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
+:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
 :language: python
 :start-after: begin-chat-completion-sampling-params
 :end-before: end-chat-completion-sampling-params
-```
+:::
 
 The following extra parameters are supported:
 
-```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
+:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
 :language: python
 :start-after: begin-chat-completion-extra-params
 :end-before: end-chat-completion-extra-params
-```
+:::
 
 (embeddings-api)=
 
@@ -264,9 +264,9 @@ you can use the [official OpenAI Python client](https://github.com/openai/openai
 If the model has a [chat template](#chat-template), you can replace `inputs` with a list of `messages` (same schema as [Chat API](#chat-api))
 which will be treated as a single prompt to the model.
 
-```{tip}
+:::{tip}
 This enables multi-modal inputs to be passed to embedding models, see [this page](#multimodal-inputs) for details.
-```
+:::
 
 Code example: <gh-file:examples/online_serving/openai_embedding_client.py>
 
@@ -274,27 +274,27 @@ Code example: <gh-file:examples/online_serving/openai_embedding_client.py>
 
 The following [pooling parameters](#pooling-params) are supported.
 
-```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
+:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
 :language: python
 :start-after: begin-embedding-pooling-params
 :end-before: end-embedding-pooling-params
-```
+:::
 
 The following extra parameters are supported by default:
 
-```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
+:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
 :language: python
 :start-after: begin-embedding-extra-params
 :end-before: end-embedding-extra-params
-```
+:::
 
 For chat-like input (i.e. if `messages` is passed), these extra parameters are supported instead:
 
-```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
+:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
 :language: python
 :start-after: begin-chat-embedding-extra-params
 :end-before: end-chat-embedding-extra-params
-```
+:::
 
 (tokenizer-api)=
 
@@ -465,19 +465,19 @@ Response:
 
 The following [pooling parameters](#pooling-params) are supported.
 
-```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
+:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
 :language: python
 :start-after: begin-score-pooling-params
 :end-before: end-score-pooling-params
-```
+:::
 
 The following extra parameters are supported:
 
-```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
+:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
 :language: python
 :start-after: begin-score-extra-params
 :end-before: end-score-extra-params
-```
+:::
 
 (rerank-api)=
 
@@ -552,16 +552,16 @@ Response:
 
 The following [pooling parameters](#pooling-params) are supported.
 
-```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
+:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
 :language: python
 :start-after: begin-rerank-pooling-params
 :end-before: end-rerank-pooling-params
-```
+:::
 
 The following extra parameters are supported:
 
-```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
+:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
 :language: python
 :start-after: begin-rerank-extra-params
 :end-before: end-rerank-extra-params
-```
+:::
diff --git a/pyproject.toml b/pyproject.toml
index 8f2e20d0f5800..9892967b82d79 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -111,6 +111,7 @@ markers = [
 ]
 
 [tool.pymarkdown]
+plugins.md004.style = "sublist" # ul-style
 plugins.md013.enabled = false # line-length
 plugins.md041.enabled = false # first-line-h1
 plugins.md033.enabled = false # inline-html

From 46fb056749b7d9f5e4ea7a060207ed2eb3ad75e0 Mon Sep 17 00:00:00 2001
From: Mark McLoughlin <markmc@redhat.com>
Date: Wed, 29 Jan 2025 04:11:16 +0000
Subject: [PATCH 421/462] [V1][Metrics] Add TTFT and TPOT histograms (#12530)

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
---
 tests/entrypoints/openai/test_metrics.py |  6 ++++++
 vllm/v1/engine/output_processor.py       |  4 +++-
 vllm/v1/metrics/loggers.py               | 25 ++++++++++++++++++++++++
 vllm/v1/metrics/stats.py                 | 11 +++++++++++
 4 files changed, 45 insertions(+), 1 deletion(-)

diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py
index 9a84c82b62fdf..901ba8e8e5ef3 100644
--- a/tests/entrypoints/openai/test_metrics.py
+++ b/tests/entrypoints/openai/test_metrics.py
@@ -208,6 +208,12 @@ async def test_metrics_counts(server: RemoteOpenAIServer,
     "vllm:request_generation_tokens_sum",
     "vllm:request_generation_tokens_bucket",
     "vllm:request_generation_tokens_count",
+    "vllm:time_to_first_token_seconds_sum",
+    "vllm:time_to_first_token_seconds_bucket",
+    "vllm:time_to_first_token_seconds_count",
+    "vllm:time_per_output_token_seconds_sum",
+    "vllm:time_per_output_token_seconds_bucket",
+    "vllm:time_per_output_token_seconds_count",
 ]
 
 
diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py
index 39217b8090140..234ef8194ca93 100644
--- a/vllm/v1/engine/output_processor.py
+++ b/vllm/v1/engine/output_processor.py
@@ -27,6 +27,7 @@ def __init__(
         prompt: Optional[str],
         prompt_token_ids: List[int],
         detokenizer: IncrementalDetokenizer,
+        arrival_time: float,
         queue: Optional[asyncio.Queue[RequestOutput]],
     ):
         self.request_id = request_id
@@ -37,7 +38,7 @@ def __init__(
         self.is_prefilling = True
         self.queue = queue
 
-        self.stats = RequestStateStats()
+        self.stats = RequestStateStats(last_token_time=arrival_time)
 
     @classmethod
     def from_new_request(
@@ -54,6 +55,7 @@ def from_new_request(
                 tokenizer=tokenizer,
                 request=request,
             ),
+            arrival_time=request.arrival_time,
             queue=queue,
         )
 
diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py
index 87d9d63652c05..9bb24d1948651 100644
--- a/vllm/v1/metrics/loggers.py
+++ b/vllm/v1/metrics/loggers.py
@@ -121,6 +121,26 @@ def __init__(self, model_config: ModelConfig):
                 buckets=build_1_2_5_buckets(max_model_len),
                 labelnames=labelnames).labels(*labelvalues)
 
+        self.histogram_time_to_first_token = \
+            prometheus_client.Histogram(
+                name="vllm:time_to_first_token_seconds",
+                documentation="Histogram of time to first token in seconds.",
+                buckets=[
+                    0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5,
+                    0.75, 1.0, 2.5, 5.0, 7.5, 10.0
+                ],
+                labelnames=labelnames).labels(*labelvalues)
+
+        self.histogram_time_per_output_token = \
+            prometheus_client.Histogram(
+                name="vllm:time_per_output_token_seconds",
+                documentation="Histogram of time per output token in seconds.",
+                buckets=[
+                    0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5,
+                    0.75, 1.0, 2.5
+                ],
+                labelnames=labelnames).labels(*labelvalues)
+
     def log(self, scheduler_stats: SchedulerStats,
             iteration_stats: IterationStats):
         """Log to prometheus."""
@@ -137,6 +157,11 @@ def log(self, scheduler_stats: SchedulerStats,
             self.histogram_num_generation_tokens_request.observe(
                 finished_request.num_generation_tokens)
 
+        for ttft in iteration_stats.time_to_first_tokens_iter:
+            self.histogram_time_to_first_token.observe(ttft)
+        for tpot in iteration_stats.time_per_output_tokens_iter:
+            self.histogram_time_per_output_token.observe(tpot)
+
     @staticmethod
     def _unregister_vllm_metrics():
         # Unregister any existing vLLM collectors (for CI/CD
diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py
index 55d85a7992cc5..f4c276f0b6902 100644
--- a/vllm/v1/metrics/stats.py
+++ b/vllm/v1/metrics/stats.py
@@ -1,3 +1,4 @@
+import time
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, List
 
@@ -22,6 +23,7 @@ class RequestStateStats:
     """Stats that need to be tracked across delta updates."""
 
     num_generation_tokens: int = 0
+    last_token_time: float = 0.0
 
 
 @dataclass
@@ -40,6 +42,8 @@ def __init__(self, log_stats: bool):
         self.num_generation_tokens = 0
         self.num_prompt_tokens = 0
         self.finished_requests: List[FinishedRequestStats] = []
+        self.time_to_first_tokens_iter: List[float] = []
+        self.time_per_output_tokens_iter: List[float] = []
 
     def update_from_output(self, output: "EngineCoreOutput",
                            is_prefilling: bool, prompt_len: int,
@@ -48,6 +52,8 @@ def update_from_output(self, output: "EngineCoreOutput",
             return
 
         num_new_generation_tokens = len(output.new_token_ids)
+        now = time.time()
+        last_token_latency = now - request_state_stats.last_token_time
 
         self.num_generation_tokens += num_new_generation_tokens
         if is_prefilling:
@@ -58,7 +64,12 @@ def update_from_output(self, output: "EngineCoreOutput",
             assert (num_new_generation_tokens > 0)
             self.num_prompt_tokens += prompt_len
 
+            self.time_to_first_tokens_iter.append(last_token_latency)
+        else:
+            self.time_per_output_tokens_iter.append(last_token_latency)
+
         request_state_stats.num_generation_tokens += num_new_generation_tokens
+        request_state_stats.last_token_time = now
 
     def update_from_finished_request(self, request_output: "RequestOutput",
                                      request_state_stats: RequestStateStats):

From bd02164cf9eeed8436b26d62c37c1d792e97f9e8 Mon Sep 17 00:00:00 2001
From: Michael Goin <michael@neuralmagic.com>
Date: Tue, 28 Jan 2025 23:49:03 -0500
Subject: [PATCH 422/462] Bugfix for whisper quantization due to fake k_proj
 bias (#12524)

Signed-off-by: mgoin <michael@neuralmagic.com>
---
 vllm/model_executor/models/whisper.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py
index b8512b735da94..15e35fa9cd2c9 100644
--- a/vllm/model_executor/models/whisper.py
+++ b/vllm/model_executor/models/whisper.py
@@ -743,7 +743,7 @@ def _create_fake_bias_for_k_proj(
     So that the bias for k_proj in qkv_proj can be initialized with zeros.
     """
     for name, weight in weights:
-        if ".self_attn.k_proj.weight" in name:
+        if name.endswith(".self_attn.k_proj.weight"):
             bias = torch.zeros(weight.size(0))
             bias_name = name.replace("weight", "bias")
             yield from [(name, weight), (bias_name, bias)]

From 5f671cb4c3145194e94ffb393ee459432f7fa2b8 Mon Sep 17 00:00:00 2001
From: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
Date: Tue, 28 Jan 2025 23:56:56 -0500
Subject: [PATCH 423/462] [V1] Improve Error Message for Unsupported Config
 (#12535)

Co-authored-by: Michael Goin <michael@neuralmagic.com>
---
 vllm/platforms/cuda.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index 2587e3a11dde3..e4b436edf7588 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -120,13 +120,18 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         if parallel_config.worker_cls == "auto":
             if scheduler_config.is_multi_step:
                 if envs.VLLM_USE_V1:
-                    raise NotImplementedError
+                    raise NotImplementedError(
+                        "Multi-step scheduling is not supported (and not "
+                        "needed) on VLLM V1. Please launch without "
+                        "--num-scheduler-steps.")
                 else:
                     parallel_config.worker_cls = \
                         "vllm.worker.multi_step_worker.MultiStepWorker"
             elif vllm_config.speculative_config:
                 if envs.VLLM_USE_V1:
-                    raise NotImplementedError
+                    raise NotImplementedError(
+                        "Speculative decoding is not yet supported on VLLM V1."
+                    )
                 else:
                     parallel_config.worker_cls = \
                         "vllm.spec_decode.spec_decode_worker.create_spec_worker"

From ef001d98ef36166ebacb48eab2e32eb738407b53 Mon Sep 17 00:00:00 2001
From: Maximilien de Bayser <mbayser@br.ibm.com>
Date: Wed, 29 Jan 2025 04:53:13 -0300
Subject: [PATCH 424/462] Fix the pydantic logging validator (#12420)

Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
---
 vllm/entrypoints/openai/protocol.py | 29 ++++++++++++++++++++++-------
 1 file changed, 22 insertions(+), 7 deletions(-)

diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index 2bc136cc48038..29d071ce50c8e 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -6,7 +6,8 @@
 from typing import Any, ClassVar, Dict, List, Literal, Optional, Set, Union
 
 import torch
-from pydantic import BaseModel, ConfigDict, Field, model_validator
+from pydantic import (BaseModel, ConfigDict, Field, TypeAdapter,
+                      ValidationInfo, field_validator, model_validator)
 from typing_extensions import Annotated
 
 from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
@@ -45,14 +46,14 @@ class OpenAIBaseModel(BaseModel):
     # Cache class field names
     field_names: ClassVar[Optional[Set[str]]] = None
 
-    @model_validator(mode="before")
+    @model_validator(mode="wrap")
     @classmethod
-    def __log_extra_fields__(cls, data):
-
+    def __log_extra_fields__(cls, data, handler):
+        result = handler(data)
+        if not isinstance(data, dict):
+            return result
         field_names = cls.field_names
         if field_names is None:
-            if not isinstance(data, dict):
-                return data
             # Get all class field names and their potential aliases
             field_names = set()
             for field_name, field in cls.model_fields.items():
@@ -67,7 +68,7 @@ def __log_extra_fields__(cls, data):
                 "The following fields were present in the request "
                 "but ignored: %s",
                 data.keys() - field_names)
-        return data
+        return result
 
 
 class ErrorResponse(OpenAIBaseModel):
@@ -1287,6 +1288,20 @@ class BatchRequestInput(OpenAIBaseModel):
     # The parameters of the request.
     body: Union[ChatCompletionRequest, EmbeddingRequest, ScoreRequest]
 
+    @field_validator('body', mode='plain')
+    @classmethod
+    def check_type_for_url(cls, value: Any, info: ValidationInfo):
+        # Use url to disambiguate models
+        url = info.data['url']
+        if url == "/v1/chat/completions":
+            return ChatCompletionRequest.model_validate(value)
+        if url == "/v1/embeddings":
+            return TypeAdapter(EmbeddingRequest).validate_python(value)
+        if url == "/v1/score":
+            return ScoreRequest.model_validate(value)
+        return TypeAdapter(Union[ChatCompletionRequest, EmbeddingRequest,
+                                 ScoreRequest]).validate_python(value)
+
 
 class BatchResponseData(OpenAIBaseModel):
     # HTTP status code of the response.

From 036ca94c25fa07391016aa1b4f93a8ac5d74f296 Mon Sep 17 00:00:00 2001
From: Travis Johnson <tsjohnso@us.ibm.com>
Date: Wed, 29 Jan 2025 01:54:35 -0700
Subject: [PATCH 425/462] [Bugfix] handle alignment of arguments in
 convert_sparse_cross_attention_mask_to_dense (#12347)

Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
Signed-off-by: Wallas Santos <wallashss@ibm.com>
Co-authored-by: Wallas Santos <wallashss@ibm.com>
---
 .../vision_language/test_mllama.py            | 208 ++++++++++++++++++
 vllm/model_executor/models/mllama.py          |  18 +-
 2 files changed, 222 insertions(+), 4 deletions(-)

diff --git a/tests/models/encoder_decoder/vision_language/test_mllama.py b/tests/models/encoder_decoder/vision_language/test_mllama.py
index 636a3eedff31b..16c71228ede7a 100644
--- a/tests/models/encoder_decoder/vision_language/test_mllama.py
+++ b/tests/models/encoder_decoder/vision_language/test_mllama.py
@@ -1,11 +1,15 @@
 from typing import List, Optional, Tuple, Type, overload
 
 import pytest
+import torch
 from transformers import (AutoConfig, AutoModelForVision2Seq, AutoTokenizer,
                           BatchEncoding)
 
+from vllm.attention.backends.flash_attn import FlashAttentionMetadata
 from vllm.attention.selector import (_Backend, _cached_get_attn_backend,
                                      global_force_attn_backend_context_manager)
+from vllm.model_executor.models.mllama import (MLLAMA_IMAGE_TOKEN_ID,
+                                               MllamaForConditionalGeneration)
 from vllm.multimodal.image import rescale_image_size
 from vllm.sequence import SampleLogprobs
 
@@ -33,6 +37,29 @@
     "meta-llama/Llama-3.2-11B-Vision-Instruct",
 ]
 
+# Indices for inputs
+TEXT_ONLY = '0'
+IMAGE_AT_BEG = '1'
+IMAGE_AT_MIDDLE = '2'
+TWO_IMAGES = '3'
+
+# Input tokenized
+prompt_data = {
+    # Tell me a story
+    TEXT_ONLY: [41551, 757, 264, 3446],
+    # <|image|> What's the content of this image
+    IMAGE_AT_BEG:
+    [MLLAMA_IMAGE_TOKEN_ID, 3639, 596, 279, 2262, 315, 420, 2217, 220],
+    # Hello <|image|>What' the content of this image
+    IMAGE_AT_MIDDLE:
+    [9906, 220, MLLAMA_IMAGE_TOKEN_ID, 3923, 6, 279, 2262, 315, 420, 2217],
+    #<|image|>Is there a duck in this image?<|image|>What's the animal in this image? # noqa: E501
+    TWO_IMAGES: [
+        MLLAMA_IMAGE_TOKEN_ID, 3957, 1070, 264, 37085, 304, 420, 2217, 30,
+        MLLAMA_IMAGE_TOKEN_ID, 3923, 596, 279, 10065, 304, 420, 2217, 30
+    ]
+}
+
 
 def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
                                          Optional[SampleLogprobs]],
@@ -365,3 +392,184 @@ def test_models_interleaved_images(hf_runner, vllm_runner, image_assets, model,
             num_logprobs=num_logprobs,
             tensor_parallel_size=1,
         )
+
+
+@large_gpu_test(min_gb=48)
+@pytest.mark.core_model
+@pytest.mark.parametrize("model", models)
+@pytest.mark.parametrize("dtype", ["bfloat16"])
+@pytest.mark.parametrize("max_tokens", [128])
+@pytest.mark.parametrize("num_logprobs", [5])
+@pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS)
+def test_regression(vllm_runner, image_assets, model, dtype, max_tokens,
+                    num_logprobs, attn_backend: _Backend) -> None:
+
+    stop_sign = image_assets[0].pil_image
+
+    with global_force_attn_backend_context_manager(attn_backend), vllm_runner(
+            model,
+            dtype=dtype,
+            max_model_len=4096,
+            max_num_seqs=2,
+            tensor_parallel_size=1,
+            enforce_eager=True,
+            limit_mm_per_prompt={"image":
+                                 _LIMIT_IMAGE_PER_PROMPT}) as vllm_model:
+
+        # Regression tests for https://github.com/vllm-project/vllm/issues/10648
+
+        # Number of image tags is greater than the number of images provided
+        prompt = "<|begin_of_text|><|image|><|image|> Compare the two images"  # noqa: E501
+        image = stop_sign
+        with pytest.raises(ValueError):
+            vllm_model.generate_greedy_logprobs([prompt],
+                                                max_tokens,
+                                                num_logprobs,
+                                                images=[image])
+
+        # Batch of a text-only and image request that requires cross-attention
+        prompts = [
+            "What is the capital of spain?",
+            "Text before the image...<|image|>What is in the image?",  # noqa: E501
+        ]
+        images = [
+            None,
+            [stop_sign],
+        ]
+        vllm_model.generate_greedy_logprobs(prompts,
+                                            max_tokens,
+                                            num_logprobs,
+                                            images=images)
+
+        # Test the reverse order too for good measure
+        prompts = [
+            "<|begin_of_text|>Text before the image...<|image|>What is in the image?",  # noqa: E501
+            "<|begin_of_text|>Hello!",
+        ]
+        images = [
+            [stop_sign],
+            None,
+        ]
+        vllm_model.generate_greedy_logprobs(prompts,
+                                            max_tokens,
+                                            num_logprobs,
+                                            images=images)
+
+
+@pytest.mark.core_model
+@pytest.mark.parametrize(
+    "input_indices_and_output",
+    # inputs, (cross_attention_mask, kv_range_for_decode)
+    [([TEXT_ONLY], (None, None)), ([IMAGE_AT_BEG], (None, None)),
+     ([TEXT_ONLY, IMAGE_AT_BEG], (None, None)),
+     ([IMAGE_AT_MIDDLE], ((10, 12), [[0, 6]])),
+     ([TEXT_ONLY, IMAGE_AT_MIDDLE], ((14, 12), [[0, 6]])),
+     ([TEXT_ONLY, IMAGE_AT_BEG, IMAGE_AT_MIDDLE],
+      ((23, 24), [[0, 6], [6, 12]])),
+     ([IMAGE_AT_MIDDLE, TEXT_ONLY], ((14, 12), [[0, 6]])),
+     ([TWO_IMAGES], ((18, 12), [[6, 12]])),
+     ([TEXT_ONLY, TWO_IMAGES], ((22, 12), [[6, 12]]))])
+def test_get_cross_attention_mask(input_indices_and_output) -> None:
+
+    input_indices, expected_output = input_indices_and_output
+
+    sequences = [torch.tensor(prompt_data[i]) for i in input_indices]
+    num_tiles = [[2, 2] if i != TEXT_ONLY else [] for i in input_indices
+                 if i != TEXT_ONLY]
+    input = torch.cat(sequences)
+
+    seq_lens = [len(s) for s in sequences]
+
+    attn_data = FlashAttentionMetadata(
+        seq_lens=seq_lens,
+        # Dummy values
+        enable_kv_scales_calculation=False,
+        num_prefills=0,
+        num_prefill_tokens=0,
+        num_decode_tokens=0,
+        slot_mapping=0,
+        multi_modal_placeholder_index_maps=None,
+        seq_lens_tensor=0,
+        max_prefill_seq_len=0,
+        max_decode_seq_len=0,
+        context_lens_tensor=None,
+        block_tables=None,
+        use_cuda_graph=False,
+    )
+
+    dummy: dict[str, str] = {}
+
+    cross_attention_mask, kv_range_for_decode = MllamaForConditionalGeneration\
+        .get_cross_attention_mask(dummy,
+                                  input,
+                                  attn_data,
+                                  num_tiles=num_tiles,
+                                  num_tokens_per_tile=3,
+                                  dtype=torch.bfloat16)
+
+    expected_cross_attention_mask, expected_kv_range_for_decode = \
+        expected_output
+
+    assert kv_range_for_decode == expected_kv_range_for_decode
+    if expected_cross_attention_mask is not None:
+        assert cross_attention_mask is not None
+        assert cross_attention_mask.shape == expected_cross_attention_mask
+    else:
+        assert cross_attention_mask is None
+
+
+@pytest.mark.core_model
+@pytest.mark.parametrize(
+    "input_indices",
+    [[TEXT_ONLY], [IMAGE_AT_BEG], [TEXT_ONLY, IMAGE_AT_BEG], [IMAGE_AT_MIDDLE],
+     [TEXT_ONLY, IMAGE_AT_MIDDLE], [TEXT_ONLY, IMAGE_AT_BEG, IMAGE_AT_MIDDLE],
+     [IMAGE_AT_MIDDLE, TEXT_ONLY], [TWO_IMAGES], [TEXT_ONLY, TWO_IMAGES]])
+def test_get_full_text_row_masked_out_mask(input_indices) -> None:
+
+    sequences = [torch.tensor(prompt_data[i]) for i in input_indices]
+
+    seq_lens = [len(s) for s in sequences]
+
+    num_prefill_tokens = sum(seq_lens)
+
+    # TEXT_ONLY is zero, so it will be masked out,
+    # other instances should not be.
+    encoder_seq_lens = [int(i) for i in input_indices]
+
+    attn_data = FlashAttentionMetadata(
+        seq_lens=seq_lens,
+        encoder_seq_lens=encoder_seq_lens,
+        num_prefill_tokens=num_prefill_tokens,
+        # Dummy values
+        enable_kv_scales_calculation=False,
+        num_prefills=0,
+        num_decode_tokens=0,
+        slot_mapping=0,
+        multi_modal_placeholder_index_maps=None,
+        seq_lens_tensor=0,
+        max_prefill_seq_len=0,
+        max_decode_seq_len=0,
+        context_lens_tensor=None,
+        block_tables=None,
+        use_cuda_graph=False,
+    )
+
+    dummy: dict[str, str] = {}
+
+    full_text_row_masked_out_mask = MllamaForConditionalGeneration\
+        .get_full_text_row_masked_out_mask(dummy,
+                                  attn_data,
+                                  torch.get_default_device())
+
+    full_text_row_masked_out_mask = full_text_row_masked_out_mask.squeeze()
+    full_text_row_masked_out_mask = full_text_row_masked_out_mask.tolist()
+
+    idx = 0
+    assert len(full_text_row_masked_out_mask) == num_prefill_tokens
+    for i, seq_len in enumerate(seq_lens):
+        must_be_masked = input_indices[i] != TEXT_ONLY
+        for _ in range(seq_len):
+            assert full_text_row_masked_out_mask[idx] == must_be_masked, \
+                f"full_text_row_masked_out_mask[{idx}] must be " \
+                f"'{must_be_masked}' "
+            idx += 1
diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py
index e15ac84a6049b..34b8624647ce6 100644
--- a/vllm/model_executor/models/mllama.py
+++ b/vllm/model_executor/models/mllama.py
@@ -1485,14 +1485,23 @@ def convert_sparse_cross_attention_mask_to_dense(
     total_length = sum(lengths)
     total_tiles = sum([sum(tiles) for tiles in num_tiles])
     dense_mask = np.zeros(shape=(total_length, total_tiles), dtype=np.int64)
-    # A list of ranges, range[i] = [start, end] means
-    # if the i-th sample has N tiles in total, the tiles[start, end]
-    # will be used for cross-attention decoding.
+    # A list of ranges, range[i] = [start, end] means that the i-th image will
+    # use tiles[start, end] for cross-attention decoding.
     tile_range_for_decode = []
 
     seq_start = 0
     tile_start = 0
-    for masks, tiles, length in zip(sparse_mask, num_tiles, lengths):
+
+    # sparse_mask has an [] entry for each sequence that does not have images,
+    # but num_tiles does not have these entries...
+    num_tiles_idx = 0
+    for masks, length in zip(sparse_mask, lengths):
+        if len(masks) == 0:
+            # Text only
+            continue
+
+        tiles = num_tiles[num_tiles_idx]
+        num_tiles_idx += 1
         ts, td = -1, 0
         for mask, tile in zip(masks, tiles):
             if len(mask) != 2:
@@ -1512,6 +1521,7 @@ def convert_sparse_cross_attention_mask_to_dense(
         assert td != 0
         tile_range_for_decode.append((ts, ts + td))
         seq_start += length
+    assert num_tiles_idx == len(num_tiles)
 
     return dense_mask, tile_range_for_decode
 

From d93bf4da855a0c5e8d3c875def6b37c5e9d77763 Mon Sep 17 00:00:00 2001
From: Alphi <52458637+HwwwwwwwH@users.noreply.github.com>
Date: Wed, 29 Jan 2025 17:24:59 +0800
Subject: [PATCH 426/462] [Model] Refactoring of MiniCPM-V and add
 MiniCPM-o-2.6 support for vLLM (#12069)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: hzh <hezhihui_thu@163.com>
Signed-off-by: Sungjae Lee <33976427+llsj14@users.noreply.github.com>
Signed-off-by: shaochangxu.scx <shaochangxu.scx@antgroup.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: NickLucche <nlucches@redhat.com>
Signed-off-by: Isotr0py <2037008807@qq.com>
Signed-off-by: Roger Wang <ywang@roblox.com>
Signed-off-by: Rafael Vasquez <rafvasq21@gmail.com>
Signed-off-by: Akshat Tripathi <akshat@krai.ai>
Signed-off-by: Oleg Mosalov <oleg@krai.ai>
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Signed-off-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com>
Signed-off-by: Yida Wu <yidawu@alumni.cmu.edu>
Signed-off-by: Chenguang Li <757486878@qq.com>
Signed-off-by: youkaichao <youkaichao@gmail.com>
Signed-off-by: Alex-Brooks <Alex.brooks@ibm.com>
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Signed-off-by: Shanshan Shen <467638484@qq.com>
Signed-off-by: elijah <f1renze.142857@gmail.com>
Signed-off-by: Yikun <yikunkero@gmail.com>
Signed-off-by: mgoin <michael@neuralmagic.com>
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Signed-off-by: Konrad Zawora <kzawora@habana.ai>
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
Co-authored-by: Sungjae Lee <33976427+llsj14@users.noreply.github.com>
Co-authored-by: shaochangxu <85155497+shaochangxu@users.noreply.github.com>
Co-authored-by: shaochangxu.scx <shaochangxu.scx@antgroup.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
Co-authored-by: Nicolò Lucchesi <nlucches@redhat.com>
Co-authored-by: sixgod <evethwillbeok@outlook.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
Co-authored-by: Rafael Vasquez <rafvasq21@gmail.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Akshat Tripathi <Akshat.tripathi6568@gmail.com>
Co-authored-by: Oleg Mosalov <oleg@krai.ai>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Avshalom Manevich <12231371+avshalomman@users.noreply.github.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
Co-authored-by: Yangcheng Li <liyangcheng.lyc@alibaba-inc.com>
Co-authored-by: Siyuan Li <94890248+liaoyanqing666@users.noreply.github.com>
Co-authored-by: Concurrensee <yida.wu@amd.com>
Co-authored-by: Chenguang Li <757486878@qq.com>
Co-authored-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: Alex Brooks <alex.brooks@ibm.com>
Co-authored-by: Chen Zhang <zhangch99@outlook.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Shanshan Shen <467638484@qq.com>
Co-authored-by: elijah <30852919+e1ijah1@users.noreply.github.com>
Co-authored-by: Yikun Jiang <yikunkero@gmail.com>
Co-authored-by: Steve Luo <36296769+SunflowerAries@users.noreply.github.com>
Co-authored-by: mgoin <michael@neuralmagic.com>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Co-authored-by: Konrad Zawora <kzawora@habana.ai>
Co-authored-by: TJian <tunjian1996@gmail.com>
Co-authored-by: tjtanaa <tunjian.tan@embeddedllm.com>
Co-authored-by: wangxiyuan <wangxiyuan1007@gmail.com>
Co-authored-by: maang-h <55082429+maang-h@users.noreply.github.com>
Co-authored-by: Elfie Guo <164945471+elfiegg@users.noreply.github.com>
Co-authored-by: Rui Qiao <161574667+ruisearch42@users.noreply.github.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
---
 docs/source/models/supported_models.md        |   9 +-
 examples/offline_inference/audio_language.py  |  32 +-
 examples/offline_inference/vision_language.py |  33 +-
 requirements-cpu.txt                          |   1 +
 requirements-cuda.txt                         |   1 +
 requirements-test.in                          |   3 +
 requirements-test.txt                         |  37 +-
 .../vision_language/test_models.py            |  14 +
 .../vision_language/vlm_utils/model_utils.py  |  11 +
 .../multimodal/processing/test_common.py      |   2 +
 tests/models/registry.py                      |   4 +-
 vllm/entrypoints/chat_utils.py                |   6 +-
 vllm/model_executor/models/minicpmo.py        | 811 +++++++++++++++++
 vllm/model_executor/models/minicpmv.py        | 843 ++++++++++++++----
 vllm/model_executor/models/registry.py        |   1 +
 15 files changed, 1622 insertions(+), 186 deletions(-)
 create mode 100644 vllm/model_executor/models/minicpmo.py

diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md
index 94f4bd6cadabd..afaad8818bdcb 100644
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -693,9 +693,16 @@ See [this page](#generative-models) for more information on how to use generativ
   *
   * ✅︎
   * ✅︎
+- * `MiniCPMO`
+  * MiniCPM-O
+  * T + I<sup>E+</sup> + V<sup>E+</sup> + A<sup>E+</sup>
+  * `openbmb/MiniCPM-o-2_6`, etc.
+  * ✅︎
+  * ✅︎
+  *
 - * `MiniCPMV`
   * MiniCPM-V
-  * T + I<sup>E+</sup>
+  * T + I<sup>E+</sup> + V<sup>E+</sup>
   * `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, etc.
   * ✅︎
   * ✅︎
diff --git a/examples/offline_inference/audio_language.py b/examples/offline_inference/audio_language.py
index 6fd74782a9aae..5952ec13ec3cb 100644
--- a/examples/offline_inference/audio_language.py
+++ b/examples/offline_inference/audio_language.py
@@ -67,7 +67,37 @@ def run_qwen2_audio(question: str, audio_count: int):
     return llm, prompt, stop_token_ids
 
 
-model_example_map = {"ultravox": run_ultravox, "qwen2_audio": run_qwen2_audio}
+def run_minicpmo(question: str, audio_count: int):
+    model_name = "openbmb/MiniCPM-o-2_6"
+    tokenizer = AutoTokenizer.from_pretrained(model_name,
+                                              trust_remote_code=True)
+    llm = LLM(model=model_name,
+              trust_remote_code=True,
+              max_model_len=4096,
+              max_num_seqs=5,
+              limit_mm_per_prompt={"audio": audio_count})
+
+    stop_tokens = ['<|im_end|>', '<|endoftext|>']
+    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
+
+    audio_placeholder = "(<audio>./</audio>)" * audio_count
+    audio_chat_template = "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n<|spk_bos|><|spk|><|spk_eos|><|tts_bos|>' }}{% endif %}"  # noqa: E501
+    messages = [{
+        'role': 'user',
+        'content': f'{audio_placeholder}\n{question}'
+    }]
+    prompt = tokenizer.apply_chat_template(messages,
+                                           tokenize=False,
+                                           add_generation_prompt=True,
+                                           chat_template=audio_chat_template)
+    return llm, prompt, stop_token_ids
+
+
+model_example_map = {
+    "ultravox": run_ultravox,
+    "qwen2_audio": run_qwen2_audio,
+    "minicpmo": run_minicpmo
+}
 
 
 def main(args):
diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py
index 415439e88ed59..38c2b13d3f2c7 100644
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -265,8 +265,9 @@ def run_mantis(question: str, modality: str):
 
 
 # MiniCPM-V
-def run_minicpmv(question: str, modality: str):
-    assert modality == "image"
+def run_minicpmv_base(question: str, modality: str, model_name):
+    assert modality in ["image", "video"]
+    # If you want to use `MiniCPM-o-2_6` with audio inputs, check `audio_language.py` # noqa
 
     # 2.0
     # The official repo doesn't work yet, so we need to use a fork for now
@@ -277,7 +278,15 @@ def run_minicpmv(question: str, modality: str):
     # model_name = "openbmb/MiniCPM-Llama3-V-2_5"
 
     # 2.6
-    model_name = "openbmb/MiniCPM-V-2_6"
+    # model_name = "openbmb/MiniCPM-V-2_6"
+    # o2.6
+
+    # modality supports
+    # 2.0: image
+    # 2.5: image
+    # 2.6: image, video
+    # o2.6: image, video, audio
+    # model_name = "openbmb/MiniCPM-o-2_6"
     tokenizer = AutoTokenizer.from_pretrained(model_name,
                                               trust_remote_code=True)
     llm = LLM(
@@ -294,13 +303,18 @@ def run_minicpmv(question: str, modality: str):
     # 2.5
     # stop_token_ids = [tokenizer.eos_id, tokenizer.eot_id]
 
-    # 2.6
+    # 2.6 / o2.6
     stop_tokens = ['<|im_end|>', '<|endoftext|>']
     stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
 
+    modality_placeholder = {
+        "image": "(<image>./</image>)",
+        "video": "(<video>./</video>)",
+    }
+
     messages = [{
         'role': 'user',
-        'content': f'(<image>./</image>)\n{question}'
+        'content': f'{modality_placeholder[modality]}\n{question}'
     }]
     prompt = tokenizer.apply_chat_template(messages,
                                            tokenize=False,
@@ -308,6 +322,14 @@ def run_minicpmv(question: str, modality: str):
     return llm, prompt, stop_token_ids
 
 
+def run_minicpmo(question: str, modality: str):
+    return run_minicpmv_base(question, modality, "openbmb/MiniCPM-o-2_6")
+
+
+def run_minicpmv(question: str, modality: str):
+    return run_minicpmv_base(question, modality, "openbmb/MiniCPM-V-2_6")
+
+
 # LLama 3.2
 def run_mllama(question: str, modality: str):
     assert modality == "image"
@@ -523,6 +545,7 @@ def run_qwen2_vl(question: str, modality: str):
     "llava-next-video": run_llava_next_video,
     "llava-onevision": run_llava_onevision,
     "mantis": run_mantis,
+    "minicpmo": run_minicpmo,
     "minicpmv": run_minicpmv,
     "mllama": run_mllama,
     "molmo": run_molmo,
diff --git a/requirements-cpu.txt b/requirements-cpu.txt
index 056fbf5a7adec..ed0d2c9fae0b6 100644
--- a/requirements-cpu.txt
+++ b/requirements-cpu.txt
@@ -4,5 +4,6 @@
 # Dependencies for CPUs
 torch==2.5.1+cpu; platform_machine != "ppc64le" and platform_machine != "aarch64" and platform_system != "Darwin"
 torch==2.5.1; platform_machine == "aarch64" or platform_system == "Darwin" 
+torchaudio; platform_machine != "ppc64le"  # required for the image processor of minicpm-o-2_6, this must be updated alongside torch
 torchvision; platform_machine != "ppc64le"  # required for the image processor of phi3v, this must be updated alongside torch
 datasets # for benchmark scripts
diff --git a/requirements-cuda.txt b/requirements-cuda.txt
index 8002fbd8ee5b9..78fa360f2dc96 100644
--- a/requirements-cuda.txt
+++ b/requirements-cuda.txt
@@ -5,6 +5,7 @@
 ray[default] >= 2.9
 nvidia-ml-py >= 12.560.30 # for pynvml package
 torch == 2.5.1
+torchaudio==2.5.1
 # These must be updated alongside torch
 torchvision == 0.20.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
 xformers == 0.0.28.post3; platform_system == 'Linux' and platform_machine == 'x86_64'  # Requires PyTorch 2.5.1
diff --git a/requirements-test.in b/requirements-test.in
index bc76a91ad5356..13ad17b256734 100644
--- a/requirements-test.in
+++ b/requirements-test.in
@@ -12,6 +12,8 @@ decord # required for video tests
 einops # required for MPT, qwen-vl and Mamba
 httpx
 librosa # required for audio tests
+vector_quantize_pytorch # required for minicpmo_26 test
+vocos # required for minicpmo_26 test
 peft
 pqdm
 ray[adag]==2.40.0
@@ -19,6 +21,7 @@ sentence-transformers # required for embedding tests
 soundfile # required for audio tests
 timm # required for internvl test
 torch==2.5.1
+torchaudio==2.5.1
 transformers_stream_generator # required for qwen-vl test
 matplotlib # required for qwen-vl test
 mistral_common[opencv] >= 1.5.0 # required for pixtral test
diff --git a/requirements-test.txt b/requirements-test.txt
index 09e009c2e21f4..df7e904bb0d34 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -106,9 +106,17 @@ dnspython==2.7.0
 docutils==0.16
     # via awscli
 einops==0.8.0
-    # via -r requirements-test.in
+    # via
+    #   -r requirements-test.in
+    #   encodec
+    #   vector-quantize-pytorch
+    #   vocos
+einx==0.3.0
+    # via vector-quantize-pytorch
 email-validator==2.2.0
     # via pydantic
+encodec==0.1.1
+    # via vocos
 evaluate==0.4.3
     # via lm-eval
 fastparquet==2024.11.0
@@ -125,6 +133,8 @@ filelock==3.16.1
     #   triton
 fonttools==4.54.1
     # via matplotlib
+frozendict==2.4.6
+    # via einx
 frozenlist==1.5.0
     # via
     #   aiohttp
@@ -159,6 +169,7 @@ huggingface-hub==0.26.2
     #   timm
     #   tokenizers
     #   transformers
+    #   vocos
 idna==3.10
     # via
     #   anyio
@@ -261,6 +272,8 @@ numpy==1.26.4
     #   cupy-cuda12x
     #   datasets
     #   decord
+    #   einx
+    #   encodec
     #   evaluate
     #   fastparquet
     #   genai-perf
@@ -283,6 +296,7 @@ numpy==1.26.4
     #   torchvision
     #   transformers
     #   tritonclient
+    #   vocos
 nvidia-cublas-cu12==12.4.5.8
     # via
     #   nvidia-cudnn-cu12
@@ -455,6 +469,7 @@ pyyaml==6.0.2
     #   responses
     #   timm
     #   transformers
+    #   vocos
 ray[adag]==2.40.0
     # via -r requirements-test.in
 redis==5.2.0
@@ -517,6 +532,7 @@ scipy==1.13.1
     #   scikit-learn
     #   sentence-transformers
     #   statsmodels
+    #   vocos
 sentence-transformers==3.2.1
     # via -r requirements-test.in
 sentencepiece==0.2.0
@@ -540,7 +556,9 @@ sqlitedict==2.1.0
 statsmodels==0.14.4
     # via genai-perf
 sympy==1.13.1
-    # via torch
+    # via
+    #   einx
+    #   torch
 tabledata==1.3.3
     # via pytablewriter
 tabulate==0.9.0
@@ -568,12 +586,21 @@ torch==2.5.1
     #   -r requirements-test.in
     #   accelerate
     #   bitsandbytes
+    #   encodec
     #   lm-eval
     #   peft
     #   sentence-transformers
     #   tensorizer
     #   timm
+    #   torchaudio
     #   torchvision
+    #   vector-quantize-pytorch
+    #   vocos
+torchaudio==2.5.1
+    # via
+    #   -r requirements-test.in
+    #   encodec
+    #   vocos
 torchvision==0.20.1
     # via timm
 tqdm==4.66.6
@@ -584,6 +611,7 @@ tqdm==4.66.6
     #   lm-eval
     #   nltk
     #   peft
+    #   pqdm
     #   sentence-transformers
     #   tqdm-multiprocess
     #   transformers
@@ -615,6 +643,7 @@ typing-extensions==4.12.2
     #   huggingface-hub
     #   librosa
     #   mistral-common
+    #   pqdm
     #   pydantic
     #   pydantic-core
     #   torch
@@ -626,6 +655,10 @@ urllib3==2.2.3
     #   requests
     #   responses
     #   tritonclient
+vector-quantize-pytorch==1.21.2
+    # via -r requirements-test.in
+vocos==0.1.0
+    # via -r requirements-test.in
 word2number==1.1
     # via lm-eval
 xxhash==3.5.0
diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py
index d5f0d63288cc1..62c644f73d62d 100644
--- a/tests/models/decoder_only/vision_language/test_models.py
+++ b/tests/models/decoder_only/vision_language/test_models.py
@@ -350,6 +350,20 @@
         postprocess_inputs=model_utils.wrap_inputs_post_processor,
         hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
     ),
+    "minicpmo_26": VLMTestInfo(
+        models=["openbmb/MiniCPM-o-2_6"],
+        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
+        prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",  # noqa: E501
+        img_idx_to_prompt=lambda idx: "(<image>./</image>)\n",
+        max_model_len=4096,
+        max_num_seqs=2,
+        get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']),  # noqa: E501
+        postprocess_inputs=model_utils.ignore_inputs_post_processor(
+            "image_sizes"
+        ),
+        hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
+        patch_hf_runner=model_utils.minicpmo_patch_hf_runner
+    ),
     "minicpmv_26": VLMTestInfo(
         models=["openbmb/MiniCPM-V-2_6"],
         test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
diff --git a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
index 1ca85c7bb2056..07bdb2cee44d2 100644
--- a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
@@ -497,6 +497,17 @@ def _generate(self, *args, **kwargs):
     return hf_model
 
 
+def minicpmo_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
+    orig_generate = hf_model.model.generate
+
+    def _generate(self, *args, **kwargs):
+        return orig_generate(*args, decode_text=False, **kwargs)
+
+    hf_model.model.generate = types.MethodType(_generate, hf_model.model)
+
+    return hf_model
+
+
 def _generate_greedy_logprobs_limit(
     self,
     prompts: List[str],
diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py
index b575ec6acbef3..ca28da268fa05 100644
--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
@@ -152,6 +152,8 @@ def _test_processing_correctness(
     "llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
     "TIGER-Lab/Mantis-8B-siglip-llama3",
     "mistral-community/pixtral-12b",
+    "openbmb/MiniCPM-o-2_6",
+    "openbmb/MiniCPM-V-2_6",
     "Qwen/Qwen-VL-Chat",
     "Qwen/Qwen2-VL-2B-Instruct",
     "Qwen/Qwen2-Audio-7B-Instruct",
diff --git a/tests/models/registry.py b/tests/models/registry.py
index 0bd06dea0ec7f..7952e65aa76a5 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -245,7 +245,9 @@ def check_available_online(
     "LlavaOnevisionForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-onevision-qwen2-0.5b-ov-hf"),  # noqa: E501
     "MantisForConditionalGeneration": _HfExamplesInfo("TIGER-Lab/Mantis-8B-siglip-llama3",  # noqa: E501
                                                       hf_overrides={"architectures": ["MantisForConditionalGeneration"]}),  # noqa: E501
-    "MiniCPMV": _HfExamplesInfo("openbmb/MiniCPM-Llama3-V-2_5",
+    "MiniCPMO": _HfExamplesInfo("openbmb/MiniCPM-o-2_6",
+                                trust_remote_code=True),
+    "MiniCPMV": _HfExamplesInfo("openbmb/MiniCPM-V-2_6",
                                 trust_remote_code=True),
     "MolmoForCausalLM": _HfExamplesInfo("allenai/Molmo-7B-D-0924",
                                         trust_remote_code=True),
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index 723d6e9085806..97d2561df602a 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -392,7 +392,7 @@ def _placeholder_str(self, modality: ModalityStr,
             if model_type == "phi3_v":
                 # Workaround since this token is not defined in the tokenizer
                 return f"<|image_{current_count}|>"
-            if model_type == "minicpmv":
+            if model_type in ("minicpmo", "minicpmv"):
                 return "(<image>./</image>)"
             if model_type in ("blip-2", "chatglm", "fuyu", "paligemma",
                               "pixtral"):
@@ -424,10 +424,14 @@ def _placeholder_str(self, modality: ModalityStr,
             if model_type == "qwen2_audio":
                 return (f"Audio {current_count}: "
                         f"<|audio_bos|><|AUDIO|><|audio_eos|>")
+            if model_type == "minicpmo":
+                return "(<audio>./</audio>)"
             raise TypeError(f"Unknown model type: {model_type}")
         elif modality == "video":
             if model_type == "qwen2_vl":
                 return "<|vision_start|><|video_pad|><|vision_end|>"
+            if model_type in ("minicpmo", "minicpmv"):
+                return "(<video>./</video>)"
             if model_type.startswith("llava"):
                 return self._cached_token_str(self._tokenizer,
                                               hf_config.video_token_index)
diff --git a/vllm/model_executor/models/minicpmo.py b/vllm/model_executor/models/minicpmo.py
new file mode 100644
index 0000000000000..eb4282d62005a
--- /dev/null
+++ b/vllm/model_executor/models/minicpmo.py
@@ -0,0 +1,811 @@
+# Adapted from
+# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
+# Copyright 2023 The vLLM team.
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Inference-only MiniCPM-O model compatible with HuggingFace weights."""
+from functools import partial
+from itertools import accumulate
+from typing import (Any, Dict, Iterable, List, Literal, Mapping, Optional, Set,
+                    Tuple, TypedDict, Union)
+
+import torch
+import torch.types
+from torch import nn
+from transformers.modeling_outputs import BaseModelOutputWithPast
+from transformers.models.whisper.modeling_whisper import (
+    ACT2FN, WHISPER_ATTENTION_CLASSES, WhisperConfig, WhisperEncoder)
+
+from vllm.attention import AttentionMetadata
+from vllm.config import VllmConfig
+from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
+from vllm.multimodal.inputs import MultiModalFieldConfig
+from vllm.multimodal.parse import (ModalityData, ModalityDataItems,
+                                   MultiModalDataItems, MultiModalDataParser,
+                                   VideoItem)
+from vllm.multimodal.processing import (BaseMultiModalProcessor,
+                                        PromptReplacement)
+from vllm.multimodal.profiling import ProcessorInputs
+from vllm.sequence import IntermediateTensors
+
+from .minicpmv import (MiniCPMV2_6, MiniCPMVDummyInputsBuilder,
+                       MiniCPMVEmbeddingItems, MiniCPMVMultiModalDataParser,
+                       MiniCPMVMultiModalProcessor, MiniCPMVProcessingInfo)
+from .utils import AutoWeightsLoader, maybe_prefix
+
+CPU_DEVICE = torch.device("cpu")
+
+MiniCPMOEmbeddingItems = MiniCPMVEmbeddingItems
+
+
+class MiniCPMOAudioFeatureInputs(TypedDict):
+    type: Literal["audio_features"]
+    data: torch.Tensor
+    """
+    Shape: `(batch_size * num_audios * num_slices, num_channels, length)`
+    Slice here means chunk. Audio that is too long will be split into slices,
+    which is the same as image.
+    Padding is used therefore `data` is `torch.Tensor`.
+    """
+
+    audio_feature_lens: torch.Tensor
+    """
+    Shape: `(batch_size * num_audios * num_slices)`
+
+    This should be feature length of each audio slice, 
+    which equals to `data.shape[-1]`
+    """
+
+    audio_bounds: torch.Tensor
+    """
+    Shape: `(batch_size * num_audios * num_slices, 2)`
+
+    This should be in `(start, stop)` format.
+    """
+
+
+class MiniCPMOAudioEmbeddingInputs(TypedDict):
+    type: Literal["audio_embeds"]
+    data: List[torch.Tensor]
+    """
+    Shape: `(batch_size * num_images * num_slices, hidden_size)`
+
+    `hidden_size` must match the hidden size of language model backbone.
+    instead of a batched tensor.
+    Length of each slice may vary, so pass it as a list.
+    """
+    audio_bounds: torch.Tensor
+    """
+    Shape: `(batch_size * num_audios * num_slices, 2)`
+
+    This should be in `(start, stop)` format.
+    """
+
+
+MiniCPMOAudioInputs = Union[MiniCPMOAudioFeatureInputs,
+                            MiniCPMOAudioEmbeddingInputs]
+
+
+class MiniCPMOAudioEmbeddingItems(MiniCPMOEmbeddingItems):
+
+    def __init__(self, data: Dict) -> None:
+        super().__init__(data, "audio")
+        audio_embeds = self.data.get("audio_embeds", None)
+        if audio_embeds is None:
+            raise ValueError("Incorrect type of video_embeds",
+                             "Got type: None")
+        self.data["audio_embeds"] = audio_embeds
+
+    def get(self, index: int) -> object:
+        return self.data["audio_embeds"][index]
+
+
+class MiniCPMOMultiModalDataParser(MiniCPMVMultiModalDataParser):
+
+    def _parse_audio_data(
+        self,
+        data: Union[dict[str, torch.Tensor], ModalityData[VideoItem]],
+    ) -> ModalityDataItems[Any, Any]:
+        if isinstance(data, dict):
+            return MiniCPMOAudioEmbeddingItems(data)
+        return super()._parse_audio_data(data)
+
+
+class MiniCPMOProcessingInfo(MiniCPMVProcessingInfo):
+    audio_pattern = "(<audio>./</audio>)"
+
+    def get_supported_mm_modalities(self) -> List[str]:
+        return ["image", "video", "audio"]
+
+    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+        return {"image": None, "video": None, "audio": None}
+
+    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
+        return {
+            "image": self.get_max_image_tokens(),
+            "audio": self.get_max_audio_tokens(),
+            "video": self.get_max_video_tokens(seq_len)
+        }
+
+    def get_default_audio_pool_step(self) -> int:
+        return 2
+
+    def get_default_audio_sampling_rate(self) -> int:
+        return 16000
+
+    def get_chunk_length(self) -> int:
+        return self.get_hf_config().audio_chunk_length
+
+    def get_max_audio_tokens_per_chunk(self) -> int:
+        pool_step = self.get_default_audio_pool_step()
+        fbank_feat_in_chunk = 100
+        cnn_feat_in_chunk = (fbank_feat_in_chunk - 1) // 2 + 1
+        num_audio_tokens = (cnn_feat_in_chunk - pool_step) // pool_step + 1
+        return num_audio_tokens + 2  # <audio>(<unk>*N)</audio>
+
+    def get_max_audio_chunks_with_most_features(self) -> int:
+        return 30
+
+    def get_audio_len_by_num_chunks(self, num_chunks: int) -> int:
+        sampling_rate = self.get_default_audio_sampling_rate()
+        # exclude <audio> </audio>
+        num_tokens_per_chunk = self.get_max_audio_tokens_per_chunk() - 2
+        return int(num_chunks * sampling_rate / num_tokens_per_chunk) + 1
+
+    def get_num_frames_with_most_features(self, seq_len: int) -> int:
+        mm_config = self.ctx.get_mm_config()
+        max_images = mm_config.limit_per_prompt.get("image", 1)
+        max_videos = mm_config.limit_per_prompt.get("video", 1)
+        max_audios = mm_config.limit_per_prompt.get("audio", 1)
+
+        # count <image_idx></image_idx> tokens
+        # which are not in get_max_image_tokens
+        max_image_tokens = self.get_max_image_tokens(
+        ) * max_images + 4 * max_images
+        max_audio_tokens = self.get_max_audio_tokens(
+        ) * max_audios + 2 * max_audios
+        max_total_frames = self.get_max_video_frames(seq_len -
+                                                     max_image_tokens -
+                                                     max_audio_tokens)
+
+        num_frames = max(max_total_frames // max(max_videos, 1), 1)
+
+        return num_frames
+
+
+class MiniCPMODummyInputsBuilder(MiniCPMVDummyInputsBuilder):
+
+    def get_dummy_processor_inputs(
+            self, seq_len: int, mm_counts: Mapping[str,
+                                                   int]) -> ProcessorInputs:
+        num_audios = mm_counts.get("audio", 0)
+        audio_len = self.info.get_max_audio_chunks_with_most_features() * \
+            self.info.get_default_audio_sampling_rate()
+
+        processor_inputs = super().get_dummy_processor_inputs(
+            seq_len, mm_counts)
+        mm_data = {
+            "image":
+            processor_inputs.mm_data["image"],
+            "video":
+            processor_inputs.mm_data["video"],
+            "audio":
+            self._get_dummy_audios(length=audio_len, num_audios=num_audios)
+        }
+
+        audio_prompt_texts = self.info.audio_pattern * num_audios
+
+        return ProcessorInputs(prompt_text=processor_inputs.prompt_text + \
+                               audio_prompt_texts,
+                               mm_data=mm_data)
+
+
+class MiniCPMOMultiModalProcessor(
+        MiniCPMVMultiModalProcessor,
+        BaseMultiModalProcessor[MiniCPMOProcessingInfo]):
+
+    def _get_data_parser(self) -> MultiModalDataParser:
+        return MiniCPMOMultiModalDataParser(
+            target_sr=self.info.get_default_audio_sampling_rate())
+
+    def get_audio_prompt_texts(self,
+                               audio_lens: int,
+                               chunk_input: bool = True,
+                               chunk_length: int = 1) -> str:
+        return self.info.get_hf_processor().get_audio_placeholder(
+            audio_lens, chunk_input, chunk_length)
+
+    def get_special_tokens(self) -> Dict[str, torch.Tensor]:
+        tokenizer = self.info.get_tokenizer()
+        special_tokens = super().get_special_tokens()
+        if hasattr(tokenizer, "audio_start_id"):
+            special_tokens["audio_start_id"] = torch.tensor(
+                tokenizer.audio_start_id)
+            special_tokens["audio_end_id"] = torch.tensor(
+                tokenizer.audio_end_id)
+        return special_tokens
+
+    def process_audios(self, mm_data: Mapping[str, object],
+                       mm_kwargs: Mapping[str, object]) -> Dict[str, object]:
+        audios = mm_data.pop("audios", [])
+        audio_embeds = mm_data.pop("audio_embeds", [])
+        if isinstance(audios, (list, torch.Tensor)) and len(audios) > 0:
+            audio_outputs = {
+                "audio_lens": [],
+                "audio_features": [],
+                "audio_feature_lens": [],
+                "audio_num_segments": []
+            }
+            for audio in audios:
+                single_audio_outputs = super().call_base_hf_processor(
+                    prompt=self.info.audio_pattern,
+                    mm_data={
+                        "audios": audio,
+                        "chunk_input": True
+                    },
+                    mm_kwargs=mm_kwargs)
+                audio_outputs["audio_lens"].append(len(audio))
+                audio_outputs["audio_features"].append(
+                    single_audio_outputs["audio_features"])
+                audio_outputs["audio_num_segments"].append(
+                    len(single_audio_outputs["audio_feature_lens"][0]))
+                audio_outputs["audio_feature_lens"] += \
+                    single_audio_outputs["audio_feature_lens"]
+            audio_outputs["audio_features"] = [
+                audio_feature for single_audio_features in \
+                    audio_outputs["audio_features"]
+                for audio_feature in single_audio_features
+            ]
+            audio_outputs["audio_feature_lens"] = torch.cat(
+                audio_outputs["audio_feature_lens"])
+        elif len(audio_embeds):
+            audio_outputs = {
+                "audio_lens": [
+                    self.info.get_audio_len_by_num_chunks(
+                        sum(chunk_embeds.shape[0]
+                            for chunk_embeds in single_audio_embeds))
+                    for single_audio_embeds in audio_embeds
+                ],
+                "audio_embeds": [
+                    chunk_embeds for single_audio_embeds in audio_embeds
+                    for chunk_embeds in single_audio_embeds
+                ],
+                "audio_num_segments": [
+                    len(single_audio_embeds)
+                    for single_audio_embeds in audio_embeds
+                ]
+            }
+        else:
+            audio_outputs = {}
+        return audio_outputs
+
+    def get_placeholder_match_pattern(self) -> str:
+        return r"\(<(image|video|audio)>./</\1>\)"
+
+    def get_placeholder_split_pattern(self) -> str:
+        return r"\(<(?:image|video|audio)>./</(?:image|video|audio)>\)"
+
+    def process_mm_inputs(self, mm_data, mm_kwargs) -> object:
+        return {
+            "image": self.process_images(mm_data, mm_kwargs),
+            "video": self.process_videos(mm_data, mm_kwargs),
+            "audio": self.process_audios(mm_data, mm_kwargs)
+        }
+
+    def get_modality_num_counter(self, modality: str) -> str:
+        if modality == "audio":
+            return "audio_lens"
+        return super().get_modality_num_counter(modality)
+
+    def get_num_slices_by_modality(self, inputs: Dict[str, object],
+                                   modality: str, index: int) -> int:
+        if modality == "audio":
+            return inputs["audio"]["audio_num_segments"][index]
+        return super().get_num_slices_by_modality(inputs, modality, index)
+
+    def get_prompt_texts_by_modality(self, inputs: Dict[str, object],
+                                     modality: str, index: int) -> str:
+        if modality == "audio":
+            return self.get_audio_prompt_texts(
+                inputs["audio"]["audio_lens"][index])
+        return super().get_prompt_texts_by_modality(inputs, modality, index)
+
+    def _get_prompt_replacements(
+            self, mm_items: MultiModalDataItems,
+            hf_processor_mm_kwargs: Mapping[str, Any],
+            out_mm_kwargs: MultiModalKwargs) -> List[PromptReplacement]:
+        placeholder = {
+            "image": self.info.image_pattern,
+            "video": self.info.video_pattern,
+            "audio": self.info.audio_pattern
+        }
+
+        def get_replacement_minicpmv(item_idx: int, modality: str):
+            if modality == "image":
+                return self.get_image_prompt_texts(
+                    mm_items["image"].get_image_size(item_idx), item_idx)
+            elif modality == "video":
+                return self.get_video_prompt_texts(
+                    mm_items["video"].get_frame_size(item_idx),
+                    mm_items["video"].get_num_frames(item_idx))
+            else:  # audio
+                if isinstance(mm_items["audio"], MiniCPMOAudioEmbeddingItems):
+                    single_audio_embeds = mm_items["audio"].get(item_idx)
+                    audio_len = self.info.get_audio_len_by_num_chunks(
+                        sum(chunk_embeds.shape[0]
+                            for chunk_embeds in single_audio_embeds))
+                    return self.get_audio_prompt_texts(audio_len)
+                return self.get_audio_prompt_texts(
+                    len(mm_items["audio"].get(item_idx)))
+
+        return [
+            PromptReplacement(modality=modality,
+                              target=placeholder[modality],
+                              replacement=partial(get_replacement_minicpmv,
+                                                  modality=modality))
+            for modality in ("image", "video", "audio")
+        ]
+
+    def _get_mm_fields_config(
+        self,
+        hf_inputs,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+
+        def get_slices(num_slices: List[int]) -> List[int]:
+            slice_indices = [0] + list(accumulate(num_slices))
+            slices = [(slice_indices[i], slice_indices[i + 1])
+                      for i in range(len(num_slices))]
+            return [slice(*slice_item) for slice_item in slices]
+
+        audio_slices = get_slices(
+            hf_inputs.get("audio_num_slices", torch.empty(0)))
+        return dict(
+            **super()._get_mm_fields_config(hf_inputs, hf_processor_mm_kwargs),
+            audio_features=MultiModalFieldConfig.flat("audio", audio_slices),
+            audio_feature_lens=MultiModalFieldConfig.flat(
+                "audio", audio_slices),
+            audio_num_slices=MultiModalFieldConfig.batched("audio"),
+            audio_orders_in_mm_data=MultiModalFieldConfig.batched("audio"),
+            audio_embeds=MultiModalFieldConfig.flat("audio", audio_slices))
+
+
+class MultiModalProjector(nn.Module):
+
+    def __init__(self, in_dim: int, out_dim: int):
+        super().__init__()
+        self.linear1 = nn.Linear(in_features=in_dim,
+                                 out_features=out_dim,
+                                 bias=True)
+        self.relu = nn.ReLU()
+        self.linear2 = nn.Linear(in_features=out_dim,
+                                 out_features=out_dim,
+                                 bias=True)
+
+    def forward(self, audio_features: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.relu(self.linear1(audio_features))
+        hidden_states = self.linear2(hidden_states)
+        return hidden_states
+
+
+class MiniCPMWhisperEncoderLayer(nn.Module):
+
+    def __init__(self, config: WhisperConfig, layer_idx: int = None):
+        super().__init__()
+        self.embed_dim = config.d_model
+        self.self_attn = WHISPER_ATTENTION_CLASSES[
+            config._attn_implementation](
+                embed_dim=self.embed_dim,
+                num_heads=config.encoder_attention_heads,
+                dropout=config.attention_dropout,
+                config=config,
+                layer_idx=layer_idx,
+            )
+        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+        self.dropout = config.dropout
+        self.activation_fn = ACT2FN[config.activation_function]
+        self.activation_dropout = config.activation_dropout
+        self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
+        self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
+        self.final_layer_norm = nn.LayerNorm(self.embed_dim)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor,
+    ) -> torch.Tensor:
+        residual = hidden_states
+        past_key_values = None
+        hidden_states = self.self_attn_layer_norm(hidden_states)
+        hidden_states, attn_weights, past_key_values = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            past_key_value=past_key_values,
+        )
+        hidden_states = nn.functional.dropout(hidden_states,
+                                              p=self.dropout,
+                                              training=self.training)
+        hidden_states = residual + hidden_states
+
+        residual = hidden_states
+        hidden_states = self.final_layer_norm(hidden_states)
+        hidden_states = self.activation_fn(self.fc1(hidden_states))
+        hidden_states = nn.functional.dropout(hidden_states,
+                                              p=self.activation_dropout,
+                                              training=self.training)
+        hidden_states = self.fc2(hidden_states)
+        hidden_states = nn.functional.dropout(hidden_states,
+                                              p=self.dropout,
+                                              training=self.training)
+        hidden_states = residual + hidden_states
+
+        if hidden_states.dtype == torch.float16 and (
+                torch.isinf(hidden_states).any()
+                or torch.isnan(hidden_states).any()):
+            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
+            hidden_states = torch.clamp(hidden_states,
+                                        min=-clamp_value,
+                                        max=clamp_value)
+
+        outputs = (hidden_states, )
+
+        return outputs
+
+
+class MiniCPMWhisperEncoder(WhisperEncoder):
+
+    def __init__(self, config: WhisperConfig):
+        super().__init__(config)
+        self.layers = nn.ModuleList([
+            MiniCPMWhisperEncoderLayer(config, layer_idx=i)
+            for i in range(config.encoder_layers)
+        ])
+
+    def forward(
+        self,
+        input_features: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+    ) -> BaseModelOutputWithPast:
+        # Ignore copy
+        input_features = input_features.to(dtype=self.conv1.weight.dtype,
+                                           device=self.conv1.weight.device)
+
+        inputs_embeds = nn.functional.gelu(self.conv1(input_features))
+        inputs_embeds = nn.functional.gelu(self.conv2(inputs_embeds))
+
+        inputs_embeds = inputs_embeds.permute(0, 2, 1)
+
+        embed_pos = self.embed_positions.weight
+
+        embed_pos = embed_pos[:inputs_embeds.shape[1], :]
+
+        hidden_states = inputs_embeds + embed_pos
+        hidden_states = nn.functional.dropout(hidden_states,
+                                              p=self.dropout,
+                                              training=self.training)
+
+        encoder_states = ()
+
+        for idx, encoder_layer in enumerate(self.layers):
+            encoder_states = encoder_states + (hidden_states, )
+            to_drop = False
+            if self.training:
+                dropout_probability = torch.rand([])
+                if dropout_probability < self.layerdrop:  # skip the layer
+                    to_drop = True
+
+            # Ignore copy
+            if to_drop:
+                layer_outputs = (None, None)
+            else:
+                layer_outputs = encoder_layer(
+                    hidden_states,
+                    attention_mask,
+                )
+
+                hidden_states = layer_outputs[0]
+
+        hidden_states = self.layer_norm(hidden_states)
+        encoder_states = encoder_states + (hidden_states, )
+
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            hidden_states=encoder_states,
+        )
+
+
+@MULTIMODAL_REGISTRY.register_processor(
+    MiniCPMOMultiModalProcessor,
+    info=MiniCPMOProcessingInfo,
+    dummy_inputs=MiniCPMODummyInputsBuilder)
+class MiniCPMO(MiniCPMV2_6):
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+    }
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__(vllm_config=vllm_config, prefix=prefix)
+        self.apm = self.init_audio_module(vllm_config=vllm_config,
+                                          prefix=maybe_prefix(prefix, "apm"))
+
+    def init_audio_module(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        # Do not use parameters temporarily
+        audio_config = self.config.audio_config
+        model = MiniCPMWhisperEncoder(audio_config)
+        audio_output_dim = int(audio_config.encoder_ffn_dim // 4)
+        self.audio_avg_pooler = \
+            nn.AvgPool1d(self.config.audio_pool_step,
+                         stride=self.config.audio_pool_step)
+        self.audio_projection_layer = \
+            MultiModalProjector(in_dim=audio_output_dim,out_dim=self.embed_dim)
+        self.audio_encoder_layer = -1
+        return model
+
+    def load_weights(self, weights: Iterable[Tuple[str,
+                                                   torch.Tensor]]) -> Set[str]:
+        loader = AutoWeightsLoader(self, skip_prefixes=["tts"])
+        return loader.load_weights(weights)
+
+    def subsequent_chunk_mask(
+        self,
+        size: int,
+        chunk_size: int,
+        num_left_chunks: int = -1,
+        device: torch.device = CPU_DEVICE,
+        num_lookhead: int = 0,
+    ) -> torch.Tensor:
+        ret = torch.zeros(size, size, device=device, dtype=torch.bool)
+        for i in range(size):
+            if num_left_chunks < 0:
+                start = 0
+            else:
+                start = max((i // chunk_size - num_left_chunks) * chunk_size,
+                            0)
+            ending = min((i // chunk_size + 1) * chunk_size + num_lookhead,
+                         size)
+            ret[i, start:ending] = True
+        return ret
+
+    def _get_feat_extract_output_lengths(self,
+                                         input_lengths: torch.LongTensor):
+        input_lengths_after_cnn = (input_lengths - 1) // 2 + 1
+        input_lengths_after_pooling = (
+            input_lengths_after_cnn -
+            self.config.audio_pool_step) // self.config.audio_pool_step + 1
+        input_lengths_after_pooling = input_lengths_after_pooling.to(
+            dtype=torch.int32)
+
+        return input_lengths_after_cnn, input_lengths_after_pooling
+
+    # Copied from HF repo of MiniCPM-o-2_6,
+    # designed for batched inputs and outputs
+    def get_audio_hidden_states(self, data: MiniCPMOAudioInputs,
+                                chunk_length: int) -> torch.Tensor:
+        wavforms = data.get(
+            "data",
+            [])  # (bs, 80, frames) or [], multi audios need filled in advance
+        audio_feature_lens_raw = [data.get("audio_feature_lens",
+                                           [])]  # list, [[x1, x2], [y1], [z1]]
+
+        # exist audio
+        if len(wavforms) > 0:
+            audio_feature_lens = torch.hstack(audio_feature_lens_raw)
+            batch_size, _, max_mel_seq_len = wavforms.shape
+            max_seq_len = (max_mel_seq_len - 1) // 2 + 1
+
+            # Create a sequence tensor of shape (batch_size, max_seq_len)
+            seq_range = (torch.arange(
+                0,
+                max_seq_len,
+                dtype=audio_feature_lens.dtype,
+                device=audio_feature_lens.device).unsqueeze(0).expand(
+                    batch_size, max_seq_len))
+            lengths_expand = audio_feature_lens.unsqueeze(1).expand(
+                batch_size, max_seq_len)
+            # Create mask
+            padding_mask = seq_range >= lengths_expand  # 1 for padded values
+
+            audio_attention_mask_ = padding_mask.view(
+                batch_size, 1, 1, max_seq_len).expand(batch_size, 1,
+                                                      max_seq_len, max_seq_len)
+            audio_attention_mask = audio_attention_mask_.to(
+                dtype=self.apm.conv1.weight.dtype,
+                device=self.apm.conv1.weight.device)
+
+            if chunk_length > 0:
+                chunk_num_frame = int(chunk_length * 50)
+                chunk_mask = self.subsequent_chunk_mask(
+                    size=max_seq_len,
+                    chunk_size=chunk_num_frame,
+                    num_left_chunks=-1,
+                    device=audio_attention_mask_.device,
+                )
+                audio_attention_mask_ = torch.logical_or(
+                    audio_attention_mask_, torch.logical_not(chunk_mask))
+
+            audio_attention_mask[audio_attention_mask_] = float("-inf")
+            audio_states = self.apm(
+                wavforms, attention_mask=audio_attention_mask).hidden_states[
+                    self.audio_encoder_layer]
+            audio_embeds = self.audio_projection_layer(audio_states)
+
+            audio_embeds = audio_embeds.transpose(1, 2)
+            audio_embeds = self.audio_avg_pooler(audio_embeds)
+            audio_embeds = audio_embeds.transpose(1, 2)
+
+            _, feature_lens_after_pooling = \
+                self._get_feat_extract_output_lengths(audio_feature_lens)
+
+            num_audio_tokens = feature_lens_after_pooling
+
+            final_audio_embeds = []
+            idx = 0
+            for i in range(len(audio_feature_lens_raw)):
+                target_audio_embeds = []
+                for _ in range(len(audio_feature_lens_raw[i])):
+                    target_audio_embeds.append(
+                        audio_embeds[idx, :num_audio_tokens[idx], :])
+                    idx += 1
+                final_audio_embeds.append(target_audio_embeds)
+            return final_audio_embeds
+        else:
+            return []
+
+    def get_embedding_with_audios(self, vlm_embedding: torch.Tensor,
+                                  audio_inputs: Optional[MiniCPMOAudioInputs],
+                                  chunk_length: int) -> torch.Tensor:
+        device, dtype = vlm_embedding.device, vlm_embedding.dtype
+        if audio_inputs["type"] == "audio_embeds":
+            audio_embeddings = audio_inputs["data"]
+            audio_embeddings = [
+                audio_embeddings[i].to(device=device, dtype=dtype)
+                for i in range(len(audio_embeddings))
+            ]
+        else:
+            audio_embeddings = self.get_audio_hidden_states(
+                audio_inputs, chunk_length)[0]
+        if audio_embeddings is None or len(audio_embeddings) == 0:
+            return vlm_embedding
+        audio_bounds = audio_inputs["audio_bounds"]
+        if self.config.chunk_input:
+            audio_embs = torch.cat(audio_embeddings, dim=0).to(device=device,
+                                                               dtype=dtype)
+            audio_start_pos = 0
+            for bound in audio_bounds:
+                audio_len = bound[1] - bound[0]
+                vlm_embedding[bound[0]:bound[1]] = audio_embs[
+                    audio_start_pos:audio_start_pos + audio_len, :]
+                audio_start_pos += audio_len
+        else:
+            for embs, bound in zip(audio_embeddings, audio_bounds):
+                audio_indices = torch.arange(bound[0],
+                                             bound[1],
+                                             dtype=torch.long).to(device)
+
+                if embs.shape[0] != len(audio_indices):
+                    raise ValueError(
+                        "Shape mismatch: Trying to assign embeddings "
+                        f"of shape {embs.shape} "
+                        f"to input indices of length {len(audio_indices)}")
+                vlm_embedding[audio_indices] = embs.to(dtype)
+        return vlm_embedding
+
+    def _get_audio_bounds(self, input_ids: torch.Tensor,
+                          audio_start_id: torch.Tensor,
+                          audio_end_id: torch.Tensor) -> torch.Tensor:
+        audio_start_tokens, = torch.where(input_ids == audio_start_id[0])
+        audio_start_tokens += 1
+        audio_end_tokens, = torch.where(input_ids == audio_end_id[0])
+        valid_audio_nums = max(len(audio_start_tokens), len(audio_end_tokens))
+        return torch.hstack([
+            audio_start_tokens[:valid_audio_nums].unsqueeze(-1),
+            audio_end_tokens[:valid_audio_nums].unsqueeze(-1)
+        ])
+
+    def _parse_and_validate_audio_inputs(
+            self, input_ids: torch.Tensor,
+            **kwargs: object) -> Tuple[MiniCPMOAudioInputs]:
+        audio_features = kwargs.pop("audio_features", [])
+        audio_feature_lens = kwargs.pop("audio_feature_lens", [])
+        audio_embeds = kwargs.pop("audio_embeds", None)
+        audio_start_id = kwargs.pop("audio_start_id", None)
+        audio_end_id = kwargs.pop("audio_end_id", None)
+        if audio_embeds is not None:
+            audio_embeds = [
+                audio_embeds[i][j] for i in range(len(audio_embeds))
+                for j in range(len(audio_embeds[i]))
+            ]
+            return MiniCPMOAudioEmbeddingInputs(
+                audio_bounds=self._get_audio_bounds(input_ids, audio_start_id,
+                                                    audio_end_id),
+                data=audio_embeds,
+                type="audio_embeds")
+        if len(audio_features) > 0:
+            audio_features_all = [
+                i.permute(1, 0) for audio_feature in audio_features
+                for i in audio_feature
+            ]
+            audio_features = torch.nn.utils.rnn.pad_sequence(
+                audio_features_all, batch_first=True,
+                padding_value=0.0).permute(0, 2, 1)
+            audio_feature_lens = torch.cat(
+                [item for item in audio_feature_lens])
+
+            return MiniCPMOAudioFeatureInputs(
+                audio_bounds=self._get_audio_bounds(input_ids, audio_start_id,
+                                                    audio_end_id),
+                data=audio_features,
+                audio_feature_lens=audio_feature_lens,
+                type="audio_features")
+        return None
+
+    def _parse_and_validate_inputs(self, input_ids: torch.Tensor,
+                                   **kwargs: object):
+        image_inputs = self._parse_and_validate_image_inputs(
+            input_ids, **kwargs)
+        if not any("audio" in key for key in kwargs):
+            return image_inputs, None
+        audio_inputs = self._parse_and_validate_audio_inputs(
+            input_ids, **kwargs)
+        return image_inputs, audio_inputs
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[torch.Tensor],
+        attn_metadata: AttentionMetadata,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
+        **kwargs: Any,
+    ) -> torch.Tensor:
+        if intermediate_tensors is not None:
+            vlm_embeddings = None
+        else:
+            image_inputs, audio_inputs = \
+                self._parse_and_validate_inputs(input_ids, **kwargs)
+            vlm_embeddings, _ = self.get_embedding_with_vision(
+                input_ids, image_inputs)
+
+            if audio_inputs is not None:
+                vlm_embeddings = self.get_embedding_with_audios(
+                    vlm_embeddings, audio_inputs,
+                    self.config.audio_chunk_length)
+
+        # always pass the input via `inputs_embeds`
+        # to make sure the computation graph is consistent
+        # for `torch.compile` integration
+        input_ids = None
+
+        output = self.llm.model(
+            input_ids=input_ids,
+            positions=positions,
+            kv_caches=kv_caches,
+            attn_metadata=attn_metadata,
+            intermediate_tensors=intermediate_tensors,
+            inputs_embeds=vlm_embeddings,
+        )
+        return output
diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py
index 1aa529056893b..bf967d33a3176 100644
--- a/vllm/model_executor/models/minicpmv.py
+++ b/vllm/model_executor/models/minicpmv.py
@@ -22,21 +22,21 @@
 """Inference-only MiniCPM-V model compatible with HuggingFace weights."""
 import math
 import re
+from collections import Counter
 from functools import cached_property, partial
-from typing import (Any, Callable, Iterable, List, Literal, Mapping, Optional,
-                    Set, Tuple, TypedDict, Union)
+from itertools import accumulate
+from typing import (Any, Callable, Dict, Iterable, List, Literal, Mapping,
+                    Optional, Set, Tuple, TypedDict, Union)
 
+import numpy as np
 import torch
 import torch.types
 from PIL import Image
 from torch import nn
-from transformers import PretrainedConfig
-from typing_extensions import NotRequired
+from transformers import BatchFeature, PretrainedConfig
 
 from vllm.attention import AttentionMetadata
 from vllm.config import VllmConfig
-from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
-                         InputContext, token_inputs)
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.resampler import (BaseResampler, Resampler2,
                                                   get_2d_sincos_pos_embed)
@@ -48,33 +48,30 @@
 from vllm.model_executor.models.qwen2 import Qwen2ForCausalLM
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
-from vllm.multimodal.image import cached_get_image_processor
-from vllm.multimodal.utils import cached_get_tokenizer
-from vllm.sequence import IntermediateTensors, SequenceData
+from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
+                                    MultiModalInputs, PlaceholderRange)
+from vllm.multimodal.parse import (ImageItem, ImageSize, ModalityData,
+                                   ModalityDataItems, MultiModalDataItems,
+                                   MultiModalDataParser, VideoItem)
+from vllm.multimodal.processing import (BaseMultiModalProcessor,
+                                        BaseProcessingInfo, PromptReplacement)
+from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
+from vllm.sequence import IntermediateTensors
 
 from .idefics2_vision_model import Idefics2VisionTransformer
 from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP
 from .utils import AutoWeightsLoader, maybe_prefix
 
-RawImageType = Union[Image.Image, torch.Tensor]
-
-
-class MiniCPMVRawImageInput(TypedDict):
-    """Input mapper input with auxiliary data for computing image bounds."""
-    image: RawImageType
+CPU_DEVICE = torch.device("cpu")
 
-    # Image bounds token ids in 0-dim scaler tensor.
-    im_start_id: torch.Tensor
-    im_end_id: torch.Tensor
-    slice_start_id: NotRequired[torch.Tensor]
-    slice_end_id: NotRequired[torch.Tensor]
+RawImageType = Union[Image.Image, torch.Tensor]
 
 
 class MiniCPMVImagePixelInputs(TypedDict):
     type: Literal["pixel_values"]
     data: List[torch.Tensor]
     """
-    Shape: `(batch_size * num_images, num_channels, height, width)`
+    Shape: `(batch_size * num_images * num_slices, num_channels, height, width)`
 
     Note that the image size may vary, so we pass it as a list
     instead of a batched tensor.
@@ -82,14 +79,14 @@ class MiniCPMVImagePixelInputs(TypedDict):
 
     image_bounds: torch.Tensor
     """
-    Shape: `(batch_size * num_images, 2)`
+    Shape: `(batch_size * num_images * num_slices, 2)`
 
     This should be in `(start, stop)` format.
     """
 
     tgt_sizes: torch.Tensor
     """
-    Shape: `(batch_size * num_images, 2)`
+    Shape: `(batch_size * num_images * num_slices, 2)`
 
     This should be in `(height, width)` format.
     """
@@ -99,7 +96,8 @@ class MiniCPMVImageEmbeddingInputs(TypedDict):
     type: Literal["image_embeds"]
     data: torch.Tensor
     """
-    Shape: `(batch_size * num_images, image_feature_size, hidden_size)`
+    Shape: `(batch_size * num_images * num_slices, 
+             image_feature_size, hidden_size)`
 
     `hidden_size` must match the hidden size of language model backbone.
     instead of a batched tensor.
@@ -107,7 +105,7 @@ class MiniCPMVImageEmbeddingInputs(TypedDict):
 
     image_bounds: torch.Tensor
     """
-    Shape: `(batch_size * num_images, 2)`
+    Shape: `(batch_size * num_images * num_slices, 2)`
 
     This should be in `(start, stop)` format.
     """
@@ -116,6 +114,93 @@ class MiniCPMVImageEmbeddingInputs(TypedDict):
 MiniCPMVImageInputs = Union[MiniCPMVImagePixelInputs,
                             MiniCPMVImageEmbeddingInputs]
 
+
+class MiniCPMVEmbeddingItems(ModalityDataItems[dict[str, torch.Tensor],
+                                               dict[str, torch.Tensor]]):
+
+    def __init__(self, data: Dict, modality: str) -> None:
+        super().__init__(data, modality)
+
+    def get_processor_data(self) -> Mapping[str, object]:
+        return self.data
+
+    def get_passthrough_data(self) -> Mapping[str, object]:
+        return {}
+
+    def get_count(self) -> int:
+        return len(self.data[f"{self.modality}_embeds"])
+
+    def get(self, index: int) -> Dict[str, torch.Tensor]:
+        out = {}
+        for k, v in self.data.items():
+            out[k] = v[index]
+        return out
+
+
+class MiniCPMVImageEmbeddingItems(MiniCPMVEmbeddingItems):
+
+    def __init__(self, data: Dict) -> None:
+        super().__init__(data, "image")
+        image_embeds = self.data.get("image_embeds", None)
+        image_sizes = self.data.get("image_sizes", None)
+        if image_embeds is None:
+            raise ValueError("In correct type of image_embeds",
+                             "Got type: None")
+        if not isinstance(image_embeds[0], torch.Tensor):
+            raise ValueError("In correct type of image_embeds",
+                             f"Got type: {type(image_embeds[0])}")
+        if image_sizes is None:
+            raise ValueError(
+                "In correct type of image_sizes", "Got type: None."
+                "If you're using `image_size_list`, "
+                "please rename it to `image_sizes`")
+        if len(image_embeds[0].shape) == 2:
+            image_embeds = [image_embeds]
+            image_sizes = [image_sizes]
+        self.data["image_embeds"] = image_embeds
+        self.data["image_sizes"] = image_sizes
+
+    def get_image_size(self, index: int) -> ImageSize:
+        image_size = self.data["image_sizes"][index]
+        return ImageSize(width=image_size[0], height=image_size[1])
+
+
+class MiniCPMVVideoEmbeddingItems(MiniCPMVEmbeddingItems):
+
+    def __init__(self, data: Dict) -> None:
+        super().__init__(data, "video")
+        video_embeds = self.data.get("video_embeds", None)
+        image_sizes = self.data.get("image_sizes", None)
+        num_frames = self.data.get("num_frames", None)
+        if video_embeds is None:
+            raise ValueError("In correct type of video_embeds",
+                             "Got type: None")
+        if not isinstance(video_embeds[0], torch.Tensor):
+            raise ValueError("In correct type of video_embeds",
+                             f"Got type: {type(video_embeds[0])}")
+        if image_sizes is None:
+            raise ValueError(
+                "In correct type of image_sizes", "Got type: None."
+                "If you're using `image_size_list`, "
+                "please rename it to `image_sizes`")
+        if num_frames is None:
+            raise ValueError("In correct type of numframes", "Got type: None")
+        if len(video_embeds[0].shape) == 2:
+            video_embeds = [video_embeds]
+            image_sizes = [image_sizes]
+            num_frames = [num_frames]
+        self.data["video_embeds"] = video_embeds
+        self.data["image_sizes"] = image_sizes
+        self.data["num_frames"] = num_frames
+
+    def get_frame_size(self, index: int) -> ImageSize:
+        frame_size = self.data["image_sizes"][index]
+        return ImageSize(width=frame_size[0], height=frame_size[1])
+
+    def get_num_frames(self, index: int) -> int:
+        return self.data["num_frames"][index]
+
+
 DEFAULT_LN = partial(nn.LayerNorm, eps=1e-6)
 
 
@@ -212,25 +297,6 @@ def forward(self, x: torch.Tensor,
         return x
 
 
-def _build_image_input(ctx: InputContext,
-                       image: RawImageType) -> MiniCPMVRawImageInput:
-    tokenizer = cached_get_tokenizer(
-        ctx.model_config.tokenizer,
-        trust_remote_code=ctx.model_config.trust_remote_code)
-    if hasattr(tokenizer, "slice_start_id"):
-        return MiniCPMVRawImageInput(
-            image=image,
-            im_start_id=torch.tensor(tokenizer.im_start_id),
-            im_end_id=torch.tensor(tokenizer.im_end_id),
-            slice_start_id=torch.tensor(tokenizer.slice_start_id),
-            slice_end_id=torch.tensor(tokenizer.slice_end_id))
-    else:
-        return MiniCPMVRawImageInput(
-            image=image,
-            im_start_id=torch.tensor(tokenizer.im_start_id),
-            im_end_id=torch.tensor(tokenizer.im_end_id))
-
-
 def get_version_by_config(config: PretrainedConfig) -> Tuple[int, ...]:
     version_float = getattr(config, "version", None)
 
@@ -240,129 +306,512 @@ def get_version_by_config(config: PretrainedConfig) -> Tuple[int, ...]:
         if config.hidden_size == 2304 and config.query_num == 64:
             return (2, 0)
         return (2, 5)
-
     version_str = str(version_float)
     return tuple(int(x) for x in version_str.split("."))
 
 
-def get_max_minicpmv_image_tokens(ctx: InputContext):
-    hf_config = ctx.get_hf_config()
-    return getattr(hf_config, "query_num", 64)
+class MiniCPMVMultiModalDataParser(MultiModalDataParser):
+
+    def _parse_image_data(
+        self,
+        data: Union[dict[str, torch.Tensor], ModalityData[ImageItem]],
+    ) -> ModalityDataItems[Any, Any]:
+        if isinstance(data, dict):
+            return MiniCPMVImageEmbeddingItems(data)
+        return super()._parse_image_data(data)
+
+    def _parse_video_data(
+        self,
+        data: Union[dict[str, torch.Tensor], ModalityData[VideoItem]],
+    ) -> ModalityDataItems[Any, Any]:
+        if isinstance(data, dict):
+            return MiniCPMVVideoEmbeddingItems(data)
+        return super()._parse_video_data(data)
+
+
+class MiniCPMVProcessingInfo(BaseProcessingInfo):
+    image_pattern = "(<image>./</image>)"
+    video_pattern = "(<video>./</video>)"
+
+    def get_hf_config(self):
+        return self.ctx.get_hf_config()
+
+    def get_hf_processor(
+        self,
+        **kwargs: object,
+    ):
+        hf_processor = self.ctx.get_hf_processor()
+        return hf_processor
+
+    def get_image_processor(self):
+        hf_processor = self.get_hf_processor()
+        image_processor = hf_processor.image_processor  # type: ignore
+        return image_processor
+
+    def get_model_version(self):
+        return get_version_by_config(self.get_hf_config())
+
+    def get_supported_mm_modalities(self) -> List[str]:
+        if self.get_model_version() == (2, 6):
+            return ["image", "video"]
+        else:
+            return ["image"]
+
+    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+        if self.get_model_version() == (2, 6):
+            return {"image": None, "video": None}
+        else:
+            return {"image": None}
+
+    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
+        mm_max_tokens = {"image": self.get_max_image_tokens()}
+        if self.get_model_version() == (2, 6):
+            mm_max_tokens["video"] = self.get_max_video_tokens(seq_len)
+        return mm_max_tokens
+
+    def get_max_video_frame_tokens(self) -> int:
+        frame_size = self.get_video_frame_size_with_most_features()
+        return self.get_num_image_tokens(frame_size,
+                                         self.get_video_max_slice_num())
+
+    def get_max_video_tokens(self, seq_len: int) -> int:
+        return self.get_max_video_frame_tokens(
+        ) * self.get_num_frames_with_most_features(seq_len)
+
+    def get_max_audio_tokens(self) -> int:
+        return self.get_max_audio_tokens_per_chunk(
+        ) * self.get_max_audio_chunks_with_most_features()
+
+    def get_slice_query_num(self) -> int:
+        hf_config = self.get_hf_config()
+        query_num = getattr(hf_config, "query_num", 64)
+        return query_num
+
+    def get_max_slice_num(self) -> int:
+        hf_config = self.get_hf_config()
+        max_slice_num = getattr(hf_config, "max_slice_num", 9)
+        return max_slice_num
+
+    def get_sliced_grid(self, image_size: ImageSize,
+                        max_slice_num: int) -> Tuple[int, int]:
+        if self.get_model_version() == (2, 6):
+            slice_grid = self.get_image_processor().get_sliced_grid(
+                image_size, max_slice_num)
+        else:
+            slice_grid = self.get_image_processor().get_sliced_grid(image_size)
+        return slice_grid
+
+    def get_num_image_tokens(self, image_size: ImageSize,
+                             max_slice_num: int) -> int:
+        slice_grid = self.get_sliced_grid(image_size, max_slice_num)
+        num_tokens = self.get_slice_query_num(
+        ) + 2  # <image>(<unk> * query_num)</image>
+        if slice_grid is not None:
+            if self.get_model_version() == (2, 6):
+                num_additional_tokens = 0
+            else:
+                # <slice><image>(<unk> * query_num)</image></slice>
+                num_additional_tokens = 2
+            num_tokens += ((self.get_slice_query_num() + 2) \
+                            * slice_grid[0] * slice_grid[1]) \
+                            + slice_grid[1] - 1 + num_additional_tokens
+        return num_tokens
 
+    def get_image_slice_nums(self, image_size: torch.Tensor,
+                             max_slice_nums: int) -> int:
+        grid = self.get_sliced_grid(image_size, max_slice_nums)
+        return 1 if grid is None else grid[0] * grid[1] + 1
 
-def dummy_seq_data_for_minicpmv(seq_len: int, num_images: int):
-    return SequenceData.from_prompt_token_counts((0, seq_len))
+    def get_max_image_tokens(self) -> int:
+        image_size = self.get_image_size_with_most_features()
+        return self.get_num_image_tokens(image_size, self.get_max_slice_num())
 
+    def get_image_size_with_most_features(self) -> ImageSize:
+        # Result in the max possible feature size (h:w = 9:1)
+        return self.get_default_image_sizes(self.get_max_slice_num())
 
-def dummy_image_for_minicpmv(ctx: InputContext, hf_config: PretrainedConfig,
-                             num_images: int):
-    width = height = hf_config.image_size
-    image = _build_image_input(ctx,
-                               image=Image.new("RGB", (width, height),
-                                               color=0))
-    return {"image": [image] if num_images == 1 else [image] * num_images}
+    def get_video_max_slice_num(self) -> int:
+        return 1
 
+    def get_video_frame_size_with_most_features(self) -> ImageSize:
+        return self.get_default_image_sizes(self.get_video_max_slice_num())
 
-def dummy_data_for_minicpmv(ctx: InputContext, seq_len: int,
-                            mm_counts: Mapping[str, int]):
-    hf_config = ctx.get_hf_config()
-    num_images = mm_counts["image"]
+    def get_max_video_frames(self, max_tokens: int) -> int:
+        num_frame_tokens = self.get_max_video_frame_tokens()
+        num_frames = max_tokens // num_frame_tokens
+        return num_frames
 
-    seq_data = dummy_seq_data_for_minicpmv(seq_len, num_images)
-    mm_data = dummy_image_for_minicpmv(ctx, hf_config, num_images)
+    def get_num_frames_with_most_features(self, seq_len: int) -> int:
+        mm_config = self.ctx.get_mm_config()
+        max_images = mm_config.limit_per_prompt.get("image", 1)
+        max_videos = mm_config.limit_per_prompt.get("video", 1)
 
-    return DummyData(seq_data, mm_data)
+        # count <image_idx></image_idx> tokens
+        # which are not in get_max_image_tokens
+        max_image_tokens = self.get_max_image_tokens(
+        ) * max_images + 4 * max_images
+        max_total_frames = self.get_max_video_frames(seq_len -
+                                                     max_image_tokens)
 
+        num_frames = max(max_total_frames // max(max_videos, 1), 1)
 
-def input_processor_for_minicpmv(ctx: InputContext, inputs: DecoderOnlyInputs):
-    multi_modal_data = inputs.get("multi_modal_data")
-    if multi_modal_data is None or "image" not in multi_modal_data:
-        return inputs
-    model_config = ctx.model_config
-    version = get_version_by_config(model_config.hf_config)
-    tokenizer = cached_get_tokenizer(
-        model_config.tokenizer,
-        trust_remote_code=model_config.trust_remote_code)
-    image_processor = cached_get_image_processor(model_config.tokenizer)
+        return num_frames
 
-    def get_placeholder(image_size: Tuple[int, int], num_image: int):
+    def get_default_image_sizes(self, num_slices: int) -> ImageSize:
+        image_size = getattr(self.get_hf_config(), "image_size", 448)
+        return ImageSize(width=image_size, height=image_size * num_slices)
+
+
+class MiniCPMVDummyInputsBuilder(BaseDummyInputsBuilder[MiniCPMVProcessingInfo]
+                                 ):
+
+    def get_dummy_processor_inputs(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> ProcessorInputs:
+        num_images = mm_counts.get("image", 0)
+        num_videos = mm_counts.get("video", 0)
+
+        image_width, image_height = \
+            self.info.get_image_size_with_most_features()
+        video_width, video_height = \
+            self.info.get_video_frame_size_with_most_features()
+        num_video_frames = \
+            self.info.get_num_frames_with_most_features(seq_len)
+
+        mm_data = {
+            "image":
+            self._get_dummy_images(width=image_width,
+                                   height=image_height,
+                                   num_images=num_images),
+            "video": [
+                self._get_dummy_images(width=video_width,
+                                       height=video_height,
+                                       num_images=num_video_frames)
+            ] * num_videos,
+        }
+
+        image_prompt_texts = self.info.image_pattern * num_images
+        video_prompt_texts = self.info.video_pattern * num_videos
+
+        return ProcessorInputs(prompt_text=image_prompt_texts +
+                               video_prompt_texts,
+                               mm_data=mm_data)
+
+
+class MiniCPMVMultiModalProcessor(
+        BaseMultiModalProcessor[MiniCPMVProcessingInfo]):
+
+    def _get_data_parser(self) -> MultiModalDataParser:
+        return MiniCPMVMultiModalDataParser()
+
+    def get_slice_image_placeholder(self, image_size: ImageSize,
+                                    **kwargs) -> str:
+        image_processor = self.info.get_image_processor()
+        version = self.info.get_model_version()
         if version == (2, 0) or version == (2, 5):
             return image_processor.get_slice_image_placeholder(image_size)
         return image_processor.get_slice_image_placeholder(
-            image_size, num_image)
-
-    prompt = inputs.get("prompt")
-    token_ids = inputs.get("prompt_token_ids")
-    if prompt is None:
-        prompt = tokenizer.decode(token_ids)
-
-    pattern = "(<image>./</image>)"
-    images = multi_modal_data["image"]
-    image_tags = re.findall(pattern, prompt)
-    if len(image_tags) == 0:
-        new_token_ids = token_ids
-        new_prompt = prompt
-    else:
-        if isinstance(images, dict):
-            image_size_list = images.get("image_size_list")
-            images = [images.get("image_embeds")]
+            image_size, **kwargs)
+
+    def get_image_prompt_texts(self,
+                               image_size: ImageSize,
+                               image_idx: int = 0) -> str:
+        prompt_texts = self.get_slice_image_placeholder(image_size,
+                                                        image_idx=image_idx)
+        return prompt_texts
+
+    def get_video_prompt_texts(self, image_size: ImageSize,
+                               num_frames: int) -> str:
+        prompt_texts = "".join(
+            self.get_slice_image_placeholder(
+                image_size=image_size,
+                image_idx=0,
+                max_slice_nums=self.info.get_video_max_slice_num(),
+                use_image_id=False) for image_idx in range(num_frames))
+        return prompt_texts
+
+    def get_special_tokens(self) -> Dict[str, torch.Tensor]:
+        tokenizer = self.info.get_tokenizer()
+        special_tokens = {
+            "im_start_id": torch.tensor(tokenizer.im_start_id),
+            "im_end_id": torch.tensor(tokenizer.im_end_id)
+        }
+        if hasattr(tokenizer, "slice_start_id"):
+            special_tokens["slice_start_id"] = torch.tensor(
+                tokenizer.slice_start_id)
+            special_tokens["slice_end_id"] = torch.tensor(
+                tokenizer.slice_end_id)
+        return special_tokens
+
+    @staticmethod
+    def repack_processor_outputs(outputs: Any) -> BatchFeature:
+        valid_keys = ["pixel_values", "image_sizes", "tgt_sizes"]
+        outputs = {key: outputs[key][0] for key in valid_keys}
+        return outputs
+
+    def process_images(self, mm_data: Mapping[str, object],
+                       mm_kwargs: Mapping[str, object]) -> Dict[str, object]:
+        images = mm_data.pop("images", [])
+        image_embeds = mm_data.pop("image_embeds", [])
+        if isinstance(images, Image.Image):
+            images = [images]
+        if isinstance(images, (list, torch.Tensor)) and len(images) > 0:
+            image_outputs = super()._call_hf_processor(
+                prompt=self.info.image_pattern * len(images),
+                mm_data={"images": images},
+                mm_kwargs=mm_kwargs)
+            image_outputs = MiniCPMVMultiModalProcessor.\
+                repack_processor_outputs(image_outputs)
+        elif len(image_embeds) > 0:
+            image_sizes = mm_data.pop("image_sizes", None)
+            image_outputs = {
+                "image_embeds": torch.cat(image_embeds),
+                "image_sizes": image_sizes
+            }
         else:
-            if isinstance(images, Image.Image):
-                images = [images]
-            image_size_list = [image.size for image in images]
-
-        text_chunks = prompt.split(pattern)
-        new_prompt_chunks: List[str] = []
-        for i in range(len(image_size_list)):
-            new_prompt_chunks += [
-                text_chunks[i],
-                get_placeholder(image_size_list[i], i)
-            ]
-        new_prompt_chunks.append(text_chunks[-1])
-        new_prompt = "".join(new_prompt_chunks)
-        new_token_ids = tokenizer.encode(new_prompt)
-
-    multi_modal_data["image"] = [
-        _build_image_input(ctx, image) for image in images
-    ]
+            image_outputs = {}
+        return image_outputs
+
+    def process_videos(self, mm_data: Mapping[str, object],
+                       mm_kwargs: Mapping[str, object]) -> Dict[str, object]:
+        videos = mm_data.pop("videos", [])
+        video_embeds = mm_data.pop("video_embeds", [])
+        if len(videos) > 0 and isinstance(videos[0], Image.Image):
+            videos = [videos]
+        if isinstance(videos, list) and len(videos) > 0:
+            video_outputs = {
+                "video_pixel_values": [],
+                "video_image_sizes": [],
+                "video_tgt_sizes": [],
+                "num_frames": []
+            }
+            for video in videos:
+                parsed_video = []
+                for frame in video:
+                    if isinstance(frame, np.ndarray):
+                        parsed_video.append(Image.fromarray(frame))
+                    else:
+                        parsed_video.append(frame)
+                video = parsed_video
+                single_video_outputs = super()._call_hf_processor(
+                    prompt=self.info.image_pattern * len(video),
+                    mm_data={"images": video},
+                    mm_kwargs={
+                        **mm_kwargs, "max_slice_nums":
+                        self.info.get_video_max_slice_num()
+                    })
+                video_outputs["num_frames"].append(len(video))
+                for key in single_video_outputs:
+                    if "video_" + key in video_outputs:
+                        if key == "image_sizes":
+                            video_outputs["video_" + key].append(
+                                single_video_outputs[key][0][0])
+                        else:
+                            video_outputs["video_" +
+                                          key] += single_video_outputs[key][0]
+        elif len(video_embeds):
+            image_sizes = mm_data.pop("image_sizes", None)
+            num_frames = mm_data.pop("num_frames", None)
+            video_outputs = {
+                "video_embeds": torch.cat(video_embeds),
+                "video_image_sizes": image_sizes,
+                "num_frames": num_frames
+            }
+        else:
+            video_outputs = {}
+        return video_outputs
 
-    return token_inputs(
-        prompt_token_ids=new_token_ids,
-        prompt=new_prompt,
-        multi_modal_data=multi_modal_data,
-    )
+    def get_placeholder_match_pattern(self) -> str:
+        return r"\(<(image|video)>./</\1>\)"
 
+    def get_placeholder_split_pattern(self) -> str:
+        return r"\(<(?:image|video)>./</(?:image|video)>\)"
 
-def input_mapper_for_minicpmv(ctx: InputContext, data: object):
-    model_config = ctx.model_config
+    def process_mm_inputs(self, mm_data, mm_kwargs) -> object:
+        return {
+            "image": self.process_images(mm_data, mm_kwargs),
+            "video": self.process_videos(mm_data, mm_kwargs)
+        }
 
-    image_processor = cached_get_image_processor(
-        model_config.model, trust_remote_code=model_config.trust_remote_code)
-    if image_processor is None:
-        raise RuntimeError("No HuggingFace processor is available "
-                           "to process the image object")
+    def get_input_modalities(self, mm_data) -> List[str]:
+        supported_mm_modalities = self.info.get_supported_mm_modalities()
+        input_modalities = []
+        for modality in supported_mm_modalities:
+            if modality in mm_data and mm_data[modality] != {}:
+                input_modalities.append(modality)
+        return input_modalities
+
+    def get_modality_num_counter(self, modality: str) -> str:
+        if modality == "image":
+            return "image_sizes"
+        elif modality == "video":
+            return "video_image_sizes"
+
+    def get_num_slices_by_modality(self, inputs: Dict[str, object],
+                                   modality: str, index: int) -> int:
+        if modality == "image":
+            return self.info.get_image_slice_nums(
+                inputs[modality]["image_sizes"][index],
+                self.info.get_max_slice_num())
+        elif modality == "video":
+            return self.info.get_image_slice_nums(
+                inputs[modality]["video_image_sizes"][index],
+                self.info.get_video_max_slice_num()
+            ) * inputs[modality]["num_frames"][index]
+        else:
+            raise ValueError(f"UnExpected modality: {modality}")
+
+    def check_mm_inputs(self, inputs: Dict[str, object],
+                        matches: List[str]) -> None:
+        counts = Counter(matches)
+        for modality, count in counts.items():
+            if modality not in inputs or not inputs[modality]:
+                raise ValueError(f"None input data of {modality}."
+                                 "But prompt requires.")
+            counter_key = self.get_modality_num_counter(modality)
+            if len(inputs[modality][counter_key]) != count:
+                raise ValueError(f"The prompt requires {count} "
+                                 f"{modality} inputs while you pass "
+                                 f"{len(inputs[modality][counter_key])}")
+
+    def get_prompt_texts_by_modality(self, inputs: Dict[str, object],
+                                     modality: str, index: int) -> str:
+        if modality == "image":
+            return self.get_image_prompt_texts(
+                inputs["image"]["image_sizes"][index], index)
+        elif modality == "video":
+            return self.get_video_prompt_texts(
+                inputs["video"]["video_image_sizes"][index],
+                inputs["video"]["num_frames"][index])
+        else:
+            raise ValueError(f"UnExpected modality: {modality}")
 
-    if not isinstance(data, list):
-        raise ValueError(
-            "Image input must be list of MiniCPMVImageInput, got (%s)", data)
+    def call_base_hf_processor(
+        self,
+        prompt: str,
+        mm_data: Mapping[str, object],
+        mm_kwargs: Mapping[str, object],
+    ) -> BatchFeature:
+        return super()._call_hf_processor(prompt=prompt,
+                                          mm_data=mm_data,
+                                          mm_kwargs=mm_kwargs)
+
+    def _call_hf_processor(
+        self,
+        prompt: str,
+        mm_data: Mapping[str, object],
+        mm_kwargs: Mapping[str, object],
+    ) -> BatchFeature:
+        # Do not support combination inputs of images and videos for now
+        # Try to handle interleaved multimodal data
+        tokenizer = self.info.get_tokenizer()
+        inputs = self.process_mm_inputs(mm_data, mm_kwargs)
+        mm_input_modalities = self.get_input_modalities(inputs)
+        num_mm_slices = {modality: [] for modality in mm_input_modalities}
+        for modality in mm_input_modalities:
+            num_counter_key = self.get_modality_num_counter(modality)
+            for index in range(len(inputs[modality][num_counter_key])):
+                num_mm_slices[modality].append(
+                    self.get_num_slices_by_modality(inputs, modality, index))
+        return {
+            "input_ids": np.array([tokenizer.encode(prompt)]),
+            **{
+                key: value
+                for modality in inputs
+                for key, value in inputs[modality].items()
+            },
+            **{
+                f"{modality}_num_slices": num_mm_slices[modality]
+                for modality in mm_input_modalities
+            }
+        }
 
-    if len(data) > 0 and isinstance(data[0]['image'], torch.Tensor):
-        batch_data = {
-            "image_embeds": data[0]['image'],
+    def _get_prompt_replacements(
+            self, mm_items: MultiModalDataItems,
+            hf_processor_mm_kwargs: Mapping[str, Any],
+            out_mm_kwargs: MultiModalKwargs) -> List[PromptReplacement]:
+        placeholder = {
+            "image": self.info.image_pattern,
+            "video": self.info.video_pattern,
         }
-    else:
-        batch_data = image_processor \
-            .preprocess([img["image"] for img in data], return_tensors="pt") \
-            .data
 
-    if len(data) > 0:
-        batch_data["im_start_id"] = data[0]["im_start_id"]
-        batch_data["im_end_id"] = data[0]["im_end_id"]
-        if "slice_start_id" in data[0]:
-            batch_data["slice_start_id"] = data[0]["slice_start_id"]
-            batch_data["slice_end_id"] = data[0]["slice_end_id"]
+        def get_replacement_minicpmv(item_idx: int, modality: str):
+            if modality == "image":
+                return self.get_image_prompt_texts(
+                    mm_items["image"].get_image_size(item_idx), item_idx)
+            else:  # video
+                return self.get_video_prompt_texts(
+                    mm_items["video"].get_frame_size(item_idx),
+                    mm_items["video"].get_num_frames(item_idx))
+
+        return [
+            PromptReplacement(modality=modality,
+                              target=placeholder[modality],
+                              replacement=partial(get_replacement_minicpmv,
+                                                  modality=modality))
+            for modality in ("image", "video")
+        ]
 
-    return MultiModalKwargs(batch_data)
+    def _get_mm_fields_config(
+        self,
+        hf_inputs,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+
+        def get_slices(num_slices: List[int]) -> List[int]:
+            slice_indices = [0] + list(accumulate(num_slices))
+            slices = [(slice_indices[i], slice_indices[i + 1])
+                      for i in range(len(num_slices))]
+            return [slice(*slice_item) for slice_item in slices]
+
+        image_slices = get_slices(
+            hf_inputs.get("image_num_slices", torch.empty(0)))
+        video_slices = get_slices(
+            hf_inputs.get("video_num_slices", torch.empty(0)))
+
+        return dict(
+            pixel_values=MultiModalFieldConfig.flat("image", image_slices),
+            image_sizes=MultiModalFieldConfig.batched("image"),
+            tgt_sizes=MultiModalFieldConfig.flat("image", image_slices),
+            image_num_slices=MultiModalFieldConfig.batched("image"),
+            image_embeds=MultiModalFieldConfig.flat("image", image_slices),
+            video_pixel_values=MultiModalFieldConfig.flat(
+                "video", video_slices),
+            video_image_sizes=MultiModalFieldConfig.batched("video"),
+            video_tgt_sizes=MultiModalFieldConfig.flat("video", video_slices),
+            video_embeds=MultiModalFieldConfig.flat("video", video_slices),
+            video_num_slices=MultiModalFieldConfig.batched("video"))
+
+    def apply(
+        self,
+        prompt: Union[str, List[int]],
+        mm_data: MultiModalDataDict,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> MultiModalInputs:
+        supported_mm_modalities = self.info.get_supported_mm_modalities()
+        if isinstance(prompt, list):
+            prompt = self.info.get_tokenizer().decode(prompt)
+        matches = re.findall(self.get_placeholder_match_pattern(), prompt)
+        mm_orders = {
+            f"{modality}_orders":
+            torch.tensor(
+                [index for index, m in enumerate(matches) if m == modality])
+            for modality in supported_mm_modalities
+        }
+        result = super().apply(prompt, mm_data, hf_processor_mm_kwargs)
+        # Exclude <image_id>x</image_id> from placeholders
+        if "image" in result["mm_placeholders"] and \
+            self.info.get_model_version() == (2, 6):
+            result["mm_placeholders"]["image"] = [
+                PlaceholderRange(offset=p["offset"] + 3 + idx // 10,
+                                 length=p["length"] - 3 - idx // 10)
+                for idx, p in enumerate(result["mm_placeholders"]["image"])
+            ]
+        result["mm_kwargs"].update(**mm_orders)
+        result["mm_kwargs"].update(**self.get_special_tokens())
+        return result
 
 
 class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP):
@@ -409,7 +858,7 @@ def sampler(self):
 
         return get_sampler()
 
-    def get_embedding(
+    def get_embedding_with_vision(
         self,
         input_ids: torch.Tensor,
         image_inputs: Optional[MiniCPMVImageInputs],
@@ -471,25 +920,46 @@ def _get_image_bounds(
             image_end_tokens[:valid_image_nums].unsqueeze(-1),
         ])
 
-    def _parse_and_validate_inputs(
+    def _parse_and_validate_image_inputs(
         self,
         input_ids: torch.Tensor,
         **kwargs: object,
     ) -> Optional[MiniCPMVImageInputs]:
-        pixel_values = kwargs.pop("pixel_values", [])
-        tgt_sizes = kwargs.pop("tgt_sizes", [])
+        mm_data = {
+            "image": {
+                key: kwargs.pop(key, [])
+                for key in ["pixel_values", "tgt_sizes", "image_num_slices"]
+            },
+            "video": {
+                "pixel_values": kwargs.pop("video_pixel_values", []),
+                "tgt_sizes": kwargs.pop("video_tgt_sizes", []),
+                "video_num_slices": kwargs.pop("video_num_slices", [])
+            }
+        }
         im_start_id = kwargs.pop("im_start_id", None)
         im_end_id = kwargs.pop("im_end_id", None)
         slice_start_id = kwargs.pop("slice_start_id", None)
         slice_end_id = kwargs.pop("slice_end_id", None)
+        mm_orders = {
+            f"{modality}": kwargs.pop(f"{modality}_orders", None)
+            for modality in ["image", "video", "audio"]
+        }
+        batch_size = max(len(mm_data["image"]["pixel_values"]),
+                         len(mm_data["video"]["pixel_values"]))
         image_embeds = kwargs.pop("image_embeds", None)
-
+        video_embeds = kwargs.pop("video_embeds", None)
+        if image_embeds is not None and video_embeds is not None:
+            raise ValueError(
+                "Incorrect inputs for vision embeddings. "
+                "Image embeds and video embeds can not exist simultaneously.")
+        if video_embeds is not None:
+            image_embeds = video_embeds
         if image_embeds is not None:
             if not isinstance(image_embeds, (torch.Tensor, list)):
                 raise ValueError(f"Incorrect type of image embeds. "
                                  f"Got type: {type(image_embeds)}")
-            if isinstance(image_embeds, list):
-                image_embeds = torch.concat(image_embeds)
+            image_embeds = torch.concat(
+                [image_embeds[i] for i in range(len(image_embeds))])
 
             return MiniCPMVImageEmbeddingInputs(
                 image_bounds=self._get_image_bounds(input_ids, im_start_id,
@@ -498,29 +968,47 @@ def _parse_and_validate_inputs(
                 data=image_embeds,
                 type="image_embeds",
             )
-
-        if not isinstance(pixel_values, (torch.Tensor, list)):
-            raise ValueError("Incorrect type of pixel values. "
-                             f"Got type: {type(pixel_values)}")
-
-        if not isinstance(tgt_sizes, (torch.Tensor, list)):
-            raise ValueError("Incorrect type of target sizes. "
-                             f"Got type: {type(tgt_sizes)}")
-
-        if len(pixel_values) != len(tgt_sizes):
-            raise ValueError("Inconsistent batch lengths, found: "
-                             f"{len(pixel_values)} vs. {len(tgt_sizes)}")
+        for modality, modality_mm_data in mm_data.items():
+            if not isinstance(modality_mm_data["pixel_values"],
+                              (torch.Tensor, list)):
+                raise ValueError(
+                    "Incorrect type of pixel values. "
+                    f"Got type: {type(modality_mm_data['pixel_values'])}")
+
+            if not isinstance(modality_mm_data["tgt_sizes"],
+                              (torch.Tensor, list)):
+                raise ValueError(
+                    "Incorrect type of target sizes. "
+                    f"Got type: {type(modality_mm_data['tgt_sizes'])}")
+
+            if len(modality_mm_data["pixel_values"]) != len(
+                    modality_mm_data["tgt_sizes"]):
+                raise ValueError(
+                    "Inconsistent batch lengths, found: "
+                    f"{len(modality_mm_data['pixel_values'])} vs. "
+                    f"{len(modality_mm_data['tgt_sizes'])}")
 
         pixel_values_flat: List[torch.Tensor] = []
         tgt_sizes_flat: List[torch.Tensor] = []
-        for pixel_b, tgt_b in zip(pixel_values, tgt_sizes):
-            if len(pixel_b) != len(tgt_b):
-                raise ValueError("Inconsistent N lengths, found: "
-                                 f"{len(pixel_b)} vs {len(tgt_b)}")
-
-            for pixel_n, tgt_n in zip(pixel_b, tgt_b):
-                pixel_values_flat += pixel_n
-                tgt_sizes_flat += tgt_n
+        for b in range(batch_size):
+            mm_counts = {"image": 0, "video": 0} if self.version == (2, 6) \
+                        else {"image": 0}
+            mm_slice_counts = {"image": 0, "video": 0} \
+                               if self.version == (2, 6) else {"image": 0}
+            mm_orders_b = [(index, modality) for modality in mm_counts
+                           for index in mm_orders[modality][b]]
+            for _, modality in sorted(mm_orders_b, key=lambda x: x[0]):
+                pos = mm_counts[modality]
+                num_slices = mm_data[modality][f"{modality}_num_slices"][b][
+                    pos]
+                slice_start_idx = mm_slice_counts[modality]
+                slice_end_idx = slice_start_idx + num_slices
+                pixel_values_flat += mm_data[modality]["pixel_values"][b][
+                    slice_start_idx:slice_end_idx]
+                tgt_sizes_flat += mm_data[modality]["tgt_sizes"][b][
+                    slice_start_idx:slice_end_idx]
+                mm_counts[modality] += 1
+                mm_slice_counts[modality] += num_slices
 
         # NOTE: Input IDs does not contain image tokens during memory profiling,
         # so we allow it to be empty
@@ -544,6 +1032,10 @@ def _parse_and_validate_inputs(
             type="pixel_values",
         )
 
+    def _parse_and_validate_inputs(self, input_ids: torch.Tensor,
+                                   **kwargs: object):
+        return self._parse_and_validate_image_inputs(input_ids, **kwargs)
+
     def forward(
         self,
         input_ids: torch.Tensor,
@@ -556,9 +1048,10 @@ def forward(
         if intermediate_tensors is not None:
             vlm_embeddings = None
         else:
-            image_inputs = self._parse_and_validate_inputs(input_ids, **kwargs)
-
-            vlm_embeddings, _ = self.get_embedding(input_ids, image_inputs)
+            image_inputs = \
+                self._parse_and_validate_inputs(input_ids, **kwargs)
+            vlm_embeddings, _ = self.get_embedding_with_vision(
+                input_ids, image_inputs)
 
         # always pass the input via `inputs_embeds`
         # to make sure the computation graph is consistent
@@ -964,15 +1457,15 @@ def get_vision_hidden_states(self,
 _SUPPORT_VERSION = {
     (2, 0): MiniCPMV2_0,
     (2, 5): MiniCPMV2_5,
-    (2, 6): MiniCPMV2_6
+    (2, 6): MiniCPMV2_6,
 }
 
 
-@MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_minicpmv)
-@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_minicpmv_image_tokens)
-@INPUT_REGISTRY.register_dummy_data(dummy_data_for_minicpmv)
-@INPUT_REGISTRY.register_input_processor(input_processor_for_minicpmv)
-class MiniCPMV(MiniCPMVBaseModel, SupportsLoRA):
+@MULTIMODAL_REGISTRY.register_processor(
+    MiniCPMVMultiModalProcessor,
+    info=MiniCPMVProcessingInfo,
+    dummy_inputs=MiniCPMVDummyInputsBuilder)
+class MiniCPMV(MiniCPMVBaseModel, SupportsMultiModal, SupportsLoRA):
     """
     Different versions of MiniCPMV use different visual encoders and LLMs,
     which is not conducive to the current integration logic of LoRA and
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index 8d71b19060bf4..de05bf2b772f5 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -162,6 +162,7 @@
     "LlavaNextVideoForConditionalGeneration": ("llava_next_video", "LlavaNextVideoForConditionalGeneration"),  # noqa: E501
     "LlavaOnevisionForConditionalGeneration": ("llava_onevision", "LlavaOnevisionForConditionalGeneration"),  # noqa: E501
     "MantisForConditionalGeneration": ("llava", "MantisForConditionalGeneration"),  # noqa: E501
+    "MiniCPMO": ("minicpmo", "MiniCPMO"),
     "MiniCPMV": ("minicpmv", "MiniCPMV"),
     "MolmoForCausalLM": ("molmo", "MolmoForCausalLM"),
     "NVLM_D": ("nvlm_d", "NVLM_D_Model"),

From ff7424f491935a1b4737bcc1570de0d616fc22f3 Mon Sep 17 00:00:00 2001
From: Yanyi Liu <wolfsonliu@163.com>
Date: Wed, 29 Jan 2025 17:41:01 +0800
Subject: [PATCH 427/462] [Frontend] Support override generation config in args
 (#12409)

Signed-off-by: liuyanyi <wolfsonliu@163.com>
---
 tests/test_config.py     | 70 ++++++++++++++++++++++++++++++++++++++++
 vllm/config.py           | 13 ++++++--
 vllm/engine/arg_utils.py | 25 ++++++++++----
 3 files changed, 100 insertions(+), 8 deletions(-)

diff --git a/tests/test_config.py b/tests/test_config.py
index 4518adfc31bfc..ec366b93d6a37 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -281,3 +281,73 @@ def test_uses_mrope(model_id, uses_mrope):
     )
 
     assert config.uses_mrope == uses_mrope
+
+
+def test_generation_config_loading():
+    model_id = "Qwen/Qwen2.5-1.5B-Instruct"
+
+    # When set generation_config to None, the default generation config
+    # will not be loaded.
+    model_config = ModelConfig(model_id,
+                               task="auto",
+                               tokenizer=model_id,
+                               tokenizer_mode="auto",
+                               trust_remote_code=False,
+                               seed=0,
+                               dtype="float16",
+                               generation_config=None)
+    assert model_config.get_diff_sampling_param() == {}
+
+    # When set generation_config to "auto", the default generation config
+    # should be loaded.
+    model_config = ModelConfig(model_id,
+                               task="auto",
+                               tokenizer=model_id,
+                               tokenizer_mode="auto",
+                               trust_remote_code=False,
+                               seed=0,
+                               dtype="float16",
+                               generation_config="auto")
+
+    correct_generation_config = {
+        "repetition_penalty": 1.1,
+        "temperature": 0.7,
+        "top_p": 0.8,
+        "top_k": 20,
+    }
+
+    assert model_config.get_diff_sampling_param() == correct_generation_config
+
+    # The generation config could be overridden by the user.
+    override_generation_config = {"temperature": 0.5, "top_k": 5}
+
+    model_config = ModelConfig(
+        model_id,
+        task="auto",
+        tokenizer=model_id,
+        tokenizer_mode="auto",
+        trust_remote_code=False,
+        seed=0,
+        dtype="float16",
+        generation_config="auto",
+        override_generation_config=override_generation_config)
+
+    override_result = correct_generation_config.copy()
+    override_result.update(override_generation_config)
+
+    assert model_config.get_diff_sampling_param() == override_result
+
+    # When generation_config is set to None and override_generation_config
+    # is set, the override_generation_config should be used directly.
+    model_config = ModelConfig(
+        model_id,
+        task="auto",
+        tokenizer=model_id,
+        tokenizer_mode="auto",
+        trust_remote_code=False,
+        seed=0,
+        dtype="float16",
+        generation_config=None,
+        override_generation_config=override_generation_config)
+
+    assert model_config.get_diff_sampling_param() == override_generation_config
diff --git a/vllm/config.py b/vllm/config.py
index d7c9311ae3cb0..58464eae80b82 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -165,6 +165,8 @@ class ModelConfig:
             `logits_processors` extra completion argument. Defaults to None,
             which allows no processors.
         generation_config: Configuration parameter file for generation.
+        override_generation_config: Override the generation config with the
+            given config.
     """
 
     def compute_hash(self) -> str:
@@ -225,6 +227,7 @@ def __init__(
         logits_processor_pattern: Optional[str] = None,
         generation_config: Optional[str] = None,
         enable_sleep_mode: bool = False,
+        override_generation_config: Optional[Dict[str, Any]] = None,
     ) -> None:
         self.model = model
         self.tokenizer = tokenizer
@@ -368,6 +371,7 @@ def __init__(
         self.logits_processor_pattern = logits_processor_pattern
 
         self.generation_config = generation_config
+        self.override_generation_config = override_generation_config or {}
 
         self._verify_quantization()
         self._verify_cuda_graph()
@@ -904,8 +908,13 @@ def get_diff_sampling_param(self) -> Dict[str, Any]:
         """
         if self.generation_config is None:
             # When generation_config is not set
-            return {}
-        config = self.try_get_generation_config()
+            config = {}
+        else:
+            config = self.try_get_generation_config()
+
+        # Overriding with given generation config
+        config.update(self.override_generation_config)
+
         available_params = [
             "repetition_penalty",
             "temperature",
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index ba96484e3fce9..1f203b6eaeb33 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -195,6 +195,7 @@ class EngineArgs:
     kv_transfer_config: Optional[KVTransferConfig] = None
 
     generation_config: Optional[str] = None
+    override_generation_config: Optional[Dict[str, Any]] = None
     enable_sleep_mode: bool = False
 
     calculate_kv_scales: Optional[bool] = None
@@ -936,12 +937,23 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             type=nullable_str,
             default=None,
             help="The folder path to the generation config. "
-            "Defaults to None, will use the default generation config in vLLM. "
-            "If set to 'auto', the generation config will be automatically "
-            "loaded from model. If set to a folder path, the generation config "
-            "will be loaded from the specified folder path. If "
-            "`max_new_tokens` is specified, then it sets a server-wide limit "
-            "on the number of output tokens for all requests.")
+            "Defaults to None, no generation config is loaded, vLLM defaults "
+            "will be used. If set to 'auto', the generation config will be "
+            "loaded from model path. If set to a folder path, the generation "
+            "config will be loaded from the specified folder path. If "
+            "`max_new_tokens` is specified in generation config, then "
+            "it sets a server-wide limit on the number of output tokens "
+            "for all requests.")
+
+        parser.add_argument(
+            "--override-generation-config",
+            type=json.loads,
+            default=None,
+            help="Overrides or sets generation config in JSON format. "
+            "e.g. ``{\"temperature\": 0.5}``. If used with "
+            "--generation-config=auto, the override parameters will be merged "
+            "with the default config from the model. If generation-config is "
+            "None, only the override parameters are used.")
 
         parser.add_argument("--enable-sleep-mode",
                             action="store_true",
@@ -1002,6 +1014,7 @@ def create_model_config(self) -> ModelConfig:
             override_pooler_config=self.override_pooler_config,
             logits_processor_pattern=self.logits_processor_pattern,
             generation_config=self.generation_config,
+            override_generation_config=self.override_generation_config,
             enable_sleep_mode=self.enable_sleep_mode,
         )
 

From b02fd288b28f0bfa2d7ac8958fe0d71ec22ffc1b Mon Sep 17 00:00:00 2001
From: Pavani Majety <pmajety@nvidia.com>
Date: Wed, 29 Jan 2025 01:46:12 -0800
Subject: [PATCH 428/462] [Hardware][NV] Fix Modelopt model loading for
 k-v-scales for Llama models. (#11787)

Signed-off-by: Pavani Majety <pmajety@nvidia.com>
Co-authored-by: mgoin <michael@neuralmagic.com>
---
 vllm/model_executor/model_loader/weight_utils.py | 11 ++++++++++-
 vllm/model_executor/models/llama.py              |  9 +++++----
 vllm/model_executor/models/mixtral.py            |  6 +++++-
 3 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
index b764a940b1742..e4d103f7cab99 100644
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -652,9 +652,18 @@ def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> Optional[str]:
         return remapped_name
 
     possible_scale_names = [".k_scale", ".v_scale"]
+    modelopt_scale_names = [
+        ".self_attn.k_proj.k_scale", ".self_attn.v_proj.v_scale"
+    ]
     for scale_name in possible_scale_names:
         if name.endswith(scale_name):
-            remapped_name = name.replace(scale_name, f".attn{scale_name}")
+            if any(mo_scale_name in name
+                   for mo_scale_name in modelopt_scale_names):
+                remapped_name = name.replace(
+                    f".self_attn.{scale_name[1]}_proj{scale_name}",
+                    f".self_attn.attn{scale_name}")
+            else:
+                remapped_name = name.replace(scale_name, f".attn{scale_name}")
             if remapped_name not in params_dict:
                 logger.warning_once(
                     f"Found {scale_name} in the checkpoint (e.g. {name}), "
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index e214c30f5d60b..e7c264c04f1aa 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -404,6 +404,11 @@ def load_weights(self, weights: Iterable[Tuple[str,
                 weight_loader(param, loaded_weight)
                 loaded_params.add(scale_name)
                 continue
+            if "scale" in name:
+                # Remapping the name of FP8 kv-scale.
+                name = maybe_remap_kv_scale_name(name, params_dict)
+                if name is None:
+                    continue
             for param_name, weight_name, shard_id in stacked_params_mapping:
                 if weight_name not in name:
                     continue
@@ -423,10 +428,6 @@ def load_weights(self, weights: Iterable[Tuple[str,
                 # Skip loading extra bias for GPTQ models.
                 if name.endswith(".bias") and name not in params_dict:
                     continue
-                # Remapping the name of FP8 kv-scale.
-                name = maybe_remap_kv_scale_name(name, params_dict)
-                if name is None:
-                    continue
 
                 if is_pp_missing_parameter(name, self):
                     continue
diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py
index da415cdae96ed..fbb3704fa080f 100644
--- a/vllm/model_executor/models/mixtral.py
+++ b/vllm/model_executor/models/mixtral.py
@@ -452,7 +452,11 @@ def load_weights(self, weights: Iterable[Tuple[str,
                 # Skip layers on other devices.
                 if is_pp_missing_parameter(name, self):
                     continue
-
+                if name.endswith("scale"):
+                    # Remapping the name of FP8 kv-scale.
+                    name = maybe_remap_kv_scale_name(name, params_dict)
+                    if name is None:
+                        continue
                 param = params_dict[name]
                 weight_loader = param.weight_loader
                 weight_loader(param, loaded_weight, shard_id)

From 27b78c73cad00f5c7bb3b2431f02dc680f7034bc Mon Sep 17 00:00:00 2001
From: Jinzhen Lin <linjinzhen@hotmail.com>
Date: Wed, 29 Jan 2025 22:07:09 +0800
Subject: [PATCH 429/462] [Kernel] add triton fused moe kernel for gptq/awq
 (#12185)

---
 tests/kernels/test_moe.py                     |  91 ++++
 .../layers/fused_moe/fused_moe.py             | 407 ++++++++++++++---
 .../layers/quantization/__init__.py           |   7 +-
 .../layers/quantization/moe_wna16.py          | 424 ++++++++++++++++++
 4 files changed, 874 insertions(+), 55 deletions(-)
 create mode 100644 vllm/model_executor/layers/quantization/moe_wna16.py

diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py
index 7fa5de1984452..7aa248ed1475c 100644
--- a/tests/kernels/test_moe.py
+++ b/tests/kernels/test_moe.py
@@ -18,6 +18,8 @@
     fused_moe as iterative_moe)
 from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
     marlin_quantize)
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+    quantize_weights)
 from vllm.model_executor.models.mixtral import MixtralMoE
 from vllm.platforms import current_platform
 from vllm.scalar_type import scalar_types
@@ -55,6 +57,95 @@ def test_fused_moe(
                                rtol=0)
 
 
+@pytest.mark.parametrize("m", [1, 32, 222])
+@pytest.mark.parametrize("n", [128, 1024, 2048])
+@pytest.mark.parametrize("k", [128, 1024])
+@pytest.mark.parametrize("e", NUM_EXPERTS)
+@pytest.mark.parametrize("topk", TOP_KS)
+@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
+@pytest.mark.parametrize("group_size", [64, 128])
+@pytest.mark.parametrize("has_zp", [True, False])
+@pytest.mark.parametrize("weight_bits", [4, 8])
+def test_fused_moe_wn16(m: int, n: int, k: int, e: int, topk: int,
+                        dtype: torch.dtype, group_size: int, has_zp: bool,
+                        weight_bits: int):
+    print(m, n, k, e, topk, dtype, group_size, has_zp, weight_bits)
+    a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
+    w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
+    w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10
+    score = torch.randn((m, e), device="cuda", dtype=dtype)
+
+    if weight_bits == 4:
+        pack_factor = 2
+        quant_type = scalar_types.uint4 if has_zp else scalar_types.uint4b8
+    elif weight_bits == 8:
+        pack_factor = 1
+        quant_type = scalar_types.uint8 if has_zp else scalar_types.uint8b128
+
+    w1_ref = w1.clone()
+    w2_ref = w2.clone()
+    w1_qweight = torch.empty((e, 2 * n, k // pack_factor),
+                             device="cuda",
+                             dtype=torch.uint8)
+    w2_qweight = torch.empty((e, k, n // pack_factor),
+                             device="cuda",
+                             dtype=torch.uint8)
+    w1_scales = torch.empty((e, 2 * n, k // group_size),
+                            device="cuda",
+                            dtype=dtype)
+    w2_scales = torch.empty((e, k, n // group_size),
+                            device="cuda",
+                            dtype=dtype)
+    w1_qzeros = torch.empty((e, 2 * n // pack_factor, k // group_size),
+                            device="cuda",
+                            dtype=torch.uint8)
+    w2_qzeros = torch.empty((e, k // pack_factor, n // group_size),
+                            device="cuda",
+                            dtype=torch.uint8)
+
+    for i in range(e * 2):
+        expert_id = i % e
+        if i // e == 0:
+            w, w_ref, w_qweight, w_scales, w_qzeros = \
+                w1, w1_ref, w1_qweight, w1_scales, w1_qzeros
+        else:
+            w, w_ref, w_qweight, w_scales, w_qzeros = \
+                w2, w2_ref, w2_qweight, w2_scales, w2_qzeros
+        weight, qweight, scales, qzeros = quantize_weights(
+            w[expert_id].T, quant_type, group_size, has_zp, False)
+        weight = weight.T
+        qweight = qweight.T.contiguous().to(torch.uint8)
+        scales = scales.T
+        if has_zp:
+            qzeros = qzeros.T.contiguous().to(torch.uint8)
+        if weight_bits == 4:
+            qweight = qweight[:, 1::2] * 16 + qweight[:, ::2]
+            if has_zp:
+                qzeros = qzeros[1::2, :] * 16 + qzeros[::2, :]
+
+        w_ref[expert_id] = weight
+        w_qweight[expert_id] = qweight
+        w_scales[expert_id] = scales
+        if has_zp:
+            w_qzeros[expert_id] = qzeros
+
+    triton_output = fused_moe(a,
+                              w1_qweight,
+                              w2_qweight,
+                              score,
+                              topk,
+                              renormalize=False,
+                              use_int4_w4a16=weight_bits == 4,
+                              use_int8_w8a16=weight_bits == 8,
+                              w1_scale=w1_scales,
+                              w2_scale=w2_scales,
+                              w1_zp=w1_qzeros if has_zp else None,
+                              w2_zp=w2_qzeros if has_zp else None,
+                              block_shape=[0, group_size])
+    torch_output = torch_moe(a, w1_ref, w2_ref, score, topk)
+    torch.testing.assert_close(triton_output, torch_output, atol=2e-2, rtol=0)
+
+
 @pytest.mark.parametrize("dtype",
                          [torch.float32, torch.float16, torch.bfloat16])
 @torch.inference_mode()
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index 308c1d6ac6db1..dbb6c2ce4649e 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -19,6 +19,206 @@
 logger = init_logger(__name__)
 
 
+@triton.jit
+def fused_moe_kernel_gptq_awq(
+        # Pointers to matrices
+        a_ptr,
+        b_ptr,
+        c_ptr,
+        b_scale_ptr,
+        b_zp_ptr,
+        topk_weights_ptr,
+        sorted_token_ids_ptr,
+        expert_ids_ptr,
+        num_tokens_post_padded_ptr,
+        # Matrix dimensions
+        N: tl.constexpr,
+        K: tl.constexpr,
+        EM,
+        num_valid_tokens,
+        # The stride variables represent how much to increase the ptr by when
+        # moving by 1 element in a particular dimension. E.g. `stride_am` is
+        # how much to increase `a_ptr` by to get the element one row down
+        # (A has M rows).
+        stride_am,
+        stride_ak,
+        stride_be,
+        stride_bk,
+        stride_bn,
+        stride_cm,
+        stride_cn,
+        stride_bse,
+        stride_bsk,
+        stride_bsn,
+        stride_bze,
+        stride_bzk,
+        stride_bzn,
+        block_k_diviable: tl.constexpr,
+        group_size: tl.constexpr,
+        # Meta-parameters
+        BLOCK_SIZE_M: tl.constexpr,
+        BLOCK_SIZE_N: tl.constexpr,
+        BLOCK_SIZE_K: tl.constexpr,
+        GROUP_SIZE_M: tl.constexpr,
+        MUL_ROUTED_WEIGHT: tl.constexpr,
+        top_k: tl.constexpr,
+        compute_type: tl.constexpr,
+        has_zp: tl.constexpr,
+        use_int4_w4a16: tl.constexpr,
+        use_int8_w8a16: tl.constexpr):
+    """
+    Implements the fused computation for a Mixture of Experts (MOE) using
+    token and expert matrices.
+
+    Key Parameters:
+    - A: The input tensor representing tokens with shape (*, K), where '*' can
+        be any shape representing batches and K is the feature dimension of
+        each token.
+    - B: The stacked MOE weight tensor with shape (E, N, K), where E is
+        the number of experts, K is the input feature dimension, and N is
+        the output feature dimension.
+    - C: The output cache tensor with shape (M, topk, N), where M is the
+        total number of tokens post padding, topk is the number of times
+        each token is repeated, and N is the output feature dimension.
+    - sorted_token_ids: A tensor containing the sorted indices of tokens,
+        repeated topk times and arranged by the expert index they are
+        assigned to.
+    - expert_ids: A tensor containing the indices of the expert for each
+        block. It determines which expert matrix from B should be used for
+        each block in A.
+    This kernel performs the multiplication of a token by its corresponding
+    expert matrix as determined by `expert_ids`. The sorting of
+    `sorted_token_ids` by expert index and padding ensures divisibility by
+    BLOCK_SIZE_M, which is necessary to maintain consistency in block matrix
+    multiplication across different blocks processed by the same expert.
+    """
+    # -----------------------------------------------------------
+    # Map program ids `pid` to the block of C it should compute.
+    # This is done in a grouped ordering to promote L2 data reuse.
+    pid = tl.program_id(axis=0)
+    num_pid_m = tl.cdiv(EM, BLOCK_SIZE_M)
+    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
+    num_pid_in_group = GROUP_SIZE_M * num_pid_n
+    group_id = pid // num_pid_in_group
+    first_pid_m = group_id * GROUP_SIZE_M
+    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
+    pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m)
+    pid_n = (pid % num_pid_in_group) // group_size_m
+
+    # ----------------------------------------------------------
+    # Create pointers for the first blocks of A and B.
+    # We will advance this pointer as we move in the K direction
+    # and accumulate
+    # `a_ptrs` is a block of [BLOCK_SIZE_M, BLOCK_SIZE_K] pointers
+    # `b_ptrs` is a block of [BLOCK_SIZE_K, BLOCK_SIZE_N] pointers
+    num_tokens_post_padded = tl.load(num_tokens_post_padded_ptr)
+    if pid_m * BLOCK_SIZE_M >= num_tokens_post_padded:
+        return
+    offs_token_id = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M).to(
+        tl.int64)
+    offs_token = tl.load(sorted_token_ids_ptr + offs_token_id)
+    token_mask = offs_token < num_valid_tokens
+
+    offs_bn = (pid_n * BLOCK_SIZE_N +
+               tl.arange(0, BLOCK_SIZE_N).to(tl.int64)) % N
+    offs_k = tl.arange(0, BLOCK_SIZE_K)
+    a_ptrs = a_ptr + (offs_token[:, None] // top_k * stride_am +
+                      offs_k[None, :] * stride_ak)
+
+    off_experts = tl.load(expert_ids_ptr + pid_m).to(tl.int64)
+
+    if use_int4_w4a16:
+        b_ptrs = b_ptr + off_experts * stride_be + \
+            (offs_k[:, None] // 2) * stride_bk + offs_bn[None, :] * stride_bn
+        b_shifter = (offs_k[:, None] % 2) * 4
+    elif use_int8_w8a16:
+        b_ptrs = b_ptr + off_experts * stride_be + \
+            offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn
+
+    if not has_zp and use_int4_w4a16:
+        b_zp_num = 8
+    if not has_zp and use_int8_w8a16:
+        b_zp_num = 128
+    elif has_zp and use_int4_w4a16:
+        b_zp_shifter = (offs_bn[None, :] % 2) * 4
+
+    # -----------------------------------------------------------
+    # Iterate to compute a block of the C matrix.
+    # We accumulate into a `[BLOCK_SIZE_M, BLOCK_SIZE_N]` block
+    # of fp32 values for higher accuracy.
+    # `accumulator` will be converted back to fp16 after the loop.
+    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
+    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
+        # Load the next block of A and B, generate a mask by checking the
+        # K dimension.
+
+        if not block_k_diviable:
+            k_mask = offs_k[:, None] < K - k * BLOCK_SIZE_K
+            k_other = 0.0
+        else:
+            k_mask = None
+            k_other = None
+
+        a = tl.load(a_ptrs,
+                    mask=token_mask[:, None] &
+                    (offs_k[None, :] < K - k * BLOCK_SIZE_K),
+                    other=0.0)
+        b = tl.load(b_ptrs)
+        if use_int4_w4a16:
+            b = (b >> b_shifter) & 0xF
+
+        b_scale_ptrs = b_scale_ptr + off_experts * stride_bse + \
+            offs_bn[None, :] * stride_bsn + \
+            ((offs_k[:, None] + BLOCK_SIZE_K * k) // group_size) * stride_bsk
+        b_scale = tl.load(b_scale_ptrs, mask=k_mask, other=k_other)
+        b_scale = b_scale.to(tl.float32)
+
+        if has_zp and use_int4_w4a16:
+            offs_k_true = (offs_k[:, None] + BLOCK_SIZE_K * k) // group_size
+            b_zp_ptrs = b_zp_ptr + off_experts * stride_bze + \
+                (offs_bn[None, :] // 2) * stride_bzn + \
+                offs_k_true * stride_bzk
+            b_zp = tl.load(b_zp_ptrs, mask=k_mask, other=k_other)
+            b_zp = ((b_zp >> b_zp_shifter) & 0xF)
+            b_zp = b_zp.to(tl.float32)
+        elif has_zp and use_int8_w8a16:
+            offs_k_true = (offs_k[:, None] + BLOCK_SIZE_K * k) // group_size
+            b_zp_ptrs = b_zp_ptr + off_experts * stride_bze + \
+                offs_bn[None, :] * stride_bzn + \
+                offs_k_true * stride_bzk
+            b_zp = tl.load(b_zp_ptrs, mask=k_mask, other=k_other)
+            b_zp = b_zp.to(tl.float32)
+
+        # We accumulate along the K dimension.
+        if has_zp:
+            b = ((b.to(tl.float32) - b_zp) * b_scale).to(compute_type)
+        else:
+            b = ((b.to(tl.float32) - b_zp_num) * b_scale).to(compute_type)
+        accumulator = tl.dot(a, b, acc=accumulator)
+
+        # Advance the ptrs to the next K block.
+        a_ptrs += BLOCK_SIZE_K * stride_ak
+        if use_int4_w4a16:
+            b_ptrs += (BLOCK_SIZE_K // 2) * stride_bk
+        else:
+            b_ptrs += BLOCK_SIZE_K * stride_bk
+
+    if MUL_ROUTED_WEIGHT:
+        moe_weight = tl.load(topk_weights_ptr + offs_token,
+                             mask=token_mask,
+                             other=0)
+        accumulator = accumulator * moe_weight[:, None]
+
+    accumulator = accumulator.to(compute_type)
+    # -----------------------------------------------------------
+    # Write back the block of the output
+    offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+    c_ptrs = c_ptr + stride_cm * offs_token[:, None] + stride_cn * offs_cn[
+        None, :]
+    c_mask = token_mask[:, None] & (offs_cn[None, :] < N)
+    tl.store(c_ptrs, accumulator, mask=c_mask)
+
+
 @triton.jit
 def fused_moe_kernel(
         # Pointers to matrices
@@ -266,6 +466,7 @@ def invoke_fused_moe_kernel(A: torch.Tensor,
                             C: torch.Tensor,
                             A_scale: Optional[torch.Tensor],
                             B_scale: Optional[torch.Tensor],
+                            B_zp: Optional[torch.Tensor],
                             topk_weights: torch.Tensor,
                             topk_ids: torch.Tensor,
                             sorted_token_ids: torch.Tensor,
@@ -277,6 +478,7 @@ def invoke_fused_moe_kernel(A: torch.Tensor,
                             compute_type: tl.dtype,
                             use_fp8_w8a8: bool,
                             use_int8_w8a16: bool,
+                            use_int4_w4a16: bool,
                             block_shape: Optional[List[int]] = None) -> None:
     assert topk_weights.stride(1) == 1
     assert sorted_token_ids.stride(0) == 1
@@ -292,50 +494,108 @@ def invoke_fused_moe_kernel(A: torch.Tensor,
             assert triton.cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
             assert triton.cdiv(B.shape[-2], block_n) == B_scale.shape[-2]
             assert triton.cdiv(B.shape[-1], block_k) == B_scale.shape[-1]
-    elif use_int8_w8a16:
+    elif use_int8_w8a16 or use_int4_w4a16:
         assert B_scale is not None
+        assert block_shape is None or block_shape[0] == 0
     else:
         assert A_scale is None
         assert B_scale is None
 
-    grid = lambda META: (triton.cdiv(sorted_token_ids.shape[0], META[
-        'BLOCK_SIZE_M']) * triton.cdiv(B.shape[1], META['BLOCK_SIZE_N']), )
+    EM = sorted_token_ids.shape[0]
+    if A.shape[0] < config["BLOCK_SIZE_M"]:
+        # optimize for small batch_size.
+        # We assume that top_ids of each token is unique, so
+        # so num_valid_experts <= batch_size <= BLOCK_SIZE_M,
+        # and we can skip some invalid blocks.
+        EM = min(sorted_token_ids.shape[0],
+                 A.shape[0] * top_k * config['BLOCK_SIZE_M'])
+    grid = lambda META: (triton.cdiv(EM, META['BLOCK_SIZE_M']) * triton.cdiv(
+        B.shape[1], META['BLOCK_SIZE_N']), )
+
+    if (use_int8_w8a16 or use_int4_w4a16) and \
+            block_shape is not None and block_shape[1] > 0:
+        assert B_scale is not None and B_scale.ndim == 3
+        assert B_zp is None or B_zp.ndim == 3
+
+        fused_moe_kernel_gptq_awq[grid](
+            A,
+            B,
+            C,
+            B_scale,
+            B_zp,
+            topk_weights,
+            sorted_token_ids,
+            expert_ids,
+            num_tokens_post_padded,
+            B.shape[1],
+            A.shape[1],
+            EM,
+            topk_ids.numel(),
+            A.stride(0),
+            A.stride(1),
+            B.stride(0),
+            B.stride(2),
+            B.stride(1),
+            C.stride(1),
+            C.stride(2),
+            B_scale.stride(0),
+            B_scale.stride(2),
+            B_scale.stride(1),
+            B_zp.stride(0) if B_zp is not None else 0,
+            B_zp.stride(2) if B_zp is not None else 0,
+            B_zp.stride(1) if B_zp is not None else 0,
+            block_k_diviable=A.shape[1] % config["BLOCK_SIZE_K"] == 0,
+            group_size=block_shape[1],
+            MUL_ROUTED_WEIGHT=mul_routed_weight,
+            top_k=top_k,
+            compute_type=compute_type,
+            has_zp=B_zp is not None,
+            use_int4_w4a16=use_int4_w4a16,
+            use_int8_w8a16=use_int8_w8a16,
+            **config,
+        )
 
-    fused_moe_kernel[grid](
-        A,
-        B,
-        C,
-        A_scale,
-        B_scale,
-        topk_weights,
-        sorted_token_ids,
-        expert_ids,
-        num_tokens_post_padded,
-        B.shape[1],
-        B.shape[2],
-        sorted_token_ids.shape[0],
-        topk_ids.numel(),
-        A.stride(0),
-        A.stride(1),
-        B.stride(0),
-        B.stride(2),
-        B.stride(1),
-        C.stride(1),
-        C.stride(2),
-        A_scale.stride(0) if A_scale is not None and A_scale.ndim == 2 else 0,
-        A_scale.stride(1) if A_scale is not None and A_scale.ndim == 2 else 0,
-        B_scale.stride(0) if B_scale is not None and B_scale.ndim >= 2 else 0,
-        B_scale.stride(2) if B_scale is not None and B_scale.ndim == 3 else 0,
-        B_scale.stride(1) if B_scale is not None and B_scale.ndim >= 2 else 0,
-        0 if block_shape is None else block_shape[0],
-        0 if block_shape is None else block_shape[1],
-        MUL_ROUTED_WEIGHT=mul_routed_weight,
-        top_k=top_k,
-        compute_type=compute_type,
-        use_fp8_w8a8=use_fp8_w8a8,
-        use_int8_w8a16=use_int8_w8a16,
-        **config,
-    )
+    else:
+        fused_moe_kernel[grid](
+            A,
+            B,
+            C,
+            A_scale,
+            B_scale,
+            topk_weights,
+            sorted_token_ids,
+            expert_ids,
+            num_tokens_post_padded,
+            B.shape[1],
+            A.shape[1],
+            EM,
+            topk_ids.numel(),
+            A.stride(0),
+            A.stride(1),
+            B.stride(0),
+            B.stride(2),
+            B.stride(1),
+            C.stride(1),
+            C.stride(2),
+            A_scale.stride(0)
+            if A_scale is not None and A_scale.ndim == 2 else 0,
+            A_scale.stride(1)
+            if A_scale is not None and A_scale.ndim == 2 else 0,
+            B_scale.stride(0)
+            if B_scale is not None and B_scale.ndim >= 2 else 0,
+            B_scale.stride(2)
+            if B_scale is not None and B_scale.ndim == 3 else 0,
+            B_scale.stride(1)
+            if B_scale is not None and B_scale.ndim >= 2 else 0,
+            0 if block_shape is None else block_shape[0],
+            0 if block_shape is None else block_shape[1],
+            MUL_ROUTED_WEIGHT=mul_routed_weight,
+            top_k=top_k,
+            compute_type=compute_type,
+            use_fp8_w8a8=use_fp8_w8a8,
+            use_int8_w8a16=use_int8_w8a16,
+            **config,
+        )
 
 
 def get_config_file_name(E: int, N: int, dtype: Optional[str]) -> str:
@@ -432,7 +692,7 @@ def try_get_optimal_moe_config(
     # NOTE: For block-wise quant,
     # BLOCK_K must be divisible by block_shape[1]
     # BLOCK_N and BLOCK_M has no requirements
-    if block_shape is not None:
+    if block_shape is not None and block_shape[0] != 0:
         config["BLOCK_SIZE_N"] = block_shape[0]
         config["BLOCK_SIZE_K"] = block_shape[1]
     return config
@@ -531,12 +791,15 @@ def grouped_topk(hidden_states: torch.Tensor,
 
 
 def get_config_dtype_str(dtype: torch.dtype,
+                         use_int4_w4a16: Optional[bool] = False,
                          use_int8_w8a16: Optional[bool] = False,
                          use_fp8_w8a8: Optional[bool] = False):
     if use_fp8_w8a8:
         return "fp8_w8a8"
     elif use_int8_w8a16:
         return "int8_w8a16"
+    elif use_int4_w4a16:
+        return "int4_w8a16"
     elif dtype == torch.float:
         # avoiding cases where kernel fails when float32 MoE
         # use fp16/bfloat16 configs
@@ -551,14 +814,17 @@ def inplace_fused_experts(hidden_states: torch.Tensor,
                           topk_ids: torch.Tensor,
                           use_fp8_w8a8: bool = False,
                           use_int8_w8a16: bool = False,
+                          use_int4_w4a16: bool = False,
                           w1_scale: Optional[torch.Tensor] = None,
                           w2_scale: Optional[torch.Tensor] = None,
+                          w1_zp: Optional[torch.Tensor] = None,
+                          w2_zp: Optional[torch.Tensor] = None,
                           a1_scale: Optional[torch.Tensor] = None,
                           a2_scale: Optional[torch.Tensor] = None,
                           block_shape: Optional[List[int]] = None) -> None:
     fused_experts_impl(hidden_states, w1, w2, topk_weights, topk_ids, True,
-                       use_fp8_w8a8, use_int8_w8a16, w1_scale, w2_scale,
-                       a1_scale, a2_scale, block_shape)
+                       use_fp8_w8a8, use_int8_w8a16, use_int4_w4a16, w1_scale,
+                       w2_scale, w1_zp, w2_zp, a1_scale, a2_scale, block_shape)
 
 
 def inplace_fused_experts_fake(
@@ -569,8 +835,11 @@ def inplace_fused_experts_fake(
         topk_ids: torch.Tensor,
         use_fp8_w8a8: bool = False,
         use_int8_w8a16: bool = False,
+        use_int4_w4a16: bool = False,
         w1_scale: Optional[torch.Tensor] = None,
         w2_scale: Optional[torch.Tensor] = None,
+        w1_zp: Optional[torch.Tensor] = None,
+        w2_zp: Optional[torch.Tensor] = None,
         a1_scale: Optional[torch.Tensor] = None,
         a2_scale: Optional[torch.Tensor] = None,
         block_shape: Optional[List[int]] = None) -> None:
@@ -593,14 +862,18 @@ def outplace_fused_experts(
         topk_ids: torch.Tensor,
         use_fp8_w8a8: bool = False,
         use_int8_w8a16: bool = False,
+        use_int4_w4a16: bool = False,
         w1_scale: Optional[torch.Tensor] = None,
         w2_scale: Optional[torch.Tensor] = None,
+        w1_zp: Optional[torch.Tensor] = None,
+        w2_zp: Optional[torch.Tensor] = None,
         a1_scale: Optional[torch.Tensor] = None,
         a2_scale: Optional[torch.Tensor] = None,
         block_shape: Optional[List[int]] = None) -> torch.Tensor:
     return fused_experts_impl(hidden_states, w1, w2, topk_weights, topk_ids,
-                              False, use_fp8_w8a8, use_int8_w8a16, w1_scale,
-                              w2_scale, a1_scale, a2_scale, block_shape)
+                              False, use_fp8_w8a8, use_int8_w8a16,
+                              use_int4_w4a16, w1_scale, w2_scale, w1_zp, w2_zp,
+                              a1_scale, a2_scale, block_shape)
 
 
 def outplace_fused_experts_fake(
@@ -611,8 +884,11 @@ def outplace_fused_experts_fake(
         topk_ids: torch.Tensor,
         use_fp8_w8a8: bool = False,
         use_int8_w8a16: bool = False,
+        use_int4_w4a16: bool = False,
         w1_scale: Optional[torch.Tensor] = None,
         w2_scale: Optional[torch.Tensor] = None,
+        w1_zp: Optional[torch.Tensor] = None,
+        w2_zp: Optional[torch.Tensor] = None,
         a1_scale: Optional[torch.Tensor] = None,
         a2_scale: Optional[torch.Tensor] = None,
         block_shape: Optional[List[int]] = None) -> torch.Tensor:
@@ -635,8 +911,11 @@ def fused_experts(hidden_states: torch.Tensor,
                   inplace: bool = False,
                   use_fp8_w8a8: bool = False,
                   use_int8_w8a16: bool = False,
+                  use_int4_w4a16: bool = False,
                   w1_scale: Optional[torch.Tensor] = None,
                   w2_scale: Optional[torch.Tensor] = None,
+                  w1_zp: Optional[torch.Tensor] = None,
+                  w2_zp: Optional[torch.Tensor] = None,
                   a1_scale: Optional[torch.Tensor] = None,
                   a2_scale: Optional[torch.Tensor] = None,
                   block_shape: Optional[List[int]] = None):
@@ -644,16 +923,15 @@ def fused_experts(hidden_states: torch.Tensor,
         torch.ops.vllm.inplace_fused_experts(hidden_states, w1, w2,
                                              topk_weights, topk_ids,
                                              use_fp8_w8a8, use_int8_w8a16,
-                                             w1_scale, w2_scale, a1_scale,
+                                             use_int4_w4a16, w1_scale,
+                                             w2_scale, w1_zp, w2_zp, a1_scale,
                                              a2_scale, block_shape)
         return hidden_states
     else:
-        return torch.ops.vllm.outplace_fused_experts(hidden_states, w1, w2,
-                                                     topk_weights, topk_ids,
-                                                     use_fp8_w8a8,
-                                                     use_int8_w8a16, w1_scale,
-                                                     w2_scale, a1_scale,
-                                                     a2_scale, block_shape)
+        return torch.ops.vllm.outplace_fused_experts(
+            hidden_states, w1, w2, topk_weights, topk_ids, use_fp8_w8a8,
+            use_int8_w8a16, use_int4_w4a16, w1_scale, w2_scale, w1_zp, w2_zp,
+            a1_scale, a2_scale, block_shape)
 
 
 def fused_experts_impl(hidden_states: torch.Tensor,
@@ -664,13 +942,21 @@ def fused_experts_impl(hidden_states: torch.Tensor,
                        inplace: bool = False,
                        use_fp8_w8a8: bool = False,
                        use_int8_w8a16: bool = False,
+                       use_int4_w4a16: bool = False,
                        w1_scale: Optional[torch.Tensor] = None,
                        w2_scale: Optional[torch.Tensor] = None,
+                       w1_zp: Optional[torch.Tensor] = None,
+                       w2_zp: Optional[torch.Tensor] = None,
                        a1_scale: Optional[torch.Tensor] = None,
                        a2_scale: Optional[torch.Tensor] = None,
                        block_shape: Optional[List[int]] = None):
     # Check constraints.
-    assert hidden_states.shape[1] == w1.shape[2], "Hidden size mismatch"
+    if use_int4_w4a16:
+        assert hidden_states.shape[1] // 2 == w1.shape[
+            2], "Hidden size mismatch"
+    else:
+        assert hidden_states.shape[1] == w1.shape[2], "Hidden size mismatch"
+
     assert topk_weights.shape == topk_ids.shape, "topk shape mismatch"
     assert hidden_states.is_contiguous(), "Hidden_states must be contiguous"
     assert w1.is_contiguous(), "Expert weights1 must be contiguous"
@@ -687,6 +973,7 @@ def fused_experts_impl(hidden_states: torch.Tensor,
     M = min(num_tokens, CHUNK_SIZE)
     config_dtype = get_config_dtype_str(use_fp8_w8a8=use_fp8_w8a8,
                                         use_int8_w8a16=use_int8_w8a16,
+                                        use_int4_w4a16=use_int4_w4a16,
                                         dtype=hidden_states.dtype)
 
     get_config_func = functools.partial(
@@ -755,6 +1042,7 @@ def fused_experts_impl(hidden_states: torch.Tensor,
                                 intermediate_cache1,
                                 a1_scale,
                                 w1_scale,
+                                w1_zp,
                                 curr_topk_weights,
                                 curr_topk_ids,
                                 sorted_token_ids,
@@ -766,6 +1054,7 @@ def fused_experts_impl(hidden_states: torch.Tensor,
                                 compute_type=compute_type,
                                 use_fp8_w8a8=use_fp8_w8a8,
                                 use_int8_w8a16=use_int8_w8a16,
+                                use_int4_w4a16=use_int4_w4a16,
                                 block_shape=block_shape)
 
         torch.ops._C.silu_and_mul(intermediate_cache2,
@@ -776,6 +1065,7 @@ def fused_experts_impl(hidden_states: torch.Tensor,
                                 intermediate_cache3,
                                 a2_scale,
                                 w2_scale,
+                                w2_zp,
                                 curr_topk_weights,
                                 curr_topk_ids,
                                 sorted_token_ids,
@@ -787,6 +1077,7 @@ def fused_experts_impl(hidden_states: torch.Tensor,
                                 compute_type=compute_type,
                                 use_fp8_w8a8=use_fp8_w8a8,
                                 use_int8_w8a16=use_int8_w8a16,
+                                use_int4_w4a16=use_int4_w4a16,
                                 block_shape=block_shape)
 
         ops.moe_sum(intermediate_cache3.view(*intermediate_cache3.shape),
@@ -808,8 +1099,11 @@ def fused_moe(
     custom_routing_function: Optional[Callable] = None,
     use_fp8_w8a8: bool = False,
     use_int8_w8a16: bool = False,
+    use_int4_w4a16: bool = False,
     w1_scale: Optional[torch.Tensor] = None,
     w2_scale: Optional[torch.Tensor] = None,
+    w1_zp: Optional[torch.Tensor] = None,
+    w2_zp: Optional[torch.Tensor] = None,
     a1_scale: Optional[torch.Tensor] = None,
     a2_scale: Optional[torch.Tensor] = None,
     block_shape: Optional[List[int]] = None,
@@ -834,8 +1128,12 @@ def fused_moe(
         note: Deepseekv2 model uses grouped_topk
     - use_fp8_w8a8 (bool): If True, use fp8 arithmetic to compute the inner
         products for w1 and w2. Defaults to False.
-    - use_int8_w8a16 (bool): If True, use fp8 arithmetic to compute the inner
-        products for w1 and w2. Defaults to False.
+    - use_int8_w8a16 (bool): If True, use matmul of int8 weight and bf16/fp16
+        activation to compute the inner products for w1 and w2.
+        Defaults to False.
+    - use_int4_w4a16 (bool): If True, use matmul of int4 weight and bf16/fp16
+        activation to compute the inner products for w1 and w2.
+        Defaults to False.
     - w1_scale (Optional[torch.Tensor]): Optional scale to be used for
         w1.
     - w2_scale (Optional[torch.Tensor]): Optional scale to be used for
@@ -873,8 +1171,11 @@ def fused_moe(
                          inplace=inplace,
                          use_fp8_w8a8=use_fp8_w8a8,
                          use_int8_w8a16=use_int8_w8a16,
+                         use_int4_w4a16=use_int4_w4a16,
                          w1_scale=w1_scale,
                          w2_scale=w2_scale,
+                         w1_zp=w1_zp,
+                         w2_zp=w2_zp,
                          a1_scale=a1_scale,
                          a2_scale=a2_scale,
                          block_shape=block_shape)
diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py
index d2bde13fcf546..bd0fd47993396 100644
--- a/vllm/model_executor/layers/quantization/__init__.py
+++ b/vllm/model_executor/layers/quantization/__init__.py
@@ -26,7 +26,8 @@
     "experts_int8",
     "neuron_quant",
     "ipex",
-    "quark"
+    "quark",
+    "moe_wna16"
 ]
 
 # The customized quantization methods which will be added to this dict.
@@ -94,6 +95,7 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
     from .ipex_quant import IPEXConfig
     from .marlin import MarlinConfig
     from .modelopt import ModelOptFp8Config
+    from .moe_wna16 import MoeWNA16Config
     from .neuron_quant import NeuronQuantConfig
     from .qqq import QQQConfig
     from .tpu_int8 import Int8TpuConfig
@@ -121,7 +123,8 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
         "experts_int8": ExpertsInt8Config,
         "neuron_quant": NeuronQuantConfig,
         "ipex": IPEXConfig,
-        "quark": QuarkConfig
+        "quark": QuarkConfig,
+        "moe_wna16": MoeWNA16Config,
     }
     # Update the `method_to_config` with customized quantization methods.
     method_to_config.update(_CUSTOMIZED_METHOD_TO_QUANT_CONFIG)
diff --git a/vllm/model_executor/layers/quantization/moe_wna16.py b/vllm/model_executor/layers/quantization/moe_wna16.py
new file mode 100644
index 0000000000000..8cd9c0a7ef253
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/moe_wna16.py
@@ -0,0 +1,424 @@
+from typing import Any, Callable, Dict, List, Optional
+
+import torch
+
+from vllm.distributed import get_tensor_model_parallel_rank, get_tp_group
+from vllm.model_executor.layers.fused_moe.layer import (
+    FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported)
+from vllm.model_executor.layers.linear import UnquantizedLinearMethod
+from vllm.model_executor.layers.quantization.awq import (AWQConfig,
+                                                         AWQLinearMethod)
+from vllm.model_executor.layers.quantization.awq_marlin import (
+    AWQMarlinConfig, AWQMarlinLinearMethod)
+from vllm.model_executor.layers.quantization.base_config import (
+    QuantizationConfig, QuantizeMethodBase)
+from vllm.model_executor.layers.quantization.gptq import (GPTQConfig,
+                                                          GPTQLinearMethod)
+from vllm.model_executor.layers.quantization.gptq_marlin import (
+    GPTQMarlinConfig, GPTQMarlinLinearMethod)
+from vllm.model_executor.utils import set_weight_attrs
+from vllm.platforms import current_platform
+
+
+class MoeWNA16Config(QuantizationConfig):
+    """Config class for MOE WNA16 (W8A16/W4A16) quantization."""
+
+    def __init__(self, linear_quant_method: str, weight_bits: int,
+                 group_size: int, has_zp: bool, lm_head_quantized: bool,
+                 modules_to_not_convert: Optional[List[str]],
+                 full_config: Dict[str, Any]) -> None:
+        self.weight_bits = weight_bits
+        self.group_size = group_size
+        self.has_zp = has_zp
+        self.bit8_pack_factor = 8 // self.weight_bits
+        self.lm_head_quantized = lm_head_quantized
+        self.linear_quant_method = linear_quant_method
+        self.full_config = full_config
+        self.use_marlin = False
+        if self.linear_quant_method == "gptq":
+            self.use_marlin = GPTQMarlinConfig.is_gptq_marlin_compatible(
+                full_config)
+        elif self.linear_quant_method == "awq":
+            capability_tuple = current_platform.get_device_capability()
+            device_capability = (-1 if capability_tuple is None else
+                                 capability_tuple.to_int())
+            awq_min_capability = AWQConfig.get_min_capability()
+            if device_capability < awq_min_capability:
+                raise ValueError(
+                    "The quantization method moe_wna16 + awq is not supported "
+                    "for the current GPU. "
+                    f"Minimum capability: {awq_min_capability}. "
+                    f"Current capability: {device_capability}.")
+            self.use_marlin = AWQMarlinConfig.is_awq_marlin_compatible(
+                full_config)
+        else:
+            raise ValueError("moe_wna16 only support gptq and awq.")
+
+        if modules_to_not_convert is None:
+            self.modules_to_not_convert = []
+        else:
+            self.modules_to_not_convert = modules_to_not_convert
+
+    @classmethod
+    def get_name(cls) -> str:
+        return "moe_wna16"
+
+    @classmethod
+    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
+        return [torch.bfloat16, torch.half]
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        return 70
+
+    @classmethod
+    def get_config_filenames(cls) -> List[str]:
+        return ["quantize_config.json"]
+
+    @classmethod
+    def from_config(cls, config: Dict[str, Any]) -> "MoeWNA16Config":
+        linear_quant_method = cls.get_from_keys(config, ["quant_method"])
+        weight_bits = cls.get_from_keys(config, ["bits"])
+        group_size = cls.get_from_keys(config, ["group_size"])
+        lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"],
+                                                 default=False)
+        if linear_quant_method == "gptq":
+            has_zp = not cls.get_from_keys(config, ["sym"])
+            modules_to_not_convert = []
+        elif linear_quant_method == "awq":
+            has_zp = cls.get_from_keys(config, ["zero_point"])
+            modules_to_not_convert = cls.get_from_keys(
+                config, ["modules_to_not_convert"])
+        else:
+            raise ValueError("moe_wna16 only support gptq and awq.")
+
+        return cls(linear_quant_method, weight_bits, group_size, has_zp,
+                   lm_head_quantized, modules_to_not_convert, config)
+
+    @classmethod
+    def override_quantization_method(cls, hf_quant_cfg,
+                                     user_quant) -> Optional[str]:
+        can_convert = cls.is_moe_wna16_compatible(hf_quant_cfg)
+        if can_convert and user_quant == "moe_wna16":
+            return cls.get_name()
+        return None
+
+    @classmethod
+    def is_moe_wna16_compatible(cls, quant_config: Dict[str, Any]):
+        # Extract data from quant config.
+        quant_method = quant_config.get("quant_method", "").lower()
+        num_bits = quant_config.get("bits")
+        desc_act = quant_config.get("desc_act")
+
+        capability_tuple = current_platform.get_device_capability()
+        device_capability = (-1 if capability_tuple is None else
+                             capability_tuple.to_int())
+        awq_min_capability = AWQConfig.get_min_capability()
+
+        gptq_compatible = quant_method == "gptq" and \
+                not desc_act and num_bits in [4, 8]
+        awq_compatible = quant_method == "awq" and num_bits == 4 and \
+            device_capability >= awq_min_capability
+
+        return gptq_compatible or awq_compatible
+
+    def get_quant_method(self, layer: torch.nn.Module,
+                         prefix: str) -> Optional["QuantizeMethodBase"]:
+        if is_layer_skipped_quant(prefix, self.modules_to_not_convert):
+            return UnquantizedLinearMethod()
+        elif isinstance(layer, FusedMoE):
+            return MoeWNA16Method(self)
+        else:
+            if self.linear_quant_method == "gptq":
+                if self.use_marlin:
+                    return GPTQMarlinLinearMethod(
+                        GPTQMarlinConfig.from_config(self.full_config))
+                else:
+                    return GPTQLinearMethod(
+                        GPTQConfig.from_config(self.full_config))
+            elif self.linear_quant_method == "awq":
+                if self.use_marlin:
+                    return AWQMarlinLinearMethod(
+                        AWQMarlinConfig.from_config(self.full_config))
+                else:
+                    return AWQLinearMethod(
+                        AWQConfig.from_config(self.full_config))
+            else:
+                raise ValueError("moe_wna16 only support gptq and awq.")
+
+
+def is_layer_skipped_quant(prefix: str, modules_to_not_convert: List[str]):
+    return any(module_name in prefix for module_name in modules_to_not_convert)
+
+
+class MoeWNA16Method(FusedMoEMethodBase):
+    """Linear method for MOE WNA16 (W8A16/W4A16) quantization.
+
+    Args:
+        quant_config: The MOE WNA16 (W8A16/W4A16) quantization config.
+    """
+
+    def __init__(self, quant_config: MoeWNA16Config):
+        self.quant_config = quant_config
+
+    def create_weights(self, layer: torch.nn.Module, num_experts: int,
+                       hidden_size: int, intermediate_size_per_partition: int,
+                       params_dtype: torch.dtype, **extra_weight_attrs):
+
+        layer.quant_config = self.quant_config
+        bit8_pack_factor = self.quant_config.bit8_pack_factor
+        group_size = self.quant_config.group_size
+        group_size_div_factor = 1
+
+        # make intermediate_size and hidden_size diviable by group_size
+        # we reduce the group size to ensure that
+        # and we would repeat the loaded_weight later
+        while intermediate_size_per_partition % group_size or \
+                hidden_size % group_size:
+            group_size = group_size // 2
+            group_size_div_factor *= 2
+            assert group_size >= 32
+        layer.group_size = group_size
+        layer.group_size_div_factor = group_size_div_factor
+
+        strategy = FusedMoeWeightScaleSupported.GROUP.value
+        extra_weight_attrs.update({
+            "quant_method": strategy,
+            "is_transposed": False
+        })
+
+        assert 'weight_loader' in extra_weight_attrs
+        weight_loader = extra_weight_attrs['weight_loader']
+        wrapped_weight_loader = MoeWNA16Method.get_weight_loader(
+            layer, weight_loader)
+        extra_weight_attrs['weight_loader'] = wrapped_weight_loader
+
+        # Fused gate_up_proj (column parallel)
+        w13_qweight = torch.nn.Parameter(torch.empty(
+            num_experts,
+            2 * intermediate_size_per_partition,
+            hidden_size // bit8_pack_factor,
+            dtype=torch.uint8),
+                                         requires_grad=False)
+        layer.register_parameter("w13_qweight", w13_qweight)
+        set_weight_attrs(w13_qweight, extra_weight_attrs)
+
+        # down_proj (row parallel)
+        w2_qweight = torch.nn.Parameter(torch.empty(
+            num_experts,
+            hidden_size,
+            intermediate_size_per_partition // bit8_pack_factor,
+            dtype=torch.uint8),
+                                        requires_grad=False)
+        layer.register_parameter("w2_qweight", w2_qweight)
+        set_weight_attrs(w2_qweight, extra_weight_attrs)
+
+        w13_scales = torch.nn.Parameter(torch.zeros(
+            num_experts,
+            2 * intermediate_size_per_partition,
+            hidden_size // group_size,
+            dtype=params_dtype),
+                                        requires_grad=False)
+        layer.register_parameter("w13_scales", w13_scales)
+        set_weight_attrs(w13_scales, extra_weight_attrs)
+
+        w2_scales = torch.nn.Parameter(torch.zeros(
+            num_experts,
+            hidden_size,
+            intermediate_size_per_partition // group_size,
+            dtype=params_dtype),
+                                       requires_grad=False)
+        layer.register_parameter("w2_scales", w2_scales)
+        set_weight_attrs(w2_scales, extra_weight_attrs)
+
+        if self.quant_config.has_zp:
+            w13_qzeros = torch.nn.Parameter(torch.zeros(
+                num_experts,
+                2 * intermediate_size_per_partition // bit8_pack_factor,
+                hidden_size // group_size,
+                dtype=torch.uint8),
+                                            requires_grad=False)
+            layer.register_parameter("w13_qzeros", w13_qzeros)
+            set_weight_attrs(w13_qzeros, extra_weight_attrs)
+
+            w2_qzeros = torch.nn.Parameter(torch.zeros(
+                num_experts,
+                hidden_size // bit8_pack_factor,
+                intermediate_size_per_partition // group_size,
+                dtype=torch.uint8),
+                                           requires_grad=False)
+            layer.register_parameter("w2_qzeros", w2_qzeros)
+            set_weight_attrs(w2_qzeros, extra_weight_attrs)
+
+        if self.quant_config.linear_quant_method == "gptq":
+            # some param are unused, but we need to init them in order to
+            # load weights
+            invalid_param_keys = ["w13_g_idx", "w2_g_idx"]
+            if not self.quant_config.has_zp:
+                invalid_param_keys += ["w13_qzeros", "w2_qzeros"]
+            for key in invalid_param_keys:
+                param = torch.nn.Parameter(torch.empty((0, ),
+                                                       dtype=torch.int32),
+                                           requires_grad=False)
+                layer.register_parameter(key, param)
+                set_weight_attrs(param, extra_weight_attrs)
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        router_logits: torch.Tensor,
+        top_k: int,
+        renormalize: bool,
+        use_grouped_topk: bool = False,
+        topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
+        custom_routing_function: Optional[Callable] = None,
+        scoring_func: str = "softmax",
+        e_score_correction_bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        from vllm.model_executor.layers.fused_moe import fused_experts
+
+        topk_weights, topk_ids = FusedMoE.select_experts(
+            hidden_states=x,
+            router_logits=router_logits,
+            use_grouped_topk=use_grouped_topk,
+            top_k=top_k,
+            renormalize=renormalize,
+            topk_group=topk_group,
+            num_expert_group=num_expert_group,
+            custom_routing_function=custom_routing_function,
+            scoring_func=scoring_func,
+            e_score_correction_bias=e_score_correction_bias)
+
+        weight_bits = self.quant_config.weight_bits
+        has_zp = self.quant_config.has_zp
+
+        return fused_experts(x,
+                             layer.w13_qweight,
+                             layer.w2_qweight,
+                             topk_weights=topk_weights,
+                             topk_ids=topk_ids,
+                             inplace=True,
+                             use_int4_w4a16=weight_bits == 4,
+                             use_int8_w8a16=weight_bits == 8,
+                             w1_scale=layer.w13_scales,
+                             w2_scale=layer.w2_scales,
+                             w1_zp=layer.w13_qzeros if has_zp else None,
+                             w2_zp=layer.w2_qzeros if has_zp else None,
+                             block_shape=[0, layer.group_size])
+
+    @staticmethod
+    def get_weight_loader(layer, weight_loader):
+
+        def convert_awq_tensor(tensor, tensor_type):
+            # convert awq qweight/qzeros to a standard format (assume int4)
+            # qweight: (k, n // pack_factor_bit32) -> (n, k // pack_factor_bit8)
+            # qzeros: (k // group_size, n // pack_factor_bit32) ->
+            #         (n // pack_factor_bit8, k // group_size)
+            # pack_factor_bit32 = 32 // weight_bits
+            # pack_factor_bit8 = 8 // weight_bits
+
+            # 0. suppose origin shape (a, b), dtype int32
+            # 1. convert to uint8, shape (a, b) -> (a, 4 * b)
+            size0 = tensor.size(0)
+            tensor = tensor.view(torch.uint8)
+
+            # 2. unpack to uint4 (only when weight_bits == 4)
+            #    shape (a, 4 * b) -> (a, 4 * b, 2)
+            shifter = torch.tensor([0, 4],
+                                   dtype=torch.uint8,
+                                   device=tensor.device)
+            tensor = (tensor[:, :, None] >> shifter) & 0xF
+
+            # 3. change order, see
+            # https://github.com/casper-hansen/AutoAWQ/blob/v0.2.8/awq/utils/quant_utils.py
+            # shape -> (a, 4 * b * pack_factor_bit8)
+            reverse_awq_pack_order = [0, 4, 1, 5, 2, 6, 3, 7]
+            tensor = tensor.view(-1, 8)[:, reverse_awq_pack_order]
+            tensor = tensor.view(size0, -1)
+
+            # 4. transpose, shape -> (4 * b * pack_factor_bit8, a)
+            tensor = tensor.T.contiguous()
+
+            # 5. repack (only when weight_bits == 4)
+            # qweight shape -> (4 * b * pack_factor_bit8, a // pack_factor_bit8)
+            # qzeros shape -> (4 * b, a)
+
+            if tensor_type == "qweight":
+                tensor = tensor[:, 1::2] * 16 + tensor[:, ::2]
+            elif tensor_type == "qzeros":
+                tensor = tensor[1::2, :] * 16 + tensor[::2, :]
+            return tensor
+
+        def convert_gptq_int4_qzeros(tensor):
+            tensor = tensor.view(torch.uint8)
+            shifter = torch.tensor([0, 4],
+                                   dtype=torch.uint8,
+                                   device=tensor.device)
+            tensor = (tensor[:, :, None] >> shifter) & 0xF
+            tensor = tensor + 1
+            tensor = tensor[:, :, 0] + tensor[:, :, 1] * 16
+            return tensor
+
+        def moe_wna16_weight_loader(param: torch.nn.Parameter,
+                                    loaded_weight: torch.Tensor,
+                                    weight_name: str, shard_id: str,
+                                    expert_id: int):
+            if "g_idx" in weight_name:
+                return
+            if not layer.quant_config.has_zp and "qzeros" in weight_name:
+                return
+
+            device = get_tp_group().device
+            tp_rank = get_tensor_model_parallel_rank()
+            loaded_weight = loaded_weight.to(device)
+            shard_size = layer.intermediate_size_per_partition
+
+            # convert gptq and awq weight to a standard format
+            if layer.quant_config.linear_quant_method == "awq":
+                assert layer.quant_config.weight_bits == 4
+                if "weight" in weight_name:
+                    loaded_weight = convert_awq_tensor(loaded_weight,
+                                                       "qweight")
+                elif "zeros" in weight_name:
+                    loaded_weight = convert_awq_tensor(loaded_weight, "qzeros")
+                else:
+                    loaded_weight = loaded_weight.T
+            elif layer.quant_config.linear_quant_method == "gptq":
+                assert layer.quant_config.weight_bits in [4, 8]
+                if "weight" in weight_name:
+                    loaded_weight = loaded_weight.T.contiguous().view(
+                        torch.uint8)
+                elif "zeros" in weight_name:
+                    # add 1 to gptq qzeros to align with awq
+                    loaded_weight = loaded_weight.view(torch.uint8)
+                    if layer.quant_config.weight_bits == 4:
+                        loaded_weight = convert_gptq_int4_qzeros(
+                            loaded_weight).T
+                    else:
+                        loaded_weight = loaded_weight.T + 1
+                else:
+                    loaded_weight = loaded_weight.T
+
+            # repeat the qzeros/scales to fit new group size
+            if layer.group_size_div_factor > 1 and \
+                    "qzeros" in weight_name or "scales" in weight_name:
+                loaded_weight = loaded_weight.repeat_interleave(
+                    layer.group_size_div_factor, 1)
+
+            if "w13_qzeros" in weight_name:
+                tensor = loaded_weight.view(layer.tp_size, -1,
+                                            loaded_weight.size(1))[tp_rank]
+                if shard_id == "w1":
+                    param.data[expert_id, :shard_size // 2] = tensor
+                else:
+                    param.data[expert_id, shard_size // 2:] = tensor
+            elif "w2_qzeros" in weight_name:
+                param.data[expert_id] = loaded_weight.view(
+                    loaded_weight.size(0), layer.tp_size, -1)[:, tp_rank]
+            else:
+                weight_loader(param, loaded_weight, weight_name, shard_id,
+                              expert_id)
+
+        return moe_wna16_weight_loader

From 73aa6cfdf789ddc67a3d2924ef52fd791554fe2a Mon Sep 17 00:00:00 2001
From: Tyler Michael Smith <tyler@neuralmagic.com>
Date: Wed, 29 Jan 2025 16:12:24 -0500
Subject: [PATCH 430/462] Revert "[Build/CI] Fix libcuda.so linkage" (#12552)

---
 CMakeLists.txt | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4dee9ec36895f..6c946fc5aa3ac 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -446,9 +446,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 endif()
 
 message(STATUS "Enabling C extension.")
-if(VLLM_GPU_LANG STREQUAL "CUDA")
-  list(APPEND VLLM_C_LIBS cuda)
-endif()
 define_gpu_extension_target(
   _C
   DESTINATION vllm
@@ -457,7 +454,6 @@ define_gpu_extension_target(
   COMPILE_FLAGS ${VLLM_GPU_FLAGS}
   ARCHITECTURES ${VLLM_GPU_ARCHES}
   INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR};${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
-  LIBRARIES ${VLLM_C_LIBS}
   USE_SABI 3
   WITH_SOABI)
 

From e0cc5f259a8bec0d66ed0bc3e25ca245377679a1 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Wed, 29 Jan 2025 13:47:33 -0800
Subject: [PATCH 431/462] [V1][BugFix] Free encoder cache for aborted requests
 (#12545)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
---
 vllm/v1/core/encoder_cache_manager.py |  9 ++++++++-
 vllm/v1/core/scheduler.py             | 14 ++++++++------
 2 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/vllm/v1/core/encoder_cache_manager.py b/vllm/v1/core/encoder_cache_manager.py
index 0cd8c806a3e47..9d570b334c6cf 100644
--- a/vllm/v1/core/encoder_cache_manager.py
+++ b/vllm/v1/core/encoder_cache_manager.py
@@ -38,7 +38,8 @@ def allocate(self, request: Request, input_id: int) -> None:
     def get_cached_input_ids(self, request: Request) -> Set[int]:
         return self.cached.get(request.request_id, set())
 
-    def free(self, request: Request, input_id: int) -> None:
+    def free_encoder_input(self, request: Request, input_id: int) -> None:
+        """Free a single encoder input id for the request."""
         req_id = request.request_id
         if req_id not in self.cached:
             return
@@ -49,6 +50,12 @@ def free(self, request: Request, input_id: int) -> None:
         self.num_free_slots += request.get_num_encoder_tokens(input_id)
         self.freed.append((req_id, input_id))
 
+    def free(self, request: Request) -> None:
+        """Free all cached input ids for the request."""
+        input_ids = self.get_cached_input_ids(request)
+        for input_id in input_ids:
+            self.free_encoder_input(request, input_id)
+
     def get_freed_ids(self) -> List[Tuple[str, int]]:
         freed = self.freed
         self.freed = []
diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py
index 7a88cc9433b32..da2e31b1fb75b 100644
--- a/vllm/v1/core/scheduler.py
+++ b/vllm/v1/core/scheduler.py
@@ -202,7 +202,7 @@ def schedule(self) -> "SchedulerOutput":
                 # which have output tokens.
                 num_new_tokens = request.num_tokens - num_computed_tokens
                 if num_new_tokens == 0:
-                    # The happens when prompt length is divisible by the block
+                    # This happens when prompt length is divisible by the block
                     # size and all blocks are cached. Now we force to recompute
                     # the last block. Note that we have to re-compute an entire
                     # block because allocate_slots() assumes num_computed_tokens
@@ -269,6 +269,7 @@ def schedule(self) -> "SchedulerOutput":
 
         # Get the longest common prefix among all requests in the running queue.
         # This can be potentially used for cascade attention.
+        num_common_prefix_blocks = 0
         if self.running:
             any_request = self.running[0]
             num_common_prefix_blocks = (
@@ -433,7 +434,8 @@ def update_from_output(
                     if start_pos + num_tokens <= request.num_computed_tokens:
                         # The encoder output is already processed and stored
                         # in the decoder's KV cache.
-                        self.encoder_cache_manager.free(request, input_id)
+                        self.encoder_cache_manager.free_encoder_input(
+                            request, input_id)
 
             if request.num_computed_tokens == request.num_tokens:
                 req_index = model_runner_output.req_id_to_index[req_id]
@@ -445,8 +447,10 @@ def update_from_output(
                 # TODO: Update the KV cache manager for prefix caching.
 
                 # Check for stop and update request state.
-                # This must be called before me make the EngineCoreOutput.
+                # This must be called before we make the EngineCoreOutput.
                 stopped = self._check_stop(request)
+                if stopped:
+                    self._free_request(request)
 
                 # Add EngineCoreOutput for this Request.
                 output = EngineCoreOutput(
@@ -472,7 +476,6 @@ def _check_stop(self, request: Request) -> bool:
         if (request.num_tokens >= self.max_model_len
                 or request.num_output_tokens >= request.max_tokens):
             request.status = RequestStatus.FINISHED_LENGTH_CAPPED
-            self._free_request(request)
             return True
 
         sampling_params = request.sampling_params
@@ -480,13 +483,11 @@ def _check_stop(self, request: Request) -> bool:
         if (not sampling_params.ignore_eos
                 and last_token_id == request.eos_token_id):
             request.status = RequestStatus.FINISHED_STOPPED
-            self._free_request(request)
             return True
 
         if last_token_id in (sampling_params.stop_token_ids or ()):
             request.status = RequestStatus.FINISHED_STOPPED
             request.stop_reason = last_token_id
-            self._free_request(request)
             return True
         return False
 
@@ -525,6 +526,7 @@ def finish_requests(
     def _free_request(self, request: Request) -> None:
         assert request.is_finished()
         self.kv_cache_manager.free(request)
+        self.encoder_cache_manager.free(request)
         self.running_reqs_data.pop(request.request_id, None)
         del self.requests[request.request_id]
         self.finished_req_ids.add(request.request_id)

From 1c1bb0bbf20955d346f66bb25d349c1bd9fe6ea2 Mon Sep 17 00:00:00 2001
From: Divakar Verma <137818590+divakar-amd@users.noreply.github.com>
Date: Wed, 29 Jan 2025 18:47:30 -0600
Subject: [PATCH 432/462] [Misc][MoE] add Deepseek-V3 moe tuning support
 (#12558)

Signed-off-by: Divakar Verma <divakar.verma@amd.com>
---
 benchmarks/kernels/benchmark_moe.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py
index 1fa0da75c79d2..5c8bf33afebc8 100644
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -450,7 +450,8 @@ def save_configs(configs: Dict[int, BenchmarkConfig], num_experts: int,
 def main(args: argparse.Namespace):
     print(args)
 
-    config = AutoConfig.from_pretrained(args.model)
+    config = AutoConfig.from_pretrained(
+        args.model, trust_remote_code=args.trust_remote_code)
     if config.architectures[0] == "DbrxForCausalLM":
         E = config.ffn_config.moe_num_experts
         topk = config.ffn_config.moe_top_k
@@ -461,6 +462,11 @@ def main(args: argparse.Namespace):
         topk = config.num_experts_per_tok
         intermediate_size = config.intermediate_size
         shard_intermediate_size = 2 * intermediate_size // args.tp_size
+    elif config.architectures[0] == "DeepseekV3ForCausalLM":
+        E = config.n_routed_experts
+        topk = config.num_experts_per_tok
+        intermediate_size = config.moe_intermediate_size
+        shard_intermediate_size = 2 * intermediate_size // args.tp_size
     else:
         # Default: Mixtral.
         E = config.num_local_experts
@@ -538,6 +544,7 @@ def _distribute(method: str, inputs: List[Any]) -> List[Any]:
     parser.add_argument("--seed", type=int, default=0)
     parser.add_argument("--batch-size", type=int, required=False)
     parser.add_argument("--tune", action="store_true")
+    parser.add_argument("--trust-remote-code", action="store_true")
     args = parser.parse_args()
 
     main(args)

From f17f1d46086692a2973fad94860a95799fbd8582 Mon Sep 17 00:00:00 2001
From: Mark McLoughlin <markmc@redhat.com>
Date: Thu, 30 Jan 2025 02:31:01 +0000
Subject: [PATCH 433/462] [V1][Metrics] Add GPU cache usage % gauge (#12561)

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
---
 tests/entrypoints/openai/test_metrics.py |  1 +
 vllm/v1/core/kv_cache_manager.py         |  5 +++++
 vllm/v1/core/scheduler.py                |  1 +
 vllm/v1/metrics/loggers.py               | 11 ++++++++++-
 vllm/v1/metrics/stats.py                 |  2 +-
 5 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py
index 901ba8e8e5ef3..941f465711ef1 100644
--- a/tests/entrypoints/openai/test_metrics.py
+++ b/tests/entrypoints/openai/test_metrics.py
@@ -200,6 +200,7 @@ async def test_metrics_counts(server: RemoteOpenAIServer,
 EXPECTED_METRICS_V1 = [
     "vllm:num_requests_running",
     "vllm:num_requests_waiting",
+    "vllm:gpu_cache_usage_perc",
     "vllm:prompt_tokens_total",
     "vllm:generation_tokens_total",
     "vllm:request_prompt_tokens_sum",
diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py
index 18fdfdfe4a010..d6c612f155f01 100644
--- a/vllm/v1/core/kv_cache_manager.py
+++ b/vllm/v1/core/kv_cache_manager.py
@@ -69,6 +69,11 @@ def __init__(
         # is finished.
         self.req_to_blocks: Dict[str, List[KVCacheBlock]] = {}
 
+    @property
+    def usage(self) -> float:
+        return 1.0 - (self.free_block_queue.num_free_blocks /
+                      self.num_gpu_blocks)
+
     def get_computed_blocks(
             self, request: Request) -> Tuple[List[KVCacheBlock], int]:
         """Get the computed (cached) blocks for the request.
diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py
index da2e31b1fb75b..910fc4ff4d2b6 100644
--- a/vllm/v1/core/scheduler.py
+++ b/vllm/v1/core/scheduler.py
@@ -544,6 +544,7 @@ def make_stats(self) -> SchedulerStats:
         return SchedulerStats(
             num_running_reqs=len(self.running),
             num_waiting_reqs=len(self.waiting),
+            gpu_cache_usage=self.kv_cache_manager.usage,
         )
 
 
diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py
index 9bb24d1948651..f901822c7887c 100644
--- a/vllm/v1/metrics/loggers.py
+++ b/vllm/v1/metrics/loggers.py
@@ -69,11 +69,13 @@ def log(self, scheduler_stats: SchedulerStats,
         logger.info(
             "Avg prompt throughput: %.1f tokens/s, "
             "Avg generation throughput: %.1f tokens/s, "
-            "Running: %d reqs, Waiting: %d reqs ",
+            "Running: %d reqs, Waiting: %d reqs "
+            "GPU KV cache usage: %.1f%%.",
             prompt_throughput,
             generation_throughput,
             scheduler_stats.num_running_reqs,
             scheduler_stats.num_waiting_reqs,
+            scheduler_stats.gpu_cache_usage * 100,
         )
 
 
@@ -97,6 +99,11 @@ def __init__(self, model_config: ModelConfig):
             documentation="Number of requests waiting to be processed.",
             labelnames=labelnames).labels(*labelvalues)
 
+        self.gauge_gpu_cache_usage = prometheus_client.Gauge(
+            name="vllm:gpu_cache_usage_perc",
+            documentation="GPU KV-cache usage. 1 means 100 percent usage.",
+            labelnames=labelnames).labels(*labelvalues)
+
         self.counter_prompt_tokens = prometheus_client.Counter(
             name="vllm:prompt_tokens_total",
             documentation="Number of prefill tokens processed.",
@@ -147,6 +154,8 @@ def log(self, scheduler_stats: SchedulerStats,
         self.gauge_scheduler_running.set(scheduler_stats.num_running_reqs)
         self.gauge_scheduler_waiting.set(scheduler_stats.num_waiting_reqs)
 
+        self.gauge_gpu_cache_usage.set(scheduler_stats.gpu_cache_usage)
+
         self.counter_prompt_tokens.inc(iteration_stats.num_prompt_tokens)
         self.counter_generation_tokens.inc(
             iteration_stats.num_generation_tokens)
diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py
index f4c276f0b6902..5277505128a63 100644
--- a/vllm/v1/metrics/stats.py
+++ b/vllm/v1/metrics/stats.py
@@ -14,7 +14,7 @@ class SchedulerStats:
     num_running_reqs: int = 0
     num_waiting_reqs: int = 0
 
-    # gpu_cache_usage: float = 0.0
+    gpu_cache_usage: float = 0.0
     # gpu_prefix_cache_hit_rate: float = 0.0
 
 
From a2769032ca78108e58abc45e2eb0ade8b47a6515 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Thu, 30 Jan 2025 08:05:42 +0000
Subject: [PATCH 434/462] Set `?device={device}` when changing tab in
 installation guides (#12560)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 docs/source/_static/custom.js | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/docs/source/_static/custom.js b/docs/source/_static/custom.js
index 18b502c786e1d..be0b2a388e404 100644
--- a/docs/source/_static/custom.js
+++ b/docs/source/_static/custom.js
@@ -1,3 +1,4 @@
+// Add RunLLM widget
 document.addEventListener("DOMContentLoaded", function () {
     var script = document.createElement("script");
     script.type = "module";
@@ -15,4 +16,23 @@ document.addEventListener("DOMContentLoaded", function () {
   
     script.async = true;
     document.head.appendChild(script);
-  });
\ No newline at end of file
+  });
+
+// Update URL search params when tab is clicked
+  document.addEventListener("DOMContentLoaded", function () {
+    const tabs = document.querySelectorAll(".sd-tab-label");
+
+    function updateURL(tab) {
+      const syncGroup = tab.getAttribute("data-sync-group");
+      const syncId = tab.getAttribute("data-sync-id");
+      if (syncGroup && syncId) {
+          const url = new URL(window.location);
+          url.searchParams.set(syncGroup, syncId);
+          window.history.replaceState(null, "", url);
+      }
+    }
+
+    tabs.forEach(tab => {
+        tab.addEventListener("click", () => updateURL(tab));
+    });
+});

From 41bf5612f590dd13fa5e5dec083849ab6cde2f70 Mon Sep 17 00:00:00 2001
From: Beim <805908499@qq.com>
Date: Fri, 31 Jan 2025 04:39:22 +1300
Subject: [PATCH 435/462] [Misc] fix typo: add missing space in lora adapter
 error message (#12564)

Signed-off-by: Beim <beim2015@outlook.com>
---
 vllm/entrypoints/openai/serving_models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/entrypoints/openai/serving_models.py b/vllm/entrypoints/openai/serving_models.py
index fc422f0917bd5..22e74b387cd73 100644
--- a/vllm/entrypoints/openai/serving_models.py
+++ b/vllm/entrypoints/openai/serving_models.py
@@ -203,7 +203,7 @@ async def _check_load_lora_adapter_request(
                for lora_request in self.lora_requests):
             return create_error_response(
                 message=
-                f"The lora adapter '{request.lora_name}' has already been"
+                f"The lora adapter '{request.lora_name}' has already been "
                 "loaded.",
                 err_type="InvalidUserInput",
                 status_code=HTTPStatus.BAD_REQUEST)

From 9b0c4bab36c8f355f562d58521650ee8d5b6095d Mon Sep 17 00:00:00 2001
From: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
Date: Thu, 30 Jan 2025 14:53:22 -0500
Subject: [PATCH 436/462] [Kernel] Triton Configs for Fp8 Block Quantization
 (#11589)

Signed-off-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com>
Signed-off-by: mgoin <michael@neuralmagic.com>
Co-authored-by: mgoin <michael@neuralmagic.com>
Co-authored-by: simon-mo <xmo@berkeley.edu>
---
 setup.py                                      |   6 +-
 ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++
 ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++
 .../layers/fused_moe/fused_moe.py             |  91 ++++++++---
 ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++
 ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++
 ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++
 ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++
 ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++
 ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++
 ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++
 ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++
 ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++
 ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++
 ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++
 ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++
 ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++
 ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++
 ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++
 ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++
 ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++
 ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++
 ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++
 ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++
 ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++
 ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++
 ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++
 ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++
 ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++
 ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++
 ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++
 ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++
 ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++
 ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++
 ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++
 ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++
 ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++
 ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++
 ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++
 ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++
 ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++
 ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++
 .../layers/quantization/utils/fp8_utils.py    |  77 +++++++--
 43 files changed, 5972 insertions(+), 42 deletions(-)
 create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
 create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
 create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
 create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
 create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
 create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
 create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
 create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
 create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
 create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
 create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
 create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
 create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
 create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
 create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
 create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
 create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
 create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
 create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
 create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
 create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
 create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
 create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
 create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
 create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
 create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
 create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
 create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
 create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
 create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
 create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
 create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
 create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
 create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
 create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
 create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
 create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
 create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
 create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
 create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json

diff --git a/setup.py b/setup.py
index 59ece870b5585..50a2392a4d83b 100755
--- a/setup.py
+++ b/setup.py
@@ -608,7 +608,11 @@ def _read_requirements(filename: str) -> List[str]:
     ext_modules.append(CMakeExtension(name="vllm._C"))
 
 package_data = {
-    "vllm": ["py.typed", "model_executor/layers/fused_moe/configs/*.json"]
+    "vllm": [
+        "py.typed",
+        "model_executor/layers/fused_moe/configs/*.json",
+        "model_executor/layers/quantization/utils/configs/*.json",
+    ]
 }
 
 if _no_device():
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000..2e692a1583a4a
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000..6fcf408755f5d
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index dbb6c2ce4649e..39607dc4ca11e 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -598,15 +598,27 @@ def invoke_fused_moe_kernel(A: torch.Tensor,
         )
 
 
-def get_config_file_name(E: int, N: int, dtype: Optional[str]) -> str:
+# Adapted from: https://github.com/sgl-project/sglang/pull/2628
+def get_config_file_name(E: int,
+                         N: int,
+                         dtype: Optional[str],
+                         block_shape: Optional[List[int]] = None) -> str:
     device_name = current_platform.get_device_name().replace(" ", "_")
     dtype_selector = "" if not dtype else f",dtype={dtype}"
-    return f"E={E},N={N},device_name={device_name}{dtype_selector}.json"
+    block_shape_selector = ("" if not block_shape or not all(block_shape) else
+                            f",block_shape={block_shape}")
+    return f"E={E},N={N},device_name={device_name}{dtype_selector}{block_shape_selector}.json"  # noqa: E501
 
 
+# Adapted from: https://github.com/sgl-project/sglang/pull/2628
 @functools.lru_cache
-def get_moe_configs(E: int, N: int,
-                    dtype: Optional[str]) -> Optional[Dict[int, Any]]:
+def get_moe_configs(
+    E: int,
+    N: int,
+    dtype: Optional[str],
+    block_n: Optional[int] = None,
+    block_k: Optional[int] = None,
+) -> Optional[Dict[int, Any]]:
     """
     Return optimized configurations for the fused MoE kernel.
 
@@ -618,7 +630,8 @@ def get_moe_configs(E: int, N: int,
 
     # First look up if an optimized configuration is available in the configs
     # directory
-    json_file_name = get_config_file_name(E, N, dtype)
+    block_shape = [block_n, block_k] if block_n and block_k else None
+    json_file_name = get_config_file_name(E, N, dtype, block_shape)
 
     config_file_path = os.path.join(
         os.path.dirname(os.path.realpath(__file__)), "configs", json_file_name)
@@ -645,21 +658,53 @@ def get_default_config(
     topk: int,
     dtype: Optional[str],
     is_marlin: bool,
+    block_shape: Optional[List[int]] = None,
 ) -> Dict[str, int]:
-    config = {
-        'BLOCK_SIZE_M': 64,
-        'BLOCK_SIZE_N': 64,
-        'BLOCK_SIZE_K': 32,
-        'GROUP_SIZE_M': 8
-    }
-    # A heuristic: fused marlin works faster with this config for small M
-    if M <= E or (is_marlin and M <= 32):
+    if dtype == "fp8_w8a8":
+        if block_shape is None:
+            config = {
+                "BLOCK_SIZE_M": 128,
+                "BLOCK_SIZE_N": 256,
+                "BLOCK_SIZE_K": 128,
+                "GROUP_SIZE_M": 32,
+                "num_warps": 8,
+                "num_stages": 4,
+            }
+            if M <= E:
+                config = {
+                    "BLOCK_SIZE_M": 64,
+                    "BLOCK_SIZE_N": 128,
+                    "BLOCK_SIZE_K": 128,
+                    "GROUP_SIZE_M": 1,
+                    "num_warps": 4,
+                    "num_stages": 4,
+                }
+        else:
+            # Block-wise quant: BLOCK_SIZE_N must be divisible by block_shape[0]
+            # BLOCK_SIZE_K must be divisible by block_shape[1]
+            config = {
+                "BLOCK_SIZE_M": 64,
+                "BLOCK_SIZE_N": block_shape[0],
+                "BLOCK_SIZE_K": block_shape[1],
+                "GROUP_SIZE_M": 32,
+                "num_warps": 4,
+                "num_stages": 3,
+            }
+    else:
         config = {
-            'BLOCK_SIZE_M': 16,
-            'BLOCK_SIZE_N': 32,
-            'BLOCK_SIZE_K': 64,
-            'GROUP_SIZE_M': 1
+            "BLOCK_SIZE_M": 64,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 32,
+            "GROUP_SIZE_M": 8,
         }
+        # A heuristic: fused marlin works faster with this config for small M
+        if M <= E or (is_marlin and M <= 32):
+            config = {
+                "BLOCK_SIZE_M": 16,
+                "BLOCK_SIZE_N": 32,
+                "BLOCK_SIZE_K": 64,
+                "GROUP_SIZE_M": 1,
+            }
     return config
 
 
@@ -679,7 +724,9 @@ def try_get_optimal_moe_config(
     else:
         # First try to load optimal config from the file
         E, _, N = w2_shape
-        configs = get_moe_configs(E, N, dtype)
+        block_n = block_shape[0] if block_shape else 0
+        block_k = block_shape[1] if block_shape else 0
+        configs = get_moe_configs(E, N, dtype, block_n, block_k)
 
         if configs:
             # If an optimal configuration map has been found, look up the
@@ -688,13 +735,7 @@ def try_get_optimal_moe_config(
         else:
             # Else use the default config
             config = get_default_config(M, E, N, w1_shape[2], top_k, dtype,
-                                        is_marlin)
-    # NOTE: For block-wise quant,
-    # BLOCK_K must be divisible by block_shape[1]
-    # BLOCK_N and BLOCK_M has no requirements
-    if block_shape is not None and block_shape[0] != 0:
-        config["BLOCK_SIZE_N"] = block_shape[0]
-        config["BLOCK_SIZE_K"] = block_shape[1]
+                                        is_marlin, block_shape)
     return config
 
 
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000..6496a38fba8ae
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000..3618053b65831
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000..46a982f5ee9a4
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000..035ec027fa566
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000..8b49f2781cb54
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000..851bc9f9f0b50
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000..d1227c2157990
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000..1c61451fb34e5
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000..63e661c80de6a
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000..cf354037903c0
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    }
+}
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000..eccb86a76df0d
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000..88af48431d8b8
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000..dd069726d7ed4
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000..56b939e52fac3
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000..63d9a0bf5d79d
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000..7fa398c15a2a5
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000..f15d8f64c7090
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000..cd3e07804fdec
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000..9d5a329d7466a
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000..96e1594a3eabb
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000..5ffd367df833d
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000..eabc423949a24
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000..51e237b91b8e7
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000..6280219c9ee7d
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000..40c01c0b92b4b
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000..c6fd3659799bc
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000..160f12ed3f95a
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000..e5c4a1d2c94e5
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000..2bf5eb27e3820
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    }
+}
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000..0a1e14cffbb2a
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000..15b1c93f60fc5
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000..8ff12e64c172f
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000..4532f93681e2b
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000..ca7f32b9552b4
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000..5acea242cc0ad
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000..58cdd93e90b8c
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000..b72e0371d1421
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000..293adce387e06
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    }
+}
diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
index 43b1997019107..a7a3fa6601639 100644
--- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
@@ -1,12 +1,18 @@
 # Adapted from https://github.com/sgl-project/sglang/pull/2575
-from typing import List, Optional, Tuple
+import functools
+import json
+import os
+from typing import Any, Dict, List, Optional, Tuple
 
 import torch
 import triton
 import triton.language as tl
 
+from vllm.logger import init_logger
 from vllm.platforms import current_platform
 
+logger = init_logger(__name__)
+
 
 def apply_w8a8_block_fp8_linear(
     input: torch.Tensor,
@@ -277,6 +283,43 @@ def _w8a8_block_fp8_matmul(
     tl.store(c_ptrs, c, mask=c_mask)
 
 
+@functools.lru_cache
+def get_w8a8_block_fp8_configs(N: int, K: int, block_n: int,
+                               block_k: int) -> Optional[Dict[int, Any]]:
+    """
+    Return optimized configurations for the w8a8 block fp8 kernel.
+    The return value will be a dictionary that maps an irregular grid of
+    batch sizes to configurations of the w8a8 block fp8 kernel. To evaluate the
+    kernel on a given batch size bs, the closest batch size in the grid should
+    be picked and the associated configuration chosen to invoke the kernel.
+    """
+
+    # First look up if an optimized configuration is available in the configs
+    # directory
+    device_name = current_platform.get_device_name().replace(" ", "_")
+    json_file_name = f"N={N},K={K},device_name={device_name},dtype=fp8_w8a8,block_shape=[{block_n}, {block_k}].json"  # noqa: E501
+
+    config_file_path = os.path.join(
+        os.path.dirname(os.path.realpath(__file__)), "configs", json_file_name)
+    if os.path.exists(config_file_path):
+        with open(config_file_path) as f:
+            logger.info(
+                "Using configuration from %s for W8A8 Block FP8 kernel.",
+                config_file_path,
+            )
+            # If a configuration has been found, return it
+            return {int(key): val for key, val in json.load(f).items()}
+
+    # If no optimized configuration is available, we will use the default
+    # configuration
+    logger.warning(
+        "Using default W8A8 Block FP8 kernel config. Performance might "
+        "be sub-optimal! Config file not found at %s",
+        config_file_path,
+    )
+    return None
+
+
 def w8a8_block_fp8_matmul(
     A: torch.Tensor,
     B: torch.Tensor,
@@ -316,17 +359,22 @@ def w8a8_block_fp8_matmul(
     C_shape = A.shape[:-1] + (N, )
     C = A.new_empty(C_shape, dtype=output_dtype)
 
-    # TODO:
-    # BLOCK_SIZE_M, BLOCK_SIZE_K, BLOCK_SIZE_N can be optimized.
-    # BLOCK_SIZE_K must be divisible by block_k
-    # BLOCK_SIZE_N and BLOCK_SIZE_M has no requirements
-    BLOCK_SIZE_M = 128
-    if M < BLOCK_SIZE_M:
-        BLOCK_SIZE_M = triton.next_power_of_2(M)
-        BLOCK_SIZE_M = max(BLOCK_SIZE_M, 16)
-    BLOCK_SIZE_K = block_k
-    assert block_k % BLOCK_SIZE_K == 0
-    BLOCK_SIZE_N = block_n
+    configs = get_w8a8_block_fp8_configs(N, K, block_size[0], block_size[1])
+    if configs:
+        # Get the optimal config if there is one
+        config = configs[min(configs.keys(), key=lambda x: abs(x - M))]
+    else:
+        # Default config
+        # Block-wise quant: BLOCK_SIZE_N must be divisible by block_size[0]
+        # BLOCK_SIZE_K must be divisible by block_size[1]
+        config = {
+            "BLOCK_SIZE_M": 64,
+            "BLOCK_SIZE_N": block_size[0],
+            "BLOCK_SIZE_K": block_size[1],
+            "GROUP_SIZE_M": 32,
+            "num_warps": 4,
+            "num_stages": 2,
+        }
 
     def grid(META):
         return (triton.cdiv(M, META["BLOCK_SIZE_M"]) *
@@ -353,10 +401,7 @@ def grid(META):
         As.stride(-1),
         Bs.stride(1),
         Bs.stride(0),
-        BLOCK_SIZE_M=BLOCK_SIZE_M,
-        BLOCK_SIZE_N=BLOCK_SIZE_N,
-        BLOCK_SIZE_K=BLOCK_SIZE_K,
-        GROUP_SIZE_M=8,
+        **config,
     )
 
     return C

From bd2107e30a258a5bcaa94e678a3890ec083a60a0 Mon Sep 17 00:00:00 2001
From: Nishidha <nishidha.panpaliya@partner.ibm.com>
Date: Fri, 31 Jan 2025 02:59:39 +0530
Subject: [PATCH 437/462] [CPU][PPC] Updated torch, torchvision, torchaudio
 dependencies (#12555)

Signed-off-by: npanpaliya <nishidha.panpaliya@partner.ibm.com>
---
 Dockerfile.ppc64le   |  5 ++---
 requirements-cpu.txt | 12 +++++++++---
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/Dockerfile.ppc64le b/Dockerfile.ppc64le
index d3cd1c7b313bc..c4c1f3e357972 100644
--- a/Dockerfile.ppc64le
+++ b/Dockerfile.ppc64le
@@ -4,12 +4,12 @@ USER root
 
 ENV PATH="/usr/local/cargo/bin:$PATH:/opt/conda/bin/"
 
-RUN apt-get update -y && apt-get install -y git wget curl vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential ffmpeg libsm6 libxext6 libgl1 libssl-dev 
+RUN apt-get update -y && apt-get install -y git wget kmod curl vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential ffmpeg libsm6 libxext6 libgl1 libssl-dev 
 
 # Some packages in requirements-cpu are installed here
 # IBM provides optimized packages for ppc64le processors in the open-ce project for mamba
 # Currently these may not be available for venv or pip directly
-RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 torchvision-cpu=0.16.2 rust && micromamba clean --all --yes
+RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 rust && micromamba clean --all --yes
 
 COPY ./ /workspace/vllm
 
@@ -21,7 +21,6 @@ RUN --mount=type=bind,source=.git,target=.git \
 RUN --mount=type=cache,target=/root/.cache/pip  \
     RUSTFLAGS='-L /opt/conda/lib' pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \
         'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
-        torch==2.3.1 \
         -r requirements-cpu.txt \
         xformers uvloop==0.20.0
 
diff --git a/requirements-cpu.txt b/requirements-cpu.txt
index ed0d2c9fae0b6..ecfa822e01186 100644
--- a/requirements-cpu.txt
+++ b/requirements-cpu.txt
@@ -3,7 +3,13 @@
 
 # Dependencies for CPUs
 torch==2.5.1+cpu; platform_machine != "ppc64le" and platform_machine != "aarch64" and platform_system != "Darwin"
-torch==2.5.1; platform_machine == "aarch64" or platform_system == "Darwin" 
-torchaudio; platform_machine != "ppc64le"  # required for the image processor of minicpm-o-2_6, this must be updated alongside torch
-torchvision; platform_machine != "ppc64le"  # required for the image processor of phi3v, this must be updated alongside torch
+torch==2.5.1; platform_machine == "ppc64le" or platform_machine == "aarch64" or platform_system == "Darwin" 
+
+# required for the image processor of minicpm-o-2_6, this must be updated alongside torch
+torchaudio; platform_machine != "ppc64le"
+torchaudio==2.5.1; platform_machine == "ppc64le"
+
+# required for the image processor of phi3v, this must be updated alongside torch
+torchvision; platform_machine != "ppc64le"
+torchvision==0.20.1; platform_machine == "ppc64le"
 datasets # for benchmark scripts

From 4078052f09f42f898b542e18d60d15a43db67a8b Mon Sep 17 00:00:00 2001
From: Michael Goin <michael@neuralmagic.com>
Date: Thu, 30 Jan 2025 18:07:19 -0500
Subject: [PATCH 438/462] [V1][Log] Add max request concurrency log to V1
 (#12569)

Signed-off-by: mgoin <michael@neuralmagic.com>
---
 vllm/v1/core/kv_cache_utils.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py
index bab99fe37caee..dbdda51aedaa0 100644
--- a/vllm/v1/core/kv_cache_utils.py
+++ b/vllm/v1/core/kv_cache_utils.py
@@ -393,6 +393,10 @@ def _get_kv_cache_config_uniform_type(vllm_config: VllmConfig,
         num_blocks = num_gpu_blocks_override
 
     logger.info("# GPU blocks: %d", num_blocks)
+    max_concurrency = (num_blocks * vllm_config.cache_config.block_size /
+                       vllm_config.model_config.max_model_len)
+    logger.info("Maximum concurrency for %s tokens per request: %.2fx",
+                vllm_config.model_config.max_model_len, max_concurrency)
 
     per_layer_size = page_size * num_blocks
 

From 9798b2fb0052092a6420172e41c0c8a307eedfa6 Mon Sep 17 00:00:00 2001
From: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Date: Thu, 30 Jan 2025 21:33:00 -0500
Subject: [PATCH 439/462] [Kernel] Update `cutlass_scaled_mm` to support 2d
 group (blockwise) scaling (#11868)

---
 CMakeLists.txt                                |   9 +-
 .../cutlass_benchmarks/w8a8_benchmarks.py     | 290 ++++---
 csrc/core/math.hpp                            |   9 +-
 csrc/cutlass_extensions/common.hpp            |  17 +
 .../gemm/collective/collective_builder.hpp    | 123 +++
 .../gemm/collective/fp8_accumulation.hpp      | 183 +++++
 ..._warpspecialized_fp8_blockwise_scaling.hpp | 730 ++++++++++++++++++
 .../gemm/dispatch_policy.hpp                  |  39 +
 .../vllm_collective_builder.cuh               |   2 +-
 .../cutlass_w8a8/c3x/cutlass_gemm_caller.cuh  |  93 +++
 .../{scaled_mm_c3x.cuh => c3x/scaled_mm.cuh}  |  74 --
 .../c3x/scaled_mm_azp_sm90_int8.cu            |  24 +
 .../c3x/scaled_mm_blockwise_sm90_fp8.cu       |  24 +
 .../scaled_mm_blockwise_sm90_fp8_dispatch.cuh | 168 ++++
 .../cutlass_w8a8/c3x/scaled_mm_kernels.hpp    |  33 +
 .../cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu    |  24 +
 .../scaled_mm_sm90_fp8_dispatch.cuh}          |  26 +-
 .../cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu   |  24 +
 .../scaled_mm_sm90_int8_dispatch.cuh}         |  25 +-
 .../cutlass_w8a8/scaled_mm_c3x.cu             | 104 ++-
 .../cutlass_w8a8/scaled_mm_entry.cu           |   3 -
 .../quantization/machete/machete_mainloop.cuh |   4 +
 tests/kernels/test_cutlass.py                 | 188 +++--
 tests/kernels/utils.py                        |  32 +-
 vllm/_custom_ops.py                           |  22 +
 25 files changed, 1924 insertions(+), 346 deletions(-)
 create mode 100644 csrc/cutlass_extensions/gemm/collective/collective_builder.hpp
 create mode 100644 csrc/cutlass_extensions/gemm/collective/fp8_accumulation.hpp
 create mode 100644 csrc/cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp
 create mode 100644 csrc/cutlass_extensions/gemm/dispatch_policy.hpp
 create mode 100644 csrc/quantization/cutlass_w8a8/c3x/cutlass_gemm_caller.cuh
 rename csrc/quantization/cutlass_w8a8/{scaled_mm_c3x.cuh => c3x/scaled_mm.cuh} (51%)
 create mode 100644 csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu
 create mode 100644 csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu
 create mode 100644 csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh
 create mode 100644 csrc/quantization/cutlass_w8a8/c3x/scaled_mm_kernels.hpp
 create mode 100644 csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu
 rename csrc/quantization/cutlass_w8a8/{scaled_mm_c3x_sm90_fp8_dispatch.cuh => c3x/scaled_mm_sm90_fp8_dispatch.cuh} (76%)
 create mode 100644 csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu
 rename csrc/quantization/cutlass_w8a8/{scaled_mm_c3x_sm90_int8_dispatch.cuh => c3x/scaled_mm_sm90_int8_dispatch.cuh} (84%)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6c946fc5aa3ac..c823c9ff895c3 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -245,7 +245,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     FetchContent_Declare(
         cutlass
         GIT_REPOSITORY https://github.com/nvidia/cutlass.git
-        GIT_TAG v3.6.0
+        GIT_TAG v3.7.0
         GIT_PROGRESS TRUE
 
         # Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
@@ -299,7 +299,12 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # CUDA 12.0 or later (and only work on Hopper, 9.0a for now).
   cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0a" "${CUDA_ARCHS}")
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
-    set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu")
+    set(SRCS 
+       "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu"
+       "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu"
+       "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu"
+       "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu"
+       "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu")
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
       CUDA_ARCHS "${SCALED_MM_3X_ARCHS}")
diff --git a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
index d0353bc8cb42a..b87496ca3b2b4 100644
--- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
+++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
@@ -3,7 +3,7 @@
 import itertools
 import pickle as pkl
 import time
-from typing import Callable, Iterable, List, Tuple
+from typing import Callable, Iterable, List, Optional, Tuple
 
 import torch
 import torch.utils.benchmark as TBenchmark
@@ -12,6 +12,8 @@
 from weight_shapes import WEIGHT_SHAPES
 
 from vllm import _custom_ops as ops
+from vllm.model_executor.layers.quantization.utils.fp8_utils import (
+    w8a8_block_fp8_matmul)
 from vllm.utils import FlexibleArgumentParser
 
 DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
@@ -38,8 +40,15 @@ def bench_fn(label: str, sub_label: str, description: str, fn: Callable, *args,
     ).blocked_autorange(min_run_time=min_run_time)
 
 
-def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
-               sub_label: str) -> Iterable[TMeasurement]:
+def bench_int8(
+        dtype: torch.dtype,
+        m: int,
+        k: int,
+        n: int,
+        label: str,
+        sub_label: str,
+        bench_kernels: Optional[List[str]] = None) -> Iterable[TMeasurement]:
+    """Benchmark INT8-based kernels."""
     assert dtype == torch.int8
     a, b = make_rand_tensors(torch.int8, m, n, k)
     scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
@@ -48,155 +57,132 @@ def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
     azp = torch.zeros((m, ), device="cuda", dtype=torch.int32)
     azp_adj = torch.zeros((n, ), device="cuda", dtype=torch.int32)
 
+    bench_fns = {
+        "pytorch_bf16_bf16_bf16_matmul-no-scales":
+        lambda: torch.mm(a.to(dtype=torch.bfloat16), b.to(dtype=torch.bfloat16)
+                         ),
+        "pytorch_fp16_fp16_fp16_matmul-no-scales":
+        lambda: torch.mm(a.to(dtype=torch.float16), b.to(dtype=torch.float16)),
+        "cutlass_i8_i8_bf16_scaled_mm":
+        lambda: ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16),
+        "cutlass_i8_i8_bf16_scaled_mm_bias":
+        lambda: ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16,
+                                      bias),
+        "cutlass_i8_i8_bf16_scaled_mm_azp":
+        lambda: ops.cutlass_scaled_mm_azp(a, b, scale_a, scale_b, torch.
+                                          bfloat16, azp_adj),
+        "cutlass_i8_i8_bf16_scaled_mm_azp_bias":
+        lambda: ops.cutlass_scaled_mm_azp(a, b, scale_a, scale_b, torch.
+                                          bfloat16, azp_adj, None, bias),
+        "cutlass_i8_i8_bf16_scaled_mm_azp_pt":
+        lambda: ops.cutlass_scaled_mm_azp(a, b, scale_a, scale_b, torch.
+                                          bfloat16, azp_adj, azp),
+        "cutlass_i8_i8_bf16_scaled_mm_azp_pt_bias":
+        lambda: ops.cutlass_scaled_mm_azp(a, b, scale_a, scale_b, torch.
+                                          bfloat16, azp_adj, azp, bias),
+    }
+
     timers = []
-    # pytorch impl - bfloat16
-    timers.append(
-        bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales",
-                 torch.mm, a.to(dtype=torch.bfloat16),
-                 b.to(dtype=torch.bfloat16)))
-
-    # pytorch impl - float16
-    timers.append(
-        bench_fn(label, sub_label,
-                 "pytorch_fp16_fp16_fp16_matmul-no-scales", torch.mm,
-                 a.to(dtype=torch.float16), b.to(dtype=torch.float16)))
-
-    # cutlass impl
-    timers.append(
-        bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm",
-                 ops.cutlass_scaled_mm, a, b, scale_a, scale_b,
-                 torch.bfloat16))
-
-    # cutlass with bias
-    timers.append(
-        bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_bias",
-                 ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.bfloat16,
-                 bias))
-
-    # cutlass with azp per-tensor
-    timers.append(
-        bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp",
-                 ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b,
-                 torch.bfloat16, azp_adj))
-
-    # cutlass with azp per-tensor + bias
-    timers.append(
-        bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp_bias",
-                 ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b,
-                 torch.bfloat16, azp_adj, None, bias))
-
-    # cutlass with azp per-token
-    timers.append(
-        bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp_pt",
-                 ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b,
-                 torch.bfloat16, azp_adj, azp))
-
-    # cutlass with azp per-token + bias
-    timers.append(
-        bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp_pt_bias",
-                 ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b,
-                 torch.bfloat16, azp_adj, azp, bias))
+    for name, fn in bench_fns.items():
+        # If bench_kernels is None, run all. Otherwise, run only exact matches.
+        if bench_kernels is None or name in bench_kernels:
+            print(f"Running {name}")
+            timers.append(bench_fn(label, sub_label, name, fn))
 
     return timers
 
 
-def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
-              sub_label: str) -> Iterable[TMeasurement]:
+def bench_fp8(
+        dtype: torch.dtype,
+        m: int,
+        k: int,
+        n: int,
+        label: str,
+        sub_label: str,
+        bench_kernels: Optional[List[str]] = None) -> Iterable[TMeasurement]:
+    """Benchmark FP8-based kernels."""
     assert dtype == torch.float8_e4m3fn
     a, b = make_rand_tensors(torch.float8_e4m3fn, m, n, k)
+    a_cont = a.contiguous()
     scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
     scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
+    block_scale_a = torch.rand((m, k // 128),
+                               device="cuda",
+                               dtype=torch.float32)
+    block_scale_b = torch.rand((k // 128, n // 128),
+                               device="cuda",
+                               dtype=torch.float32)
+    block_scale_a_M_major = block_scale_a.t().contiguous().t()
+    block_scale_b_K_major = block_scale_b.t().contiguous().t()
     bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16)
 
-    timers = []
+    print(m, k, n)
+
+    bench_fns = {
+        "pytorch_bf16_bf16_bf16_matmul-no-scales":
+        lambda: torch.mm(a.to(dtype=torch.bfloat16), b.to(dtype=torch.bfloat16)
+                         ),
+        "pytorch_fp16_fp16_fp16_matmul-no-scales":
+        lambda: torch.mm(a.to(dtype=torch.float16), b.to(dtype=torch.float16)),
+        "pytorch_fp8_fp8_fp16_scaled_mm":
+        lambda: torch._scaled_mm(
+            a, b, scale_a, scale_b, out_dtype=torch.float16),
+        "pytorch_fp8_fp8_fp16_scaled_mm_fast_accum":
+        lambda: torch._scaled_mm(a,
+                                 b,
+                                 scale_a,
+                                 scale_b,
+                                 out_dtype=torch.float16,
+                                 use_fast_accum=True),
+        "pytorch_fp8_fp8_bf16_scaled_mm":
+        lambda: torch._scaled_mm(
+            a, b, scale_a, scale_b, out_dtype=torch.bfloat16),
+        "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum":
+        lambda: torch._scaled_mm(a,
+                                 b,
+                                 scale_a,
+                                 scale_b,
+                                 out_dtype=torch.bfloat16,
+                                 use_fast_accum=True),
+        "cutlass_fp8_fp8_bf16_scaled_mm":
+        lambda: ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16),
+        "cutlass_fp8_fp8_fp16_scaled_mm":
+        lambda: ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.float16),
+        "cutlass_fp8_fp8_bf16_scaled_mm_bias":
+        lambda: ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16,
+                                      bias),
+        "cutlass_fp8_fp8_fp16_scaled_mm_bias":
+        lambda: ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.float16,
+                                      bias.to(dtype=torch.float16)),
+        "triton_fp8_fp8_fp16_scaled_mm_blockwise":
+        lambda: w8a8_block_fp8_matmul(a_cont, b.t(), block_scale_a,
+                                      block_scale_b.t(), (128, 128)),
+        "cutlass_fp8_fp8_fp16_scaled_mm_blockwise":
+        lambda: ops.cutlass_scaled_mm(a, b, block_scale_a_M_major,
+                                      block_scale_b_K_major, torch.float16),
+    }
 
-    # pytorch impl w. bf16
-    timers.append(
-        bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales",
-                 torch.mm, a.to(dtype=torch.bfloat16, device="cuda"),
-                 b.to(dtype=torch.bfloat16, device="cuda")))
-
-    # pytorch impl: bf16 output, without fp8 fast accum
-    timers.append(
-        bench_fn(label,
-                 sub_label,
-                 "pytorch_fp8_fp8_bf16_scaled_mm",
-                 torch._scaled_mm,
-                 a,
-                 b,
-                 scale_a=scale_a,
-                 scale_b=scale_b,
-                 out_dtype=torch.bfloat16))
-
-    # pytorch impl: bf16 output, with fp8 fast accum
-    timers.append(
-        bench_fn(label,
-                 sub_label,
-                 "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum",
-                 torch._scaled_mm,
-                 a,
-                 b,
-                 scale_a=scale_a,
-                 scale_b=scale_b,
-                 out_dtype=torch.bfloat16,
-                 use_fast_accum=True))
-
-    # pytorch impl: fp16 output, without fp8 fast accum
-    timers.append(
-        bench_fn(label,
-                 sub_label,
-                 "pytorch_fp8_fp8_fp16_scaled_mm",
-                 torch._scaled_mm,
-                 a,
-                 b,
-                 scale_a=scale_a,
-                 scale_b=scale_b,
-                 out_dtype=torch.float16))
-
-    # pytorch impl: fp16 output, with fp8 fast accum
-    timers.append(
-        bench_fn(label,
-                 sub_label,
-                 "pytorch_fp8_fp8_fp16_scaled_mm_fast_accum",
-                 torch._scaled_mm,
-                 a,
-                 b,
-                 scale_a=scale_a,
-                 scale_b=scale_b,
-                 out_dtype=torch.float16,
-                 use_fast_accum=True))
-
-    # cutlass impl: bf16 output
-    timers.append(
-        bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_scaled_mm",
-                 ops.cutlass_scaled_mm, a, b, scale_a, scale_b,
-                 torch.bfloat16))
-    # cutlass impl: fp16 output
-    timers.append(
-        bench_fn(label, sub_label, "cutlass_fp8_fp8_fp16_scaled_mm",
-                 ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.float16))
-
-    # cutlass impl: bf16 output, with bias
-    timers.append(
-        bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_scaled_mm_bias",
-                 ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.bfloat16,
-                 bias))
-
-    # cutlass impl: fp16 output, with bias
-    timers.append(
-        bench_fn(label, sub_label, "cutlass_fp8_fp8_fp16_scaled_mm_bias",
-                 ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.float16,
-                 bias.to(dtype=torch.float16)))
+    timers = []
+    for name, fn in bench_fns.items():
+        # If bench_kernels is None, run all. Otherwise, run only exact matches.
+        if bench_kernels is None or name in bench_kernels:
+            print(f"Running {name}")
+            timers.append(bench_fn(label, sub_label, name, fn))
 
     return timers
 
 
-def bench(dtype: torch.dtype, m: int, k: int, n: int, label: str,
-          sub_label: str) -> Iterable[TMeasurement]:
+def bench(dtype: torch.dtype,
+          m: int,
+          k: int,
+          n: int,
+          label: str,
+          sub_label: str,
+          bench_kernels: Optional[List[str]] = None) -> Iterable[TMeasurement]:
     if dtype == torch.int8:
-        return bench_int8(dtype, m, k, n, label, sub_label)
+        return bench_int8(dtype, m, k, n, label, sub_label, bench_kernels)
     if dtype == torch.float8_e4m3fn:
-        return bench_fp8(dtype, m, k, n, label, sub_label)
+        return bench_fp8(dtype, m, k, n, label, sub_label, bench_kernels)
     raise ValueError("unsupported type")
 
 
@@ -207,18 +193,22 @@ def print_timers(timers: Iterable[TMeasurement]):
 
 
 def run(dtype: torch.dtype,
-        MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]:
+        MKNs: Iterable[Tuple[int, int, int]],
+        bench_kernels: Optional[List[str]] = None) -> Iterable[TMeasurement]:
     results = []
     for m, k, n in MKNs:
-        timers = bench(dtype, m, k, n, f"scaled-{dtype}-gemm",
-                       f"MKN=({m}x{k}x{n})")
+        timers = bench(dtype,
+                       m,
+                       k,
+                       n,
+                       f"scaled-{dtype}-gemm",
+                       f"MKN=({m}x{k}x{n})",
+                       bench_kernels=bench_kernels)
         print_timers(timers)
         results.extend(timers)
-
     return results
 
 
-# output makers
 def make_output(data: Iterable[TMeasurement],
                 MKNs: Iterable[Tuple[int, int, int]],
                 base_description: str,
@@ -232,15 +222,11 @@ def make_output(data: Iterable[TMeasurement],
         pkl.dump(data, f)
 
 
-# argparse runners
-
-
 def run_square_bench(args):
     dim_sizes = list(
         range(args.dim_start, args.dim_end + 1, args.dim_increment))
     MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes))
-    data = run(args.dtype, MKNs)
-
+    data = run(args.dtype, MKNs, bench_kernels=args.kernels)
     make_output(data, MKNs, f"square_bench-{args.dtype}")
 
 
@@ -251,8 +237,7 @@ def run_range_bench(args):
     Ks = [args.k_constant] * n if args.k_constant is not None else dim_sizes
     Ns = [args.n_constant] * n if args.n_constant is not None else dim_sizes
     MKNs = list(zip(Ms, Ks, Ns))
-    data = run(args.dtype, MKNs)
-
+    data = run(args.dtype, MKNs, bench_kernels=args.kernels)
     make_output(data, MKNs, f"range_bench-{args.dtype}")
 
 
@@ -278,7 +263,7 @@ def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]:
             for k, n in KNs:
                 MKNs.append((m, k, n))
 
-        data = run(args.dtype, MKNs)
+        data = run(args.dtype, MKNs, bench_kernels=args.kernels)
         model_bench_data.append(data)
 
     # Print all results
@@ -328,6 +313,15 @@ def to_torch_dtype(dt):
                         type=to_torch_dtype,
                         required=True,
                         help="Available options are ['int8', 'fp8']")
+    parser.add_argument(
+        "--kernels",
+        nargs="+",
+        type=str,
+        default=None,
+        help=
+        "Exact names of the kernels to benchmark. If not set, runs all kernels."
+    )
+
     subparsers = parser.add_subparsers(dest="cmd")
 
     square_parser = subparsers.add_parser("square_bench")
@@ -362,4 +356,4 @@ def to_torch_dtype(dt):
     model_parser.set_defaults(func=run_model_bench)
 
     args = parser.parse_args()
-    args.func(args)
\ No newline at end of file
+    args.func(args)
diff --git a/csrc/core/math.hpp b/csrc/core/math.hpp
index ba9f40a230c8e..ddfaca27147b4 100644
--- a/csrc/core/math.hpp
+++ b/csrc/core/math.hpp
@@ -1,7 +1,14 @@
+#pragma once
+
 #include <climits>
 #include <iostream>
 
-inline uint32_t next_pow_2(uint32_t const num) {
+inline constexpr uint32_t next_pow_2(uint32_t const num) {
   if (num <= 1) return num;
   return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1));
+}
+
+template <typename T>
+inline constexpr std::enable_if_t<std::is_integral_v<T>, T> ceil_div(T a, T b) {
+  return (a + b - 1) / b;
 }
\ No newline at end of file
diff --git a/csrc/cutlass_extensions/common.hpp b/csrc/cutlass_extensions/common.hpp
index 07c9e46c27b06..febc4eccd9561 100644
--- a/csrc/cutlass_extensions/common.hpp
+++ b/csrc/cutlass_extensions/common.hpp
@@ -32,3 +32,20 @@ inline int get_cuda_max_shared_memory_per_block_opt_in(int const device) {
 }
 
 int32_t get_sm_version_num();
+
+/**
+ * A wrapper for a kernel that is used to guard against compilation on
+ * architectures that will never use the kernel. The purpose of this is to
+ * reduce the size of the compiled binary.
+ * __CUDA_ARCH__ is not defined in host code, so this lets us smuggle the ifdef
+ * into code that will be executed on the device where it is defined.
+ */
+template <typename Kernel>
+struct enable_sm90_or_later : Kernel {
+  template <typename... Args>
+  CUTLASS_DEVICE void operator()(Args&&... args) {
+#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 900
+    Kernel::operator()(std::forward<Args>(args)...);
+#endif
+  }
+};
\ No newline at end of file
diff --git a/csrc/cutlass_extensions/gemm/collective/collective_builder.hpp b/csrc/cutlass_extensions/gemm/collective/collective_builder.hpp
new file mode 100644
index 0000000000000..ec75c29e54f4d
--- /dev/null
+++ b/csrc/cutlass_extensions/gemm/collective/collective_builder.hpp
@@ -0,0 +1,123 @@
+// Modified from: cutlass/gemm/collective/builders/sm90_gmma_builder.inl
+// clang-format off
+#pragma once
+
+#include "cutlass/gemm/collective/builders/sm90_gmma_builder.inl"
+
+#include "cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp"
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::gemm::collective {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA_TMA_WS_SS (BlockScaled Builders)
+template <
+  class ElementA,
+  class GmemLayoutATag,
+  int AlignmentA,
+  class ElementB,
+  class GmemLayoutBTag,
+  int AlignmentB,
+  class ElementAccumulator,
+  class TileShape_MNK,
+  class ClusterShape_MNK,
+  class StageCountType,
+  int ScaleGranularityM
+>
+struct CollectiveBuilder<
+    arch::Sm90,
+    arch::OpClassTensorOp,
+    ElementA,
+    GmemLayoutATag,
+    AlignmentA,
+    ElementB,
+    GmemLayoutBTag,
+    AlignmentB,
+    ElementAccumulator,
+    TileShape_MNK,
+    ClusterShape_MNK,
+    StageCountType,
+    KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum<ScaleGranularityM>,
+    cute::enable_if_t<
+      not detail::is_use_rmem_A<ElementA, GmemLayoutATag, ElementB, GmemLayoutBTag>()>
+> {
+  using KernelScheduleType = KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum<ScaleGranularityM>;
+
+  static_assert(is_static<TileShape_MNK>::value);
+  static_assert(is_static<ClusterShape_MNK>::value);
+#ifndef CUTLASS_SM90_COLLECTIVE_BUILDER_SUPPORTED
+  static_assert(cutlass::detail::dependent_false<ElementA>, "Unsupported Toolkit for SM90 Collective Builder\n");
+#endif
+  static_assert(detail::is_aligned<ElementA, AlignmentA, ElementB, AlignmentB, detail::tma_alignment_bytes>(),
+                "Should meet TMA alignment requirement\n");
+
+  static constexpr bool IsArrayOfPointersGemm = (cute::is_any_of_v<KernelScheduleType,
+                                                                   KernelPtrArrayTmaWarpSpecializedCooperative,
+                                                                   KernelPtrArrayTmaWarpSpecializedPingpong>);
+  static constexpr bool IsFP8Input = detail::is_input_fp8<ElementA, ElementB>();
+  static_assert((!IsFP8Input || !IsArrayOfPointersGemm),
+                "KernelTmaWarpSpecializedCooperativeFP8BlockScaledAccum is only compatible with FP8 Blocked Scaled version right now.");
+
+  // For fp32 types, map to tf32 MMA value type
+  using ElementAMma = cute::conditional_t<cute::is_same_v<ElementA, float>, tfloat32_t, ElementA>;
+  using ElementBMma = cute::conditional_t<cute::is_same_v<ElementB, float>, tfloat32_t, ElementB>;
+
+  static constexpr cute::GMMA::Major GmmaMajorA = detail::gmma_ss_tag_to_major_A<ElementAMma, GmemLayoutATag>();
+  static constexpr cute::GMMA::Major GmmaMajorB = detail::gmma_ss_tag_to_major_B<ElementBMma, GmemLayoutBTag>();
+
+  static constexpr bool IsCooperative = cute::is_any_of_v<KernelScheduleType,
+                                                          KernelTmaWarpSpecializedCooperative,
+                                                          KernelPtrArrayTmaWarpSpecializedCooperative,
+                                                          KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum<ScaleGranularityM>>;
+  using AtomLayoutMNK = cute::conditional_t<IsCooperative,
+      Layout<Shape<_2,_1,_1>>, Layout<Shape<_1,_1,_1>>>;
+
+  using TiledMma = decltype(cute::make_tiled_mma(cute::GMMA::ss_op_selector<
+      ElementAMma, ElementBMma, ElementAccumulator, TileShape_MNK, GmmaMajorA, GmmaMajorB>(), AtomLayoutMNK{}));
+
+  using GmemTiledCopyA = decltype(detail::sm90_cluster_shape_to_tma_atom(shape<1>(ClusterShape_MNK{})));
+  using GmemTiledCopyB = decltype(detail::sm90_cluster_shape_to_tma_atom(shape<0>(ClusterShape_MNK{})));
+
+  using SmemLayoutAtomA = decltype(detail::ss_smem_selector<
+      GmmaMajorA, ElementAMma, decltype(cute::get<0>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>());
+  using SmemLayoutAtomB = decltype(detail::ss_smem_selector<
+      GmmaMajorB, ElementBMma, decltype(cute::get<1>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>());
+
+  static constexpr size_t TensorMapStorage = IsArrayOfPointersGemm ? sizeof(cute::TmaDescriptor) * 2 /* for A and B */ : 0;
+  static constexpr int KernelSmemCarveout = static_cast<int>(TensorMapStorage);
+
+  static constexpr int PipelineStages = detail::compute_stage_count_or_override<detail::sm90_smem_capacity_bytes - KernelSmemCarveout,
+      ElementAMma, ElementBMma, TileShape_MNK>(StageCountType{});
+  using DispatchPolicy = MainloopSm90TmaGmmaWarpSpecializedBlockScalingSubGroupMFP8<PipelineStages, ClusterShape_MNK, KernelScheduleType, ScaleGranularityM>;
+
+  using SmemCopyAtomA = void;
+  using SmemCopyAtomB = void;
+
+  using CollectiveOp = CollectiveMma<
+      DispatchPolicy,
+      TileShape_MNK,
+      ElementA,
+      TagToStrideA_t<GmemLayoutATag>,
+      ElementB,
+      TagToStrideB_t<GmemLayoutBTag>,
+      TiledMma,
+      GmemTiledCopyA,
+      SmemLayoutAtomA,
+      SmemCopyAtomA,
+      cute::identity,
+      GmemTiledCopyB,
+      SmemLayoutAtomB,
+      SmemCopyAtomB,
+      cute::identity
+    >;
+};
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace cutlass::gemm::collective
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/csrc/cutlass_extensions/gemm/collective/fp8_accumulation.hpp b/csrc/cutlass_extensions/gemm/collective/fp8_accumulation.hpp
new file mode 100644
index 0000000000000..13b90e998625e
--- /dev/null
+++ b/csrc/cutlass_extensions/gemm/collective/fp8_accumulation.hpp
@@ -0,0 +1,183 @@
+// clang-format off
+// adapted from: https://github.com/soundOfDestiny/cutlass/blob/a4208aa6958864923505cade9c63eb2a6daf16e5/include/cutlass/gemm/collective/fp8_accumulation.hpp
+
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+#pragma once
+
+#include "cute/algorithm/clear.hpp"
+#include "cute/tensor.hpp"
+
+//////////////////////////////////////////////////////////////////////////////
+///////////////////////////////////FP8 Accumulation///////////////////////////
+//////////////////////////////////////////////////////////////////////////////
+/// This class provides API to promote (add) or scale (multiply_add) the results
+/// from the tensor core accumulators to the main accumulators when the number 
+/// of MMAs reaches the max number of MMA interval specified by user, after that
+/// the tensor core accumulators are zeroed.
+//////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::gemm::collective {
+
+template <
+    class EngineAccum,
+    class LayoutAccum>
+struct GmmaFP8AccumulationWithScale {  
+  using TensorAccum = cute::Tensor<EngineAccum, LayoutAccum>;
+  using ElementAccumulator = typename EngineAccum::value_type;
+
+  static_assert(is_static<LayoutAccum>::value, "Accumulator Layout should be static");
+  static_assert(is_rmem<TensorAccum>::value , "Accumulator tensor must be rmem resident.");
+
+private:
+  TensorAccum& accum_;
+  TensorAccum accum_temp_;
+
+  uint32_t accum_promotion_interval_;         // defines the max num of executed MMAs after which accum should be promoted.
+  uint32_t mma_count_per_mainloop_iteration_; // num of MMAs per k_tile of mainloop
+  uint32_t mma_count_;                        // current executed MMAs
+  uint32_t reset_accum_flag_;                 // accum needs to be zeroed or not. 
+
+  // promote or `add` the partial accumulators to main accumulator (FADD).
+  CUTLASS_DEVICE
+  void promote_core() {
+    warpgroup_wait<0>();
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < size(accum_); ++i) {
+      accum_(i) += accum_temp_(i);
+    }
+  }
+
+  // `multiply` scale the partial accumulators and `add` to main accumulator (FFMA).
+  template <
+    class EngineScale,
+    class LayoutScale>
+  CUTLASS_DEVICE
+  void scale_core(const cute::Tensor<EngineScale, LayoutScale> &scale) {
+    using TensorScale = cute::Tensor<EngineScale, LayoutScale>;
+
+    static_assert(is_static<LayoutScale>::value, "Scale Layout should be static");
+    static_assert(is_rmem<TensorScale>::value , "Scale tensor must be rmem resident.");
+
+    static_assert(LayoutAccum{}.shape() == LayoutScale{}.shape(), "Accumulator and scale must have same shape.");
+
+    warpgroup_wait<0>();
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < size(accum_); ++i) {
+      accum_(i) += accum_temp_(i) * scale(i);
+    }
+  }
+
+public:
+  CUTLASS_DEVICE
+  GmmaFP8AccumulationWithScale(
+      TensorAccum &accum,
+      uint32_t accum_promotion_interval,
+      uint32_t mma_count_per_mainloop_iteration)
+      : accum_(accum), 
+        accum_promotion_interval_(accum_promotion_interval),
+        mma_count_per_mainloop_iteration_(mma_count_per_mainloop_iteration),
+        mma_count_(0), 
+        reset_accum_flag_(0) 
+  {
+    accum_temp_ = cute::make_fragment_like(accum);
+  }
+
+  //
+  // Methods (Common)
+  //
+
+  CUTLASS_DEVICE 
+  TensorAccum& operator()() {
+    return accum_temp_;
+  }
+
+  /// prepare the MMA accumulators when initialization or zeroing is required.
+  CUTLASS_DEVICE
+  bool prepare_if_needed() { 
+    return reset_accum_flag_;
+  }
+
+  //
+  // Methods (for FADD version)
+  //
+
+  /// promote (add) the results from the MMA accumulators to main accumulator if needed.
+  CUTLASS_DEVICE
+  void promote_if_needed() {
+    mma_count_ += mma_count_per_mainloop_iteration_;
+    reset_accum_flag_ = __shfl_sync(0xffffffff, mma_count_ == accum_promotion_interval_, 0);
+    if (reset_accum_flag_) {
+      promote_core();
+      mma_count_ = 0;
+    }
+  }
+
+  /// promote (add) the residue results from the MMA accumulators to main accumulator if needed.
+  CUTLASS_DEVICE
+  void promote_residue_if_needed() {
+    if (__shfl_sync(0xffffffff, mma_count_ > 0, 0)) {
+      promote_core();
+    }
+  }
+
+  //
+  // Methods (for FFMA version)
+  //
+
+  /// scale (multiply_add) the results from the MMA accumulators to main accumulator if needed.
+  template <
+    class EngineScale,
+    class LayoutScale>
+  CUTLASS_DEVICE
+  void scale_if_needed(const cute::Tensor<EngineScale, LayoutScale> &scale) {
+    mma_count_ += mma_count_per_mainloop_iteration_;
+    reset_accum_flag_ = __shfl_sync(0xffffffff, mma_count_ == accum_promotion_interval_, 0);
+    if (reset_accum_flag_) {
+      scale_core(scale);
+      mma_count_ = 0;
+    }
+  }
+
+  /// scale (multiply_add) the residue results from the MMA accumulators to main accumulator if needed.
+  template <
+    class EngineScale,
+    class LayoutScale>
+  CUTLASS_DEVICE
+  void scale_residue_if_needed(const cute::Tensor<EngineScale, LayoutScale> &scale) {
+    if (__shfl_sync(0xffffffff, mma_count_ > 0, 0)) {
+      scale_core(scale);
+    }
+  }
+};
+
+} // namespace cutlass::gemm::collective
diff --git a/csrc/cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp b/csrc/cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp
new file mode 100644
index 0000000000000..928a9500cbb08
--- /dev/null
+++ b/csrc/cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp
@@ -0,0 +1,730 @@
+// clang-format off
+// Adapted (Heavily) from: https://github.com/soundOfDestiny/cutlass/blob/9d997ce0dea4c5fa1a617db6b7ff29aa9235822c/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp
+
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/dispatch_policy.hpp"
+#include "cutlass/trace.h"
+#include "cutlass/numeric_types.h"
+
+#include "cute/arch/cluster_sm90.hpp"
+#include "cute/arch/copy_sm80.hpp"
+#include "cute/arch/copy_sm90.hpp"
+#include "cute/algorithm/functional.hpp"
+#include "cute/atom/mma_atom.hpp"
+#include "cute/algorithm/gemm.hpp"
+#include "cute/tensor_predicate.hpp"
+#include "cute/numeric/arithmetic_tuple.hpp"
+
+#include "cutlass_extensions/gemm/dispatch_policy.hpp"
+#include "cutlass_extensions/gemm/collective/fp8_accumulation.hpp"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::gemm::collective {
+using namespace cute;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// WarpSpecialized Mainloop
+template <
+  int Stages,
+  class ClusterShape,
+  class KernelSchedule,
+  int ScaleGranularityM_,
+  class TileShape_,
+  class ElementA_,
+  class StrideA_,
+  class ElementB_,
+  class StrideB_,
+  class TiledMma_,
+  class GmemTiledCopyA_,
+  class SmemLayoutAtomA_,
+  class SmemCopyAtomA_,
+  class TransformA_,
+  class GmemTiledCopyB_,
+  class SmemLayoutAtomB_,
+  class SmemCopyAtomB_,
+  class TransformB_>
+struct CollectiveMma<
+    MainloopSm90TmaGmmaWarpSpecializedBlockScalingSubGroupMFP8<Stages, ClusterShape, KernelSchedule, ScaleGranularityM_>,
+    TileShape_,
+    ElementA_,
+    StrideA_,
+    ElementB_,
+    StrideB_,
+    TiledMma_,
+    GmemTiledCopyA_,
+    SmemLayoutAtomA_,
+    SmemCopyAtomA_,
+    TransformA_,
+    GmemTiledCopyB_,
+    SmemLayoutAtomB_,
+    SmemCopyAtomB_,
+    TransformB_>
+{
+  //
+  // Type Aliases
+  //
+  using DispatchPolicy = MainloopSm90TmaGmmaWarpSpecializedBlockScalingSubGroupMFP8<Stages, ClusterShape, KernelSchedule, ScaleGranularityM_>;
+  using TileShape = TileShape_;
+  using ElementA = ElementA_;
+  using StrideA = StrideA_;
+  using ElementB = ElementB_;
+  using StrideB = StrideB_;
+  using TiledMma = TiledMma_;
+  using ElementAccumulator = typename TiledMma::ValTypeC;
+  using ElementBlockScale = ElementAccumulator;
+  using GmemTiledCopyA = GmemTiledCopyA_;
+  using GmemTiledCopyB = GmemTiledCopyB_;
+  using SmemLayoutAtomA = SmemLayoutAtomA_;
+  using SmemLayoutAtomB = SmemLayoutAtomB_;
+  using SmemCopyAtomA = SmemCopyAtomA_;
+  using SmemCopyAtomB = SmemCopyAtomB_;
+  using TransformA = TransformA_;
+  using TransformB = TransformB_;
+  using ArchTag = typename DispatchPolicy::ArchTag;
+
+  using CtaShape_MNK = decltype(shape_div(TileShape{}, ClusterShape{}));
+  using MainloopPipeline = cutlass::PipelineTmaAsync<DispatchPolicy::Stages>;
+  using PipelineState = cutlass::PipelineState<DispatchPolicy::Stages>;
+  using PipelineParams = typename MainloopPipeline::Params;
+
+  // Two threads per CTA are producers (1 for operand tile and 32 for scales)
+  static constexpr int NumProducerThreadEvents = 33; 
+
+  static constexpr int ScaleGranularityM = ScaleGranularityM_ == 0 ? size<0>(TileShape{}) : ScaleGranularityM_;
+  static constexpr int ScaleMsPerTile = size<0>(TileShape{}) / ScaleGranularityM;
+
+  static_assert(cute::rank(SmemLayoutAtomA{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
+  static_assert((size<0>(TileShape{}) % size<0>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+
+  static_assert(cute::rank(SmemLayoutAtomB{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
+  static_assert((size<1>(TileShape{}) % size<0>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+
+  static_assert((size<0>(TileShape{}) % ScaleGranularityM) == 0, "FP8 scaling granularity must evenly divide tile shape along M.");
+
+  // Tile along modes in a way that maximizes the TMA box size.
+  using SmemLayoutA = decltype(tile_to_shape(
+      SmemLayoutAtomA{},
+      make_shape(shape<0>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{}),
+      cute::conditional_t< ::cutlass::gemm::detail::is_major<0,StrideA>(), Step<_2,_1,_3>, Step<_1,_2,_3>>{}));
+  using SmemLayoutB = decltype(tile_to_shape(
+      SmemLayoutAtomB{},
+      make_shape(shape<1>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{}),
+      cute::conditional_t< ::cutlass::gemm::detail::is_major<0,StrideB>(), Step<_2,_1,_3>, Step<_1,_2,_3>>{}));
+  
+  // Block scaling gmem-to-smem copy atom 
+  using SmemBlockScalingCopyAtomA = Copy_Atom<SM80_CP_ASYNC_CACHEALWAYS<ElementBlockScale>, ElementBlockScale>;
+  using SmemBlockScalingCopyAtomB = Copy_Atom<SM80_CP_ASYNC_CACHEALWAYS<ElementBlockScale>, ElementBlockScale>;
+  
+  // Block scaling smem layout
+  using SmemLayoutScaleA = Layout<Shape<Int<ScaleMsPerTile>, Int<DispatchPolicy::Stages>>>;
+  using SmemLayoutScaleB = Layout<Shape<Int<DispatchPolicy::Stages>>, Stride<_1>>; // `ScaleNsPerTile` is always 1.
+
+  static_assert(DispatchPolicy::Stages >= 2, "Specialization requires Stages set to value 1 or more.");
+  static_assert(cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeA>::value &&
+                cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeB>::value,
+                "MMA atom must source both A and B operand from smem_desc for this mainloop.");
+  static_assert(cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD> || cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_MULTICAST>,
+      "GmemTiledCopy - invalid SM90 TMA copy atom specified.");
+  static_assert(cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD> || cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD_MULTICAST>,
+      "GmemTiledCopy - invalid SM90 TMA copy atom specified.");
+  static_assert(cute::is_same_v<ElementAccumulator, ElementBlockScale>,
+             "ElementAccumulator and ElementBlockScale should be same datatype");
+
+  struct SharedStorage
+  {
+    struct TensorStorage : cute::aligned_struct<128> {
+      cute::array_aligned<typename TiledMma::ValTypeA, cute::cosize_v<SmemLayoutA>> smem_A;  // mxk
+      cute::array_aligned<typename TiledMma::ValTypeB, cute::cosize_v<SmemLayoutB>> smem_B;  // nxk
+      cute::array_aligned<ElementBlockScale, cute::cosize_v<SmemLayoutScaleA>> smem_scale_A; // ScaleMsPerTile x k
+      cute::array_aligned<ElementBlockScale, cute::cosize_v<SmemLayoutScaleB>> smem_scale_B; // 1xk
+    } tensors;
+
+    using PipelineStorage = typename MainloopPipeline::SharedStorage;
+    PipelineStorage pipeline;
+  };
+  using TensorStorage = typename SharedStorage::TensorStorage;
+  using PipelineStorage = typename SharedStorage::PipelineStorage;
+
+  // Host side kernel arguments
+  struct Arguments {
+    ElementA const* ptr_A;
+    StrideA dA;
+    ElementB const* ptr_B;
+    StrideB dB;
+    ElementBlockScale const* ptr_scale_A; 
+    ElementBlockScale const* ptr_scale_B;
+  };
+
+  // Device side kernel params
+  struct Params {
+    // Assumption: StrideA is congruent with Problem_MK
+    using TMA_A = decltype(make_tma_copy_A_sm90(
+        GmemTiledCopyA{},
+        make_tensor(static_cast<ElementA const*>(nullptr), repeat_like(StrideA{}, int32_t(0)), StrideA{}),
+        SmemLayoutA{}(_,_,0),
+        TileShape{},
+        ClusterShape{}));
+    // Assumption: StrideB is congruent with Problem_NK
+    using TMA_B = decltype(make_tma_copy_B_sm90(
+        GmemTiledCopyB{},
+        make_tensor(static_cast<ElementB const*>(nullptr), repeat_like(StrideB{}, int32_t(0)), StrideB{}),
+        SmemLayoutB{}(_,_,0),
+        TileShape{},
+        ClusterShape{}));
+    TMA_A tma_load_a;
+    TMA_B tma_load_b;
+    uint32_t tma_transaction_bytes = TmaTransactionBytes;
+    uint32_t tma_transaction_bytes_mk = TmaTransactionBytesMK;
+    uint32_t tma_transaction_bytes_nk = TmaTransactionBytesNK;
+    // Block scaling factors for A and B
+    ElementBlockScale const* ptr_scale_A; 
+    ElementBlockScale const* ptr_scale_B;
+  };
+
+  //
+  // Methods
+  //
+
+  template <class ProblemShape>
+  static constexpr Params
+  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
+    (void) workspace;
+
+    // Optionally append 1s until problem shape is rank-4 (MNKL), in case it is only rank-3 (MNK)
+    auto problem_shape_MNKL = append<4>(problem_shape, 1);
+    auto [M,N,K,L] = problem_shape_MNKL;
+
+    auto ptr_A = reinterpret_cast<ElementA const*>(args.ptr_A);
+    auto ptr_B = reinterpret_cast<ElementB const*>(args.ptr_B);
+
+    Tensor tensor_a = make_tensor(ptr_A, make_layout(make_shape(M,K,L), args.dA));
+    Tensor tensor_b = make_tensor(ptr_B, make_layout(make_shape(N,K,L), args.dB));
+    typename Params::TMA_A tma_load_a = make_tma_copy_A_sm90(
+        GmemTiledCopyA{},
+        tensor_a,
+        SmemLayoutA{}(_,_,cute::Int<0>{}),
+        TileShape{},
+        ClusterShape{});
+    typename Params::TMA_B tma_load_b = make_tma_copy_B_sm90(
+        GmemTiledCopyB{},
+        tensor_b,
+        SmemLayoutB{}(_,_,cute::Int<0>{}),
+        TileShape{},
+        ClusterShape{});
+    uint32_t transaction_bytes_mk = TmaTransactionBytesMK;
+    uint32_t transaction_bytes_nk = TmaTransactionBytesNK;
+    uint32_t transaction_bytes = transaction_bytes_mk + transaction_bytes_nk;
+
+    return {
+      tma_load_a,
+      tma_load_b,
+      transaction_bytes,
+      transaction_bytes_mk,
+      transaction_bytes_nk,
+      args.ptr_scale_A,
+      args.ptr_scale_B
+    };
+  }
+
+  template<class ProblemShape>
+  static bool
+  can_implement(
+      ProblemShape const& problem_shape,
+      [[maybe_unused]] Arguments const& args) {
+    constexpr int tma_alignment_bits = 128;
+    auto problem_shape_MNKL = append<4>(problem_shape, 1);
+    auto [M,N,K,L] = problem_shape_MNKL;
+    
+    bool implementable = true;
+    constexpr int min_tma_aligned_elements_A = tma_alignment_bits / cutlass::sizeof_bits<ElementA>::value;
+    implementable = implementable && cutlass::detail::check_alignment<min_tma_aligned_elements_A>(cute::make_shape(M,K,L), StrideA{});
+    constexpr int min_tma_aligned_elements_B = tma_alignment_bits / cutlass::sizeof_bits<ElementB>::value;
+    implementable = implementable && cutlass::detail::check_alignment<min_tma_aligned_elements_B>(cute::make_shape(N,K,L), StrideB{});
+
+    if (!implementable) {
+      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for TMA.\n");
+    }
+    return implementable;
+  }
+
+  static constexpr int K_PIPE_MAX = DispatchPolicy::Stages;
+  static constexpr int K_PIPE_MMAS = 1;
+  static constexpr uint32_t TmaTransactionBytesMK =
+        cutlass::bits_to_bytes(size<0>(SmemLayoutA{}) * size<1>(SmemLayoutA{}) * static_cast<uint32_t>(sizeof_bits<ElementA>::value));
+  static constexpr uint32_t TmaTransactionBytesNK =
+        cutlass::bits_to_bytes(size<0>(SmemLayoutB{}) * size<1>(SmemLayoutB{}) * static_cast<uint32_t>(sizeof_bits<ElementB>::value));
+  static constexpr uint32_t TmaTransactionBytes = TmaTransactionBytesMK + TmaTransactionBytesNK;
+
+  /// Issue Tma Descriptor Prefetch -- ideally from a single thread for best performance
+  CUTLASS_DEVICE
+  static void prefetch_tma_descriptors(Params const& mainloop_params)
+  {
+    cute::prefetch_tma_descriptor(mainloop_params.tma_load_a.get_tma_descriptor());
+    cute::prefetch_tma_descriptor(mainloop_params.tma_load_b.get_tma_descriptor());
+  }
+
+  /// Set up the data needed by this collective for load and mma.
+  /// Returns a tuple of tensors. The collective and the kernel layer have the contract
+  /// Returned tuple must contain at least two elements, with the first two elements being:
+  /// gA_mkl - The tma tensor, A after a local tile so it has shape  (BLK_M,BLK_K,m,k,l)
+  /// gB_nkl - The tma tensor, B after a local tile so it has shape  (BLK_N,BLK_K,n,k,l)
+  template <class ProblemShape_MNKL>
+  CUTLASS_DEVICE auto
+  load_init(ProblemShape_MNKL const& problem_shape_MNKL, Params const& mainloop_params) const {
+    using X = Underscore;
+    // Separate out problem shape for convenience
+    auto [M,N,K,L] = problem_shape_MNKL;
+
+    // TMA requires special handling of strides to deal with coord codomain mapping
+    // Represent the full tensors -- get these from TMA
+    Tensor mA_mkl = mainloop_params.tma_load_a.get_tma_tensor(make_shape(M,K,L));                            // (m,k,l)
+    Tensor mB_nkl = mainloop_params.tma_load_b.get_tma_tensor(make_shape(N,K,L));                            // (n,k,l)
+
+    // Make tiled views, defer the slice
+    Tensor gA_mkl = local_tile(mA_mkl, TileShape{}, make_coord(_,_,_), Step<_1, X,_1>{});        // (BLK_M,BLK_K,m,k,l)
+    Tensor gB_nkl = local_tile(mB_nkl, TileShape{}, make_coord(_,_,_), Step< X,_1,_1>{});        // (BLK_N,BLK_K,n,k,l)
+
+    constexpr auto scales_m = Int<ScaleMsPerTile>{};
+    auto tM = get<2>(gA_mkl.shape());
+    auto tN = get<2>(gB_nkl.shape());
+    auto tK = get<3>(gA_mkl.shape());
+
+    // Make the tiled views of scale tensors
+    auto scaleA_shape = make_shape(M / ScaleGranularityM, tK, L); // (scale_m,k,l)
+    auto scaleA_layout = make_ordered_layout(scaleA_shape,  Step<_0, _1, _2>{});
+    auto scaleB_shape = make_shape(tN, tK, L); // (n,k,l)
+    auto scaleB_layout = make_ordered_layout(scaleB_shape, Step<_1, _0, _2>{});
+
+    // Note that mScaleA_mkl and mScaleB_nkl are already blocked tiled in the `m` host and 
+    // gScaleA_mkl and gScaleB_nkl in `g` global memory are same as mScaleA_mkl and mScaleB_nkl.
+    Tensor mScaleA_mkl = make_tensor(make_gmem_ptr(mainloop_params.ptr_scale_A), scaleA_layout); // (scale_m,k,l)
+    Tensor mScaleB_nkl = make_tensor(make_gmem_ptr(mainloop_params.ptr_scale_B), scaleB_layout); // (n,k,l)
+
+    return cute::make_tuple(gA_mkl, gB_nkl, mScaleA_mkl, mScaleB_nkl);
+  }
+
+  /// Perform a collective-scoped matrix multiply-accumulate
+  /// Producer Perspective
+  template <
+    class TensorA, class TensorB,
+    class TensorScaleA, class TensorScaleB,
+    class KTileIterator, class BlockCoord
+  >
+  CUTLASS_DEVICE void
+  load(
+      Params const& mainloop_params,
+      MainloopPipeline pipeline,
+      PipelineState smem_pipe_write,
+      cute::tuple<TensorA, TensorB, TensorScaleA, TensorScaleB> const& load_inputs,
+      BlockCoord const& blk_coord,
+      KTileIterator k_tile_iter, int k_tile_count,
+      int thread_idx,
+      uint32_t block_rank_in_cluster,
+      TensorStorage& shared_tensors) {
+    int lane_predicate = cute::elect_one_sync();
+
+    // Blockscaling: Tma loads for load_input and CpAsync for load_scale
+    Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{});        // (BLK_M,BLK_K,PIPE)
+    Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{});        // (BLK_N,BLK_K,PIPE)
+    Tensor sScaleA = make_tensor(cute::make_smem_ptr(shared_tensors.smem_scale_A.data()), SmemLayoutScaleA{}); // (ScaleMsPerTile,k)
+    Tensor sScaleB = make_tensor(cute::make_smem_ptr(shared_tensors.smem_scale_B.data()), SmemLayoutScaleB{}); // (k)
+
+    //
+    // Prepare the TMA loads for A and B
+    //
+
+    constexpr uint32_t cluster_shape_x = get<0>(ClusterShape());
+    uint2 cluster_local_block_id = {block_rank_in_cluster % cluster_shape_x, block_rank_in_cluster / cluster_shape_x};
+
+    Tensor gA_mkl = get<0>(load_inputs);
+    Tensor gB_nkl = get<1>(load_inputs);
+
+    auto block_tma_a = mainloop_params.tma_load_a.get_slice(cluster_local_block_id.y);
+    auto block_tma_b = mainloop_params.tma_load_b.get_slice(cluster_local_block_id.x);
+
+    // Partition the inputs based on the current block coordinates.
+    auto [m_coord, n_coord, k_coord, l_coord] = blk_coord;
+    Tensor gA = gA_mkl(_,_,m_coord,_,l_coord);                                                     // (BLK_M,BLK_K,k)
+    Tensor gB = gB_nkl(_,_,n_coord,_,l_coord);                                                     // (BLK_N,BLK_K,k)
+
+
+    // Block scaling: load_scale has scaling tensors in global memory which are not tiled
+    Tensor mScaleA_mkl = get<2>(load_inputs);
+    Tensor mScaleB_nkl = get<3>(load_inputs);
+    auto scales_m = get<0>(mScaleA_mkl.shape());
+
+    Tensor cScaleA_mkl = make_identity_tensor(mScaleA_mkl.shape());
+
+    Tensor gScaleA = local_tile( 
+      mScaleA_mkl, make_tile(Int<ScaleMsPerTile>{}), 
+      make_coord(m_coord,_,l_coord));                   // (ScaleMsPerTile,k,1)
+    Tensor cScaleA = local_tile( 
+      cScaleA_mkl, make_tile(Int<ScaleMsPerTile>{}), 
+      make_coord(m_coord,_,l_coord));
+    Tensor gScaleB = mScaleB_nkl(n_coord,_,l_coord);                                           // (1,k,1)
+
+    // TODO: test `scale_copy_a` with `ScaleMsPerTile` < 128
+    TiledCopy scale_copy_a = make_tiled_copy(SmemBlockScalingCopyAtomA{}, 
+      Layout<Shape<_32, _1>>{}, Layout<Shape<_4, _1>>{}); // (1,1,1)
+    TiledCopy scale_copy_b = make_tiled_copy(SmemBlockScalingCopyAtomB{}, 
+      Layout<Shape<_1>>{}, Layout<Shape<_1>>{}); // (1,1,1)
+    ThrCopy thr_scale_copy_a = scale_copy_a.get_slice(threadIdx.x);
+    ThrCopy thr_scale_copy_b = scale_copy_b.get_slice(threadIdx.x);
+    
+    Tensor tAgA_ScaleA = thr_scale_copy_a.partition_S(gScaleA);
+    Tensor tAcA_ScaleA = thr_scale_copy_a.partition_S(cScaleA);
+    Tensor tAsA_ScaleA = thr_scale_copy_a.partition_D(sScaleA);
+    
+    Tensor tBgB_ScaleB = thr_scale_copy_b.partition_S(gScaleB);
+    Tensor tBsB_ScaleB = thr_scale_copy_b.partition_D(sScaleB);
+
+    // Applies the mapping from block_tma_a
+    Tensor tAgA = block_tma_a.partition_S(gA);                                              // (TMA,TMA_M,TMA_K,k)
+    Tensor tAsA = block_tma_a.partition_D(sA);                                              // (TMA,TMA_M,TMA_K,PIPE)
+
+    Tensor tBgB = block_tma_b.partition_S(gB);                                              // (TMA,TMA_N,TMA_K,k)
+    Tensor tBsB = block_tma_b.partition_D(sB);                                              // (TMA,TMA_N,TMA_K,PIPE)
+
+    uint16_t mcast_mask_a = 0;
+    uint16_t mcast_mask_b = 0;
+
+    // Issue TmaLoads for GEMM operands A/B and CpAsync for scale tensors
+    // Maps the tile -> block, value
+    if constexpr (cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_MULTICAST>) {
+      auto block_layout = Layout<typename DispatchPolicy::ClusterShape>{};                       // (m,n) -> block_id
+      for (int n = 0; n < size<1>(block_layout); ++n) {
+        mcast_mask_a |= (uint16_t(1) << block_layout(cluster_local_block_id.x,n,Int<0>{}));
+      }
+    }
+
+    if constexpr (cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD_MULTICAST>) {
+      auto block_layout = Layout<typename DispatchPolicy::ClusterShape>{};                       // (m,n) -> block_id
+      for (int m = 0; m < size<0>(block_layout); ++m) {
+        mcast_mask_b |= (uint16_t(1) << block_layout(m,cluster_local_block_id.y,Int<0>{}));
+      }
+    }
+
+    // Allocate predicate tensors for a_scales (since we can't guarantee that 
+    // all scales are valid, since we could have a partial tiles along M)
+    Tensor tApA_ScaleA = make_tensor<bool>(shape(tAsA_ScaleA(_,_,0)));
+    #pragma unroll
+    for (int i = 0; i < size(tApA_ScaleA); ++i) {
+      tApA_ScaleA(i) = get<0>(tAcA_ScaleA(i)) < scales_m;
+    }
+
+    // Mainloop
+    CUTLASS_PRAGMA_NO_UNROLL
+    for ( ; k_tile_count > 0; --k_tile_count) {
+      // LOCK smem_pipe_write for _writing_
+      pipeline.producer_acquire(smem_pipe_write);
+
+      //
+      // Copy gmem to smem for *k_tile_iter
+      //
+      int write_stage = smem_pipe_write.index();
+      using BarrierType = typename MainloopPipeline::ProducerBarrierType;
+      BarrierType* tma_barrier = pipeline.producer_get_barrier(smem_pipe_write);
+
+      // Copy operands A and B from global memory to shared memory
+      if (lane_predicate) copy(mainloop_params.tma_load_a.with(*tma_barrier, mcast_mask_a), tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,write_stage));
+      if (lane_predicate) copy(mainloop_params.tma_load_b.with(*tma_barrier, mcast_mask_b), tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,write_stage));
+
+      // Copy scale tensors from global memory to shared memory
+      copy_if(scale_copy_a, tApA_ScaleA, tAgA_ScaleA(_,_,*k_tile_iter), tAsA_ScaleA(_,_,write_stage));
+      copy(scale_copy_b, tBgB_ScaleB(_,*k_tile_iter), tBsB_ScaleB(_,write_stage));
+      pipeline.producer_commit(smem_pipe_write, cutlass::arch::cpasync_barrier_arrive_noinc);
+
+      ++k_tile_iter;
+
+      // Advance smem_pipe_write
+      ++smem_pipe_write;
+    }
+  }
+
+  /// Perform a Producer Epilogue to prevent early exit of blocks in a Cluster
+  CUTLASS_DEVICE void
+  load_tail(
+      MainloopPipeline pipeline,
+      PipelineState smem_pipe_write) {
+    int lane_predicate = cute::elect_one_sync();
+
+    // Issue the epilogue waits
+    if (lane_predicate) {
+      /* This helps avoid early exit of blocks in Cluster
+       * Waits for all stages to either be released (all
+       * Consumer UNLOCKs), or if the stage was never used
+       * then would just be acquired since the phase was
+       * still inverted from make_producer_start_state
+       */
+      pipeline.producer_tail(smem_pipe_write);
+    }
+  }
+
+  /// Perform a collective-scoped matrix multiply-accumulate
+  /// Consumer Perspective
+  template <
+    class FrgTensorC
+  >
+  CUTLASS_DEVICE void
+  mma(MainloopPipeline pipeline,
+      PipelineState smem_pipe_read,
+      FrgTensorC& accum,
+      int k_tile_count,
+      int thread_idx,
+      TensorStorage& shared_tensors,
+      Params const& mainloop_params) {
+
+
+    static_assert(is_rmem<FrgTensorC>::value, "C tensor must be rmem resident.");
+    static_assert(cute::rank(SmemLayoutA{}) == 3, "Smem layout must be rank 3.");
+    static_assert(cute::rank(SmemLayoutB{}) == 3, "Smem layout must be rank 3.");
+    static_assert(cute::is_void_v<SmemCopyAtomA>,
+      "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions.");
+    static_assert(cute::is_void_v<SmemCopyAtomB>,
+      "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions.");
+
+    Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{});          // (BLK_M,BLK_K,PIPE)
+    Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{});          // (BLK_N,BLK_K,PIPE)
+    
+    // Block scaling
+    Tensor sScaleAViewAsC = make_tensor(cute::make_smem_ptr(shared_tensors.smem_scale_A.data()),
+      Layout<
+        Shape<Shape<Int<ScaleGranularityM>, Int<ScaleMsPerTile>>, cute::tuple_element_t<1, TileShape>, Int<DispatchPolicy::Stages>>,
+        Stride<Stride<_0, _1>, _0, Int<ScaleMsPerTile>>
+      >{}); // ((ScaleGranularityM,ScaleMsPerTile),n,k)
+    Tensor sScaleB = make_tensor(cute::make_smem_ptr(shared_tensors.smem_scale_B.data()), SmemLayoutScaleB{}); // (k)
+
+    //
+    // Define C accumulators and A/B partitioning
+    //
+    
+    // Layout of warp group to thread mapping
+
+    static_assert(stride<0>(typename TiledMma::ALayout{}) == 0 and 
+                  stride<0>(typename TiledMma::BLayout{}) == 0 and
+                  size<0>(typename TiledMma::ALayout{}) == NumThreadsPerWarpGroup and
+                  size<0>(typename TiledMma::BLayout{}) == NumThreadsPerWarpGroup, 
+                  "Stride of the first mode must be 0 and the size of the mode must be NumThreadsPerWarpGroup");
+
+    constexpr int MmaWarpGroups = size(TiledMma{}) / NumThreadsPerWarpGroup;
+    Layout warp_group_thread_layout = make_layout(Int<MmaWarpGroups>{}, 
+                                                  Int<NumThreadsPerWarpGroup>{});
+
+    int warp_group_idx = __shfl_sync(0xFFFFFFFF, thread_idx / NumThreadsPerWarpGroup, 0);
+
+    TiledMma tiled_mma;
+    auto thread_mma = tiled_mma.get_slice(warp_group_thread_layout(warp_group_idx));
+
+    Tensor tCsScaleAViewAsC = tiled_mma.get_slice(thread_idx).partition_C(sScaleAViewAsC);    // (MMA,MMA_M,MMA_N,PIPE), `thread_mma` above is correct when partitioning A and B, but it is not correct when partitioning C.
+
+    Tensor tCsA = thread_mma.partition_A(sA);                                                 // (MMA,MMA_M,MMA_K,PIPE)
+    Tensor tCsB = thread_mma.partition_B(sB);                                                 // (MMA,MMA_N,MMA_K,PIPE)
+
+    // Allocate "fragments/descriptors"
+    Tensor tCrA = thread_mma.make_fragment_A(tCsA);                                           // (MMA,MMA_M,MMA_K,PIPE)
+    Tensor tCrB = thread_mma.make_fragment_B(tCsB);                                           // (MMA,MMA_N,MMA_K,PIPE)
+
+    CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(accum));                                                         // M
+    CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<2>(accum));                                                         // N
+    CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCsB));                                                          // K
+    CUTE_STATIC_ASSERT_V(size<3>(tCsA) == size<3>(tCsB));                                                       // PIPE
+    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sA));                                         // PIPE
+    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sB));                                         // PIPE
+
+    //
+    // PIPELINED MAIN LOOP
+    //
+    static_assert((0 <= K_PIPE_MMAS) && (K_PIPE_MMAS <  K_PIPE_MAX),
+        "ERROR : Incorrect number of MMAs in flight");
+
+    // We release buffers to producer warps(dma load) with some mmas in flight
+    PipelineState smem_pipe_release = smem_pipe_read;
+    
+    // Per block scale values for operand A and B
+
+    using RegLayoutScaleAViewAsC = decltype(make_layout_like(tCsScaleAViewAsC(_, _, _, 0).layout())); // `make_layout_like` makes a compact layout.
+    using RegLayoutScaleAEssential = decltype(filter_zeros(RegLayoutScaleAViewAsC{}.stride(), RegLayoutScaleAViewAsC{}.shape())); // an interface to traverse the underlying storage for the compact layout mentioned above
+
+    Tensor tCrScaleAViewAsC = make_tensor<ElementBlockScale>(RegLayoutScaleAViewAsC{});              // (MMA,MMA_M,MMA_N)
+    ElementBlockScale scale_b;
+
+    // Prologue GMMAs
+    int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count);
+
+    tiled_mma.accumulate_ = GMMA::ScaleOut::Zero;
+
+    GmmaFP8AccumulationWithScale accumulation(accum, size<2>(TileShape{}) / size<2>(typename TiledMma::AtomShape_MNK{}), size<2>(tCrA));
+    warpgroup_fence_operand(accumulation());
+    CUTLASS_PRAGMA_UNROLL
+    for (int k_tile_prologue = prologue_mma_count; k_tile_prologue > 0; --k_tile_prologue)
+    {
+      // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value)
+      auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
+      pipeline.consumer_wait(smem_pipe_read, barrier_token);
+
+      if (accumulation.prepare_if_needed()) {
+        tiled_mma.accumulate_ = GMMA::ScaleOut::Zero;
+      }
+
+      int read_stage = smem_pipe_read.index();
+      
+      // Load per block scale values from shared memory to registers.
+      scale_b = sScaleB[read_stage];
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < size(RegLayoutScaleAEssential{}); i++) {
+        tCrScaleAViewAsC.data()[i] = tCsScaleAViewAsC(_, _, _, read_stage)(idx2crd(i, RegLayoutScaleAEssential{}));
+      }
+      if constexpr (ScaleMsPerTile == 1) {
+        static_assert(size(RegLayoutScaleAEssential{}) == 1);
+        tCrScaleAViewAsC.data()[0] = __shfl_sync(0xffffffff, tCrScaleAViewAsC.data()[0] * scale_b, 0); // `tCrScaleAViewAsC.data()[0]` are all same in a warp group when `ScaleMsPerTile == 1`.
+      } else {
+        CUTLASS_PRAGMA_UNROLL
+        for (int i = 0; i < size(RegLayoutScaleAEssential{}); i++) {
+          tCrScaleAViewAsC.data()[i] = tCrScaleAViewAsC.data()[i] * scale_b;
+        }
+      }
+
+      warpgroup_arrive();
+      // Unroll the K mode manually to set scale D to 1
+      CUTLASS_PRAGMA_UNROLL
+      for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) {
+        // (V,M,K) x (V,N,K) => (V,M,N)
+        cute::gemm(tiled_mma, tCrA(_,_,k_block,read_stage), tCrB(_,_,k_block,read_stage), accumulation());
+        tiled_mma.accumulate_ = GMMA::ScaleOut::One;
+      }
+      warpgroup_commit_batch();
+
+      // Block scale the accumulators with reg tensor `tCrScaleAViewAsC`
+      accumulation.scale_if_needed(tCrScaleAViewAsC);
+
+      ++smem_pipe_read;
+    }
+
+    warpgroup_fence_operand(accumulation());
+    // Mainloop GMMAs
+    k_tile_count -= prologue_mma_count;
+
+    CUTLASS_PRAGMA_NO_UNROLL
+    for ( ; k_tile_count > 0; --k_tile_count)
+    {
+      // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value)
+      auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
+      pipeline.consumer_wait(smem_pipe_read, barrier_token);
+
+      //
+      // Compute on k_tile
+      //
+
+      int read_stage = smem_pipe_read.index();
+
+      // Load per block scale values from shared memory to registers (at most twice per block along M and exactly once per block along N) 
+      scale_b = sScaleB[read_stage];
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < size(RegLayoutScaleAEssential{}); i++) {
+        tCrScaleAViewAsC.data()[i] = tCsScaleAViewAsC(_, _, _, read_stage)(idx2crd(i, RegLayoutScaleAEssential{}));
+      }
+      if constexpr (ScaleMsPerTile == 1) {
+        static_assert(size(RegLayoutScaleAEssential{}) == 1);
+        tCrScaleAViewAsC.data()[0] = __shfl_sync(0xffffffff, tCrScaleAViewAsC.data()[0] * scale_b, 0); // `tCrScaleAViewAsC.data()[0]` are all same in a warp group when `ScaleMsPerTile == 1`.
+      } else {
+        CUTLASS_PRAGMA_UNROLL
+        for (int i = 0; i < size(RegLayoutScaleAEssential{}); i++) {
+          tCrScaleAViewAsC.data()[i] = tCrScaleAViewAsC.data()[i] * scale_b;
+        }
+      }
+
+      if (accumulation.prepare_if_needed()) {
+        tiled_mma.accumulate_ = GMMA::ScaleOut::Zero;
+      }
+
+      warpgroup_fence_operand(accumulation());
+      warpgroup_arrive();
+      // Unroll the K mode manually to set scale D to 1
+      CUTLASS_PRAGMA_UNROLL
+      for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) {
+        // (V,M,K) x (V,N,K) => (V,M,N)
+        cute::gemm(tiled_mma, tCrA(_,_,k_block,read_stage), tCrB(_,_,k_block,read_stage), accumulation());
+        tiled_mma.accumulate_ = GMMA::ScaleOut::One;
+      }
+      warpgroup_commit_batch();
+
+      /// Wait on the GMMA barrier for K_PIPE_MMAS (or fewer) outstanding to ensure smem_pipe_write is consumed
+      warpgroup_wait<K_PIPE_MMAS>();
+      warpgroup_fence_operand(accumulation());
+
+      // Block scale the accumulators with reg tensor `tCrScaleAViewAsC`
+      accumulation.scale_if_needed(tCrScaleAViewAsC);
+
+      pipeline.consumer_release(smem_pipe_release);                 // UNLOCK smem_pipe_release, done _computing_ on it
+
+      // Advance smem_pipe_read and smem_pipe_release
+      ++smem_pipe_read;
+      ++smem_pipe_release;
+    }
+    
+    accumulation.scale_residue_if_needed(tCrScaleAViewAsC);
+
+    warpgroup_fence_operand(accumulation());
+  }
+
+  /// Perform a Consumer Epilogue to release all buffers
+  CUTLASS_DEVICE void
+  mma_tail(MainloopPipeline pipeline, PipelineState smem_pipe_release, int k_tile_count) {
+    // Prologue GMMAs
+    int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count);
+    k_tile_count -= prologue_mma_count;
+
+    smem_pipe_release.advance(k_tile_count);
+
+    // Wait on all GMMAs to complete
+    warpgroup_wait<0>();
+
+    for (int count = 0; count < prologue_mma_count; ++count) {
+      pipeline.consumer_release(smem_pipe_release);                 // UNLOCK smem_pipe_release, done _computing_ on it
+      ++smem_pipe_release;
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::gemm::collective
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/csrc/cutlass_extensions/gemm/dispatch_policy.hpp b/csrc/cutlass_extensions/gemm/dispatch_policy.hpp
new file mode 100644
index 0000000000000..df809e27a3efe
--- /dev/null
+++ b/csrc/cutlass_extensions/gemm/dispatch_policy.hpp
@@ -0,0 +1,39 @@
+#pragma once
+
+#include "cutlass/gemm/dispatch_policy.hpp"
+
+namespace cutlass::gemm {
+
+//////////////////////////////////////////////////////////////////////////////
+
+// FP8 related policies (including Blocked Scaled Accumulation)
+//  `ScaleGranularityM` specifies scaling granularity along M, while zero-value
+//  `ScaleGranularityM` indicates that scaling granularity is
+//  `size<0>(TileShape_MNK{})` along M.
+template <int ScaleGranularityM = 0>
+struct KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum
+    : KernelTmaWarpSpecializedCooperative {};
+
+// n-buffer in smem (Hopper TMA), pipelined with Hopper GMMA and TMA, Warp
+// specialized dynamic schedule For FP8 kernels with Block Scaling
+template <int Stages_, class ClusterShape_ = Shape<_1, _1, _1>,
+          class KernelSchedule = KernelTmaWarpSpecialized,
+          int ScaleGranularityM =
+              0  // `ScaleGranularityM` specifies scaling granularity along M,
+                 // while zero-value `ScaleGranularityM` indicates that scaling
+                 // granularity is `size<0>(TileShape_MNK{})` along M.
+          >
+struct MainloopSm90TmaGmmaWarpSpecializedBlockScalingSubGroupMFP8
+    : MainloopSm90TmaGmmaWarpSpecialized<Stages_, ClusterShape_,
+                                         KernelSchedule> {
+  static_assert(
+      cute::is_same_v<
+          KernelSchedule,
+          KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum<
+              ScaleGranularityM>>,
+      "KernelSchedule must be one of the warp specialized policies");
+};
+
+//////////////////////////////////////////////////////////////////////////////
+
+}  // namespace cutlass::gemm
\ No newline at end of file
diff --git a/csrc/cutlass_extensions/vllm_collective_builder.cuh b/csrc/cutlass_extensions/vllm_collective_builder.cuh
index 085ee1290031f..e7fbba4cd4b0d 100644
--- a/csrc/cutlass_extensions/vllm_collective_builder.cuh
+++ b/csrc/cutlass_extensions/vllm_collective_builder.cuh
@@ -1,6 +1,6 @@
 #pragma once
 
-#include "cutlass/gemm/collective/collective_builder.hpp"
+#include "cutlass_extensions/gemm/collective/collective_builder.hpp"
 
 namespace cutlass::gemm::collective {
 using namespace cute;
diff --git a/csrc/quantization/cutlass_w8a8/c3x/cutlass_gemm_caller.cuh b/csrc/quantization/cutlass_w8a8/c3x/cutlass_gemm_caller.cuh
new file mode 100644
index 0000000000000..9ac7eee7204ec
--- /dev/null
+++ b/csrc/quantization/cutlass_w8a8/c3x/cutlass_gemm_caller.cuh
@@ -0,0 +1,93 @@
+#pragma once
+
+// clang-format will break include orders
+// clang-format off
+#include <torch/all.h>
+
+#include <ATen/cuda/CUDAContext.h>
+
+#include "cutlass/cutlass.h"
+
+#include "cute/tensor.hpp"
+#include "cute/atom/mma_atom.hpp"
+#include "cutlass/numeric_types.h"
+
+#include "cutlass/gemm/device/gemm_universal_adapter.h"
+#include "cutlass/gemm/kernel/gemm_universal.hpp"
+#include "cutlass/epilogue/collective/collective_builder.hpp"
+#include "cutlass/gemm/collective/collective_builder.hpp"
+
+#include "core/math.hpp"
+#include "cutlass_extensions/common.hpp"
+// clang-format on
+
+namespace vllm::c3x {
+
+static inline cute::Shape<int, int, int, int> get_problem_shape(
+    torch::Tensor const& a, torch::Tensor const& b) {
+  int32_t m = a.size(0), n = b.size(1), k = a.size(1);
+  return {m, n, k, 1};
+}
+
+template <typename GemmKernel>
+void cutlass_gemm_caller(torch::Device device,
+                         cute::Shape<int, int, int, int> prob_shape,
+                         typename GemmKernel::MainloopArguments mainloop_args,
+                         typename GemmKernel::EpilogueArguments epilogue_args) {
+  typename GemmKernel::Arguments args{cutlass::gemm::GemmUniversalMode::kGemm,
+                                      prob_shape, mainloop_args, epilogue_args};
+
+  // Launch the CUTLASS GEMM kernel.
+  using GemmOp = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
+  GemmOp gemm_op;
+  CUTLASS_CHECK(gemm_op.can_implement(args));
+
+  size_t workspace_size = gemm_op.get_workspace_size(args);
+  auto const workspace_options =
+      torch::TensorOptions().dtype(torch::kUInt8).device(device);
+  auto workspace = torch::empty(workspace_size, workspace_options);
+
+  auto stream = at::cuda::getCurrentCUDAStream(device.index());
+
+  cutlass::Status status = gemm_op.run(args, workspace.data_ptr(), stream);
+  CUTLASS_CHECK(status);
+}
+
+template <typename Gemm, typename... EpilogueArgs>
+void cutlass_gemm_caller(torch::Tensor& out, torch::Tensor const& a,
+                         torch::Tensor const& b,
+                         EpilogueArgs&&... epilogue_params) {
+  using ElementAB = typename Gemm::ElementAB;
+  using ElementD = typename Gemm::ElementD;
+  using GemmKernel = typename Gemm::GemmKernel;
+
+  int64_t lda = a.stride(0);
+  int64_t ldb = b.stride(1);
+  int64_t ldc = out.stride(0);
+
+  using StrideA = cute::Stride<int64_t, cute::Int<1>, int64_t>;
+  using StrideB = cute::Stride<int64_t, cute::Int<1>, int64_t>;
+  using StrideC = typename Gemm::StrideC;
+
+  StrideA a_stride{lda, cute::Int<1>{}, 0};
+  StrideB b_stride{ldb, cute::Int<1>{}, 0};
+  StrideC c_stride{ldc, cute::Int<1>{}, cute::Int<0>{}};
+
+  typename GemmKernel::ProblemShape prob_shape = get_problem_shape(a, b);
+
+  auto a_ptr = static_cast<ElementAB*>(a.data_ptr());
+  auto b_ptr = static_cast<ElementAB*>(b.data_ptr());
+  typename GemmKernel::MainloopArguments mainloop_args{a_ptr, a_stride, b_ptr,
+                                                       b_stride};
+
+  auto c_ptr = static_cast<ElementD*>(out.data_ptr());
+  typename GemmKernel::EpilogueArguments epilogue_args{
+      Gemm::Epilogue::prepare_args(
+          std::forward<EpilogueArgs>(epilogue_params)...),
+      c_ptr, c_stride, c_ptr, c_stride};
+
+  cutlass_gemm_caller<GemmKernel>(a.device(), prob_shape, mainloop_args,
+                                  epilogue_args);
+}
+
+}  // namespace vllm::c3x
\ No newline at end of file
diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cuh b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm.cuh
similarity index 51%
rename from csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cuh
rename to csrc/quantization/cutlass_w8a8/c3x/scaled_mm.cuh
index d4bc2f0ade50d..9227ebb735245 100644
--- a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cuh
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm.cuh
@@ -2,9 +2,6 @@
 
 // clang-format will break include orders
 // clang-format off
-#include <torch/all.h>
-
-#include <ATen/cuda/CUDAContext.h>
 
 #include "cutlass/cutlass.h"
 
@@ -32,21 +29,6 @@ using namespace cute;
 
 namespace vllm {
 
-// A wrapper for the GEMM kernel that is used to guard against compilation on
-// architectures that will never use the kernel. The purpose of this is to
-// reduce the size of the compiled binary.
-// __CUDA_ARCH__ is not defined in host code, so this lets us smuggle the ifdef
-// into code that will be executed on the device where it is defined.
-template <typename Kernel>
-struct enable_sm90_or_later : Kernel {
-  template <typename... Args>
-  CUTLASS_DEVICE void operator()(Args&&... args) {
-#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 900
-    Kernel::operator()(std::forward<Args>(args)...);
-#endif
-  }
-};
-
 template <typename ElementAB_, typename ElementD_,
           template <typename, typename, typename> typename Epilogue_,
           typename TileShape, typename ClusterShape, typename KernelSchedule,
@@ -101,60 +83,4 @@ struct cutlass_3x_gemm {
   struct GemmKernel : public KernelType {};
 };
 
-template <typename Gemm, typename... EpilogueArgs>
-void cutlass_gemm_caller(torch::Tensor& out, torch::Tensor const& a,
-                         torch::Tensor const& b,
-                         EpilogueArgs&&... epilogue_params) {
-  using ElementAB = typename Gemm::ElementAB;
-  using ElementD = typename Gemm::ElementD;
-
-  int32_t m = a.size(0);
-  int32_t n = b.size(1);
-  int32_t k = a.size(1);
-
-  int64_t lda = a.stride(0);
-  int64_t ldb = b.stride(1);
-  int64_t ldc = out.stride(0);
-
-  using StrideA = Stride<int64_t, Int<1>, int64_t>;
-  using StrideB = Stride<int64_t, Int<1>, int64_t>;
-  using StrideC = typename Gemm::StrideC;
-
-  StrideA a_stride{lda, Int<1>{}, 0};
-  StrideB b_stride{ldb, Int<1>{}, 0};
-  StrideC c_stride{ldc, Int<1>{}, Int<0>{}};
-
-  using GemmKernel = typename Gemm::GemmKernel;
-  typename GemmKernel::ProblemShape prob_shape{m, n, k, 1};
-
-  auto a_ptr = static_cast<ElementAB*>(a.data_ptr());
-  auto b_ptr = static_cast<ElementAB*>(b.data_ptr());
-  typename GemmKernel::MainloopArguments mainloop_args{a_ptr, a_stride, b_ptr,
-                                                       b_stride};
-
-  auto c_ptr = static_cast<ElementD*>(out.data_ptr());
-  typename GemmKernel::EpilogueArguments epilogue_args{
-      Gemm::Epilogue::prepare_args(
-          std::forward<EpilogueArgs>(epilogue_params)...),
-      c_ptr, c_stride, c_ptr, c_stride};
-
-  typename GemmKernel::Arguments args{cutlass::gemm::GemmUniversalMode::kGemm,
-                                      prob_shape, mainloop_args, epilogue_args};
-
-  // Launch the CUTLASS GEMM kernel.
-  using GemmOp = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
-  GemmOp gemm_op;
-  CUTLASS_CHECK(gemm_op.can_implement(args));
-
-  size_t workspace_size = gemm_op.get_workspace_size(args);
-  auto const workspace_options =
-      torch::TensorOptions().dtype(torch::kUInt8).device(a.device());
-  auto workspace = torch::empty(workspace_size, workspace_options);
-
-  auto stream = at::cuda::getCurrentCUDAStream(a.get_device());
-
-  cutlass::Status status = gemm_op.run(args, workspace.data_ptr(), stream);
-  CUTLASS_CHECK(status);
-}
-
 }  // namespace vllm
diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu
new file mode 100644
index 0000000000000..4cd38f4975df7
--- /dev/null
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu
@@ -0,0 +1,24 @@
+#include "scaled_mm_kernels.hpp"
+#include "scaled_mm_sm90_int8_dispatch.cuh"
+#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp"
+
+namespace vllm {
+
+void cutlass_scaled_mm_azp_sm90_int8(torch::Tensor& out, torch::Tensor const& a,
+                                     torch::Tensor const& b,
+                                     torch::Tensor const& a_scales,
+                                     torch::Tensor const& b_scales,
+                                     torch::Tensor const& azp_adj,
+                                     std::optional<torch::Tensor> const& azp,
+                                     std::optional<torch::Tensor> const& bias) {
+  if (azp) {
+    return cutlass_scaled_mm_sm90_int8_epilogue<
+        c3x::ScaledEpilogueBiasAzpToken>(out, a, b, a_scales, b_scales, azp_adj,
+                                         *azp, bias);
+  } else {
+    return cutlass_scaled_mm_sm90_int8_epilogue<c3x::ScaledEpilogueBiasAzp>(
+        out, a, b, a_scales, b_scales, azp_adj, bias);
+  }
+}
+
+}  // namespace vllm
diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu
new file mode 100644
index 0000000000000..0501e6da160e2
--- /dev/null
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu
@@ -0,0 +1,24 @@
+
+#include "scaled_mm_kernels.hpp"
+#include "scaled_mm_blockwise_sm90_fp8_dispatch.cuh"
+#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp"
+
+namespace vllm {
+
+void cutlass_scaled_mm_blockwise_sm90_fp8(torch::Tensor& out,
+                                          torch::Tensor const& a,
+                                          torch::Tensor const& b,
+                                          torch::Tensor const& a_scales,
+                                          torch::Tensor const& b_scales) {
+  if (out.dtype() == torch::kBFloat16) {
+    cutlass_gemm_blockwise_sm90_fp8_dispatch<cutlass::bfloat16_t>(
+        out, a, b, a_scales, b_scales);
+
+  } else {
+    TORCH_CHECK(out.dtype() == torch::kFloat16);
+    cutlass_gemm_blockwise_sm90_fp8_dispatch<cutlass::half_t>(
+        out, a, b, a_scales, b_scales);
+  }
+}
+
+}  // namespace vllm
\ No newline at end of file
diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh
new file mode 100644
index 0000000000000..fb7a82b80ee65
--- /dev/null
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh
@@ -0,0 +1,168 @@
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+
+#include "cute/tensor.hpp"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/gemm/dispatch_policy.hpp"
+#include "cutlass/gemm/collective/collective_builder.hpp"
+#include "cutlass/gemm/device/gemm_universal_adapter.h"
+#include "cutlass/gemm/kernel/gemm_universal.hpp"
+#include "cutlass/gemm/kernel/tile_scheduler_params.h"
+#include "cutlass/epilogue/dispatch_policy.hpp"
+#include "cutlass/epilogue/collective/collective_builder.hpp"
+
+#include "cutlass_extensions/gemm/dispatch_policy.hpp"
+#include "cutlass_extensions/gemm/collective/collective_builder.hpp"
+
+#include "cutlass_gemm_caller.cuh"
+
+namespace vllm {
+
+using namespace cute;
+
+template <typename OutType, int GroupSizeM_, int GroupSizeN_, int GroupSizeK_,
+          int TileSizeM_ = 128, class ClusterShape = Shape<_1, _2, _1>>
+struct cutlass_3x_gemm_fp8_blockwise {
+  using GroupSizeM = Int<GroupSizeM_>;
+  using GroupSizeN = Int<GroupSizeN_>;
+  using GroupSizeK = Int<GroupSizeK_>;
+  using TileSizeM = Int<TileSizeM_>;
+
+  static_assert(TileSizeM_ % GroupSizeM_ == 0,
+                "TileSizeM must be a multiple of GroupSizeM");
+
+  using ElementAB = cutlass::float_e4m3_t;
+
+  using ElementA = ElementAB;
+  using LayoutA = cutlass::layout::RowMajor;
+  static constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;
+
+  using ElementB = ElementAB;
+  using LayoutB = cutlass::layout::ColumnMajor;
+  static constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;
+
+  using ElementD = OutType;
+  using StrideD = Stride<int64_t, Int<1>, Int<0>>;
+  static constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;
+
+  using ElementC = void;
+  using StrideC = StrideD;
+  static constexpr int AlignmentC = AlignmentD;
+
+  using ElementAccumulator = float;
+  using ElementBlockScale = float;
+  using ElementCompute = float;
+  using ArchTag = cutlass::arch::Sm90;
+  using OperatorClass = cutlass::arch::OpClassTensorOp;
+  using TileShape = Shape<TileSizeM, GroupSizeN, GroupSizeK>;
+
+  using KernelSchedule = cutlass::gemm::
+      KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum<
+          GroupSizeM_>;
+  using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecializedCooperative;
+  using EpilogueTileType = cutlass::epilogue::collective::EpilogueTileAuto;
+
+  using StoreEpilogueCompute = typename cutlass::epilogue::fusion::Sm90EVT<
+      cutlass::epilogue::fusion::Sm90AccFetch>;
+
+  using CollectiveEpilogue =
+      typename cutlass::epilogue::collective::CollectiveBuilder<
+          ArchTag, OperatorClass, TileShape, ClusterShape, EpilogueTileType,
+          ElementAccumulator, ElementCompute, ElementC, StrideC, AlignmentC,
+          ElementD, StrideD, AlignmentD, EpilogueSchedule,
+          StoreEpilogueCompute>::CollectiveOp;
+
+  using CollectiveMainloop =
+      typename cutlass::gemm::collective::CollectiveBuilder<
+          ArchTag, OperatorClass, ElementA, LayoutA, AlignmentA, ElementB,
+          LayoutB, AlignmentB, ElementAccumulator, TileShape, ClusterShape,
+          cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(
+              sizeof(typename CollectiveEpilogue::SharedStorage))>,
+          KernelSchedule>::CollectiveOp;
+
+  using KernelType = enable_sm90_or_later<cutlass::gemm::kernel::GemmUniversal<
+      Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue,
+      cutlass::gemm::PersistentScheduler>>;
+
+  struct GemmKernel : public KernelType {};
+
+  using StrideA = typename GemmKernel::StrideA;
+  using StrideB = typename GemmKernel::StrideB;
+};
+
+template <typename Gemm>
+void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,
+                                   torch::Tensor const& b,
+                                   torch::Tensor const& a_scales,
+                                   torch::Tensor const& b_scales) {
+  using GemmKernel = typename Gemm::GemmKernel;
+
+  using ElementAB = typename Gemm::ElementAB;
+  using ElementD = typename Gemm::ElementD;
+
+  auto prob_shape = c3x::get_problem_shape(a, b);
+  int32_t m = get<0>(prob_shape), n = get<1>(prob_shape),
+          k = get<2>(prob_shape);
+
+  int64_t lda = a.stride(0);
+  int64_t ldb = b.stride(1);
+  int64_t ldc = out.stride(0);
+
+  using StrideA = Stride<int64_t, Int<1>, int64_t>;
+  using StrideB = Stride<int64_t, Int<1>, int64_t>;
+  using StrideC = typename Gemm::StrideC;
+
+  StrideA a_stride{lda, Int<1>{}, 0};
+  StrideB b_stride{ldb, Int<1>{}, 0};
+  StrideC c_stride{ldc, Int<1>{}, Int<0>{}};
+
+  auto a_ptr = static_cast<ElementAB*>(a.data_ptr());
+  auto b_ptr = static_cast<ElementAB*>(b.data_ptr());
+  auto a_scales_ptr = static_cast<float*>(a_scales.data_ptr());
+  auto b_scales_ptr = static_cast<float*>(b_scales.data_ptr());
+
+  // Check is the t is contiguous and is 1D or 2D with one of the dimensions
+  // being 1 (i.e. a row or column vector)
+  auto is_contiguous_vector = [](const torch::Tensor& t) {
+    auto t_sizes = t.sizes();
+    return t.is_contiguous() &&
+           (t.dim() == 1 ||
+            (t.dim() == 2 &&
+             *std::min_element(t_sizes.begin(), t_sizes.end()) == 1));
+  };
+
+  // TODO(lucas): lets clean-up the kernel so that we pass in Strides so
+  //  we don't have to deal with enforcing implicit layouts
+  TORCH_CHECK(a_scales.size(0) == m / Gemm::GroupSizeM::value);
+  TORCH_CHECK(a_scales.size(1) == k / Gemm::GroupSizeK::value);
+  TORCH_CHECK(a_scales.stride(0) == 1 || is_contiguous_vector(a_scales),
+              "a_scales must be M major");
+  TORCH_CHECK(b_scales.size(0) == k / Gemm::GroupSizeK::value);
+  TORCH_CHECK(b_scales.size(1) == n / Gemm::GroupSizeN::value);
+  TORCH_CHECK(b_scales.stride(0) == 1 || is_contiguous_vector(b_scales),
+              "b_scales must be K major");
+  typename GemmKernel::MainloopArguments mainloop_args{
+      a_ptr, a_stride, b_ptr, b_stride, a_scales_ptr, b_scales_ptr};
+
+  auto c_ptr = static_cast<ElementD*>(out.data_ptr());
+  typename GemmKernel::EpilogueArguments epilogue_args{
+      {}, c_ptr, c_stride, c_ptr, c_stride};
+
+  c3x::cutlass_gemm_caller<GemmKernel>(a.device(), prob_shape, mainloop_args,
+                                       epilogue_args);
+}
+
+template <typename OutType>
+void cutlass_gemm_blockwise_sm90_fp8_dispatch(torch::Tensor& out,
+                                              torch::Tensor const& a,
+                                              torch::Tensor const& b,
+                                              torch::Tensor const& a_scales,
+                                              torch::Tensor const& b_scales) {
+  cutlass_gemm_caller_blockwise<
+      cutlass_3x_gemm_fp8_blockwise<OutType, 1, 128, 128>>(out, a, b, a_scales,
+                                                           b_scales);
+}
+
+}  // namespace vllm
\ No newline at end of file
diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_kernels.hpp b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_kernels.hpp
new file mode 100644
index 0000000000000..7ede9e067477b
--- /dev/null
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_kernels.hpp
@@ -0,0 +1,33 @@
+#pragma once
+
+#include <torch/all.h>
+
+namespace vllm {
+
+void cutlass_scaled_mm_sm90_fp8(torch::Tensor& out, torch::Tensor const& a,
+                                torch::Tensor const& b,
+                                torch::Tensor const& a_scales,
+                                torch::Tensor const& b_scales,
+                                std::optional<torch::Tensor> const& bias);
+
+void cutlass_scaled_mm_sm90_int8(torch::Tensor& out, torch::Tensor const& a,
+                                 torch::Tensor const& b,
+                                 torch::Tensor const& a_scales,
+                                 torch::Tensor const& b_scales,
+                                 std::optional<torch::Tensor> const& bias);
+
+void cutlass_scaled_mm_azp_sm90_int8(torch::Tensor& out, torch::Tensor const& a,
+                                     torch::Tensor const& b,
+                                     torch::Tensor const& a_scales,
+                                     torch::Tensor const& b_scales,
+                                     torch::Tensor const& azp_adj,
+                                     std::optional<torch::Tensor> const& azp,
+                                     std::optional<torch::Tensor> const& bias);
+
+void cutlass_scaled_mm_blockwise_sm90_fp8(torch::Tensor& out,
+                                          torch::Tensor const& a,
+                                          torch::Tensor const& b,
+                                          torch::Tensor const& a_scales,
+                                          torch::Tensor const& b_scales);
+
+}  // namespace vllm
diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu
new file mode 100644
index 0000000000000..e092c61abc249
--- /dev/null
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu
@@ -0,0 +1,24 @@
+#include "scaled_mm_kernels.hpp"
+#include "scaled_mm_sm90_fp8_dispatch.cuh"
+#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp"
+
+namespace vllm {
+
+void cutlass_scaled_mm_sm90_fp8(torch::Tensor& out, torch::Tensor const& a,
+                                torch::Tensor const& b,
+                                torch::Tensor const& a_scales,
+                                torch::Tensor const& b_scales,
+                                std::optional<torch::Tensor> const& bias) {
+  TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous());
+  if (bias) {
+    TORCH_CHECK(bias->dtype() == out.dtype(),
+                "currently bias dtype must match output dtype ", out.dtype());
+    return cutlass_scaled_mm_sm90_fp8_epilogue<c3x::ScaledEpilogueBias>(
+        out, a, b, a_scales, b_scales, *bias);
+  } else {
+    return cutlass_scaled_mm_sm90_fp8_epilogue<c3x::ScaledEpilogue>(
+        out, a, b, a_scales, b_scales);
+  }
+}
+
+}  // namespace vllm
diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90_fp8_dispatch.cuh b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8_dispatch.cuh
similarity index 76%
rename from csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90_fp8_dispatch.cuh
rename to csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8_dispatch.cuh
index f08419b3122b2..32ea5db3321bc 100644
--- a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90_fp8_dispatch.cuh
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8_dispatch.cuh
@@ -1,6 +1,7 @@
 #pragma once
 
-#include "scaled_mm_c3x.cuh"
+#include "scaled_mm.cuh"
+#include "cutlass_gemm_caller.cuh"
 
 /**
  * This file defines Gemm kernel configurations for SM90 (fp8) based on the Gemm
@@ -9,6 +10,8 @@
 
 namespace vllm {
 
+using c3x::cutlass_gemm_caller;
+
 template <typename InType, typename OutType,
           template <typename, typename, typename> typename Epilogue>
 struct sm90_fp8_config_default {
@@ -93,4 +96,25 @@ inline void cutlass_gemm_sm90_fp8_dispatch(torch::Tensor& out,
   }
 }
 
+template <template <typename, typename, typename> typename Epilogue,
+          typename... EpilogueArgs>
+void cutlass_scaled_mm_sm90_fp8_epilogue(torch::Tensor& out,
+                                         torch::Tensor const& a,
+                                         torch::Tensor const& b,
+                                         EpilogueArgs&&... epilogue_args) {
+  TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn);
+  TORCH_CHECK(b.dtype() == torch::kFloat8_e4m3fn);
+
+  if (out.dtype() == torch::kBFloat16) {
+    return cutlass_gemm_sm90_fp8_dispatch<cutlass::float_e4m3_t,
+                                          cutlass::bfloat16_t, Epilogue>(
+        out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
+  } else {
+    TORCH_CHECK(out.dtype() == torch::kFloat16);
+    return cutlass_gemm_sm90_fp8_dispatch<cutlass::float_e4m3_t,
+                                          cutlass::half_t, Epilogue>(
+        out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
+  }
+}
+
 }  // namespace vllm
\ No newline at end of file
diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu
new file mode 100644
index 0000000000000..021467b8bde8f
--- /dev/null
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu
@@ -0,0 +1,24 @@
+#include "scaled_mm_kernels.hpp"
+#include "scaled_mm_sm90_int8_dispatch.cuh"
+#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp"
+
+namespace vllm {
+
+void cutlass_scaled_mm_sm90_int8(torch::Tensor& out, torch::Tensor const& a,
+                                 torch::Tensor const& b,
+                                 torch::Tensor const& a_scales,
+                                 torch::Tensor const& b_scales,
+                                 std::optional<torch::Tensor> const& bias) {
+  TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous());
+  if (bias) {
+    TORCH_CHECK(bias->dtype() == out.dtype(),
+                "currently bias dtype must match output dtype ", out.dtype());
+    return cutlass_scaled_mm_sm90_int8_epilogue<c3x::ScaledEpilogueBias>(
+        out, a, b, a_scales, b_scales, *bias);
+  } else {
+    return cutlass_scaled_mm_sm90_int8_epilogue<c3x::ScaledEpilogue>(
+        out, a, b, a_scales, b_scales);
+  }
+}
+
+}  // namespace vllm
diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90_int8_dispatch.cuh b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8_dispatch.cuh
similarity index 84%
rename from csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90_int8_dispatch.cuh
rename to csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8_dispatch.cuh
index 34e5fd90ba26a..c4fa18101956b 100644
--- a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90_int8_dispatch.cuh
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8_dispatch.cuh
@@ -1,6 +1,7 @@
 #pragma once
 
-#include "scaled_mm_c3x.cuh"
+#include "scaled_mm.cuh"
+#include "cutlass_gemm_caller.cuh"
 
 /**
  * This file defines Gemm kernel configurations for SM90 (int8) based on the
@@ -9,6 +10,8 @@
 
 namespace vllm {
 
+using c3x::cutlass_gemm_caller;
+
 template <typename InType, typename OutType,
           template <typename, typename, typename> typename Epilogue>
 struct sm90_int8_config_default {
@@ -137,4 +140,24 @@ inline void cutlass_gemm_sm90_int8_dispatch(torch::Tensor& out,
   }
 }
 
+template <template <typename, typename, typename> typename Epilogue,
+          typename... EpilogueArgs>
+void cutlass_scaled_mm_sm90_int8_epilogue(torch::Tensor& out,
+                                          torch::Tensor const& a,
+                                          torch::Tensor const& b,
+                                          EpilogueArgs&&... epilogue_args) {
+  TORCH_CHECK(a.dtype() == torch::kInt8);
+  TORCH_CHECK(b.dtype() == torch::kInt8);
+
+  if (out.dtype() == torch::kBFloat16) {
+    return cutlass_gemm_sm90_int8_dispatch<int8_t, cutlass::bfloat16_t,
+                                           Epilogue>(
+        out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
+  } else {
+    TORCH_CHECK(out.dtype() == torch::kFloat16);
+    return cutlass_gemm_sm90_int8_dispatch<int8_t, cutlass::half_t, Epilogue>(
+        out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
+  }
+}
+
 }  // namespace vllm
\ No newline at end of file
diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu
index e18d7d79e5b77..e6f06d72fbfd4 100644
--- a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu
+++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu
@@ -1,52 +1,13 @@
 #include <cudaTypedefs.h>
+#include "c3x/scaled_mm_kernels.hpp"
 
-#if defined CUDA_VERSION && CUDA_VERSION >= 12000
-
-  #include "scaled_mm_c3x_sm90_fp8_dispatch.cuh"
-  #include "scaled_mm_c3x_sm90_int8_dispatch.cuh"
-
-  #include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp"
-using namespace vllm;
+#include "core/math.hpp"
 
 /*
    This file defines quantized GEMM operations using the CUTLASS 3.x API, for
    NVIDIA GPUs with sm90a (Hopper) or later.
 */
 
-template <template <typename, typename, typename> typename Epilogue,
-          typename... EpilogueArgs>
-void cutlass_scaled_mm_sm90_epilogue(torch::Tensor& out, torch::Tensor const& a,
-                                     torch::Tensor const& b,
-                                     EpilogueArgs&&... epilogue_args) {
-  if (a.dtype() == torch::kInt8) {
-    TORCH_CHECK(b.dtype() == torch::kInt8);
-
-    if (out.dtype() == torch::kBFloat16) {
-      return cutlass_gemm_sm90_int8_dispatch<int8_t, cutlass::bfloat16_t,
-                                             Epilogue>(
-          out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
-    } else {
-      TORCH_CHECK(out.dtype() == torch::kFloat16);
-      return cutlass_gemm_sm90_int8_dispatch<int8_t, cutlass::half_t, Epilogue>(
-          out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
-    }
-  } else {
-    TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn);
-    TORCH_CHECK(b.dtype() == torch::kFloat8_e4m3fn);
-
-    if (out.dtype() == torch::kBFloat16) {
-      return cutlass_gemm_sm90_fp8_dispatch<cutlass::float_e4m3_t,
-                                            cutlass::bfloat16_t, Epilogue>(
-          out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
-    } else {
-      TORCH_CHECK(out.dtype() == torch::kFloat16);
-      return cutlass_gemm_sm90_fp8_dispatch<cutlass::float_e4m3_t,
-                                            cutlass::half_t, Epilogue>(
-          out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
-    }
-  }
-}
-
 void cutlass_scaled_mm_sm90(torch::Tensor& c, torch::Tensor const& a,
                             torch::Tensor const& b,
                             torch::Tensor const& a_scales,
@@ -54,14 +15,50 @@ void cutlass_scaled_mm_sm90(torch::Tensor& c, torch::Tensor const& a,
                             std::optional<torch::Tensor> const& bias) {
   TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
   TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
-  if (bias) {
-    TORCH_CHECK(bias->dtype() == c.dtype(),
-                "currently bias dtype must match output dtype ", c.dtype());
-    return cutlass_scaled_mm_sm90_epilogue<c3x::ScaledEpilogueBias>(
-        c, a, b, a_scales, b_scales, *bias);
+
+  using GroupShape = std::array<int64_t, 2>;
+
+  int M = a.size(0), N = b.size(1), K = a.size(1);
+
+  GroupShape a_scale_group_shape = [&, &s = a_scales]() -> GroupShape {
+    if (s.numel() == 1) return {M, K};  // tensor-wise
+    if (s.dim() == 2)
+      return {ceil_div(a.size(0), s.size(0)), ceil_div(a.size(1), s.size(1))};
+    TORCH_CHECK(false, "Unsupported scale shape for scale_a");
+  }();
+
+  GroupShape b_scale_group_shape = [&, &s = b_scales]() -> GroupShape {
+    if (s.numel() == 1) return {K, N};  // tensor-wise
+    if (s.dim() == 2)
+      return {ceil_div(b.size(0), s.size(0)), ceil_div(b.size(1), s.size(1))};
+    TORCH_CHECK(false, "Unsupported scale shape for scale_b");
+  }();
+
+  if ((a_scale_group_shape == GroupShape{M, K} ||
+       a_scale_group_shape == GroupShape{1, K}) &&
+      (b_scale_group_shape == GroupShape{K, N} ||
+       b_scale_group_shape == GroupShape{K, 1})) {
+    // "standard per-tensor/per-token/per-channel" scaling
+    TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous());
+    if (a.dtype() == torch::kFloat8_e4m3fn) {
+      vllm::cutlass_scaled_mm_sm90_fp8(c, a, b, a_scales, b_scales, bias);
+    } else {
+      TORCH_CHECK(a.dtype() == torch::kInt8);
+      vllm::cutlass_scaled_mm_sm90_int8(c, a, b, a_scales, b_scales, bias);
+    }
+  } else if (a_scale_group_shape == GroupShape{1, 128} &&
+             b_scale_group_shape == GroupShape{128, 128}) {
+    // 1x128 per-token group scales for activations
+    // 128x128 blockwise scales for weights
+    TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn &&
+                    b.dtype() == torch::kFloat8_e4m3fn,
+                "Currently only FP8 is supported for A group shape 1x128 and "
+                "B group shape 128x128");
+    TORCH_CHECK(!bias, "Bias not yet supported blockwise scaled_mm");
+
+    vllm::cutlass_scaled_mm_blockwise_sm90_fp8(c, a, b, a_scales, b_scales);
   } else {
-    return cutlass_scaled_mm_sm90_epilogue<c3x::ScaledEpilogue>(
-        c, a, b, a_scales, b_scales);
+    TORCH_CHECK(false, "Unsupported scale group shapes for CUTLASS 3.x GEMM");
   }
 }
 
@@ -75,13 +72,6 @@ void cutlass_scaled_mm_azp_sm90(torch::Tensor& out, torch::Tensor const& a,
   TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
   TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
 
-  if (azp) {
-    return cutlass_scaled_mm_sm90_epilogue<c3x::ScaledEpilogueBiasAzpToken>(
-        out, a, b, a_scales, b_scales, azp_adj, *azp, bias);
-  } else {
-    return cutlass_scaled_mm_sm90_epilogue<c3x::ScaledEpilogueBiasAzp>(
-        out, a, b, a_scales, b_scales, azp_adj, bias);
-  }
+  vllm::cutlass_scaled_mm_azp_sm90_int8(out, a, b, a_scales, b_scales, azp_adj,
+                                        azp, bias);
 }
-
-#endif
diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
index 3f2b52624f366..da77312bc4b98 100644
--- a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
+++ b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
@@ -89,15 +89,12 @@ void cutlass_scaled_mm(torch::Tensor& c, torch::Tensor const& a,
   TORCH_CHECK(a.dim() == 2 && b.dim() == 2 && c.dim() == 2);
   TORCH_CHECK(c.size(0) == a.size(0) && a.size(1) == b.size(0) &&
               b.size(1) == c.size(1));
-  TORCH_CHECK(a_scales.numel() == 1 || a_scales.numel() == a.size(0));
-  TORCH_CHECK(b_scales.numel() == 1 || b_scales.numel() == b.size(1));
 
   // Check for strides and alignment
   TORCH_CHECK(a.stride(1) == 1 && c.stride(1) == 1);  // Row-major
   TORCH_CHECK(b.stride(0) == 1);                      // Column-major
   TORCH_CHECK(c.stride(0) % 16 == 0 &&
               b.stride(1) % 16 == 0);  // 16 Byte Alignment
-  TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous());
 
   if (bias) {
     TORCH_CHECK(bias->numel() == b.size(1) && bias->is_contiguous() &&
diff --git a/csrc/quantization/machete/machete_mainloop.cuh b/csrc/quantization/machete/machete_mainloop.cuh
index 4071b19a3564d..572894064dc59 100644
--- a/csrc/quantization/machete/machete_mainloop.cuh
+++ b/csrc/quantization/machete/machete_mainloop.cuh
@@ -272,6 +272,10 @@ struct MacheteCollectiveMma {
   using PipelineState = cutlass::PipelineState<DispatchPolicy::Stages>;
 
   using PipelineParams = typename MainloopPipeline::Params;
+
+  // One threads per CTA are producers (1 for operand tile)
+  static constexpr int NumProducerThreadEvents = 1;
+
   using ScaleTileShape = decltype(make_shape(shape<0>(TileShape{}),
                                              shape<1>(SmemLayoutAtomScale{})));
 
diff --git a/tests/kernels/test_cutlass.py b/tests/kernels/test_cutlass.py
index c3eddacec2727..f538d492c2dfa 100644
--- a/tests/kernels/test_cutlass.py
+++ b/tests/kernels/test_cutlass.py
@@ -10,6 +10,7 @@
 from tests.kernels.utils import opcheck
 from vllm import _custom_ops as ops
 from vllm.platforms import current_platform
+from vllm.utils import cdiv
 
 from .utils import baseline_scaled_mm, to_fp8, to_int8
 
@@ -39,6 +40,11 @@
     f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
 ]
 
+# -1 means full extent in that dimension
+TENSORWISE_GROUP_SHAPE = (-1, -1)
+PER_TOKEN_GROUP_SHAPE = (1, -1)
+PER_OUT_CH_GROUP_SHAPE = (-1, 1)
+
 capability = current_platform.get_device_capability()
 capability = capability[0] * 10 + capability[1]
 
@@ -47,11 +53,22 @@ def rand_int8(shape: tuple, device: str = "cuda"):
     return to_int8(torch.rand(shape, device=device) * 255 - 128)
 
 
+def group_scale_helper(shape, group_shape):
+    return [shape[i] if s < 0 else s for i, s in enumerate(group_shape)]
+
+
+def scale_shape(shape, group_shape):
+    assert len(shape) == len(group_shape)
+    group_shape = group_scale_helper(shape, group_shape)
+    return tuple(
+        cdiv(shape[i], group_shape[i]) for i in range(len(group_shape)))
+
+
 def cutlass_fp8_gemm_helper(m: int,
                             n: int,
                             k: int,
-                            per_token_act_quant: bool,
-                            per_out_channel_weight_quant: bool,
+                            a_scale_group_shape: tuple,
+                            b_scale_group_shape: tuple,
                             use_bias: bool,
                             out_dtype: Type[torch.dtype] = torch.bfloat16,
                             device: str = "cuda"):
@@ -60,13 +77,17 @@ def cutlass_fp8_gemm_helper(m: int,
     a = to_fp8(torch.randn((m, k), device=device))
     b = to_fp8(torch.randn((n, k), device=device).t())
 
-    m_a_scales = m if per_token_act_quant else 1
-    n_b_scales = n if per_out_channel_weight_quant else 1
+    a_scales_shape = scale_shape(a.shape, a_scale_group_shape)
+    b_scales_shape = scale_shape(b.shape, b_scale_group_shape)
+
+    scale_a = (torch.randn(a_scales_shape, device=device, dtype=torch.float32))
+    scale_b = (torch.randn(b_scales_shape, device=device, dtype=torch.float32))
+
+    # make scales M-major for blockwise quant, doesn't affect 1D scales
+    scale_a = scale_a.t().contiguous().t()
+    # make scales K-major for blockwise quant, doesn't affect 1D scales
+    scale_b = scale_b.t().contiguous().t()
 
-    scale_a = (torch.randn((m_a_scales, 1), device=device,
-                           dtype=torch.float32))
-    scale_b = (torch.randn((1, n_b_scales), device=device,
-                           dtype=torch.float32))
     if use_bias:
         bias = torch.rand((n, ), device=device, dtype=out_dtype) * 10
     else:
@@ -84,8 +105,8 @@ def cutlass_fp8_gemm_helper(m: int,
 def cutlass_int8_gemm_helper(m: int,
                              n: int,
                              k: int,
-                             per_token_act_quant: bool,
-                             per_out_channel_weight_quant: bool,
+                             a_scale_group_shape: tuple,
+                             b_scale_group_shape: tuple,
                              use_bias: bool,
                              out_dtype: Type[torch.dtype] = torch.bfloat16,
                              device: str = "cuda"):
@@ -94,13 +115,11 @@ def cutlass_int8_gemm_helper(m: int,
     a = to_int8(torch.randn((m, k), device=device) * 5)
     b = to_int8(torch.randn((n, k), device=device).t() * 5)
 
-    m_a_scales = m if per_token_act_quant else 1
-    n_b_scales = n if per_out_channel_weight_quant else 1
+    a_scales_shape = scale_shape(a.shape, a_scale_group_shape)
+    b_scales_shape = scale_shape(b.shape, b_scale_group_shape)
 
-    scale_a = (torch.randn((m_a_scales, 1), device=device,
-                           dtype=torch.float32))
-    scale_b = (torch.randn((1, n_b_scales), device=device,
-                           dtype=torch.float32))
+    scale_a = (torch.randn(a_scales_shape, device=device, dtype=torch.float32))
+    scale_b = (torch.randn(b_scales_shape, device=device, dtype=torch.float32))
 
     if use_bias:
         bias = torch.rand((n, ), device=device, dtype=out_dtype) * 10
@@ -117,82 +136,135 @@ def cutlass_int8_gemm_helper(m: int,
 
 
 @pytest.mark.parametrize("m,n,k", MNK_FACTORS)
-@pytest.mark.parametrize("per_act_token", [True, False])
-@pytest.mark.parametrize("per_out_ch", [True, False])
+@pytest.mark.parametrize("a_scale_group_shape",
+                         [PER_TOKEN_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE])
+@pytest.mark.parametrize("b_scale_group_shape",
+                         [PER_OUT_CH_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE])
 @pytest.mark.parametrize("use_bias", [True, False])
 @pytest.mark.skipif(not current_platform.has_device_capability(89),
                     reason="FP8 is not supported on this GPU type.")
-def test_cutlass_fp8_gemm(m: int, n: int, k: int, per_act_token: bool,
-                          per_out_ch: bool, use_bias: bool):
-    cutlass_fp8_gemm_helper(m, n, k, per_act_token, per_out_ch, use_bias)
+def test_cutlass_fp8_gemm(m: int, n: int, k: int, a_scale_group_shape,
+                          b_scale_group_shape, use_bias: bool):
+    cutlass_fp8_gemm_helper(m, n, k, a_scale_group_shape, b_scale_group_shape,
+                            use_bias)
 
 
 @pytest.mark.parametrize("m,n,k", MNK_FACTORS)
-@pytest.mark.parametrize("per_act_token", [True, False])
-@pytest.mark.parametrize("per_out_ch", [True, False])
+@pytest.mark.parametrize("a_scale_group_shape,b_scale_group_shape",
+                         [((1, 128), (128, 128))])
+@pytest.mark.parametrize("use_bias", [False])
+@pytest.mark.skipif(not current_platform.has_device_capability(90),
+                    reason="FP8 blockwise is not supported on this GPU type.")
+def test_cutlass_fp8_blockwise_scale_gemm(m: int, n: int, k: int,
+                                          a_scale_group_shape,
+                                          b_scale_group_shape, use_bias: bool):
+    if k % b_scale_group_shape[0] != 0 or n % b_scale_group_shape[1] != 0:
+        return
+    if m % a_scale_group_shape[0] != 0 or k % a_scale_group_shape[1] != 0:
+        return
+    cutlass_fp8_gemm_helper(m, n, k, a_scale_group_shape, b_scale_group_shape,
+                            use_bias)
+
+
+@pytest.mark.parametrize("m,n,k", MNK_FACTORS)
+@pytest.mark.parametrize("a_scale_group_shape",
+                         [PER_TOKEN_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE])
+@pytest.mark.parametrize("b_scale_group_shape",
+                         [PER_OUT_CH_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE])
 @pytest.mark.parametrize("use_bias", [True, False])
-def test_cutlass_int8_gemm(m: int, n: int, k: int, per_act_token: bool,
-                           per_out_ch: bool, use_bias: bool):
-    cutlass_int8_gemm_helper(m, n, k, per_act_token, per_out_ch, use_bias)
+def test_cutlass_int8_gemm(m: int, n: int, k: int, a_scale_group_shape,
+                           b_scale_group_shape, use_bias: bool):
+    cutlass_int8_gemm_helper(m, n, k, a_scale_group_shape, b_scale_group_shape,
+                             use_bias)
 
 
-@pytest.mark.parametrize("per_act_token", [True, False])
-@pytest.mark.parametrize("per_out_ch", [True, False])
+@pytest.mark.parametrize("a_scale_group_shape",
+                         [PER_TOKEN_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE])
+@pytest.mark.parametrize("b_scale_group_shape",
+                         [PER_OUT_CH_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE])
 @pytest.mark.parametrize("out_dtype", [torch.bfloat16, torch.float16])
 @pytest.mark.parametrize("use_bias", [True, False])
-def test_cutlass_int8_gemm_output_dtype(per_act_token: bool, per_out_ch: bool,
+def test_cutlass_int8_gemm_output_dtype(a_scale_group_shape,
+                                        b_scale_group_shape,
                                         out_dtype: Type[torch.dtype],
                                         use_bias: bool):
     cutlass_int8_gemm_helper(512,
                              512,
                              512,
-                             per_act_token,
-                             per_out_ch,
+                             a_scale_group_shape,
+                             b_scale_group_shape,
                              use_bias,
                              out_dtype=out_dtype)
 
 
-@pytest.mark.parametrize("per_act_token", [True, False])
-@pytest.mark.parametrize("per_out_ch", [True, False])
+@pytest.mark.parametrize("a_scale_group_shape",
+                         [PER_TOKEN_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE])
+@pytest.mark.parametrize("b_scale_group_shape",
+                         [PER_OUT_CH_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE])
 @pytest.mark.parametrize("out_dtype", [torch.bfloat16, torch.float16])
 @pytest.mark.parametrize("use_bias", [True, False])
 @pytest.mark.skipif(not current_platform.has_device_capability(89),
                     reason="FP8 is not supported on this GPU type.")
-def test_cutlass_fp8_gemm_output_dtype(per_act_token: bool, per_out_ch: bool,
+def test_cutlass_fp8_gemm_output_dtype(a_scale_group_shape,
+                                       b_scale_group_shape,
                                        out_dtype: Type[torch.dtype],
                                        use_bias: bool):
     cutlass_fp8_gemm_helper(512,
                             512,
                             512,
-                            per_act_token,
-                            per_out_ch,
+                            a_scale_group_shape,
+                            b_scale_group_shape,
                             use_bias,
                             out_dtype=out_dtype)
 
 
-@pytest.mark.parametrize("per_act_token", [True, False])
-@pytest.mark.parametrize("per_out_ch", [True, False])
+@pytest.mark.parametrize("a_scale_group_shape,b_scale_group_shape",
+                         [((1, 128), (128, 128))])
+@pytest.mark.parametrize("out_dtype", [torch.bfloat16, torch.float16])
+@pytest.mark.parametrize("use_bias", [False])
+@pytest.mark.skipif(not current_platform.has_device_capability(90),
+                    reason="FP8 blockwise is not supported on this GPU type.")
+def test_cutlass_fp8_blockwise_scale_gemm_dtype(a_scale_group_shape,
+                                                b_scale_group_shape,
+                                                out_dtype: Type[torch.dtype],
+                                                use_bias: bool):
+    cutlass_fp8_gemm_helper(512,
+                            512,
+                            512,
+                            a_scale_group_shape,
+                            b_scale_group_shape,
+                            use_bias,
+                            out_dtype=out_dtype)
+
+
+@pytest.mark.parametrize("a_scale_group_shape",
+                         [PER_TOKEN_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE])
+@pytest.mark.parametrize("b_scale_group_shape",
+                         [PER_OUT_CH_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE])
 @pytest.mark.parametrize("use_bias", [True, False])
 @pytest.mark.parametrize("device", CUDA_DEVICES)
 @pytest.mark.skipif(not current_platform.has_device_capability(89),
                     reason="FP8 is not supported on this GPU type.")
-def test_cutlass_fp8_gemm_devices(per_act_token: bool, per_out_ch: bool,
+def test_cutlass_fp8_gemm_devices(a_scale_group_shape, b_scale_group_shape,
                                   use_bias: bool, device: str):
-    cutlass_fp8_gemm_helper(512, 512, 512, per_act_token, per_out_ch, use_bias,
-                            torch.bfloat16, device)
+    cutlass_fp8_gemm_helper(512, 512, 512, a_scale_group_shape,
+                            b_scale_group_shape, use_bias, torch.bfloat16,
+                            device)
 
 
-@pytest.mark.parametrize("per_act_token", [True, False])
-@pytest.mark.parametrize("per_out_ch", [True, False])
+@pytest.mark.parametrize("a_scale_group_shape",
+                         [PER_TOKEN_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE])
+@pytest.mark.parametrize("b_scale_group_shape",
+                         [PER_OUT_CH_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE])
 @pytest.mark.parametrize("use_bias", [True, False])
 @pytest.mark.parametrize("device", CUDA_DEVICES)
-def test_cutlass_int8_gemm_devices(per_act_token: bool, per_out_ch: bool,
+def test_cutlass_int8_gemm_devices(a_scale_group_shape, b_scale_group_shape,
                                    use_bias: bool, device: str):
     cutlass_int8_gemm_helper(512,
                              512,
                              512,
-                             per_act_token,
-                             per_out_ch,
+                             a_scale_group_shape,
+                             b_scale_group_shape,
                              use_bias,
                              out_dtype=torch.bfloat16,
                              device=device)
@@ -203,28 +275,32 @@ def test_cutlass_int8_gemm_devices(per_act_token: bool, per_out_ch: bool,
 # of a large power of two. In any case, the kernel will have a naive fallback
 # when N and K are not divisible by 16. But M is the number of tokens and the
 # kernel must handle any M thrown at it.
-@pytest.mark.parametrize("per_act_token", [True, False])
-@pytest.mark.parametrize("per_out_ch", [True, False])
+@pytest.mark.parametrize("a_scale_group_shape",
+                         [PER_TOKEN_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE])
+@pytest.mark.parametrize("b_scale_group_shape",
+                         [PER_OUT_CH_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE])
 @pytest.mark.parametrize("use_bias", [True, False])
 @pytest.mark.skipif(not current_platform.has_device_capability(89),
                     reason="FP8 is not supported on this GPU type.")
-def test_cutlass_fp8_gemm_m_sweep(per_act_token: bool, per_out_ch: bool,
+def test_cutlass_fp8_gemm_m_sweep(a_scale_group_shape, b_scale_group_shape,
                                   use_bias: bool):
     for nk in range(32, 128, 32):
         for m in range(1, 128):
-            cutlass_fp8_gemm_helper(m, nk, nk, per_act_token, per_out_ch,
-                                    use_bias)
+            cutlass_fp8_gemm_helper(m, nk, nk, a_scale_group_shape,
+                                    b_scale_group_shape, use_bias)
 
 
-@pytest.mark.parametrize("per_act_token", [True, False])
-@pytest.mark.parametrize("per_out_ch", [True, False])
+@pytest.mark.parametrize("a_scale_group_shape",
+                         [PER_TOKEN_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE])
+@pytest.mark.parametrize("b_scale_group_shape",
+                         [PER_OUT_CH_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE])
 @pytest.mark.parametrize("use_bias", [True, False])
-def test_cutlass_int8_gemm_m_sweep(per_act_token: bool, per_out_ch: bool,
+def test_cutlass_int8_gemm_m_sweep(a_scale_group_shape, b_scale_group_shape,
                                    use_bias: bool):
     for nk in range(32, 128, 32):
         for m in range(1, 128):
-            cutlass_int8_gemm_helper(m, nk, nk, per_act_token, per_out_ch,
-                                     use_bias)
+            cutlass_int8_gemm_helper(m, nk, nk, a_scale_group_shape,
+                                     b_scale_group_shape, use_bias)
 
 
 @pytest.mark.parametrize("m", [32, 64, 128])
diff --git a/tests/kernels/utils.py b/tests/kernels/utils.py
index fb2c9f5d30583..c735c5edd7a36 100644
--- a/tests/kernels/utils.py
+++ b/tests/kernels/utils.py
@@ -1119,8 +1119,36 @@ def baseline_scaled_mm(a: torch.Tensor,
                        scale_b: torch.Tensor,
                        out_dtype: Type[torch.dtype],
                        bias: Optional[torch.Tensor] = None) -> torch.Tensor:
-    output = (scale_a * (scale_b * (torch.mm(
-        a.to(dtype=torch.float32), b.to(dtype=torch.float32))))).to(out_dtype)
+
+    # We treat N-dimensional group scaling as extended numpy-style broadcasting
+    # in numpy simply stretches dimensions with an extent of 1 to match the
+    # the target shape by repeating the data along that dimension (broadcasting)
+    # , we extend these semantics to say if the extent of a dimension in the
+    # source shape is not 1 and does not match the target shape we repeat each
+    # element along that dimension src_shape[dim] // target_shape[dim] times
+    # example if we have:
+    #       a = [[1, 2], and target_shape = (2, 4)
+    #            [3, 4]]
+    # then we would expand a to:
+    #       a = [[1, 1, 2, 2],
+    #            [3, 3, 4, 4]]
+    # NOTE this function this function does not explicitly broadcast dimensions
+    # with an extent of 1, since this can be done implicitly by pytorch
+    def group_broadcast(t, shape):
+        for i, s in enumerate(shape):
+            if t.shape[i] != s and t.shape[i] != 1:
+                assert s % t.shape[i] == 0
+                t = t.unsqueeze(i + 1)\
+                  .expand(*t.shape[:i+1], s // t.shape[i], *t.shape[i+1:])\
+                  .flatten(i, i + 1)
+        return t
+
+    scale_a = group_broadcast(scale_a, a.shape)
+    scale_b = group_broadcast(scale_b, b.shape)
+
+    output = torch.mm((scale_a * a.to(dtype=torch.float32)),
+                      (scale_b * b.to(dtype=torch.float32))).to(out_dtype)
+
     if bias is not None:
         output = output + bias
 
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index 85c1121ed6ff8..4a11b0206e003 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -441,6 +441,28 @@ def cutlass_scaled_mm(a: torch.Tensor,
                       scale_b: torch.Tensor,
                       out_dtype: torch.dtype,
                       bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+    """
+    `cutlass_scaled_mm` implements a fused version of 
+        `output = torch.mm((scale_a * a), (scale_b * b)).to(out_dtype)`
+    where scale_a * a and scale_b * b are implemented using numpy-style 
+    broadcasting. 
+    
+    In order to support blockwise scaling like found in DeepSeek V3 we also 
+    support extended "group" broadcast rules. We extend the numpy-style 
+    broadcasting rules with the following rule: 
+        "if the extent of a dimension in the source shape is between 1 and 
+        corresponding extent in the target shape we repeat each element along 
+        that dimension  src_shape[dim] // target_shape[dim] times consecutively"
+    example if we have:
+          a = [[1, 2], and target_shape = (2, 4)
+               [3, 4]]
+    then we would expand a to:
+          a = [[1, 1, 2, 2],
+               [3, 3, 4, 4]]
+    currently we only support the case:
+        scale_a.shape * [1, 128] == a.shape
+        scale_b.shape * [128, 128] == b.shape
+    """
     assert (b.shape[0] % 16 == 0 and b.shape[1] % 16 == 0)
     assert (out_dtype is torch.bfloat16 or out_dtype is torch.float16)
     assert bias is None or bias.shape[0] == b.shape[

From a1fc18c030e4d0466f2b23cb7dd4d11ce4b85603 Mon Sep 17 00:00:00 2001
From: Aleksandr Malyshev <164964928+maleksan85@users.noreply.github.com>
Date: Thu, 30 Jan 2025 20:24:28 -0800
Subject: [PATCH 440/462] [ROCm][AMD][Model] llama 3.2 support upstreaming
 (#12421)

Signed-off-by: Aleksandr Malyshev <maleksan@amd.com>
Co-authored-by: Aleksandr Malyshev <maleksan@amd.com>
---
 vllm/attention/backends/rocm_flash_attn.py | 376 ++++++++++++++++-----
 vllm/model_executor/models/mllama.py       |  16 +-
 2 files changed, 304 insertions(+), 88 deletions(-)

diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py
index ca6fa9ca61b30..12110ec7356d5 100644
--- a/vllm/attention/backends/rocm_flash_attn.py
+++ b/vllm/attention/backends/rocm_flash_attn.py
@@ -90,6 +90,17 @@ class ROCmFlashAttentionMetadata(AttentionMetadata, PagedAttentionMetadata):
     seq_lens: Optional[List[int]]
     # seq_lens stored as a tensor.
     seq_lens_tensor: Optional[torch.Tensor]
+    # Maximum sequence length among prefill batch. 0 if there are decoding
+    # requests only.
+    max_prefill_seq_len: int
+    # Maximum sequence length among decode batch. 0 if there are prefill
+    # requests only.
+    max_decode_seq_len: int
+
+    # Whether or not if cuda graph is enabled.
+    # Cuda-graph is currently enabled for decoding only.
+    # TODO(woosuk): Move `use_cuda_graph` out since it's unrelated to attention.
+    use_cuda_graph: bool
 
     # NOTE(sang): Definition of context_len, query_len, and seq_len.
     # |---------- N-1 iteration --------|
@@ -100,30 +111,18 @@ class ROCmFlashAttentionMetadata(AttentionMetadata, PagedAttentionMetadata):
     #                                   |-- query_len ---|
 
     # Maximum query length in the batch. None for decoding.
-    max_query_len: Optional[int]
-    # Maximum sequence length among prefill batch. 0 if there are decoding
-    # requests only.
-    max_prefill_seq_len: int
-    # Maximum sequence length among decode batch. 0 if there are prefill
-    # requests only.
-    max_decode_seq_len: int
+    max_query_len: Optional[int] = None
     # (batch_size + 1,). The cumulative subquery lengths of the sequences in
     # the batch, used to index into subquery. E.g., if the subquery length
     # is [4, 6], it is [0, 4, 10].
-    query_start_loc: Optional[torch.Tensor]
+    query_start_loc: Optional[torch.Tensor] = None
     # (batch_size + 1,). The cumulative sequence lengths of the sequences in
     # the batch, used to index into sequence. E.g., if the sequence length is
     # [4, 6], it is [0, 4, 10].
-    seq_start_loc: Optional[torch.Tensor]
-
-    # Whether or not if cuda graph is enabled.
-    # Cuda-graph is currently enabled for decoding only.
-    # TODO(woosuk): Move `use_cuda_graph` out since it's unrelated to attention.
-    use_cuda_graph: bool
-
+    seq_start_loc: Optional[torch.Tensor] = None
     # (batch_size,) A tensor of context lengths (tokens that are computed
     # so far).
-    context_lens_tensor: Optional[torch.Tensor]
+    context_lens_tensor: Optional[torch.Tensor] = None
 
     # Max number of query tokens among request in the batch.
     max_decode_query_len: Optional[int] = None
@@ -131,6 +130,23 @@ class ROCmFlashAttentionMetadata(AttentionMetadata, PagedAttentionMetadata):
     _cached_prefill_metadata: Optional["ROCmFlashAttentionMetadata"] = None
     _cached_decode_metadata: Optional["ROCmFlashAttentionMetadata"] = None
 
+    # Begin encoder attn & enc/dec cross-attn fields...
+
+    # Encoder sequence lengths representation
+    encoder_seq_lens: Optional[List[int]] = None
+    encoder_seq_lens_tensor: Optional[torch.Tensor] = None
+
+    # Maximum sequence length among encoder sequences
+    max_encoder_seq_len: Optional[int] = None
+
+    # Number of tokens input to encoder
+    num_encoder_tokens: Optional[int] = None
+
+    # Cross-attention memory-mapping data structures: slot mapping
+    # and block tables
+    cross_slot_mapping: Optional[torch.Tensor] = None
+    cross_block_tables: Optional[torch.Tensor] = None
+
     @property
     def prefill_metadata(self) -> Optional["ROCmFlashAttentionMetadata"]:
         if self.num_prefills == 0:
@@ -141,10 +157,7 @@ def prefill_metadata(self) -> Optional["ROCmFlashAttentionMetadata"]:
 
         assert self.seq_lens is not None
         assert self.seq_lens_tensor is not None
-        assert self.query_start_loc is not None
-        assert self.context_lens_tensor is not None
         assert self.block_tables is not None
-        assert self.seq_start_loc is not None
 
         self._cached_prefill_metadata = ROCmFlashAttentionMetadata(
             num_prefills=self.num_prefills,
@@ -159,12 +172,20 @@ def prefill_metadata(self) -> Optional["ROCmFlashAttentionMetadata"]:
             max_query_len=self.max_query_len,
             max_prefill_seq_len=self.max_prefill_seq_len,
             max_decode_seq_len=0,
-            query_start_loc=self.query_start_loc[:self.num_prefills + 1],
-            seq_start_loc=self.seq_start_loc[:self.num_prefills + 1],
-            context_lens_tensor=self.context_lens_tensor[:self.num_prefills],
+            query_start_loc=None if self.query_start_loc is None else
+            self.query_start_loc[:self.num_prefills + 1],
+            seq_start_loc=None if self.seq_start_loc is None else
+            self.seq_start_loc[:self.num_prefills + 1],
+            context_lens_tensor=None if self.context_lens_tensor is None else
+            self.context_lens_tensor[:self.num_prefills],
             block_tables=self.block_tables[:self.num_prefills],
             use_cuda_graph=False,
-        )
+            # Begin encoder & cross attn fields below...
+            encoder_seq_lens=self.encoder_seq_lens,
+            encoder_seq_lens_tensor=self.encoder_seq_lens_tensor,
+            max_encoder_seq_len=self.max_encoder_seq_len,
+            cross_slot_mapping=self.cross_slot_mapping,
+            cross_block_tables=self.cross_block_tables)
         return self._cached_prefill_metadata
 
     @property
@@ -194,7 +215,12 @@ def decode_metadata(self) -> Optional["ROCmFlashAttentionMetadata"]:
             context_lens_tensor=None,
             block_tables=self.block_tables[self.num_prefills:],
             use_cuda_graph=self.use_cuda_graph,
-        )
+            # Begin encoder & cross attn fields below...
+            encoder_seq_lens=self.encoder_seq_lens,
+            encoder_seq_lens_tensor=self.encoder_seq_lens_tensor,
+            max_encoder_seq_len=self.max_encoder_seq_len,
+            cross_slot_mapping=self.cross_slot_mapping,
+            cross_block_tables=self.cross_block_tables)
         # Batch may be composed of prefill|decodes, adjust query start indices
         # to refer to the start of decodes when the two are split apart.
         # E.g. in tokens:[3 prefills|6 decodes], query_start_loc=[3,9] => [0,6].
@@ -304,6 +330,97 @@ def _make_alibi_bias(alibi_slopes: torch.Tensor,
     return attn_biases
 
 
+def _get_seq_len_block_table_args(
+    attn_metadata: ROCmFlashAttentionMetadata,
+    attn_type: str,
+) -> tuple:
+    '''
+    The particular choice of sequence-length
+    attributes which should be extracted from attn_metadata is dependent
+    on the type of attention operation.
+
+    Decoder attn -> select entirely decoder self-attention-related fields
+    Encoder/decoder cross-attn -> select encoder sequence lengths
+    Encoder attn -> select encoder sequence lengths fields
+    
+    Arguments:
+
+    * attn_metadata: Attention metadata structure associated with attention op
+    * attn_type: encoder attention, decoder self-attention,
+                encoder/decoder cross-attention
+
+    Returns:
+
+    * Appropriate sequence-lengths tensors for query and key
+    * Appropriate max sequence-length scalar
+    '''
+
+    partial_prefix_sum = 0
+    if attn_type == AttentionType.ENCODER:
+        assert attn_metadata.encoder_seq_lens is not None
+        assert attn_metadata.encoder_seq_lens_tensor is not None
+        query_seq_start_loc = torch.tensor(
+            [0] + [
+                partial_prefix_sum := partial_prefix_sum + i
+                for i in attn_metadata.encoder_seq_lens
+            ],
+            device=attn_metadata.encoder_seq_lens_tensor.device,
+            dtype=attn_metadata.encoder_seq_lens_tensor.dtype)
+        causal_mask = False
+
+        # No block tables associated with encoder attention
+        return (query_seq_start_loc, attn_metadata.max_encoder_seq_len,
+                query_seq_start_loc, attn_metadata.max_encoder_seq_len,
+                attn_metadata.encoder_seq_lens, causal_mask)
+    elif attn_type == AttentionType.DECODER:
+        # Decoder self-attention
+        # Choose max_seq_len based on whether we are in prompt_run
+        assert attn_metadata.seq_lens is not None
+        assert attn_metadata.seq_lens_tensor is not None
+        query_seq_start_loc = torch.tensor(
+            [0] + [
+                partial_prefix_sum := partial_prefix_sum + i
+                for i in attn_metadata.seq_lens
+            ],
+            device=attn_metadata.seq_lens_tensor.device,
+            dtype=attn_metadata.seq_lens_tensor.dtype)
+        max_seq_len = attn_metadata.max_prefill_seq_len
+        causal_mask = True
+
+        return (query_seq_start_loc, max_seq_len, query_seq_start_loc,
+                max_seq_len, attn_metadata.seq_lens, causal_mask)
+    elif attn_type == AttentionType.ENCODER_DECODER:
+        assert attn_metadata.seq_lens is not None
+        assert attn_metadata.encoder_seq_lens_tensor is not None
+        query_start_loc = torch.tensor(
+            [0] + [
+                partial_prefix_sum := partial_prefix_sum + i
+                for i in attn_metadata.seq_lens
+            ],
+            device=attn_metadata.encoder_seq_lens_tensor.device,
+            dtype=attn_metadata.encoder_seq_lens_tensor.dtype)
+
+        partial_prefix_sum = 0
+        assert attn_metadata.encoder_seq_lens is not None
+        assert attn_metadata.seq_lens_tensor is not None
+        key_seq_start_loc = torch.tensor(
+            [0] + [
+                partial_prefix_sum := partial_prefix_sum + i
+                for i in attn_metadata.encoder_seq_lens
+            ],
+            device=attn_metadata.seq_lens_tensor.device,
+            dtype=attn_metadata.seq_lens_tensor.dtype)
+        causal_mask = False
+
+        # Enc/dec cross-attention KVs match encoder sequence length;
+        # cross-attention utilizes special "cross" block tables
+        return (query_start_loc, attn_metadata.max_prefill_seq_len,
+                key_seq_start_loc, attn_metadata.max_encoder_seq_len,
+                attn_metadata.seq_lens, causal_mask)
+    else:
+        raise AttributeError(f"Invalid attention type {str(attn_type)}")
+
+
 class ROCmFlashAttentionImpl(AttentionImpl):
     """
     If the input tensors contain prompt tokens, the layout is as follows:
@@ -346,10 +463,13 @@ def __init__(
         if blocksparse_params is not None:
             raise ValueError(
                 "ROCmFlashAttention does not support blocksparse attention.")
-        if logits_soft_cap is not None:
-            raise ValueError(
-                "ROCmFlashAttention does not support attention logits soft "
-                "capping.")
+
+        if logits_soft_cap is None:
+            # In flash-attn, setting logits_soft_cap as 0 means no soft cap.
+            self.logits_soft_cap = 0.0
+        else:
+            self.logits_soft_cap = logits_soft_cap
+        self.attn_type = attn_type
         self.num_heads = num_heads
         self.head_size = head_size
         self.scale = float(scale)
@@ -374,6 +494,14 @@ def __init__(
         # NOTE: Allow for switching between Triton and CK. Defaulting to triton.
         self.use_triton_flash_attn = envs.VLLM_USE_TRITON_FLASH_ATTN
         if self.use_triton_flash_attn:
+            if logits_soft_cap is not None:
+                raise ValueError(
+                    "ROCm Triton FlashAttention does not support attention"
+                    "logits soft capping."
+                    " please try using the ROCm CK "
+                    "FA backend instead by setting the env var "
+                    "`VLLM_USE_TRITON_FLASH_ATTN=0`")
+
             from vllm.attention.ops.triton_flash_attention import (  # noqa: F401
                 triton_attention)
             self.attn_func = triton_attention
@@ -398,14 +526,13 @@ def __init__(
                     self.use_naive_attn = True
 
             if self.use_naive_attn:
-                self.attn_func = _sdpa_attention
-                logger.debug("Using naive attention in ROCmBackend")
+                if logits_soft_cap is not None:
+                    raise ValueError(
+                        "ROCm Naive FlashAttention does not support"
+                        "attention logits soft capping.")
 
-        if attn_type != AttentionType.DECODER:
-            raise NotImplementedError("Encoder self-attention and "
-                                      "encoder/decoder cross-attention "
-                                      "are not implemented for "
-                                      "ROCmFlashAttentionImpl")
+                self.attn_func = _sdpa_attention
+                logger.debug("Using naive (SDPA) attention in ROCmBackend")
 
     def repeat_kv(self, x: torch.Tensor, n_rep: int) -> torch.Tensor:
         """torch.repeat_interleave(x, dim=1, repeats=n_rep)"""
@@ -427,6 +554,37 @@ def forward(
     ) -> torch.Tensor:
         """Forward pass with FlashAttention and PagedAttention.
 
+        For decoder-only models: query, key and value must be non-None.
+
+        For encoder/decoder models:
+        * ROCmFlashAttentionImpl.forward() may be invoked for both self- and 
+            cross-attention layers.
+        * For self-attention: query, key and value must be non-None.
+        * For cross-attention:
+            * Query must be non-None
+            * During prefill, key and value must be non-None; key and value
+              get cached for use during decode.
+            * During decode, key and value may be None, since:
+              (1) key and value tensors were cached during prefill, and
+              (2) cross-attention key and value tensors do not grow during
+                  decode
+        
+        A note on how the attn_type (attention type enum) argument impacts
+        attention forward() behavior:
+    
+            * DECODER: normal decoder-only behavior;
+                use decoder self-attention block table
+            * ENCODER: no KV caching; pass encoder sequence
+                attributes (encoder_seq_lens/encoder_seq_lens_tensor/
+                max_encoder_seq_len) to kernel, in lieu of decoder
+                sequence attributes (seq_lens/seq_lens_tensor/max_seq_len)
+            * ENCODER_DECODER: cross-attention behavior;
+                use cross-attention block table for caching KVs derived
+                from encoder hidden states; since KV sequence lengths
+                will match encoder sequence lengths, pass encoder sequence
+                attributes to kernel (encoder_seq_lens/encoder_seq_lens_tensor/
+                max_encoder_seq_len)
+
         Args:
             query: shape = [num_tokens, num_heads * head_size]
             key: shape = [num_tokens, num_kv_heads * head_size]
@@ -435,54 +593,80 @@ def forward(
                 NOTE: kv_cache will be an empty tensor with shape [0]
                 for profiling run.
             attn_metadata: Metadata for attention.
+            attn_type: Select attention type, between encoder attention,
+                       decoder self-attention, or encoder/decoder cross-
+                       attention. Defaults to decoder self-attention,
+                       which is the vLLM default generally
         Returns:
             shape = [num_tokens, num_heads * head_size]
         """
-        # Reminder: Please update docs/source/features/compatibility_matrix.md
-        # If the feature combo become valid
-        num_tokens, hidden_size = query.shape
-        # Reshape the query, key, and value tensors.
         query = query.view(-1, self.num_heads, self.head_size)
-        key = key.view(-1, self.num_kv_heads, self.head_size)
-        value = value.view(-1, self.num_kv_heads, self.head_size)
+        if key is not None:
+            assert value is not None
+            key = key.view(-1, self.num_kv_heads, self.head_size)
+            value = value.view(-1, self.num_kv_heads, self.head_size)
+        else:
+            assert value is None
 
-        if kv_cache.numel() > 0:
+        if self.attn_type != AttentionType.ENCODER and kv_cache.numel() > 0:
             key_cache, value_cache = PagedAttention.split_kv_cache(
                 kv_cache, self.num_kv_heads, self.head_size)
 
-            # Reshape the input keys and values and store them in the cache.
-            # If kv_cache is not provided, the new key and value tensors are
-            # not cached. This happens during the initial memory profiling run.
-            PagedAttention.write_to_paged_cache(
-                key,
-                value,
-                key_cache,
-                value_cache,
-                attn_metadata.slot_mapping,
-                self.kv_cache_dtype,
-                layer._k_scale,
-                layer._v_scale,
-            )
-
-        num_prefill_tokens = attn_metadata.num_prefill_tokens
-        num_decode_tokens = attn_metadata.num_decode_tokens
-        assert key.shape[0] == num_prefill_tokens + num_decode_tokens
-        assert value.shape[0] == num_prefill_tokens + num_decode_tokens
+            if key is not None and value is not None:
+                # Reshape the input keys and values and store them in the
+                # cache. If kv_cache is not provided, the new key and value
+                # tensors are not cached. This happens during the initial
+                # memory profiling run.
+                PagedAttention.write_to_paged_cache(
+                    key,
+                    value,
+                    key_cache,
+                    value_cache,
+                    attn_metadata.slot_mapping
+                    if self.attn_type != AttentionType.ENCODER_DECODER else
+                    attn_metadata.cross_slot_mapping,
+                    self.kv_cache_dtype,
+                    layer._k_scale,
+                    layer._v_scale,
+                )
+
+        if self.attn_type != AttentionType.ENCODER:
+            num_prefill_tokens = attn_metadata.num_prefill_tokens
+        else:
+            assert attn_metadata.num_encoder_tokens is not None
+            num_prefill_tokens = attn_metadata.num_encoder_tokens
 
         output = torch.empty_like(query)
         # Query for decode. KV is not needed because it is already cached.
         decode_query = query[num_prefill_tokens:]
         # QKV for prefill.
         query = query[:num_prefill_tokens]
-        key = key[:num_prefill_tokens]
-        value = value[:num_prefill_tokens]
 
-        assert query.shape[0] == num_prefill_tokens
-        assert decode_query.shape[0] == num_decode_tokens
+        if key is not None and value is not None \
+            and self.attn_type != AttentionType.ENCODER_DECODER:
+            key = key[:num_prefill_tokens]
+            value = value[:num_prefill_tokens]
 
         if prefill_meta := attn_metadata.prefill_metadata:
             # Prompt run.
-            assert prefill_meta.seq_lens is not None
+            # normal attention and DECODER
+            if self.attn_type == AttentionType.DECODER and (
+                    kv_cache.numel() == 0 or prefill_meta.block_tables is None
+                    or prefill_meta.block_tables.numel() == 0):
+                (query_seq_start_loc, query_max_seq_len, key_seq_start_loc,
+                 key_max_seq_len, seq_lens,
+                 causal_mask) = (prefill_meta.seq_start_loc,
+                                 prefill_meta.max_prefill_seq_len,
+                                 prefill_meta.seq_start_loc,
+                                 prefill_meta.max_prefill_seq_len,
+                                 attn_metadata.seq_lens, True)
+            # prefix-enabled attention and ENCODER/ENCODER_DECODER
+            else:
+                (query_seq_start_loc, query_max_seq_len, key_seq_start_loc,
+                 key_max_seq_len, seq_lens,
+                 causal_mask) = _get_seq_len_block_table_args(
+                     prefill_meta, self.attn_type)
+            # Prompt run.
             if kv_cache.numel() == 0 or prefill_meta.block_tables.numel() == 0:
                 # triton attention
                 # When block_tables are not filled, it means q and k are the
@@ -493,18 +677,18 @@ def forward(
                         attn_masks = _make_alibi_bias(
                             self.alibi_slopes,
                             query.dtype,
-                            attn_metadata.seq_lens,
+                            seq_lens,
                             make_attn_mask=False)  # type: ignore
                     out, _ = self.attn_func(
                         query,
                         key,
                         value,
                         None,
-                        prefill_meta.seq_start_loc,
-                        prefill_meta.seq_start_loc,
-                        prefill_meta.max_prefill_seq_len,
-                        prefill_meta.max_prefill_seq_len,
-                        True,
+                        query_seq_start_loc,
+                        key_seq_start_loc,
+                        query_max_seq_len,
+                        key_max_seq_len,
+                        causal_mask,
                         self.scale,
                         attn_masks[0][None]
                         if attn_masks is not None else None,
@@ -528,11 +712,12 @@ def forward(
                         query,
                         key,
                         value,
-                        prefill_meta.seq_lens,
-                        num_tokens,
+                        query_seq_start_loc,
+                        num_prefill_tokens,
                         self.num_heads,
                         self.head_size,
                         self.scale,
+                        causal_mask,
                         attn_masks,
                     )
                 else:
@@ -540,19 +725,23 @@ def forward(
                         q=query,
                         k=key,
                         v=value,
-                        cu_seqlens_q=prefill_meta.seq_start_loc,
-                        cu_seqlens_k=prefill_meta.seq_start_loc,
+                        cu_seqlens_q=query_seq_start_loc,
+                        cu_seqlens_k=key_seq_start_loc,
                         max_seqlen_q=prefill_meta.max_prefill_seq_len,
-                        max_seqlen_k=prefill_meta.max_prefill_seq_len,
+                        max_seqlen_k=key_max_seq_len,
                         softmax_scale=self.scale,
                         causal=True,
                         window_size=self.sliding_window,
                         alibi_slopes=self.alibi_slopes,
+                        softcap=self.logits_soft_cap,
                     )
 
                 # common code for prefill
                 assert output[:num_prefill_tokens].shape == out.shape
-                output[:num_prefill_tokens] = out
+                if output.shape[0] > num_prefill_tokens:
+                    output[:num_prefill_tokens] = out
+                else:
+                    output = out
             else:
                 # prefix-enabled attention
                 output[:num_prefill_tokens] = PagedAttention.forward_prefix(
@@ -583,7 +772,10 @@ def forward(
                 decode_query.dtype, head_size, block_size, gqa_ratio,
                 decode_meta.max_decode_seq_len)
             if use_custom:
-                max_seq_len = decode_meta.max_decode_seq_len
+                max_seq_len = (decode_meta.max_decode_seq_len if self.attn_type
+                               != AttentionType.ENCODER_DECODER else
+                               decode_meta.max_encoder_seq_len)
+                assert max_seq_len is not None
                 max_num_partitions = (
                     (max_seq_len + _PARTITION_SIZE_ROCM - 1) //
                     _PARTITION_SIZE_ROCM)
@@ -599,8 +791,12 @@ def forward(
                     device=output.device,
                 )
                 max_logits = torch.empty_like(exp_sums)
+                if num_prefill_tokens > 0:
+                    out = output[num_prefill_tokens:]
+                else:
+                    out = output
                 ops.paged_attention_rocm(
-                    output[num_prefill_tokens:],
+                    out,
                     exp_sums,
                     max_logits,
                     tmp_output,
@@ -609,8 +805,12 @@ def forward(
                     value_cache,
                     self.num_kv_heads,
                     self.scale,
-                    decode_meta.block_tables,
-                    decode_meta.seq_lens_tensor,
+                    decode_meta.block_tables
+                    if self.attn_type != AttentionType.ENCODER_DECODER else
+                    decode_meta.cross_block_tables,
+                    decode_meta.seq_lens_tensor
+                    if self.attn_type != AttentionType.ENCODER_DECODER else
+                    decode_meta.encoder_seq_lens_tensor,
                     block_size,
                     max_seq_len,
                     self.alibi_slopes,
@@ -623,9 +823,15 @@ def forward(
                     decode_query,
                     key_cache,
                     value_cache,
-                    decode_meta.block_tables,
-                    decode_meta.seq_lens_tensor,
-                    decode_meta.max_decode_seq_len,
+                    decode_meta.block_tables
+                    if self.attn_type != AttentionType.ENCODER_DECODER else
+                    decode_meta.cross_block_tables,
+                    decode_meta.seq_lens_tensor
+                    if self.attn_type != AttentionType.ENCODER_DECODER else
+                    decode_meta.encoder_seq_lens_tensor,
+                    decode_meta.max_decode_seq_len
+                    if self.attn_type != AttentionType.ENCODER_DECODER else
+                    decode_meta.max_encoder_seq_len,
                     self.kv_cache_dtype,
                     self.num_kv_heads,
                     self.scale,
@@ -635,7 +841,7 @@ def forward(
                 )
 
         # Reshape the output tensor.
-        return output.view(num_tokens, hidden_size)
+        return output.view(-1, self.num_heads * self.head_size)
 
 
 def _sdpa_attention(
diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py
index 34b8624647ce6..f7f9d7a186d95 100644
--- a/vllm/model_executor/models/mllama.py
+++ b/vllm/model_executor/models/mllama.py
@@ -48,7 +48,8 @@
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
-from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.model_loader.weight_utils import (
+    default_weight_loader, maybe_remap_kv_scale_name)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.sequence import SequenceData
@@ -847,7 +848,8 @@ def _attention_with_mask(
                     i,
                     i,
                 )
-            elif self.attn.backend in (_Backend.XFORMERS, _Backend.TORCH_SDPA):
+            elif self.attn.backend in (_Backend.XFORMERS, _Backend.ROCM_FLASH,
+                                       _Backend.TORCH_SDPA):
                 key_cache, value_cache = PagedAttention.split_kv_cache(
                     kv_cache, self.num_local_key_value_heads, self.head_dim)
                 cached_k = torch.cat([k[s:e] for s, e in kv_range_for_decode])
@@ -859,7 +861,8 @@ def _attention_with_mask(
                 raise ValueError(
                     f"Unsupported Attention backend {self.attn.backend} "
                     "enum found. Expected the Attention backend to be "
-                    "FLASH_ATTN, FLASH_ATTN_VLLM_V1, XFORMERS or TORCH_SDPA.")
+                    "FLASH_ATTN, FLASH_ATTN_VLLM_V1, "
+                    "XFORMERS or TORCH_SDPA.")
 
         # We have to call torch.sdpa for prefill when using a
         # custom cross-attention mask. Because the mask is not a
@@ -1452,6 +1455,13 @@ def load_weights(self, weights: Iterable[Tuple[str,
                 weight_loader(param, loaded_weight, shard_id)
                 break
             else:
+                orig_name = name
+                name = maybe_remap_kv_scale_name(name, params_dict)
+                if name is None:
+                    logger.debug("Missing name %s, orig name %s", name,
+                                 orig_name)
+                    continue
+
                 param = params_dict.pop(name)
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)

From cabaf4eff3c7df30d785769d5a0a1fa1a1c48a8a Mon Sep 17 00:00:00 2001
From: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Date: Fri, 31 Jan 2025 02:49:37 -0500
Subject: [PATCH 441/462] [Attention] MLA decode optimizations (#12528)

Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
Signed-off-by: simon-mo <xmo@berkeley.edu>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Co-authored-by: simon-mo <simon.mo@hey.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Zhuohan Li <zhuohan123@gmail.com>
Co-authored-by: Tyler Michael Smith <tysmith@redhat.com>
Co-authored-by: Alexander Matveev <59768536+alexm-neuralmagic@users.noreply.github.com>
Co-authored-by: simon-mo <xmo@berkeley.edu>
---
 csrc/cache.h                                  |   5 +
 csrc/cache_kernels.cu                         |  95 +++
 csrc/torch_bindings.cpp                       |   9 +
 tests/kernels/test_triton_decode_attention.py |  89 +++
 .../vllm_add_dummy_platform/dummy_platform.py |   2 +-
 tests/weight_loading/models.txt               |   2 +-
 .../run_model_weight_loading_test.sh          |   9 +-
 vllm/_custom_ops.py                           |  13 +
 vllm/attention/backends/abstract.py           |  16 +
 vllm/attention/backends/mla/__init__.py       |   0
 vllm/attention/backends/mla/utils.py          | 365 +++++++++
 vllm/attention/backends/triton_mla.py         | 749 ++++++++++++++++++
 vllm/attention/backends/utils.py              |   4 +
 vllm/attention/layer.py                       |  19 +-
 vllm/attention/ops/triton_decode_attention.py | 667 ++++++++++++++++
 vllm/attention/selector.py                    |   6 +-
 vllm/config.py                                |  35 +-
 vllm/engine/arg_utils.py                      |   1 -
 vllm/envs.py                                  |  14 +
 vllm/model_executor/model_loader/loader.py    |   6 +
 vllm/model_executor/models/deepseek_v2.py     | 154 +++-
 vllm/platforms/cpu.py                         |   3 +-
 vllm/platforms/cuda.py                        |   9 +-
 vllm/platforms/hpu.py                         |   3 +-
 vllm/platforms/interface.py                   |   4 +-
 vllm/platforms/openvino.py                    |   3 +-
 vllm/platforms/rocm.py                        |   3 +-
 vllm/platforms/tpu.py                         |   3 +-
 vllm/platforms/xpu.py                         |   3 +-
 vllm/worker/cache_engine.py                   |   3 +-
 vllm/worker/model_runner.py                   |   4 +-
 31 files changed, 2266 insertions(+), 32 deletions(-)
 create mode 100644 tests/kernels/test_triton_decode_attention.py
 create mode 100644 vllm/attention/backends/mla/__init__.py
 create mode 100644 vllm/attention/backends/mla/utils.py
 create mode 100644 vllm/attention/backends/triton_mla.py
 create mode 100644 vllm/attention/ops/triton_decode_attention.py

diff --git a/csrc/cache.h b/csrc/cache.h
index eedad9fafa3c0..55ed30bd8ce48 100644
--- a/csrc/cache.h
+++ b/csrc/cache.h
@@ -28,6 +28,11 @@ void reshape_and_cache_flash(torch::Tensor& key, torch::Tensor& value,
                              const std::string& kv_cache_dtype,
                              torch::Tensor& k_scale, torch::Tensor& v_scale);
 
+void concat_and_cache_mla(torch::Tensor& kv_c, torch::Tensor& k_pe,
+                          torch::Tensor& kv_cache, torch::Tensor& slot_mapping,
+                          const std::string& kv_cache_dtype,
+                          torch::Tensor& scale);
+
 // Just for unittest
 void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
                  const double scale, const std::string& kv_cache_dtype);
diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu
index 21a0aec0ececc..23a46b6ed8ad8 100644
--- a/csrc/cache_kernels.cu
+++ b/csrc/cache_kernels.cu
@@ -245,6 +245,51 @@ __global__ void reshape_and_cache_flash_kernel(
     }
   }
 }
+
+template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
+__global__ void concat_and_cache_mla_kernel(
+    const scalar_t* __restrict__ kv_c,  // [num_tokens, kv_lora_rank]
+    const scalar_t* __restrict__ k_pe,  // [num_tokens, pe_dim]
+    cache_t* __restrict__ kv_cache,  // [num_blocks, block_size, (kv_lora_rank
+                                     // + pe_dim)]
+    const int64_t* __restrict__ slot_mapping,  // [num_tokens]
+    const int block_stride,                    //
+    const int kv_c_stride,                     //
+    const int k_pe_stride,                     //
+    const int kv_lora_rank,                    //
+    const int pe_dim,                          //
+    const int block_size,                      //
+    const float* scale                         //
+) {
+  const int64_t token_idx = blockIdx.x;
+  const int64_t slot_idx = slot_mapping[token_idx];
+  // NOTE: slot_idx can be -1 if the token is padded
+  if (slot_idx < 0) {
+    return;
+  }
+  const int64_t block_idx = slot_idx / block_size;
+  const int64_t block_offset = slot_idx % block_size;
+
+  auto copy = [&](const scalar_t* __restrict__ src, cache_t* __restrict__ dst,
+                  int src_stride, int dst_stride, int size, int offset) {
+    for (int i = threadIdx.x; i < size; i += blockDim.x) {
+      const int64_t src_idx = token_idx * src_stride + i;
+      const int64_t dst_idx = block_idx * block_stride +
+                              block_offset * (kv_lora_rank + pe_dim) + i +
+                              offset;
+      if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
+        dst[dst_idx] = src[src_idx];
+      } else {
+        dst[dst_idx] =
+            fp8::scaled_convert<cache_t, scalar_t, kv_dt>(src[src_idx], *scale);
+      }
+    }
+  };
+
+  copy(kv_c, kv_cache, kv_c_stride, block_stride, kv_lora_rank, 0);
+  copy(k_pe, kv_cache, k_pe_stride, block_stride, pe_dim, kv_lora_rank);
+}
+
 }  // namespace vllm
 
 // KV_T is the stored data type of kv-cache.
@@ -343,6 +388,56 @@ void reshape_and_cache_flash(
                              CALL_RESHAPE_AND_CACHE_FLASH);
 }
 
+// KV_T is the stored data type of kv-cache.
+// CACHE_T is the data type of key and value tensors.
+// KV_DTYPE is the real data type of kv-cache.
+#define CALL_CONCAT_AND_CACHE_MLA(KV_T, CACHE_T, KV_DTYPE)             \
+  vllm::concat_and_cache_mla_kernel<KV_T, CACHE_T, KV_DTYPE>           \
+      <<<grid, block, 0, stream>>>(                                    \
+          reinterpret_cast<KV_T*>(kv_c.data_ptr()),                    \
+          reinterpret_cast<KV_T*>(k_pe.data_ptr()),                    \
+          reinterpret_cast<CACHE_T*>(kv_cache.data_ptr()),             \
+          slot_mapping.data_ptr<int64_t>(), block_stride, kv_c_stride, \
+          k_pe_stride, kv_lora_rank, pe_dim, block_size,               \
+          reinterpret_cast<const float*>(scale.data_ptr()));
+
+void concat_and_cache_mla(
+    torch::Tensor& kv_c,          // [num_tokens, kv_lora_rank]
+    torch::Tensor& k_pe,          // [num_tokens, pe_dim]
+    torch::Tensor& kv_cache,      // [num_blocks, block_size, (kv_lora_rank +
+                                  // pe_dim)]
+    torch::Tensor& slot_mapping,  // [num_tokens] or [num_actual_tokens]
+    const std::string& kv_cache_dtype, torch::Tensor& scale) {
+  // NOTE(woosuk): In vLLM V1, key.size(0) can be different from
+  // slot_mapping.size(0) because of padding for CUDA graphs.
+  // In vLLM V0, key.size(0) is always equal to slot_mapping.size(0) because
+  // both include padding.
+  // In vLLM V1, however, key.size(0) can be larger than slot_mapping.size(0)
+  // since key includes padding for CUDA graphs, while slot_mapping does not.
+  // In this case, slot_mapping.size(0) represents the actual number of tokens
+  // before padding.
+  // For compatibility with both cases, we use slot_mapping.size(0) as the
+  // number of tokens.
+  int num_tokens = slot_mapping.size(0);
+  int kv_lora_rank = kv_c.size(1);
+  int pe_dim = k_pe.size(1);
+  int block_size = kv_cache.size(1);
+
+  TORCH_CHECK(kv_cache.size(2) == kv_lora_rank + pe_dim);
+
+  int kv_c_stride = kv_c.stride(0);
+  int k_pe_stride = k_pe.stride(0);
+  int block_stride = kv_cache.stride(0);
+
+  dim3 grid(num_tokens);
+  dim3 block(std::min(kv_lora_rank, 512));
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(kv_c));
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  DISPATCH_BY_KV_CACHE_DTYPE(kv_c.dtype(), kv_cache_dtype,
+                             CALL_CONCAT_AND_CACHE_MLA);
+}
+
 namespace vllm {
 
 template <typename Tout, typename Tin, Fp8KVCacheDataType kv_dt>
diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp
index ec63170d511f0..1846d9ac29943 100644
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@@ -463,6 +463,15 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
   cache_ops.impl("reshape_and_cache_flash", torch::kCUDA,
                  &reshape_and_cache_flash);
 
+  // Concat kv_c and k_pe and cache them.
+  cache_ops.def(
+      "concat_and_cache_mla(Tensor kv_c, Tensor k_pe,"
+      "                     Tensor! kv_cache,"
+      "                     Tensor slot_mapping,"
+      "                     str kv_cache_dtype,"
+      "                     Tensor scale) -> ()");
+  cache_ops.impl("concat_and_cache_mla", torch::kCUDA, &concat_and_cache_mla);
+
   // Convert the key and value cache to fp8 data type.
   cache_ops.def(
       "convert_fp8(Tensor! dst_cache, Tensor src_cache, float scale, "
diff --git a/tests/kernels/test_triton_decode_attention.py b/tests/kernels/test_triton_decode_attention.py
new file mode 100644
index 0000000000000..14f5a3b770b69
--- /dev/null
+++ b/tests/kernels/test_triton_decode_attention.py
@@ -0,0 +1,89 @@
+import pytest
+import torch
+
+from vllm.attention.ops.triton_decode_attention import decode_attention_fwd
+
+
+def cdiv(a, b):
+    return (a + b - 1) // b
+
+
+@pytest.mark.parametrize("B", [3, 5])
+@pytest.mark.parametrize("L", [1027, 1025])
+@pytest.mark.parametrize("H_Q", [32])
+@pytest.mark.parametrize("H_KV", [32, 8])
+@pytest.mark.parametrize("D_QK", [128, 192, 576])
+@pytest.mark.parametrize("D_V", [128, 512])
+@pytest.mark.parametrize("CACHE_SIZE", [16384])
+@pytest.mark.parametrize("PAGE_SIZE", [1, 16])
+def test_decode_attention(B, L, H_Q, H_KV, D_QK, D_V, CACHE_SIZE, PAGE_SIZE):
+    assert CACHE_SIZE % PAGE_SIZE == 0
+    dtype = torch.bfloat16
+    seq_len = L  # This represents the number of tokens already in the sequence
+    sm_scale = 1.0 / (D_QK**0.5)
+    num_kv_splits = 8
+
+    num_pages_per_batch = cdiv(seq_len, PAGE_SIZE)
+    req_to_page = torch.randint(0,
+                                CACHE_SIZE // PAGE_SIZE,
+                                (B, num_pages_per_batch, 1),
+                                device="cuda")
+    req_to_token = req_to_page * PAGE_SIZE
+    req_to_token = req_to_token.expand(B, num_pages_per_batch, PAGE_SIZE)
+    req_to_token = req_to_token + torch.arange(PAGE_SIZE, device="cuda").view(
+        1, 1, -1)
+    req_to_token = req_to_token.view(B, -1)
+    req_to_token = req_to_token[:, :seq_len].contiguous()
+
+    # q represents the new token being generated, one per batch
+    q = torch.randn(B, H_Q, D_QK, dtype=dtype, device="cuda")
+
+    # k_buffer and v_buffer represent all previous tokens
+    # Page size is 1.
+    k_buffer = torch.randn(CACHE_SIZE, H_KV, D_QK, dtype=dtype, device="cuda")
+    v_buffer = torch.randn(CACHE_SIZE, H_KV, D_V, dtype=dtype, device="cuda")
+
+    # o will have the same shape as q
+    o = torch.zeros(B, H_Q, D_V, dtype=dtype, device="cuda")
+
+    b_seq_len = torch.full((B, ), seq_len, device="cuda")
+
+    attn_logits = torch.empty(
+        (B, H_Q, num_kv_splits, D_V + 1),
+        dtype=torch.float32,
+        device="cuda",
+    )
+
+    # Call the original implementation.
+    decode_attention_fwd(
+        q,
+        k_buffer,
+        v_buffer,
+        o,
+        req_to_token,
+        b_seq_len,
+        attn_logits,
+        num_kv_splits,
+        sm_scale,
+    )
+
+    # Page size can be larger than 1.
+    k_buffer = k_buffer.view(CACHE_SIZE // PAGE_SIZE, PAGE_SIZE, H_KV, D_QK)
+    v_buffer = v_buffer.view(CACHE_SIZE // PAGE_SIZE, PAGE_SIZE, H_KV, D_V)
+
+    o1 = torch.zeros_like(o)
+
+    decode_attention_fwd(
+        q,
+        k_buffer,
+        v_buffer,
+        o1,
+        req_to_page,
+        b_seq_len,
+        attn_logits,
+        num_kv_splits,
+        sm_scale,
+        PAGE_SIZE,
+    )
+
+    assert torch.allclose(o, o1)
diff --git a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py
index 84721d5971ccf..d7c6bdd707eb7 100644
--- a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py
+++ b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py
@@ -5,5 +5,5 @@ class DummyPlatform(CudaPlatform):
     device_name = "DummyDevice"
 
     def get_attn_backend_cls(self, backend_name, head_size, dtype,
-                             kv_cache_dtype, block_size, use_v1):
+                             kv_cache_dtype, block_size, use_v1, use_mla):
         return "vllm_add_dummy_platform.dummy_attention_backend.DummyAttentionBackend"  # noqa E501
diff --git a/tests/weight_loading/models.txt b/tests/weight_loading/models.txt
index 272206d4502e9..1b797074096ed 100644
--- a/tests/weight_loading/models.txt
+++ b/tests/weight_loading/models.txt
@@ -20,7 +20,7 @@ compressed-tensors, nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test, main
 compressed-tensors, nm-testing/Phi-3-mini-128k-instruct-FP8, main
 compressed-tensors, neuralmagic/Phi-3-medium-128k-instruct-quantized.w4a16, main
 compressed-tensors, nm-testing/TinyLlama-1.1B-Chat-v1.0-actorder-group, main
-compressed-tensors, mgoin/DeepSeek-Coder-V2-Lite-Instruct-FP8, main
+#compressed-tensors, mgoin/DeepSeek-Coder-V2-Lite-Instruct-FP8, main
 compressed-tensors, nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-FP8-Dynamic-testing, main, 90
 compressed-tensors, nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-W8A8-testing, main, 90
 awq, casperhansen/mixtral-instruct-awq, main
diff --git a/tests/weight_loading/run_model_weight_loading_test.sh b/tests/weight_loading/run_model_weight_loading_test.sh
index 693128640e07d..8a899bc154f35 100755
--- a/tests/weight_loading/run_model_weight_loading_test.sh
+++ b/tests/weight_loading/run_model_weight_loading_test.sh
@@ -3,7 +3,7 @@ SUCCESS=0
 
 while getopts "c:" OPT; do
   case ${OPT} in
-    c ) 
+    c )
         CONFIG="$OPTARG"
         ;;
     \? )
@@ -18,9 +18,14 @@ IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < "$CONFIG"
 
 for MODEL_CONFIG in "${MODEL_CONFIGS[@]}"
 do
+    if [[ $MODEL_CONFIG == \#* ]]; then
+        echo "=== SKIPPING MODEL: $MODEL_CONFIG ==="
+        continue
+    fi
+
     LOCAL_SUCCESS=0
     IFS=', ' read -r -a array <<< "$MODEL_CONFIG"
-    
+
     echo "=== RUNNING MODEL: $MODEL_CONFIG ==="
 
     export QUANTIZATION=${array[0]}
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index 4a11b0206e003..fd94134de0219 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -1002,6 +1002,19 @@ def reshape_and_cache_flash(
                                                    v_scale)
 
 
+def concat_and_cache_mla(
+    kv_c: torch.Tensor,
+    k_pe: torch.Tensor,
+    kv_cache: torch.Tensor,
+    slot_mapping: torch.Tensor,
+    kv_cache_dtype: str,
+    scale: torch.Tensor,
+) -> None:
+    torch.ops._C_cache_ops.concat_and_cache_mla(kv_c, k_pe, kv_cache,
+                                                slot_mapping, kv_cache_dtype,
+                                                scale)
+
+
 def copy_blocks(key_caches: List[torch.Tensor],
                 value_caches: List[torch.Tensor],
                 block_mapping: torch.Tensor) -> None:
diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py
index 8027a52b82ffc..b9425f659f7c0 100644
--- a/vllm/attention/backends/abstract.py
+++ b/vllm/attention/backends/abstract.py
@@ -276,3 +276,19 @@ def forward(
         output: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         raise NotImplementedError
+
+
+class MLAAttentionImpl(AttentionImpl[T], Generic[T]):
+
+    @abstractmethod
+    def forward(
+        self,
+        layer: AttentionLayer,
+        hidden_states_or_cq: torch.Tensor,
+        kv_c_normed: torch.Tensor,
+        k_pe: torch.Tensor,
+        kv_cache: torch.Tensor,
+        attn_metadata: T,
+        output: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        raise NotImplementedError
diff --git a/vllm/attention/backends/mla/__init__.py b/vllm/attention/backends/mla/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/vllm/attention/backends/mla/utils.py b/vllm/attention/backends/mla/utils.py
new file mode 100644
index 0000000000000..c6c8a6034e20f
--- /dev/null
+++ b/vllm/attention/backends/mla/utils.py
@@ -0,0 +1,365 @@
+from abc import abstractmethod
+from dataclasses import dataclass
+from typing import Any, Dict, Generic, List, Optional
+
+import torch
+
+from vllm import _custom_ops as ops
+from vllm import envs
+from vllm.attention.backends.abstract import (AttentionLayer,
+                                              AttentionMetadata,
+                                              MLAAttentionImpl, T)
+from vllm.distributed import get_tensor_model_parallel_world_size
+from vllm.model_executor.layers.linear import (ColumnParallelLinear,
+                                               RowParallelLinear)
+from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
+from vllm.vllm_flash_attn import flash_attn_varlen_func
+
+
+@dataclass
+class MLACommonMetadata(AttentionMetadata):
+    # Input positions for rotrary embeddings since for MLA the rotary
+    # position embeddings are applied inside the attention backend
+    input_positions: torch.Tensor
+
+
+class MLACommonImpl(MLAAttentionImpl[T], Generic[T]):
+    """
+    Common class for implementing repeated parts 
+    
+    Main reference: DeepseekV2 paper, and FlashInfer Implementation
+    (https://arxiv.org/abs/2405.04434 and https://github.com/flashinfer-ai/flashinfer/pull/551).
+    
+    Deepseek's MLA attention works the following way:
+    * Use a single latent vector to represent the entire KV cache.
+    * The attention "simulates" a multi-head attention, while the compute is
+      similar to multi-query attention.
+    * The dataflow is as follows,
+
+        * B: batch/sequence length
+        * H: hidden size
+        * N: number of attention heads
+        * Lq: latent dimension for Q
+        * Lkv: latent dimension for K/V
+        * P: nope dimension, P+R is the actual head_dim in common attention.
+        * R: rope dimension, this slide of the head_dim goes through rope.
+        * V: V head dim.
+        * kv_c: latent/compressed KV
+        * q_c: latent/compressed Q
+        
+        #
+        # Outside the MLA attention backend
+        #
+
+        1. The hidden states (B, H) are projected down into cq (B, Lq) and
+           kv_c_k_pe (B, Lkv+R).
+        2. The kv_c_k_pe is split into kv_c (B, Lkv) and k_pe (B, R). cq
+           and kv_c are normalized.
+        
+        #
+        # Inside the MLA attention backend
+        #
+
+        * if prefill:
+        
+        3. The q_c is then projected up into the multi-head version. 
+           * q_c goes from (B, Lq) to (B, N, (P+R)), which is split into q_nope 
+             (B, N, P) and q_pe (B, N, R). 
+        4. q_pe, k_pe are then passed through rotary embeddings.
+        5. kv_c and k_pe are concatenated and inserted into the cache
+        6. The kv_c is then projected up into the multi-head version. 
+           * kv_c goes from (B, Lkv) to (B, N, (P+V)) which has the nope 
+             dimensions for K and V, which is split into k_nope (B, N, P) 
+             and v (B, N, V).
+        7. q (B, N, (P+R)) and k (B, N, (P+R)) matrices are assembled from
+           q_nope, q_pe, k_nope, k_pe.
+        8. Attention is computued with q, k, v.
+        9. The attention computation returns (B, N, V), which is projected back
+           to (B, H) using out projection.
+
+        * if decode:
+
+        3. Here's the change, we do not perform up the full up projection for
+           q_c, and there is no up projection at all for kv_c. This is
+           achieved by the technique of "weight absorption". The paper says
+           "Fortunately, due to the associative law of matrix multiplication,
+           we can absorb WUK into WUQ, and WUV into WO"
+           * The q up projection turns (B, Lq) into (B, N, (P+R)), we split it
+             into W_UQ (Lq, N, P) and W_QR (Lq, N, R).
+           * The kv_c up projection turns (B, Lkv) into (B, N, (P+V)), we split
+             it into W_UK (Lkv, N, P) and W_UV (Lkv, N, V).
+           * The out projection shape W_O (N*V, H) turns (B, N, V) into (B, H).
+           * We can precompute the product of W_UQ and W_UK into
+             W_UQ_UK (Lq, N, Lkv), which is possible due to QK^T operation in
+             attention.
+           * We can precompute the product of W_UV and W_O into
+             W_UV_O (N, Lkv, H), which is possible due to V@O as the
+             "epilogue" of attention
+        4. We still need to compute q_pe (B, N, R) by applying W_QR to q_latent.
+        5. q_pe, k_pe are then passed through rotary embeddings.
+        6. kv_c and k_pe are concatenated and inserted into the cache
+        7. By applying W_UQ_UK to q_latent, we have the new q_nope of shape
+           (B, N, Lkv).
+        8. q (B, N, (Lkv+R)), k (B, (Lkv+R)) are assembled from q_nope, q_pe,
+           kv_a, k_pe. v (B, Lkv) is exactly the same vector as kv_a.
+        9. The attention is computed with q, k, v. Note that we just performed
+           a MQA attention with (LKv+R) as our head dim.
+        10. The KV cache is updated using the new entries k (B, N, (Lkv+R)),
+           which included the v and rope values.
+        11. The attention computation returns (B, N, Lkv), which is projected
+           back to (B, H) using W_UV_O.
+
+    From @tsu-bin's calculation, we only want to use the absorption technique
+    for decode. The prefill algorithm should still use the up-projected MHA
+    for less flops and memory usage.
+    
+    """
+
+    def __init__(
+        self,
+        num_heads: int,
+        head_size: int,
+        scale: float,
+        num_kv_heads: int,
+        alibi_slopes: Optional[List[float]],
+        sliding_window: Optional[int],
+        kv_cache_dtype: str,
+        blocksparse_params: Optional[Dict[str, Any]],
+        logits_soft_cap: Optional[float],
+        attn_type: str,
+        # MLA Specific Arguments
+        q_lora_rank: Optional[int],
+        kv_lora_rank: int,
+        qk_nope_head_dim: int,
+        qk_rope_head_dim: int,
+        qk_head_dim: int,
+        v_head_dim: int,
+        rotary_emb: RotaryEmbedding,
+        # q_proj should be q_b_proj if q_lora_rank is not None, but from an
+        # attention backend perspective we rely on the layer to pass in the
+        # correct matrix
+        q_proj: ColumnParallelLinear,
+        kv_b_proj: ColumnParallelLinear,
+        o_proj: RowParallelLinear,
+    ) -> None:
+        self.num_heads = num_heads
+        self.head_size = head_size
+        self.scale = float(scale)
+        self.num_kv_heads = num_kv_heads
+        self.kv_cache_dtype = kv_cache_dtype
+
+        self.q_lora_rank = q_lora_rank
+        self.kv_lora_rank = kv_lora_rank
+        self.qk_nope_head_dim = qk_nope_head_dim
+        self.qk_rope_head_dim = qk_rope_head_dim
+        self.qk_head_dim = qk_head_dim
+        self.v_head_dim = v_head_dim
+
+        self.rotary_emb = rotary_emb
+        self.q_proj = q_proj
+        self.kv_b_proj = kv_b_proj
+        self.o_proj = o_proj
+
+    def _v_up_proj_and_o_proj(self, x):
+        if envs.VLLM_MLA_PERFORM_MATRIX_ABSORPTION:
+            return self.o_proj_absorbed(
+                x.reshape(-1, self.num_heads * self.kv_lora_rank))[0]
+        else:
+            x = torch.einsum("bnl,lnv->bnv", x, self.W_UV)
+            return self.o_proj(x.reshape(-1,
+                                         self.num_heads * self.v_head_dim))[0]
+
+    def _q_proj_and_k_up_proj(self, x):
+        if envs.VLLM_MLA_PERFORM_MATRIX_ABSORPTION:
+            return torch.matmul(x, self.W_Q_UK)\
+                .view(-1, self.num_heads, self.kv_lora_rank)
+        else:
+            x = torch.matmul(x, self.W_Q)\
+                .view(-1, self.num_heads, self.qk_nope_head_dim)
+            return torch.einsum("bnp,lnp->bnl", x, self.W_UK)\
+                .view(-1, self.num_heads, self.kv_lora_rank)
+
+    def process_weights_after_loading(self):
+        kv_b_proj_weight = self.kv_b_proj.weight.T
+        assert kv_b_proj_weight.shape == (
+            self.kv_lora_rank,
+            self.num_heads * (self.qk_nope_head_dim + self.v_head_dim)), (
+                f"{kv_b_proj_weight.shape=}, "
+                f"{self.kv_lora_rank=}, "
+                f"{self.num_heads=}, "
+                f"{self.qk_nope_head_dim=}, "
+                f"{self.v_head_dim=}")
+        kv_b_proj_weight = kv_b_proj_weight.view(
+            self.kv_lora_rank,
+            self.num_heads,
+            self.qk_nope_head_dim + self.v_head_dim,
+        )
+
+        W_UK, W_UV = kv_b_proj_weight.split(
+            [self.qk_nope_head_dim, self.v_head_dim], dim=-1)
+
+        q_proj = self.q_proj.weight.T\
+                .view(-1, self.num_heads, self.qk_head_dim)
+
+        # can be W_Q or W_UQ depending q_lora_rank, the former if
+        # q_lora_rank is None, the latter otherwise. From the Attention backend
+        # perspective though we call these both W_Q and rely on the layer
+        # to pass in the correct matrix
+        W_Q = q_proj[..., :self.qk_nope_head_dim]
+        self.W_QR = q_proj[..., self.qk_nope_head_dim:]\
+            .flatten(start_dim=1).contiguous()
+
+        if envs.VLLM_MLA_PERFORM_MATRIX_ABSORPTION:
+            #
+            # Perform matrix-absorption following
+            #     https://github.com/flashinfer-ai/flashinfer/pull/551
+            # for decode, as a result we end up with absorbed weights for decode
+            # and another copy of raw weights for prefill.
+            #
+            self.W_UK, self.W_UV = kv_b_proj_weight.split(
+                [self.qk_nope_head_dim, self.v_head_dim], dim=-1)
+            # We absorb `W_UK` into `W_Q` resulting in either W_Q_UK or W_UQ_UK
+            # depending q_lora_rank, the former if q_lora_rank is None, the
+            # latter otherwise
+            # basically if q_lora_rank is none we are absorbing into q_proj
+            # instead of UQ
+            self.W_Q_UK = torch.einsum("qnd,lnd -> qnl", W_Q, W_UK)\
+                .flatten(start_dim=1).contiguous()
+
+            W_O = self.o_proj.weight\
+                .view(-1, self.num_heads, self.v_head_dim)
+            self.W_UV_O = torch.einsum("lnd,hnd -> nlh", W_UV, W_O)\
+                .flatten(start_dim=0, end_dim=1).contiguous()
+
+            tp_size = get_tensor_model_parallel_world_size()
+            self.o_proj_absorbed = RowParallelLinear(
+                self.W_UV_O.shape[0] * tp_size,
+                self.W_UV_O.shape[1],
+                bias=False,
+                # TODO(lucas) figure out how to properly forward quant_method
+                #quant_config=self.o_proj.quant_method,
+            )
+
+            self.o_proj_absorbed.weight = torch.nn.Parameter(self.W_UV_O.T)
+        else:
+            self.W_UV = W_UV
+            self.W_UK = W_UK
+            self.W_Q = W_Q.flatten(start_dim=1)
+
+    @abstractmethod
+    def _forward_prefill(
+        self,
+        q: torch.Tensor,
+        kv_c_normed: torch.Tensor,
+        k_pe: torch.Tensor,
+        attn_metadata: T,
+    ) -> torch.Tensor:
+        raise NotImplementedError
+
+    @abstractmethod
+    def _forward_decode(
+        self,
+        q_nope: torch.Tensor,
+        q_pe: torch.Tensor,
+        kv_cache: torch.Tensor,
+        attn_metadata: T,
+    ) -> torch.Tensor:
+        raise NotImplementedError
+
+    def forward(
+        self,
+        layer: AttentionLayer,
+        hidden_states_or_q_c: torch.Tensor,  # query in unified attn
+        k_c_normed: torch.Tensor,  # key in unified attn
+        k_pe: torch.Tensor,  # value in unified attn
+        kv_cache: torch.Tensor,
+        attn_metadata: T,
+        output: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        if output is not None:
+            raise NotImplementedError(
+                "output is not yet supported for MLAImplBase")
+
+        is_decode = attn_metadata.decode_metadata is not None
+        is_prefill = attn_metadata.prefill_metadata is not None
+
+        if (is_decode and is_prefill):
+            raise NotImplementedError(
+                "chunked prefill is not supported for MLAImplBase")
+
+        # Restore head dim (for rotary embedding)
+        k_pe = k_pe.unsqueeze(1)
+        assert hasattr(attn_metadata, "input_positions")
+
+        if is_decode:
+            q_nope = self._q_proj_and_k_up_proj(hidden_states_or_q_c)
+            q_pe = torch.matmul(hidden_states_or_q_c, self.W_QR)\
+                .view(-1, self.num_heads, self.qk_rope_head_dim)
+            q_pe, k_pe = \
+                self.rotary_emb(attn_metadata.input_positions, q_pe, k_pe)
+        else:
+            assert is_prefill
+            q = self.q_proj(hidden_states_or_q_c)[0]\
+                .view(-1, self.num_heads, self.qk_head_dim)
+
+            # TODO(lucas): there must be a nicer way to write this line
+            q[..., self.qk_nope_head_dim:], k_pe = \
+                self.rotary_emb(
+                    attn_metadata.input_positions,
+                    q[..., self.qk_nope_head_dim:], k_pe)
+
+        # write the latent and rope to kv cache
+        if kv_cache.numel() > 0:
+            ops.concat_and_cache_mla(
+                k_c_normed,
+                k_pe.squeeze(1),
+                kv_cache,
+                attn_metadata.slot_mapping.flatten(),
+                kv_cache_dtype=self.kv_cache_dtype,
+                scale=layer._k_scale,
+            )
+
+        if attn_metadata.prefill_metadata is not None:
+            return self._forward_prefill(q, k_c_normed, k_pe, attn_metadata)
+
+        if attn_metadata.decode_metadata is not None:
+            return self._forward_decode(q_nope, q_pe, kv_cache, attn_metadata)
+
+    # Optional common flash-attn based prefill
+    def _forward_prefill_flash(
+        self,
+        q: torch.Tensor,
+        k_c_normed: torch.Tensor,
+        k_pe: torch.Tensor,
+        seq_start_loc: torch.Tensor,
+        max_prefill_seq_len: int,
+    ) -> torch.Tensor:
+
+        kv_nope = self.kv_b_proj(k_c_normed)[0]\
+            .view(-1, self.num_heads, self.qk_nope_head_dim + self.v_head_dim)
+        k_nope, v = kv_nope\
+            .split([self.qk_nope_head_dim, self.v_head_dim], dim=-1)
+
+        k = torch.cat((k_nope, k_pe.expand((*k_nope.shape[:-1], -1))), dim=-1)
+
+        # For MLA the v head dim is smaller than qk head dim so we pad out
+        # v with 0s to match the qk head dim
+        v_padded = torch.nn.functional.pad(v, [0, q.shape[-1] - v.shape[-1]],
+                                           value=0)
+
+        attn_output = flash_attn_varlen_func(
+            q=q,
+            k=k,
+            v=v_padded,
+            cu_seqlens_q=seq_start_loc,
+            cu_seqlens_k=seq_start_loc,
+            max_seqlen_q=max_prefill_seq_len,
+            max_seqlen_k=max_prefill_seq_len,
+            softmax_scale=self.scale,
+            causal=True,
+        )
+        attn_output = attn_output\
+            .view(-1, self.num_heads, q.shape[-1])[..., :v.shape[-1]]\
+                .reshape(-1, self.num_heads * v.shape[-1])
+
+        return self.o_proj(attn_output)[0]
diff --git a/vllm/attention/backends/triton_mla.py b/vllm/attention/backends/triton_mla.py
new file mode 100644
index 0000000000000..da09bb70b4f1a
--- /dev/null
+++ b/vllm/attention/backends/triton_mla.py
@@ -0,0 +1,749 @@
+from collections import defaultdict
+from contextlib import contextmanager
+from dataclasses import dataclass
+from itertools import accumulate
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type
+
+from vllm.multimodal import MultiModalPlaceholderMap
+
+try:
+    from flashinfer import BatchDecodeMlaWithPagedKVCacheWrapper
+    FLASHINFER_WORKSPACE_BUFFER_SIZE = 256 * 1024 * 1024
+except ImportError:
+    BatchDecodeMlaWithPagedKVCacheWrapper = None
+    FLASHINFER_WORKSPACE_BUFFER_SIZE = 0
+
+import torch
+
+from vllm import _custom_ops as ops
+from vllm.attention.backends.abstract import (AttentionBackend,
+                                              AttentionMetadata,
+                                              AttentionMetadataBuilder,
+                                              AttentionState, AttentionType)
+from vllm.attention.backends.mla.utils import MLACommonImpl, MLACommonMetadata
+from vllm.attention.backends.utils import (PAD_SLOT_ID, compute_slot_mapping,
+                                           compute_slot_mapping_start_idx,
+                                           is_block_tables_empty)
+from vllm.attention.ops.paged_attn import PagedAttention
+from vllm.attention.ops.triton_decode_attention import decode_attention_fwd
+from vllm.utils import async_tensor_h2d, make_tensor_with_pad
+
+if TYPE_CHECKING:
+    from vllm.worker.model_runner import (ModelInputForGPUBuilder,
+                                          ModelInputForGPUWithSamplingMetadata)
+
+
+class TritonMLABackend(AttentionBackend):
+
+    @staticmethod
+    def get_name() -> str:
+        return "TRITON_MLA"
+
+    @staticmethod
+    def get_impl_cls() -> Type["TritonMLAImpl"]:
+        return TritonMLAImpl
+
+    @staticmethod
+    def get_metadata_cls() -> Type["AttentionMetadata"]:
+        return TritonMLAMetadata
+
+    @staticmethod
+    def get_builder_cls() -> Type["TritonMLAMetadataBuilder"]:
+        return TritonMLAMetadataBuilder
+
+    @staticmethod
+    def get_state_cls() -> Type["TritonMLAState"]:
+        return TritonMLAState
+
+    @staticmethod
+    def get_kv_cache_shape(
+            num_blocks: int,
+            block_size: int,
+            num_kv_heads: int,  # assumed to be 1 for MLA
+            kv_lora_rank: int,  # passed via head_size
+    ) -> Tuple[int, ...]:
+        # TODO(lucas): remove hardcoding k_pe size as 1/8th of kv_lora_rank
+        k_pe_size = kv_lora_rank // 8
+        return (num_blocks, block_size, kv_lora_rank + k_pe_size)
+
+    @staticmethod
+    def swap_blocks(
+        src_kv_cache: torch.Tensor,
+        dst_kv_cache: torch.Tensor,
+        src_to_dst: torch.Tensor,
+    ) -> None:
+        PagedAttention.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst)
+
+    @staticmethod
+    def copy_blocks(
+        kv_caches: List[torch.Tensor],
+        src_to_dists: torch.Tensor,
+    ) -> None:
+        PagedAttention.copy_blocks(kv_caches, src_to_dists)
+
+    @staticmethod
+    def get_supported_head_sizes() -> List[int]:
+        return [512]
+
+
+class TritonMLAState(AttentionState):
+
+    def __init__(self, runner):
+        self.runner = runner
+        self._is_graph_capturing = False
+
+    @contextmanager
+    def graph_capture(self, max_batch_size: int):
+        self._is_graph_capturing = True
+
+        self._graph_slot_mapping = torch.full((max_batch_size, ),
+                                              PAD_SLOT_ID,
+                                              dtype=torch.long,
+                                              device=self.runner.device)
+        self._graph_seq_lens = torch.ones(max_batch_size,
+                                          dtype=torch.int32,
+                                          device=self.runner.device)
+        self._graph_block_tables = torch.from_numpy(
+            self.runner.graph_block_tables).to(device=self.runner.device)
+
+        self._positions = torch.zeros((max_batch_size, ),
+                                      dtype=torch.long,
+                                      device=self.runner.device)
+
+        yield
+
+        self._is_graph_capturing = False
+        del self._graph_slot_mapping
+        del self._graph_seq_lens
+        del self._graph_block_tables
+        del self._positions
+
+    def graph_clone(self, batch_size: int):
+        assert self._is_graph_capturing
+        return self.__class__(self.runner)
+
+    def graph_capture_get_metadata_for_batch(
+            self, batch_size: int, is_encoder_decoder_model: bool = False):
+        assert self._is_graph_capturing
+
+        attn_metadata = self.runner.attn_backend.make_metadata(
+            num_prefills=0,
+            num_prefill_tokens=0,
+            num_decode_tokens=batch_size,
+            slot_mapping=self._graph_slot_mapping[:batch_size],
+            multi_modal_placeholder_index_maps=None,
+            enable_kv_scales_calculation=True,
+            seq_lens=None,
+            seq_lens_tensor=self._graph_seq_lens[:batch_size],
+            max_query_len=1,
+            max_decode_query_len=1,
+            max_prefill_seq_len=0,
+            max_decode_seq_len=self.runner.max_seq_len_to_capture,
+            query_start_loc=None,
+            seq_start_loc=None,
+            context_lens_tensor=None,
+            block_tables=self._graph_block_tables[:batch_size],
+            use_cuda_graph=True,
+            input_positions=self._positions[:batch_size],
+            head_dim=self.runner.model_config.get_head_size())
+
+        if is_encoder_decoder_model:
+            raise NotImplementedError(
+                "TritonMLAState does not support encoder/decoder yet")
+
+        return attn_metadata
+
+    def get_graph_input_buffers(self,
+                                attn_metadata,
+                                is_encoder_decoder_model: bool = False):
+        input_buffers = {
+            "slot_mapping": attn_metadata.slot_mapping,
+            "seq_lens_tensor": attn_metadata.decode_metadata.seq_lens_tensor,
+            "block_tables": attn_metadata.decode_metadata.block_tables,
+            "input_positions": attn_metadata.decode_metadata.input_positions,
+        }
+        if is_encoder_decoder_model:
+            raise NotImplementedError(
+                "TritonMLAState does not support encoder/decoder yet")
+
+        return input_buffers
+
+    def prepare_graph_input_buffers(self,
+                                    input_buffers,
+                                    attn_metadata,
+                                    is_encoder_decoder_model: bool = False):
+        input_positions = attn_metadata.input_positions
+        num_positions = input_positions.shape[0]
+        input_buffers["seq_lens_tensor"].copy_(
+            attn_metadata.decode_metadata.seq_lens_tensor, non_blocking=True)
+        input_buffers["block_tables"].copy_(
+            attn_metadata.decode_metadata.block_tables, non_blocking=True)
+        # CUDA graph buffer is padded so only perform a partial copy based on
+        # num_positions
+        input_buffers["input_positions"][:num_positions].copy_(
+            input_positions, non_blocking=True)
+        if is_encoder_decoder_model:
+            raise NotImplementedError(
+                "TritonMLAState does not support encoder/decoder yet")
+
+    def begin_forward(self, model_input):
+        return
+
+
+@dataclass
+class TritonMLAMetadata(MLACommonMetadata):
+    """Metadata for TritonMLAMetadata.
+
+    NOTE: Any python object stored here is not updated when it is
+    cuda-graph replayed. If you have values that need to be changed
+    dynamically, it should be stored in tensor. The tensor has to be
+    updated from `CUDAGraphRunner.forward` API.
+    """
+    # (batch_size,). The sequence length per sequence. Sequence length means
+    # the computed tokens + new tokens None if it is a decoding.
+    seq_lens: Optional[List[int]]
+    # seq_lens stored as a tensor.
+    seq_lens_tensor: Optional[torch.Tensor]
+
+    # NOTE(sang): Definition of context_len, query_len, and seq_len.
+    # |---------- N-1 iteration --------|
+    # |---------------- N iteration ---------------------|
+    # |- tokenA -|......................|-- newTokens ---|
+    # |---------- context_len ----------|
+    # |-------------------- seq_len ---------------------|
+    #                                   |-- query_len ---|
+
+    # Maximum sequence length among prefill batch. 0 if there are decoding
+    # requests only.
+    max_prefill_seq_len: int
+    # Maximum sequence length among decode batch. 0 if there are prefill
+    # requests only.
+    max_decode_seq_len: int
+    # (batch_size,) A tensor of context lengths (tokens that are computed
+    # so far).
+    context_lens_tensor: Optional[torch.Tensor]
+
+    # (batch_size, max_blocks_per_seq).
+    # Block addresses per sequence. (Seq id -> list of physical block)
+    # E.g., [0, 1, 2] means tokens are stored in 0th, 1st, and 2nd blocks
+    # in the kv cache. Each block can contain up to block_size tokens.
+    # 2nd dimensions are padded up to max_blocks_per_seq if it is cuda-graph
+    # captured.
+    block_tables: Optional[torch.Tensor]
+
+    # Whether or not if cuda graph is enabled.
+    # Cuda-graph is currently enabled for decoding only.
+    # TODO(woosuk): Move `use_cuda_graph` out since it's unrelated to attention.
+
+    use_cuda_graph: bool
+
+    # Maximum query length in the batch.
+    max_query_len: Optional[int] = None
+
+    # Max number of query tokens among request in the batch.
+    max_decode_query_len: Optional[int] = None
+
+    # (batch_size + 1,). The cumulative subquery lengths of the sequences in
+    # the batch, used to index into subquery. E.g., if the subquery length
+    # is [4, 6], it is [0, 4, 10].
+    query_start_loc: Optional[torch.Tensor] = None
+    # (batch_size + 1,). The cumulative sequence lengths of the sequences in
+    # the batch, used to index into sequence. E.g., if the sequence length is
+    # [4, 6], it is [0, 4, 10].
+    seq_start_loc: Optional[torch.Tensor] = None
+
+    _cached_prefill_metadata: Optional["TritonMLAMetadata"] = None
+    _cached_decode_metadata: Optional["TritonMLAMetadata"] = None
+
+    num_prefill_tokens: int
+
+    num_kv_splits: int = 4  # TODO(lucas) add heuristic
+    attn_logits: Optional[torch.Tensor] = None
+    req_idx: Optional[torch.Tensor] = None
+
+    # The dimension of the attention heads
+    head_dim: Optional[int] = None
+
+    def __post_init__(self):
+        supported_head_sizes = TritonMLABackend.get_supported_head_sizes()
+        if self.head_dim is not None and self.head_dim \
+                not in supported_head_sizes:
+            raise ValueError(
+                f"Only {supported_head_sizes} are supported for head_dim,",
+                f"received {self.head_dim}.")
+
+    @property
+    def prefill_metadata(self) -> Optional["TritonMLAMetadata"]:
+        if self.num_prefills == 0:
+            return None
+
+        if self._cached_prefill_metadata is not None:
+            return self._cached_prefill_metadata
+
+        assert self.seq_lens is not None
+        assert self.seq_lens_tensor is not None
+
+        # Compute some attn_metadata fields which default to None
+        query_start_loc = (None if self.query_start_loc is None else
+                           self.query_start_loc[:self.num_prefills + 1])
+        slot_mapping = (None if self.slot_mapping is None else
+                        self.slot_mapping[:self.num_prefill_tokens])
+        seq_lens = (None if self.seq_lens is None else
+                    self.seq_lens[:self.num_prefills])
+        seq_lens_tensor = (None if self.seq_lens_tensor is None else
+                           self.seq_lens_tensor[:self.num_prefills])
+        seq_start_loc = (None if self.seq_start_loc is None else
+                         self.seq_start_loc[:self.num_prefills + 1])
+        context_lens_tensor = (None if self.context_lens_tensor is None else
+                               self.context_lens_tensor[:self.num_prefills])
+        block_tables = (None if self.block_tables is None else
+                        self.block_tables[:self.num_prefills])
+        input_positions = (None if self.input_positions is None else
+                           self.input_positions[:self.num_prefill_tokens])
+
+        self._cached_prefill_metadata = TritonMLAMetadata(
+            num_prefills=self.num_prefills,
+            num_prefill_tokens=self.num_prefill_tokens,
+            num_decode_tokens=0,
+            slot_mapping=slot_mapping,
+            multi_modal_placeholder_index_maps=self.
+            multi_modal_placeholder_index_maps,
+            enable_kv_scales_calculation=self.enable_kv_scales_calculation,
+            input_positions=input_positions,
+            seq_lens=seq_lens,
+            seq_lens_tensor=seq_lens_tensor,
+            max_query_len=self.max_query_len,
+            max_prefill_seq_len=self.max_prefill_seq_len,
+            max_decode_query_len=0,
+            max_decode_seq_len=0,
+            query_start_loc=query_start_loc,
+            seq_start_loc=seq_start_loc,
+            context_lens_tensor=context_lens_tensor,
+            block_tables=block_tables,
+            use_cuda_graph=False,
+            head_dim=self.head_dim)
+        return self._cached_prefill_metadata
+
+    @property
+    def decode_metadata(self) -> Optional["TritonMLAMetadata"]:
+        if self.num_decode_tokens == 0:
+            return None
+
+        if self._cached_decode_metadata is not None:
+            return self._cached_decode_metadata
+        assert self.seq_lens_tensor is not None
+
+        # Compute some attn_metadata fields which default to None
+        slot_mapping = (None if self.slot_mapping is None else
+                        self.slot_mapping[self.num_prefill_tokens:])
+        seq_lens_tensor = (None if self.seq_lens_tensor is None else
+                           self.seq_lens_tensor[self.num_prefills:])
+        block_tables = (None if self.block_tables is None else
+                        self.block_tables[self.num_prefills:])
+        input_positions = (None if self.input_positions is None else
+                           self.input_positions[self.num_prefill_tokens:])
+
+        self._cached_decode_metadata = TritonMLAMetadata(
+            num_prefills=0,
+            num_prefill_tokens=0,
+            num_decode_tokens=self.num_decode_tokens,
+            slot_mapping=slot_mapping,
+            multi_modal_placeholder_index_maps=None,
+            enable_kv_scales_calculation=True,
+            seq_lens=None,
+            seq_lens_tensor=seq_lens_tensor,
+            max_decode_query_len=self.max_decode_query_len,
+            max_query_len=self.max_query_len,
+            max_prefill_seq_len=0,
+            max_decode_seq_len=self.max_decode_seq_len,
+            # Batch may be composed of prefill|decodes, adjust query start
+            # indices to refer to the start of decodes. E.g.
+            # in tokens:[3 prefills|6 decodes], query_start_loc=[3,9] => [0,6].
+            query_start_loc=(self.query_start_loc[self.num_prefills:] -
+                             self.query_start_loc[self.num_prefills])
+            if self.query_start_loc is not None else None,
+            seq_start_loc=self.seq_start_loc[self.num_prefills:]
+            if self.seq_start_loc is not None else None,
+            context_lens_tensor=None,
+            block_tables=block_tables,
+            use_cuda_graph=self.use_cuda_graph,
+            input_positions=input_positions,
+            head_dim=self.head_dim)
+        return self._cached_decode_metadata
+
+    def advance_step(self,
+                     model_input: "ModelInputForGPUWithSamplingMetadata",
+                     sampled_token_ids: Optional[torch.Tensor],
+                     block_size: int,
+                     num_seqs: int,
+                     num_queries: int,
+                     turn_prefills_into_decodes: bool = False):
+        """
+        Update metadata in-place to advance one decode step.
+        """
+        # When using cudagraph, the num_seqs is padded to the next captured
+        # batch sized, but num_queries tracks the actual number of requests in
+        # the batch. For --enforce-eager mode, num_seqs == num_queries
+        if num_seqs != num_queries:
+            assert num_seqs > num_queries
+            assert self.use_cuda_graph
+
+        if turn_prefills_into_decodes:
+            # When Mutli-Step is enabled with Chunked-Prefill, prefills and
+            # decodes are scheduled together. In the first step, all the
+            # prefills turn into decodes. This update reflects that
+            # conversion.
+            assert self.num_decode_tokens + self.num_prefills == num_seqs
+            self.num_decode_tokens += self.num_prefills
+            self.num_prefills = 0
+            self.num_prefill_tokens = 0
+            self.max_prefill_seq_len = 0
+            self.max_query_len = 1
+
+            self.slot_mapping = self.slot_mapping[:num_seqs]
+        else:
+            assert self.seq_lens is not None
+            assert self.max_decode_seq_len == max(self.seq_lens)
+
+        assert self.num_prefills == 0
+        assert self.num_prefill_tokens == 0
+        assert self.num_decode_tokens == num_seqs
+        assert self.slot_mapping.shape == (num_seqs, )
+
+        assert self.seq_lens is not None
+        assert len(self.seq_lens) == num_seqs
+        assert self.seq_lens_tensor is not None
+        assert self.seq_lens_tensor.shape == (num_seqs, )
+        assert self.max_query_len == 1
+        assert self.max_prefill_seq_len == 0
+
+        assert self.query_start_loc is not None
+        assert self.query_start_loc.shape == (num_queries + 1, )
+        assert self.seq_start_loc is not None
+        assert self.seq_start_loc.shape == (num_seqs + 1, )
+
+        assert self.context_lens_tensor is not None
+        assert self.context_lens_tensor.shape == (num_queries, )
+
+        assert self.block_tables is not None
+        assert self.block_tables.shape[0] == num_seqs
+
+        # Update query lengths. Note that we update only queries and not seqs,
+        # since tensors may be padded due to captured cuda graph batch size
+        for i in range(num_queries):
+            self.seq_lens[i] += 1
+        self.max_decode_seq_len = max(self.seq_lens)
+
+        ops.advance_step_flashattn(num_seqs=num_seqs,
+                                   num_queries=num_queries,
+                                   block_size=block_size,
+                                   input_tokens=model_input.input_tokens,
+                                   sampled_token_ids=sampled_token_ids,
+                                   input_positions=model_input.input_positions,
+                                   seq_lens=self.seq_lens_tensor,
+                                   slot_mapping=self.slot_mapping,
+                                   block_tables=self.block_tables)
+
+
+class TritonMLAMetadataBuilder(AttentionMetadataBuilder[TritonMLAMetadata]):
+
+    def __init__(self, input_builder: "ModelInputForGPUBuilder"):
+        self.input_builder = input_builder
+        self.runner = input_builder.runner
+        self.sliding_window = input_builder.sliding_window
+        self.block_size = input_builder.block_size
+
+    def prepare(self):
+        self.slot_mapping: List[int] = []
+        self.prefill_seq_lens: List[int] = []
+        self.context_lens: List[int] = []
+        self.block_tables: List[List[int]] = []
+        self.curr_seq_lens: List[int] = []
+        self.input_positions: List[int] = []
+        self.multimodal_placeholder_maps: Dict[
+            str,
+            MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap)
+        self.num_prefills = 0
+        self.num_prefill_tokens = 0
+        self.num_decode_tokens = 0
+        self.has_prefix_cache_hit = False
+
+    def _add_seq_group(
+            self, inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup",
+            chunked_prefill_enabled: bool, prefix_cache_hit: bool):
+        """Add a sequence group to the metadata. Specifically update/append
+        1. context length.
+        2. block table.
+        3. slot mapping.
+        """
+        is_prompt = inter_data.is_prompt
+        block_tables = inter_data.block_tables
+
+        for (seq_id, token_len, seq_len, curr_seq_len, query_len, context_len,
+             curr_sliding_window_block, input_positions) in zip(
+                 inter_data.seq_ids, [len(t) for t in inter_data.input_tokens],
+                 inter_data.orig_seq_lens, inter_data.seq_lens,
+                 inter_data.query_lens, inter_data.context_lens,
+                 inter_data.curr_sliding_window_blocks,
+                 inter_data.input_positions):
+            self.input_positions.extend(input_positions)
+            self.context_lens.append(context_len)
+            if is_prompt:
+                mm_maps = inter_data.multi_modal_placeholder_maps
+                if mm_maps:
+                    for modality, placeholders in mm_maps.items():
+                        self.multimodal_placeholder_maps[modality].extend(
+                            placeholders)
+
+                self.num_prefills += 1
+                self.num_prefill_tokens += token_len
+                self.prefill_seq_lens.append(seq_len)
+            else:
+                self.num_decode_tokens += query_len
+                self.curr_seq_lens.append(curr_seq_len)
+
+            # Compute block table.
+            # TODO(sang): Combine chunked prefill and prefix caching by
+            # only allowing multiple of block_size chunk size.
+            # NOTE: This only works for oooooooxxx style attention.
+            block_table = []
+            if prefix_cache_hit:
+                # NOTE(woosuk): For flash-attn, the block table should
+                # include the entries for the incoming prefill tokens.
+                block_table = block_tables[seq_id]
+            elif ((chunked_prefill_enabled or not is_prompt)
+                  and block_tables is not None):
+                if curr_sliding_window_block == 0:
+                    block_table = block_tables[seq_id]
+                else:
+                    block_table = block_tables[seq_id][
+                        -curr_sliding_window_block:]
+            self.block_tables.append(block_table)
+
+            # Compute slot mapping.
+            is_profile_run = is_block_tables_empty(block_tables)
+            start_idx = compute_slot_mapping_start_idx(is_prompt, query_len,
+                                                       context_len,
+                                                       self.sliding_window)
+            compute_slot_mapping(is_profile_run, self.slot_mapping, seq_id,
+                                 seq_len, context_len, start_idx,
+                                 self.block_size, inter_data.block_tables)
+
+    def _get_graph_runner_block_tables(
+            self, num_seqs: int,
+            block_tables: List[List[int]]) -> torch.Tensor:
+        # The shape of graph_block_tables is
+        # [max batch size, max context len // block size].
+        max_batch_size, max_blocks = self.runner.graph_block_tables.shape
+        assert max_batch_size >= num_seqs
+
+        graph_block_tables = self.runner.graph_block_tables[:num_seqs]
+        for i, block_table in enumerate(block_tables):
+            if block_table:
+                num_blocks = len(block_table)
+                if num_blocks <= max_blocks:
+                    graph_block_tables[i, :num_blocks] = block_table
+                else:
+                    # It may be possible to have more blocks allocated due
+                    # to lookahead slots of multi-step, however, they are
+                    # not used anyway, so can be safely ignored.
+                    graph_block_tables[
+                        i, :max_blocks] = block_table[:max_blocks]
+
+        return torch.from_numpy(graph_block_tables).to(
+            device=self.runner.device, non_blocking=True)
+
+    def build(self, seq_lens: List[int], query_lens: List[int],
+              cuda_graph_pad_size: int, batch_size: int):
+        """Build attention metadata with on-device tensors.
+
+        Args:
+            seq_lens: The maybe padded sequence lengths of the input sequences.
+            query_lens: The query lengths of the input sequences.
+            cuda_graph_pad_size: The padding size for cuda graph.
+                                 -1 if cuda graph is not used.
+            batch_size: The maybe padded batch size.
+        """
+        prefix_cache_hit = any([
+            inter_data.prefix_cache_hit
+            for inter_data in self.input_builder.inter_data_list
+        ])
+        for inter_data in self.input_builder.inter_data_list:
+            self._add_seq_group(inter_data,
+                                self.input_builder.chunked_prefill_enabled,
+                                prefix_cache_hit)
+
+        device = self.runner.device
+        use_captured_graph = cuda_graph_pad_size != -1
+
+        max_query_len = max(query_lens)
+        decode_query_lens = query_lens[self.num_prefills:]
+        if len(decode_query_lens) > 0:
+            max_decode_query_len = max(decode_query_lens)
+        else:
+            max_decode_query_len = 1
+        max_prefill_seq_len = max(self.prefill_seq_lens, default=0)
+        max_decode_seq_len = max(self.curr_seq_lens, default=0)
+        num_decode_tokens = self.num_decode_tokens
+        query_start_loc = list(accumulate(query_lens, initial=0))
+        seq_start_loc = list(accumulate(seq_lens, initial=0))
+
+        num_seqs = len(seq_lens)
+        if use_captured_graph:
+            self.slot_mapping.extend([PAD_SLOT_ID] * cuda_graph_pad_size)
+            self.block_tables.extend([] * cuda_graph_pad_size)
+            num_decode_tokens = batch_size - self.num_prefill_tokens
+            block_tables = self._get_graph_runner_block_tables(
+                num_seqs, self.block_tables)
+        else:
+            block_tables = make_tensor_with_pad(
+                self.block_tables,
+                pad=0,
+                dtype=torch.int,
+                device=device,
+            )
+        assert max_query_len > 0, ("query_lens: {}".format(query_lens))
+
+        assert device is not None
+        context_lens_tensor = async_tensor_h2d(self.context_lens, torch.int,
+                                               device, self.runner.pin_memory)
+        seq_lens_tensor = async_tensor_h2d(seq_lens, torch.int, device,
+                                           self.runner.pin_memory)
+        input_positions = async_tensor_h2d(self.input_positions, torch.long,
+                                           device, self.runner.pin_memory)
+        slot_mapping_tensor = async_tensor_h2d(self.slot_mapping, torch.long,
+                                               device, self.runner.pin_memory)
+        query_start_loc_tensor = async_tensor_h2d(query_start_loc, torch.int32,
+                                                  device,
+                                                  self.runner.pin_memory)
+        seq_start_loc_tensor = async_tensor_h2d(seq_start_loc, torch.int32,
+                                                device, self.runner.pin_memory)
+        placeholder_index_maps = {
+            modality: placeholder_map.index_map()
+            for modality, placeholder_map in
+            self.multimodal_placeholder_maps.items()
+        }
+
+        num_kv_splits = 8
+
+        return TritonMLAMetadata(
+            num_prefills=self.num_prefills,
+            slot_mapping=slot_mapping_tensor,
+            num_prefill_tokens=self.num_prefill_tokens,
+            num_decode_tokens=num_decode_tokens,
+            seq_lens=seq_lens,
+            multi_modal_placeholder_index_maps=placeholder_index_maps,
+            enable_kv_scales_calculation=True,
+            input_positions=input_positions,
+            seq_lens_tensor=seq_lens_tensor,
+            max_query_len=max_query_len,
+            max_decode_query_len=max_decode_query_len,
+            max_prefill_seq_len=max_prefill_seq_len,
+            max_decode_seq_len=max_decode_seq_len,
+            query_start_loc=query_start_loc_tensor,
+            seq_start_loc=seq_start_loc_tensor,
+            context_lens_tensor=context_lens_tensor,
+            block_tables=block_tables,
+            use_cuda_graph=use_captured_graph,
+            num_kv_splits=num_kv_splits,
+            head_dim=self.runner.model_config.get_head_size(),
+        )
+
+
+class TritonMLAImpl(MLACommonImpl[TritonMLAMetadata]):
+
+    def __init__(
+            self,
+            num_heads: int,
+            head_size: int,
+            scale: float,
+            num_kv_heads: int,
+            alibi_slopes: Optional[List[float]],
+            sliding_window: Optional[int],
+            kv_cache_dtype: str,
+            blocksparse_params: Optional[Dict[str, Any]],
+            logits_soft_cap: Optional[float],
+            attn_type: str,
+            # MLA Specific Arguments
+            **kwargs) -> None:
+        super().__init__(num_heads, head_size, scale, num_kv_heads,
+                         alibi_slopes, sliding_window, kv_cache_dtype,
+                         blocksparse_params, logits_soft_cap, attn_type,
+                         **kwargs)
+
+        unsupported_features = [
+            alibi_slopes, sliding_window, blocksparse_params, logits_soft_cap
+        ]
+        if any(unsupported_features):
+            raise NotImplementedError(
+                "TritonMLAImpl does not support one of the following: "
+                "alibi_slopes, sliding_window, blocksparse_params, "
+                "logits_soft_cap")
+
+        if attn_type != AttentionType.DECODER:
+            raise NotImplementedError("Encoder self-attention and "
+                                      "encoder/decoder cross-attention "
+                                      "are not implemented for "
+                                      "TritonMLAImpl")
+
+    def _forward_prefill(
+        self,
+        q: torch.Tensor,
+        kv_c_normed: torch.Tensor,
+        k_pe: torch.Tensor,
+        attn_metadata: TritonMLAMetadata,
+    ) -> torch.Tensor:
+        assert isinstance(attn_metadata, TritonMLAMetadata)
+        return self._forward_prefill_flash(q, kv_c_normed, k_pe,
+                                           attn_metadata.seq_start_loc,
+                                           attn_metadata.max_prefill_seq_len)
+
+    def _forward_decode(
+        self,
+        q_nope: torch.Tensor,
+        q_pe: torch.Tensor,
+        kv_c_and_k_pe_cache: torch.Tensor,
+        attn_metadata: TritonMLAMetadata,
+    ) -> torch.Tensor:
+        assert kv_c_and_k_pe_cache.numel() > 0
+        if self.kv_cache_dtype.startswith("fp8"):
+            raise NotImplementedError("FP8 Triton MLA not yet supported")
+
+        decode_meta = attn_metadata.decode_metadata
+        assert decode_meta is not None
+        B = q_nope.shape[0]
+
+        q = torch.cat([q_nope, q_pe], dim=-1)
+        o = torch.zeros(B,
+                        self.num_heads,
+                        self.kv_lora_rank,
+                        dtype=q.dtype,
+                        device=q.device)
+
+        # TODO(lucas) Allocate ahead of time
+        attn_logits = torch.empty(
+            (
+                B,
+                self.num_heads,
+                attn_metadata.num_kv_splits,
+                # NOTE(lucas) idk why the +1 is here but sglang has it so we
+                # just mirror that
+                self.kv_lora_rank + 1,
+            ),
+            dtype=torch.float32,
+            device=q.device,
+        )
+
+        # Add a head dim of 1
+        kv_c_and_k_pe_cache = kv_c_and_k_pe_cache.unsqueeze(2)
+        kv_c_cache = kv_c_and_k_pe_cache[..., :self.kv_lora_rank]
+        PAGE_SIZE = kv_c_and_k_pe_cache.size(1)
+
+        # Run MQA
+        decode_attention_fwd(q, kv_c_and_k_pe_cache, kv_c_cache, o,
+                             decode_meta.block_tables,
+                             decode_meta.seq_lens_tensor, attn_logits,
+                             attn_metadata.num_kv_splits, self.scale,
+                             PAGE_SIZE)
+
+        return self._v_up_proj_and_o_proj(o)
diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py
index 84fe89b7df360..7f2fe7e831064 100644
--- a/vllm/attention/backends/utils.py
+++ b/vllm/attention/backends/utils.py
@@ -289,7 +289,9 @@ def __init__(self, runner: "ModelRunnerBase"):
 
     @contextmanager
     def graph_capture(self, max_batch_size: int):
+
         self._is_graph_capturing = True
+
         self._graph_slot_mapping = torch.full((max_batch_size, ),
                                               PAD_SLOT_ID,
                                               dtype=torch.long,
@@ -299,7 +301,9 @@ def graph_capture(self, max_batch_size: int):
                                           device=self.runner.device)
         self._graph_block_tables = torch.from_numpy(
             self.runner.graph_block_tables).to(device=self.runner.device)
+
         yield
+
         self._is_graph_capturing = False
         del self._graph_slot_mapping
         del self._graph_seq_lens
diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index 962c45a65ae23..9b804a29a485d 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -41,8 +41,10 @@ def __init__(
         blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
         per_layer_sliding_window: Optional[int] = None,
+        use_mla: bool = False,
         prefix: str = "",
         attn_type: str = AttentionType.DECODER,
+        **extra_impl_args,
     ) -> None:
         super().__init__()
         if per_layer_sliding_window is not None:
@@ -101,13 +103,18 @@ def __init__(
         # During model initialization, the default dtype is set as the model
         # weight and activation dtype.
         dtype = torch.get_default_dtype()
-        attn_backend = get_attn_backend(head_size, dtype, kv_cache_dtype,
-                                        block_size, is_attention_free,
-                                        blocksparse_params is not None)
+        attn_backend = get_attn_backend(head_size,
+                                        dtype,
+                                        kv_cache_dtype,
+                                        block_size,
+                                        is_attention_free,
+                                        blocksparse_params is not None,
+                                        use_mla=use_mla)
         impl_cls = attn_backend.get_impl_cls()
         self.impl = impl_cls(num_heads, head_size, scale, num_kv_heads,
                              alibi_slopes, sliding_window, kv_cache_dtype,
-                             blocksparse_params, logits_soft_cap, attn_type)
+                             blocksparse_params, logits_soft_cap, attn_type,
+                             **extra_impl_args)
         self.num_heads = num_heads
         self.head_size = head_size
         self.num_kv_heads = num_kv_heads
@@ -193,6 +200,10 @@ def extra_repr(self) -> str:
         s += f", backend={self.impl.__class__.__name__}"
         return s
 
+    def process_weights_after_loading(self):
+        if hasattr(self.impl, "process_weights_after_loading"):
+            self.impl.process_weights_after_loading()
+
 
 class MultiHeadAttention(nn.Module):
     """Multi-headed attention without any cache, used for ViT."""
diff --git a/vllm/attention/ops/triton_decode_attention.py b/vllm/attention/ops/triton_decode_attention.py
new file mode 100644
index 0000000000000..675df109b6c0e
--- /dev/null
+++ b/vllm/attention/ops/triton_decode_attention.py
@@ -0,0 +1,667 @@
+# Adapted from
+# https://github.com/sgl-project/sglang/blob/9f635ea50de920aa507f486daafba26a5b837574/python/sglang/srt/layers/attention/triton_ops/decode_attention.py
+# which was originally adapted from
+# https://github.com/ModelTC/lightllm/blob/96353e868a840db4d103138caf15ed9dbea8c186/lightllm/models/deepseek2/triton_kernel/gqa_flash_decoding_stage1.py
+# https://github.com/ModelTC/lightllm/blob/96353e868a840db4d103138caf15ed9dbea8c186/lightllm/models/deepseek2/triton_kernel/gqa_flash_decoding_stage2.py
+
+# Changes:
+# - Add support for page size >= 1.
+
+# Copyright 2025 vLLM Team
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""
+Memory-efficient attention for decoding.
+It supports page size >= 1.
+"""
+
+import logging
+
+import triton
+import triton.language as tl
+
+from vllm.platforms import current_platform
+
+is_hip_ = current_platform.is_rocm()
+
+logger = logging.getLogger(__name__)
+
+# TODO: Remove this when triton>=3.2.0. This issue will not affect performance
+# and accuracy.
+logger.warning(
+    "The following error message 'operation scheduled before its operands' "
+    "can be ignored.")
+
+
+@triton.jit
+def tanh(x):
+    # Tanh is just a scaled sigmoid
+    return 2 * tl.sigmoid(2 * x) - 1
+
+
+@triton.jit
+def _fwd_kernel_stage1(
+    Q,
+    K_Buffer,
+    V_Buffer,
+    sm_scale,
+    Req_to_tokens,
+    B_Seqlen,
+    Att_Out,
+    stride_req_to_tokens_b,
+    stride_qbs,
+    stride_qh,
+    stride_buf_kbs,
+    stride_buf_kh,
+    stride_buf_vbs,
+    stride_buf_vh,
+    stride_mid_ob,
+    stride_mid_oh,
+    stride_mid_os,
+    kv_group_num: tl.constexpr,
+    BLOCK_DMODEL: tl.constexpr,
+    BLOCK_DV: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    NUM_KV_SPLITS: tl.constexpr,
+    PAGE_SIZE: tl.constexpr,
+    logit_cap: tl.constexpr,
+    Lk: tl.constexpr,
+    Lv: tl.constexpr,
+):
+    cur_batch = tl.program_id(0)
+    cur_head = tl.program_id(1)
+    split_kv_id = tl.program_id(2)
+
+    cur_kv_head = cur_head // kv_group_num
+
+    offs_d = tl.arange(0, BLOCK_DMODEL)
+    offs_dv = tl.arange(0, BLOCK_DV)
+    mask_d = offs_d < Lk
+    mask_dv = offs_dv < Lv
+    cur_batch_seq_len = tl.load(B_Seqlen + cur_batch)
+    cur_batch_req_idx = cur_batch
+
+    off_q = cur_batch * stride_qbs + cur_head * stride_qh + offs_d
+    q = tl.load(Q + off_q, mask=mask_d, other=0.0)
+
+    kv_len_per_split = tl.cdiv(cur_batch_seq_len, NUM_KV_SPLITS)
+    split_kv_start = kv_len_per_split * split_kv_id
+    split_kv_end = tl.minimum(split_kv_start + kv_len_per_split,
+                              cur_batch_seq_len)
+
+    e_max = -float("inf")
+    e_sum = 0.0
+    acc = tl.zeros([BLOCK_DV], dtype=tl.float32)
+
+    if split_kv_end > split_kv_start:
+        for start_n in range(split_kv_start, split_kv_end, BLOCK_N):
+            offs_n = start_n + tl.arange(0, BLOCK_N)
+            kv_page_number = tl.load(
+                Req_to_tokens + stride_req_to_tokens_b * cur_batch_req_idx +
+                offs_n // PAGE_SIZE,
+                mask=offs_n < split_kv_end,
+                other=0,
+            )
+            kv_loc = kv_page_number * PAGE_SIZE + offs_n % PAGE_SIZE
+            offs_buf_k = (kv_loc[:, None] * stride_buf_kbs +
+                          cur_kv_head * stride_buf_kh + offs_d[None, :])
+            k = tl.load(
+                K_Buffer + offs_buf_k,
+                mask=(offs_n[:, None] < split_kv_end) & (mask_d[None, :]),
+                other=0.0,
+            )
+            qk = tl.sum(q[None, :] * k, 1)
+            qk *= sm_scale
+
+            if logit_cap > 0:
+                qk = logit_cap * tanh(qk / logit_cap)
+
+            qk = tl.where(offs_n < split_kv_end, qk, float("-inf"))
+
+            offs_buf_v = (kv_loc[:, None] * stride_buf_vbs +
+                          cur_kv_head * stride_buf_vh + offs_dv[None, :])
+            v = tl.load(
+                V_Buffer + offs_buf_v,
+                mask=(offs_n[:, None] < split_kv_end) & (mask_dv[None, :]),
+                other=0.0,
+            )
+
+            n_e_max = tl.maximum(tl.max(qk, 0), e_max)
+            re_scale = tl.exp(e_max - n_e_max)
+            p = tl.exp(qk - n_e_max)
+            acc *= re_scale
+            acc += tl.sum(p[:, None] * v, 0)
+
+            e_sum = e_sum * re_scale + tl.sum(p, 0)
+            e_max = n_e_max
+
+        offs_mid_o = (cur_batch * stride_mid_ob + cur_head * stride_mid_oh +
+                      split_kv_id * stride_mid_os + offs_dv)
+
+        tl.store(
+            Att_Out + offs_mid_o,
+            acc / e_sum,
+            mask=(mask_dv),
+        )
+
+        offs_mid_o_1 = (cur_batch * stride_mid_ob + cur_head * stride_mid_oh +
+                        split_kv_id * stride_mid_os + Lv)
+
+        tl.store(
+            Att_Out + offs_mid_o_1,
+            e_max + tl.log(e_sum),
+        )
+
+
+def _decode_att_m_fwd(
+    q,
+    k_buffer,
+    v_buffer,
+    att_out,
+    Req_to_tokens,
+    B_Seqlen,
+    num_kv_splits,
+    sm_scale,
+    page_size,
+    logit_cap,
+):
+    BLOCK = 64
+    NUM_KV_SPLITS = num_kv_splits
+    Lk = k_buffer.shape[-1]
+    Lv = v_buffer.shape[-1]
+
+    batch, head_num = q.shape[0], q.shape[1]
+
+    grid = (batch, head_num, NUM_KV_SPLITS)
+    kv_group_num = q.shape[1] // k_buffer.shape[-2]
+
+    num_warps = 4 if kv_group_num == 1 else 2
+
+    BLOCK_DMODEL = triton.next_power_of_2(Lk)
+    BLOCK_DV = triton.next_power_of_2(Lv)
+
+    _fwd_kernel_stage1[grid](
+        q,
+        k_buffer,
+        v_buffer,
+        sm_scale,
+        Req_to_tokens,
+        B_Seqlen,
+        att_out,
+        Req_to_tokens.stride(0),
+        q.stride(0),
+        q.stride(1),
+        k_buffer.stride(-2),
+        k_buffer.stride(-1),
+        v_buffer.stride(-2),
+        v_buffer.stride(-1),
+        att_out.stride(0),
+        att_out.stride(1),
+        att_out.stride(2),
+        kv_group_num=kv_group_num,
+        BLOCK_DMODEL=BLOCK_DMODEL,
+        BLOCK_DV=BLOCK_DV,
+        BLOCK_N=BLOCK,
+        NUM_KV_SPLITS=NUM_KV_SPLITS,
+        PAGE_SIZE=page_size,
+        logit_cap=logit_cap,
+        num_warps=num_warps,
+        num_stages=2,
+        Lk=Lk,
+        Lv=Lv,
+    )
+
+
+@triton.jit
+def _fwd_grouped_kernel_stage1(
+    Q,
+    K_Buffer,
+    V_Buffer,
+    sm_scale,
+    Req_to_tokens,
+    B_Seqlen,
+    Att_Out,
+    stride_req_to_tokens_b,
+    stride_qbs,
+    stride_qh,
+    stride_buf_kbs,
+    stride_buf_kh,
+    stride_buf_vbs,
+    stride_buf_vh,
+    stride_mid_ob,
+    stride_mid_oh,
+    stride_mid_os,
+    kv_group_num: tl.constexpr,
+    q_head_num: tl.constexpr,
+    BLOCK_DMODEL: tl.constexpr,
+    BLOCK_DPE: tl.constexpr,
+    BLOCK_DV: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_H: tl.constexpr,
+    NUM_KV_SPLITS: tl.constexpr,
+    PAGE_SIZE: tl.constexpr,
+    logit_cap: tl.constexpr,
+    Lk: tl.constexpr,
+    Lv: tl.constexpr,
+):
+    cur_batch = tl.program_id(0)
+    cur_head_id = tl.program_id(1)
+    cur_kv_head = cur_head_id // tl.cdiv(kv_group_num, BLOCK_H)
+    split_kv_id = tl.program_id(2)
+
+    if kv_group_num > BLOCK_H:
+        VALID_BLOCK_H: tl.constexpr = BLOCK_H
+    else:
+        VALID_BLOCK_H: tl.constexpr = kv_group_num
+    cur_head = cur_head_id * VALID_BLOCK_H + tl.arange(0, BLOCK_H)
+    mask_h = cur_head < (cur_head_id + 1) * VALID_BLOCK_H
+    mask_h = mask_h & (cur_head < q_head_num)
+
+    offs_d = tl.arange(0, BLOCK_DMODEL)
+    offs_dv = tl.arange(0, BLOCK_DV)
+    mask_d = offs_d < Lk
+    mask_dv = offs_dv < Lv
+    cur_batch_seq_len = tl.load(B_Seqlen + cur_batch)
+    cur_batch_req_idx = cur_batch
+
+    offs_q = cur_batch * stride_qbs + cur_head[:, None] * stride_qh + offs_d[
+        None, :]
+    q = tl.load(Q + offs_q,
+                mask=(mask_h[:, None]) & (mask_d[None, :]),
+                other=0.0)
+
+    if BLOCK_DPE > 0:
+        offs_dpe = BLOCK_DMODEL + tl.arange(0, BLOCK_DPE)
+        mask_dpe = offs_dpe < Lk
+        off_qpe = (cur_batch * stride_qbs + cur_head[:, None] * stride_qh +
+                   offs_dpe[None, :])
+        qpe = tl.load(Q + off_qpe,
+                      mask=(mask_h[:, None]) & (mask_dpe[None, :]),
+                      other=0.0)
+
+    kv_len_per_split = tl.cdiv(cur_batch_seq_len, NUM_KV_SPLITS)
+    split_kv_start = kv_len_per_split * split_kv_id
+    split_kv_end = tl.minimum(split_kv_start + kv_len_per_split,
+                              cur_batch_seq_len)
+
+    e_max = tl.zeros([BLOCK_H], dtype=tl.float32) - float("inf")
+    e_sum = tl.zeros([BLOCK_H], dtype=tl.float32)
+    acc = tl.zeros([BLOCK_H, BLOCK_DV], dtype=tl.float32)
+
+    if split_kv_end > split_kv_start:
+        for start_n in range(split_kv_start, split_kv_end, BLOCK_N):
+            offs_n = start_n + tl.arange(0, BLOCK_N)
+            kv_page_number = tl.load(
+                Req_to_tokens + stride_req_to_tokens_b * cur_batch_req_idx +
+                offs_n // PAGE_SIZE,
+                mask=offs_n < split_kv_end,
+                other=0,
+            )
+            kv_loc = kv_page_number * PAGE_SIZE + offs_n % PAGE_SIZE
+            offs_buf_k = (kv_loc[None, :] * stride_buf_kbs +
+                          cur_kv_head * stride_buf_kh + offs_d[:, None])
+            k = tl.load(
+                K_Buffer + offs_buf_k,
+                mask=(offs_n[None, :] < split_kv_end) & (mask_d[:, None]),
+                other=0.0,
+            )
+            qk = tl.dot(q, k.to(q.dtype))
+            if BLOCK_DPE > 0:
+                offs_buf_kpe = (kv_loc[None, :] * stride_buf_kbs +
+                                cur_kv_head * stride_buf_kh +
+                                offs_dpe[:, None])
+                kpe = tl.load(
+                    K_Buffer + offs_buf_kpe,
+                    mask=(offs_n[None, :] < split_kv_end) &
+                    (mask_dpe[:, None]),
+                    other=0.0,
+                )
+                qk += tl.dot(qpe, kpe.to(qpe.dtype))
+            qk *= sm_scale
+
+            if logit_cap > 0:
+                qk = logit_cap * tanh(qk / logit_cap)
+
+            qk = tl.where(mask_h[:, None] & (offs_n[None, :] < split_kv_end),
+                          qk, float("-inf"))
+
+            offs_buf_v = (kv_loc[:, None] * stride_buf_vbs +
+                          cur_kv_head * stride_buf_vh + offs_dv[None, :])
+            v = tl.load(
+                V_Buffer + offs_buf_v,
+                mask=(offs_n[:, None] < split_kv_end) & (mask_dv[None, :]),
+                other=0.0,
+            )
+
+            n_e_max = tl.maximum(tl.max(qk, 1), e_max)
+            re_scale = tl.exp(e_max - n_e_max)
+            p = tl.exp(qk - n_e_max[:, None])
+            acc *= re_scale[:, None]
+            acc += tl.dot(p.to(v.dtype), v)
+
+            e_sum = e_sum * re_scale + tl.sum(p, 1)
+            e_max = n_e_max
+
+        offs_mid_o = (cur_batch * stride_mid_ob +
+                      cur_head[:, None] * stride_mid_oh +
+                      split_kv_id * stride_mid_os + offs_dv[None, :])
+
+        tl.store(
+            Att_Out + offs_mid_o,
+            acc / e_sum[:, None],
+            mask=(mask_h[:, None]) & (mask_dv[None, :]),
+        )
+
+        offs_mid_o_1 = (cur_batch * stride_mid_ob + cur_head * stride_mid_oh +
+                        split_kv_id * stride_mid_os + Lv)
+
+        tl.store(
+            Att_Out + offs_mid_o_1,
+            e_max + tl.log(e_sum),
+            mask=mask_h,
+        )
+
+
+def _decode_grouped_att_m_fwd(
+    q,
+    k_buffer,
+    v_buffer,
+    att_out,
+    Req_to_tokens,
+    B_Seqlen,
+    num_kv_splits,
+    sm_scale,
+    page_size,
+    logit_cap,
+):
+    BLOCK = 32
+    Lk = k_buffer.shape[-1]
+    Lv = v_buffer.shape[-1]
+
+    # [TODO] work around shmem limit on MI3xx
+    if is_hip_ and Lk >= 576:
+        BLOCK = 16
+
+    if Lk == 576:
+        BLOCK_DMODEL = 512
+        BLOCK_DPE = 64
+    elif Lk == 288:
+        BLOCK_DMODEL = 256
+        BLOCK_DPE = 32
+    else:
+        BLOCK_DMODEL = triton.next_power_of_2(Lk)
+        BLOCK_DPE = 0
+    BLOCK_DV = triton.next_power_of_2(Lv)
+
+    batch, head_num = q.shape[0], q.shape[1]
+    kv_group_num = q.shape[1] // k_buffer.shape[-2]
+
+    BLOCK_H = 16
+    NUM_KV_SPLITS = num_kv_splits
+    grid = (
+        batch,
+        triton.cdiv(head_num, min(BLOCK_H, kv_group_num)),
+        NUM_KV_SPLITS,
+    )
+
+    extra_kargs = {}
+    if is_hip_:
+        # https://rocm.docs.amd.com/en/docs-6.2.0/how-to/llm-fine-tuning-optimization/optimizing-triton-kernel.html
+        # https://github.com/triton-lang/triton/blob/main/third_party/amd/backend/compiler.py
+        extra_kargs = {
+            "waves_per_eu": 4,
+            "matrix_instr_nonkdim": 16,
+            "kpack": 2
+        }
+
+    _fwd_grouped_kernel_stage1[grid](
+        q,
+        k_buffer,
+        v_buffer,
+        sm_scale,
+        Req_to_tokens,
+        B_Seqlen,
+        att_out,
+        Req_to_tokens.stride(0),
+        q.stride(0),
+        q.stride(1),
+        k_buffer.stride(-2),
+        k_buffer.stride(-1),
+        v_buffer.stride(-2),
+        v_buffer.stride(-1),
+        att_out.stride(0),
+        att_out.stride(1),
+        att_out.stride(2),
+        kv_group_num=kv_group_num,
+        q_head_num=head_num,
+        BLOCK_DMODEL=BLOCK_DMODEL,
+        BLOCK_DPE=BLOCK_DPE,
+        BLOCK_DV=BLOCK_DV,
+        BLOCK_N=BLOCK,
+        BLOCK_H=BLOCK_H,
+        NUM_KV_SPLITS=NUM_KV_SPLITS,
+        PAGE_SIZE=page_size,
+        logit_cap=logit_cap,
+        num_warps=4,
+        num_stages=2,
+        Lk=Lk,
+        Lv=Lv,
+        **extra_kargs,
+    )
+
+
+@triton.jit
+def _fwd_kernel_stage2(
+    Mid_O,
+    o,
+    B_Seqlen,
+    stride_mid_ob,
+    stride_mid_oh,
+    stride_mid_os,
+    stride_obs,
+    stride_oh,
+    NUM_KV_SPLITS: tl.constexpr,
+    BLOCK_DV: tl.constexpr,
+    Lv: tl.constexpr,
+):
+    cur_batch = tl.program_id(0)
+    cur_head = tl.program_id(1)
+
+    cur_batch_seq_len = tl.load(B_Seqlen + cur_batch)
+
+    offs_d = tl.arange(0, BLOCK_DV)
+    mask_d = offs_d < Lv
+
+    e_sum = 0.0
+    e_max = -float("inf")
+    acc = tl.zeros([BLOCK_DV], dtype=tl.float32)
+
+    offs_v = cur_batch * stride_mid_ob + cur_head * stride_mid_oh + offs_d
+    offs_logic = cur_batch * stride_mid_ob + cur_head * stride_mid_oh + Lv
+
+    for split_kv_id in range(0, NUM_KV_SPLITS):
+        kv_len_per_split = tl.cdiv(cur_batch_seq_len, NUM_KV_SPLITS)
+        split_kv_start = kv_len_per_split * split_kv_id
+        split_kv_end = tl.minimum(split_kv_start + kv_len_per_split,
+                                  cur_batch_seq_len)
+
+        if split_kv_end > split_kv_start:
+            tv = tl.load(Mid_O + offs_v + split_kv_id * stride_mid_os,
+                         mask=mask_d,
+                         other=0.0)
+            tlogic = tl.load(Mid_O + offs_logic + split_kv_id * stride_mid_os)
+            n_e_max = tl.maximum(tlogic, e_max)
+
+            old_scale = tl.exp(e_max - n_e_max)
+            acc *= old_scale
+            exp_logic = tl.exp(tlogic - n_e_max)
+            acc += exp_logic * tv
+
+            e_sum = e_sum * old_scale + exp_logic
+            e_max = n_e_max
+
+    tl.store(
+        o + cur_batch * stride_obs + cur_head * stride_oh + offs_d,
+        acc / e_sum,
+        mask=mask_d,
+    )
+
+
+def _decode_softmax_reducev_fwd(
+    logits,
+    q,
+    o,
+    v_buffer,
+    b_seq_len,
+    num_kv_splits,
+):
+    batch, head_num = q.shape[0], q.shape[1]
+    Lv = v_buffer.shape[-1]
+    BLOCK_DV = triton.next_power_of_2(Lv)
+
+    NUM_KV_SPLITS = num_kv_splits
+
+    extra_kargs = {}
+    if is_hip_:
+        # https://rocm.docs.amd.com/en/docs-6.2.0/how-to/llm-fine-tuning-optimization/optimizing-triton-kernel.html
+        # https://github.com/triton-lang/triton/blob/main/third_party/amd/backend/compiler.py
+        extra_kargs = {
+            "waves_per_eu": 4,
+            "matrix_instr_nonkdim": 16,
+            "kpack": 2
+        }
+
+    grid = (batch, head_num)
+    _fwd_kernel_stage2[grid](
+        logits,
+        o,
+        b_seq_len,
+        logits.stride(0),
+        logits.stride(1),
+        logits.stride(2),
+        o.stride(0),
+        o.stride(1),
+        NUM_KV_SPLITS=NUM_KV_SPLITS,
+        BLOCK_DV=BLOCK_DV,
+        Lv=Lv,
+        num_warps=4,
+        num_stages=2,
+        **extra_kargs,
+    )
+
+
+def decode_attention_fwd_normal(
+    q,
+    k_buffer,
+    v_buffer,
+    o,
+    req_to_token,
+    b_seq_len,
+    attn_logits,
+    num_kv_splits,
+    sm_scale,
+    page_size,
+    logit_cap=0.0,
+):
+    _decode_att_m_fwd(
+        q,
+        k_buffer,
+        v_buffer,
+        attn_logits,
+        req_to_token,
+        b_seq_len,
+        num_kv_splits,
+        sm_scale,
+        page_size,
+        logit_cap,
+    )
+    _decode_softmax_reducev_fwd(attn_logits, q, o, v_buffer, b_seq_len,
+                                num_kv_splits)
+
+
+def decode_attention_fwd_grouped(
+    q,
+    k_buffer,
+    v_buffer,
+    o,
+    req_to_token,
+    b_seq_len,
+    attn_logits,
+    num_kv_splits,
+    sm_scale,
+    page_size,
+    logit_cap=0.0,
+):
+    _decode_grouped_att_m_fwd(
+        q,
+        k_buffer,
+        v_buffer,
+        attn_logits,
+        req_to_token,
+        b_seq_len,
+        num_kv_splits,
+        sm_scale,
+        page_size,
+        logit_cap,
+    )
+    _decode_softmax_reducev_fwd(attn_logits, q, o, v_buffer, b_seq_len,
+                                num_kv_splits)
+
+
+def decode_attention_fwd(
+    q,
+    k_buffer,
+    v_buffer,
+    o,
+    req_to_token,
+    b_seq_len,
+    attn_logits,
+    num_kv_splits,
+    sm_scale,
+    page_size=1,
+    logit_cap=0.0,
+):
+    assert num_kv_splits == attn_logits.shape[2]
+    kv_group_num = q.shape[1] // v_buffer.shape[-2]
+
+    if kv_group_num == 1:
+        # MHA
+        decode_attention_fwd_normal(
+            q,
+            k_buffer,
+            v_buffer,
+            o,
+            req_to_token,
+            b_seq_len,
+            attn_logits,
+            num_kv_splits,
+            sm_scale,
+            page_size,
+            logit_cap,
+        )
+    else:
+        # GQA/MQA/MLA
+        decode_attention_fwd_grouped(
+            q,
+            k_buffer,
+            v_buffer,
+            o,
+            req_to_token,
+            b_seq_len,
+            attn_logits,
+            num_kv_splits,
+            sm_scale,
+            page_size,
+            logit_cap,
+        )
diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py
index 1376274d57777..4c6bbc7272280 100644
--- a/vllm/attention/selector.py
+++ b/vllm/attention/selector.py
@@ -83,6 +83,7 @@ def get_attn_backend(
     block_size: int,
     is_attention_free: bool,
     is_blocksparse: bool = False,
+    use_mla: bool = False,
 ) -> Type[AttentionBackend]:
     """Selects which attention backend to use and lazily imports it."""
     # Accessing envs.* behind an @lru_cache decorator can cause the wrong
@@ -97,6 +98,7 @@ def get_attn_backend(
         is_attention_free=is_attention_free,
         is_blocksparse=is_blocksparse,
         use_v1=envs.VLLM_USE_V1,
+        use_mla=use_mla,
     )
 
 
@@ -109,6 +111,7 @@ def _cached_get_attn_backend(
     is_attention_free: bool,
     is_blocksparse: bool = False,
     use_v1: bool = False,
+    use_mla: bool = False,
 ) -> Type[AttentionBackend]:
     if is_blocksparse:
         logger.info("Using BlocksparseFlashAttention backend.")
@@ -141,7 +144,8 @@ def _cached_get_attn_backend(
 
     # get device-specific attn_backend
     attention_cls = current_platform.get_attn_backend_cls(
-        selected_backend, head_size, dtype, kv_cache_dtype, block_size, use_v1)
+        selected_backend, head_size, dtype, kv_cache_dtype, block_size, use_v1,
+        use_mla)
     if not attention_cls:
         raise ValueError(
             f"Invalid attention backend for {current_platform.device_name}")
diff --git a/vllm/config.py b/vllm/config.py
index 58464eae80b82..f6bd8b1ad8f14 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -736,17 +736,25 @@ def get_vocab_size(self) -> int:
     def get_hidden_size(self) -> int:
         return self.hf_text_config.hidden_size
 
+    @property
+    def is_deepseek_mla(self) -> bool:
+        # TODO add deepseek_v3
+        return hasattr(self.hf_text_config,
+                       "model_type") and (self.hf_text_config.model_type
+                                          in ('deepseek_v2'))
+
     def get_head_size(self) -> int:
         # TODO remove hard code
-        if hasattr(self.hf_text_config,
-                   "model_type") and (self.hf_text_config.model_type
-                                      in ('deepseek_v2', 'deepseek_v3')):
-            qk_rope_head_dim = getattr(self.hf_text_config, "qk_rope_head_dim",
-                                       0)
-            qk_nope_head_dim = getattr(self.hf_text_config, "qk_nope_head_dim",
-                                       0)
-            if qk_rope_head_dim and qk_nope_head_dim:
-                return qk_rope_head_dim + qk_nope_head_dim
+        if self.is_deepseek_mla:
+            if self.use_mla:
+                return self.hf_text_config.kv_lora_rank
+            else:
+                qk_rope_head_dim = getattr(self.hf_text_config,
+                                           "qk_rope_head_dim", 0)
+                qk_nope_head_dim = getattr(self.hf_text_config,
+                                           "qk_nope_head_dim", 0)
+                if qk_rope_head_dim and qk_nope_head_dim:
+                    return qk_rope_head_dim + qk_nope_head_dim
 
         if self.is_attention_free:
             return 0
@@ -805,6 +813,10 @@ def get_total_num_kv_heads(self) -> int:
 
     def get_num_kv_heads(self, parallel_config: "ParallelConfig") -> int:
         """Returns the number of KV heads per GPU."""
+        if self.use_mla:
+            # When using MLA during decode it becomes MQA
+            return 1
+
         total_num_kv_heads = self.get_total_num_kv_heads()
         # If tensor parallelism is used, we divide the number of KV heads by
         # the tensor parallel size. We will replicate the KV heads in the
@@ -955,6 +967,11 @@ def is_cross_encoder(self) -> bool:
         architectures = getattr(self.hf_config, "architectures", [])
         return ModelRegistry.is_cross_encoder_model(architectures)
 
+    @property
+    def use_mla(self) -> bool:
+        use_mla = (self.is_deepseek_mla and not envs.VLLM_MLA_DISABLE)
+        return use_mla
+
     @property
     def supported_runner_types(self) -> Set[RunnerType]:
         return {_TASK_RUNNER[task] for task in self.supported_tasks}
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 1f203b6eaeb33..cc7c99e50ac4d 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -931,7 +931,6 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             type=str,
             default="auto",
             help='The worker class to use for distributed execution.')
-
         parser.add_argument(
             "--generation-config",
             type=nullable_str,
diff --git a/vllm/envs.py b/vllm/envs.py
index 8627caec7790d..2a18e3b9bc51d 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -77,6 +77,8 @@
     V_SCALE_CONSTANT: int = 100
     VLLM_SERVER_DEV_MODE: bool = False
     VLLM_V1_OUTPUT_PROC_CHUNK_SIZE: int = 128
+    VLLM_MLA_DISABLE: bool = False
+    VLLM_MLA_PERFORM_MATRIX_ABSORPTION: bool = True
 
 
 def get_default_cache_root():
@@ -506,6 +508,18 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]:
     # TTFT and overall throughput.
     "VLLM_V1_OUTPUT_PROC_CHUNK_SIZE":
     lambda: int(os.getenv("VLLM_V1_OUTPUT_PROC_CHUNK_SIZE", "128")),
+
+    # If set, vLLM will disable the MLA attention optimizations.
+    "VLLM_MLA_DISABLE":
+    lambda: bool(int(os.getenv("VLLM_MLA_DISABLE", "0"))),
+
+    # Flag that can control whether or not we perform matrix-absorption for MLA
+    # decode, i.e. absorb W_UK into W_Q/W_UK and W_UV into W_O, absorbing the
+    # matrices reduces the runtime FLOPs needed to compute MLA but requires
+    # storing more weights, W_Q_UK and W_UV_O, so can increase memory usage,
+    # the is enabled by default
+    "VLLM_MLA_PERFORM_MATRIX_ABSORPTION":
+    lambda: bool(int(os.getenv("VLLM_MLA_PERFORM_MATRIX_ABSORPTION", "1")))
 }
 
 # end-env-vars-definition
diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index 712266ee42639..62babcddd61b1 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -23,6 +23,7 @@
 from transformers import AutoModelForCausalLM
 from transformers.utils import SAFE_WEIGHTS_INDEX_NAME
 
+from vllm.attention import Attention
 from vllm.config import (LoadConfig, LoadFormat, ModelConfig, ParallelConfig,
                          VllmConfig, set_current_vllm_config)
 from vllm.distributed import (get_tensor_model_parallel_rank,
@@ -397,6 +398,11 @@ def load_model(self, vllm_config: VllmConfig) -> nn.Module:
                     # parameters onto device for processing and back off after.
                     with device_loading_context(module, target_device):
                         quant_method.process_weights_after_loading(module)
+                elif isinstance(module, Attention) and \
+                    hasattr(module, "process_weights_after_loading"):
+                    # When attention modules need to process weights after
+                    # currently only used by MLA
+                    module.process_weights_after_loading()
         return model.eval()
 
 
diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py
index af6810a140b43..73388cd269853 100644
--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -28,7 +28,7 @@
 
 from vllm.attention import Attention, AttentionMetadata
 from vllm.compilation.decorators import support_torch_compile
-from vllm.config import CacheConfig, VllmConfig
+from vllm.config import CacheConfig, ModelConfig, VllmConfig
 from vllm.distributed import (get_pp_group,
                               get_tensor_model_parallel_world_size,
                               tensor_model_parallel_all_reduce)
@@ -326,12 +326,156 @@ def forward(
         return output
 
 
+class DeepseekV2MLAAttention(nn.Module):
+    """
+    Main reference: DeepseekV2 paper, and FlashInfer Implementation
+    (https://arxiv.org/abs/2405.04434 and https://github.com/flashinfer-ai/flashinfer/pull/551).
+    
+    For more info see MLACommonImpl in: vllm/attention/backends/mla/utils.py
+    """
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        hidden_size: int,
+        num_heads: int,
+        qk_nope_head_dim: int,
+        qk_rope_head_dim: int,
+        v_head_dim: int,
+        q_lora_rank: Optional[int],
+        kv_lora_rank: int,
+        rope_theta: float = 10000,
+        rope_scaling: Optional[Dict[str, Any]] = None,
+        max_position_embeddings: int = 8192,
+        cache_config: Optional[CacheConfig] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.qk_nope_head_dim = qk_nope_head_dim
+        self.qk_rope_head_dim = qk_rope_head_dim
+        self.qk_head_dim = qk_nope_head_dim + qk_rope_head_dim
+        self.v_head_dim = v_head_dim
+
+        self.q_lora_rank = q_lora_rank
+        self.kv_lora_rank = kv_lora_rank
+
+        self.num_heads = num_heads
+        tp_size = get_tensor_model_parallel_world_size()
+        assert num_heads % tp_size == 0
+        self.num_local_heads = num_heads // tp_size
+
+        self.scaling = self.qk_head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+
+        if self.q_lora_rank is not None:
+            self.q_a_proj = ReplicatedLinear(self.hidden_size,
+                                             self.q_lora_rank,
+                                             bias=False,
+                                             quant_config=quant_config,
+                                             prefix=f"{prefix}.q_a_proj")
+            self.q_a_layernorm = RMSNorm(self.q_lora_rank,
+                                         eps=config.rms_norm_eps)
+            self.q_b_proj = ColumnParallelLinear(q_lora_rank,
+                                                 self.num_heads *
+                                                 self.qk_head_dim,
+                                                 bias=False,
+                                                 quant_config=quant_config,
+                                                 prefix=f"{prefix}.q_b_proj")
+        else:
+            self.q_proj = ColumnParallelLinear(self.hidden_size,
+                                               self.num_heads *
+                                               self.qk_head_dim,
+                                               bias=False,
+                                               quant_config=quant_config,
+                                               prefix=f"{prefix}.q_proj")
+
+        self.kv_a_proj_with_mqa = ReplicatedLinear(
+            self.hidden_size,
+            self.kv_lora_rank + self.qk_rope_head_dim,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.kv_a_proj_with_mqa")
+        self.kv_a_layernorm = RMSNorm(self.kv_lora_rank,
+                                      eps=config.rms_norm_eps)
+        self.kv_b_proj = ColumnParallelLinear(
+            self.kv_lora_rank,
+            self.num_heads * (self.qk_nope_head_dim + self.v_head_dim),
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.kv_b_proj")
+        self.o_proj = RowParallelLinear(self.num_heads * self.v_head_dim,
+                                        self.hidden_size,
+                                        bias=False,
+                                        quant_config=quant_config,
+                                        prefix=f"{prefix}.o_proj")
+
+        rope_scaling["rope_type"] = 'deepseek_yarn'
+        self.rotary_emb = get_rope(qk_rope_head_dim,
+                                   rotary_dim=qk_rope_head_dim,
+                                   max_position=max_position_embeddings,
+                                   base=rope_theta,
+                                   rope_scaling=rope_scaling,
+                                   is_neox_style=False)
+        if rope_scaling:
+            mscale_all_dim = rope_scaling.get("mscale_all_dim", False)
+            scaling_factor = rope_scaling["factor"]
+            mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim))
+            self.scaling = self.scaling * mscale * mscale
+
+        self.mla_attn = Attention(
+            num_heads=self.num_local_heads,
+            head_size=self.kv_lora_rank,
+            scale=self.scaling,
+            num_kv_heads=1,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+            use_mla=True,
+            # MLA Args
+            q_lora_rank=self.q_lora_rank,
+            kv_lora_rank=self.kv_lora_rank,
+            qk_nope_head_dim=self.qk_nope_head_dim,
+            qk_rope_head_dim=self.qk_rope_head_dim,
+            qk_head_dim=self.qk_head_dim,
+            v_head_dim=self.v_head_dim,
+            rotary_emb=self.rotary_emb,
+            q_proj=self.q_proj if self.q_lora_rank is None else self.q_b_proj,
+            kv_b_proj=self.kv_b_proj,
+            o_proj=self.o_proj,
+        )
+
+        self.prefix = prefix
+        self.debug_layer_idx = int(self.prefix.split(".")[-2])
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        kv_cache: torch.Tensor,
+        attn_metadata: AttentionMetadata,
+    ) -> torch.Tensor:
+        if self.q_lora_rank is not None:
+            ckq = self.q_a_proj(hidden_states)[0]
+            hidden_states_or_q_c = self.q_a_layernorm(ckq)
+        else:
+            hidden_states_or_q_c = hidden_states
+        kv_c, k_pe = self.kv_a_proj_with_mqa(hidden_states)[0].split(
+            [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1)
+        kv_c_normed = self.kv_a_layernorm(kv_c.contiguous())
+        return self.mla_attn(hidden_states_or_q_c, kv_c_normed, k_pe, kv_cache,
+                             attn_metadata)
+
+
 class DeepseekV2DecoderLayer(nn.Module):
 
     def __init__(
         self,
         config: PretrainedConfig,
         prefix: str,
+        model_config: ModelConfig,
         cache_config: Optional[CacheConfig] = None,
         quant_config: Optional[QuantizationConfig] = None,
     ) -> None:
@@ -344,7 +488,11 @@ def __init__(
         # DecoderLayers are created with `make_layers` which passes the prefix
         # with the layer's index.
         layer_idx = int(prefix.split(sep='.')[-1])
-        self.self_attn = DeepseekV2Attention(
+        if model_config.use_mla:
+            attn_cls = DeepseekV2MLAAttention
+        else:
+            attn_cls = DeepseekV2Attention
+        self.self_attn = attn_cls(
             config=config,
             hidden_size=self.hidden_size,
             num_heads=config.num_attention_heads,
@@ -421,6 +569,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
 
         config = vllm_config.model_config.hf_config
+        model_config = vllm_config.model_config
         cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
 
@@ -440,6 +589,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
             lambda prefix: DeepseekV2DecoderLayer(
                 config,
                 prefix,
+                model_config=model_config,
                 cache_config=cache_config,
                 quant_config=quant_config,
             ),
diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py
index 74948202cbe48..159ea94f99a27 100644
--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
@@ -31,7 +31,8 @@ def get_device_name(cls, device_id: int = 0) -> str:
     @classmethod
     def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
                              dtype: torch.dtype, kv_cache_dtype: Optional[str],
-                             block_size: int, use_v1: bool) -> str:
+                             block_size: int, use_v1: bool,
+                             use_mla: bool) -> str:
         if selected_backend != _Backend.TORCH_SDPA:
             logger.info("Cannot use %s backend on CPU.", selected_backend)
         logger.info("Using Torch SDPA backend.")
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index e4b436edf7588..91dcdff006e3e 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -157,10 +157,14 @@ def get_current_memory_usage(cls,
 
     @classmethod
     def get_attn_backend_cls(cls, selected_backend, head_size, dtype,
-                             kv_cache_dtype, block_size, use_v1) -> str:
+                             kv_cache_dtype, block_size, use_v1,
+                             use_mla) -> str:
         if use_v1:
             logger.info("Using Flash Attention backend on V1 engine.")
             return "vllm.v1.attention.backends.flash_attn.FlashAttentionBackend"
+        if use_mla:
+            logger.info("Using Triton MLA backend.")
+            return "vllm.attention.backends.triton_mla.TritonMLABackend"
         if selected_backend == _Backend.FLASHINFER:
             logger.info("Using FlashInfer backend.")
             return "vllm.attention.backends.flashinfer.FlashInferBackend"
@@ -171,7 +175,8 @@ def get_attn_backend_cls(cls, selected_backend, head_size, dtype,
             pass
         elif selected_backend:
             raise ValueError(
-                f"Invalid attention backend for {cls.device_name}")
+                f"Invalid attention backend for {cls.device_name}, "
+                f"with use_v1: {use_v1} use_mla: {use_mla}")
 
         target_backend = _Backend.FLASH_ATTN
         if not cls.has_device_capability(80):
diff --git a/vllm/platforms/hpu.py b/vllm/platforms/hpu.py
index a32c262c84efa..0e1c4c0c5949f 100644
--- a/vllm/platforms/hpu.py
+++ b/vllm/platforms/hpu.py
@@ -27,7 +27,8 @@ class HpuPlatform(Platform):
     @classmethod
     def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
                              dtype: torch.dtype, kv_cache_dtype: Optional[str],
-                             block_size: int, use_v1: bool) -> str:
+                             block_size: int, use_v1: bool,
+                             use_mla: bool) -> str:
         logger.info("Using HPUAttention backend.")
         return "vllm.attention.backends.hpu_attn.HPUAttentionBackend"
 
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
index f2ecec3203fb7..186fa54bfc14c 100644
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -30,6 +30,7 @@ class _Backend(enum.Enum):
     TORCH_SDPA = enum.auto()
     OPENVINO = enum.auto()
     FLASHINFER = enum.auto()
+    TRITON_MLA = enum.auto()
     HPU_ATTN = enum.auto()
     PALLAS = enum.auto()
     IPEX = enum.auto()
@@ -139,7 +140,8 @@ def is_cuda_alike(self) -> bool:
     @classmethod
     def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
                              dtype: torch.dtype, kv_cache_dtype: Optional[str],
-                             block_size: int, use_v1: bool) -> str:
+                             block_size: int, use_v1: bool,
+                             use_mla: bool) -> str:
         """Get the attention backend class of a device."""
         return ""
 
diff --git a/vllm/platforms/openvino.py b/vllm/platforms/openvino.py
index 7d414165a8188..3282c061714d3 100644
--- a/vllm/platforms/openvino.py
+++ b/vllm/platforms/openvino.py
@@ -30,7 +30,8 @@ class OpenVinoPlatform(Platform):
     @classmethod
     def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
                              dtype: torch.dtype, kv_cache_dtype: Optional[str],
-                             block_size: int, use_v1: bool) -> str:
+                             block_size: int, use_v1: bool,
+                             use_mla: bool) -> str:
         if selected_backend != _Backend.OPENVINO:
             logger.info("Cannot use %s backend on OpenVINO.", selected_backend)
         logger.info("Using OpenVINO Attention backend.")
diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index 5ef56406e1935..8888521631481 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -75,7 +75,8 @@ class RocmPlatform(Platform):
 
     @classmethod
     def get_attn_backend_cls(cls, selected_backend, head_size, dtype,
-                             kv_cache_dtype, block_size, use_v1) -> str:
+                             kv_cache_dtype, block_size, use_v1,
+                             use_mla) -> str:
         selected_backend = (_Backend.ROCM_FLASH if selected_backend
                             == _Backend.FLASH_ATTN else selected_backend)
         if selected_backend == _Backend.ROCM_FLASH:
diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py
index 05a3aa4305cfa..494a17633974d 100644
--- a/vllm/platforms/tpu.py
+++ b/vllm/platforms/tpu.py
@@ -29,7 +29,8 @@ class TpuPlatform(Platform):
     @classmethod
     def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
                              dtype: torch.dtype, kv_cache_dtype: Optional[str],
-                             block_size: int, use_v1: bool) -> str:
+                             block_size: int, use_v1: bool,
+                             use_mla: bool) -> str:
         if selected_backend != _Backend.PALLAS:
             logger.info("Cannot use %s backend on TPU.", selected_backend)
         logger.info("Using Pallas backend.")
diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py
index c34b5b58672e7..a5ca77f57cf47 100644
--- a/vllm/platforms/xpu.py
+++ b/vllm/platforms/xpu.py
@@ -27,7 +27,8 @@ class XPUPlatform(Platform):
     @classmethod
     def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
                              dtype: torch.dtype, kv_cache_dtype: Optional[str],
-                             block_size: int, use_v1: bool) -> str:
+                             block_size: int, use_v1: bool,
+                             use_mla: bool) -> str:
         if selected_backend != _Backend.IPEX:
             logger.info("Cannot use %s backend on XPU.", selected_backend)
         logger.info("Using IPEX attention backend.")
diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py
index 7ccd4571b19df..08316ba74aad8 100644
--- a/vllm/worker/cache_engine.py
+++ b/vllm/worker/cache_engine.py
@@ -56,7 +56,8 @@ def __init__(
                                              model_config.dtype,
                                              cache_config.cache_dtype,
                                              self.block_size,
-                                             model_config.is_attention_free)
+                                             model_config.is_attention_free,
+                                             use_mla=model_config.use_mla)
 
         # Initialize the cache.
         self.gpu_cache = self._allocate_kv_cache(
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index bf1a40d48a789..160c0662ce976 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -1066,6 +1066,7 @@ def __init__(
             self.kv_cache_dtype,
             self.block_size,
             self.model_config.is_attention_free,
+            use_mla=self.model_config.use_mla,
         ) if needs_attn_backend else None
         if self.attn_backend:
             self.attn_state = self.attn_backend.get_state_cls()(
@@ -1973,7 +1974,8 @@ def forward(
 
         # Copy the input tensors to the input buffers.
         self.input_buffers["input_ids"].copy_(input_ids, non_blocking=True)
-        self.input_buffers["positions"].copy_(positions, non_blocking=True)
+        if positions is not None:
+            self.input_buffers["positions"].copy_(positions, non_blocking=True)
 
         if self.backend_name != "NO_ATTENTION":
             self.input_buffers["slot_mapping"].copy_(

From 7a8987dac5f0ed0c798a73e8b4ec8f5e640bc63a Mon Sep 17 00:00:00 2001
From: Roger Wang <136131678+ywang96@users.noreply.github.com>
Date: Fri, 31 Jan 2025 00:19:35 -0800
Subject: [PATCH 442/462] [Bugfix] Gracefully handle huggingface hub http error
 (#12571)

---
 vllm/transformers_utils/config.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index c97acffa1a719..5805f4ad0b7f7 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -7,7 +7,8 @@
 import huggingface_hub
 from huggingface_hub import (file_exists, hf_hub_download,
                              try_to_load_from_cache)
-from huggingface_hub.utils import (EntryNotFoundError, LocalEntryNotFoundError,
+from huggingface_hub.utils import (EntryNotFoundError, HfHubHTTPError,
+                                   LocalEntryNotFoundError,
                                    RepositoryNotFoundError,
                                    RevisionNotFoundError)
 from torch import nn
@@ -294,6 +295,13 @@ def get_hf_file_to_dict(file_name: str,
                 logger.debug("File or repository not found in hf_hub_download",
                              e)
                 return None
+            except HfHubHTTPError as e:
+                logger.warning(
+                    "Cannot connect to Hugging Face Hub. Skipping file "
+                    "download for '%s':",
+                    file_name,
+                    exc_info=e)
+                return None
             file_path = Path(hf_hub_file)
 
         with open(file_path) as file:

From e3f7ff65e7a6c08cd354f7f333bce543a4f0607e Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Fri, 31 Jan 2025 17:20:34 +0000
Subject: [PATCH 443/462] Add favicon to docs (#12611)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 .../assets/logos/vllm-logo-only-light.ico       | Bin 0 -> 16958 bytes
 docs/source/conf.py                             |   1 +
 2 files changed, 1 insertion(+)
 create mode 100644 docs/source/assets/logos/vllm-logo-only-light.ico

diff --git a/docs/source/assets/logos/vllm-logo-only-light.ico b/docs/source/assets/logos/vllm-logo-only-light.ico
new file mode 100644
index 0000000000000000000000000000000000000000..27528ceebfff401d0516b73099381c7425aaff3a
GIT binary patch
literal 16958
zcmeI4%WoS+9LF8?&?8b0aOK!5Hx5Wt{TH}$OG=?sAVsPu5<vw~q*g@&sfelwsys;_
zDTL<HkXK7XLTH*MZInh$?AS>hJATA(JGOWC$M2VAisS6AcV{1QoKE!1WOu#e-)FwF
zznz_(iOW@oKgW)_@b~MkqsLvYS6wdGQ8e7;I)_I4)e-=>D%cS`{w`Vz+8kOEO+u@-
zL_U#k<Rke?K9ld2_|%TShUZ>Hi=b5_FS@0f$Y=7M7>GrUvwVLYe)b)-HMCku#6nEO
zR?b_xeF%>?p%u|;EfEv35o0Mc`<v>9j*Dounuv`UmD&Ndk6P5dTuWjkR;D}U6d!8o
zdPf}1M69JTkz(XQOT<jt(>;>z6Sk*ZE(ft#%&^7daS%nz!xAwQJM|N6zrV-hNk|R1
zLcIT~Ve?IoLnf24S|4I&{W`15-cg>2-*o+P2y&fgL1?ZwY|){12#3Qv%Cu;M^=U0C
zPPv$+NFL4%2O!pW)i68NSLXJ4@Oiz;g`3;U`t#;W(Kws5zc?9(#-RWNC+>3b7h697
z_uMQLi<aM>>3PA{YoD}#vo|dBFD-z8EXJ>a2Jlb(0jX3<qo%nI*~-qef1~F={$K!v
zI~O?k>mfF94K_A5xFGlHB&I#%C;!E%UM~LJ%_dm(xIvO^mCUlLD%Jjf{t!rOcnvf+
z7_Nh2%Sl*%G6jV~!7?B1ET~lbQhvy$Kxq5WFh9kNz*r9?5(zuqwxsNG?RW5F`xW}X
z<Kj>BU4g**x+Ol^SxC9|JNWU7#hF1a{`@azz_a+gs$+nX_T~Iizzsr+@faYqoB-eB
z3E0}&vQs`&Wh-f4&JX!42pu1BwSkTMw_$U0)0B^Povfz)E`E#w;@}l7{?wgsVQsBC
zhbq*x-^Gsth;#S3_zUeHgL~nb9nZt2nuV73O8ipDhcVy;M;$c4%G95b&1OycYxT)m
z+AHxxVGD%LMh^aZh>UbXB;v@Bjjn2{q`e|P#emUkT>R-@FTje=r^`1dK2%A2MSgTl
zS{UczFSecn_xw{31i=YB&F|2oy()jyk1^mqjyh<7waH;fr_-9gIdM>r_G<hf6hXLg
zmV>_@HV2y_6bd<ksQMjxwO8fGp%?K%2N!?#))(Mi_Uvh2Sg-c#{1^ikr?~j(dD!!G
z+Kw?XRlQ3O7|~vnKOVv{v6EbNus+@o$s`ToIiHMZuf-3d07BR2T>SC=A0ZG3I1X=A
zdrf}yUYxka#lLm^Q}8S;IvxXzYF~+8TAt<Nr+cS&W)kxGykl*ENBc_rkle%=@IF@^
z1Rr#7zfW|k0(i8q$dB(fh3>Dp_>(=C!SDAwg3qgcMSh9_4}a(4&v!I}`}u++F@RTl
zJ^a$jJo596iLE>u1A4b^XELL`9)8Fqa9sQpR~rbA-h^m0Y9~IJ(Owt7gfXD^8!rBI
z_a#_eU9pYdtoFM2(R*?FE*F2H?KHTb&Dl04W>$N>{E~l>F9u-TSp8?zjyWP=h4y;+
zA(sK6{WMn_hz_;cGDpN6Ex_@<2)xx7Hf3M_6@_fB!g~uE6C3!Ri$Bx#l`V5bG6fN$
z8NswAa{A_PiBCqj_>0(obkEJ~+4Hbv!8e-(@$W9Y7aE@@;-46>V~&VrX!;A1K#B!%
zY^dGP*g$OHXV?e^^?yyZe-h(?6kY{!<R@L@q`4cP@yrpa{s|>i4*?Ix#><-TjEeX?
zygX}YjtH^u%>WWl+r#HXX>|dFo=d9zgdollq4}~p6S31A!2OkQ+<be?<hS#?>LJnh
z1FY+xBf{1S?Js`jZ84AI#}9G*u<6A;)8HHtwoa=0V*q1T%JVr8=g06}$|u|LVRd#=
z&m0kAruolD(5mNUsuG{xeYrG<v%?@PJ<{`Tkj;OmH2}4>{sG6-btJa_;pO@lc^$|>
zUjIaG;GnO6qdGXq>mP}qwF9~qs&kSVI>wQRg_xAqJu>d{oYu!tjHLU-e(WER&*VEX
zl&<wF=d83%{R8SJu(d+eXRgNjAfH%&p7ra<cWplT9|M3O=Woa5{1va9AHFB&hgg12
j-mYW0y!rC?Uey58f#vsPM_%aVmA4NwU75}x?}vW@qr)|O

literal 0
HcmV?d00001

diff --git a/docs/source/conf.py b/docs/source/conf.py
index 7aa52db092e36..6b0a1dad142b7 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -70,6 +70,7 @@
 html_title = project
 html_theme = 'sphinx_book_theme'
 html_logo = 'assets/logos/vllm-logo-text-light.png'
+html_favicon = 'assets/logos/vllm-logo-only-light.ico'
 html_theme_options = {
     'path_to_docs': 'docs/source',
     'repository_url': 'https://github.com/vllm-project/vllm',

From 325f679f324c1044cfaa0c594bf0d817eeda4451 Mon Sep 17 00:00:00 2001
From: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
Date: Fri, 31 Jan 2025 15:06:39 -0500
Subject: [PATCH 444/462] [BugFix] Fix Torch.Compile For DeepSeek (#12594)

Co-authored-by: simon-mo <xmo@berkeley.edu>
---
 .../model_executor/layers/quantization/fp8.py | 54 ++++++++++---------
 1 file changed, 29 insertions(+), 25 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index 21d4355b36ab0..57dd6e310297d 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -245,20 +245,24 @@ def create_weights(
                 layer.register_parameter("input_scale", None)
 
     def process_weights_after_loading(self, layer: Module) -> None:
-        # Block quant doesn't need to process weights after loading
+        # TODO(rob): refactor block quant into separate class.
         if self.block_quant:
+            assert self.quant_config.activation_scheme == "dynamic"
             if current_platform.is_rocm():
-                weight, weight_scale, _ = \
+                weight, weight_scale_inv, _ = \
                     normalize_e4m3fn_to_e4m3fnuz(
                         weight=layer.weight,
-                        weight_scale=layer.weight_scale_inv,
-                        input_scale=layer.input_scale)
-                layer.weight = Parameter(weight, requires_grad=False)
-                layer.weight_scale_inv = Parameter(weight_scale,
-                                                   requires_grad=False)
+                        weight_scale=layer.weight_scale_inv)
+            else:
+                weight = layer.weight.data
+                weight_scale_inv = layer.weight_scale_inv.data
+
+            # Torch.compile cannot use Parameter subclasses.
+            layer.weight = Parameter(weight, requires_grad=False)
+            layer.weight_scale_inv = Parameter(weight_scale_inv,
+                                               requires_grad=False)
             return
-        layer.weight = torch.nn.Parameter(layer.weight.data,
-                                          requires_grad=False)
+
         # If checkpoint not serialized fp8, quantize the weights.
         if not self.quant_config.is_checkpoint_fp8_serialized:
             qweight, weight_scale = ops.scaled_fp8_quant(layer.weight,
@@ -507,8 +511,9 @@ def create_weights(self, layer: Module, num_experts: int, hidden_size: int,
             layer.w2_input_scale = None
 
     def process_weights_after_loading(self, layer: Module) -> None:
-        # Block quant doesn't need to process weights after loading
+        # TODO (rob): refactor block quant into separate class.
         if self.block_quant:
+            assert self.quant_config.activation_scheme == "dynamic"
             if current_platform.is_rocm():
                 w13_weight, w13_weight_scale_inv, w13_input_scale = \
                     normalize_e4m3fn_to_e4m3fnuz(
@@ -518,22 +523,21 @@ def process_weights_after_loading(self, layer: Module) -> None:
                     normalize_e4m3fn_to_e4m3fnuz(
                         layer.w2_weight, layer.w2_weight_scale_inv,
                         layer.w2_input_scale)
-                # Reset the parameter
-                layer.w13_weight = torch.nn.Parameter(w13_weight,
-                                                      requires_grad=False)
-                layer.w13_weight_scale_inv = torch.nn.Parameter(
-                    w13_weight_scale_inv, requires_grad=False)
-                if w13_input_scale is not None:
-                    layer.w13_input_scale = torch.nn.Parameter(
-                        w13_input_scale, requires_grad=False)
-                layer.w2_weight = torch.nn.Parameter(w2_weight,
-                                                     requires_grad=False)
-                layer.w2_weight_scale_inv = torch.nn.Parameter(
-                    w2_weight_scale_inv, requires_grad=False)
-                if w2_input_scale is not None:
-                    layer.w2_input_scale = torch.nn.Parameter(
-                        w2_input_scale, requires_grad=False)
+            else:
+                w13_weight = layer.w13_weight.data
+                w13_weight_scale_inv = layer.w13_weight_scale_inv.data
+                w2_weight = layer.w2_weight
+                w2_weight_scale_inv = layer.w2_weight_scale_inv
+
+            # torch.compile() cannot use Parameter subclasses.
+            layer.w13_weight = Parameter(w13_weight, requires_grad=False)
+            layer.w13_weight_scale_inv = Parameter(w13_weight_scale_inv,
+                                                   requires_grad=False)
+            layer.w2_weight = Parameter(w2_weight, requires_grad=False)
+            layer.w2_weight_scale_inv = Parameter(w2_weight_scale_inv,
+                                                  requires_grad=False)
             return
+
         # If checkpoint is fp16, quantize in place.
         if not self.quant_config.is_checkpoint_fp8_serialized:
             # If rocm, use float8_e4m3fnuz as dtype

From 847f883232cedef583775d6f4e13baa2446ba1c7 Mon Sep 17 00:00:00 2001
From: Cody Yu <hao.yu.cody@gmail.com>
Date: Fri, 31 Jan 2025 12:30:33 -0800
Subject: [PATCH 445/462] [Git] Automatically sign-off commits (#12595)

It's very annoying when I forgot to add `-s` in `git commit` to
sign-off, because I then need to `git rebase HEAD~1 --signoff` and `git
push -f` to fix the DCO. This PR adds a hook to sign off commits
automatically when `-s` is missing to solve this problem. The only
change from the user side is now users have to install 2 hooks, so
instead of just

```
pre-commit install
```

Now we need to

```
pre-commit install --hook-type pre-commit --hook-type commit-msg
```

Note that even if users still only install the pre-commit hook, they
won't get any error in `git commit`. Just the sign-off hook won't run.

cc @hmellor @youkaichao

---------

Signed-off-by: Cody Yu <hao.yu.cody@gmail.com>
---
 .pre-commit-config.yaml              | 13 +++++++++++++
 docs/source/contributing/overview.md |  2 +-
 format.sh                            |  3 ++-
 3 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 77010090965d4..ae518e1902f53 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -85,9 +85,22 @@ repos:
     entry: tools/png-lint.sh
     language: script
     types: [png]
+  - id: signoff-commit
+    name: Sign-off Commit
+    entry: bash
+    args:
+      - -c
+      - |
+        if ! grep -q "^Signed-off-by: $(git config user.name) <$(git config user.email)>" .git/COMMIT_EDITMSG; then
+          printf "\nSigned-off-by: $(git config user.name) <$(git config user.email)>\n" >> .git/COMMIT_EDITMSG
+        fi
+    language: system
+    verbose: true
+    stages: [commit-msg]
   - id: suggestion
     name: Suggestion
     entry: bash -c 'echo "To bypass pre-commit hooks, add --no-verify to git commit."'
     language: system
     verbose: true
     pass_filenames: false
+
diff --git a/docs/source/contributing/overview.md b/docs/source/contributing/overview.md
index 908c7cb4d38ee..af09bfecc6499 100644
--- a/docs/source/contributing/overview.md
+++ b/docs/source/contributing/overview.md
@@ -26,7 +26,7 @@ Check out the [building from source](#build-from-source) documentation for detai
 pip install -r requirements-dev.txt
 
 # Linting, formatting and static type checking
-pre-commit install
+pre-commit install --hook-type pre-commit --hook-type commit-msg
 
 # You can manually run pre-commit with
 pre-commit run --all-files
diff --git a/format.sh b/format.sh
index 4bcd0be0c96e5..3e78bf9865f0d 100755
--- a/format.sh
+++ b/format.sh
@@ -1,5 +1,6 @@
 #!/bin/bash
 
 echo "vLLM linting system has been moved from format.sh to pre-commit hook."
-echo "Please run 'pip install -r requirements-lint.txt' and 'pre-commit install' to install the pre-commit hook."
+echo "Please run 'pip install -r requirements-lint.txt', followed by"
+echo "'pre-commit install --hook-type pre-commit --hook-type commit-msg' to install the pre-commit hook."
 echo "Then linters will run automatically before each commit."

From 60bcef000ebcfaf120edc1972a8136344d9bfa0d Mon Sep 17 00:00:00 2001
From: Cody Yu <hao.yu.cody@gmail.com>
Date: Fri, 31 Jan 2025 12:30:46 -0800
Subject: [PATCH 446/462] [Docs][V1] Prefix caching design (#12598)

- Create v1 design document section in docs.
- Add prefix caching design doc.

@WoosukKwon @ywang96

---------

Signed-off-by: Cody Yu <hao.yu.cody@gmail.com>
---
 .../v1/prefix_caching/example-time-1.png      | Bin 0 -> 34837 bytes
 .../v1/prefix_caching/example-time-3.png      | Bin 0 -> 37069 bytes
 .../v1/prefix_caching/example-time-4.png      | Bin 0 -> 41530 bytes
 .../v1/prefix_caching/example-time-5.png      | Bin 0 -> 39727 bytes
 .../v1/prefix_caching/example-time-6.png      | Bin 0 -> 25462 bytes
 .../v1/prefix_caching/example-time-7.png      | Bin 0 -> 33144 bytes
 .../assets/design/v1/prefix_caching/free.png  | Bin 0 -> 17933 bytes
 .../design/v1/prefix_caching/overview.png     | Bin 0 -> 33028 bytes
 docs/source/design/v1/prefix_caching.md       | 228 ++++++++++++++++++
 docs/source/index.md                          |   7 +
 10 files changed, 235 insertions(+)
 create mode 100644 docs/source/assets/design/v1/prefix_caching/example-time-1.png
 create mode 100644 docs/source/assets/design/v1/prefix_caching/example-time-3.png
 create mode 100644 docs/source/assets/design/v1/prefix_caching/example-time-4.png
 create mode 100644 docs/source/assets/design/v1/prefix_caching/example-time-5.png
 create mode 100644 docs/source/assets/design/v1/prefix_caching/example-time-6.png
 create mode 100644 docs/source/assets/design/v1/prefix_caching/example-time-7.png
 create mode 100644 docs/source/assets/design/v1/prefix_caching/free.png
 create mode 100644 docs/source/assets/design/v1/prefix_caching/overview.png
 create mode 100644 docs/source/design/v1/prefix_caching.md

diff --git a/docs/source/assets/design/v1/prefix_caching/example-time-1.png b/docs/source/assets/design/v1/prefix_caching/example-time-1.png
new file mode 100644
index 0000000000000000000000000000000000000000..8849ca0237c39b4c428c4ab74c08b512812846f5
GIT binary patch
literal 34837
zcmdqIcU03|w=OC#A_&q11gRn-pj7Fhy-1TTMd@7x1Oz1XD!oUF^ddF%BE1HQfKmdX
zg&KMZE%Xk#L4Cjd-E;Oici%h4-Q$kSACLhgYp%Iwd7fv@U-(Nk1!6*4!W%bk5G%fr
z)x2>7*Yw5>oCgHAZrr#r^onf$#tr`)in7w$?#7!Lc*!i?DIs2PV*&oM6sP_g+qcHp
z7uZ*Q*e_e8U^RnSRlG<zb009NxmWt^AW=JY$b?JPRv=D17wwaYzVG5vTMWKh*SwU=
zc=VirIf(4P>6<K3vi9KC{ibZ-dvJh!f`!{H8NyWLE!ta;oiaT#j0Sh{xC~-u@J>;S
z#JsDXY0!9c0QD5bZ{xy_cBG|kgwH&riB5^mMAIk_76Znr+3p0!iDZ2R%5$`>4rz*X
zgxfw=f7A(V`^xrbQ(ND}Ofj**=)P&K!Gr)`JwT}O5%qzEW7`GB0EByFuPOT1k+o)@
zx}Wdpkl+J+m0#WnBk2+|g;cS{EsQ|39(C?xo>;pt+jI{Gtkiydl`LixGyF}vjZYEI
zQH*yCDEKm=(qL7RXGKC26Jwd1w{gKC?aqjAr!CDt)&4{!i4p_m^jz?d-A)b&Npbw`
zY+0U{y*Rk?&Mi=0FEU9#=>f_EseEUQ@u0n8-D10Eq159}_ozEYF9)yid@?A9kLS|K
zjugpmO(k_`h){h9$rD*P0Og&`U^v%M0ryB2+VA4ECIe@lp2<kBvUMIBJ6cOah+y$d
zc{=??8!#LeRlh6$2Q8_%LaLg;k>qXE0OkpYn%Q@!n$1{Bu+mw>lGMhLBT`(CWc;?G
zj&UYc&_0GgZWTh>YBC6rrvGDd;K+ye*PLq(io4>`&I5*cmvxje)_vy#v+7S2)M2+_
zmu}J!Zl9x=TeFLe)s#oz2X~I11hdf7B@1x?-btpT;=HA@{IVz&rf!4=VWJ(=+d1%j
zGyw8RNelBACzUS)l~=j4agW|gFXC<`Q)W=ZVufOI*m`jmn~e!O%0#G<_#|ag$0mwN
z(Ss$5h!w^$oGC7jm^XKi-V7}pZW|HGWIgT_-<v(nQq;*L`fW1pckJ1x0i&aM_wE92
zq^HgCusbSqT%hsFUJ@E`plHJZUn#r7?&oR-`D9Wu2%EC~jV3r4!4WG4_8r@45vK8=
zXgeDlDeF8?G<@$?8=JmZ#g}V5DA?Od0(Q;_7D?8U?%8$hCIuuOQz!iR(Ec2Mtsp?2
ztrL}nxH;dQun-3pF$!vZ84(HXw9W+e>I)GtRJz1-8_ea+w|i#t1oP*hQusLlvB#`(
zV9noI2qzku#M_-E^sM|M*i#kB>jT<9PTf&Zny*NRapPvl9sG#9K!()9->e<lJ@(GK
zYn9k>d0cJ;kcYg8Q0sTa>(>S_4GjyL)&}u!WN2e;GUj36kD!hLI$eSDoufEAz7O%~
zS5w}KI<U{sR)ZgqjGHJg@O$t+3&SP-44dy;Nas_Hjg~2nzPkO}g`Qz<?T(&Z)HQO_
z=)?|Peiw7mx8(AMEx7FhwH3I04_D|;9qq{6sx*^B$=N&hMNH=jV%^#=DCG-&1Ru{K
zQkmsz9nEPt_RNeiI)h6AiHc2YMYwMf{!THm!ef?BSNMvEns!e7KCzl3FMo1&7+-4~
zS|k@3MU!6~?AHE#{!Q6kd8LTlB&2J015d%Huac_&IO<Iq4)o_<Gt<L@a}URN-Xgir
z>a7S?31bKrxQz+-Pi&&}NN92enejh&TpM)nRK!cjhi99L=BJqwvlhvvfFe)9?K5~K
z@<Y4$F`JRo3P-S|sA;L*qDcB*YrDBxBB{JZz^HpWvBhKn)7gaYt8Q^8tb?0xJX|%?
zpR8BxHVK9ODKXamocMQ=1G*-8U{$WxMRVzHG|8DX9oo%$fr~q>`laW4bM6;~l#BpV
zB!|UPzZe@?xx*gG_{b1nt}?TT=RRI{6w(o$_Bm$a(D5+O_RVcReLRC(W*bQz#j)Ku
zQ24IXh4rgMT%lTEU)b=DCXFN7SAuaCPbBW@%u#<eO3}$VwmaUPm?qaiMfHs{nI9M3
z2lw&DIB~slU=(pxlG}%ki7w;cUegGCM?zC+)Z$PH$pDJpK_qoJ_O<K>Kv&$DTgoXz
z_`d<4&xV;`(@KdiX8|G%Y=;qc9CHNE(*z9@w&Hfxp(Ohp&iEdqCU-vxnv-pJgzYcv
zrn!Hq%CC$LsEm&{pUh^VxeG9rQi)*E%GOn09`G}4y7iU5(lLTH*Hk6)yUq8My?LZr
z2YgS9&M%P?(ZCg_9a9(0(@88me?z{tlEo~F{wo~cVA?B`79R#-iPm^Ih9HLt{^<b5
zUlMV<2viy@#k6of$^)W3nHbeJ7LD|Mn;7O^oH}QF=Sl(F6IGwgjxHaX?M#T7ijdf0
zGF?bj5KxZ*RD92|C11``eTRZw_qJJ*EJQ3<=VZJ3Q%jm7?^(?U6fHJgPBw&jf5y$i
z;WC+%=LhC*1}qqO+ixEs7Ir$g6`fS+%{D7fT!xmH%s|n~yMH<iaYwHg_)#Y{x7Ek}
zb`RM=q=QKeMc0d@*?f>jadZ4;*_4}l6TK*MrgJ;Ym(X<0A7`V|u_5{DhkzW?qtp5m
zB=oc?r-__CouGjEE|XQLi?Oe4FSHPDshGqPQvL0`Tuo^qi9VhY#vFt1^xW#%P4LfI
zt;F%U#g@De8(tzJs0fO3=)(CC*6D73Y7r!?Eb3l2ot_J+M9j_$51epvy!X7jZTMr2
z$>}-x4mWqOR1)Pr*%${yywGg-q=@_?joF_Ld*D<oYWfcn@Vx)8*YJl+{(s7WY~(mM
z$M(G+O{~Am!+#rc_}?$|-7Q#yB@|P=!}dAvDhDMWeUbHuEmtA>1pR1HEKf;QYNn7~
zrFi6vbYKoI$Ji(4z+CK)9kbypg@-|8IkRzB=nBcWJYSZc8m=Y5u<|5xj;yAnJ@%{w
z^;22NSFc}TopMH@(aEFeqnBV~@WYnOh^JFi(uHSPLy{wFFWkGhv>0pP0b5RlN!;02
z_TjouhzCu^5&7Qh#yz>-nv%;Dr`me2us&Q<m!*O{I<(X%uxGB22DVZvpN_O9D4gbb
z_pt31eJ#HGoB*B`_QY3ly*Ec?>bY^?nkPr%UXd3EXixDpL48Tf@)axAVkz=h7nd4-
zr@M5hr=V#{)Rxt@3g)uJE$8G?nyJmv<ZMVrI1h5N6uf5Wch7S2-X5(i0+Gvubnjc<
z1tC2j0+RAECo3}tL;Nv{nAIev*Fk=VSJma>D_oJYS!XY_6Qz8Xv)8=*E>X%Tt|Zln
z@(P{k6^gTsB!@ipS)W&wnfbH6n9m0V%4*J8epkEua18CbgOmq<6LMoQY1_@i5(-aK
zpUJ%1Y_*zRWokLHJ{z}Bl6P9ya9GkP#GD;n9%NXdl}x;M&JIEyBpEXXh7e5RNl2gm
z#_fHyRMx`YDwBy0k5(4sO84yB{;~YLao#1;*-c?Sx-iz;({E2=+ap(H+*$AJTzyul
zLgZZJ_%tc2Wo<9;nb{(9mdR4ML{ffM+d0s4^_5lg>S?S2cX-ddl;4%K=Wbr^5ay~A
zA71P6?<YIUbof4-8Hxd{+EwRStfW+zG6&*U+k2dyY$eGdymbyojC2aR8Ct9;w-2*s
z+mhb5eVasa^7nr#2e_sHCv8Fs5GL+Qlob#{v0tZl)&f{*pP6_9FtQz`GGt_GoGsU&
zFBt|Qr6ewsadUQ}$Mhr^L>x^gP-|+<cFo}!siy+7)6NaI-qE;StUqlE$-j*89C;zh
zOb?xTY?oO6wPOh0$hnJ|aXA#--mj%jr@YL{E*30=H`$h_Z`T}lX^5oS9_bA$b~CO?
zF4FK+tpzkPpg09n6|x;SscHE^ULiWhzD2;)Y#E~~cwxPhj7Cgm>*~AX(?+ifr{jbE
z&;hyKmZJ&FjZ~r2pJM8b-lz3htqd6(MV@i$epm1fRB8!lmYS4jLbFL`eYLvCqX;XP
zk*j7=wy=SEW8V;EuMm`cGhnDoVyp0S`ZQeZ{rwCgIlT$L`mO2xrjw3+ux*9V*bE|S
z7{r~VSCLD)rUWtm+*42XJj_=~d-qtu1HVsbjOPB7`;6dFidA~sTlHNxMFBONs0qi4
zNHwV*(ImeU^9V8DgE8fd+|%VLbvdW=+{)@-={k`6^VlejB+Zo;@p2^@&K6@Ez4HlE
z-Ty3??lvLxel3lFwW(+OHpLKCqMmpYw{ijw{nPWywDS4_XF{m)1%eSGXaxLr>N!mT
z@=R3^iw(9HHW=_xDC=sYhArHU<b28455>v7m~{6@K;Xy-3>FdDwY0o@1S%cVv&aVq
zI+Y}D<e^*lOabO6l?N-z>YxQr3E;3CYW*pff#fzZ{vC#f?wbfFQ>a&Q<`M;inn<4C
z#k{tg3LEaM#m4^2tk$zn)l~yBo4x!1&yldc)dZ^`Si?4b<qVr$CJeA2h3X}p)g@tx
z)&flnGw_=FmpJ&GpnZ3W&+W|)`hC^i7BEEX9N2n%8CAG|1{JUPXR~cLOlJ_$mCgDJ
zOdJ|M(|5uUQ_D&Ws2BmzOVFWeHIZ>6g+PCKk=5kKGK*|cn0MJ4r)L!PpFuWv`0azy
zkewQqUajnwHIKLNh*ndcuYl`$?ISeGMqe8WwXRMsXtO-^EQN;TllBf<Ofv;>sot-U
z1Nty-HH^xmT+_;!f|=<M8-uNGN~D<$GL*h6yZHIaJZ+9=i}hJRVv(NWXp7Iw{>%lu
z+`O}f8Z2>{o;_*NSvnH^V5!SBLE(jVcInJ6$&TAOo9BzKMNOmj25)pvR5|Szf<4~I
z8Oiwa4KajRIHWVv&D!QKC#s+u_Dc7h97Ii-THQ?m!*_-oQQk*}>Lc;qO_x3V!}#x8
zD!rqeVTE6^-lybCxQ#!Qm>&#ZwxF-Xwv1-jQ&_N5O1Z>&DSsom?D^)n`?M6~{t<mn
zu30SJN(sS~OS9^5Jn1Z3#8k!$w$j4uGS8C}0g)<B#C82)EY_rfD-RXv(xVkMZ?GlU
zgEo#ow=e_=XsL?~H1YZVsGoFiNehvei9t^6tBX!&fa?=a%xI?^s-&9`Y6gu1Dt>x?
zTd#&ZR?_=bHM03sJ$=_{Y+s8!PB$K9r8XFhCK&AXoT9R7x^)z=-=D!)vw8%_bTpC6
zYcaN+E=24w)J+fm>_US&V81$!B<O3pVY<`io5hE;qXOGy7HTqg4Kke<vbWu%KPQz;
z+&H)njz!XxwVu~B@+oihi=)%9ZfB$2^7Z7&xIH6yq*oC}5y8I>)nE3{wihxIMraRl
z9twc=!ZftxHECw*R>^WDRKhK4;Gu_!xFPK7$Hs2Plt>D}RCS=Sk?)vOIZ62uWL|||
zbvK%=8PM8-d$g3H)1Zc9l9g*gItg|8#2a9^acc4CQ&EU(Sw1z(;)UNhG!*_(L9g1W
znPQgWwf1$ZrPk=2Wm;ya<PxG^^^_vlQgnPDLULadS?JI7nm!$RJk#jSwL!YZ(0bNF
zF~c|P-NC4x4oJ^zAsK(QnmOm0;CI-X)U&GR_jgmC-?n0CelJ(8^(p<Zt~JfsAxCY&
zgRdSq7_iYkG!?HvVB+Lb;{sC<g?QK`30g^ZS=n3g<=Iqfh>r9=J@2>T(Fw9(>xprA
z@%2^|?1q$2P3k*QQ$)I7+&Rp#hdCEt#9tyKWPI40@N+kgg?7@RfEf)5jhbGA>u}_1
zhpz%VJ@0mFYAJ3AOUq`Q(SWhUdbD`YU6;RHMyH+oUS|eaxh#uzH=nWG*-*u|+fW@-
zJf5<jVsBx4_b7{oDvy4j;pmh|&gveCjmW!I23~p|dS0oH7gt|h?^>0K;YYzssgewy
z5gwZZ_>M#3SE0$YD!Ryii54YGZ31RlDc0hX^IUJ-WEUY3F%>Eci5n~=X1KvjiVGR;
z<Hql6env7G(J~*D?m^!p$Q&graO)<{Y(?>(Uid<8q-Xlkk_w2}+?jTS7rhT{*=fdZ
z`@nx+B5EOc{$f@@-u~i>{R$It*EntMOx&EGnzr?N<F+k!hO#^&%l#qe(H!F%#6Ej_
zxaBhK>fbEZZ5kGtw7w8jEc^2T{BiFc4e2=F*e?d3Qa%CUcCkMk=7aLm^Im1`c<k;C
zFf)*vNmi9G&nI|LX6~EI{sfqc_zUc8`7C1rL@X7{B=CrLeZfA9RVa<7esVmbau5fK
zBEY!0>O4|2#*fnYb~vWbN{9}p{mItDGhB^&GbfsTXIvyK=FXT+uo)6(kzZ|$amry^
za?yRhZ@(poghuNpR2-VJ@t);=LHRx=cv#{7Wr5^zTJ93nEX9b{bxhvBj>*r0c)Mj_
zwBrUrviurU7c3_}rU(CWyR(oc_jNicxb8uEu|COoP(Ol4%+y=Bb>iqe)<I;ba{5E=
z?)xVXQGAxMvAnR+5ADfF5%--h*3o|rS2p0&L8KxsirMFD!^;P9O9fq`GI@<gyW|82
zY-B^tqDr4lc1q}0G2bA5PXCoRz1Gjrpcls?<Gz+Tl(#_E$wyt-9AzUJ+_UJ<=Mz_}
zVy2tP5mxFzv>#5+(y@^d@Z%OCuu?nC{IzmxtU|V~RQKh87-Ex3Yw~Lix_nBOF6?^q
zV$cgc_bU&hbZiDGhN8DzEo$Z^G#r*GSU!x(KSi~hYCpY0#K5~368oB7tAJ5+n}C#|
zvZ|L`Bi3r9--_PtQ(Egw`$wmz=M_)8iJ_+%w-J@g8XQK~Eh*u;CG~u0C$oJ*aI7Vt
zywc6L(bC}_r84h5KgSzqhLG8@*ka)I?VETeDy}ZmgXL_mU2*ZpviQ!rj)&9Kd3fdp
z$ks&jcemXhmSCI47_*V%IyseDRKrBqTR*C!63zudtlwD1hV|$i?Y7oC#R@iS{x&A|
z$EMu9uH_l!>7*GoNo_&=C}Yn8>7l|q(23iz=o*{lSL*v5+Kl*7t8g)VV-Nbucf|;t
z+*o6K%4)Ggc}5TTM8}H<hf0~pCw%4(`*p(L2VYO|GgO3H*wz&#;is0uvEMk}T;@eb
zx=T?vGlZCB<97Xc$nvFBeaT}NLNuc)ePP!FeJ@ZldFw@AoT<JSO;_pdD=T@p-G%-_
zYWX`QY<~K&T<0g2atVZ?oUCFSXM+Oa#A6Ysa+56w@l3XutH-p!DA|*Y+bP=$6^(A~
z;}WsPnh@haq;0wo+P?}Jg|{$Iq-Mh#f2(<%p=y>Jr5VYkjg(9|H<b;X*%NuIbeGOQ
z2&2GW!2j&-V!haEXjYdWGO6*%#mwp#g~?JNMFO*J=T2vyrbxb6<em-yP!bCd?ZpV@
z%%?5WpSXn8%5#?ChAhGoiio=_R>tZp@3BS=XqROI?iKz{T=~FcD^_oHt)~o?S~27G
z=1-43N9a73f?57%Fg4M8sXD^p+GFGZFij}Qe6MCc8t-lHCN1B55!r*flMAb3AT|bm
ziB%I>76P`D<%VpMM0o4Ek<fJ9U!EXkcn*+EzSV<n=YQ)7XkJ{X8T~4g*ns(xdCiap
z!k%1R6=J0f^HgeLaf}75u2%q-+|e;2MO&l7aX{4ci?}>f=3Jn?p+fftz?=qj$B$yP
z?O4;Zd=xcs!03g42Ml8J7~>rbD*&g3e8kLm&FA94Mp%Skrapf;T~Ny2%i4#0@@<Gu
zqK<hq94rE239nv7mr=+flb;z!VoP3!qB#RIOZ*a_U&_&I+jUykJgdwBBy+Xx?N(hP
zxs+0&vyZr4hp(fOy>~8JHN0ulW&}BHi~e{Gd~3KinQ;@^<V9XROV6f!2rv}~xxTY8
z#zr2qiO|<2@N50uOb%L%8Jn^(eqX`Ij_8-V&>-k;2QESmaV#tGlMO&Qj?4gf#JeCI
z84SmJ+*0*4x)(k@+uMYe;;ty9ro3K*?unQ#;j_d<8Sm%V0h&)#d@@FHoM(dB6gFu0
z3*THzq`$LsAv87ifz&(8&cBX}Kr@<>1>ny%;ZgKKX!KrAXl56`md1D$Nhl}q!;4Uj
zxbt$U*$k^C=ypJ->%O<tZi_Qj!N7<NjGro+gMZ#be&EEfxK%t={x5@v&Jx4`uQsX3
zXZA!M#5o%q<hD2^pUIyZbF?35NiM#zkh0a`8N)cXO%~iw$Z!n2;dz0gsoW*A9BmV+
zGsYr{FpUn*)j`70a0LH}MV8L!caL)hV_?H49U=N%wbq?WFA=_|w&2`^!$RRbf~we<
zOQzP%$()zKu@l7opjoZ5n7>C9g^0f=ykZKCVNQ>9)c#6-D3G;gXYNdtS-*2t9G-G>
zQ-2Lq<>1cX@(5$wu*i<~uRa6{Ix)BipxJYwUnZM)0XXi6Tw=7^zY}lG%daHKBa(k8
zyiavde82S*vHS_=eo8UkBcOhU&eQ#H4udH6YS*qz+X9-?J+Z|@*|NS|PWZ>~sqE)N
zC-QP&=`xKj&(UJ<4!%{7>I{Q2N|v~&xK|q|C9MB00~GoG#5vy%`{Thh4%)9O=U?ah
z@3Ppb`1PcE8i4dO)Jui0L4D57+i>`uF}-I><^#*GO7_{6RWw?57lw0oG9$LuJv!xU
zV<$5UKcC9pUoWe*l^{<$JhACMjQDruqS)l1?wNgGh;kYAAIU&itCYN#->9no`^c1h
z2G7X4xA{S&s+^sB_4k8*9^?<<!_rj!3<bRoWT{Q<yaZJ<<h4QQxK{16EE?iB#YMyp
z!L&YOKfyi4zslXi;uF>)o`1q2U$+OmoiE<QwyY%?tN!`bXDk)GlGWZ-)-?G?u~4o`
zP@hruNKo$qdq2#2Qu5?FWWx&=lJeR(|1Xr9S;TkG*xmsEHz68y1~*V|D2v~iK}B)p
zd4NdM{=Nn^+eTJGB(|&t;-1cwNQ7)kU2w}S_+i;y+gl}N<Yo$zm?1r$y2h3K*)k*p
zSuKCoI1P*Kx#yUerM)-w{LEb*>x5C5_;b6gtlDKsgDorjzHi(9Ep6=~!UkJCY-7*&
zlt|CxKJXaJ?d|kaE6lejlnEn7m#O|u2l6u+cEU%eOpu*(M;-?T^W5k!Sxz$UYjr#f
z7vl!OUQwV6&Aishp`ne`GCa*oIVgV|+-a{s-^12&H-*ci^NNO~jJ>M|=L|lIe?+O-
zC-%(~p4)gZp>+Fk>P$Z*`$f($=mW+8c}#g&F!@;>NWb&J#+dW{!0yo0wFE&ZWqN#j
z+qa}OsYvgg4kNRx(^4?H<;zrN^V|qa4^PUmc_4xDvjL)6`-`_Fqo=Z=crFY)X2vd?
z&$c@c*cjT}C><a(b<u#m2G`x-RX4=cN&mf5%KrfINwOexdUgL$qQ*!0n%ySbyijzr
zGo`U|l)frI&>G;>%{6m)<efM^5gmfh^N85ZW@))El1re=K5KwT&F92MVJqnu=Q%!L
z{U_a?0ymbe@&fTmdw}ikHXF=dTe{@(q~at3$5V<VG{lyB9KASGSlMBy7baf!Fe(X5
zk#)ZH2;c6UOUM!9hSiTF{XJ~FX<SkFyBkh2=KM1}E~H_i`GMgUbz@-Mp0+-Zw#L;k
z*A8*_1rqM8U&7~_J(lqCgu6XqaGu*<wkSwxx!8$DSyf0b`MTe~PXtc)K0VSZkChKR
zI<O#d<l$kZ$KOuuecZ@P%QpXm3?cQQmlfq8=_T6gKBR%lvmoVk*t)#9L^Rr#x6bA5
zCGfRv)IEg(TZeQqyaL3ePTOpIXS)yBk^QoyHCcNd2T9H4l5A&~=kUl@q<%?5Q9x7x
z-|~;HsvI+UeLbyvemj%7<xbx)ThJ4eP|~!q%L*Z16Y_sD*dRgL7}oQ{J`q<w@U*D8
zoRdkbJkikZ+rbFS@fJIN)U7@GYS<=y87ID7pSbhuB|L+l6indg^DRpJ&6uYHAC@h!
ztR$SC*Z79G=Lmr?*KjCbhtjE-Im8Iifzy24w4c-DvT`4+UIE!j*yCgH5aP+?5GfO}
zY)LmDNyer$@-Y_VL|pA2iox^<5L|p^Ae5_fzhsH+-219|5yXvMG-cCys`Cu!39+EO
zpPING55mzxz`CgOo|3wxH`D5p3RZL4;8;%Wc7SkZ*eBkfso|v+S$s{mA)k}HUNiJT
zxrxbo;k=6!eUrr`N%zf1V0W325=hciSeUT$(lOIT_u$6|WN0k6(<X-Is)E0t*T{32
zwX>F9g-ZPDQ@BRU%NB8$rLcKH6KgTwL#p9^Vcq73c8&S>R4YW9(d9)f=wCvu4}UOA
z%Vb0^Kp``v+NJ#~ZJ$6w{gmD1wS0->BN@-~JMaWxTAUlLNV|stra(qV$vfCT*A7vT
zWUSG`rXAQsOr?B|ZB|wI0Byr#bP6Oi8LA^zSIM41o*>U4t7!m3*-Np#pOHmL7ieff
zMwf)T<V75Rj2h{*#q-l-685*R2lZUXCB=T?V8wd)hAfqz33&{)mZEMH8tT|iGir9T
zFBH!H4L-+DLW9|_J~lLx97qsLuW%g3E!c#Y^jXn<G~CCEuuT@U@ah48R#R6`?`Jv>
zgpvDKTao?is}kd@^Al#Pksi$26h|0Vpc*+SInLr@GMMp$;=v~J%zAc3*%;Spjpq-f
zpWXvGGwBiaww}yhVo<eMBGG^Z($!alVH$IS5RaSMQgXZ(+O^()rRSY8eH-){c2h`Y
z4V#0<x6{C0=$6{^bSw*0QpFFGdV4dqHd>rCW5ItHQ&q4(9Z%xijlH+iTGJI<0~N)u
zB`i3DMNREY`JupEqcgh?6q(R(Ihn`Z2JZzVCEj7oFS)L7>}C0}fUxah!71Mp$`E*a
zCm$hH7sG2k_%}iXibDr@^P)4MCAh0^kv?^Yw%~D`u!z<L@8TNVYjdEQ?*_4qp9(ZZ
z;AE45*AoRr=P6JrL=TMNY^c%2m~hPiTLq0k!sed$j=y7|9tC0xOCEgvYg3@d&F7}o
zf1UdnmkM^A-ho=D4<^?-okyHxJ&kl%&Dls}mDf%=TVmMa#OJi*t-enPqb9-Y#%@!>
zaWNRix5MxhyUT$<G@029$?1!f)dnqk-u2Q^9sRlszqg{MBld0l5ezp_BIe(-7S){F
z*S>$*K439bA!SSFSOjKX9`>4hSaiPt<a_(@{F?)4$pl_vX>=u4xv75`C8%|gb#Y^{
zQWqR;QTLV<s!P!>C}8Z^p8SC()5NKfb%vJTZ!<$&(hI+D`*Fkf9HK-p?%1M#6?Y_h
zMXYQMhYc1<YHUK@HrlnA5&yHMOJ7X3yt+$?`#nw9+H*15x2^e_j7zLdZF=VG)WmPc
zZa$R1Z$q#*Ash$2#m^$X4-@nfm9)LlXp+9Ny50Kt?CNdtt5exrYO4sFsw<4KS8dh%
z9`3A(xXfGoz+Cx627$3YxY^v|Vvr*Q%+%_4_TGlp>FG!@w#9O(E8HE<y2Pj|5N=4E
zs#sA}lqOrjCVCLOhpDSG-Cz=P*{}gA-}9gB(Qk`BpK7Z{-F)^cW1ug@R6BK`Pu6r&
zNXyqpa{huMX#fbEK)*};0j|us<`IxzVRKK1pNr=X6vv`uo;QkX_EaMuEYgbu<s($6
zCX?z}A{Mhc9Kj(1OE27kkMFX)J`^#{<KO7xZxS>zJCQ)#18bdSVmRS{8(<~mC-Uf)
z@!PRpbwnc*x8X4&2duGdpXFK}!w>uK#&8o$@YK~aK69@;?<>r==dI@EjRL7Qn2a%;
zA<thIy*m9)+pyy&FG3RaawkQckghN68Dm%X?RnqRO@*KO*{_$$%|`}SC|qvZzB-MH
z_q-O4R3k0~RqTlus|!Ap+>nGEPtR!3_G>=fzg_iu4AE??GW7^@q9U;=)<wfma~iR)
z4!)7)xBHU_ek`oKq@}BIeIaG?tCY~$I${nY&ao0qIw7Q)Iqg0#tJ45Oo#S;f5*yLB
z`Mx7TGv-uvSmSj|xU?Tl+B~gaORKbnN&OUjcv(oZ!7GBUuu_m|=6bf^>1k4(E-0nn
zUL)uQi{udgZWb8eXr-_hgI>l|#xQKme!47gd$@;8*wvRs)GN6XK2-Z`a-5K_mJ(oX
z34Rblyu#_yLn-@I3@?_WmvjcGK{-&ORq#Obc=&d(^8lX<FAR)PHN|}lDX~u!z|;6w
z_NP;f2?Ryoq33lKd6dqmT;RC7XAUXz#5O($K7)>7nw8FSJBR+Yn^VEgZD0~Cn%e~a
zD0hn*sruXs8UWr~_(ak@>MH%_O^~}UwZc<c{Q0wBP9OAV{Fwt`AvR`-0{agY6hbCa
zsgJ`?F!znB=Y8asbS%j$6oEb^Zy0#fm+&2$cq3L>@k`yOjRN&ZbgP_0Y*Oc&F{OY`
zoLq}&R7%tv`>)%y@0TJ0DX1Xe%PJv+TY{zHuP*tP?A3*jVhe6m8S;`3TmEX20b$jh
zlZBPcU7+0!_33*)q@EinvFR>G5sCT745bW2^-T&);(iqdq4OZ8H{+I(14tYTTkEm)
z%D`-&ihEKg4pPow<;e3Ap}*v5g|=RL^UQvm#Pfnhc&%V8^zVs1n%oZ)<X8E8$1QQw
z_Tir%SMojVjqbe(@0{0d!y5i&Q@caEt}7JLZ?I|V!R-Lq>q{kmn<=;V^BUszv3zfO
z_1!8xvJ%RsY&Rss@CKSz=DI|Yd!b@<c-YJ%Wf!)8%~p41__jVD%TMmSjr=#RfkjiO
zP?0;xZ@VPP(Fi3E{89#~$><0U$0*puPTQkB<gXjc{}uj6+nP_kcMV&trmxJ548BxJ
zvFj#Kc8F-H{{`VSIhVDLH5r%V_c*eC{2TW;PfhMP?uzd$GA#A5URHahXP!zqTrl1T
z&BWqwlt``dtr?wvXu$vKGyZ${3o$M}$%V6K=^AU1NuhRubM{`Ada(Cd5^?-P4$ued
zc7L26F#VII;jFt@LWI>O|Nr&vr2Ltxbm+YU6=|ubYb61hPpr+QWtwsXqr>rLj0cGy
zx7^>vPQ>*LMdLAO4;FsdeCYhh$f-*wbKNCTxoJp$I-Mei5vIQIY}>_BT8vVlkiPjx
zL*7%E;fkR{wa?J!F;i!5Qfk&+%(BZnX(FJUJ>Z<J6Ik(;t(ezYT8e!wd}wzkZv$Ds
z7fWcUNto-q%`tpWz|GNHPP25C`gX^)ui?+{JzWFi<-a;`mF7>h5Dra?GuOwwR~Ptb
z5P#xVa|S4~tDGY^brz^e{kfv3qrZ{r{Coroif^zpXie63{kln)Zt#r2;TH8HBOCZ_
zZ5|QwBh2kXCOmu1m>I)u#R!`#WM*Lpmm;4e!*gC46ivo93)M*-<&ibnYtkv!PLkT0
zfJ!IDU9qax>(kf2BXxF#&&c<e>l<k+>Vy3UEnulD51S-hS3k`O+bdtkZR~vELwuD3
zRqSm{cfe!T&eh4JLkFeR!EgB|gVYR_Qf!|Gc1e}z*GTb*_o)S9J?Sg)h|XpMu8ar6
zG<5~>H@^iGPy%0*B_@52D@Wg9sKmkZ(IcFFKtPRMU6!bU7j^bg5|*ATJ?EE)`TTwO
z)d@M1*5G<R#Px~RkM=woD^x&1iTH9Q3C%9GY6N1$ydZD<MfzxFq4b*n#Kw!`{h>yO
zfE{rXnqkVM7ZF;KO@Y89#jjxZ!|yn0WnZfB`m-p(bhmM^vVBGHvp|Y=k92%>H<Dz(
zZBZ-m`ddDA>E6c^19I5=zpPZp^@s9q7j6!^9+Mxn<TL7CO3U;&a?+5m`jrdOePP7k
zTz>>Ren+CqRfzdva0QB8p)ZXC`dDE{a$q9!DuTZ^bZW18L0-Tt1MLamcT|yn%J$h-
zV2ux~GlOZ!g|K$Q`H5nTXpS*)3r+a6{+WX~xmA=gJ@B;trb04Wm-zUegIUD|cRlQ|
z<Xm8$`*zub(nQ=kWlh6gawO5Pkj;hek8Z~VV!lh~>IZ7rznE+G(RKL70hebcvEz`0
z$7ch14T)t`tyc@ps$BSYR#A}n1rLC^_$WPZrXPLa?9J82tJp`)EFV`NddI-&6qlR2
zII${rZ|FSzh(6uxejMC&E{ud2-n>YFF!cdw?gDFa?Q6XZ>LvUE!xO)jR7!@dPc0-g
zFh&;F@2nu~=T??-gpCKEakqdmiV=0~9JmESA*-#0VrI2Ne1}fD)YnR3PB^drf?tA@
z5_XW1Kt;=&3-y6yP~S)+em}sGmSMi83lvwb!H@QbvtR}Se6<Iyqzz*vhS@p=^@Av=
zLvOEwEUsMZcO-%BBu}je;;buqw>~+_UvvcTV3&smI@11a7Pt?I>me4S+1-nlq%FT7
zE!EgXc!u@^OfRI68dq{df=^!LhI6O{4gZ~o6E?h7(ON*M4&+ASKy4=Z$>{o9^Z>_6
zd3z~EEB+FcLHjM|&rrR~=~x3>1Dq2y#vZ^2S5I^l;S)DD{Nd!^s@r!H9!)V5V*h)-
zMpd<`Z3>|%l3^B##~^coxuWTQVn`awvNWK9*zr5F*z=DeSI1r}sZn4UqS`$6u$NJe
zZH&vljO=UqdA0)kjhZdn^t^jxtB0_B-s@s4aDNsz0>-!cPI_@`CU3lbH=0UlZ)a9+
zfPY5@?KN&Ae$t?j)G^qV&x`iYc2+iX)FG2J;n<8C;)}*Fkm}%6M94p^$4>u|158IL
zk+^R}O;zOfNGOT_hS+rPDLT#Eo)-}JA(bR%>2Q=yd3Mu6sXJ)6J$MkuA_*`Xwju_F
z2Qmb@n=QI&O1n`4|0a73h~E~PpF5rYO5gkfD}Vr)nYx-3T{i=b;<>vcU_|Y)9k-dI
z;7EA>PNm_n6HYd6o&3;ftPEHw6BUibAwu}ifATgWRQSHjAQ2z1vlzcnC^fKZDJ_!m
zjMrgk`3vu)ECcQ|7`-#!c+8KFb&v0x7#K}~xEJ<iRL;5kbY!UK?J4>+;3Wim_ZndG
z)U8b7+D&0Ae^*puoZjXYv(0Ijy&MczM7XIw$(UjzfKKcWx*G@<*_j5DBs;AAeZn9j
zGzZ?{>*rbO(-djFtR=v5wKxdL5)|%o74T>Zhls8iJ1!}n9j_=k%$|>*VR3~Z+Z#Am
zk{N$_7(+k#FbsF&=IT`{C7lWWy&6XvMW6r@XP>3q7{VQV?(B6&6laH3JFxSZO|HbN
z7T-h*T0;(d7^TH^`HKu?r~syIrTghi8FyD1D&LtW#MH(zW-&40oB2ccdWS^Qx!PlQ
zXDMQU-}JtTj@3TL21=~Wf{%U2KETu>Zh=6;40wZ2HNI#zs?wmzxxL_Vr?c-<?_Xh(
z;yAMI>I@5ve?#3pyr>L)r=)4<umpkHYH~38`RF#}d)-6g<nB^ECvC=%xE=(|;+c&K
z<WLGn4&qeXwy%|e`4$Gex}Wso5DD{E(cOvS31%s<fTwMPpk1%-W#iuK0r9$5;zmEC
zBlStv2DIkz?~>sAaaqKbtGepqF84Yn;;w+}o|&7A3ZfzgesFK3UhrvI+hU_Go8&gj
z=mXoEcdCBkv9zn`9Pv&rDby6=a{30*Dv-VBWLVrh(XbmB%XhX;)6~V@P=MB_Y~no_
zcpGXc-*#$mIm%XxVM|xEv%}=C(L!qyffpBN__WV_pQ!%$saWOIRl^8T3qpAq%0QQX
z=A8t*ln2`_&L=$RQ*SH@acfTwpu8)z(t^7pjAMGRe)NP^FCtA2TMRp5S)~N|u_NK>
zq}3!VkH3jtg^C7P&}Ll618o$N`4I%lp};uQ#8N~*`gdvTF43Nxm1Zwa;GoCe@JB3K
zEXrT3Q0``Kll&Vfu=ByvPerd>Zp|NFwdaZAx2?VaRO!3p_W}6zUP4Ya*1Y;fljA4{
z`0n0P>D@nmyAfY$JXu4Th``y8VX!rnS$AspC%Gq&+G2lT`z?{Z69vQZ65`0TAWr$!
zly6GSUExnw^zKxa{{RW`4K35{`}Pmb<Xg<8V={-8%%<?nyb?QZfJ4!y4ELR|t;y~k
zD!KE3S;fb^_>(=S9yU~d;QKM+Q(jNc65op+BlDcQ6fL;9uzBX4Hc8dL>#jL{c9l5f
zwB}#M7)<_REY7KKk_OE_kV)wlm97i8HMHP$ap?LkZ;ccCQ615OW8ByC%{rfBP&ap?
zy}lUwX#*6x2`u9|Jg{TLT&%+vZHf`Qm+y<Rapl=yIf{rT_)}%4cN!qvTTyo?Fgl9M
z+}TlAr-xSJpmu^(Nd4`6UXWYe6%Q(vI|b{KDai0g$~f!|YB@s^CECH7BZ}mUqkdjR
zZpWc{pME9pUZ<#76Zdt}KfVjlTbs{o_<1W=Z2l}a@&Zp$M<%#dSLHC5+1Z)n&Z!0q
z?6QcmOaD$?z+0fr2ePDE)0ZIJdqPn#m5AY6HrACS5tTYhyVuq6!X!R!;oe9+TW42N
zRsrSFBP;9#x8;7WQ<RXC$zt5XV`f8T5iM_V?3}ZA(|3s{sLAjC;{Jm;`&td-QP5CT
z&fwD!`i{2(zIyT0YSAN}l`!mMAAZUU8s<`yL5?rj#Xc?j?|R~!sGX_-)W(z}Hb{J?
zDU_TzLVfYIgi_}kX|l8>-H`XO(I+)JA?77wF78%ah`_Ok&QnKs1_&udi1D1c<X?c0
z<d$Wr6<Rd07Eg9h@=i7v!tTO=&#^tlZZ(ev1n3ZZ`3ZqD)eMY+DMz@n45Wn5K~c##
zW5K7DQ*q`qcgyF`3xKdGwQHo$6(G0ndZu3pj_+|X6~4J@ZORAe`pX+#8O&l)`guoU
z^7TB-04q@h5_vKckan_bjmu=ojgytK3f<eqHWGiC@O{e)If3U;VmWKmQ={(EHpM2-
zp=x?w_h!nFGTpS$MMdbaZ<0<!c-Wds3+`)WyQRvHkOx0q+x=mGq1|;QlC`(jx?c#W
zWkrW(sy8iQ#j2>C6Sz6R_MJuru6$GWFiiv|nt@Pe{3PU}$_ehJh#&!Bcb7Zd>xWOb
z<BshKwd+#fYo1Ffsj<bj`a!Y@HWpj7wldVNOJWU~r<`7OXH9#B@<M1jQXgZnosCE=
zl2b`zL9!@Tg{BvIX~iwDYmU=Ezga{OMYAqIQYpcYLS}noEDUYw3{|pO==mgm%2e>6
zT4O086x|^j4X1%3qau>1Gqvww6I&mxDLt(Q_#K9*vgGxCc;jVYv8AvRj|#<ikKRdw
zAa6+)4k4QG-r3$bC57l(2FbF@k6oT5Ot1q5)23xbMY!nVIazddDM~(P|BGRW<y6E6
zrBmYjI)KoXXRGeqG>WIbnw+*gY@M}obus>UaSNVoLLm>f#T1>~UP=w!nqo7unb&nM
zbn_M&FczeVen>KmUN@V{({=Dkp+pno$8q%O@kaC&P^)A-;vS%dS&Mk$0s9<zqUKz>
z*%aZsW+Od;)k~Pw9flxZpk$OX<5!K_F4V_C#>x>c=E;!1*^(>L3VB+!`ctL0KWbd?
zqJw|a!yFSQM@^vbxJQ+dVJsyYXZ6q1Cd_ACDn@U7Hl^ZEsP2`uLyn^ksGqH3&*3`K
za%Sh<3@nv^C1!+=ZA{*)|KXXkenBoecCz2!7nI#v1=V`nU0fgB{~IJU2hHm^*DghU
zl@~XnMNBKO!=p%Y>kiEh_|f9o+am3{-<vy8i!JcOyo~eY?D@o{{VJ^7zTm|G(*L(f
zTaVv0C}~RM+?ja(DtK-3ZN$MQ@o!V%?{h**9*wg6f9_3T>v!k{!j2nHz2n<hXa(GN
zOyZ4Y(Y_b&i?O4$>&M(mr`X?pg0SW+7b4HGzuahTlOJ*XoaZTf|7&nDA-uMq5;Z|b
zCa5TNdJ0jT<ujwUExZ*z^l6f3)0}wpfd;MSd9YDwwybiP9doV8cqeUfd>CzcMqa&1
z8&~UM2>%dvo;zdBFwxk3ftFfoWFKUban+4RMl85Su*=lFe8t>UC)Rp((T}o6WMoR7
z|M-l;TrFR&d2_{Pw&G~S7k)_iA6BA$q$KkWQW+_jV0}B5EANP?Sy#s8t_HML>!1SS
z^NG8rT^uDad`A@x*=kwyP8_3ye7GK@1@^T#CkkSxq<~%b<-r3Ip0BX^{Ldb1f#W!0
z1^fd7K21|GX$AW*2`|9=eySnN#pRSn%e#jXUbs1rHI~_bZ%{f%;P<kEFA_pVe6b#3
zhu?i@@M4SFwJK!&QBH_0c&+xQaDJ{li8@Hc+6vK7zgiuHTelw+{iNgSthWeNXHcF-
zLMNnpHmsf%R*pjd6HxCw7lRFkm}PEZDfbDV+eMOzij&%S{wf1iXW8SSYT*a(?!Joo
z?eKrOK|4^^=J(QfxtPBO|FWl0?cuuy*NCHO%YEQRe>XU*#lr+(+U|d_f6NrIpo?wX
zVX-NVE9rs~7ws82&X-0B%m#(UWJU(6_*-J%mR)FEhxokmOG+EGDeF_O!@p2bTU6=S
z{@#6JCoa0|;2wKPd^i91i7V9v{kZqrDnNMQ{N(Mk^9mC;t=mjZ`+vE$Yz{>>a5xBM
zz7oX0vnA}525hvv7v6E+{Lks9j_G-I9(=lVD<rUJomtah$!Qps^*p*ItD4iJ@#~=f
zV~78-cn+&<-Jcj#lqE)}4fg9~t&iDXBA6m<Be)@RYrrNr%dj72i%w(uWi5TBBHLCL
zOma#^!`+YC^2fd6C$fmw!+X8lshQrJNGdbXj8_$1(FFvuG8GkwP3o{z+0VJZUHM-K
z_AjHajUUfA&b%7t7(lLle?jB%(W2|r<kI~^r&{QD<`(Q|??tRv-hQ8i0VQ%)uI+dA
zc&Bd@%sQO_<=upPamV3~8RqoxDXX_$*#K6j_Y1V7F2@+u$L*QiqE=F^+JK#-V@mZ%
z>}dTi2a^I~1%5xtFsr;3*`?tQCd6g{HWcL(kymhZx+;?|O2W;15Dre*ih8YU84Vn^
zcU3`qc)kPLb7>gYP5UCg9*N9_v8%ED^F~3gmJ?0X;jtu2GRyVbD3z<R>*f=bXWIME
zNXJ<%xJxZ?wz>km@xCSaJcd|!R>)^NV<1-~gB@D|SoNR3iR5fO?Q}3q@j3WRI=zdd
zkddH=o;mNch1ZTAz=i#kL6Tc7z*yEqwIpM+{DUQD9?$}?7@NS_Y~+H5=gn;=b7odR
zAr}HS{x>)6f-R|?oyNbgh`Y^6PBrde#~<I)mEA8hSic#Y^t)W73K{#k)RS=DX<kkp
z9!hze&K+T(nrG$k_sFAS`s(kb6iz0mOHbWekdF-t7KNv=#(i5gWO+Z-iO63Zn|;K<
z_u<Xi+`fYX0&y2MvqR1@)S0Bu4L}q<tSHOIW)+LFfNYquOzNW1k>t2wSJl2Msd!qZ
z%LFm%?bLQ(q@}Yk%;aLO7dY4cu{_FAqG>6_E5|*9eQ%Ps7+V;MZAR8mL%j|j?LLxc
zBhJGfU7T~9QLx5^0${hq@oI3<c6F<evO47S61Y?dN8PqEij(9!q(CrN&pnI;Gx?n#
zv%d%5#vW*RBH6@4We4x8r&hV=Y!Fn8+vG+&u#rkf@DQnn`ot_VOo#RZG)XFpI92xv
zDR}m0fB@b0Max|U_)K2<lob?m#s<IZB4ne1<><v~mV;V7u@BAF_sgWFjvZuzSgvgg
zc34pAeDf!8ZIBuDikgW!V*~wgz%IM^InZ7(!~NhpuzxhMguo&&-jm+&pYo%JoxXG(
zw}7xSs!b@BR2WJ6UbQk&(H=r#7Cd}TMokRS4`6BZmE5t2UnL?oANw@p)VR9<Y&FC0
z+lR%oWsq;G+U4e|hB)w$0hG7TQ)5lZT{{5#0Wgi^Zd9RM-HC@Ofz!<7g`f58mSlNv
zy;)?YEt}eGu{uruS~L6|_njmr<Ke$T4_nJF5+~fJ&}|NYx8H($yLo(0;ty)kn1>Oe
zZ<B3S4mNuoutN&qA?Z`3I)<imOW@kD4A~LKt%0*FDM%7bXd?<`!?u~`@jhn4N{wy1
zNe&1j!Z+<zMW{ipzG<o1#rw|MuNDyTh&VJYEf9?yW)@#P{L|;l3e;!YW?N0R^4Th!
zyY0|Y(1YAzF9!*>e-BCQQy#hXj&on6Q5#Yx`C^$q<sL&8YEn`Wt1fdIY|Z3Uo)gRd
zF5xY8A=vsB&AIH-PE-2`$-)Cc$Dp{7_Omj%?rVdKY>jA<?I@z*+s)e<nSO`%v{W47
z;sUPk64;~PVW)NH-T4Szw`M@ii_vxIpUNSqF%9vW@wncCq-l2t&vOf&MysnX{LJv*
zsrRdQe#IJQ@g~U~q<;E?frsC98F+k23R>X1{FG1OCLx+n-{ruDkUUsu*0@lkIDKhT
z0K;~bBP+h1PUze$k7ef%BJ}>*{(wm|j2Wv}agO{Thh_{PHD|TbbfRy5-eV?(*VcX)
zgSaaSiD{e1Umi~i(bkr-{zn{b_*bn2x?d?2j1kaCsW}gbbzPqP#3`Zlu-+VD*Pzww
z+3-KTbt~6^9zlyUWm;`-XyI282vR#1TL=Q4-M7M)LmLpgF<ZnR$6b3NThA(sUt;;<
z>CCD=i|N5h#n%<U`(mn$C*ByC7XxMIhU`Dx`VB_~ZdVlXf{_=(V!L6CU_2R*+R4`X
za#vJmdb7OsaDFwnA>;90_K-@3+;yxh1h>ul?y6v)*8<J9qU7s}qu*Q}fq4=YAcJU#
zCr8Av%|A_iJ4mbd7Q2MUH^E=PDf~(I^^tD=cp(U{mYn^Is8RHYzW4vUO}^+5_cmrt
z<x)TQ%~8_cZZ|2xQ6puCuhCOB51F3ykML^Hj$#ih&8a|d`Z0shbOhSKXZmazt~wU*
zj21Z@?Qs_W4<NceRIlUS*=Bdd%)TrTwjMFX_`$YoJRu}HG?|sx?qufx)@>5L;F|hF
z#sVCJJ<<OW&|ddGi$pB35?DKd&xiJ861jX=v}z8?n5A+E{;cC>t9lYeVY=v+`os1Y
z6#SuSjKA5oiget~^l7$go>mpJYcNQE4;52g%C6GNm9^2<x}J;9h=mz2c`yU4I^P$v
zrF%HZjc+=`GK@WkeKe-kD6q)s8otO~uM2!k&V%001d3P1#KC!7$yO#a$5Jj8mt^u&
z0Fj0BPZSTsp1mP@MU`Wl#_=D~gdJGtvR@wUZrm!_!Vb!MS6%f+8h4~vWz6^-5nYMt
zE!7HM1BqYfPFbCwWWq-{aCdi&_^4Y&=Z%gmBqk^Rx@2BAQo&`X_9M{)-TIjQ+zJeB
zMzy*9`1U!&YsmtNTMB99>U8zPL4PoBY(7@|Hh@=e_l8)7Yb=T}`ZaocFK>}HTPbds
z9E!^l>nIj0{6SY!D*s^GK27Y?F4fERRNy2~$dO8Q-)lIu`!7p~KLmRPU8VUiQ+si8
z&9=8xWPECC5y}ct^vm+!E?n$tzOB$6oqw7YjY4~`1-ng)EsjlC@Be!yd?lfYmT|BW
zQu&r`E+Cfv*0aY$Cyy=D$L7FjtxmxN*xXV@j;|G4pIm9%^!?e>!f&Vh1><^4ELi_z
zXKaK%-`;3_h0sTgdP$O)8=MlAH%gpI3haJinIVB-hsCx&A;eXh)+Ya~c#U}fC+c$z
z7~h6-Pz$7AZim<vv{zBH=yHnG5gmBk-;hJhQu$<2Ij`5xSejEJi~fMgSXOziO0Y>*
zr&z{wF@D;v!mtc^sO#b=?Mf;2)N$qFv{?3dOlHVWF{v?jwm<LDP;EJ(2Zs+(8@pKr
zXCMsSScFq85g!=6`--+N<&T<M++#d!xS~}HD9%aJ)UNUQ?<h-lk!}Pp`Y$dHv=n|t
zPMK=ypeKq#TVir2i4teee=-dj9gBW1b9p;*M*-G_(!M4=*O2-@OO^jJOpMb8{eyW^
z{_ir1|J1+#RY3lqCd_}8Mf^YBvIWhvH#?9x^II=8K>}MfG8{59&#?SEF|jsVCUmKZ
zN8@oZef}@@_AeUsAH?z+RucWD85yzyA0GUtDc`g(hCkeyWGqRTb+Ju4EV`PBxU5}W
zhd;3`k2m(|X3=nG<f>mNMg`3rg?CKc^Q-qEjqW%PxJI7Wg#eWa24e@xG1XG9g8Z(5
z%ZZV?T6lSpLQ8ui7~L`7s@)>h*QOIeo%z0IIf&iYgbW)#zdE!81_?@+*s7bf{%P4=
zkTm1%M`;XlAH9Ko@pay#X5Sb;|7mGUlb4S)4J^{GyS(WwC4`6IUzl#hR-AOi*-Yf{
z!Td<%QL5*6qV$1=r89PM(AX64=`m8(0G;vS>ND!G0h+86O>`aU*Kp;3rsUV1WjAi-
zzEvtKq2F0Qx><a+LP7`RE;Z6AJk1ci3a5AO-`JkVHh^M+Rn`uH6Z?y{2UkoMVEOQ&
zA46rJmbsz2=vmUXvuYy9a`sQS;9*#j2$S>K>9yk^L+`d3)c~tsBOHz6dYV1$uHT!b
z#4o6VB^vG^I;Tr7(DU7Aht3r(tb`Q`2Iv|EkeHMZ@|~BPgBOE0P)-E>B+aYsH21_u
z`a-HpEk{tc?#JCjTBoAb4##!rjw0$U?0O{f#_!)l_pK+YN*cW8Sl7%>V%f6d%UjMG
zuKkTs|KC;o;%*!-6f~;=`A)vy*d5bNwTs2&%B$=%2kq6{`4GuzF;h_MnVIL$=;MNc
zmczikMo`;Ti7l@_xSa^!F5L3<3#&|*BjCD~6i+Bt=slJ=4zc|qA-TL5;VA!?<k^Y?
z%4eW$d&uu}G3!<OD&w~I#pi=Kx^1Ui{3AwBgQcS1QB;KOZ7pp_<=uw9didV!;Mpc}
zRTu=zhkE9bI4qI~><8vXY*ga%$Pq(Z9NR1Rv5ezMj{^4Ij3}sTUxueei(JhkPv?9-
z3F(Yd_g#%-XXv?Mxr-8(jD#K5{m}Y-TAC6h;|5X}_MVHE>dn-}``#f=Ra87CD;`;U
zt-n`-{9MJ)@@rlIWM&IPPFxa%I&ds%d<N(KkLtcVs;Tr{S46=P5D=sn6%~+Pq!&>E
z9T7y7UIIvyUPA{#=>aLyi-3Sg4M^`DDG3mI2_^IZA)$oM-RL-He&^0_&N^qUv(~--
zCoB8g`z!k`&-1+RZd#dAafV0%SnX(#o#X>7PEpo*r#rgH3!a5B;AROC^i5&)TJT@i
zwG9R0sGrsm;3Asc`SJrNt3Dx7Rajsm=&$&>>9jX|IUNL<NJasN&mJ~juCavbg?N3>
zikrzx5ch~p8+09!q}o=H)IFC~j(_J8fT+u5?K8;$y=mguv)^>wAlQ|eriTN0zpIht
zPKtLpk$Q`nrg*_{gYr!L40<Gd<)1A-lhX?hic2Q9q_aULSUos@XPx!cvs-80IOr{(
z%4Gujpd3|V**cxBW;-d#qLfyyc&urJcY769Z7A3r487=vJj-Dgsh#|iX;`)QqPxn&
zxQqCd5FxwfwVl1^14&`H4WxVk*W^8~s3KaIMDw^l(kI8j6(R|q$mFX&$|B7Nvg{Qn
z1pK*;uknO@obDaw(wXKJ?JTsi7&^*9Lbw{5Rmb;^P=PG~ZC7~|lMECOs<tbLp+?=x
z#TS1cnQ{$vXnC{Y)=_!b7OP~&=9Nqg>j&k8QJ8J5yV!?!0{Jral%j12uxw|s{8_AM
z<Gm?v)@_frox|BDS_FRFy0GMyonYpS$m~Q@vMFYXPU2%E0|^yaQ2eSyc8Pe2<u48l
zj=e$#?t`35k1W`LQg6KM<TEaP&}>_mNxuZBHQg#wWL8OM8+LLzFF&#n3JWMqRviJE
zWq)-D52xw6v_rl_=-ezKiJ?Dxls^vw5RB34I@WMG8rF%f%%LZ%bXdLXbup@nlyJ!h
zgIu9JRaWwKwBhsQOk#b`cB6+6#6TTsZjoH@7!w1``zv%iEGq}@5hc9_vRP4r8))fN
zvH`r>NiKe`V3B^#ZDwC~^GkH*;qtS{oOWeqAk6-I8#n7q#w6VNYChe32FEN0I=aOD
zZTad7b3}t~z0HK)o!F(^qUzylnB3Nq%hVFZhzHD!shCV;BKrvS0QF|a9$f=>Q)pZs
zLUq_I_gVbN&EE%J8ExuFUQOnh?tTnEauVu^KMO=R;#of*Uy-dpN)cwL^KU<GjG!eG
z4Y$AZIIv&JgRky3ZtzErUosEWE)I?-+F^XLEg0t4t?rTxjh;yNjw!?xUNYY^yf$aH
z*^G&wiF2C&lT=^lu6+$7K??!fwEamfibeutrtkhx;!t<1GSxiX;IxVF2S~uc&xF#o
z7BC#afo&QN>tR)!_$TyO^fU(>AmB_K_gLXMTO`QUtjOEIZI84iw~{@G!(dT;a;D5A
z_IqJ$?LugHis#0{LZujJi>LVcSZvHSCSlf!8cIOuk<1Cy;pr^<3f%0#?^C|Td*g^l
zo=>5C*(L=bN~?^kSHj69XGurSEtc}Jona2(9<M&o&W?i6BOoOH2}K|YO5uKK$PH%x
zRr=GUw{|a&s#|#~rpt6sj>M-Efb*ZQy@Tw}Nm=sd@|xUGXjB&4GSXm$Fi7i-*#pI%
z$@e#g;OEkjC@n0U?X+86a!vgNc9zSAPurE^yabyUf#cC@*b<mSuH?;C9f!Q?_GWER
z_1^{1SAhsu-@WxAB@8^u8Kl%Sqv#)!!s5^jB!?G9%$w7j3@`iG*)_uIkJ@npkXwai
z_#f-G{{f{8mUca&01hXNK-PWM`3N*{Zb|6MsGdr#wv1vXJR-Yavn5vyUBcLvkrRD7
zg?wroELHZ?@4FGN9eAYW7YmW;H?LDaQjjvI=%WB_1JV-)x~A9EE46tx&JY40vMZzM
z9V&NOx9Y+&Mh`oqz>RxKTIrGZtIn3ei|KGvqKo;;y)?kcrY=&!>%UF<0tL?^`<K?k
zsN^1Y{J7yCw!n>;^l;QHG4;^y^3Cwe3v+`cg%0~ITk?DNz%r}J_uR+rdzvk3fF@b(
zVoC(Jq-(04#?}(HFEO&ZwnIWno;+Cg#d-=}rqu5^<|%0%*bstDZMt<-;YI{sH17Ao
zo)#6qs7B}$E8quo=6pn5t`?jnV6}t_Ueq!qgZ}R{l;n=KCmiZ3239R|p7J$hkeI1o
zV0Luc9CarrUfZOCbDvF7eeCBHdoUSke#}5^=1MVwWZiKYx>89d;(RvCD?;@V9!pz-
z#u_e}6SG_Vm?M&E_n|-L`46rF4lHqy-J9hs<Clh*hv82|$Bk-A2-}xwWX}sn*@Nub
zkJ{;7sID-KYSr+Xaym6G10CqtAb@UVJZZ`+Vrsb3KnCSICA;5?6W>Td`p85u-5Wpl
zy(&tL9$l;3j_p+YBYj131_}Fqr&ZwGpUr^+#lx+gcccr$xcfM;ZnmPc*U3w}Ud28j
zCY^mbE7ebdy+XGDQ)Zl(+C>Rn(X7-W$rKT8-&cwdhA>+$70(V#+*JAHQ*y3HO^+rV
zEx0vpAPF8iX@fj)*A%~FJk3RCK|C}oV-6p)`hdSjg=yJQYOuSqhxA)W$Ty?uOiV&(
z)1aK?J4?ml_)fEIhuJS$l82ljy%eW4&}ehANS=}y@a4HwVir{`%z8C=jNsu2^0<=g
zDF6fp=HZWEfxxPaV5y}P%cvzlL1T1cDuHQE>SF}P+H9p1&1f+duWrCRHBg`H$Ihdb
zOs~Hl_Q;wZH2p6;H&9nU&g$MuPq4ML|26{*feYImJ*J0^yVR^LET%e6>Hzh#_R_N6
z!tJ$$0aQT`Y4khDKiKjQ1Lf(})VtZJ$AVb}2}$-Ja{j;tr2s8XZK?T08awN8&~3JB
zhWdhS?;~%?;Ij!8#EoClNFKId5HWppr#4*hz81=nhd2yrB<Ze@P)4%op*C-vmB4W4
z%yTpGXYU9^(JS|6f+r(11-qETg9OP!?iqP*JR=V4CDjxPNU0-Kz(`r%Ye6q#JF4b@
zkw}+%V<{E54@O_AMMiYeXq{ENueL4~+qA4;7e1^6%zmpoj*#kiX4<R*I;3QS5Z_lu
zM6b3lTZ=K%VGkD+(=Ek8POr!Mg;wkGA(rk>&z|NVp$RA<;;h;(o2bYW_sSi1O|)j^
zsy7cUghi&*H7%tho@SZRq70bH{eHH%A<aj_1P}A~XX=4>`Q;sv;ZKpBFpGS=;RRR&
zaW0@g&y3zN3VYXQmqi=1!Zy3%UB|OAHJYoMAzTszG>y1?h0M|WGgPQX?}f;bHg7VW
zLW?GsVhiKahzc#xyV3Y%=6Fpz&_M#gr0;+Y>ILNJTYvlXR@px|fT*M1UJqynpiOD=
zPTz7Z4ABcS*1~kH^4McOn!brYi%sksmv0$L+=M8tg%#NOc2dnn$}$=A;Ut*ImVu!Q
zNY&`9#{nQW=-b$rMoG|`L(o>QV;2~8Uv*?p!4%lRvJ6A|^uneMyv&xi6W~!#z{c$T
z3^#RNa{{N!RB6y7Y;jAfPe5*JVZ=e;fzm#HhXvs*I_aqcE(H2hjGcF3SK&kZ-tdmD
z<iq&x3qG6rJ;8LHw&O&!00^k9E_TTx7;|YynF6QV6{gp<o=$rF+?VkuiIr%Ym3X%$
zrG~wkS%8k*O{#oW4QJ7jpEaMd=gL}oCUNUB=1(3@>fq=&aKIvmIq_sG0`q9;P4FL_
zka6r@(+qK2fY4<*`3=RoFQfZ%8A}oUm+pdT4h7$|Cf;yiG!}l>E`ixkk4r(vi?Z%!
z3w>;f(6(c?;4d*jC@ML=CjQFbgSmXIP;`zsD1e?!z?-gxr)&Q$DPyJ>Z9e-}hw!!*
zdowvbo4bqT29oAi%87lbtUnQOY5z&(bngeGz5>zM%=x5}hqp5mtg0IewZ#la>2!qV
zHplTkvxS+n%!N%EZz*EA>NCd&mEGM+vk7s>vz^zH8fTV`x!l3@2|G*XlYvRCeL)$s
z^AVhEN|y85$!&IeAfYo{>jw!U#y^UvzjJ<;|8R;_EAK&-*@X~`I<XQZA0tM~t_zt-
zBz*NTj$y6=$9RGD=;}rv_F2QK<|{)kHJ=@CIi=P<=JLdH?-1Za3OGc)MW4QV))Fac
zrJQ&Yk6Z1&v|G4Hx*Pi^V)>{}VShWR+*MWoH+&7fVDHD3wLy=3lWd#xHE1e5Hc7Fr
zV|g&#W5WP&Re=vQ!2%wQD!qzQYWd=&`1gA(6HP(!iHt4|&$Dw&IS@*1SHNV86(2^v
z>>2BSV;N46k~|V|;DO7RDHkKJ0?YXaK?|1k5MY*YdR&bgy^UxgpX8KC%PJ6IjI)7$
zA6FIAcADKEK!uO8{pC<>Y=2L}XF0;}<vo3PMH3NxSMfjbplAHgAH;Cyp5L8|bCoRZ
z>|Tclwf{<0_TOG&N#HFVY+k9WNR3toddY3GHh2hy7~dx5rul=L93B|UVVPH`A2u*E
zi3?yV&ZhbY-uDefGR<jS5XyZSJ+gHBUn$^NO{=%t+f;bR@DYGo9ndKh=lM@JBJi+H
zsy`#(^WfA=K%tVQ#6J+-Ux=CvvD2Yqg@N_nUi$ySd9yF4XxM<0E*xzm*e3<h)#|$5
z!5pdAu8c}K!WW0bwVsK(e`YEke>Acf$y6Lm`H%Bq`gim3+nu(Ya9Y2o&cWJ%ZRRPw
zUMss($YH}F4uW=y{-Cgv6dfA175;&({wmkbE}zpDigKq*+g?X+T^)PylV$$NMjpkq
z>@Oy;F*gir=Ga>n&M36*BE6o{E&+so)f8pjl}R&6+Ct7C=pRs?NW{EvS=fI=rO<$e
zF~XH{5=_nA(PcR~;7L0LEsr%oZtOYwZN<P_;P&xQ1iMCZ5^fE$p$$yyKi#M2s!m|3
zd;zV_ddb2%G@92ALsEiG$_26(WZm{iePC$8@)*NZJ0|o`P__KkmGw1)-S5LQyHlVS
zs6YRXd`~-`U_C!(<1r_vNRygppoq<C{}a}}?JX&Yt}|9qs~frS3;o;$3(S?R$D$Qx
z8}?=$S3b!RBDi>^?nnSdN`D^QUz@8nxA0A<S1Qj#<F{%69GQH|>So9Wab#>rm^)^l
z!~>9A?xm#^<68bP;92KROMeE~*b{csi>{JeEMKBwRXugCiFoAb-V5VDpIJXZSy$gX
zJjD&qfT>5-SzG3p8d4j)Q33$-pNv+kEivE06|^+cxT8DDt*H{ickj-bPUTD^aKbGz
z^+^@G=N0#8sV|$cFp{^tGk)8^=kU5j;Jk0!FZxCbpyCD3sCY3@Dg2%aHLP28$-rC{
zzdxi>uRMJ)Yl{GAjHWvCfsXWwlmLA?1-=>|H%E<8+b%;s?@g2a+p+3mpfD#g2C2%U
zX3dxdUuisS$!0s8b~S*_8F2p}YevkF2X8&TFJu;ZQe<uI#Y9Yb1@c+heEbDWewPG^
zc%ISKEs9}wX>PfK<+E9}qvIWJ0MS}iJO4gSrpQe|Xm%EiZ65fJIqCCizoJ%;e-@*k
za|Sa0Py*Bvnp!_>p*q~cIi{*wwK(3d+T_b!5N`kwQ@w{f$7#pk-wXV{xi*oe;YqvX
zNjKfmown}f6ap|hKcp<Lzl($XiR5R`a1iwCc_EY0A&>Xd(9~fM8KoV4Gg)##ZX=7X
z;{<1$rFz~UYL`l~OST_4Sz&E(yDbs80Vr>_>nC84=Ki5LrnLIK`Le&rE*ASWrhU4`
z=6z0a-cE(FztknBxrv13)0FI=NqDN?9oEh+PGlQv=n19R9MBI`Ehg|CZ{{_8_F#2T
zk~j_%%3`z@Z)(%=IwC4GY$vCawZa_FlP_S#9S-#n%=pP^ulA;KfHBC}rnoXXEV&-J
z;k?<u0-&KfsUhaf{0fT-kW^khTSUQROca10*B@nydbjir40RNw!6Z-pR?YzAEA)cG
z(ztoYy91zBTR=hKWG^yml{Bfwv!7mWxbsUn5eqQrVc}swV&R4KI&DELXlU~=OHXaD
zkP7v#F_lokYDX9D?SI?Gab-W{l^njOp6b1I2vcoO-<@uw9<XP$T9w%X12|CdK%OF>
z&rvBye$k(Mcs8_^(ACZMr3ME?DD*a*uiV>cmAz2Nf5{~UBBG+9?PV=$B(k5vms~2c
zUOc-}2d`B>DOBUDysPJ#yD({%YB@J?xJ-S<cb~~+I$r>_H!)j9P9|yanhnRt)y-Xb
zacJq)3KXoeSQo=nHmS*;nv$b`D$Fe9`dB`vMJA91oysUv1mIsAcv$ub=h=Js@1f{1
zAP?7)SdDSK41q?PbX+91YLce`*UgF<R7eSrrQqvk-a%bLb3@NW4o;~8B$Y(WOUul+
z3R!QXZ7z@!hdDjAt<l?cm<3uN;Ym|W$8nuB7Nw@vNhx@z4l$50uk79`d}9xgkANLd
z_dUmY@inZP)5Sgj*C|Q7k>70-v~imnog*^k-T~+tEHWJbW0TgXAI)daQCl?t$!@in
z7>LQ375A~7=!%tCv$)OivjG6L>G3kP?mt5>WQ@Vdj<|S~g1a9SZga?zsH|38Dm|j{
zXJ7;rm>R`In+=FwwQqC{uq!SW+BK!#20-?^KI8~r3s0O|sWsRI^3o@~{cgB;+P(9`
zXcwCOAgjpRk{2mc?1pa_rcRh({Pmc+MAWX@7rU}7fB1x~@xdi+NI6evqitd~>>DXJ
zT;IPMwu66=RW!T7*1;tD0<%N>%MAY=BV!y36zzY%M7u8J*LrZ>TQ|rbP&hZz9UHmd
z3=gF}O9;e#I_-1f6d%AE4$7To+u}k7de49fPdGwNPlZh_=fRJRUP(jW*t5-9%;V@1
zjvlByp!2z678xligZR>_N{UIkIF@8xqvpgX%)K9(;(-)+k2Z74By`*Q7Q1)lXEF`o
zGWIPOSB^0AIbe1BM!HwqBQv!F&wtyp4E?PpcHM1vK{QrW(Qw|eJ;9XIM?>Wu>Yiam
z9l|1z%yDJ$c)C7x4*5s=j3h|%f>m|9=-aftni6Un)ScU-<oD_QSRuQ{kZh(uZeqp+
zI-Ts}d`S2)H$-W@%`NF*r9yT5hZ5(9B28<q!aqDG#Jy<q(0=cUnYdY~*irdDjA(u&
z==}~$=02b{36q37Hs_PbY#lz(J=II${liuK8_u?jGbr7zaaR`CiW5MD+bPArp}gD?
zx~1fkXgY3K7}?(JGp#e40w4tfW&R#5O}Hy`hG^>&lQ!qG?A)?=i=KE5ka2v3&(iJX
z-bu2m^LJ!g_nJ5~uBJh#bha3_DDIVvEVW9=oXWmR@}s*<Y?a6C!o`Y8jw}*s2<q6M
zTVCW4W%zEM#Y&G3X7N0+#_RbLk%gR)=T<*p%zUFp@lX}1p=UXT!&Ye^=M-`aQoOV_
zGn1sulupSQ^HK_<m>;2ots=9|emU#APfr~*O`d%BPzqSq;kVe@&3yY5^0@7PPg$!l
z6=$kQi;mNz>N~j;9t&ie3(siDHqUt<rf30#vt^g2LA$UV#`jtEvUY{c?b`6%sO~Vm
zF|Wb}U+8X_YTBlD_8VxVB3;1IrO_i2zEuS(8a*Qm4I=AW_crh!ehc`nfu8GBicMT=
zF*wdT<=Bbw5#c*ZRoosxkA&8jvh3#uiWxoSF7+&$2iLUL{iN$-fvce3&mk-m{1#g5
zg5Xtii#V*DV%l74uK)>el)mFqW<S{F>z~dom;K7<#gWNWnBWA{LazlygY_)+Q~`K$
z>NvGeHLk2*?T4c7OjG(QYL(LDr&>@%?N5o>AQf>~fZ&yW0Dy^<*&oLb<Y8P}t><S(
zPCt<p*0qQmxO@x2RGsO<gg-bpQlKN^Z>})&59iHL7uy8uO^#-TIBw55)&m*qHs^44
zUnu-a_+&@%Y|9iimj_?ECSkj<e764BU+mY}{1jjjPBOczfxLWXXZ*+kt5XMxPg1f}
zTyam2+sMqHJV}A&<B0DwNp6Y17%OSu6`VEt{ia$Ft2<V&VwT8~->u$@$%T1gtm(wC
zWjmS6#sN`f#gK9n(69Y`YgU7S)^jpy*K$ep>hPOeup7p2G9p@R&+q7qeWb-&^*=F3
zEK~opEn4nvZz`Ehk6(P3NfXeIt~klosbRhA5@aO(+%**<Jj#Rult^dspW2Z3r_oln
zNoZdvrD#JoDOCJI4PTT<jnLfMzT@`q0n&J34z0Ash%uuS%-pz1N8yVh`kogC$cvPU
z<uy~Qnp-A}2rDjlH6IgHK2~#ltW#f6{L_dSsqxokL-u6KwgScRi%>rRxPU^5k}?Mw
z078F6IVr;mLD0M`m<h|LCt>Dk-9Hln+$2?GG#ktA|Ka0)c+0Dn!Rz_NTufXdsh3+^
z!!_~pbsgh;T~&4AJ0(3=FSW#zfzW-*zcR}IF9~`vyP01Ez4-?&fS(5>!T>)n*H*%V
z`=^BSUxdaC;vfDmvG_mb#%FB8Uox)Sxh$#yKP87k^8ZcQ@88?$fqjkXSeW$m(F&EC
zZ0@Brz%AJ1?b6Eb307ybV3EAYOrsXbS7%CfAYU(O4SfV<#}q-AM%|OGSFK&8=TZzu
zrVn5LRhN=#yyn@z1D0PJO--fEp4fD6eOl#)Nh8CY2}6Dc%)wwOoYZFh0}mS@viXcx
z@fHF|@j;JT<Ng4%m3O7QPk<bve>;oZ=LO|PnwI{QQPGO!KZjG;65TnRa(+bpzK;0O
zN0oYq*;?{8eYFTzs36~7*=)*P+~7~p&uLxnbQ<aA+x=?L+VlLax3|9a{kAY*`Qc{U
z^Q(T3G0cx11PgQe#q?bbp?DtR>3be0$8nb<{gT|H#}Ce3yh!qbI?FTUap`986vlH)
z($N&N)-&dTSzJU;)f}FrE_rQiBsjXjQpF`t-Exv8rQse~uhhFw`_1D;UU&AbEv`h|
z>KJ4@(a;5-suZTQmW7jijl1O*vyv1cm9kVf+Ibr{Z5OZLFLadV=9u~2ppth26x|!O
zzJ9mRS@(H{!Q-wBB`2C<z7(bSEPm^^7m3Ue8*akl*(_cM^FmvvPv~v=5ZIiQxU{Ex
zKAe%Ky)wsLHQxmYpLs^C8ol9*`XM~pNZ-r;D(7Hj5OOp}MWF|L0>Y%(xBpPV9|Fkf
zD&pw*6-t-o%s#V`(vZvSJlJgwZOE%l+iFe%=*I=jZq3@qb>}`q!h^I`*i-vwD8JR)
zPgsBfAeyd`lfKNJt@f`hpad<S^gJe{$A-$WxYkhHSLHrSD{*-z^gC*N4SF%hI%9Hy
ztV!0=-huiN@#P^Lah0FkpF}ApcpBSGk%_>{CoJ$^zm59#qr6cmLM2=c2N0`875yTs
zoIsf)McC1>mw1}%;{KQf+IAEjo}f-SBEiei&v(m2^$zK1b0~Kr92azI=xWu7bB7N{
z4|=H7Zy$EK7-?x@A{TwgiIv49L|@h;*=zzDvYylND>TwDh;Y|zcA>c@x=HC8n!0WG
zv}GJ;6AT&*#$av~8s9OYBh!&ysC4#dPZn3)(!Q+DQrhzl3oo8}McwD?j|R?o*j8!k
z>6iB&hp#`4H1Is1@ieLnszk@n@KU%ZfVc9w3%q9uw+<!9l&kN|2sN3osxU5JC{OE*
z)$@MeRUhV1P={2+tvnI#l2q3bN^%{bACuRZisyk23hvhjGCRJA0!xNiGSI%{dxGPj
zA`5Mc(-che6FWUm^70u88QCZ^c@HdB(_zJGtDqbp`3*;`=C3||#lRWnz$5vlZPH5i
za6GL9BFY%OUO#<f%(-GZDamzr3Q)>M)lR6!Jk@<(ck6qmQoR6pq-7fJ16M1ms*0jv
zD7KZwdLlVYghL~{<!N=KYF4u)w}cg{_|bGWNKEmB%veYH81LbDnD}l*oq~vGYx9`;
zuvLd!MP}u&y<D*O4bD(O!^SW0FPJa*X3o)cf|mT}KB^eVVJq)GS_k9BtzsNwS*6Og
z5D@Ba+Y(_Dx$f4O?xyrw@^Axs%D&|l95EP-b-PYF8gD<bq(&K>h5Xh=w3eK!Vb&5r
z4u|_!{+>gh9rEp1_MX6mQ;*2LZNl`e!}rW@I#L~6PIMMUYlht!=MUT2e7rdSi;Y71
zVh>P~m;?yY72$c;$h`3qsW6hSa*pdGDqLdrn^h@K$3oEx<*mn_i`(DFWQbW)MSFgG
z`I#~vP#>!4A}T+vzJ9$e11uphP+}_Rd}Ue)9YgV}5lu<S4aX(*aLL0cz6Au>^;yCI
zXc`}_xOfQqCNTNA|8YNctFkJus`yS!rKdFT8eL{|%t`jJmnm6SOnXQuV!GC`HT_jK
zT;GSmrXJe9OwX2hw7BvJn`V#98Zp0Ybp1b!0&r)B%&W>xSIFp`>9O6c;NhjBo(gPY
zxcuDBMPmOJj@Z+%VD?n~dl#Vq!%aHsXzTf*sAC@Aw!10XZ9V~rO71ehUaLXN5HF>q
z)-ExrYvyLYzzkSCIDXdC{%ByuVR|iX-ePcs(fR=T6*Dw4=h1p`Ncq+ez2@XY?6X(?
z^P~0D`)%ykt8e8pbyG9&`w>I7qv6$Gx_qI&ZhfER+~7=Pf~}Wmr9)&rcgXjl6;rZY
zY1+``IW{iorN>11+CLU@9;1#ZD8GwjcZ=Z^1__r4ydq}tRysj4^83lAbqEU?z;aT-
z!8PPo42Oaa&C8vDJP4;sbGk1G8k$SoKc7d{1rAu@Hc$bqNpm|BM9>}GfvMC<T_mGx
z&|*jn`~A=%71zdRw$&@^vtW2p>Ydq&NgSNfX}0Cn>&CBbW#L^<w#lDpo!vYBjyIuq
zYbRqd{j^WhkViD{mm`+tTF^ub{OIO1e!|gvSPSEYC;kYJAk);q5pkbN=|l*+%j?hD
z?q=lKe4&%&^Nx-e^o9c#X4-T7IO0@THdZ1<SwNxe7<_+Xy;f1?GELf3gJ8$pyICS^
z7H49~IhGLHuMD?3cC$M9WC;3iKUvVs?(@Bonkw>o+0TufDeWymjp_K5+d0r1dG2Yd
zy*e|WkN8g!+V%@ycR)W@p#9=2_7fqQZ|>0}vj^|AFO$(V+Zle+GM*uex4pDhJ3{b8
z+(YddoJ+b`DIE);x<#m7kP)^><g+H5k1Us%u2hBIiV$o=BrKB|*rgkewP{Sh6aeMl
zFl5b+Z(mq@cCT&;ZCm-o+xDG;4zOeLTEQpmz+3*Fi1bx9$Ok<eLV!A1-0E7DHs}!O
z_8(OUBN?fJ6%af$(|2{Nmw|9j&ckh82IMtfXNn$}57{4Oubj(5T_6nbWKak<4sJ=4
zIQZ=O`|Bg-@RPOA^STGWCqH!-D5HJSQZ}`{mON3)87-4?jO#!|e-o4zK3-kx)=>Vv
zydtkMa4zX9(RZlcN|CK>!n2x_qgt!NWcS~9C*sU*9Xj2{g{s_ldh$X%mQw^2L~K?(
z><@4Bg${?FqY$ZdbK>p5C0blniUje{sU<Z#->)J_a(xhG%RNjRnf5Y{-*Bwhuw{$*
z%brD=NeCHThVHUJ)(t;`7BPfa)lPlFM=$>2oHQRTlN)<TVwKMB_!85)3nuX04bu27
zaN)b(P#A-~iC6pXZA7GAHn>60*NM0eUsY4!Q&H*l(}6@xnJJ}dr_WAKD%IfMhjqbD
zGDEJAvXMNgn?KffKj_E$0v9to%wyfmF0j?p@k`utWtREo7YItWKcT%VB=rS@qXRdl
zkJ0Ev6b*XR34lC_GSy^;aY<RJ*^XSEn?xz>`E?7<8bsX5A}ehYWvMcEeDs4HgS&DI
z9y1iKXV-yQ(;##*+>!@Db->D|xc0mqBSIHNTG?vYledY!B8xJ%T&Wvj?H$%HWz(^Q
zd@M!X*@%Noy+K*cQnr+^*L$dy*z){`+cO~wWB&v#j)!lO&wlE!`%a^?K!U{_!PMON
znB{_nX|C~=PIBP4AJ)i*ZhN7^Eo>NV>rG_j1z<JIZTBD~UzJa+cVdS|3c^J%Y_J@i
z#tRa*f4jzdbj-k?s#(=8fnGxBZzSwnAweLhsD%_yM9Gdy<>wxYZ_(}FhK5G8MNb<f
z?UCyog%+CT&YOIB;DQaILQhdB!yETK4_+aVlAe*?oe5yH;gZc6>TFs;NU-yr7}i6a
zj>+ncQ?_6t4uu|Y#^&>gtXM5x10kBFsqnc(_1~4Z*#Wzd+vBT>fzpJ*H!U0a)hCx~
z(@sVsaIQ<eS5?@y$+KpkG%EE<TX$u-F(7DGKFwHU)aH~L<4i{GEWI@%3~;R^Qlk{7
z^)Ag9HHM6Zd#uGx9WRq_O|z3jB`gNs@1N+=^P4^>){kZ%7_jJqI@N+^<{VeJ?0JXU
z2K8@|jkpX*0v}2uqZbsb+4iGyq}p8)HDf<)jUvdS!9Vv@8@euD9t#sc?zH;&F%B-{
zc^KSPevC)TFqZBUeXlQj<c<9;i1*3e4neG@o5H=hb>VU?j<wS`3N?OLS3+qksV{FW
z-U}c7_IR6A5Yo)rjUd-*Rgo}-5J{2Mibc3kFI=U4nbJo`einm3-ef1YO<34HhY3%*
zsrV;xatXI<vA6p(a2UJ4rl-%cI)=uGlU+V9i{@}WPe&H_^ZJi!D#6_&(etW6YxNkj
zQU7p$t?hCTe<h40<Z)_Dd(4Y$1A*pVmsLXBd`bnYQUT*FxaqsFz@0vNDw(e}dh%7R
z7jGZD|IFRh8cll4(D~enA1m23znSVDp?Wkltjk`6-<6&ZsztH|*f#{;34n30N_&$?
zg(GF@<G-<m%ia?;vG)$Dc5H^SvrlN{T{|(Q_iCXKlzbcLvw8hi10NCjK7OqmT2B?u
z*%8*+U)9`@0cw?i74e<j_{`Y>8*nWyIm$Wzo98YZ@AHADY<_O6EG{i_jp`|24uAPU
z?dk<CwGwoLlloA39<TI09h<S+(O#CpzryVos2>A9jgtD@9_kSMBLjF0p#|1AW|=ms
z<O2g=*KOV}E;lVI%cF-4z-4P|i11E>G^aJWvE?0gmEGXmgz7xMqIl_X({4zqv<rQb
zRnec8?)M*~orQ~k45lA7f8(D1S*UlGLp*p>dlqw#2Y;R&=XG~q{B|Tqa5BpX@2z5M
zpkI_a8hSnb{D(XF`SyE!b%cxEWt6c>Uwd!1@%zA=|H({3!^Iuatjb=gcbdw$ddX+T
z8c0aGHZX5FW#1OO216HGsBNvE-SZC@=#q1Kw34t^+QFKfRD_xG(ThTU{N)rme$E64
zS|9$L2#`Dk4!WxH{){?Fa<Lh@bAE5wqXH}?gA|kbsyl<M#!O;6M||CQO1C?$fbzZl
z7IB%qDby<q9V-b5S@pkcUSTVtSiD2!_6xSH{w}dP$zOI&^*`TgC+ZgY&pTZ`e(-|k
zk4^n?O8Trt^*@lY)ZVxG*1Ud_8>9x8y&>NIFGu9yutvoMRgS#nLw#&1+aUV{6Z}7!
zp83(}v(hP-@jGMTyHk!eRzu;c``Z;_WQL7%xuNE}4@GYsmizzmk7cd?s{EbcNhj5z
zdmA=h`W(L;<A3A0th(>)AC~SfCKZ)ra}7U46v1pm=)ndlpKp-Ttw8C-L-E$c*QOiT
zZ*>e`jYu*()ePm9k{YXs4KaR1PA_cUcbGk&_!RO=m3NEuqod;+{kYNvX|<?`LCvx9
z4%V<`@Jr0SoP$qD#^-+*T8{fgs{3%CTn1Vm6!^{QkVFdW0(i(TCw2H@o;J<boJ6_g
zpkb~E=s?hacW=+-P4aajmM1@;$zL{-PyWmjmd{*6ogcKc_XB<@;Oq8%)j+)=n5wn$
zZ$mMhmGw@zy$M{K1bNnvi;#J4EUCS%f;<FobT0bup2E$h%zdP?<g{-!eDyVOh0*;7
zp2h*h*!5|3usVC_TZO;N8nX;lanoU3U_PWJv;=6@@-_^ZWYpgc&-ilptoIycTLsOH
zzY85jzN`WL8NS^NvE_O2cXzUWJ_gLgT%LMb``zasC1M^N^|^08FiphWY}mW-XyQ`|
z(2cD>grmOfJb;?JPggd_e6D(7PrmaGuk;=3l_ddt`i*6h@T>)0Ui#i0pkD>2s_$7-
zSKtz$uc`U()Su28iU1D){3P+}MUJyQ3z|y;W|jSj;H)7%SH}NimqmTY5&=({aq+q5
zkG4twJ8d=o+N-0V>q<qU?n6uB!(Pif@vF7r<>O;-%!}a}^5ZX>%BxJi4JBaPud0jM
zG%yy(*zRCg=d)+&@>1p?7nd}x8ElgS^9`N#Yk?lDiD6Lnz@g^`?G@HaiPIbEBo_}!
z@0RniS|#bk3T%`Tb^J!zt>rbm%7B`iC{*8iwchV4BCt|MA}^WFH4GiEajpAo-$A;f
zL<4jMo3YQU$Ty8O&jU5M^>XJ;C&6m<%MbE~Le-yAALhUQs8yg-3p7j7Iu=LL-RD2b
z7&@f~+lXpSS?_@J*n4UP-fl&uY3ecCa?d13go?r*@_U~Ki(bC*C>raWCmSzZo-XZw
zym${Bzv^jFdeFB@miIB#Yf+T9zBg1E$Z@@fo9!Poa;l$xbBC>(=GijPItJf9Nlqk?
zFg|u>+r`y+(^$1gxwHkVH{8{?HJ!w=Nk+e?$PjPzow{nfMr^7z#bBKY-eAt#3dJ8=
zG%jCjtz5fmZP2?q(*EkG_Z=zF6y1r8Xf4v!Sk{-oLwLMAeu%HYh69PuA($AcgtcX6
ztcTSfkFL5Qj+eFAG{-A3Wje`y4_`OFqj>fdOcciXHo8Ehq#CjM0+r4ETno`@7PHKo
z_wa~-fS7^y97do}<8J<pS;C+%fm}E77ePV=luzOip}LxqHq-{RW1vp1<JIJ(j0qwe
zO>Fn#%f55<5<R3w>@hjL?WCKPR_%1h=eit4Z3qd{_zoY`y>qo&ot12$b8CA>8n8es
zGhUuv<W~khu>i8(DS36F{KAR@%@0IejQA;x5U(uE+Vz7>^Xl47udntdJcM`NnY*|f
zHNr{`g6uONXwYfk1=#!~@Ghu1<RjGFqt5X5wy!JdIa*{S16`za6k9CZ`f>EdJo$xX
z)gzS>31U}EzS`SE;@wTKEJlRo`S#dexKY#Vk8L{F>y2Ri7R7Z#=_%jE4-?){wc^eh
zonOLUL*eigDxDs0L5w4K$EL%*10zxtnxu@B4jUzQYK@8|S-bYx+D6xd@Xz%OiM<@A
zdfjhK1Cl2taNkN2xq2aQT2t>86XAtGFoZ;jjwG9c!c~mMG-1{EJ{zmr7A@+oMi3cY
zn!LnE$!j7As`xiEIow7uC#I8bOr|?}sBG`ev2+4sGDK4QtC?i-Ya;va?r^^I;x~_{
z+e>^>G^W;0|3I_9(jtY9Wq6eGal$I}VI-`o)}VG*sl>h971Fezjh$*nci-CqHQGQ|
zIr?RlatPQcP9y9gAK3DF`EBm@u*gdgoJgaTSVqDud#a{he0Jnj(LSDB)y(S-s|gnb
zJqR+*VJ8ZUe~{d$bbZxB>7ujD&W#-wHjU{`VmeXdIUh(;fgE?n$?4k_T}x_4{+>=T
z)ap24h&7aejrTcN=vr!ZqVgkHdgfz;>B?%n2x@iJmY&u)w?*K-Ck;{94oGQ%Jte&M
zT8Oa0Qrug)nkwL_LyPDU*90W)C(YbO&7Fq=NnF|9>&OlGZC%VRC#pU)F#JB!xa(*q
zE8hs%YjjPea;sn=VOpCE@}0%y48T445@h;?$WxrNZEOzYU9;NZ%X{T#9rm@4!;FzA
zkfp5HjGXO$)_(BZF?aN0aCQPLwY1OIo+X}+6k;ZqrvcGXBE2BzPA+}nTsidmIAifp
zfw!gZ2W;P#qRfRtZ3<m+(Pfw~Wv!3-S6WdahF!#*Gw?E5eCQd%eEG~E)Lw~3R{$}K
zC2bcYzY-(jX*06Z5klfxTwveyQ6Vre(Df+8^;uv0V8BR#$3r5eOZ{toUn0;7)FQhA
zAn)Oe{GoncUmT@d@2yIe3w->bDSYE`PBgjW!r7QoltMRybSN13jV`9R7$%!c2xTMa
z`k%COUm))=#swJ|Lk{y_LD1Z63j36|jf5;1X))a(YIsuN25@X=$<(Zi`K9UZ!p72`
z@({M0D}vmz&z&Rgd-6!`CEZHcuA7)`p=~7@u%>LWW*0DTUC|TVXD5JwfC%XOy#sr1
zv`*zpNR!B!oo;yj4sizihs!Y*7vpb{(n-IZmDwJKvi1lT2HNU=`c1t-h}Tulyz%kU
zPLzD&5<}it`^6-mB&V-_#E6zbDTOF8@cC@whu_VLn5;q#XG_TMPto(cv%0Z*gTrSN
z`yP&^5Ev(#6`;GJeF#c^ndlj6?_pt7Jzo>qr9`QF*ypIO-HRyqcW?3W@DEUhvo$;d
zee-@ju(xRT>KxHp@yxMV7#fsp^cDR~Vv~+ClPdBc8dkuc0=v)37GyYBhdM9d%xLUu
zk<r2>406)OF*siJ8*|NaeYPfX=P^mT@}jH6@R!p#k^vF!fd>!iTVBeTTHOaWSy)Mb
zZ|az1FfrSMFhDNDUqFh*3*`I;8j;iEOMsC69*AyIq5*LAteek(hZ4B%I{Df3W-BAV
zd%skCY6S$H5PA<*dHI8Cv>2Hip+I*G{E2K&<q@6zZ)EYJPwMe~{#xmA6c6PO_EzUF
z&p(wwTrv<EVl!cGPsp@mal7<UdU|Po&MSeS<+Bo}J{5<6G#0F=3l}7GDPIu=B{MeN
zm7#%f?XIja@GF!YZAWQ7r8oO#AX12rz20L!_beOXOF78GrSBmkc($vHIXP++S|jMD
zZT>9YLOx0XKQAGaM9G{}HbPG7Vn3>v(b?R*vG$2}nd1nP(MnRACF%8)*Ct{2He3~O
z%iOh+k24Vwdu>j;g8cri+_HWzO&j;5Y#W1|od(*bnVsVu2t}HlfRoB>$|Xb7W|`Mb
zI(nk+BMf_ONGj0QDn`$Lqt(~QhIDkL9r9AZVUFG?8Tv_8<#v~Oj6u>u`dsVFCT=yG
z#$`^n2G#6w-vW)=JI~)w+D__%{Vrvj8aX>4CCl@2^$8UyTJKw-!GaqSQWg?9)}V8_
zlUlSc+4clsRoiwox%%YiijtIp!kBkz+P=9FN=qmyEy6~Ld<qzXksrsl9550Wn7+w;
zbP1$FM)x*mi)@9zFl?{z)OwOyUS`_=H;+2kXZ&npj!2fpnQq|I)rl#o4E(FY<dz9v
zFoDDqzPaA5=cd1qEB5>5qQYLw@enHX%xcQ834`b^ND!=;lUBi;VD~|ToXHuNYCB@w
z5dLFEUvIFv5)&O#7A>1^Wd|Lk1{&Bgu{q8|I4?-t3l6-|Dhw=QDES48#vn#4<jO!n
zm?UlX8}}d!q%S2Zq4`ON>7|jlO4b*qjYxqZbo$$Si!v--z>^VOa5Qd|2qSq;%Sijk
zJQ-o$@sitCw=G1@_;aVm9#vT0{4o<o>rRT_<Hk==*_=^!VNm0jtB%XZ+j8gi#nvgP
z9>SB;v^yU1U3zrkbDwV$Px~`^Y+BnOxw|y0unkmbIw|#jI6JxQ!Nsr)!w;GxMBDBT
zscIp3MMhfC(YI$Zp`5{`q4*@m3fXoF{*OxVZ6d2{s6xIGYN~+BVO;~+vd=k0VKYbM
zh<#VDdW;fEf$XiIku?Jm1AleC{HG(|qve9+m$e<|1hW)>7{wk408H{{k+ld^jw&##
zFkM%E&e~u;Iq0?OGFtx_&t^f#uk=*$Mi@QOMXH?;dwU^a(0#KxL^gsO#1izRp4V(X
zXs^-%%hK{l>G}zP>c&ebD`73cl3_x;t;5g4Lc7DZ?P0B<-J!?6k%BlAmkVn)lio<*
zeLUTr#-1rli3VExBu$@}kx0Wy!kStFoykh{Q}(QedQU|eNB_Axb`j9Gdt#zW8G&?)
z-XpXa6CkQ0^s5p8-GBf1eMl})De<vJl)5qT=3=JtBfx)Tt>l<Ba-(EK2h+kx;*q1R
zMENb1=i1D$32QzNB8lQdSK1TeM8mo#h>J#jRc|<+%)Wf&$Cb@5m195`$BvNj{L~u5
z|K^=I+lM~HlXHnLK2zPgL_x>eL%I|?i_P@0E45F9JF|V!H<cixqnK|-ht>xB0Y|m+
z)99jX1ZFO$nqre*CyWKBXCMf(q9pYaUwf^4qNH|fX4>d1fa!%#((#YBSJFLLqJjoA
zC=#t$(vR57=etcg$$W(S6NI7|vE~6QOt(YH=uA-psMXOCB{TNqG_U*Y^g=Y9TuOmJ
zyc;VD3VWoQHmz4nY-$L?eVI5G$4;3O7V{BN(m)UD?Xi%(w(WWgzz{Xjw=bwD-5_!}
zTE3p*8o-&688zyVp6h-2<`TMy?{E_zbbX47k-zbJNkAB-v=MuzeFekF_utO!R&3xo
zKk*DojN)^nW%<kwo>W*rqq;DRd%wSoE?e#e_5k#fPbp<yGLpOxCEj28zWGKNjezzm
z5}es(;{vbclzERm{l@rI-RV?6ag$qp_%{K4{!eW05>(S%aDw?Gp8^S=GrtV*#gAcy
zf<5_^s<PZvn-QHL$yUCw=k%@Yq&<^~m&4g<`hHUkBVb8wBaNakQZfH(RCy<v-M%TQ
zm7<4grEJbgrO=jV?+Qw3M1_~;uW1sHl>M*%mx$(+oVm=*(RU#};vDeriM;Zo!UrI~
F{{ta%lm7q!

literal 0
HcmV?d00001

diff --git a/docs/source/assets/design/v1/prefix_caching/example-time-3.png b/docs/source/assets/design/v1/prefix_caching/example-time-3.png
new file mode 100644
index 0000000000000000000000000000000000000000..71b9e9b60ab9aae080a6e70a9594dd3374c3591b
GIT binary patch
literal 37069
zcmdSBcQo8<`!CG4BSeW720;WNh!VZdjxvHEM2YB~L^pa(bOzC)6K0e|q9%IFXwiFT
zL>;4M%;;s#NcR3c&w17<>wVX|&N}Zu@g-*FzOR0LuFoa<siq3W4W=6e1OybSkCk)?
z2#CxH2ng?x5EBp(jOtM>5fFqBs46|u^)cPfxt7i$QB%VgnvwA&-3{Ik5m~Z^_xJac
z)7s7piHO)K(%Q~$-&21*05|D6+gtPBI^r{OJ}K!najx9~pZVt@{bvhx9;x%D#)DyM
zH0)ti3MBuB{wbh%!i%aQH!+*3wNizaRyKw%acNi)Q79%!4m1r4ppKK?eeBwOjm4si
zgw9U0IFj}~1*&BJK48vLd(x;Wwl8V@=sLOky?5BLq{&Jda>ZoHoGQ^Cwx>&@=4>Ei
zKHt0d&5(SJiofqP!q`N3I0Un^Uf)f+mxkTymB%Il&8b4sYq&)ad4)^zEOmv7+Ub!w
zt~b@{FXHsyvM_;2v|V=Y@(35f1e5nKaeNNCJI>uvN$WMQrTR0zbZxYyx)|$-$83IU
zj^}f*rQMx*h{_^7ted&c%q4fGMG21um|Ll5ju5Yhl&(rLKhGB=228P}StGyxe*Xg1
zO-A#d_M=up{WM`i2BTwK7v46DI6l-Q=bWo#J`BjU=7ZB8y~f4zbX0<?nXRK;l6^b*
z4tY&!81&kkpQxn$C|am@_8Cy!uealMr<$KIs@?$Goir%u@SJPVg6l$l8aD?EXqM&p
zAVK7ahS}jRANk}GidVs&C08{TFXFP21|Ro&jPm<)+8C&^rD5$5tX)~$eFPkW$F0ML
zk0fzG4&Tdws3t?1D30Iaw@>mTp1#3#A$NbT*ljlh3R~2JQ^)z@&Z?7&wxct%nYYi%
zVF4^b{)$A3-5x4j+e424+j^ubfM;!D@)U5lwAJ(TOGTX)m2~O*W}&W-wjQoP?HN#-
z-m=B{am7<#=*xV<Lt9naKPrcbyU5P!#3Ksu=UFiZL><nW3UbOt+%;D^3B)C0SE>D^
zP=@pEOy-UE))Jq<g_tgIHQrq0iABlDxgN`Es4?TeRnLhaILpJUJ}fb9n819o<}!q)
zRu_)b9<55PNKML3<k4RYHB4rOZeU>DJhS&v!hfE=3}G>uS(mM&gv+WXSa@4;Iac?I
zGZ~PEma6#`!9@7x>h^_o)>d}2R~ru>B`Ql^CShR-_;b#2ldQBbcET?(nRl{r47TdM
zAO?#?tzP1L#0*ehd4b&oDkk%-du!ARHNv9`92pp@!Hs;oLztry!f5pP&Oayq^FaU6
z`BV}skHf*s?T2-Hm3f`g3an*eV{O+VfZ(#>kWd5Or9iPZ(^5wxZ{_(-I3~W_5o(qS
z_Eeoq5mmGaer$#enV{|3c2Bjs(?{SBts3All^pF1f^Z)SSCLq}2s?Q*(sdEJn}!#S
zk@ERHilQ6KZHlRZfa+B;4#!f?6KL@7U=St0Od&LnxtH?AS9MPxo-X2PDQ3$EifJMY
z$(ogT4WQ=8*5;8Eh-!4@>JMdC@}VZ;zs<f+42bx1uI_vIQCp?F08|km92CYqj(dkS
z);arJ*1}}&MF}_q_*$9Lp9QC5>#o@{nj_wSMZP{u+Ls5>yrz~3mWApb{3K4lXl49N
zZYYfEX?!|AHKXbql=-*cuzk-Yiby=bs7XpcoR)B+innkxig4W{GNulkSLU_I&Ei`!
zBnCjm*smMF6D8WtX-)u^ho`0h^R|*v0o9qc;YL-zrF)WrT&BT{=DBCdxso1waJ(<_
zzqCX{p8+z3K2*i6A0)byP*k`i<tmFKG?x?p8hSP=r8=WG&bAWH2m7ysJZvLi5M2+d
zOpsDh|MRTrTZq3UNtxFY&EAst*XQZ*(4{4VpJWt5E-AwM>v6Q^6H39XTW_MM0af-o
z;`d}B2-m$V-5Nt}*RL?y+Mc|zy}7$2M&&f`RH%uzb70)@Y@d4{ww>b?8Y22yJ@$e|
zXa=tVk}i*jKhts~bt<#+IjpFqOSAzNGJ*GzuYvC{KF{luL=B6oG+1e-nCs09;Nibd
zs*z+D@-19dD>O%&)<wwJfYeI+x=YiiOe}xhlFAbh_Zsy*czR!$cK61Mx;iEMkdqT5
z`#m)PCR?Fa!6k%+KJ0|L!A*5&6H=}ZhwA|{lGo9(KJVA~ESQNcxVdeh9r1L&3Pju8
zs=n5|<E!R4m0y-eX2NesLR9-00)pIU+oiZi{$OwRab}s`sDkQ_{oJvdapR4<xLAoB
zj8fM5H(TP?DPEFGy8I>LckIA*{#otsDEuo*LfXRAYuI#2k^bh3e=g-4u@S%Zt_>Zo
zBk9yZpHol>fy~cKj7nVdsa!%VB+UWV3q-qqfUq{2pFpz1N*>}V?A+4EM8MSrOh2ur
zE8FAU1FF?uxo^#Aejdn4QoGM5?RTiLe1>h{j<TZ=x?^%)31|wj=$#jW19r~up@jDV
zaoAGc8^1GT=4u2?_C@=R2*Xd=DMuWTTdjA9K+|8j0y9#WH~E*{40N_Gku!2A;nn9~
z-`Z?kEYsu@64w!0x>`hCz+fxM`}S{JBY&PlewZr>xkMiWWco4t94}ZwamR=>Z~u<z
z<43C8TsDdVVYdi`DH#ViH0_^6n2VnH_tKCBWT$ng&xqs=bd#z~NiOE9&afnEE0U{`
zKN%H>rAXSBI$vh|l;rb7r!nR(G`Qp2!o$@hwc~14DGH07-Ws;b^H(~#$u+h%t-tNU
zk|5bGg!J#czhP(n&$Ck2Tf%#PXIv-CELhWvJD7KWfB%21PWWGc(BiEwO|f;Lda#Zv
z6Mv@dwk|G*&U&-b>XNTVGE%qHp_q|5T=grHG}xZ*UMG`$!1JpwQtnE>zF2m&-cukE
zCQ!v8CD>F)-&5Pn((g+vb*opVXf~@m=sr;^w$zcnMT^pX1|?(fAYlJE%;|6^_MsD*
zFH6Bn;d8ZOM4>3}y~1b0_j6*R!5=@2gi~3DQ0dtxiN00Ns0nYP96{^w7e3q5VY-z(
z`XN0<vr|cWVtfPtnSVv`imEbwFL*MiU>dpMbN}7w2k-lFF{-FvA9^?E<x4Po7~qc5
ztxQe|YwaQ@_HspV6P-G0yfFpT2?PK7&_qo!u?8vya;kzGPC}dIz(9_`lO<-3<_jzf
z3U_vz!*gjq9J$yk5s3b_HD(sLe__99PXG56B?<%)KI4UGErg%!Y+7OPS!W}v(C#*E
zc<=#YtW1{vK`b!fV5x*3_haq)81C{QQVe~$c3rSY!c6?Wbc?oFmga^F(4Ql4vusa!
z$?$aHrkSQo)9M$VbCq(rB+a1JA|6>SwpO1!bw1{JM8Ljy+U{jj0>aOMu9*5jg^4|P
zj)ApK;9|A#3>s*7*QC{ZmsI;$YcU9(8jCwVys$TWC-1jpnno!;ztyrO_&E4-Fj?CN
zx|smp_nZ@y4-lVC<H{}V8<Z8lf!kjwHfFlOWLXZe_*&-4AlPxI6LS-ixXT@X6qns?
z7P;8CkHAsV#A4<B2YW58CqE32f-hJ4Y74CV#`p?X%9?*#8(a@O-6tBk95VG43S5ro
zEBFceH$8p%R^pvoKc+WcJ@oLIv~$@^DU-Y-pzmFA!eWpwxZhi^sUjX|J;XB~%YeGK
z^*Liuv>lA%?GsGZ#M_zQLU^$fnz;14ozeF#Di$+c6}F03(thJ9sLWD+<mhq9g?HTY
z^Bk}7AQa5%CIl{#Uhz`e`up1_P!4q&m*z?w@RjILc6EJ(1Hk+IV_ze2iku3^%X>Lg
zaSbN53!Emt^;~!f&N`IFjW(9aOGtPBv9j)tQfeP$O)mUuU^bG|7s0XlMkHS5`$d<K
zspHZK8S|*&OYt}%X@eJD(O|r%%zMb#VtKrAVAh$se98s?Zfs0-LM+_C*w!sg|K<YN
z?TeuLt4~di4QF0ipe0j5IdEtr+uUC+x&yGW#o)`L#_OAoy);7$D3v|7&T46NMf0eK
zrmW}<q%mw{#~}CCPT#2CtIxOPJnT;$ipzvKDQB%3u^%y-ka!oR0a!9a1j8B%)_xR!
zXqe*(|CWIkpEGY#hn5Zj@6V=jLq@~bv$`h|O3S}IY3QdRYxH=^5O_A?G2sK~ihUqz
z#!e)xl`2c65Jjk>n@<7qCF(!q9jlwM&zTrwEY6!0gKxVnDv!LMcbYqo7!5CGK|%U}
zLBcamzyJ)_;PCm(Twrhy1!v+=&D$w<>6WP6X5=P#JcVO{1Tcr)t-1i`P_vE(zso4{
z=AgHbl<QZFq*T2*?KB5IDP)L^@1r4}4x7-A8U(s6h#9wEV5a(x!c7CQ>)^y|U4^)V
z(whYsIz6*7(u*Zb4{9>&)7GGN2LUucLD-u46ksCS<UQ{W1xG#*hs5{kXcLi-sJFU(
z5U4=Me9RD4INNK8dj&Z~zDwMavdiL<TN*weqz8Ro&$XH?*|ii+u?yPEsR4sR<pk85
z8O`l;#PnmxS3na_q;vdMTgsfFPbx1?S85wD+O2bSrn1oL0a+{2H;T|aWIVY0$JcI~
z9%l2bvQ~Y+Q{n59p*B6r3}B0Aaf~Jdjb_37Z~TkTFICEwkUSr4M@cV`g22ZHMs6tZ
zbVYtfgdWf49qH}M^W>?Q@Dre9ZbLL~j5Uo5WGZ}GvltMH)dB{6Wzv-Aul)W;nZOMd
z^77UVUAxs84Bai1?&kq~fFz*Svk%qwB?eexV*^`+<2DK#H?Yuob^0l0Q`;rsI)NPT
zeT%ZGw_O5IN>RV*BSf`#;n0G=hDHGB4#H=7a#K9c!mmgQNSy2)zF*)LQp$AZ02sxn
zOFBOjwe+n-1ZsFGMuD<fY}olruloqaG>O_Yv5ugXl~ye0G^S$VLX?ii<%);1-+ChQ
ze9s@DbYdT@g3OB2En~0?(s8Qg{CG`NWVqx?jmmjXH{A;uxrWqWt6v-C3DQ0g%dsEU
zkQf**L;C0C)bd-0IMp4#%-FRATnD%h%?Ij9TOR+!O60WwIxAmx5OC}W{j4%Bj43I1
zSpC{3PCLdoB-L`yvDKtjbIL?yF{I$OdCIga$V$~?^KS2W!eRh_jFnyoLE}*aW~`9G
zO!CH+hm7SI<Rl|Jc6`MwL=Ro+&mlZz48)U(XFD|EB1{^Q*9AV#)9!v0k583obFQni
zgJn{tRz8dKS_1bU|8x>J9Fm(myseD43wDG?3JatUK`sQs94*+C@|>SPqx2>oo7w}e
zfGUK1oNkvkd(FcqBz!EV1Hm+Lw}lIAMdQ4gQE5ot`K-sqxtsXTgqTu0TpXY`L;%%>
zl!mcoyf4l{Jn?JSk`KrOFP$LWD5P)--<sbrbkQ?wxC#pReQ=9w_%Y5ge`$4dcHw|7
za_W7hri*B+)J?(tdfU8C7A>O0tr5&nkg1a6*d3X*sKBvKhKxnVr&#x~;rFlQaIzjn
z*Z1lJv8}Goex8g6H`;6G953KdVj`2(>IbWyQSpKW{2Xar^c)i60`QPxHx~*IkneXb
zm)lm)$3iN*jn){=H|j!<T%mU@05(ac{>-%{r<s}%X`drgX<9##wxAO<X3)F>`Zf5Z
zXUo}o?=`a^*dZFN3scOeCL(vLN6&^Et5%OsiT2Xk5VhYXg6}Gh8vqI>&5J41eY;R&
zuLO7`-0zo;iyZot-X#W{v_*ZtKD0=?!}sx0ja$0%nC(MjMxV_Q$7X+pGeyE0V}5d+
z)4UjFR7&?9VFbgqsXMY@T=Pz^$U?Q#kTyP|(%QUv*mk>K&)|7O86oIJztDnchVwDN
zOt$uX-LL>r>Go;E^@E+yzUfzJ)wssEk+?H;qUNJOfiNnfh!O>6$UgD(I<w~Eqv;?K
z?)hUOXZwz0R^7!r@|LYRrQM?&>7MJ1ADg|Jh9wbu#mE%Iw*q1S?Yi$$<zH_WwFJdM
zw_1nfP(nzNy(3`>>(b~gU1agN$0JSwTyYLYQS0<{#UOh5CJ2qodT5%kaeTKxc6#JS
zjl(#;%}WFBbCYxD38i0H{vo<;|1_dZ#_*$Eh_<Gb-3HtnN5LRXR8j666{u1CRGD08
zey%9T&yg-Ziw5I(Zb{nzy5rX8T3jTE<0zN=RfxrS_>d{bW`gLA&gPMdA?9YwFW0QT
z3*7nkUD^X0z*t~f(1LP$orae;{5>W@-E5o<g*^|PYH)|w|GZOi)(oip_P(~v6CC!|
zt8Th-Kt%Dyv#B|cuquJ5?%Q=)TVQr_hL+(0bal_ode^Omr@Da}+`F$&3=n!>B#0nK
z3{y9dw(PRvk7a<sSYLOD>5Mc#U0Hs1U%XC4L1=sHR?|j5oTSDrwq&0lUlfLLd>kf*
z1`r*$E!29JaYKlO?IJ-c$IBL-Jj-Z!!$FKl>5F`<`+M)~H0G{%Ia}?Qc$t$~4}T{c
zN+9g=M4i}O6EzL!fG$O!srZ!n5d%CZB@;~U$0HW}%ASO(p5w&0n&k1(D=go}AlHhy
zy}wJj&e`Jmj4ORnn{pIuKnKYK5AP;2)?KNkm{*@JK|XRs5Gx1c2n%c!ruFi$I<4SX
zd3*|d10Pq~op@iBkvepUY>kmhDD9$pWXi#34s;QyRRc^r!HVG*zA5(gjpsz<wFrKq
zil|eLxTeQ^OpTYgJ{l;{qhH#CPP6$nbFo%wZEnND1UKY+1zS7+lk0o#hT{ZQe{#IS
zHkX`o<a~D_C!%M#n^VH=bQ|1#H0Z^0Q~b8K#idyuNY+4BBxs4e@uh&r4#|*F7?BI{
z`iqFEA12@A<7=k`3tEb3J>_tDi@O5^?MCoPhNz<ncl8G^G&uD6J1V*#Bwm~cTQD`_
z<`>Ew-4+klNgp!3QX#&dC7XbE1HJvf@Zst9Rd`BWd-FYj*AxMGBHypIS@f;@cFfbr
zy~gA%VVhvdJhIB;M|c^X5FiMk)G>1fa=`sq?%|)7mQ~(P7T8{`jWW9sUk0TuP53;g
zjGaS84EE#wrGw~r!MlFVpULKDxNk47%TL!uB|ZSL!-f3TgnR3*^BD&NzU?=Unbz^R
zJL2EG3;hN&XhPd{t`uKe7N|V&TmDwov8e>@LO>O6by@EYi*au9qX;FaKvaM7%1I^O
zo`9oK$TT}<=(mOeL+!;Mr?(@P;Hp-$MYQ`4T2@lz-R2xkC4?Z8cLq2*?O@n59{tz-
z{<}gYj>RNUQ2>p2AXTgqRb;(~g`}s*5`w^LwU3z<0-lkAM<E&7rzSg<C(^*nIbCj%
zJJnEs1XUNi?-m6&AA}7dVv(MgW;1r(LKfY6GN>ll8bTF8SB5#1HU<2gWDceFLv&sV
zxJ(cqW#X@*3?7Z3+T{y7%e9>koVLw103@X5q+_*+qSGWen3jV4ZZs2uIFbr`+Ajds
z2!i&CsxB(IGJsk3;?XY6;B8{RGj!=u%kcrZkdjr$Vk`qKawog+8}EL3P1F<@z6wi!
zW`_xp@)tL2KJH5KBwIa`1ekkexPktvFZZMPDN6*2pyg{BAs}l<<sh>UU;HE972l3P
zUUF!q%-;MO$P6$@LgkPtfx9mDGNkec1)tt*o$|ON(~{bEB#G;1n6}H2(UQ1RoZoyJ
zX?5w{hu-mL7!D~z-j^YXMS(fok(sL8Ag$+g_sOBAP{qD3WkU*o<8Wbtb2!M<N6giw
zT+T}!%v%0Q%G}pdS!vmlZYEH994}s}+8jRsCLs5&3ndRuBEquHp4cW9j473lKq6aw
zxMwgvOE&NWIJWcH3YL1r!*M<yxFifQUthCuSq|#etH*b9{8!x^?+}3%C20#1OEW52
znAp8R?9my(5n*cMchE!_#z*JOa(PWrkVjC+1#2ve;$c`El4<iK@>X#P8TsBG)o#<i
zc)hu<pWQr{n8OL%aazwZe_b9^XID9JE;7&Fj^m&ccn5hOcBG;x<|5EzPJ80`tD%jJ
zieM=%_dR+FAz<?KWy2JbyM7*iZVuyqHKEjdhgc<7IPw!AXeD}IY3N~>be~!z+RfuD
z2Vq=EIg0LM6d~vo{pq;t4jXmXo!%FygcE5%MZikWF6QMp0Th|LM?dpR0m6GJvb?8m
z7Ub9F^V>lWyZ9F0SGj#qEHO(bdwJx0ORP<Z>f#T!Ky49}lBo{xtX{uveE-wxg&82*
zSP(2?=GCM85T}K>)^vUIf#^sSVHXe6$(U-b*?1P&U60s${bSUP-aq)_XX~PrKllJ=
zT?I!okJubKT}D&wQv68M3=y?8p&^>;z~1SSx}%p)SBW#~mr0AXz0LDpQK9g0z!bb^
zNJt|Qq-1ixoa5rp{U&?Mbh1fwqaK>LK_P^@U<KXvNklUn+;<5<E5M^H(Kne!vSYiN
z_VhY_v=4;pHP}BL`(Ys_y7$KC#IU;_7k*)=eqs4|w9XC)b;@51l8E{u{JSCzOAKQl
zgj2zUgB`|av@@Tsa`f3_=9sO$1o9b5qqOf`+$GfaWwY#};CU4;{VqhXs=+cBU%uqW
z0L(vas|+==z3#eGS__eO7_7Eq2Ajq)nad7Me*IFCsmdX(UUJhhsG!M3cj*vR@BdLJ
zLg;(cP_`g5gYZi|mweJqUVkULhOp81_ki0?<Dl^;(%Q9EBqW}KWsZQ<`C1d}_E=<M
z!+b)fZzx3DmYHG1SNWt*B$G+FEj@N-DxB-72wvwBMC2BAi(nLLS7M0CH+w3hqw!A(
zgSj5snA14l5x{$ZV^gdox91(=3eTnDv35TvI=#Z|8~^0YbJ)u*ujNk8ZC4sG4sES@
zw{T{u@27{P5jQE=?(K{b#ZljC`Kyl78Cb_##g+!Ih@=*-4aWd|R+LAs%hbygGoBgF
zrYDBD0-H<EfnL`{1(&PUn7SF<?3kYX4XYf3loaolL!JxAv=@^d$0cf9+6=Hf66Kl6
zo1%hkhtd8Jeh$|YQ#`ThIcoqAycEx&f(9FQ<=={U;2e40Wz!=*sbKMbKX@j=$X_^8
zVDC(!E+U+7+)V(qxLGec`Aoz?38|T$mwyiDGqh~U!>u!esPl#^+?vi6xF$dMldW`^
zGKc$i)smh`>nvSW;AThr^D~_hRUI2KY#Lo5tHHfSa>-vTf)Q%L;K@oh#o^s*cu57>
zfoOs4SWoQ!ipeZSDRKyun(+3_j<?j~rbSzI>v!*CSD96o@AGQ8Vxt%MR^3AYeR&Q!
z-k+Zk+1UJd5K3pIia32->TmEUpfoIT!2YAjZv8|7;kZGqbh%d%ZJf9<+RsS5xivi2
z<=8hGG%+Z;58o(wvc{-)aUYr6JRE>J+G`b`or;>x(F8wv3sNSJxTJ|i{2&~8!su)#
zUFpZ}as6!eWEi3e^b3|yaA}W?`8OC$XC;a7aN!?RtyTvcwb$6EukBOX@t6lgT5asY
ztlc;kee!e%oMmGpt54@xl!(|B-M7RRl(SVjDe<FN1zZpYo_BKq^?jMlQ@p||H*S(J
zBd!ipr)N75_o@)!uv*{8j1s&%`k@HKfQJDTbn{i?$kM8s>Tl`z|AWDOrh}3w&)+&z
zuR|<H*tIk3HI9*{HMu5c>c?s)2_i~7shxFqG&uwz+fBY^7xzK;FzYYG3N&G~WiAKC
zhG)+%JU?r~p2=+8Q;JkUoDUZKKZfTtl8CE##(seerZI)!O%pZf6|i`P2|g~)WHP~5
zE+pFF_f&}tQyhNbM1q|<(gy;C+E>8h6($IKrV)1)tHSWH%HFbtWU;GiDO5W^G?^;|
z^G<Mxc2U*xpsI9GM!5O1;Mqdbm^fvhJA>imNuxmTmtTF7OIo^CvIKH@o0c+K|EZNT
zX&>ZlZ{+K<oG0AxItO30<eoSgl6n42G|i=EM!fWV7)PZZXZN6fXn)ag@w26#Ca(Fb
z*T*pwb~VSKsMf^SmQIt=|9D-!!1bM~ix+iPOGz@S!;a<rXzi0vf_30-NWJmEAaXPI
z-)<*94Bmd!c(d*n$zJ`TY^`4g#hykVe`Up4-P%g68S73TEiLw>0+T)homWvmAtqz%
zoHvJm6f141q`Aem>e!t$`I+u-@Um9jNW-}M!G&YH;dV$s#S%lV7wl{?(}cf3a8p+r
z{rUacyBA;9t=ddQ-(A$kD(tjwiSGtsS`Nl}J$d7(hWOb67HzVkN8~rV6psXgtr4-E
z=94BKYv4=Yb;xq<3%pDq*Ei+mIUSEGrSC76+-Tp}es2BRt*q<|+UT=pA+3$ol#@iy
zh5hLj`Xhg5$a};#JLtuci8OQT#NC`Xr&o<KIldBotCX;}UEM#PsQudZ%hGIVePNB0
zj~x2ZKq3dI?q<1j_Q=nE8n$3onF;$N1quV9m6d)`{&z1N*(Idc+fSama+?-h-2X>`
zk<KlSNw5CI&&doaZ1o>@gscZO_7KRRem0v(+c27+?KF6b?rU`aiLPICe%Mws=NNbt
zY{zK+s=UQgH1k{4cE{d5BFL#r*^Hgr)X>AW^$d@ijTuXn-?CRMVHXlmgRnO7-r*;L
zBvyXtj{o4hXjzdc4|~RqV3~jQRs71el<)`}=T7t3JLf<9UB;evUTjnwu!1?B)bRvj
zzEw|L0R7K;u+cJQu4ql*dI)wx#l>r>>8U*(Eog=FHsi8Yal*X&2WIvLpHb~Atn-t=
z&c#*NW>r@I1NN~GfV}FJOv*(U11*8*^OHJY%fOe*W4+Nx=Z5KU3*!af38@hrLk#Ey
zk0x##Ut_%1zV^MVa@K3?AUiR2tttxLtB!fsH&haVZXF#)p7D!c{`Fsx=S9l54BPa<
zz{dN11l1I_y%Wo^{d*9r-6?UJxqMC7*sFwvWqI4aDE)VYw#@yq)g8c-pMed%1noKT
z`;}{5_%0*eMQG(6s{Qz}(AwV5K<`anBF9=M3C{x-&tkE*5;8OmNxKhm_kdWVOc^t|
z5V@}WbGe|2wS~yUYVwM1=f?i#TDTC1BD$|p^@>+O$Bl0qV=hkyWKbpdRrKR{!rYXg
zmhpymUKH~K;($G-JC|7s0I4QjhZ2kQY@cb<UFN^YhJXnoF`81AX2?vDopGx%@4Wgk
z7R<)yw4&;M(ZGfuO&-NEPG7Yf^ikKu*1l7t2pgTU%If-;cUMw7j^fx_NI?w(HTZ~5
zY2=vB8rW9mOxGLCe+vpZvueEw9eI5F3>bmi!j5U?An+O8nVI1$E#-np3`5Kwk;N06
zfn@ZujXG1fMV*wX?;u)x{`K8a#IIpVtquTQ%HOK%*TbmN7*Q?Oc6o`BfD(&P*pNFw
zD0XY{;%NEM?a>=WAAuKD6;nU$lu5hXP@Rff>w<Ym;K}Z@!Sn*fsD>^ka~K<iRjY;^
zTxI2h3P5g~o`~FAuT1^q{ca60U})uaql?i(Q8mTT4e7p(<4s~Bh;(;j&6q>FO-LdL
zmb%!L{Ku5yde9OjKhYBs_83|{dm`m0e|F{NnVPai5O(#YQVX%wU?!OxwXX$Pg`j~z
zk0ZT_izZ#67W?e5ZmKR9a-sPAqXZ|W5rFj--uyMOn`Bzls@T1?nb52Mb)(+U(Ws(C
zU9JKBi1}Q@ol8`AdBX^ItCQ!RCsM{LCj9!eZDK@*U3N=X;BZXO%Dl>Chr8gVx8G2^
z(7J*Uu&0ZKXF0g+S1;*OI4HS$M<Vi%+8n&cjTlN3oF94|nXW)Q>7Q)i-BkNJsP-hs
zCS-<KWQdLBy6nvb%Mdh9`+AQ5sD-mkA3=L~ytaR9;P_=@QI?D3w~JkRA_%4UW^X5b
z?&1XcYU(ZB5~KoF8*%H}a~5t>z{6l*zvFOIpUHHs{_5@;av}CGdk8+4@W4F>#m$A=
zp4;CgZ%++l5D+9Mub^&^oT^(LAwkCh1oeqHorH>n+JvwQR*~U^_JC6-roqd?T?(Of
zvv8#K_)Y_vmRsv*!pSUq0>>&uW?47$Qf{Z*Lf}BXGjKN^JLzW;;ztBIt3_g#!&MV}
z01<8~R+GO)@nG3AI^HD};k&LmWwcFk6@1R?Hn8rYj8L+bK5u$(ETPxp0YhCx?w;fK
zhi!1YFc_ijEn|g7<U+Y)6|_8k1W;$IlsAilvaTl5Tm2%Ep<z4t>k3JJ$XfJ26sUEg
z5Hjv!^gLwo{Y0Z)oU->enNXh|YV0v2{K}JgFV2<vHlw)$$p`=+s-K4KN$~}TTL8?T
z+@);C!_4{ysv_S2L=cc`U0xTn;DQhYMArS2fhZ!)t1cK1|Emv4Fa6jQS7$aN>My$&
z)wkr!&9)vSVHqME>9jG-eyRM7!^N?x!2mEAKm~a+&$Ly4=i$T0^Fol3a81$vQ}t9)
z9yKt&D9dl0jZtsxqL{UWlA<=WpEWGDBsR6x)qjlm(<3h(xw<k&wHkEN_dQ)Mm~WyC
zTrkono1!sgR(cw|vft-1=<*I$OR?@b`Mk#yaC{6g^=K_-YYv`>t7Ca-z{D<X5pk1w
zeLn937dP@3<BCL%a?Q7Xm(kwtV#<Qu6$uQ@j@XMrd>x3fsb}2{^LK2`p!K|uAUU_~
z)F$@=6-I02$W5e=46aQT({=%exLVP=6;t-9vl*KD=jD?GD}k}zf3{|*JZ92SIRV3!
z+0p_K-wp#l)1k(n<=~;QHvGJlI0kU>?SUXUbl=P~yzmv=b+Mf)zgQA)!a~wJB^NK@
z08jI<XzeIGA*4k+PADRsdtK!dzRPemYF|r~GO;)|F~~D&(fr};5YESdnNv7Y6WmSc
zB0-Y2GrCm>F23d*?MOlUDRtMX0gU_WF_yQ~m`2XKa_EnA^&K#1LqZ{lmg3v9HNi!X
zYNSiyY^rAeYciZ&7<J4yd`}o<Hztuz4SEb<;|M!j^KKqlywq<TB@y1V!vkb8o1{Tj
zum}p<uvf1DY|uASU+_L^J(rIc7c|INP-_U`;psOmFQYH>;raK@B0lIa;Ny7_QJX1L
z@1>ozAwDI4YT9CmTh4M2&w0!-+S_<mtDf3K_xGrZZb=7z5y?ky8DgRPqo7-yHr8z{
zzc%^G2O<P0%=R+dYa{JHx!#_`Gknt9;l^z6GI<>5R1_9!*)ROl!<dSX>C~UH6J2Lj
zNfo-#z3J10-K-CEH3iYRnvsu1f3Zr5WmSJQE&Qgf>fq3rNEz0Af8`)5-X-l5*<G@{
ziqrHs<irDkDE-N6iFV%-IPE?d8B=}!L}oc{KByy6cSlFXuhvOWe(jOTfD5^vbjfC*
zy`ezVTB5_cWMtmM$|SM^Dg{vxz968p$QCMQceFMS5uS>1^;>gov}mC(eNhR-^R>A8
z)hqxakSCR=Oi)Y2gW_^xKyr)JO%8X+czCSE57xEil*8(r+8cLcy{QDtBQv`Q1wkOn
zZFxZ^hEK=eFN8N01i3)Gi<IyOV_kL~uWu^P{lsUT(%&eNNfj}m^jHqx4Qk5@GMcYB
z^;Y1>mEKv0+<GPC9ZLAF(3KLvqFcv3E17Q{k_xP?TIcOkMl19X&?-cAMe%-l&8h!!
z)~zq@;~c<Tbn-a<R+(qr`!EV&&}-A+^T`d_jnBo--iv#d)UmFKl!KiGw=Imx3(67x
z4GnRncb%=z4~eH$1n0Mmjdxp@;2#M$!qLZvW8QVI+=6{M%#GVkW$2J=w-1<JnAXMp
z#92J8`YUoFAzdVtovDmpYFKIZ*VzBqtLL3EeO^i0E|~7^RL=2+5BFGf;*DN`R$l>O
z<BRX^?>2Pr<T(12PV|T>)X|;Y+5|Y1*Ph-#J%l2SS*+Xf#b`0@!H4qF4`#k9(~|kF
z{+}fjCV4#{=D_VJg^{t!VUA5t`-`m*`4lc+|GgA{i!Ie^*)!g5dc8s+#9W@UPyC$Q
zbavoI?RPJ&f?b!y!~b2Yf-mNm98?;j$IsspPI@oRM$B1rgPxy0gEjx^HqtEb=lG?K
zmRvC&uE+}i)}2v^F2TFq+s|z?%t4)w*<Lp!?i))!?{D8d6Nue=HgN{P{#V)de+><T
zgq-o=s-SB9AIY#s`Dz&gVZFh^siP>IV>+FxgYI9?o(PAJeh9(eRadPT$o;=2to;A-
zf1?gXY*Zstv0zoRRII0}R>~I@)%H-I&z9v$&U`4pM(DNe<dpK%LS{AT3RvaDw2iHn
z-!eo3PusOg%6;|V<ss<^2?M7a<55EuY)6ghMVv{*{?5wNU89~u)uFH3fpkj+oR_Ul
znhzXWa!Vc{Ltxw{u|ig=d1=_^HvUgWJ@&HB9&-tN4r8GS6T5M6mi<lq=&@YuQ>rU+
zQj~brV^y<vDeD_MNk!#d&J+?{Yp*U!7e@YY)>y`k+#nKltl+2Lo)aW-xrU#kPraDi
zrY)cjde>nyw;2wrNe<uNInu_TBpiPdP<o(pTT}&3t0Wjts(ZOT>8_>Ivc9(yf7@BL
z7kyOb_^kW%OnyLx7u}~e-Tvq~hvPFmCwGS?OyhAcl5_8hW;bH%g9^^VHXY~jqP`~$
zYWkU;Z%_yS6}yo_=w+re_v-RdMFMduzM%j_`4DX<aA~OZ4t*8eTJi{f3%tXaPCRXv
zG>bg$PIbjbCV@_fUOeU?p8kXDLQ56*xR=B1tzC%R)6vzWL2`IZmCmTg;4nc}0W8k|
zK1KG6<Tpa>x`Y<ouo`GVCFK&&<G3mtZwR8@xIVu)F&YZDf2@jm_r+RuEIrViLTFhq
zJ7vfyfl=*66D(sVUOJ3;`iU*u)I~wnUe=waD8h(FdbI0<O^RLb4@v74GD$6r=G7@*
z0(3O3=HtQ$aOamDyN`Qqh&e(oyss_cIS7~ac6pu?1D0h;?=-9OVlN-I1)0O~Mq_Ei
z_M+9RmIw-wrm!mc{*{rsxU2_5Y>yvwOW*HqmLcgAY1@2sp)$K4{KV$)8?Wk+lp+zi
zP#0<NXc!rulvdDrE^xm!%<U1(!e-_R%qq!Ui#LwxCWiVl1=}<~vZy9~H;O15QwI~@
z#nVfDbeswN6o?$#VKv^Yp;5lt>>g=&&Z&Q(F_%*l{}t8RLI-hVOQk<VFjo&_(>wHS
z)*J1$fZc$toT)k)j|N#%;W@lUl`xzEDO7uB%=t5{+Z=m(1d~nEh)6H73$c*BFwj9H
zu&?%rwE0|er();Kf!)9qYj4isA2(G-Mz-+O(wStNkR7%#;hSQgy!r7=ho>$lm~E;s
zD-xZ>bCGGSW)HWdi~j8_EQH>-IVpuA?=5~Nh~wt)>R7l>G9=PgWlWXEgIZ9ln+i5q
zHcuh+FloOqMSS)9+ADtG_MG9DP{M^CEwrY%h@WjzGm(f=nyULjh`xZd##hhpoDZow
zg_0|F<H1`Z2&*rXrzikOw!EyfnjwBaL6iD2Pj$xW=U%E4qz!Qo5lK9quj-pm-C+#z
z=UajtJ1yOOyiZZl9_q^USN_s5o9fOR7SZVdXaX(~_gacKXpY$tU-{wkwYv|?)iR4|
zTqa@juAJG#cbidwpv&ieGs@m86cxd$e!Wugwc)hXRy;BLnN}|0p*htcW>?0AYUsdG
zPJ5w1I*85LtN+7lhUJ-rsxO%36Jd?&s0eRlVzLcY_$s4$d$N1;>d?*>CuFdq{Y^ak
zCVA_CnAD`m{b?1Gb_}vsl?aq0r%X|S#+wTu5JM-{1|qhYAG%QW{W4TMCz9%ew@BD$
zpl$bW2I&jP9-%TaJ@DW?8nm~y3M;OxvLE#j@XIBf;(*XB`uucDC=1Jv<F!B#C?fUY
zKL+n)YYR%^In(tgY$1vKt#3()_2m?Z__f64s>p@>GMfpNiG~yY;Ll1y&Qoa;jU4>|
z8j(Ve<{CXle-;uxk)$vtp%BvlaOSWkf#l4Q6U_{0szA%!zdsaep(j4F;DdiBz-3D9
zTYe(P&^r*#n2}UhP(9!gdMV`f&qy@@wE39<oKfQl`F`aO1tN&ujQ6UmG4)Xwe&7V)
zT-1eAy9KBW!S0+N!(<hPFusOVn#JT#7mrW3a$t5}Zd(m{9|yc34aHkk8KPq)xN~jh
z**F0i((~l>CP2zslMi`>bCF|EdJl~a-26n1Pf4>KnW{s5QlfrLgwU7uNt$YYgJK`5
z1h-!ovj50OcX>^>-Jqy&EhE_l%Fj14oQ6Mil_HlarHPN##R~;OUF!13i7Pk`c&A6a
z4epPpF0Yd=<UO`yD#X684z!}W`4|Y%)3OioRiqpVp|Ak=@3luk(cvRs%>k-lF$4jI
zqs$Z%byC&dvT#ADT0`jNZ<D2MA`^)j>6JG&%P_b#$+zVDI&kUs$6*4DP0lMi&u9$s
z^ISmT%E|sQ_Lzr+h@2j@K8@VEWn6sW3r`gIi7BSq4S9+mJejg1B<|tIrs4u9Mh$DF
z94TY|?2c?5br<64QdQqPvm0ht3jz-!>H(K)*%B}D6B#s>LOGgM)%Q;bC=D1ng-Z3Y
z`_m!_%ySbzTsyuo^OYuEza-4pyUA}~4r>@joSs7ja*@Mmo?m)i@y(x4`(QBwviWXd
zGLy33yr%1V?~1l5oKO?rJo&-gst@?UAAh%j*~Qg*fRH&2yWO2Sk+#H7wEekPeAl18
zw>lvK-Re;>;PEu!C*uuISHK5KLk{zek1*N0bWE`TL&VzdyF2qMDkY4~MAB;0UDK6h
z<W71sku{&f@RPIWKf`<`ovKO*iI<{Gc<?0Pi03ZEfG*}UM8&VYnGq(}U^IqppMFV_
z>%PPwzy$443V5&rV7dq3tK0tJWygH8nAoB@uGfbM@+y4T{wESozt8hmXkV?lt)Um4
zr7TB8uCNlF>x^&41ElY~GS?(O6dnmWG|%#NJHVT}@%!f$VT&;D4foX<$@>XU!=Ox2
z^%9dpbPIpKveN~>h=9R-Xeo1y%ud60bUV46nP*9vT)jAVYl^QmBWn#AJ<-l=KAGP0
z<}_|Ze%XrO@E-SCV1MbGSA28d1#j(<#y!ae^5AE>e^H{-SC~el2#LWJ#%U>-4z@_4
zhk-2zwjsVy%(dS;Kc7Px>VhA!*&bSm9*^7uH8gF#HTxEuYkbo=^imdJTH!=R-*9)y
zv_mRgrN)!%6E#u41TpX^k@p88tn{he3K^au^t@+eulp+v-s$!AF&1?k5kkfAJ*xt_
zuClzXgO^&O*LQ(HH9vX<QVJo65$I0q2S*W!F6Lf-__z6NIlj4&{661v^rV;$m;oP<
zKACUy^}a~0z;}7nTs5daNs#P1Wx%p+C{ajNG+#DQK3l!VxYUf(<_OsO;-!7al4X<Y
zs1>J)Vf3>E5V_Eg*$$kBPGj5kgAlVjk=izGIa7<wo`xi32&&6g)gf-77(9kXigpdA
zy#F#}<ikVx`w2heO2R5DKpXU^Egt$qv_+#HTmhlhdsaoRbH(@mLIUA8c0O0nB%XPs
zTAxTY2rS57CFSlN6OrZpkkh`V5+@Y){N$~WU#+32We&{i#HV%N3}2DFel*R9&mwQ5
zQ9+`6g=$T*c-nsfOxA8E2`1|vA>2_ydj`*4#RoU=s0dRxEP%-o&v%#eHQ+W5kexDs
zpL<68gGZ~Q2<2AH4IYIJPtR4;tTVeuiRPnaWItTgFn8r`m^Kl1Rl8*+vN<ULI8#{g
zHV0TbTuV{5s8Q|Rt%pF)>iAz0rYM5EN!m-6>eJt*EL;W{yO85O67TSpk?izL{)VD2
zL&80;L**f-!fHrl>ORm<kuLQ6)cx`BIvTCVy?ql_Q8Ym+Tmty)alXOm8HUTtFCnb)
zwqX`NecY0`4t{^~^MKDy(JCEV>`c1wFj#vAiP5|Y7*2n*cQurU`|*Bo;_+c%QX-1=
zHG;hL4SjH%(W)$LZ=~ji0Rm3|gmPb;J@VlJO$I=P6NU*&i)nwwH)6o!y3-je>bPOo
zusC9R4r+1M(|ahCDjt@%H?Nc=uvop-^&o<qLd#!3(6~hDDxvx<GagcS;f3&{1lIkt
z!gB2qs9jdJG^p^eQe<*&-4*1A3i^^hnCXYVv)jV3J$Wb2>xq``^`QbY)OZYaK<*<g
zJjK-8CYX}k2>%2*y}2@iNSG2)9paH<u|B*+(-6obmoQ_@IaeKuJqI@C;bJVPQ{oo-
zuH9u)aE8BYw2Z|4Mgp1yZISs%SH8^b6qQ7H7eMgbKiS<X`na5PO<5L>Va-ZOu+dZ?
z5depA%qPoYPg!$QxDz}yTWBMwC0wYu6KIoiJqps2XA=q6?LR+{GqKHuakrSny$xjG
zT-(cZ6T}vIP=LPk!!Q}D_1_*FUp~y27adPWu9e>s9|k<YE<x~x>2#0kjO0oH$%-iW
zgl>Us?IHi1KKP-K3!im>4Vz_{Y*?bLv`|obO5bd(YJGCYB(+em&hDrDvDbBboNLr(
zz)|9U2F%nAhKBbD^peN5C3=v5w8y)J5wbdYt6}r^?k+buZ5Y=1>7Vot@$ANStQocg
zC}avl66JRbqqaEvrixWhLJwIXTsUAPnS=2QQnHn#h&t|P-fmhOUtz}FmhZ!dN!PdT
zZK}KIig?HQA3M<qKD_()-b83bXmakzN6pvy-{pP;J^1IR^u`QsX<dPhw0x-#mphw)
zEgB`h1;2a6s7E(<-$4FivGe{yNUyy#Tc`i`0!qjG$odykoN+^DTQ}3o`;1#q+mu&M
z{d?u@|J$cUhX``U{j||M-K5ix`HGR}V)xtVo;^V$IaiMLKj_r_-Y`1WOf143KTSI2
z?|u6~S4ye7M#p#v5%T*;Irm=Hsp)VOkk2TjDuNyPIuEufejjzjp&+7^Q|#{lbuWqv
zh#+U_J(>?_P+g>^U=iPTQ3V#b0D~kZ7Ek7P_XN~BMIEPqwsIHjlpxNWKelc)zXpxF
zrouo<j8);hksH(cF<UdtRGzB9leJd0Md3t2acCxIL&Ui6fB&Rp6t%Cg+4P+uOM5|S
z+;!((^Tp_A)Xr31;w3T<5q)TbI_WNRM=Mevy0ixK#mqhUM~E(d7bjVxM&)vHf$K|r
zW4M#8vyn@`WxqpWC6j_)?cZ;yL1Uu$pHBr3lziDeO;=+FN}TV$tn-S!+}mS=k@ZU$
zNV~5_VK3(fV-M{da?pH5R-gp6?g2S}z4izSjp97#O>DhND?{fVv37)GelNRpO)lj^
z@EN#^u}H&v!NOAdD7(67W$L-WMFOyerL&KAs#@aCCXHUa4pO;g8uBt}gXuq?Xq?!4
z$v$B2nmK|zX|>zG8euo9C5o(ynQ~a&hBeFFT#oEWi5!tT;vY%R=E}`9Y7qfBU~I|#
zv2uC;yVAB5u@9}u^e9IveJxjCbwn+phU9(6op1*R$vM<sGbbu4ZkP_dsYNv?r9%Hx
z@c&ZAf2x3fr)BION<Zs!PSE~(GYW6(i9wf_q1v}g@%-?mH<I1kA~IHkyUg4Fs3Mng
z)_V@TK*~1}mg9rXMe*_pFm?|7zqmd#_Gqzb-I$cE0i#gcJWfMa6v<MdNH>J%gejiw
zR@yXdkCLe@g_@+W8<n_6#Se^yiKTf9L|IHbO=+kVv&!6y$>J`vbNOqW>*MXh!PtOG
z#tF_5G1z}Ram^~EF#OiU=DmW5R}vdtdpXPNPA5{zX=biq=^at4vEa+g0UqZP{IV~K
zdpB)Rx{qA@*qp)UJN?Jo1y-0Z{W@fY#kXs{HKsYd%~nCY8I5);cG15KKAKnMw?f5K
z!4&`b6kS8L7XDik4bde_vlQ@Dr*=;R4Q`-E=vb(?@sAsD4TGG`L)?1t(8MS3<J<5p
zM)Pi#N`L98>}n<a&0$%;g3G!?xDA84mVC`5f&Z}pqxpk-A8%J<J6`1#RS!vdpKjW<
zg6HOTt7_*!9{plk4e*bzii~zEj83Ol6QEWY?4VX~o^#8BfSB}ohX~YXfoH$dRySE`
z>ieh>@G_@wEYi$x(TUAxyk{Z+)c3i`Pui`NWwbHRPYiGJ&UH8x_$zv=&|^@<rJ?h%
zu_zgMp-flI{Ev<MzHm+K9NXIUg<r@YyxE&r#f<iqXO~I1^<g8<a%1BAXomz3LM9H5
z<QY46#IK^kl>y#WcL_oZmdykA_pdwq<2bbXKOOD*o_^h!mJP(lwic`8`8hWXTdEhS
z3)EWr?boNNxHahg<HM9*_(d;8fic5zcH;rp%n!wCX=b+C49KrY=T9@<Kji1?e&RLR
zL7}fTWz^_}(82T2zWfSOvQA$RFP&MR<7acS?btD;fBB2mJNvh^N2yvZZgX3XUkn=#
zBeIGl<bHm7)uGXFJPSVeV=lHTV#<>Lc|JFSHf+G44MrV`as=eIs<uiS)(2Itq&hnL
zTdFh^)XRvoZDwcx;$}*<G((G9X+shS9HbAbQG*a>kq=N>iV!<_QZI@7Xk<cU_t{gz
zy(>L#D(>!QClKC>;$$_%yf(2hMy#%_3DRrX;9Xy&_X%XM<vao$DR+n@f}^Ai=IL0n
z3j17ITGtH`w#equ5vwj(g2p9J%$(Czbh$D-<#(pIk#ZB(1>C%mvwyeT&BtjbZWEb)
zH;EvA34@&e0q=yCZ8L3Zu$wZhZV_|K>VkQ!?(-<^CTBVaCy|d?QA}l_Cf%$pM$&t-
z)hZnK6BhiV`N;P)S~#S;`0%XFgZ8EN$U*}X_kb@h6|th#za^aGI?jsU_<CSO+jU#6
zQAkD3P0pXFKtOvfy%66E5NF0o1YJa<)Jo&Zoq;%nmN693rC6LDY}2s$VhgmSqi*_i
zM64<WKj<zNXY~aR1(501nw@U+VZX9n^2S68jo*|$P1!uG-V4axZP?34buV6WT;2H}
z(HRmsCU&O9FALq38vu_rj=Ze#y$ob|ss%|IlHJ-4kE{JAfH(dWdopcPYdl2)il{=@
z%sDNB=g-OXYR%4nBI^$jU&e->^5P`)YlY!mPsH?2FAaj@m%XwczxtiCSo%-*TAR`a
zA7Zd52kz~x`UELs|D6Ur8be0p7^~4$ym-E$TAUYveeAdQ8QHKG{h(zd0M&}iKlc7e
z9Lu9v^K^}hC97C#v0*CEY^RgE%yZ7{@&_8+;85)4;Iy*-Ph9Ed$9K9LEVmzmOZ@XI
z<+ipxin=u$?!-j8TFtw6FPLy7FwEVml?Qa9^b;q4Wn0iYd|OlpqZw-4{!vvk7Zj<C
zrlRmMrY~<kLu1p&hP3tqEZymfjg*-(175z{VDja)3|UH&#&0`eGaFew=9rypG#W1N
z87ZSb)hu}Cfm6k%qT!8KZpSl1U(odZYITdjfB8h)#F)$*_#=u3p|W^BppjFz)x&DA
zlO})J%X0REY!8HP)PN~EiF-0=w|fe0q0a<7Z-(MpQf_ew1`KTpwgzy`Mhdk(u6~MN
z!^+vM@3Td*wFC*rEK+=gDN+|$ZJ|an@$PIrrhYx)quNkV(e>?SPrBmVy4+?$?X(j2
ze*7^YBNWSW*kym`Mg+-pQeW8*V^K=UO&GZL;->emTEA2kx<MP3-&1TGU6xpg2Zt!g
zmzPZ2z3{*655z%ZYezR}M{gMz@h#ZL?u;fri+A_naGb~6{j)3$4O$ZpE&S$2he+>j
z;UES^LjU!hp`4M)?*SSAW;-8B2oJwa1-qdGB9)q{E8H-G>8t_$A10hY?UIM@B3s$6
zVjbnznQ;>(im7k><fSgHKGHXiEN>}+YW6`@K}Tz36C;A-pFsvZf7gPCMT@})Z;S{E
z1=;G>|1N8U)s-oPWFG2hNdS+HPv>RKbZZ6cFP)KAxGrRC^p#C2CTk7Qm~dSEp3C!-
zh{w6-Cjb=IVzKdJ3EUBtIH5i2O#iQ2QB<I8(}m?O;HA)?vLhL9zB#ow8vb|cH85U2
z>3p?Gnm~!LYvMC}A5s71Sv*q?0zKWQJ?~8GV88#N6dxFp#-si=<T<r}Id}o_w2j09
z(o|Y>c&7z{-Mrc2IrS52x4=L8ZS`3-KHuOqw~eqf^+p4BD$W!tX#i`l-L)ys)?EfQ
z^f)+k`s{SCb~kS9B^ERv_HSi*#@EjmT=O!MxJvE6()bl^eMwEDsbZxfmo;#$9s4tR
z(A|Um;*3I&nz?*VNqXG&nK!kVG1mv`e~Px$e`-X$Fv*3O=ISyk1HQ^3Hh1#Q<#W7W
zhnpxOB1i7nO^C&Xd|kbE>$fcR6|65qDTZjuin?>$EIgTOb(s$8_NMC=?)?+wpHUMz
z?kUE*5#It;FQjp;cfGAW?f>iZRbhKoab8*7beECy-g@KgPyVGE|1&E=y;?@<ET{PT
ze{zMy|MZ~vCf6Ah)Z^fHVsAerG{!Fmz02R<woG;y5ViK*H$!;2(|YY1g9%*h*y-<R
zty=KvS3I-W#bneQ@t3t#?aNbI`mT|cwLFk!Yxt2lH-@rH?|ERPMHG1ORpPtjJQ#Wv
z(C`&Aht?zk)NKR97@84JXRNF!@vOxLJSvr@jNF$#>djiHML7D@{!0V+l?Zwh%ZyAv
zmo>3TuY=r#P)hkjQG?bQOuihYEZkE^_(lYKm&Z5c3&(%UB>zK$;eznWf810lK9pGw
z%S`xe`L=SrF6Vvgw<4or2D4xTu)LC09s^5qD#LT1O6>3o_pMzAw(YC*c9kXhg}Yy$
zTMMwwacBFwX~<(f*9KC`iMz{Li+(X}&&K~<g!k29BEEy0e&U)EF@Kka`(M_jnTu-W
zP7BU7I3D&R#njHy`__QpcREE(UGi@aR$~y?RhAb1l_H`~qug<;TR_*j?mS;@r{>yA
zC4)V~qi;Lji-K}%O?(9y_W7T_^vA~6S>oLLcK%y!?;X|DwzUt3B1J(&R5}q*z=QNE
zRg|KFASz8dk=}a=O^_y45$T{5k=`K?N&x94(tGa)2oNB2emm+pw|Vb-?-<|bUl~KP
zGS^yj&GkIboO^D{1EZf8=kKP=@sotGg%tdvBY%DR5nZFicyO@n-#Nv9&v5>kdH!#7
z;9n`-|AKJ;co-i1i=*UU`QP{uc~cO@QGG@d=-mMO<2#Ea9Zw2@WOPQpu{FSS0yY$%
z|Mzy7E!=%aNBIlAYv1~lB9uBEbYrtZxyp5}oC2{l;|F%0fJ;a039-}_I}EN<D9p-z
zy<9)OUSWHs$6?ens4I|y@E3oT|FF+Qe{cZA{A^OwM7zCl2wg9mHu}AC^-2OqP4GVV
z)l~8Q{c)dx*O-R>@QM-cH`146f6+j#Id^BI_3A<Skz0@Zl(ie{80I7Eq{urfH_CO`
z@T*k^&ioH`*u_+f#=@F)=E@te@Soqs(Z$f)lLz~p8}3D^GM~sUdn5Y`<yr;PLbTVT
zQtaEERoy7_;af|}6;tAi$DEVy*2k}||0gfvQ(X2!e{c$<HH~OY^_UlkT~evvO*hkL
zO#mXHV?FgyqDeA@+m%9N;?Ur+D424}DKR;HpFrkVnk_ZqX0`kW8}}7=*L7_Dq2puG
z@z<r(O#Q*?x|WkTbP?GYY8HQ0-5!f*^=-0~VyT(4u$0>(_S^TdD9ttp;#0_S3{0Zl
z1&q9dgp=FKYHz@x)u_54`B}NL*`@Jn==f0-1bEWAWkhFwDRhp4aO*72^YZ$SCD=if
zaz{!X9~KJDdUl3xYJ=UG#XeK^Hng>NTf0$ml^;LaVB&=}Zn7$iiR+s0lBxphaGE-1
z^_}}SddnBZ=XXr9fsQlH6h0dsxrMOBS33a|-s)7T)?}IVXdgB&tf|kJKq$dluZENo
zPt|Nn0;ZR<J;=v85=Ko#Q@bb7BVl<sM&YAI4(OO0UVGs?jh5&K;KXBlV9iF3SEzkl
zC6jf|G9&AAMv;;ez}>leb9g#g3c$N*&=uRapMRk>p!!SiWG!}8$W`ilA1YhKN0aJo
z5bTMP7aToXqOQ*eObFX&zyd7JB7zk>b=r=$Y^>o$v@HW@f8c#2fTt-)qLCEdXUPe6
z+Yx7rpQxuDhmmx6)!6u9U9B#D<wNa^>|Y=V5Y?rgLYur)GO2Z4v!GH<ur>JJMr%Jh
zX4_*stNzDykI-iWnHU0Oy{;p}0U~H*y_}M0Bu$Y!@glvjMD0;_3gs~FH<#ZbU0%;+
zM%Fp)UI1TNK0Ixj`@})8fMMnFxwMY5#evbE6fc~{V8FU=s+2L(u$J1}YKq=Ka#wHt
zC<r?)F5O*#<wbnK0suQEbh!zr9k!^&Ho}Ew>!w(1hCPr2i}{z#E&dO%Q5?<vAS=;O
zemHn;k=JuSILDGX=4eNt>%C3MZUl~FsF=oEN96GN<H@I6HppFb4R@l`ui+R8>TMCS
z@mb4UIV2$ni|t8MyglEu-gF=MOpo;!Z?C4Re2|sw9q4p4iOp?c6tmK-#$6ksJhh(H
zU->(;06?T`ecW5LFmZvqTDE%Y!#Mm5xhKC@x$pB>N=Om7ngHN#My4XL?$pC&7yh*R
zt^QQ`CIAZ2eX;ySL)G+EZc~?7Y^n7;vvItmdS?kng{&*{NM^ntR9#bs4d6vYNMl%B
zCCRwa@2KbuXgXl5Mq$xN0i+n*ae9k81|a#7aPCE`1GOQ$;jB_$FToLWj_y0injSTW
zz<85~6aW-nkpgOz{@i>30W;oQSlr(gcG*Z}l~-QXfsO|8$?Gh$s69gQJqvdZq&wn8
zT+?*pGv!q!;!VJct$g_|<aRMqYIkB~EZCv+2u~pc5Exv=ru{;ni|gg#1w#3ZLoK6C
zVO$;%pQ;!Q6%BnTfkw|4%eLiyZdk{9dV0WeT#1YnQn+<uRb70Ya#F$PO<Bd?P+YRl
zV7_YYvhD?2@69TUS_MV{fUjwgGk?FTjZSvzF&;!I@ikwmV!BRafV<$U$6oJFigyh|
zKjF#OR?eD?X9#rjYpc|19HP#^9k4>In=q`-hX?KoS*0aK)%r(B-K>s0z49Mj=qBAu
z!3cBFfX3iC2Ei3K(r+t98Es&Nrofhq_3osG9}m35(Oz~Qlaea1X&M8RZHbyQIGj^Y
z^q89;-(@u%TOG?t1ZK5;9YjRKbA?P|x-)pT^}=>Z;)WwoX?Miv?YWyINas2N@xdmZ
zy^5tK7pYk72J;!c6s#UjWqJIQ>)!?n7pPt0kXBScp|Rdb7}4SU1AV=<<0+#f-Tuiv
z=^7JoxG7M?%>|#!84Sc*G&~Ksmfx~&O-M!%<^&es6;x~D5(J#hzHmpj^yDzod!24w
z&|stmH0}x4%~_ss8TRViLaF3Uu?JR&NYU2P3VD>#3zcL0^jRMNa#p~C<zcCQiDlqP
z+K6^iQ{AVM5j!`@4{FcevVW$xs&;k;7krZ9CK{2za6Rc(>^M)95IH_hLIWgRuVClN
zE5Cu@r&H|x*)7WJ9T&z0-<yBgu%Dr{Ni2Dwg>m$_DCzyQx@}GoK3x}9vvpuQA&iyr
z9~`C!j<>bIqp|=Fo{|M}-lN<59!PqFE|~@0`N(wt7Np{Y)wpiXB1Nz2ta!8=R+Y`G
z)^*2x`3|J56_|Y3@*C4Yeza~8aAtdWHmC~;E3&b?UpyJx&x#i>0`==v-96AuOIuj>
zQTG{wB(V@12YyuRdgf>=_iN_PS5=$TehU(0{jI*6e|F}{dGgxvBG9|+0#UeL7?Z~w
z$uv5cN{7`7m5g=bDc%h5$8+hq3Wx;Anw`TgIceN1yal#Wdnxo}0Uu4D?vb1eSfxfI
zKoFLS=~fo}9inOsQYy>xNs9C6u3B%ud&6MmPiy|>2}CwHPq5LwM4H7;?H~5`exueE
z0)m*bEYEK=o2}nCKJ%?r((<*_X?4F7LF_+f;?41cE1CK79U3n)8EQhFVCd)ZG{ov7
z1H*0a7ROykmoSuq$TGNZ-|4O0iGA?w7gwADvHGkDbUQLtUpM-~oAa=)&uspH<rpUe
zwa}&G(FlDS=SC{RcrS8m=pZBVOn?R|$Dci6rkId+A>i=@_F?kK`BNI>8IkbGO?OjO
zO1g}V<3#IudFdukJ%_5RlXKuby+g%gG+U}H6sPn}f#9>JS(Yj1#7Iahxr1LnL#v{n
zxR6K#_*tj66&moC-*$$Ttf)yvqO>}Eyuc^IBN~`px8u@}{pDwL@%RKl`--uu*Lc@l
z;#4D_nqm@<Os|N0GP(6^%YQk#kBf5!fVTfT<O})m26mnQ-q+5cUQ(S8ak45DRU!zx
zWJI^0Jb$_h4P@%1le^J=Z&5^azFUt+5!gBk*4AD}Fr9&57n`o9b+^4Bj3$3pB)q1m
z)wa+-vY=X^4+ZN;K*8QdKY^K!LRWeIy3gNKU+)94>8jJ$r_krWyaB9RFcFb?XB&R_
zWps)e_3U+Wd&P3>+stpE`!s4aDuS{#c=A;kZU;S@CxX8F;do1(3~7{dkfiCbuZFKj
zJulL8pK$dJ{7IduWtZ}!kQoB3LkrLmyNrInnpDRP`wiM{PVktVXq<oZ?zVjK{Q#g>
zL{z%>Ci#7Ei%{eI$|YKnvwIsNM4B+o3mD}p1#f|pIzU&IKUmYfqRo8j8m*Eh3DDJ>
zw!XVder+VNx!6a>B>u!ATDHEQEH^y5Vs7f-*$?k-gt$;%9NrS_zuoKg@|GYX%C({8
zBZi2^kQnS}(QO};>hvMk_8=VZS^<C8foF1isobPRYLHz<VZB^Jg6H=?{w)jxo~Gux
zBRQw!i#@V;HhBJOTF~y8P(MsazR|M@Ru9N93m$YQ`{Mu-Vye^KyOXgbN*7btpos)E
zxiuMoMK?u#y=1m%#E_BQ{RQAtZ!=klXg2tdFY00*C`cQ%Kah+20E0zQA8)7-kZH)u
zTZ&fNDj7II{ij1cGd_Jq16?Ru5*a1ayT(W{C%5{Fg%cCl0F(<ncH9-xlqsSE^vg7#
zwtM+L(VfQ|EHCWHelVFzL=#FIWxG%c^da2UaTvX75s}^$6v$BzNcUYQW$C8J^;k))
z$zx5QH)oRz(Pk8Y(wGfAz6}qZHk=F`FG0Jd5nR`?B8sG#vixDzu$kDM$>Wwccop+0
z!~?KRR-LmG4kw@j1?wU1(PFkeU+DdmYABV3(Xuof{%^GxVBVa(eDQHa8=a}1>pJJ{
zV-4QFhSA9EFqCXFaz^r*Pl@yaHG#0AUe?w(rhA8NbOt0MA{(9b1~r?TmRSn#pr0$F
zC`Ie(o&#MS<beu3??FzwpfLQ`xktN|Fu+ZI%1mUuI(BApsot}L+^O;6{*=vrq+g9Q
zUdxk)Q_@W>z->48OnmuNn9<shA<+Hzd>fr2Y#jcqSqzJ0CfjcUiN>ci^Gtnb)nDVu
z&^EL!Fyu*mC^ypd34s=NfByVy`;u?m7QM?F?a5YXH;-|wvQurm&QdR`FG}|<5wAty
zHJ)i<IDkLw$kT1Mh-i`}Xr^IW_BI=_)h#0YxGlI!-bvz2e$exW-9JVlIkF-Rse{ke
zBbSf_j8wPV7M2}mRb{e#3$b>?yN)-YK#gO40H=2#P;GA{7<aTy6vxv)kY6w*jxnlS
zpEd`2ILNPaoYgFU!LCf$p<CGM*R@n8pg&d4^2D-b?!?Ubr4-4Ntsn}8qmnW-leXBX
zUD@6X&Wz<$d$TK2BXYkor7VZXC&c}?vKZc;4yR6s?+yNyIny#Khy=HvZ{{Fmeds+9
z`X)I*)b5<oS5l2KQhKX7`EWB>>Bn;z3$r}P!lvK6s19}0l#|~1u#~ODc^FZ+CDiMy
zVnlBxZy5D0@~S3f(R8|&r=x;hX2JGKFxfjiCEpN`1=gfoS=R7aPO9XG#Bc^zpFfjk
zC9(-oLlw-8!n0D{E9=Ip1ax1+UQX?;gmRN>IlX#IwVfitR!D2Vi9wiY200edIB!Nb
zP+~4`>xi!|b~_L+92-JXp$#kSg?7)#mThNF>9{X9vD~H!^QNCC^ACgYU5kglECgWB
zpW@9{&AU{s3++$p+@}u0Ro|!v%sz?~3%pG(LT7s0u3?3Z{0W%=-P@fWM+TeJ+3z1-
zJ4)8m-9(y9S;RU;@TRr|I_7USm4-P%CZaXo*M2K9xgp3<1vxjk37IscWv-7$n*92)
zpNSCO=Nzef_q&`Vsyqry5||y~o569=sRP|wg-JjAW=|7Y-FvmID_25A58MsHcmtfN
zTcnJKZ81s#==vu~tuDMQIzIMid*c%qoUa)t^m=x{rdt_`?IHx0FQAmfINU@WZX@ON
z*a1Jr=}eCS#n+}6UFXRRq<S=~`?f=~6d1~$l}Rfms_1mK2c`#HixZbTZJSKrY~cP&
z{8gE%5lME8h@TXC>06%k#o_%=IS9zo{St=$?voOS`P1ta^oIBNiz0a*{X=rIa%Ijj
z;z6vQ!>65%M5J<7vWe9pa=M<jx%eI4wLjCxyG}#5h^yZg?hbMLoahp1cK%0cMTk^B
zRGP;KPDymQB^UA!+52l;6>XlY8PR*Lfx`ok){IIWM~^G5kBBt?tcn=w$Aj7?*KM67
z_Q$4H_vqgLzkiJA-~AX-EoR%=h6SyWDmhz$9rX%zYF@k+I;Umm*)6nu+h!x~s(cbz
zQMGLAwYOwN6_o!e4{vdq?3$0>R~J^Yo%ztq|Cpj3vqk3jZCT#dnDhoJ4s>MpXUfIX
zVZfs`%`B5Y%K(Cb9SZ+HcIZKed?@vJI4?8pNGtWtzfbx*7j?oJdIW~8R&m+vMjD8$
z6?#(G)m{&}Azvw$HvH<yYQAyNkLJ(KlDk2WXZ^n<^cmNx=K}@$-?A#e?&w$Q`^c-o
z*G2glB`rF33hNV+PI0;BwZ!8t@(oQ|gW_K@zP2nsVoV;;MM7?-Yo!f^X1Zu??~Od9
zhM~`m?&lA92Z2+$>vpy`2=Wa@r@t7PJJI`G|5H^h`RZ_LC<Tx7DiEIHBG+4Bt$qsm
z+d)4m<W1F*Pg$(MO02Z)bn~3ve8VHe{#Oz%<mQY3surU1tacb5C$-`7<YFVuX2ZhT
zc~k*s+o7g?KoL%chMtpE8B+0Yp7rM@Yb7{sl>=sZ7G2(=RFgVvBc^wMh%?y*zK+_C
zM}K@h>**Lq!8X26pnB}{OL*De=#OH)pL5V94CY*F>OX~BJw5wzc7gJNlCQ37!JVzh
zcGJ4rmP+)#&G>gq+e^){2~>X(manetL1c_Co}srG#ic%2O$$8ztE2<ScP*z%E$NVh
z0k&@j9&<Yx?31HT5jl_jyYu&JOCcY45hjU{4US_(y0Y@Ju7M%;2$z9Qe;Ssy*`N*%
znsTA@Fr=wnJDu;AG^~G&OHXcyfP;6z=<-GoB*Sv0$Z(Y$EBQl)N^w7O#LntV-J^{d
zs6B2U-bio2Nqal*=r<mM-4Fa%#DMg0+obD$!;;HiQw2B+z0_)&<)fCp=Nni{>EoDA
z`fY(@&>jYT@-_1&SZvSyF&!0CN+`vLZDxJ%XbN2J->KH-ht7>lIRFs%w5eTB;(iTh
zJ0wMQd#Sh`45tt{iX7NZ(zov<DEQP7WI+mf?`z5>L4D*olAC(M?=b~`m{I;KrFfZ$
z22Dk;ob~lNB+I4#R#U+gbZIxiBeFWrtaiESIocVExg?}RTsoELso=5HPMmRekmZ`@
zwQ{0?RzixJpY5pByWJTJ+JNs$u+v8rR4?rB02$TrMq>D1j;(*T`7oCK?5Gl(?zQ`8
zeGKHoHqe|656CR=VWB6FD}V|D<8Y@0^DM>XILnlsroyv?4d>0?RUj7}Wdl2?;AW~t
z33l00FEsB>A<r}=hfT&e-fh5^@c?K&R2RfZtxI9(Yh5+Ju`w}06-Ylj1QK6adDd6_
zOeMNN-2TGy(~^~Uou3O&XS6GAj}NK(2U`kbdnD1vs~whk$rr&=ERx#`jybJ;sP^FL
zulX)^_QE4hsK$a1HD9bX)Bie5z|+lHdYh85AC9N2>zn$FHFSiMJrd0OfvjGy&`=~G
z8daw97cG=<ic+i6ajMiF+#fF53KUA~QW`O!!X|o-;N)DCP#Tb`{o_16$KQ{uF})Ss
zc_t!t7VgNy#^dhYj%w9E`RZ)K`@I%d3&B}X@qwN%9q&!C@Zy)=0{ibsxp*z75aS6n
z<KD=0b0xNy3zvicdEIW46+P>LH`XHeYdfWaGCK3iUX)MG)85$#yO6s(b-D94I)%D1
z)$@dSwaBo^QjBr$&d!Kaa=5T#{qPIdbSKQZOGv>m7tT6dmDKNP`b*tQ_F9bww2~%D
zgLbnwU%!>J6>XvR-VIBTI8v^nKK3s>n+v;6=K}UQigYTl$5at99<4CmC;;C?{SZ%3
zywuj?x_d6OpYU7&Q9To<iqR!lIZAhFjay_%P6(6x=zW-=8fQ<1RteA@c;WOO>f5#N
zt?G|TrD1_fTCuv|NyRTXB{~_lcyTqDk~Q5c{I8G*nAZzPDRbN2Kn;k4JuhN}&F8@|
zyWY_NZIKkZO`DG9D<rW>P9fDr&l&7^o4fwNyK+)Vg*R}x;D{+9;`U<cq{x%M@&`ae
zzhG76l!wy~7Va$4HX>M3`N|Z+nUeL~$L|kfzNyMA6V{82dMLz#0hS_W95hXbhr)VZ
zYsRtfp>w^1P*c4mW$7lRo=ujEKK9Kmh&gwImvJ8Gy5%XSqwMh^P17bPEJ1|bI8gml
z4Glb&61z|c+m!(<suiKZS%E8@`j@)JA38cIK+GEL`lMJN&~Vr21pDJ%6GrSwEzp==
zH-x!`>)S$xMNQ9`Ta)z|N$;Scl^GYfDqcEG;$*LBSLxjxl=)XXpzS<>R-3}D0$L9O
zWC0#rF-^(-9BX*{5W=@|P1~X>;mG9P9*}GSDqzVK^-3JOB)OAK##Sd+?)D1iv-cPZ
z<ltgah|>)lQD(2qLH3NGR5z1B&E}#PwsRi_A#toNIw8EZG~GJ~?G<3s24~{=?$lhK
z#?3L8;oAON3gHgRe;n3(Q8(8n{wRyk0(wj4l_IseP085g*-z$#y$QR(_vhadEyu_1
zEeHVJsF-1ZprK1}RYOi*if`%=BIw2&{Fk`d2x{6D&cP~+b<%2oZ2)1C$%A{&bwD-I
zTM?TVWRiib&4y&7?G(?z+f$t%sm6bs^W#%==MMhFi@=pOeBaxHqhout_?7I+Nt{=<
zq9?pPh7^?VbX1)y5H>uqur;(jPtN=LlL(5zv1-cGR(5vd70S?!;X=?igJlsS8u^6j
zAjJ&Z2R3ERE;f3JwYC$F*ADEYnaKe^n(v(QrNZnVpS@r9qd8At)ye3RYXeE@g%z!q
zkDms?Pr8B(oTA-bw9z%DLNz=dGY6H)Z^fNfzO9@^ZC_pD>u{7IYsTP2k>{1jsyVjU
zNa57V5pRvk)Cd~+x2q(EgIz_%W9vB!{$|p+x*Oy<zAHp=*pGK>T1&o5O5@8b5HYcY
zz2B-En@2dnGM1VDos@HE<jd&XacA%?1<yq63i-}>SehXTpHIdY#Xup1om`U7C%H)p
z9$Vj8<z^Q*hJtsBT=nmcba7fiI#xj9+I&QSL?iEl0%A$m&QK(wXwyisn%`_*=#M{L
zD4U09<lOMYhKFC%lx|-?eEj-$sFU&O*Cw7?aB?Z7`|y3T;^q%8D)jr{&*r9n>2|x#
zAoxjfY!cKOrgU<GtX(7%s1N_7z#3y3dGS27$L8u^xM6^rVC^{9=CNst^WOJ4!C+VI
zl{Cd=LdW>kFPM$ZcPT+%W`CPN)%qTaCx4$dJ<#?7pUO(kIN0Tat&U5-*Ptl1j#r{U
z-O!{PCc$ytXCVZaXJXVP&>|oM9*U;^B7%rZeXUwFuSZ<X^gum(PS4Kyr4LS7xaGmU
z+KlBC<;MoN+wSSz<0B4p=Y{Ew{#^tEj19Zv5a#*qNRgF5<USo+*QZT;8|aQ)EK=z?
z1+w`I3upMK<rP)uizDyT)4T}eaMgJ~B$zrHgD|_dK9&WghKE2oP_=191VxH1I7p4c
zUEKso$Rq^f3fNXAs@8<6<^DnfPxZ-om$>G0g5NHFuk90m`B}@vv2*xHbRb{=iYo%_
zu5znbxluN}a_vu;`1`eLbQ`lPwNBf7vg0iveS@rNJ?!9K%@?zpWUm8qp(CwxZ=3nz
zq%Q}~=X}8W%^g8z-vsCZ#fAj}dB?(>s>O#a!zRx1OTGtQVJ%6}#qL4t4+*B;3DYJ*
zA{BRBfb=FHftC({F?zT{4_xttqF@eGJ3r!%zCx0d%j1zVTeJjXk)xnYx8r(9Qu+;9
zWj1I?HDO|sKj{+tvcWsNSK$wcUWJpg(UV;^h`xQ-myk`DlAHztLFDm~j9<tjcWZ!8
z30>Tow#5!iq&%BU7pGf_*x12JA+Zzj1qIm&QqHJ>?3@il@mneaFFS!GSAIM^`MnFm
zLTiu4rH!bulvVOvap1==iN`)0y9=`%yXwjDZkyxTHD=>VzMD_zm|#s_??>-mc$-3A
zNBPQS@AO*<DO~e@h!k^?T#8Nn7~`5p5lfMghQc1rnZg8i@kkQ>n6db(6bgE2d`XB|
zRnPAg?jjhAvVMM!&|UjhN9ol|&ZJ@r*(Z(MLr)%Ge$QO4KhjK>^U#k!WADPRkpl+2
z4XG`^@KciEDsy@Rt<wLv-ZuV7H6`Eoi{iICkJhPOiUTd)eI)y*Q6BA(zT)~hOiYNu
z3~s}PU(&1L$xNxg*|s_-tLxXJYEfSlD)c;>(D=bWjgqE9V)e@yK{5u>O@UZF(8B~S
za=Q0rpS1W55WZ5@mi~PeTplX_wFQf~x&rGq;T(Lwn*1w^qvwOG36s-*PL7SVBWjFi
z&4_&Ph79e~qX6B!PqL|(M(+KzFP?-5S2SgIze4rur3LFu4x~Czw!FfS`D2ZQ9zF^D
zr%`SmzD8X8IZTHv?~h7*`jYS*-|M!Va_$xQY1}f)aqQmGmhruva#)vo9mk|rP0W`)
z1@AqRs4U7YxbAVp8%aHN+_D3fwe@4saUVvarv*#RJ0oc6Pwd8?HEhX$wGG_ThU~d%
z;jB%#!NIC>EX?Fmx1Z21Lnl2Nb;36~N-5pZt)ek9Z+sQlqRU?Q3_>Ng1UQbRHzD)F
z<n`F}RTbh^dR5kwDf4A-cDmBzN3T?xu}@o<eU1#Hvg&cJ_3w6$Ph~pJP0qo(zG#Cw
zJ$C9K!y{?hU{182z!)RQ;;rd^))Yw*5v!*243cGnn?X#pchdXEMa>l9CjG!88VTZe
z&1H|(Ct_VkxJ32T?ep`0%^-?@nY-v)V=siKUN^d8=L=s0`kK%m-c7dM&LSkZ0WB>#
zdf5pp+WN1GOec>7Pgk;Pc6{C<=#EBvNwoC`UzDERSrECkz9qqk*)4rn{n97)^5kwC
zrrq9Kynkucb`Cm_kgY>Vpz+W`wv2iF%XyORil^^QIf2@LrwDfz5M<B#^c?jg@68b~
z@&ZD$+lVNcwmz@aZF6)uElc00@bENyc<S-?7FKZ_O}Ac+#5*}uaF$lFfBb6#ga(){
zaeaOie44p+AY&9123pV2-grpFqf&qBy}N%OS+9zL3?PpWr8z5nq{o&&rl&ow91%Ng
z53MKd;7<Q_8W!4#4RQ;rl(+8}j|$TJ^3C^jtYnHz=wX_=yS_=EZg-_%=FXgLm*-Wa
z*<Tp%mcIk%lpDvY{<Sz%UL!S_?$olA6AJMHwr(N=WE~|I8l82+eP#UE-g7ie|K!u5
z)Hr(GYliaST_8-bNxtzWOa_HrEvgEli0r9fJN&*?0}4B9P%JnzEP*ZrR7&MAp-&gl
zm+K$7PgiYJtn95K2^S?cyK>tj^QZR^Vb}8q=b-<|UkD9y^NL*rw{jBG4IR6atqkX2
z<On+@(UR_p3AV8>D8=46MQ&ZmST$}UV=I-$kZ>mWJ$?bVQ{(B#YcF0FFH8_b9KU$t
zLfb4<j*B*75|qANux~8A;jYq2WD@(R{t`SIW$32>>hL;P`%>?cHacn{-U$hLNSCV<
z%2Teyzv*Va)Y?0SvZYzC8P-V8wXGE)ox9`OD!F&zA6Nc(@yvZLMnnx5apXj$_A*!P
z+R96z(3sAHYnr^(Yc1mM1l6nLfr|Q_%?oEKHSAtZ-4#j|+1vG!Mhg<XZ1zRgo!sfL
zp`j<ZAoiBs@0RDKS7awskDoDRe>Ns4(nh0|4|xu23wUf(cV0!Kp)3!ZV!4V3N*>P0
z)mILAoPF4rV9OR&mO7N_X_nq7GxY{P00p8Y){|`6>jsFeg+Wd16ktA<?ADH4FXqwm
zy9kA`lp`>mbx|*D!6*Lgp<mV7YdSB%vrcU^3|cmqV=0TWEz>I69)7eXnfK}62PClX
zRsyXf(Mv12E+PX_q6)!e*H!01&macqqq2q;-F8a`Do)PU6fx#8>uG{?=Q^1E+?NMU
zeC8L7B4MA0o{*P(bO*nG%MJ`f7ZhH5tp3#sR55X$lbzsPD!S_ew<6JGP0Qyp3L-p(
z)Z_4%!m+-If<(8*{6<*UjMfIU^SmTwXx+E*XmYjLy)b?+jmHb6ObK|2Y@utec0yo+
z!Gk6Jk76Vf4&%x18W!%v>LijGoR}s#_r{F*#ZeBA(fzn@wigI^^-3S9@XiywvjPcw
zti9>A&q~(q7C8k{nkV#y@tQi2_1&~W20ORQGv5>r>o7(0D*2;+qPbUj@rzQ=Hh9`k
z;*NdnCFp&NOO$-p*xN*1k0$rBmEFE!w|&-*bkz%kE+%r`L4U}eY|rTQ67YVTabQ4M
zZ-wgn<3Fx$mK7)vbN!pxRu{$+pVx$(n2aVW3A*80`1tiat%TcyxumvEE9O#rvi*^q
zC24&;&HNv0x?QRH%*gsFZef8DT4r9oA|bdubp#??MlLf-pHAc?t4N_5EyQPZ)Ri?E
zY55#pIxROL)qW0OGG##S=ap5(bmZa3zx|#Z5%<6lRM9Lf&SowJf+$j*!HJoi?+x})
z)p<D!+I{<IalJ=9OYWLB3+*@Eqo(XZqy|Wxv!J{<dW9Zed*K$nZ!g-o(%=XbHh5DC
zZ8?w3J(cElZ{VvxFR|OMuYM$n&vteo49dlMz~O0@PfVVJs=6=;@~)rQTW??g(Cjpo
z6yb8vaXKjYeG$Wds|Aah#^cNGcoUzmm@BJmk@uYc_xPk$!kB>fD=3_{oPHxokI_xQ
ztrf4Rys-mGD0SFp8%?{D`GHo=aXJdW=mjni(|z@6(TCJ_qSm~$<T{89NU~C_<Z4i#
zEI>=d_~Yw`F^C?iFDB*D`IaMN6w0`7saXk%@7JWZ976IOb*lfk_H(hJCQZu;P&cPp
ztoIXg1S~e%am2`Cea+v(1gW8KIcc`?&9L3{5VuIVZ}MfQsYX~-MnPl;Tq6?XL&OGU
z?ozgZ=UhAmdx90}TZJtvu*PV|<Vg@&viH=@cCRX{+0UMS9O_d5fj>#`JU-*YJDeN$
zQO4(0pYx=nGFmWJqmb_W@Zg9sLe$>#NvwKE_0OwoxMVdm%O<>s$;WMHiD!efZ(Go-
z<7%%^Yy|Gc{dK7g0dGUviZ|<XUcOOL<uv!O*QbV$OEfFxPw0Yq39P8Uz=%tW=_TJu
z%vZ$_g0_n=cc%>zc5fc{?mc3L3l!r9mQ=}g(uI|HG}_`EDi#k24SP3dS>F+V*|$!0
z2g2>#XD6aXGFcL!4_xA~mGRc{B~3>peorcPghtzuf)J}{0F6NY0gCoARU|Y#J^`N&
zl@`X5BQE8}kuhi`vw@mSYR-DfE7g&PECCpj9ygzXho<tJ`4uKkEB5-4>YDE5-<NNJ
za@Crp($J(pB&GLdu0NM}RUe;sOro%qB7cUNX4E@UEh}8)%wZ0>X<s*O$p)GifGKny
zy};)^G-s<Up_tkqS`@kQ>S`hCoWZDdwB7_fS+~Ka{7Y!@vups?8TS{6+b{ud5s?tv
zj+*o@lVJ2&nCeZ)Exg3Qllu$sarFhEC9dVy%&o{OfGbSPMq5y_m@oz=$&+Y$f5%YS
zKM`FPaAUH6<kMUhI36kuZ?Ohm7BTy|J$&C)V#kVK9B9=c=so?K*n0DZY(=qsbRBqN
z-Rs-ic1gog8L#*sw_21>i-+`3q!HtMmjV2NJ8+*RwhCZqy-~Eni%Zs1RBi$Wh`kK^
zdc2BV6_}1yNM_nIHYKG#WzqR#vx6A*$0<3cxSQv*R=N;E+Q{n?N-B2DbyBlQhvzI6
zgU?&Ry@j|EaRfz>1jqL-hkUkzj2U9RsE+Kp$3sRv+Ng$TeEXP=sTMRJLM&Mbw6$?}
zZT<`8otv+?w%=3x0*>t&SWg^=ctc9eBr3SLm3q^Q36I)bB0Oq6Xbu0mX8K90M^E!Z
zy7BV(kDDh<C(IPZ#k+TfLH8w!qw<3X0suTAvY{|;vMiYnmfkbkJ`1tU^jSRiz@rgo
zc5@@q?sUI<&7E{<KegQ1Jm!mHetd@<uN_zP>5eeem{@^28S2T8IO~L}5G`v0UUBso
zl$Sef$qtf*$aBT#`5i!@8`WxMz0|{GJzST6L3~IG_fbP~@5T|+6$aCy$B0F+Lie?>
zLM9KZIR+SMyc!pgyXXyutK%G=dMVy*Q;vPKU-=+yAmj&Cc^m*IGUQFU34m*|oKXjp
zNGdbKWjNT2*&PY;<>Dl3A?-1vABm$K&94~086Upu?tY$z;5@@w*|J`)6m$QNAs|^(
zM(Zd51XB@+t*x4F)RL6MSp)C9Adbd%)%l#Pg}#$v*M2nW-F#l8RcrC3P}lNet3_4h
zYvhvsL>U2_T1n)0?py#EzF(mNqHu`s8{&?Ud8>w{v)9*yusgPSbCJ6CF&WeIctT42
zvrU`E)H?PMA)eevJC4UZc4ssvDHKB&rIr?MaFi}GAG2U}$L1$;<#dm8sB>-eV8!CE
z)`Ly;+}w9{PI(UMchRF4<GVXQKvI0$7S2n{sa_$gc!XnwN`wT%{HyOSD2K`Oz+diQ
zBfaUkYQ7!!4o*u-incxiuB*Qe%j03C;DY+Ot^|nHeI17oIlsRh5srVNmw!>GADI3p
z15;!3du174Hugv7&0MY;wxSo1@<o5gyeqzgZ+_87`Lb1cO9lAax2p_1e{S-B&Et%F
zF)B)rn6Au#>$WgiHi~pjc^d*xara#O4^j-}^n1C{4?Ssy`<1c$uN?QqHAeeQo8IkC
zE-J~b8E3WPX6yReipPD@C!19(OP0uj?>!HxqUX;eEaZOjL3G|uVWfVrvZ%+j+1Jgg
zbcHNE#*AUA|CO8?U!ri0vz>a|I~U6Jeu7}Bd?f9HJnlhgrk<^KxPV2U@Wgd5&}-nP
z`$gZiReVn7`+IxFBNeS0Eixac#{79P{I{Hz%ireUw5q+ILOSlitlJ*H5J>dhP?0Nh
zulc*3VmVp&H%+%s%%jRL>5DU4%6;a)GZp;5kpCZkXNKt>3UwK?DV-4=^IGOgTa@Ry
z>b2J%DnrKtq8B$=O~?;5YuQ2hJ9b|GC2fQ{wcH|Rf3;~XlA`Yzp6an5I5Y7-@8;O5
z2P)?1U6AdB&_UXD^2ezTa(x{8mi?FTXjat;$@V7rzigjC??+WWP`q7dXXiyZF27zS
zm$N`wd*d{9zim;+F}pTrWYO`yONzj*`}m!u`?1d=2K2}QIRmM@bLTsDNvZtfG)m()
zZ1hosU-4<~wg%96VC70n%VFNO#*X7}z)|LmCAagF<Eub31+joz+>GN&<)mq6zfo#9
z&e>KAf)dTCfW`~K-d$P1pC0$K1)nPXMxB@+6&y;{v4C!r&(s1v5qhWZv}C@bJ`H~;
zKNae&^t&&_GoT1!$4(22ZSo&q18h0GY1@1VS3IFV^s+8lvHi^i#psx-w|oj(?=VlH
z3>k1=5s+|VMUB_&@J9UhHw4@EaFe7wsJzHb^P^ahd{kcTZd7GVnuIo<D&0#c&`BaW
z_*gsW(!c6d9ONayOoEkVPxJ5pM$36mG^$6!EX&SvHbhFsDE3-G>#!l`hEQJBGLt-g
z!cBi7^0od7;o+iBwK=gX56lPvLu9j<&xV)@If?&S7y7%gn13_xNB&m8`)Sgs-uj>}
z{cb;HDyV@K4{57+CTT$6l4t)QSoK{W2fC`Q$SzEaz5b1&W*)T;G`MhL*|uf8_nU3{
z|7Qu;!%t6pRs12fl(?w%j~2dQdt9|X-hTHa*x&U+-?r0oMu@k8e)`o|ec|ejZ2><_
z=f@7A6>^y=kLsnqt=_cI+i{+!eTrOkkE=J5W|BNuQuBTPY442tE{<E#S@YSPn5DdJ
zO0TcMzKpV2Bq$<A*GYKHHK20DmNskO%~MQbYOkwgg`%y;o>R(9v79W_<)+E_6R`~+
z^;4vV+SJb0m~GC;^g2dPUP;ovffGVA|6%RRZ+oCcC~o>6E~my_;i?<C*Q*wrbS()2
z{FKU6GG#_MW2uwV=BfC(R&iY&iXTGKwbkuVm5__#cEbsf+=3b%r@j5AKk?x|uKfo&
zD&!q*IsuFJl~NVGFl8~9db9As4-`GcIn=(mP`QFQaUqLvt%foEkQ~?ZPd8EKy=|0D
z(J0&gH$M>PKaa?n&zn=7w%y@a$xg7B#Z<>E%9C3IT00Q|N7OkF^|o(;r#<YWvih5`
z1=ddk2;EI@;ulW3PO*^IMTj{?Ua$Q@4gS%4hxvo=S|WV$ddKS%UPxAKyvUV3S}(V@
zvr(6$n5B`z>X2JUqHd@a$k>u>_dG(^xcE3>%1sq;AlV@to#3fzr~DSUR~Qcpn?RsT
z!4G9+p5J{RNV<pXw9<<wVMq~t`Vc^P{xFTuAc-(Nm-ZaDORYtQ_tT|oPaZg2Y=ueb
zfxA2f-AIUEO>}8w$)D|GFfz2ps*8s^f!A$W#kS(S(lbBk2#V_c*g)V|0!M2SKq|_^
zKJ{lWOERsG+ldDy+N0QMF8k^_ZEUr>OW5-y_MlZ4tz~Y*e=RQ8Wjq^f8OkzZ>xQh9
ze-_Ia2J^M50{_Qo_sS9tX2w<z8wO3~fVXOfNO>+7U$-utth>S`;*q_g53awI!F<l(
z(_ceT2+V%s7>#N6+8Ysj;Wb#?P$h?pPd~fdj6LSHTX(W9nSIgAE$(!#wYLeY7f&{k
zrBcQ6;Qo2UJ$p%|#6MWtv@<I8s(4<!oIRJu1s8shqF#mBn}4U)N!xF!(fmCGxITY&
z&oL8eK$J)CR#K!qX8N!-7Y69QMvJIpTP8+HyJ2G)I*;80@#tgEh{v3NGMRjK@2vH_
zehNkPrXJD782}P458%MYul_bRzllhHE$Y80raz33g_HiPHvVI{U>B-?4aC64152^a
z_OB8AZ!3g*i1F{$^1p4wKc^7>whaFqDV<1&n-iXVj1gG&nqpY7=oDia0u*qQp7_Fd
z6?RJ_YbEk|{p$@3A6!@6rz3asHMIxXkbDfZmfFywMiHZYTMF-pyi2It&?h9!OS~=3
zVju73@7N|*71bmgbtm63y}P)7IUZ8lImdmH*C5<*SBt%+x-aw8nmBd{qHLr~tq0Ge
zVIP1DYQoj@1YEAGxajm2|LbM&jalL<>O@mz!7RGVJym3~__Tlb{jPl8S?I1uUcevA
zVGgU#`kC8xw0j}DA(C-;>^sStsa0h*A}tA*QMSb?5#<kdyYBewf30q0%DbB!SwSqV
zKyrL)OaZ*Rh@oY@_;l^wbf8On64u=@F_+8LB(J{ZFj?`0?$Yhq>59A>+d%GE@1X~d
z5}obEr%b5jVw*S48hK~V**sg$2~-CBL;50~JkzL{E<V3j4^OV4#F$}vnBbY8WIJQW
zOoP9a%W7;)#%LdKUeCsN{az6e_f?%c(9UGOPc%$lgEb{V*pv_-`EF=Nck`JnO9eZ-
zES_-0>Yde*Ae0@<XN*jTH7TK-L-R{+a8(e{A$OM<bQB^vLVU?r(j=lA-G`Vt>^Q0;
zL+m-NF3d46?Z4yRy-6r~Uui4}k;Z+ZCqDLC?`aB1iKG`Drv?&^RjkA-xQdF<q=Jw;
znVWWBKWi=C2rExTMj4_trQbNK<jw0^6Hl)<g2?P%Ovx;sGax%<Z}vcBzDiivte@$l
zEeV3?u6b0rj)1Ig#(MKquw8UmwP4WZ-$uwhjST+-?HXC`sN#Shk<W=qao`CPk{l8$
z9_q|cUCNHf7MbTY381UgTVW|ExGftw9VtSyR$O|hOmn#Tuu_T6E}19TIaRSU-G-Ha
zRdiQ?b9Dr)%?Y&l?pobp1Bo|fo{~%N>&EVRtp$HX)(&QRqo2echTI;94QLcUn?n@`
zn)Yqu^TKWp`Ns^q<BbFs7rb#UR6L>7zEBz#zH1$O`22gd&Z1jvE;*tt$Oi-0+@y9t
zy{7JHZWpNU&(k_q&wj^{M}wZw9tx6r<uNlo;;>4)-d9pO&RkWOhj540?Z5RHrhbFZ
z+i|3qcckk9M-;>h9*^6{44+55zT|kj^gJT2Je}akIuG1Z-H?e#gLkQw<nqee=_ORj
zBPQ5DX0BEiOt8oxki2yso9GIE5yTs*So!)mQqiBX2v9L1oWxwktSZ=_5Z4=1Dzpg8
zf4BElWJD{)ie&;4u~D9@E*;_tSj?)Uo#>&KXb>p;IuXeq>q5|;%w0cYo$CB(-{rfp
zL`GJ-h+@2mlco+!3V+{JH9beKDs@j0{9As*XLi|5yE9L0U9w9{4+nM$4MgXr;X5xF
z7*G<iHUYEVgx8;Ba1*-g2>2C7>L9UAx1{J%gPICe0|$<bSAo~aFFAQo`OZGuq#~pu
ze71uft}a8i`t+htS~61?4O<D}&OP?oJZWcV*{G`FHX;M1FV0nEg*W5y=zV9YSKy=P
zRrI@Wuy_+nlPFE}V&_CwbI~r@k(rTb2T8mUBksu#6|u#+$Oom-fm)mF(qC#mdqTVS
zZ%gw%+T2BHB4GoS^H|t5@5>dPnTvexs84mC)D)@BP#-TX+LxY&?H3RvSK?nTjA5!g
zuPyU>`#RXjT)DR+PJ&qZm`7vgPl2e@a;F$?Jygi!i-&Yg@Nq1Uoaatf?d)0O`_@{u
ze?4P7z&Pz;f$5Uf^F<mwnq(=NE}0m?kN!U6yM*$D=SNRwyB;`%jq-zixzK$EfrES$
z;{qJSt!iH}+~jTs>VP1x#8WL8wO!=!I@(|BoPjvZsql2gBFB*)zH@j8@%&KhqICcY
zySuMZ;3E8v*o^n1Z?~=o56J%_Ma3z$!!n6ZhAZNmW5_e$l*eJ1%}TsjJ1*{#B0TNz
z*3xDeJQIRq*bXHkxMrT}JKbeVMg&Psf>}YO#w;$f`-mIB5_6I~DDnOP*{1~4@I7ql
zhzn}xt7t*bdiXJYKS)sQ#Bx5OSL++fu0i@EFmF3hfQbWJj2cIGDS<ywNrvmZ&{GAq
zSkSDFCe!4t*?X;TgaJZbOl3bm`W0%S^)}AOPw?@1lkUY*GP3N0ZqqAKDiuBAXJ$1?
zU%5-l9aqZ=5+6?T_ZJ(_bb>nt7ILN&Kx&u!tVwx0GL5NxDLFRB3SHbs3VPRphnrMl
ziFn*E`SMxr0ha+m&?@uCh0@q=hSHif4~B1ffbvgFdR#Ki7=#r1Gyzyj7?d%NQuMBH
zzGtg}&~wOk?C>6X8SAZoW<Z(5rt<jZc)J|8^9*5yZ~lReR&^3SFZ0(V3Ot$<{a95O
zZE?Jjv(6e*1!5`#Q}~!ShKF0XcyuiDgkAh{$*O>EHRDAR-xVmF-zXqhmpoFvr=7@P
ztz%|4Ax4nGyMdTWiGk_u!_L0NzGJ1l-ZO#O)h#>~;g4lS>CLv`NopYZlTpgICmTv)
zPl=|}bGNHGKoAAuVUa|zy;te@6=og~0h>l^)s8vBN^T(@<V_p_7$;lr3tD4?@&aC2
zBJhk*(h7M!RV`LgTvPqA72P(TY-Q(kwD(6K63#QKxrdkT$n+I`kH1AomZw}C@YV^x
zT!IAgz_fggQ3L)|cr3Fae7e!#Q|a>2x8nPHAbCc%=Xz8Nvpegq3V91a1eXw-A~w)~
z&!NN%94_K%+HT-AF;hv5A!P^+2*>hViemy!b|zdAVYw6t4pw*!QQSi{oD23dgGw7-
z2Y{?9yP$qdet6^hEE_^b&9@v@ZO9XQT*QjBFG;R}n0aJWg+UmZJ_pGxwi!o~8ec{R
z{q79oRR?o#zZ-{8d^U<Y%CneS9?=VMFh^72b-<v!DBv9Ae^BZhMMucPk3nnKhk>wy
zdQ(ui+W@G!+M;KjnGLt$A}}F5pD(zpt1u48xi0!_d@@@6dfz}cs_A2j^7}!K@zq}Y
zL$%K~v0z_5s)z21)ng#5-44r&pqI>Mh|w9@xO3uV!gt~sS=}(&mtN5-;RhFVK;8N>
zvF{J8>DO|z9GB~gwz!=Q6xI4PKe3W@Zi@qU6pKuWb5dx7U0uv2>*t<kr3UzZBR0r_
z4=^w3P6u$Jp+KHl+g*{3Tpm6k@2O#hyw+@G0DYxs>WF8$2i1Vfy_MYmaUF=20)tWc
zo_fnJ(Oh!lTK6E@9aR%VAd60z-ryH;=_JH55(A0vWJV=?n{h{rMxg>zVtg4OLX~c&
z9J*BPl#UoS5YkqX%$Q2J!kkP4Dc-E0?+Eu~$}k!i>OO=Jw}b%!UR<n3rHN<GwX|q1
z&}SYa;%u%<Bks2ONea}|S{fj5^zHKTg-U!kFbW8daksmsD9HDD(G<8?es~VBFWf%c
z-{_-q)ZOKFOQwyxGqSCZOgk&Y>s69ydBxn%?o4t|__ckf$yFe{A4QJvRl8ZCc)0E_
z4io%_NfwYH(`G$Bc|`a^l!ZZ27(`&gGwmRm+Dvvs<Z>ywzd>ZoA13(iDzxm+dkEA+
zNbZvM`5FXy9D>5ESwv|VI5)LKdpA0u1$fKzMXyS<C^2|6r5|<lJ7y2lF6erAoeaI3
z0<x>(Q??rJNbGvV?!84x;loT+mAj`<Y(ZVEhI^!BeEtm%LVT-=A&wsyp-UOmBB`ge
zL}^Es%>XgJAdMirezJw)-j&-~-^_d@7X9ols6}>S<;_Tfg0G(si>SdiC%rG847vC%
z4loJR=0k6_aA@jg<h~rDaF%M*U$=lAL{Jey^~FtJw0fJ#r@lb(grC!EfcDI{5W_Iv
zCk2)!lqd?pc!HLusiN>3CTh7StsBY>-^EljJbegWf2xln;H?95YFcUU49D^xC@~4X
zYbG)X6N0gMN>fNh6gQX0V1Gnazfn?zuX5ZgF_~^K<fMI<@Ct$q%*cPyZ;(Ok@t!5L
zTKBL@%WJJKCbRjE<ry>44T7RxZgPhG+fvb((L!{?!Nh#uOn@4_-*s^C2q=tJ;d|=j
zn_$OgEmPFjJ&B|_B*a?nPUJ^Ig-3RQ=_a44hy_2ri-?A`WkR>?#Rg)Lo4G!ArHaz2
zsK95&L%qZXzNJgm+l&XHo|n$mk|Sm&I{0jP>Ox%$JWc($Hdg21{E8h`H+#Mj$BkE)
zr4WXUMDL{m$CNKiKpc-I%yG4V2%i#o76EF<=4BkKtTSslWpA#pB8>t9$|<>zVE1m`
zJHYV`=IvwmiUo8JN3G(qaEiDa>FCTnD={a)^F2W=kFK0z0rxPVCX!+Df)`SGzvlya
zNlOuv{4r`0?9=2^yVf`P$EYr2R#lm&=Xt59<Ix#XANeO=%bO~bmMv2K+IE{2ObdCU
z%dXM;HWYK$2Cb3D9`+FHRCoIOQeOf5UyR9QOWk=jyvh4q^Svx&SmI<)FIgu{&~7u=
z@|wCi325|4O8ipNRx$^1+@N^ZE}<rqdZ;G(ZNKf&Mop)Sco(OekoYe=9KE0yB51|K
z%1ODC8(s=u)xP@beioCri#i~}(c*($?(-gKF4V6Gj5@?l+6>>`XFMLR5&ndK8UiwL
z@jt&ke<AL5Q!Ap5JLy{!cI0aIZ5=9nhGRd9%cZ<;A9=h@-RR!s@<V@H{4QOk`ICMA
qB!E9D-A|hMllc9A`@bvt$38Ke5I8!(HU$Lydnl(YTX4_7@BaaS2u_ax

literal 0
HcmV?d00001

diff --git a/docs/source/assets/design/v1/prefix_caching/example-time-4.png b/docs/source/assets/design/v1/prefix_caching/example-time-4.png
new file mode 100644
index 0000000000000000000000000000000000000000..017df1657c22e0d371c1a194e7a76cb36ed8af15
GIT binary patch
literal 41530
zcmdqJbx>T}*DVSJ5?q42OMu|+(70Qmkw6-E2pWQh;7)LNNpN=v-4F=w!3pk8aJidv
z&hLEpR^9skcvbJdx{so$qJiCe&o$?oV~#NvVd|=K7^ozuaBy%K3i8sLaBv7YaB%Sc
z$j^aSoIi{@z`^;$DM(9de=yq5L`q=kt*@US9P}O<1U}pcqXjvC3}SP0n+`^&nzUJj
zlZOaalW!7w-*rCxsdwcm)LPWE<so?+bzx*%naZ~9@aB!~fsl*>4RQe0|7D+0xYZ@p
z-mT+dRL9EW6jaNvAr$C?X>|`n6ZG%Xy5!<*=H9+pk^M6Y5}^JX>^1G>3a`sBWx2R3
ziYSQ<e0~he)=p{oP03<7P7?`H)?d-Ue<*~$$;|E@RLD^EY#XGRbTSyaKN@Z)CCb*$
z{!@MV^jC<I+w{u*54>OX52bu7H(wZ6A)6B2GJrZ8$>DE?P)AThdvI{lJ=jsy|9X?>
z6bur6zeSl-@}O#8zCHTs3Mk}z1S2sUmP1q9LST^!Z$j%y<|r^NDxHQ<w~i?=&$OF-
zg9hFRM4df@;pcb=i&vR*tv~jBZ%>3XxANOTP5$bnxV6RU8Kgu%b<XG`I)|U5ML)IW
z5UYLm=CAw9WwBzA=(1IqdVKuyjf%HSYvt0`ck@v@$mInDjs|t?Pp1O!!ei-i66#3h
z+earRt5fNrgRM`b?<l^BzQC~t*FpIa#IGDQ@l&YzN{=lSOq(8+za2qnj$UqIJ?fne
zfLP#LAYAL6roWBoBrKJ_>1_F_1+U(hLT#BdgpglsBYQVP{pdgtg<q8nQ3!tCL+p4t
zZKXYe^_xE?Kijh0#eB!|)n9S1=lDC{XoG3s!$;?9p^pmR9*$o4vM_*5kf{H-J4NN7
zBTanf_2@Gk$BO?3zIPe2Z$iMc-Ey>X5J~sAGB~(dX3pp&aOg0TpY&n;8s2tNZ`)@=
zJsJk*QwqID$O5le@^(7D5zu>$*@v407FF<TrQ9*wdA6+lZCds7;<GQ$S45V6v1F7G
zICu{YA#Qt0HYwRU{LV<5_j_|?7Idk7wf7x^rSz|&fQIG{($IB}?-Jw&Y}p=mmN`|V
zk-~m3oZI!Kt{ArV4k);V657RUzjklJ()F)t3bF0pI_2dLUKKBIURHOhg}6zx>&U{I
z@t43W@So<LLVibF-J%jUBJ-}JjMmyX9ABl8>I_F9T$<V*7QE_2;%9Bb{k3pFQBb-u
z1^fu%JZJ7P)&u$EPulIXPo&lgzQ>p%T7RuJq<a*|M(VGqvF`~4zGCW`;=>+yzmtpT
zy7;xY`<NY&RM>v9`EC++>x5K-@knb2dN+%9)OFn2BXcflxI*{^Ocbz!svZm#N4aYK
zj%B+~QZGQI=Ktk+({*!Ns8Vf#ueysVDP!xU2`@i~y=J{~$eC;}<CGIC2e%R*+Q6Nm
z(docp7=6+~y1V?hOT548gXnz<+`Y9Xej0M6u8jkDj;$=D%;Zy=F*l8~|Ik?=cv6aE
zFv5!%{-fa#c^VwXmiT6zhurtBUYENSIR&?qFOVh#9!E^3OLJt(N5>b3M*gS1446lC
z67b4oTy$92_ZNoZ{FA2MFCQ(&>G8VX1))ABbVI7yOz9o|ko{G2feo?zFYpGE^)`Mh
zd|Gb%BuvmM?9%70==2dg=>s)IY4aA!s4=30e-D>CC2kJi@>x8cZwF@vf3v6NA~@Un
znLB%*j~&vrJgPdjK7oBQ9J|D)p=|SxvJkiDDe)+FyYT46c<T^vF1(~3J!G#3KX&3Y
zMw{Q4RWF!F&<Lfv_5z&#PHl2pHGc%e6unI%=f=bh@V@dLqzAp)N0a<{LvxleJ&4A_
z5-mZyQNkn_<1iP_r)2ZTJ!nD0C4@mj53h$;XAnD`ZpuEasPb(svh^M+R#}Mrm!K~{
zf<*SF7C`X*qG`FnmcT?-%Joo?U#O4;CxgBEW7MZf6c1sdtR&P0dw7Gc9Ygum?=1AG
zS|mJR3xiP`;e2oXRk0#g&%qk2-wYLHHZ0u?c3RGX8q-fzEtfT;{dMD!$l-`g$RVRR
z3Jt$EHq%I<c+}jWpxYOauou``4b;L(as-srX7&|K=tx#pi^4)*RZ(WdzdEFv9WqFq
z@HcXQ`G${s@^f0IDfJy1;VD^?_G>>o#-KM?`~(Ms#%Qg!*h%<nty2qU*r@7(o0-uo
z3;H@qq^F46Tec2P-kSKa^9)nu&-Tex=xT?)glV$&w5=8W9L>;&&Ml}P2Bi<Z-e~O9
zase5B5l1#B^?lR`ikO6+{wnKHhWx!h6LInY^$wkB>FbUwci1X(?GA4aG3%h6M<vEC
zX|S0Ob>iLAXOc*bu+sSz%l`Z^9d2XM?!z3e5{-_kzrbduH<4)*`PfUr2N|^YQt)$;
z*$h$XEn6t+fh%u$J=72j%l-NHGtm!9cRh8mfvw}U;X?PF+5na`-*=(R<il7b<$-A^
z%0Gc*Q~gHJiq<BF!9owx5heIOgK87pgd;ro`i|}^W>EH9m&<qDE*HggWYZGTv3imq
zc+~02YaD`=W>d|@)t^gUo0<s0G4@L|D>wsoB@Q%G{-Gj*Umf(-1U99_-mI`ea!4>0
zzIl1fwa(qC0||1W>e;P0*xu%EKF@fkx3KmpD8(~}bye#3axywbgjdzn_&xrEz}F6z
zGZnpNih$-;kq93#5JxeAN5i)oQN^EItzT?NoW+9|6_jvK|4w3`7#euye}KW;PlzRd
zg9MImv2gf*w^3RAGVn6{*Wr`DtN-sc`2XrF5$mA9z3ZdFb$s|L7*ejB3{~b<1{z_8
zS12gs!)h2UKj&sayedg^7-w;c_i3L>q~MgcmaK!xprYrE9L@Rb@yGi=a(Mz4@~Ia=
zZ76wHCbWsN8zrvHa)nxg%$faht~MoOK^;;Fxp^}q1<JQlONbiY)p3gX&e}qW`Qu;7
z5>+?kg?J~orHrZj)iw4lHH&<x9Rrv;X5<Sso-UxVqaCeS>`a$QV&j@yiEUoObiw^<
zq9c}H63^JKV7%n%0{^_-7gY@2mQmeME^WNq)C6nezI3W^c0Nt(zb^34+bI+Wlf2z#
zktP-*xQ=l2e%-$I&4otn<L$<dW5*v&JYT6TKi><<b%N7uaX<e1UHfG7&Xf5{F|z9z
z^MCHlSMTAHy)#S9@9~cwN#{v7d7K}G-{Ye0#6+|hle6E^tXIwL-J0c*w6CZ4NiJ!A
z%^mbJzx0XU>7mw6k0lvyk7Gz?fkvOP@14|5{R@ZCg|n-6ehQ^Zk;}lo%w?aEdgJJY
zh^zHQ-9krEH_LU*=8?u;@h|s}7ZV<Z9!5T@ctzyV(n4{57mH#Z%WgY<_p5svL?deA
z-fW^r1?<v?mEyi$?%7<I{xQp*GB?BA9&(3228PAh?z;?P<PHyNCR|bX8e)vv#7{@o
z96RrwKW@Gckv_aLIy|dXTZ+0nmB0S8yhz(<lvdA9uz4izBP_ZSdS7$X)eGErQT&PJ
zk$AkA_W*sI7e%GuW5EqS<Zdr!dC+(3SEsu%o>)xh!={o3iR9_NE@kHo_QgVcw3U48
zO6<RBU>e6Wd^bI%tIzk=Zzypu!}G^Zt-T~g(@q}KaA5x)n+W~=IOJCP@Q1jspLfgP
z%Luat&pnm#SH&qJwj$n5)rzOHgr^LltI5>l{^k67{FxpVg5x6xb>>17=o7^+4x2%E
z!!Ac}8&n-TPxt$LX6Xr47_;qa%JMFJJ=bTo)OgK>96Fl!GR(e&)m}=t20>nJv5R_Z
zxyICxIgkWNgUefgDDGI0mJwfjjyqM>@1Camqb}l5cvlu^>2-CUo+=fF9M&qWV-{yx
z?4F0aM2|c?UaBoCCF|Sep5Ho(5d^Ues7{1gYGo0;b$W0uF6UR>%J50rzkj@g{mQSR
zeuWmZ_}=p~sddx$HWzd2psMrtr%el*5rxJgPBDFpSN*z<3cpGd3LmehLgI{t_vtdR
zSt26S*=en%DJWcyjWh&C78{pE8rN#p^Uy4;&lC5CWA09yc5ZA7CX_|rz&zwqGwj9{
zxKAurqG$XX&%d3mVroBKCUR1%w4%!OB5P^nY*CRh<#hAxGHGDAZ`>-#oFHa({iMFv
zVPGk9E7<;^jC&+yaqG}_xHLp2!J+Z``IzV&^$049Yi=jD`8K!!Mg8OFTw%)XhV=66
zVX?iz!q-Uc3>d?Nm@}fz{1k4ZS+l*4I^)Hi&MG>R59jU0`BkQI>$U9bD%d^jAm$B0
zi*!NUTSHn3>>F^oH<&i5DTpwIpBYH3%fvEV*L`ZQ;?nXaWjywngii;JR%K0(=bwL#
zzinLP@rpU>c(7W$p9*&Zo5a1>fN3VdRA|>cjBagO#+{WcZo`sra=<kFMjn6q?mI}?
z<OwtUvXqsnZrc4Q22j=IvmEMAgfyCJ_IWb<CZeYG${!XVR=n0Gt~ax2f0;O}&l|m;
zrKj;=b#s-yI!JeH(pk^$)j`0cp*w8v$N;mLFz&d1r<gA;MRX%2P(J)M_)_uO*n0(u
zCW3C6HnS-o>%p*Pc$Y=?<TMm+gA{G?Dhq4b@t3mU{h@HBnFIJ1r$|Q}9Vq_zlSor6
z)#5bga7}~+*P*?#QR41Ar5BB<K?_z02^x{3nIbSMspbNh<>3H(`-hwT_FuA>uavUz
zC%2@ssryH(`)J@%cZk;rF|kHT3%HgX8|^1bJH1UTd{EZfuGh)+3PDJYuwrDCyDACK
z2<CRNVB3Qmc5!do_Vq#2+FLDn-?PLJ66hJuL~ZZmomO<2t{r+IYr`l)>UVU~k=W3k
zJ$9kOE{qkI9lc&zNH5J#CEXwC)Zxa#q!sNpNsqatqSyBx%WSB&3y=YfT5854-<gh9
zGYrzaxXbI~pw%CfDU1@-ps>)o&zDPIyT~InjMg-RfrYv{!^#>G5`S?}Z+f9>j>t6q
z`_{3I!+F`PeX6!@^;W8Co_{Rz0C}|Ye!YCqzDY|*NP$uQhJ&8Tb>6;pXq>!pjtSLP
z8r#UL>t~!WO=eRlYv3cW$uXm@>B;(Q#%!Q0xz8vFX$0dWxZhJ|w`bprv096j#W`-|
zO~gbmh(9)y*I>SIV?R1C*VW(h%7P0*es|b~hvSkj)|XDAk(610OLtVAH}T1uUP57_
z*&>^FmyDK%G0iHk4ELwwvd1s`yrwJ}1Uxa<fu2-jEA%c+WuIMG68&Xuzu~s{Q%BLG
zlh_H<kQ@^2wfhr;#TkZ=!a^$*Eo&E*m|Qx7yGona=(UjRWeJ)`cRM**ayt2%DvlB6
z$aD3f+UZ2R-Go&W+<J9|opat@PIz@n%WwIsn9S|ng?H9a6<MM^3%p|f;1N2Db<>8d
zZ88ZB1={*$E7uab#vK=3=T1tX1EVkGk=586+{dvCbnKU5N<w_xs^V{bwlldtfzM7v
zBM_OSrQe;PEJokmQS7kI+r12W4O=O~Dabd+5GMbogK$pDE*gu1D!v~X+mi|LWPJtB
zW*0dqVd`L9W}DU|5m5`)yyx%sa6XNDA$|D@raJyAW{^I^a(LEdi%n|N_f%QDM)F?j
z-kcn>AAx|Ca=A7V&K#3Fz^sXeJX%`_a#DO)aCouN(|4=saCbXCo8sGIy$d=DwOP&`
zG>wDnVIDj}&30MqDK-b)iP`D7e^GtB$mi$Ie+JF?3SW-MWdH03=8<Z1b*_gm?X))Q
znV2C^2a2~ctpf9`+~$wzw=qkeTr66!Otk*m;BPLu%TJq?S*>7>H};?-4VlXnHez!2
zec?w#W|p#W{<qE_-J^nD+2NJAUitR}z?wc0Wc9|pq+9x#l*Pf6PRT9jr!0sYgNMta
z9r8nnqj=oCsyJPJFQ$PRj8g0lOxL-bcw+I?hcNEm>SoXX4z7oxiRY_)Mh?lM!l2UO
zXSi`u33&$5p3EUhM;S#^73Io=v{;iaU^gW|C>zj2(q*%N8bFJ=;%mFr>!nO`gYKsd
zFH4~fPB`nsBP3k1=Mm<BBc=__ueZg_1vGaMiP<bVI1un~Htwlu`Oz&udLX{=w2!(=
zR(mkf>8;nd(vtO;>Q>I0s%Mp>$z&Q?2>JMH3nIVy<Fx$a0L96*tlbzzFdX7npAeY>
zisE=;wu;=2qM`5X=mVidNBj#i6+r&ea$1zL(n=6hpX|uX#Y8u}eoj{|U#mlFJ+$dR
z%r$}DjE18iUNrOJlB>*TKq{LC*o)+GbIwyn?#6RdiRu)eDSZwyegABQ=mL|4n0c$+
zeTn&gKYxqcPn<9q3Edoti9m(n>{c=ImWIny_v3;0cb+!BI!8MOiRxe{N=Iu`0aXMZ
zLLi}~c|OATAAgL4lL4E2Jvkh_&SxEk4?yo4w@P)Js$?p(`@N*>KQ5S;hqVX}>~{R4
zrO@xxWW5e-12n7(F{2o93dN2i<c%f8xCo(_H5$)^j37dsLC-JUU*~|!!N}{&rRZ|Z
znb_g3L^x5Dop<ln9i_o+^;gh1;c1cnkV1UeW?TAe==J*E2yX#?0Q_YQLc;ZzN!_LP
zr_gesQdM0WVW+Gh<isKmh@A8U2y}=l(aI)C(IU7Az<6q8ynS(AX3`V|XZS!@W`%=B
ze$Q_--rTgZhNP>O^;>F3kG-fEKvplR<ESBQ5q<wFQ7`Sc!;W+sCxRupI5cpln_I<y
zTy|`_<{beDi>6NgH+y?GRIhioHJN&(=0AH)DnDY3d~L3~3rtf+faG;BozytEdIr%?
zyx?<BO)8wZMi7d6H*;cpr+A%bi5(h@tnhj(mD`IV3N=anE)7MU8IBnv_ypuZ@{rf;
zWExeV3qo;>X7VN0?7FoND6vYdI)X`+J@3GgKpY#BI1Vhe*&#};B6{2|zwqMXl8*G^
zhVNGw&deB%%C%i>BoP}}bDHRTD){JPV}I#2wN%Ur<Z+ZL{JO@s?{z>zoEvbL?wkHn
z?o2t==a2y>x^Wa+MmLn{MIDyY>gk)rpURNmq9y#$I%tb!J~R4Ih_yLd3s9A(#mY@I
z0Nf1{F+0kvm2p@ir?J6S&Lz#n@5e=JyYuJ`AcBHJhO9QqtdPZGlyRuoma8sVfL$&A
zAnTnyYQArR+ry<T233;kIBW!`*f)0}(Aw-E&MokjA4I5HMRy5XWWoviu+IA7I`+;K
zZ?rWTaxnXMV)u!_qY~TnE;evhqh@qM1PI7a-j5BKGfg>qK|`z&5}uD#78N@4!z;!R
z+uCf8c(X~<f@DJHM^7cO(sKFssQGYnVt=qKAOtQkgFJm9<6!#hU*PU^8laEvXZ+r>
z85h7GqJ^Q#m{YVsz7D2ejVOLxD%FF%p_53J>$!tCs^q+fEIF4_?}=dYM~MLWGTt45
zg7}4S5_<si%%*zVQ{1!_At4&-v~}Q?raHJ>wASTVKWxFhsdtIJ<f-!^nBTKg1m1*|
zMPdhi>va(TOUJ{4ynXS-js{zAt74;3K%KdyS~=L+zHdaPMV88~mccX8aONw7muD{2
z1I|r5((QO8y!t5WeW40XqGS`#t+u@NLDTG2NJ$?2o50H#pPFuwT4HgcS5)_YXVteD
zZiD`5f?or#Byg<2Sk^h#hz&DV2RBZi?Bij4uMKOV2`leEHW_mBCwQg96*^mBCh^i}
z+ksn@{g*yor>1U95ukD=1m=9_>OaHb++X=A?R@tYy@bl4qwdCA?7;%I1D3T49GsI8
zVa{0rcizOU2eGUTJ;6#HYg(ou#hyuw?$H+`%cMa;cB_N6gHZch9{p70FiLWgU`Ts*
z+^LkQ73?REJ31{_LJyfDi?A$TezEfPZNkdm*-QU6)zV_s-!U=>i^oWeHNyT__&#OT
zJ{;hT!4uemVs^ytX{sWG5a|~@8O-loIz{!~N6A;^8V4VKWgHefIc8$@1-nNG$z$X6
zwn>V=q4pNLmTI<4`PIw*6#bTD%z&wO@H7)U`X^uo;^@k}_+pZVDcpRP@DT1&7-~n@
zdRW@$adMQga=8UEtl6vLT~YNckSX>)G+jlL>-TLreN2lZM`H4ji3kQ5Cwo#YO6b8J
z)yQvO-^mJAPaXAaG@oE5P%jP8>I+`Ngt|Ao3~M(K+eJuGWxyOfR@1bk3~SHKF|p@=
za2O9DGSNJP?~8qgftu(4qo@1%C4`Ona^ryq{T!~`Lop}@{QH3ibs^-cA=Ll+(;dOW
z8LK{zJbFBYw<k@Sc2VK;m?NFNA+cjHuxWfbRIaD8--#TRJkgP(%=$63si)_89?=sC
zMtBu3gGPj%(~Sj8-pw6@S{EzTGZ!D-Jc!}Lt8zGNT)S-4<_O%fs-jI~Fm<Pb?R5Re
zQ3^h9!;&D3JJMOy3Eo-IJRcFgQCV_`!K28Bvz}Q|>?kqTZlO85Tnx7;KR&bWA5aea
zFy|d%fde35d0H`D{>xoXg65y`7i1V=)G3_LXF{jA>tE^kq9)?(F)eG1K`Y%6nJ(un
zCjQV|)&R4S6ficFc_Lp*Z3r{XX%3g=m#H25R>}KIE%enoF=g#wy3&Mqtmu>_YUQfo
z0HA`dU7dL|FFagf%kGcKFXZQCI?8RswxiUEoG{+=jO*_z{!&>mDSCN&`DeTQ^s-jY
zeynA_L<Hl=Ya@L4OEF7~)BNI*cwZpitLgRc)c?br9AFR7S`Xr(9&LH<YGhX$ROoGF
z{UYY7Sxq?IjhW^dM-J5aW^tLfKc@H77Q4S{f`{vrLT0)xHgr3_16;4p$@=#EUNR87
zv$N!6wUzB9uJ=)c`m@vaQS64fPdTALcQ+X~pg3cV%>wh-Gb8^|s@*|$Rc<yOv-z-#
z_h|R%-LUb>3t0k3Sb>q&jeYrTt+S)Z!?Dj^KFVa!(CUnbu30_Z>we{!ocnil(4)OM
zOrRA~Ez6hQp!f+*Bv3A80{QjxcRDL`azdrWGTrku4lJh5p`wI(nw?4Yd^DdRCQ#4C
z4pPaC@PCMu#?Ta#&^oK_Zw6)p<+Dz{EYTJulClUy46k3pFVZB9TfnZ%1?F}Tt@~2f
zwuKuPsX&F`lY70D1LwL4l-zC)UAmMO$<6ns8pEWy%mZVpjQxJKT8%A|{jc*whJ;kh
z|BF|u_EGd#75ImKx^wu#wMxkD77soHBSq4W-X?BX72}x6ec~b=iow$(9@0*wiEtsl
z03HukY|fW<l(6q&o3x`k6X9wyr&)VlNvj~uIN@x^Qnm7QBb@SYHFyq+N!Nu(gbTls
z|L;hsWtDLOxAA$$WaMvLayoo#Pj7d{CPLCUZ`M>j@s+t<p0V#d(_#WoHr&-DyojS7
zW>kQ?{I(Rb9Y=G``7)mj_=Td<t9#{Eg?{^5Ycuozdb0n6jM<!{nVacYmKiQr2zPe4
z==5RRO-94kP?Z}p>3D5_a^1&=hrN9KG_@t4Hw|i8G4qc9iT4-{OUkUsQfn<e7Fy!n
zv|*XlX4}S?5&6;pPe57t<J_~U{6{ISVs=Lj^go|6y~NV?-Gtujlaf|?d0@&l?eDMA
zc&NIs#iAcA*<T1{&QOUt(u=zp8#P0O#e{jxcxHihG2q&z+a<o(I=4q$@1_$D=cb!C
ziPKqXpZ}LPwCap&#}}0BX;tep+pt**qaU&={!oSEDl8SZHo337-0m?eG(VFz<gcp-
z*$GDCWYvL~-skz?-Uj`{B#M)m#bNA>RDWHHiRw96IE*C={(c+1>CRQ9YmKN!)D6M3
zgFC#RDacww0`Ze5Lsj_?CN-`cYLj=mnSJ@DM+|m+fZ<P0&ay@z3ODFZ?=-*5s0df;
zi`I@n!0he&rB+8dFmf-Sq&a^!r9!m35H-c@c!bKfpnXhv>9n!ShLt6<d&DtgM|B9B
zJ!<;V?X7!yffhIy?H(ws+c2K#42FbXTIsRPo6G~nmSkQ|>r0Vpy3&k^)Wz}%qL9z1
z9$yp~hNHisy|w|z%oIo0!B#B}?UI&$7JbJ>_=iyS<LUGDvb)P*l$cHH_E61g+1h0l
zITDsfNKo0iUi3uZQR6W5JVepEl6;vHS>Pmu-6^c*zlrj^<2@AX3!J$Um(WUv>pBt>
zxs+-Ln%Y91%8I<jxHUH7HVF%RGO>%DYgFQ0=qN`y2iPgWVrGep21HDoNhcBHj$>{m
zj$>;i(AGW5)ms6f7I_zQP`maBT}iskYG)*cS~L;wp{%^$G6eyWjbIlTk9C;e=pz+r
z$L19uvjZX(G#YEFjDsxGic#*Jg2<Uthy^9f^oyluECoJw9>y+47E^ka<E?Ecs}o%O
zOZ<*0taR<;Ps0g0fM-)n(A6B6SM^KTvK{okO1BHIn~{~lx5YHSuff_PysX7f;5tC_
z#Faj2RS8xU6nj~n%_t!loH$XvpXPHt1DA2yb&DAeS7<mcx!R}2E`fti#!H%{)uAS2
z1-^Z*;IDb9jzyEy1~-I@jxNd0ab&2s$O8lvH$%W7RfB?Ef>=z}KgfX$^3ylV>#(UO
zg)k=p%a;*$m@6zN6mm;2fjRy>2qK`9cHUJ2=n8caz+wmS3}*O{QE<u^3LP1`n8B2O
zz5Sr*p7lB)oR;&X^CBP~s78(qrJ*0ck+p|Smu=SEY6QHM$=+VwtTkG`$1X#D9<8(X
z420}Vzy^)3LPi-TZNQ4JU02W@_UMg?x7t>4Cg6VXM8Ja#2wyK-Nl(s=3MV2#gS>8?
zsEIqI-z~ZK=*4BnDH5`1ZOhIry8hhJnDM2yKKeUI3PP{e$ukQ|hlQzM=75V{DXowh
zmo2)6Z;PCCe5R5}rHn{f?~~~uWtZ52Gx<|%TXqFLx28I3T?zPvlQVc-&^G$1NbD40
z67BgePSFV!PgM;T5M5=~j$?#bkzEn^{>z}T?-gHRUuy*qKV-jcAU;Fji2;AEM%7YF
zCABveP)ll_tuqKrgu@U1{1hwYBw54YC|-nwvkchT$J9$J3?hVtWiovk;{G#?g^pGS
zt;!6XobFuC^)zaBu|f5wx9^8f^m>LL86<Wj(>sSmS0fE8=E(z!q1~t!ux(blG?xoK
z)RFlh)}`N~M!xHteIxQ~>s}9FR3Xu$=HzvL{)FSr&{dE41QsEgJ0MpaoA)Dw>xw@I
zR7t(V2#4G7GV+K@tcmw%8ph3G6@#8y{!%umvVQfMM&j{ijekPs$567^6U=eTg8LHy
zxqZ#oCmVF-r;f;^^G=A{Tig)PVmFeF+8hdg8yEm|WlclCo+MsQ-FpMa8V7f-FZRkY
zq`@S%1~{2@o@B!P^@t@LWYWZQ>}K*`Pz78;{)bf?!59wOql33Ae$4pH`J>snB913(
z`OudBHflsB+&hq;ahS%>3DZqK+#FvbQivUF@c<CK&2jqC@g;R=P9tA9v@o6ZnsoQe
z&`!@k5wZk7uu22XW`e>tb4jCag-R=lf({5z(@7&fXaQBz6XMV;P#Zq49R(~N87m7d
z*tq0)N8U-42_XULqc`RBY(H)eGNZwX%8Lkyg;KI4PR<jH53=k*Q)egIUa0!s?XE$D
z2(M^q@wr~ddb{kZB^@^l>}?v1zAG-<Yc~hYK~YC!%>NWQe!rc`xOZ6hiZ!FoR)}BO
zy2=(s9U>6QRcG91$j&@DiUDs!z(5O|4pm?_Lu3N&<URsKnI@t0kH)MPZk8D`{9w4k
zPiXgFy9JDADmtAPujAnU7|@Ry)KOmofbk5znNsUTinL=vT%j{P`t&;}QEIXx=yMR!
zURh^xoS=Kie!I(O(xZBSik5}k2G#R0MKJj`_Nl+Z7w~+hYw$jm73(_tIs%yu@pER6
za>!@zhGQ_@2V>SF^p)FNkBIzKS-vSVSiNrfqeD^PyZbG_ZakKF9CQZtS1{L6RuL_O
z6dKs0=ln}I;8#`4z?+_6o&yhVyq0d7wcS6<q-qOs6>WE#vyYJpqhS`RO+j+rnwBTx
zE}i$>Ej?q(j09gfPrKz~?j76_Ar8g%D>9b8mk0bK50-)9*MA1T)*sgqUXVv`awY;-
zmo0LBhhd=+C(f1f%9u54-7k`)jS{*r9(H37C@S{Yh53?8rX@mzHy9eh7iiV`T<GL2
z)DqThQv9qc+Eh<ze^R?5Myn&;keIVdQ2~d+;#Ova{B~ZvoE*OYLYSv!>As?LWGIj~
zN%hFGTMD|s*qQjeyMMsTL<3_nHKsq{Ic0}&pQE#R8*$#%B2C1vvYKEc|KI?46BKk9
zRsftinEh^O6ivNj$ms?R)DO9m#Lk)I6w7RblVP0^dB4j^>5GlX<d*)pXymV%#H4jM
zLS{$l3lvd6{BrZpkENEkpFaBSGVC%vSn48WtG>*EI$@hzb;eS|e{5uJ;N(YS5}k2!
z6E=y%m}v&;6rEJvlRI*VSBD63nL^ku?k!+i8Q=S{+pmtvf>ix(E=dmEtG4CWD)(dG
zrgM<A0N3()fL6@lA5-vS$>)p6yU4PHVw3Nh{a!1wrh(@Ex0@KRITt(ur9<xUUTS>l
zFF`AY;2ztXiRe^-33Qz0i%emQufsw){O@<IuL}m(sL;E%ILQKf!=ri0OIxrk$r7kP
z6uljrHQmKZdPjReozE3Hd<El_vOzW?!puVls*scFW`=fb0Xs*V!vf<Sudt|L`eg#J
zI<)9#(!?JvO2P#>{O)|+)S3WOi!{SYiW2vuhzp-xIgEw|I@5od_vUzkFivG;;a0LN
zA6Y@i<D)pOfueogu+8ilas+)+CnKe&L|rIKS7cDmEVK|M^KSSD%@Ur*-59`LmLq;f
zv~p9W4EW^fC6=LqQ1|haI3Rb{+V*A6)LTUT!Y(Rgb-piKo}OlSQJy=4YL_QcivFlD
z$%Nc{X;!x~=SEwgNRh!WX4$zHMsT_ChTByAD6ir@vipPW?ws&SRqnls4j~Z=R3L7G
zABao@Z^Z1%(v7wy@9e+ugJL{sYWo)ZTE~<y_~Y4Wjs}lRdHu%TN24FqRF&$fc}Ow-
z)3M|tI#@N0(ovqDrnFHUwC$LaNOr*EE<WC@FWOPQp=6+2Hc1tAmP>0_Wh~Qfh!e_}
zYOs~V(R=q3InLvV&nr{a+l8-NS*~ank0BI^W8sZ?oj}@-v=_0yXgg8@-<T|NL@2%>
zSrpmpe<y>wY<Z-BjWK3q+#In5D=xg8B}Xa7Qt|Wy(m5=;ga2Os24L(8-u(Jo1lOx>
zq5is0zQWU!@pYH~z4U%ByBWkwT0}4qw;~*T-tN^I_Yb7$k!6nhkHQWJ<h1`zYBJT?
z3jZereQ}Nry!c1%uNGNM(Zs5JvjvaI3E#D~R8K1D7&~VAtcz2bopDW7#TV=1(urJ(
zh>Kztx@Ct&*3j<?DI)H=L;Vy3JESMmv3q<OH|7Y>5v1SuqyTP#zr}uDh5R4i6tRC$
z<OQ&Y->df5vu`7Xu7-2|>lm>8A{+H4kt$MD8p#;%HZ#H2coYr02jBEx+rczl6iF$w
zB#~DrY6sJW@T*@PIfPW<EI<DBE|e>f@ant?;JCoX_tx0=(5&#0WS79xjr{r_1B8A&
zk4@P(M94lh^Ob4hllJ?u_stg1aaTIKYUj!LGmROP%*Q_wm2wU;cYxa<Dr_QZj{2WR
znO|Wv_G4?`+jW@$dTxm;$+Fk|^%zdM-|bHWJ<q$FUxyLUtrNoKD}C*xpIg}-m#%mO
zolm}k^(9PU6~;9{7>#?l4ylwo^laO_iy!y99ZT2KyX@p$_ZnHk?csg6$}Fzo$Vk|{
z7xP|uIA1Yf`)X_qpr0La-wRc|iYL(~%8fbme|-d{*BwrO!A{FeQbIuD-UmGa%<;#o
z{d+*b@5}Z(wrhcMMI^t>Z8Y|DzqmaO{0V00oJ~BVk$yjL`qiuAFX7X5AZl7Q*}$f{
z%<ida=Q~l0e|T7OE!;?oL>#l%%}7)gn>E`BA7I1j`MHLMwG?|Vn<WV7_T0nj^9WYU
zZyT+d213io8x}ZP<o;5(FBhF8g~p_D3zt0)O5?J(K#himewNbIBiZ&zsk^`<3o=qr
z^zbkF40-A;`1zbVta|_?i`tVweDepXm?D+J@2W$i$z9;(4iB=nP(6b+kG2X5bVgeH
zxR)6xr<bfvcRf0miu|zoVjylgP#%|Z2$2bc4P&1!UYQksj!a{LOB!#8bAE7V%2<rE
z^&#8XC&i(&=Xom89|)+TW4tD{aJ;p$_9lEM8FQX-pwc59d>*Ouyp8_P11Y=mleOl2
z26W-zBu6C<7)!={`T+tK@$r7*a=bQLoHf&RT%qQ+?qyEGqUUCY==~-OmiaFRi6@An
zF{5KA=dYjT`#98_S7koyAVi&YId4Dz{7Ko)clx0;EW%ey6XFoLKbrb*xECY4k{)+X
zZ)DVayCLT=ywuYQi|=Uiv*_9KTY9v`TEGsn?s1>6##SJ~c%v1-!H0u?SKSZD&6b&-
zKgvw%ZG#`!#BY`K=p#9fqZnYJJkPI;0XcT3(a2MPl&kxwaoJZ;6qvvG(0Lzzm-p7n
z3bggI7`M}lnmjI!Kf9O_vt<HFUKdlgB^kM2yx;lcbr?%=eJwi9pWR05@^}v08h@I@
zOEZQAE`FbTIj-|3&D(cKw#-POnZaTJQ2$yQuMs`FX|Lb4MuBlCcz7sEmK0uL>H7E>
zurDY*IW5k`#cy-E$h$t99|OLf42PBnk{}GfC&he3e4|e}@L?mwadNQ&aND@9x#`A0
zlQx_tP!1awEq0)2gXwO^O)-y2BGu1nCl5Z>Au`>qe)1}KGJN1Qu+FO#>sj>kkvdXM
z2zd+03;wLla;%zW-l19BREiO@8evl9)})9S4}L{InY={KrSH08Eoc7txM{#{z5+eU
zK?nX`(yUwr8JZ&dlvELj+b)-HlTqMoe?MH^pX|Jhh*Z*j3)!(VnxXbO%24J0qrIbq
zwVZJMcwg*jc?r!9_DNUyH4u!DZ0OoG_&C2t`I8wdxc`87#Cs#nbv2B!!r0jLW^89t
zOC^=UV|yuev;cnAdM}DwiNw(Jy09h~32pw#qESl$*<oUlz$pXsRfS{gAk<+4xog%$
zpn>1QIr8PU{r>!|#)l81M#4q6`!y=-zJS_mMxgjmjLuc2Mtr^^1?U>e&%Vbv+dGJR
z--eY%a(hgMumg$<E}dvqz-h%oaUxvdLbeZdwCYA4S-qQdSwddO&&XfQ>!^O`d_rZy
z)isxRUV%{wt7Gf9Jv>XL$$(8bzbi_x!j&(pVv_wyvex0h{vZQn^*D8-2O1KSbd?s?
zjTeup@DAcn@lAcLXn@IYzJvJ4-*ELVhYZkm?hTiYge}O07f{mLFSQdZ<lErWBbyRx
zbWxTLrax=B8;`KW;7*dQWzDz>y02}{h4e(ub1AWYxmPn6{<U8_?RAeVTc!lalTt`~
z#EEU!eivhBdY}L+4r-XU<XJDt?e;;k<zq5&0KcMt!&3%UCz(c}E?V6+8#CQWnZy;<
z;<UttxQuWbBy}3nHlz&fz&wF4J>Ffnrc$2yfjkgV&p3RpxX2uYf&h6H=bM%)rze0f
zbf*^ehu~!buWyM|{0YX;)MeS7{2rStr7xLb?&D#(u24UR;@7n}xZU)f%ORgT9}i3Q
zTCSphTFX%;I8=|#>5+FkHvc*0&To==l6mUXDd&*m2y@uQTupTv?nTq8GhOor^)3B&
z=n01JJLLJ7Sd=?q7mM`inqAuoSVuqkL5$0-F?2f;>pqn-Wr-S+L=&N?4`6!y-6z84
z&4p7ve)3NeFOoBk1?I$lOcv#)@g&HH@$lex$@Y8-9?QOG!2<k7-ul+?{UV}wj!jpS
z!Ip#kmt?8D=+u!jnLg906MzNeRXR73J|CTgLHsfq!IO0>i5k{u1T5}~FL=!RL#Y2T
z0dcxtq_7s(Z`r-FBsO5uPc5NBhuuR2Ay(~xl!*%G8(KM<$rZFQ{a2=G^>HC2tXM*;
zq!t&H{b?Flqon5g-B=4CLMzi1NO&fnFm-zyp91(wFffEwV(sW7k_(q!Lj9x(8gG+^
znz|)eF6mao;~Iz>t%Tp7lL$RMk+UYyuCI}Q&FwQc10y|F6XBii?a|84Yt0LB^3QE;
zkU7~aRpfr8%}kc|L#Wm`>~fwL^$75LJlXnYYp+$Sxz1E-w2-dj8uyS8h?L3>I<VcN
zjmi_=?7C1Eht}6nBb$;u<>NW%7+GL*q53yGT0!%~yk%muF&SKS?7R2x1bEx{@in>O
z6_G3u@TwI*uN`3!ThO2<MZ2g8(clhtr#sAj-2=tG%qjor=)k%e^b|(|MRCNDbL7M1
z#@E<qV%2rrQ8cd0?@Ob0X}WO-UP8dclr6MHfY@7P{QDAc0`fO;mw&b<74s&><o`?b
zK+k;{?AW)^kJ-iTN>PN1P`)Ub6RQeX1P*F|)Y0n4BRj<%3#+0PBiE+9Xp8TNb!h+D
zg5dy0NF|X@JMp1)3U!<|SWqd5@jIX1&H_9ds{~_v9_q&kcqZ0%mYkZiuB+)%+O^wZ
zH*qZ!LKf7&jO>GgdrTAX-9Nvh5Iag@BmfbhjA}&-an!b$+?(f81*1sQX<75>v1e%w
zZR69D6cBGjSkBkM_tWMz_4Oa<vGfLT6g~AMMRp15b^`*cAT%DLG)ss&Q8c9F7iFht
zsdUPjv<%)2t-P{-+BVu23n!TkkSsC}y=f#%pP$214|?ZTaaq3Ee;VW7*cmGMH!yTI
zSEmrtgc}5hf=F0T`xmOb$E%t2W4i+4oxmBx=qv4oFHPL$02d;ShbcFF%j4m^xp_}=
zOb|gzaff=v+kax|+J+2kv~Vy-vZ9#B&vC`O;yVlW8~_w46XdKb&rC#FSt3uAMfXK_
zEtV||Ga;OZJ6`^xyDP6WfEtuWR&OOFfDrVh8fU?oGj{{_OOnOnQ1b~C+ts=W<xKmS
zo<r<?F3DaIbWrA?aB##ekxZ4~?PO*o-?di=;Gdv=)M88zBF0f}U4hX`SZ2MyxoPa>
zfvf!il3kSA^W~mCRmJ|nlNTValL~mVrZLC6m{Ei6!BpqdS{@d^EHbMEH5^6W%jxLN
zs<Xhf2S0Ja_hU1a=9X6%6Aphf0tVn>?re-WTbO+VHfkwyUfd(h)2Ym<o?4)cxUCXH
zi8mk7N(sPsvEHU)oG_^2MCimTh*&n3ZMWj**eYA0*E|)T0g2Bk#?<Sq`Vg7S$AS}x
zNkndbkC-1CP13Alw_3chPll-!#{!NRvtoZIwtN(I0>m4L>x^Cey*pBg54$I{10CoZ
zWi|=K^RJ(+D0p{{A{%|2u1+nfu@x#DB<|fM4mK@@->qmHdZ=cmO|z>IcEh|pUU|od
zXk6+zLkzdqjM63oA3jASxp$=1U9%I2?*WVxl?I|X1#L=1!LME^>(<!zBf7nwmJQ?#
zh;4qjvhP^Y*Wu@}Gni8L*B4c7Z#}Ts`*km`v@~!ir<cd7FAP`h6k?uw#Sm#w?QZ<f
z-f12O7lhtpnhRx?K~fpPhyT87#>~%L<HRMr`l&yCg?P2Rx&IUc9}Az*T8UmBsh@1z
z311|USBLp<$VoggJB+w5gGGA3Mu{yA;|~qEyt>O-aijO%v1R>yyD=~U&OVIEL9gcE
zWfZ9=4=|`(Y($){!H8!m!XP58Mf>aJ;UTVUOnxgi$FU3&qC3d56j9W~!RDbtBXe(p
z0pnqH8C=h9F#fL!pxso@w)MIrF)hcx<|D%dLfhd1)nZT;FzK`t$fTYh{pQepV?j%G
z{OwG&uRnt_nxPzQm(<9d8I3t|b^6nIFt~KKF=>qAtoYfmh};7fAvLGBis%m>xQceT
z!g%SoQ+AJ<BQ-nHaxp^V<AKhRq3E#K=g99`iw@BnCu?&ZxtWM?8g0v4yf4I;UoY4A
ziTx3Hj`A=9OvK)R(d}chsAr{42NFb?_GxU=PLN5YOJt>aT_?*)rW({c8nfJrV{+Y{
z?Nnxh$wJZRE^wvw3TN5i4feCa30m~a(@RDQ-@%<T45p)q9+r`DoG+*u%ZZl{rm6po
zc=jO)gd_~7)o(~@zX#_^J&ap@VfQqF$u~3CIGa|VBz@1jm!mcaOVo?h=vJLT`6AY4
zxkQs53wFbKrnC9FJbY#lLGc0m2O6*ZN6uajS6Kgy&5HmFJH||Ttl*@b@8h!)88e%=
zSp`Vs$QQVqBqvq@qqv*~u2cxHb!1&cS}6ZQLnyTmfyIe<KD%rG5SHz$38|D{s#{cK
z;}$lE5yf}4PrbY4D$=yZ8}h*T{F4ZaSU%bv0dBhQhH&1b#|2Y}>*4j#old&D3<daw
z6&(_jxw?x4QZ9~07va2R7975p+&|;4dpzqqf%_C{rAsU9+#sYy5C6FptKx}Z+se5q
zAL6r6f5=s5W;>5)YpQBwo@?#?JL>Z@mI3Stv!(uX3p@Ty4up5PuZ0Rx<3iO7O$ml6
z`A#v`4@L&8UQQ6qyd2oHAOG6uSyvX&XwA_S0!@nk(n-0>Wf&NW1C2yO26r~(0BG#r
zA-q8iWfUdipEWpOus)tK&z)7jmyZ_(j1_+l51gLl)IPHAn(4eaD&dz`_BC_n{1ahX
zAtk2z<=Yf&)iwlC*!VcM)M&P+tb;=W<2!*tDM1D0Wqam*C7|LBs+d76$A-8LM(auN
z8(???X&=i}FDo=(AhpA3EI4r2g-1TGzPrLdIjAAr39SD*g%=-3gU~;MLx$O!scN_N
zqmJxoiKXVzqRCuG&z&}N#`hcfC;qhVZ-<TVY-Pqvs=vJqtPUyp9bwWL?|gvv5k9ZU
zMEUP?4jMa?X$2alZvfnh1+2mU13sb`Py=D-Web;DY%>rl!;<dskI|ABCvp-0b=f1{
z>*LuE%~K70xFjLmBhfxWdW(V!GOzZ3>AD*WZjzst#VPAH$Xa`>K&@i`xxsveXeV<6
zhNIzc3vINbvuH^Njcs0yvjeVqZO&lnC2w`rBoozg4@*+&`5G%8;Prt0n(rcGqfyr(
zWIKDX8Z7H3xecot-5qI9$TIjs!Q8gk>Bt-t0a>@f*l9Oz7J~agk^eu%<}$5}N&NK8
zOZpI(Yxa98?Q)wJi?am><7Aau<|PY=ub6GFY$3h}ZQWz(r!b5jU&xZp>(VHGdFQWi
z=F13w+6!x~h`gIYUmQN?t6{S2f5qS^pkiG&GvHc-8*umhqU^(F><hnw*TrG)zku1h
z->^^Y%%Q0cXZf8KMlrt9;6G?KHC>cWrM2U4bhG(kR5-zxUZE?llOXzG_Vd);4BBKQ
zGFkFZy`z^o0i$oFkvfy_7Cnr9^5wG4`FlxiREO*@R=sy2t|><)Rj*YU7I7ICV;wN7
zL}J3{OBcLdDf_Q`mE@D#3M35tLYQM09i2^{iacu(tkU#C#b2{Rpgm;E3qG7Th1LPY
zFZ=Q;X?qd_P)O;5REB3)2hh}`#|vsre@&JLz6B(l=c@j}>pVi^{*OD|RXF00E@^nA
z^QMx?aQK0TG|X4+bS5@i<@{fbYkOW;o}4D*vQWuUrHM1DInf|LJejH6PYYi{TvqSh
z=lM$gVa=l2e3rgDqssf5@VJ|R3%>78;AHJAd!Aaji)NuI-4t&YF%?1#HMRdud_}!a
zv?>+)-dK9Jmt8#Nenhb6fuAJRqG}Z3%q*RbmQ|pzJH)z272sjFEXn=A?q&H{XtST=
zmcNhQ%M?>)-aGJpdsw?94Otig+)s8`GL)Uk>mD}&M*d6=jbT;mcxg}cgK*XqntEWo
zxK`F_)5J8XrQ8cZFzu1*;qUQ)p0&emG|7lNS1oRmnvn}d*OB4Dhtq!mh@ioS>s*Rc
zKq-xcVv5RHJ11;8%-i!-AtYFnhPv^EY^B{Sj~}4D*v~q6*6&;T?_w5czH=08wTLDo
z&?qAb6(4rQEGHcdSxe)t;s@I%?xTB<Q~>gsi10v)Y!)ptifL~zHQn`MMvsa)h%68e
zL*(o;_FKCEl^%OiR+niGxf#yEXsRFCMqDP`YW~hG&qt2s?t|9v?J!?P!Gpxalvyx!
z{b^`+?p?yeP^{IM-$Bo^kZGa|*F4>QYvM$Hv);A|er%^XT2$|k!09>Qo5jmACB(_|
zL~sQrDC;V4`}whoUdsf8gbu#{D<^jKcOV{bA*ns2LP^NTW!<hBfP#Gbb`C|@$p4A0
z<9cuN>zA>sBA+Ixov#ET!PMpJJnUY3$B>(lG(y-O+hxO=-O6iFfO8HYE^Pqf@^dJv
zZ)IPcW!s1x`;xH-$bXeUUv#J41(O{#9ZGozQ@n&uL@REk`Ak3Gqs@!U6)B5ika*AS
z$53DzRTs}=HG^!5CXJo5BElSeQv{rNxd|ga!0ggO;kvUSWi<8NXju%xh2i96aCQJE
zW6ZZmZn-x>z!6o(UG8l3MyD}6>)`LbBg^@^>*Ra%ZGw4Ci|c?pFFo;XZG{)S;ud{I
zfRr4hqj{^O_#g*aUG^CPK*yDF0}!>kKR%kzGUh1s+S^@w75G_nrlHw_x1P}Z10<$=
z31sbu;w4yK@8!rJDpd7xG*g)-Pcf5>eB<lv)6Y2YUmPaX_*M9B){eMI{^vl*AH`py
z^k4v5a4O76{PFiJD_Lo$ihnQ*q!Dl_ZV$c<WL(MWe7tFa!x;Iq=#^Z)=i*P5^b9&P
z3qw}dnGU{`b~2INcyveFeqv~N$wH+<6VW6G)L9<N3=0Vzyw5KN^1Z&VAgh;2YblF)
zf`sO6ylHoA5O~P)<Pd*vj^JCW5=UkFSyc83>%%*K0%}A)i><n^iPdLl2yGPL!1B9H
zg5-0`X8r)mNYNC>izLC1fZnS<9$UBKCzMC_;-}v4O>sasjUytMPO-|Xh4QwV{#Ik0
z2u>wF00Xk}E`%6f-*G5$BArc8w8gPeaBjy;+PeaPoT)vy<ErnD1vY=T=T(se5C$Fs
z$f|uc0&6kF$2E83CH(FPxR+U4pka^$zCVEC%+hK>mkHg&1mb;_q_m5G7iazOL<fBi
z0?L2}6Y9lfrbhqc@c>ypxUHG;^0GVpY$Urg9b+-%5Sh@AQh*G;<@O@o5~RMe`{(Is
z3hWG95O8^ufCU1X9a!^Fj#PRmQ2mW+FNFK4!slOeos0d^)|w_2mrw1!@tvLRHHTEm
zr}p%pMhK1(P(Xg5dKM*K-Fn0W%48^lQ07JY_3#HVVc9A_T1W~i3={n>rYGhVl{bvQ
z^4qK8zHT9xjQji6pMd5W>Nbp6NSLE6^KSP`)NYlzK9Tz<B2$v;-Z80OU+^ChM;7+7
zy>>%G1(G0qKnRcl+JWvJz^=-yK8Ftnqv?j$ieg~!gDwC!#3z{%@iu<Ye4we7PZPrV
z(q3s)cARGs!^0d-&XdQ4&ul28WAxt3Q6%ZULb#HmCN`*mOZt1D3!i0%Z;RbLGbLXP
zgB5lXIJv6hTJ#yH0r1&lVr;#>NNY1~59F7_Yc!C?3tR=_ThRg?cN?HIpAVr#gzv{o
zKjsID#nQO}U>`gDptT2I)QSO-JZlB;E+@%zQ~nSL1s&f@kC)^Yy$s|GE%|*J&KeD|
zN`W}L0EZvSq_)xoBhHVl>z;(nvgb0t^+3@d3Tw>cupa>xlgsf-u6IT1?N9E(Z2abx
z(N`zL3}!P^3z4izCiN8_Az;V*X)ak3LuuM+#fsFI--YMQKC)7XOf3DJs0@(nen(xs
zysoF?UsDPv#q79tZWQNHmi**@h-QFMe-NVEx1F^Uu8<@OxCDrgBb|QY33>0P*w0*u
zM?W3#8mu-0X)gxKvcw|ec0l_azZc~`x2qQWQdv)`YoS1i0PfvOp)3?qNe16fAXBDl
zIYWeo9ThVqIS5#*y{IgznK)xYsO{$cKQtjE{Gj~gj}(ril3VyviN+*A06m+qJYcJX
z51-1FpN1-okgXf_X8_e*R9oY9b@@vN8ITRY_Bef_Zq18Y2hN769zgmg-O*F9!AjX&
zPqz<(4|l7aa@Tp8Lusy~(w&G!=&X)p)v5j$&I73cI1gK+>xllS^8~7FHOGtK+G1Ti
zyt&5V9i8|ZPY8UE4O%H$l+}4j+j1Ai1x^XY)laJu3j(c*p(7+vYm<NuQyVw1SzPpp
zdW5M@Lyuy%crIyJ2+!(|8{d?BN_U@PqR)mL7QiQ&bXUeaOEu_khn<7!ZdP1rSB}wi
z`{!?ijz+viSig(;vi}F)j?t07O4mFA^ik$PG+Oh)BbX4$zPZqBqm>J>Uf05I4yt=)
z&a;ih)*+)JE%E;qupZIlXn<Ml=js2FBqtC{JhEfupAEUwERigz&>*Hf3Z{*)g8o|t
zt@D@JVUfK%8~hx_*=B}&yxl70VX;u|e{ksk9w=vH_XRHw@hK28qSx?F1@c@@$p4Mj
z4LOVcJ{(-)YNPW1GV%tFD*vkl-o5%t;Hp8URA89=soziV!8kgdNb9d7$kVI_A%CAz
z|G(_NzWjYG{r{g|du8#B^j2W_OS&t~_-qr{d0sPx&{}4ZVc>t)w9-#mZk)uT>XM@B
z_qP31Pbth?mN#uS+tzat7MHYo(aZRrnEy~~MG7Rj2WBBg<<2}KwYJOD&UQ(=WD3KL
zzw+qH|FYZ8NQpY<=FRq1>{k9r18)4{8nv?W?MMoG`{OP_NkF`<!#t7$tc$1XQI~Pb
z(nHt}NfJBF<O0>Wlw)Y+k^YT_P&LxwOA5trBe(UJ+XxgYtN(+zw+@SP{oY4K6qH7g
z4gnFQ8$n=D1f)Sg8bm+@B&0h82PCDtk#3}8l<uKJVx+rfsG;V(7<+%dd+*Qweb2eh
zb<Y1Y@B2LKS#hs>t!KTkHKhG}RZTyvF`P4k(wSEMu7Qf@Z#~});CZIDnmk+tM$II-
zuCfpmErY^<6-~#fr6Oo&Gw)tRwL<PMhOz2q(bUyD-vhcg$dyZOr6_W0mOA1R4uBz}
z(^mM?-@d|cH|fs__)Pi89Hcb(-nsEuiOb<VOo4)R^I@d9$qCG--a1Q)qVZN*iTVSv
z?*{0Qih{4aPrb3ufqsO5xY8tu>YUfu+4pP`x-})VZYK-K)<V`^wTBx1jNWGkanjsf
zFP1ZD9^?0Hx6)vFyOCI&c+=hQTsOQT*F`1JWE+*@zf+UoO#@<(!d+pMh^A?rler3s
zj!2(90<k91?NXZU%X**C)oHPw8TIY7e9^nAwkjQW#RKN1R%*qg)=w6}zn^)<qoo$A
zHK%0mX)`IcQ=CSP<|_FN*UkF-^Sj~?zrN}k0A7W)U0mCP#d4!L*C@j{*XP9LeWZY^
zX&QH3W;1$(6k477duygqu<tQSHb3M_NJ0mm41eK4YhId?3`Lt2FwO!U`Ro}k)FY4C
zlpHI;@C2(QZx>&Nf7aS!ocFyC7BD5|o%KC+5V!~jm~}&O+moOCRxw02f9GA?f##jO
zixe-pd_!c9b<Q65;!R~Rp#d<sf~I-Oy||6losGN>=EbN!*WlG>K=$TxxW;T@QZj9g
zAW>R9)9B9p4lrxT?RJXtoG~zuVyt+#&A5}I%jvDl`(X@2_%&ub>ia9Z&0j?Efy<`u
zLgXjZIvCsXMRS7iWZrJy+q*?rPPol}6Q#{LEN6Dl8pm^P>nEtKya5A6y*#^%2(=cl
zgG~4p4}rYw>j*tuJI2v)mIAvtmiM2cU?#(#YL2SSFLtM74#ZSP&ybBZMxy-%n80Dx
zff*yxyVo#Q#&Ia`j@BBHjQcU39bt87eZdOFI}_QVz<N%ac0;kSSS@#sH!jjF?W##!
z-&wPbTJxOosfgcI`@yV}CY$pt;n6Q=npuX-x6<71j$-tlJqBhMxgUHSy-s^CPSh(_
zjq>*Q>#J{WUwn298PH89!Sk`W>T7?RwQT(szu-L)2I+G#0-XhO-TY>Si1n8eX9Qky
z&wWjSr)gZ@PO%StOju$0>`bpA9>+nf@rubFt_iu|HPr$<6W}@;J*8VVJKGcYTVs&O
zy$tOK=m_4tAomByEm<d)6u@0F61Q)n%_r$ky-$SFvJm*@R`w4-GGIUEh8mL4>&X|%
z>A1*>(xK_Gf?B}M2V=z@ncN<b=`SIF9Y7Y*aboyf%r9=n0+K+<T9bF@ty_^}$n9uD
zChQ(A5EmL&Sq(i;9y6?{md-Z5`kG=%EuBbNr?*1{T>;KTpYkihq4<LVbwE!jq%Ek#
z6!K$kQA9CBq_>mG6{E3YS@oWNqtDY$bCLl!>+4{XVkuLAkIYfSTt@0Vn+A4FO<Dmt
zg}d*w%&(h}q8pYPUi}<kPo~9ALk~bj+M31$HPWhJxqQ}|zVkwDfYJfxJ>6gwu8Zm!
zpqKGLdyEzs4HJHCUqdEkeVr%`n9Wk~-5{@4f~`cLt|idcZe@nhBe-Tav3F*<1wLiz
z=MVc@?f<&vmRYj?lZN!dE28Xq;LHlLjKV8=5J%i1-oP}-!#qbmtD@SyV$B?OwoFCJ
zk!FhCTW(U{@7;Je;J`Iv5>0ely`F318j19i_!;a1rY~1btqR7buRJCO72Xn{GQAoV
z=mAX05ht4xZ6BR}6L41ft<oou)TuDDPZ_wu`ASDi93OXDprn?(82hZ$JwmD~H%Lmb
zwMj<(nx+vg*1!%XiZ`0>2^n6xB27g9%xszXDlO0(u{(^VQT!g$<?4RQH~x2QNq7=W
z-BJv1E*xLaWd>6aWJ}*0Rg4~`z9)_p@O*ZfYK_nR9HZ#@SBBD->(2!u#HpL>DUTKW
z+QQcO?i9L}_JUs|3Qy;i6cXt__nwVcYcZU|ReA1RtX>a>DdZ=c3`Ol9ycWPD-o-nf
zS-u>QR`<anGQGPCY}x$|iyZr^U2rp<jk7#^uZa((oj~GOpj$*XfOBM;u4U;9c(pVp
z82?zNFIlRbJaE~YJ~*#aVSkYM8wm%{WKSF~P}1#fGJIC0l;4u#Xt}29r>qIye~!i0
z%I7FX6>}T@Dl|Q<*}M7L>ZuQ9vxV+6YEiDYhzjG^GywBksc2Bc1+_8ZA*P}0D6P~h
zx#id6dsA6Sim3a`HD=fu6F(2Gsrxg%)rtw_fkL9!Eo;4lhj&7Djb2m7bv}s$zr0PQ
zhVT_|*L&_Q7OvL9y~FoJ3r!Z2*O~(DfId@hF8}cP)u=UsV{T!F8}GQxU)*Z!gHi3A
z;owY_dIuOXRcPNyD}0t&-biQm8AE$X2K|Lqx;iiZJ@LN4mk%12@rt;)<>sv;rO+tK
z<8K33tSDZ`om+Pir3qfQnmsAC`|f{5?_N!|k2MCTc$w|)QL0P>v!94zx6k{VU8RPz
z%{+!SADP4@xVBolgB1LM|2=OK%hmJW1yXF}3m<)6zpGJs_ftnMa*QbPZj+`@QYqtS
zqb4ef9X-Ef*PZ1$S~}cn?q5K34K$9G)J6lh`ZRAI)!A{fileDg<oc5%ckR^H;&eQw
zWWC{d$D-Rm&NYNI()_yL?MRm>y4+H)aJBRJ9+`!Ar4?A=Sl<W~Uz^6^?v76bkMo&~
zT;CN6p>4(k!erliN4-CA*Yz<Me$lnp$MW~!z9ML@Z@yPO>g)SrZh&Z^_qjgNDrQ&t
zm@DV-YnHGRvENnVw_Q7r-EGgKwaL)UYF}=7k#KWY<Mjl#34#o{24%eyqVahcF#s=e
z`ij741e>ncwzENr?Y+Eo*)b;OT<$~t{M{Jir#SIfK3y2DrsOe`#41{{0&77_e%|ax
z*LP$FBZJ&dQmqFIj1PSgad5NN2~d0PJ!Hr<Etur?5Xf-Nh7(r(hPq7Bv)gjeaUP>}
zN3r7jiFsDL;y)trvqHyiL){&f4G0W%Q367Q(bD6BggzZ(x{n7nw7IoDb{19xPbhEa
zf8e0Vdn;49SI2w9A_JK5*G<B#i<ng_^#(^or!`O>S%a0mUv^D!-oa*k!%cLgT};BH
z<jwIBhrRseM5q*25QCYi4z#r9tN20?rRT$Y>e%=Q;l2#HOpM29<`QwH<Hasa6)b~y
z=A_&!^nGm7ue;pA;gI*{X;nlDY^SRAgejzcHGh>{CW2OH@f|#1SciqE*5bF?9zpXK
zo`>>RD@Cq5`Alg&nm{)<1gM|kSNg@Zdy2tBd=m9n`96>7$EkUtLl}vz*$A!$Yipj|
zA&Hlgpx{2UI!O4SuKe8F^L`$fA>?rsP~nwj*JWmD38!oonYd%KP-_pC=;5ii{w{5*
zO>oU{<dM+}dcLsU)0a)@U%{4lZu8pRBb2(=P7naLSt?+XPyst$TP^GwP^NB`wHU*0
zO*Y{AuJVfT7S76trJKqGiqgBS^nArM<t(qrU+vG`2^p->t^xYz&-jNh1i@HkeBm^`
zN3FPoCkI(?2ZtE6<PARpVS$`bwTgurP>^m))+P^KhO1mu9uu4&{J|k3O!piS*H+<O
zA9s!JrjrBrS#NyXVcdI2P9AdOtz(~PSip(RCr>T3z@?#W!gjzO$UmGEE@Nn2!(eaB
zqm?)Ar}g9IP)20Z@K;xwTLHO5W@vQey2u@!@8}?f{1#<R&c_4HWusy0&T6ast>oG#
zck{L(r1KxM-j?AsZevHZV{FuAQN(uhw$Ac9k!d5GF~OM%&TtO-Tu#;Sq18!I(Xtb<
zcr}ge;Z+K<?kX-%j?owyhbdAEzOuBJeXZs5J~uo#$Zu`jST>Dp_aG|^{8kZzJKgg6
zd%U;vV&W%s<EeN=r0p=FUm0#|3g?29FdNR-5*>WK>#&0N@eDT7Bc;p(V9TOHL!_}{
z2ByPRXmzQVLOvp!oJU~b(UMTrZihEPYcev<1s3ju$2Hfy^-<eLg={_5;GV=zyOZ^E
zEqriAi=tq%*gC;A-5X6VvnCC|B(kX1!!6^xty~5?QQN1O+pxS^J*pZN)J!}d4RtV?
z>0Wv@g(_+3$L{7AUDrp9Y6fKnb>BWBxOZb<)sF;9f5)cVk9XiH5^2PM#&(rR_Bw??
zaS?$wNa|{@WMcP2tjDBZ-22O6V$IRk_K&Y6Z#1}km)xDVs;Obi3xp&#c5r=pSlNv&
zJbW=zqL{z)<@3+!V%H3Vec6}dPQGO1>f5KRiN8QwrtYQxHIOX$Ef+N!_M+mf5pk*I
zGY^4>w|?ca=vT9QOXpKZk27(X-}6|DCM~f3zn$fB-8oi$CaS7Q>@U}?Odjm0#d@m$
zBPk8$8QR~W4V*8PA&0-uzbw>6tRP8$3`8?*79X(*digL52l-eXs3{0`A<nRWER4(W
zj(Lq_8TB);dpe6aadqON%f<B{5{Tm>;GfQzz`HNeq<kA77Zu<lsH;C-Bu^GVVWslM
zKZ?#E7mJ{6>z+%B7ut^ji1%=w;8rtNNpyeYR0CRs9V11c5bqy@+lVkSz&gZJbS9mo
z`9ODZy-doAf%xUx{BEMFw6F(medJlhRtE}!Gk2i~&O5|L86SYbAI}tc-(G#mZXkuz
zy>c_Md*$o*RwAXd*eAIRe_ym;_EXSB8>sbW@L~`&7mrX>VpFt>G*yL~-W5FI7Qrc&
zi9xQ?BKi~<_869J?Y=eE*%(0Ev=?oERv|c!+|xPAfuJUXRB9g4|5~HYo)gG}TW$q*
zc5D%Dz<Rm;$Er`FKmB}|@RNO<vZK2O?oti`tK=_tWSWkHDU6@s9^H9cCb?|7%5eU@
z(v2P>QryrS@!XrD=0ceD6X<-5Nags^3wGQ~wl+96*-Kt{DJn82h>g!cxQ#e6hl#r0
zfH<4{Ql9DeGrFJsbq9aG6qhFh?52FYj9vYSR~}?pw@^iyjk(#Vx0ot@ZFLTF>lx%?
zs>DFaz+*YvevOe-mQ?q}_C77dgCUT2?6-c<57ET|4Y5N<`i-ph?L#5}b)2mPayKEa
z3b;JC0b&)C`-dt2Doy4m_S>Ilq}CRW3}QYcLeAz#nS*mk)7q7qPSu1*)l{=Ibe(k<
z1^;{{O#><v7z<W3_`M2Bqho?xY9R@~)Y1g{e73gPNcTPRQMwG%fJP?dJa4|l6DWb3
z61>fH<PLD)m_C{;Kc<C!eaQI~wV||=UD3~D<hmqT)CP>bc|Y~0%&HFAZXva8l>AFG
z+2_{_*dkM$dz)~raJo48tWucY5dLq4OW=HZV4BU_&!u-l>*dT;FScGPI@Ct0cjp`e
z@eswL1crkw(a5ovDKCosxjl}C?+t5`D^!!;U%4#*-`UzmVeMnri4~_}1Fwa?)ADzF
z-NS;f#!)BdI+LY!drGNI5w$?pD^1m4QMWjceYwZ1#2+?B68>H~HpxlNcPd#hCHlu@
z*$MjJx@tm`>bjt_=2*WS?YZA&P_dDzB*a-Su)SY<K0M4>chBQU8WJ|Yyw|sLE-bn^
zp=yVm4MP#Ur{%je7eLv9MVbz`42+S;?hX*N7}{`L2X>z|Jn|Td?62j+URe-WP&fib
zZ=cno4^baYb2gszOVgtsG5$I`W_l#@_&BT_#aWMR0=t9V7dSnRBboJiXAgG~YaDLW
zaif@mvq#2@pxjG4`k~T=kwvkZ$bRdiX@?GlaxQ$Bhl=m1eeSV;O3SwQmrX7B&
zY04e~SddPu`g2_t(S<Buis|B}#bf&>mz99#KlKUWJifqbYIgb+dXc18QZ;t6CR<RX
zP8xDb<7zLwbG9(_o<xt{V@oGha!*uY)lc_Dy5!L0O436Q6q8qQGd_ik#SiZxMU(2a
zgE?7|>WP?|JgBPQjOM5sDL-%*`@U;5EE%gQ4BQi*y|5w%g=x&}*$5x-4QF+vI;gmQ
zd#{nh+skL?dT1gjHHQ(NRr$j$Wd`^)9;C07qwvMnTkW@9R!UY7kEByRa?}syuh|bn
zKhC126IG@~^gVmd*N9aWquW47fr&%NcJ`>HYJ9<fyG0(qMYZKl5@%hEfz2dx^tolI
z>s}(?lwG+VHuKvFp4?uMp)#poewZZux^(?cgO)EP$+F|}#Gl#$K$#VYv%5rGwbyRF
zy!XpB_@2&QnFdHcC*HAcKZQv@<m``iGlT)v){W%DH#ZP-P!zkKONhfzrQkGMU}}d#
z^(^=Kw2$;iNtiG)$-Yf}Jh&N`{<ruDZ&G3nsE_FiVpSj=>y4>Ht=$igI-z!ZpJL_k
zMGvD+4jPppF^Ugo_9+IBB{kr+C4~IbM)4>_?9p)TauUDjmLEC;1mp>~t%5DO<j~R}
z$$@oCThAsI*M75IJ45uxUz`hm1Lxh$sF8HvY?7;(J2bfXUb~sl5bseHUt#D>`=>*T
z@8G#g?1{|RDVOR1uM<WW73%R>X{sKHtp<g|(jq<|{_sDGWx>Eyv)B(zDK%DG`p#vt
z)GpIzDi3O)U!}7x3hPx)26Ma$U_^!sE&sVqAlu<N0Ndb0j`bZI$1oCK4Kuqxc+mHG
zI9qV~^vU_j1_&i5u<i_vT*xf-l^PmY8WsoKS_jXhbOfxItKrDEe(STDK<_C81x(Jw
z3=+sNFd8+%-VqwplJf;>DJ0=-FCJP$RD?S=d2>!ZtP<IjswRw!3eo5O!_go#fqZlK
zOh>DRRP0#A(c>$BwP{EkJ}JC=il+C;1|{rVeNiy?F0P)KL*Od83()S|;e{AxOZtEf
z-%T%RBzo^_?SZO}Z?m6n4$2PVsFiPzh3r)@Pv42)vqz5H&oCtpF9rHjjWy4Q5}P8<
zp}P9(S7|YLPQEs%)L(R~mY6^Xl3>M)b_PFo=5@HmG+eFU0L(MISoepTNP~ci)p;5y
zkjV4!+%&_+_sK5}t(p%~(odarKge6-4bxShxX(;~2?O{f)wGt&^w3VG>tR7>fi*+x
z7CD$yZ*>>NI{(-LFzIA=4>-Nl6_uQ%#6J9{)suQ2I^RlrNK0twr5Z}aDkamqN)g`F
zdG4)mKa(6@zv7}XoAmyZ{)4&Apkc;XExJSr*FHUikY>CeHt!ppJ4e#y`1F6ksLKF#
zDcte|M|f9GuD^xkHT)L)DhiFlKb5BN3jfB~_$<nBku)NYEFi*0a$m#X0=Qlud;7B!
z@(t9AQC#b~QcbyCMM+UWANApc;LHZXptE~GO;ss5`ZlfAz*CAkcZ%2l0HQ8~aePPl
zRr|(}>Zju{`vL&HyqReoNOq#Nj~M&qtYdDc)o<G=*R>AyNN=a?ob4WIVJ^&(&KA{k
zkG?Z4-R<=UJUbpos<7mKF#WBn=qsY|tcqbR(^Os^=jC$|@)R%kGr{$7d%jiosF?&y
zwn)Gn_&fgr>|KHro=bmFK<xyj;OgX>8hf}EZdonSVzdWz;<ssEZ9Gl8{P$%rU*rb(
zwc6SC1NWwjl#U_MLl9r`oAR!W`Rf9}rGgV-%IrkT3e~Dc5)zf?tzkOOWhWU~kM5am
z{fPg}^oGF5X%@#G)7HDx@^ECc#ZGRA8Ne46Fa`C^qSjK&T_9|l-w@s{nKr&phra!U
z3V#K(e%S#mJtF0#cTTUBmq%}~J58|H3-m+2H^)yga5lmSti#)4WjSm2QdT{s3N4tD
z-Lw|PR)1qV7vs+8Gxv*`9n<ksDqA%PG2U3ag`z5@9rw*oVCR4lB53X1I$Gmg>YfJ$
zS>zFV4n+s+SDgkf0pU8qAaCGP&u;@KZ*x*%AE7qMAp%Mz^{0d22lh`Vu*YE7FDLb@
zZ}}^(?(_D>Sw1yzWa-zugg)jU0U!8R@cq9)pML}2{|gZJH}HLFb%B3DU~2vWDRe)<
z;3KJj$R!jA)nk5w;LAO?{sqbXKiZC_{EvH#YD7<n{oA^jQiFL$Zbi!T?B&K(YEP1M
zGyER_V>#7_za~7Fi*#u~el@iX>sjpV+j5*?owAuet&}(<z&XOV^snD?vpHERunL)m
z)-9gs9(msRg=9Jxe4&u!OW4HVbm(w5b&_f1w6cYv$==oVRKp-(x@R|yL8)_Q{ljpz
zqVUR4XCv*?U#Ac66wp~*nF1Z_ElOXIZmns@Fr_-VR0$S3akeb<*W3}^pHQWDI%K*;
zUkpEA_+N!(LZc*#>bCcJPx4`Uk~Ye%0l;QYJhyDBgqF=86qixk&hjx8`{k61Q)mh(
z9L=5jWaa%p91LL^FyzyinA!8uFi_WQo~mK_W?(^<sh+FEqCR-@eh*<vH3(jb;Uu5}
zvq4y9?4FAcjhJ4Zc(}AMg&#GlPl*B2l@nLKfS3}TH1{_QYP$^lZq+vE?wxsNavb$9
ziJNFUcitlT2L7wF*uokxg(3Alc<q!5vzz94f$p^E8k5iqUV8DBB-1AQ$>`^n5A*;E
ztLfkzW_(nQdmw6n>gU9UomN_V#Izl3&)hLv|KSokfq8AjLCj|G5a<O>o6DRpt>j80
zFiTzS?)+Qx&@hcB*H(2Ay{Y|zR_WjcYQ#0>i)mCNEsCxVzcnMsYt~HO+H+@I9intW
zfehi3QT>JSfV&1%%F6pQ4QGQ&6%BYZ^4@T2qLO67Ti2M8A}#X3q_tELG+E@_g-eX?
z>!mw8jYY*~IXHN+m#FT)TEu01<!qYUwvU-y?o{lJf4i(zPNYsli&9zc;JVnf>rdur
zn24B^xu5j;<yADa?zbToy~1w72)2q(cD?*cC%ZvACZ4S;<QGwl48SKSy5uW*6KL$#
zV>f`Y{gU!FJlHv(cNIlo;1#og8W2;%ZvU{Nphw+zdf=W`0n)neAXJMg^yA%&nJ~z$
z#*U{EM`pd*p@gCgVZaqFmk&?xu(Lro?eF$`q?$HeFaO+}9O`qE`r0Z+qGfV4I@}#h
zZkkVvX@>mrHuVBYfM&jA6NZjgQ16E3x;w~Zg6w^0b8<_fZn*<-D7wGOmP+`*u3yx;
z(VaeqnIVb^o@y)adVEAC19$rqYTT68-W6>&%TDHKoFBvF0STL5B8-@R`zxf{_8<1w
zs+%H_bmBtPL2w^hrWY!L6THVU{Xo*OI!S;Q=&WZL%})UbS|Gtl7upmh&mFI0?t{>Q
zmvZch(S)ptiwD$UO`2hH@n!DWY)KxIA%kwgcP}yMe-GzRTR=-{syPLPwN~h{+E$<H
zuC!r2k}$gZ4kMX&LFhDb<F<w#ujtmhf^rmLimr}k>kU)sg;34Kr=V+hJvFoX8|Sk(
zb3rKmkKn~+%#{Anv#dS7M}$}dnTkS!L{BtrYnJ6Ln(1TYk2mX}k0u`)@ju$+G6*l7
zHu^L=man7LUk_iFPkPaY*x`A$=%M=)kv*~D?%hI=bAJ&fNaKUwsOIIU69{TKc>ZuZ
zg?g~-V0mgQBAKBdwaEBxn8NwmYD7<=GE{DvvbgFHV-l4mirg$RlY(FK^4q=}shDt?
zXvMoRi$ftNRczKN?&pb<2k1~p^ao;A(;Ldo5l&jjLXoNE1{#lUm-su}DpK)}C!ATg
zpGNQ<{0!ZLfcCE=63&3}j4No5toK&4*eja5CsC)=j`ogG#mMkb1ChPLO)uca(l9*2
z`u^sZA;Zri@F(6hZ_mi%K@R=b$EQ<j{efmvvJw6xv2$(5^WG^~-?j2+8GuD)ElC-r
z*0pXNrkV8Env5c(>`Tk+X)J7+!o7qvmvqfP#-?RLG}%ufS8jax^v*5rdK8`2#Y<7+
zr^y(W{xe5i(*iKZmEIE>hB^OQ!nD}g{pb_^=LMtVlU%N0A_-TPyns(sQ28k_i^uAF
zF0J+=E5~4Wbv9oBg%AB~?#dcXWdb41^<d4LWmiv}$nV3&exfB2dU2-2H*n@unTOkk
zW=|5r+cl*K(J8HK_3GN|C`~+*Xu}-#{VF&WQ=PXKz^9Rdml@!Bf+9LKI*P`H4=q7-
z;XA{y3#lfTTbQypoaUksewi@O7>0RNpa|3Ec6pj_`1GOzDB02s#f%y&OdWmyaNhUC
zyEIbABNc_UL_^3@!14;UD_wh9+S;Ds5Ka8cgsYH&be0xPmS&RQw4VWf=C%l&8cOwf
zdHS)e$1N;n;YywDQXL7Ab^5~Mv#=|b{Tw1I@Wk>^eBd_%-(F6E-ItpVd#yhHJJ>Vv
z|86#5wn%tq!@At$)gx}8Df^vaIg(WOL6C2%7As-ej~u6|(7fGsJvFvN*_%FqAXrlb
zX#e70E>GOH4Dd&#U{dJ=vv^f{@hw)jPpKocj_zF!h2De5WR$Up_uza@b(|YrE3AOV
z<G$sIzABe-Q@Jq6;K5w4{UBpZ|NI>)pd@JE*J{JD^cBPzmVTw}nV8j_WW^cob>^6L
z<?oNr2&vALCUHo)sa=L9*G>|xqR+#whu}p(SAvS`d0k_}>DLzEd8$vKoko*%&Q0Xm
z!*{NB84Yz`W2Po}!tBy5*sNUBh6wt&?Y4Zd*>s`hf^(hO&b~`k_$J_O-G>HmF3dp#
za8L}ul-)$yVapgzSL7yY>A608qVy)BHu2&8kAK%%6r>SxB{qwB+{W^54&w?VQ&&Vx
zsXrs9&<fjAjIOjVvQmUy5sz$jd4jSS{TW$r-DSJOoOZY5X#If2VWlV~VHkm$v3u3O
zxtYX!4uK*>SLB_>)f7;pdIlQmPSwvAnXq4sJV|D&Pzyu*#N;y@xvxA1xQJ)uvN>~M
z5+!k>iYJs-O_s-OJMVUi&?Na1_#DZmUyGaB580s&1{M?(2Ai{^ccbFiT-S%p$S*<~
z8;b~*2;Z8ML`%o4bEh4NAYHkr)||K8Cd#5|EPC4|I4|hVpa#oOo|$*h$Zl_Xd3K3C
zvZ&YMYiKLs`fK($kao>6p0v1F21Suf7Ismmv%~#znfr%%IBJo(BIUp;m!#M)lsbZ3
z63M(SRvv_P;B<ga<FG}=lvvOMbFFdO>>8xdWp~Vz;DnXqKGJG0KcIz395Gvp*OE;x
zVPdL(GRc>!2qP<CJgme~3s=Fd?ivZml_-C<lr~iLKcvyPTK7;wg0K%WL=dr)(X|M^
zz8o@G@U@gi)O7Dvgf_<KS)sHw6PzQ};@9Hj=2;q!n?{`Pp}puWuLo-mjKyos-j{=p
zo2R08=A}M0z~!-2Qr5ZnL%JE{4#)-`(s*#6r8da~&q@T}NsWgDttIYW#HPmv&+WV>
zupl3BUGdJ2C8X@W1xYnkdPbY;=AIp!AWUxkG2cYorJY$~yDU^$Ae&+Obi`U+>s~ao
zn?qcJ)Kd5-R~+UJYQ!T><|FfGsI}41dv=L>hj^sadw5Q+37L;JyS&GEG5H#UCzNN8
z&)sPc_+mm31cG$!l*#7g$|;nI#SA03O{`<#&jZSX9^9rFwIP*}rmjn*<+o)-9nf|Z
zIA{smqo~i#m}t^sGT8<7rVN~rUFI%1LBe-g)$v#?MNd}yPS2z^LX>;lZXvs_lJToB
z^{~CYYGBr?Z7zf#mh&j1tXY$Xk7S20hi<Oeep$Pmefd2YO~RWCx0l_By8Z6Cx9#$q
zHX(hoD@{uMOu@}m-R-{M==O*dAu&|}tL!fX!xIBRMAq_#4`O$ej=Trr+7mgsddQwF
zm+nn*&BL{#BDGs3G4f^9XtbI;P5ZhQuljlr_mPV}<?d%KEUyD?kJvp%51U(D#_MHX
z4tZeoY(B~0Wl9ref*I+)AHpkGqM`O_AkxE@;(E*Z^qcYnMYrcMhR?YeTFE8i1?Yp=
zD}y21DIKh-_XQCYNRXcNv1Q!4fv#gFOGg&*+i)NATY%oz1iUclSm0z24SCS$w;<Ra
z?=mti-@eP=@i2^Wfj&Y7KSkh6pUhr{-@^Mj<Z&`vm}pJ_LVR`vLrQ+p=0TaEEPi0m
zQ)r(}YrL7@S{`YV+Yo*;Q>ZRk8+3$lXm=`?6$jR1D=$k(N}fxfNwJ-=8iX&a@lX}#
z0J|3y3)(T2&Idk-QrnK@IZiEAu=M(H61<c!tVrVU^m#s&+P9SbwTBC5M=`pcN}{7t
z=6Xu>AgwEz_tuTg%pn0mKg>*ii}dGS{PWJ~Ie3&MrTC{M2Zc<jwqp$(%U)2#YQ90M
z6q9=;Uir&%#)d{A!d+0|9(m97>&x8e!Y;~8a;m42y2fb<PnHb-@?aL8{5g)mRjr3}
zakR(yUS8I5FrJGGxlw^<|M^4~7%By_K2LN$LFs9QVGmDJjLAL>s_7Kx|Ipns;(59I
zdnzt*-#F*#(bbLBTJ)|dIG=s9Ia;nSnIRYB#ka378dIH;F%*y0b5#K?*LXw&_a@AF
z(<)WWEY|9+4RE~rycGpTx`zJptkJtH_&=8(7;Ih>ly-hN`>mwxn{707T7=KI^95?s
z4L{FvxY>JXc(QUiP*%bpj~iW^GP;T;9xeGzZsez*GE>Nab5$Dr(#4+(zfC@B<5YLC
zf-pS`=9OQG)p?}?>F7_y7(DH2dT`*^D$YZdDiCX$?EX~KN@7=gQ&20<R@rPLoP3}Q
zEUcCAW2b+v?oYu2Ud$<H<Ahc5W@9wd2fBTJX||n}R3}B$P6?AA-N|dk!D!NCvY`aD
z(G0vU*=S!`ov_{?Tj!AzS%~b&%xXi2EKpkA%W=lWw^o~Hmt0$FAWc{a?%q1h{G}1)
ztzuC??orfgU}a&h|J%am2Ve2@iwJ)hpX*-e9%;Dz?T@ttAyahT41J0LoexB(+fLi{
z2$}0ekt?$w#uusy<$gZs#91|#0Afxn<x6J<0{qy}Z_WkO=U*QAYF`vSxu1eBTkG<`
z(^bftb-vQa_Vj|;GxewU_d}bj4P1MG`0_U8pcDOg=`efw$xjsvXQXNFuX3ZcH)iZ_
zFi2i;v3<)_Ey&OG%gH-2ZTP}wh_;;}3n!_04gnq9A^Khpl_K3ZxxdZA)fmIRJY6wH
ztJxd=#i=$ag$U{GN^>>UQ-KlRc4cmHFXVn77J4^&yGwxm$(E!R#(o=q6PC+Lqd+8P
zd_@XRw}S{-vcp;QhBiB(kiap8u^Eczq#8<9CSiW=(Rc+XK<xYWx<nG52Kh(z>}S9E
zrNGAbDF#euPYTVEH%;{k^lQHO!WMY+yY3rE%O<%HG@j|JR!xlmG{NM;#A4x@5|Ft%
z3gYFVSR^s7%X4r4qbH(k!JN8Wkj_lS-}qDaqfbTBN@C8nuouarBk+_bp5<4h6h^vu
z<XkYKXozRi`%n#euOf?_uo^S=96Uf`=enR4`IfV**qkK63hr$*C2CP6hl`DSIOuSY
zR77=Y=NSP;l+z;w2}!Rt6;BjKlt^b}`HSaBtJ#qj;n}8^6|Gl^3;`Mz?Sc&HwCBeo
zA7gA0CsMOX^gZEBvlMq@`B{F;V~JeZH4%vOUiw9~Utdl6r95-(<yvi+{V3Se`T&8Y
zCXZu1GH3VgJHj!)a%__840_tQu%YQzAg%9Wz`ntG>4^ZV>U@@hwxnKYKSsdmD(Rwd
zzk{=buVj9aU0wHV?>WPKw}oSEzyiC8F9v>b;Psu5l@RGOXo1$k#<L*ie+~J5XiK+4
z_T8WC(Y(?(B84G?(LTin>&uMsf}&=lD0jSg3RkH2;>ak3TqU?U_bLP8$n<?Wsh)@-
zy4$8`JRe1)OQLr_Y;nm!l;xb!vnf$3>z{Vj=Ws%ne0Jk4@9XeYk}2ajY_@&nVsC7g
zo%0H2hV?w`NA|i4iS>+o$m%(CNTjDLKoE7+zxR0ygP1oZlsDUVob#o~BpA|~JT6pj
z9U%B>zu5prI5vzPNVqib8{}KNk_;kK8{3Nq0>tKq#STqLw@Mh-!-iY-f%g+(u&rCa
z<5C71Bbd9t)h9?M)TnB%hDO*v(1wc0biTWdtUcm@Oo`)8&5b1loVDor!HI(YAO#4K
zFw<kzWDuRn{L-LD{L+0T%7+B}iuJ6Le`pq&a+QKbyCJVXW%#`Yl*t4bxYUdumYemJ
z>cu*jkIKHgW!ZVLp{UfW4$&ig6heRHJ*m#KrNmFK#gVD{imrk?9w$!f82kPpVU7lT
z;p9+#`B~FG6xo1p7`duehg+tRcJ}SUy5-z_+luqzQ<{wK$JR_cY3)f+;VCN56eQ`5
zQz@(Q-0oB7XMlR#!iWOAv%7U&Uw^b2PAhIE^{}$wgwxu&WbmYaVMk~1H=E&I&^;Oi
z#fH4K-(J?Nf@t4o&##gssdrE7^;mWmWIg~OZO}Wr?K)j!R9M5A{sg$jkjsc-9b+nM
zp<xBh-@gue66xvrcImiz;H7?iC<B5cqx*D%Ig`RFGLP_=5n6ugR3iD3`$1Ui)qU$5
zV(d7pb4*G4ND<ebt#g34q3+eu9?W_vwMuBReWzv3AGOGLkgO$3sr>oXeRi1JvgEj$
z{Gwz1mMV9h3YnTK{5W;X2zp-_l`zFRjKPeJ>anG@-<MH`k-EuF`s0A$)gYF`7nu!o
z$Dg^<$i5tt6E(L5OdFJS5c599zmJfto^&b6{~QvSV(HpVelG^n5>;X!tTIdBuAQZ2
zy1yI^rQ#%0CSM8Pup%Dg)rboE`mh5sk1w*`n*Ui+LkLgmzaISog>qF;D)$Gy4Qt)X
zzKITfxD#kkdx_VsOA&<XexXd`aexkNECM4c-9w(Jk$=#eXWV{iv_5}`+I9NAq2@=F
z1-bHa#t?RbqT7Z9RAXkB__mukcNH&gA|sB<SiO?Phn<f<VbwZ++P+l>^z4GMsakhb
zV6$MK80#hBGJ5PGv<Tv&=wZr`foXp_b9A7pz+ai_QC!Q^bzv3DT>mR`C_&S6+9!#I
z$_K@}j7=6;DbG*T&owczqzYLbg3db9HKL4UJxx{Gnqqt|0g6C81>rBm^gQs)7%ey<
zH_{}=)kqw;qUGLcSJJ@fo^9$!sjet`1n(B`r#c-jm8TeWT?T-AGQ3AlVlxwskrSS+
z5d|fK2`wiuiAY32D{+_w`{lKG1kaWbeG4f!8?8k7b7$<tbCfqIZbN>-*7XoH>Z3Cq
zL$5ZEp!;(e75l{g!~OO2`;MGt`*3xbiJkZq(aJ2Dvg?Bnbb{fAk5k<jh#<XztQuBS
zPc&u^Urf9jn_MZJ-X)Ehq8gB)UhxbuuTjXBdO{r$&4L+S75y9T#ea(QbPT5*tW42P
zA%R{uso&=TH!d(lk~Kw2_{80|dNv2n-nVsS$xIKQcX_{Ob@G`jyUd@+&^<m<W`{-8
z8V9@m;c{&d<uFBc+Z$UV$+KQ}0ScAw5V`^<*DLE#r>{i*&MUSl&Wc&nsMuM1>zciQ
z^R|muSWmur+eDIYn4C|%7%pmY*Nc~CD6juspzLD2mi<=AAG>&Ew#$o;LXhB7pNV`&
zh^(#+S4ZUWCy4KY#pG>`Jxr>-$b5VG`U%$@QFR(gLcqEGx8wSEB=%3P?BB8AKS}Yw
zgXZ@pU%b&^wZ{H&@{L1ZeA4j_Iv0yAmd=7tRQ??suXGxlKN+F>0seIE!y~VAnP>bT
zu&*T#KmGd0AK<Ep_*dTb@A&(l)!P|*6!UPKCgpmZem{ea^D*Df`xr9{!m!ObhXh*}
zm0x&1oH3<KM#PJEE;)aR({opS89uvO++m>J6Zl7{-{ZM4--S)GSVw($Y`6@UGI_gA
z@-HW45`*{fVx-i1sp9Olu_g_SXjW(Mn>=#!3w@9W%lrr%0H%^%=0@puM#uFb=0eat
zW8_wtyfUyf=-IS+j_3$HBk=bR_lB-j>AB~bw6saOQkSS-rt#_tQfg2;-=Mqy2LO<4
z8CrYx{j0=TiB(N~xd_xd)NNY<HBmq`dbDT;RNi>e6K<WV{mL%A;RBxf6kN-j2I?^@
z<jVSYcRPD_@^-q2I0?DXzp?*jEC5)cxom>nAk5py(c{fCwD%p`y5RCxR+qo}mAm}Y
z`Sk$!QG<qiXvbq9e6CMw&_))$dqZ$V=SBK12EBe+?U*nn*n!hyx6;5BxpO+GV?|q)
zIc}0?qJd-GJLizppk-3GW_n}iIL3ukWmEM>vSfS{UG~h``t<X#f&n-6koPRWAGDmp
zOjGT)cLP;tJrTQdJ75P3>rxr#Q8V|AkxmCeIKw#QmXg>?#ICGrFFPp}&kj8x>EHO)
zMSaUf1GemsL%we9vOC#b_ZiJ&e-!^rmSGIe+b0r~>U8|hbFM80r=-?o2G4Dqbk^#i
z+06?#)x~1yC`O07p*$vtY&M{+K-gqf@-lz^Z>DS#lYy3eNs$n~6l3lg#)tynkg#2S
zo8#vYC*Rv4P`^nn216`v%&CX0=sOg^>`=7tHN@I$Gsn=if1HFb4pFK)jaDEAF`@>(
zt{1J(W+u*^`f$6}YXZ-*0-^8Or$XuHuwZaH0OgsYx;P&8TniIjBK?}yJ|3x!|GIJ_
zhhFi3et}ZO1;Fdd7cl$8<a3~Iph_MH06h+>u_@=CZsE|u<NW8LGeA>N^Nz!J!nw+7
z-O}l!UP{B{1MlaTnd^T?P=nrZP-7AfZ1CwPU{;PQEuV8KWU<}tY^hp=z<ywm>3zz;
zv1o%)GIZZ>M(ARx+m<(N^kr(dIDaQduC6JIzx(IzqENKX+ehiu5lDA4^ln_XGp|b^
zG4DRFMS9}ni=jS??YVd0^q$6YK6vs2%o^jf_GSlJDF$7WVD0v?6u_CtQVuY|j_=Bs
z7<zJ~W}<l<Zb2pF!-SBVvPoVpZGGvz``#7oqB**15zB=^UV2E+AM5z=gDd(chsq;o
zh8}n*w75D@;n3otq=vIfPO5twJkyJ{tAFJR-GR)LM`~oA3PE%YaiQ&asx1S$R2h=5
z#0%QQqJ7K>hEPRWkeGe3R3lDfc0e#Ed*mzcQ;M9^ebCw(aLBmC+sNBoAp22JeyBTu
z1Hl^L*Fdst9=$L^`Sb|F=%G3!%}Bi+p|1jvGD8@|dNzGqa~W_T%4py)liqQs9cDF7
zGE#cbu{*?CNX#KHbkKwIy0UwZr#ly1|N3Nc3-j#wl=HyrbU4ynFJ{(t%@MUeESSF}
zdHXk&((2iC5vY1*7pA`je0}#?WmC)_pihQE8K<2-h;vCp*C8Yjt#Nbt^c?8Y&@*`u
zC*ioI7p5*|U4yJ@vN#WEp&DdR60Rt3&_}O>g@uN33yhYa<~i@pWmt6NdnCm0rI7`z
zk5j3C;BEKKBR+%TK7CuC<B7_Hz^gD>NpHJtdTDZT=)ML@L|>z~TzS$+VEStYOArHu
z#`Z9q40@md%qo_vfo5H%7Bh-4WCNH%TaMcyAez2Mmg5vvoBlRe%*qtdgr#FL6M#kG
zv#ejL^BhR?xhRapSJ^24e5#mu3@{2MF{J9hf)Zl?2$}-7C^_%|ZSkp|hFcVVH;s3Z
zLNzjyVv%CoPKku4vU#fzIh6NC@TIM`W2Rs=-gC#qh;n&p2Yu)jN7<<$4Q)7xr{%WQ
zgM7`_@!wJcpyELNyZfIAGpqNj{0(kw0A+=TkT$$}e)|=7CLji&kKW1h`jizW=NbuZ
zTtkx-8T3xKZ(=xfH)Q+3DzU8_d`Z=PwJ8ssu%&D6V=<z2yWbkWMqbzwSIlkh+l?y+
zpjgF5AN!q{eKQ&BX!|&e3{2v;gkPzwni%@g+SGX7?X~V)kfn_FOk0M&y<n<pMRrel
z5p74L8fF__jM#BvuW@CI%$rRktn(Uza%~%eFFCA;3!XhfP#dwt`J+>-6;dj(YG5QC
z=?@;OK=LQv5>PS<of?OVj=#C6IHJKyT}#$CoNPN+oU+QT>LdDtBIPCyl8NfM1l(LB
zfs>282w@FP7e87cL%ZWDMds*rb`aJag16@Kvi^K`V1V)j?hiYu;H(wndo^Ye5pYS0
zb%JE;`uQJVMp3qsy^QXOua`@TM|GLHf?S@z4zTO${D@KUHRS#e4%EC?EBdo^k~7%A
z^V4Xva4(kAQ#(XpZ)N~J#|)jt4RTljkO0XilQ#{}n+tI!-Raku7UpnokFrgFloY?w
z)n}^fH!v(}-lE`37h+cHoQ@ftIKY+D%bM&7CU4neN8C>c2ntY52Uw|a`)w`h<2gj%
z66E;1AqZg4KvtVbbISXG+xW#rCFZo?O?0IqhkXlT&i9XLPufVSi~khOFBC)ZWR|h$
zC!yAwr*G`e@IN@ZER$IM5n89#lsQzNRLPPc%sK0uR5xJDzvxMUjDs=#HtV`UZF85D
z-xm7se45JsP~`aghYw($$k1JvO<d5CFD5>m7UBZ=CcysZ>UByugpQ5}Bt+--EJo}8
zP<+0>Fn@nimOlwwZ;z@2InK(`)Z=#1M_$ejg{i<NIL~f&oZTrztr(=>I(0}(B8&i|
z{AF<)gGdmFB@Vj50n#ceVEkWULcMF}3qhvm^1O<)tP;`7Nk$~763)~k+KY?X{3W36
z_{CF;ORGOu#b>c(KVM0_dGYbwaToY5^Ct+Ry<+s_59a9KL2*zTWr;BRe!j<rr){jm
z2&>Wz0ZXUlc*>`tC-G=SB&TveNM{j4v1*vB2k7)o3k8LvQqZr2>faGM>7$L}#&Vc(
z3;(F?=XqPXV-*ICXFZv{KrsEqL8SDP*(t~2eU(5h)w%xWBmQB%VHYeZjBK$78oxPn
zMN)|kbWFVcYF#_9wm*m1%0zx*WENN2reL0yX}pZ3&b4SSFVWo3E-wwr8%Nv`z3Ekb
zszM<uOz4=nSk_A12kg-cJYA&^judbyMB+5e6nSJVr!_bjXjR?#op(Q4C#yW-pHAig
z&(!7xBnB@Ex9T`+q5$4~MEFUG?#Xgit|?8EEbx|JWI^ApmK|G|)Sj$jt;Y`qWxggt
z8s09rgA4#0?$c&-uGL47M6d>5g#AM<M?ke6fe{1&erWCJ)Kz;%Q`@|X$^}pTHH*MP
z+iAN{lJWY;iqo>{G`$#MhlDI)WupFpwpeu@V_U<HZM{IiO<TB5IH@|Qpuy%;%wy!|
zRR2H%b=Vbc2&8#&cnRQSqTcNVuOPzOgfmnB>A|mS)>dhY>&qUyS0#qyWO_B@K3!WN
zpX_@|?e`B`<I>#3&9p0BsFnDKLLD(iP@|zeo}-8p^FO5P%`eIXgZsaRuHZ@XPv28z
z_~mY<WPb!FGPXnTpwPtL^amVro(CTO@m(XOU_9!_)a&7od@&Tq>Yf)*NpYv+k%0%v
zfnTNPrZD`K+F`~aacfzn+H~GlR%`hifPeq>r8QZyFEx}p@c);6rT>!x__ob*31dD?
zc}V;ZhiF(jK`FKJqQlNV<L^Ay|JbbFT~V73r021_9TMT?8KoWj3E-tg-=vO}%W<wd
zO}z2|U3$YqNrGQ6N!xX|ANwqUAqM}{xkq9Autdxw^IP^0sPvcTS~l%Hg^b+7KDq7>
z-GZDRN>=Kx!?8W*F>%Pgm(&`|OOhsed^YeHH36WM!W_)Sq)>Kv?T<i`Lt8h5Pb1QK
z8BAJGm|OM9sfH(J=S<e1v|j<`f1FBC26Waf;OPCS^mgrp5_g;O&)AgrT6_tPx;R^V
z@i7!+%UM}G{6{D!>mS@G(MR^SeSK4bhyBXHDv{~Z6=VIUTuCKCpaW$Ps=*G!u4436
z=40#JX(<+{cgFSQ)SexbujL3_k@S~fTn~Vv>M8JTg>Nw1K5}!tpQitxdZkCqUVHrR
z!q3Q)-~zG|W7qs~r^9--q)Ltz>dhk<wFJ}~g!O&}3Adq~onQn=$kiIM`wnrETs_r^
zf@yH(TEc(S0RHoQx9L}FrAzR@dyV73r=Gb5Dxd;|FP|d(&`eT4SwBuNStf5O*K;D<
zLLvIDMQ0DUS(7@j;73<6HC}8gTnwB$?H3e({%r6j9C^Nz0+F<g{0A{SI^8TYAsd=8
zT2WwamhTtpV7o9d5TmV{w9l+`A5UO?z7JHVjA#9xl&k;+yF55^;O|-@Yqks*lYM1$
ztjfl(l|PW-;KIp-@#u4K38P=u&}s}{pR8r?O<7i^(ArHm-KKU}wfg#b+Hp(E@Zt6s
z#gZH+Uh5l8oi9^=l)Dmhs`cw;y!Mwp)O!F|uL%O~Ddk%tE~NrCBm?{=u3F2?+R4Eu
z&;d})+w+P)4$!V1!je1r9dFEkyJ;fskAm;FDmPvk&K6G#C|x#B0RJbGf{6p(zW5Rd
z3sN@5Xop^cf)`u>cEGE|i~pnSmYCze(=>rk!N;{Pi-;>^z?)^9zUw%y08yS={s2To
zoE9#?GgAV|N94gJ!gkv(kM}zB=@}8;-^x+)L#9pN<{YORD%0oSEdMR6As|D+ZA|N7
zuXyb&KSs1rZWQz5FfN~!X`S|WO9o!S72M?Wcc*mua;_7Mzq`^umi4!Y^8dw4RtcI6
zo@s=>!qNE*E_637DO8BdZg9`loTXf5F;lPQ^39X|){sRT_1RLU;(5t7@2(EXM{GG9
z`^i(=XFNJG1wHSjQeUX#gcal>n@Nfc33CvMkn*ZT{+TSH?D<+ltTJ7mv@CwvZ;e^h
za@wV?4yoQX_M-!`#R7IU+Y8|c4GkI9=VK%{{_ZfTo2~WWQ{^M0jgxm*)N>FL<I#M%
z4ShpPCU&tP!4^+O4>nRPJm$5Hhzs4J<A@8*A&{K#ltXM@30GN`>f+;AK9S9e<fTBF
zPCP~rMp6oocXCN$*%_fqmV$Y;ooi<U2*$pMZ`%;rZ^8$))odFPJE<DQPr1vpoa9b+
zB2w<JB}X)|t{q0alC>PS3bk{Ed^$MYR^Xj%%z{LrR<8Q-c?`a;j`2aqyro-<kk78;
zf32J=A<FaOi+7jyc(+46(O2&jR#HU_t~AmtF{53n*!_}-Ll(}&Bn*3K#D(6_SB^Ew
z;5QmWFg>?J7500=ylG$zbm2!FNjNXW4M=)ooVvK3w9F?LdD4;|UgTsI$R>zYCe60u
z>%DwB+~H#2Gf3k(UTiCgJ0zOc#-GNh1`Yp9r_BJ}zOgv@`nMTICZXaHC)uJ6mbI*i
z(EDZ|M&LJa2^u*-Z(WadC<Z#^7?V{mKt0|Z`a>!lT~0)?IAxy<osaLt7Ipw%{@{$o
zh@vz1B!&4S&sAk0@<mD#&{Z#8C!=aB8CItfyf-RC5qVG?T=;pEk(Y*OvLSa4<=kz#
zZ{lnlw#ZYbi~^YVQ0~@z#Li1tZ<02qYW?;BQxspmtHT6Hr_T8NkRuQMmH6qs12X08
zm|oQ(ACA04oT;k^@Yk^;Xy777c_X1-6IwSd4B~#ywb~I=Gx5BjD4(6_zBM|BbzX<i
zgVQ=I-ut-B9YPMy#z;gm$C^hUPrN)TUBa*czG9*acN+?}alQ6hG&XAChK6Y4i#WMZ
z;2yt^p+k2h*iND792=<H>I2=K5ug=NH<i`|<BBjBx;f9m^Z%rXR|$Z`Kj}4ci>yUZ
z&h>Lcl0GiDfRhac<<j`JPo*`4s>{NBw}Z;gB2wL`z*#-g9hxEwiQ`M-Q${hb-HXKu
z^3op?Ep~IJB04gW!H=@fmU9(uugyxAr&N|_Y0);)vu!rjr$7$mns67%f#<oczZ~^=
zv^S-{#h_4cLOF5V?`UE4&>nicbP%=V3r7)Fxv?D}ZDThL4hc~eUhyp?@vG~+X}mSY
z1`zcjsBSq(%x(r8Q{H4S6vUXk6ah|5c|jrc0yeW0!AG{=A$QUlkG54XVk8cFE7K7F
z@^NF9=gwTj!ojQ9R%8j|c}9b%jY@Nw%y~ME3ToSN27?n|R(t!nKNjbhq1PqJfkRVm
zyCj)92ofqA#IjeY+V;K*1=PtVD^M!;u2^#9EyNiI-|1j&7r^&K9W0@0#bZu410$^1
zQBdoKHb#>+W!P$jN136ED8#qiCY)w+nsFun1_PTd$`&uum|VHvHwl~OZI_E_gIwsK
zbISD0B!N2+0R93p#r?l@C4&PZwYB%IJ%Gr0o~|uHzkCQww^a-#R4j$X@FmLylQ4Dm
zJ=-9s&lva7C;38LcHUaf#lJ}82BNbR=YGbg@1He>&&jy}rgN^#vY3~Rjq;W9p6v$O
z@1|#MWF!dOq>pT5#P+WNUzaYQxN~GI&gt0Ysc^LXVw1~uAQEk+lkt>Pf@ZRZ@f5q_
z%4UToX9Y1Xa|o3kY^af!FAZ`Z1*w+{M#nW+9EwO`M?v!BC=e5AhEI2#0EK5E8-NIT
zh^Rv7$&_OZ3j>%suC<KrJ-E6~)3M9KQTt`#xYb!|H9Ql~M&$-j*(N#Q2{gjD6Uu79
z&BA3C%^X}rbBJ2NKw#<;ye({B2nb?ripBo=THL?Bl`0|>TQ1;UaCo3dvBQjLg($gl
zC}VeW)7p+^?56agf)YlQ?H;JZ#?OxOG}cZvfejcd8?X}EiOunW-v%cUjXnKuQ7uj4
zdu!i^<GbmV-5V=TUDl>gS>7R=KRSH!Q~AK;0WK7pD`u~nV@WXteet%Hgmw`bzF1TE
zL>MH+0m#{eOcTe7!c~l5IouH5AcZ+BU4jlFWuWL|`UpaD&D;qC)ojz9q~NT!MtE-J
zdp*SP)+bu+@*e7;KjM6q!c(j|3l@3(WOxerX2K)~g|h2Q?x{7)_L|o3*&%4B(f5h6
zTKEWPA3zyjG4?t3<GePwuKJE!mbQmkRQRi_6f=Emq*0JjAuFRiQzpsa3+`)4`q1kJ
zo06^Go~w0XruGSPp+Zt@^n>30s}UmON=3dcde6k1+tkAnu7Jri&`h3y7d*dqlXOud
z3l}sy7bs%m9srAjQ?7Yz$=>vRN)wgIgx*bj)ss~;TkpQdW<<lVG5Nra)8I99x7x)F
zcmmw^u##L^FkZnY5AmNhLT|kES-Rr~#@I<8h2}z_1lf^NzH~;lcx^O9ICw2R5!@Uy
zK|(;inqpx8<MTxg4(@V9jq%j&d<~5+`(-Io?{{E=m7%j*V{dJt^muEd6z@!rIVqGW
zCaaS#wgP68*wAG;rTNkXYtGK6)MsC16j<N5u#1f|MT!O>?JHvKt%Y#GzJ}@)2d~9T
z*w;egsBjK|q%3*1LvS{m2i4`WeMG;3nbB{jB7Oe7`AKok?Bg*4kr#DO>z^U*_|o?8
zl_v$I@R4!48lm};aB4CF{JIXek;bR7wuOW5BRpLQ4)UAcZ0N3^fV?%B<g9BA7Wx1N
z2!>tv2pQ<n`My#<XmO{9Cv9J9Smuju=>OH)mBzD~?O~)sTS}|*lUSmdVs164UO{a|
zW5#YeZpGfBTDPipLPX1imNtV_(y2)pTdQqqsiC&CYPf<DA;wmvVyQ|jK@*kC8}qBV
zo!<ND-uK)4>73vHod5Pb&p9Uqo&2=2sjMj_J`{ZX^P;NIZtkXJ2&n*e^*B$jS{^K$
zwsw<=hkK93ZUoLi-Km^TJU`$f$Wi-i11US~MaaJPN(PAj18eT>;MsqXcW)A|Bhz6A
z(OAgKnY!mTL}$B9JM6Q{VI%lg4wnkwtZzImdG4}$2bgO2={!pWq?u0_iTdVGtQf?P
zg9!<S4oBs27MuW*IK*}rohVL<P1PKlWQ!ihT}PeQa3F;(dGoR8^FeMN=`eNULQEbS
zNJpq(AL2m7t)k;3NLjLdO~*0`C+J?51d>e<`H3pg+5tpQOeH%ZUHjF5YU8`to5AX{
z=S`i%^_7^HSW%tvj5O8BpRY$dqf64xMrRw~au)gHOF=`J=U+e<Dw1qEN?DRR#3r-&
zD=9_#t7R<XpYu49JtMRd{OK?)6qpdbm_`*aJQNy(3DR*uya9b)cbccag7m?Oq~|W*
zb{V35+0h4d<aK><Os#HJcG3m?LtUx;&mu!Kdd`fbMzxAsV>jw0niZxycf(*&g9v0_
zqpe*Y8MFrywDS5(SO#y(5Za!D46Jt|D`#FTb<i1C^k2+HTB%%G2eGrk2h&{r#}p+m
zIT$z=L?g8YSnBAm{%Bh(MrwbSGwNKid`81`T|aZL(odHQPO7-$PKw?&^cc{H7#^B$
z-24+S6=_(WjEIy#Qw`S%yWb0!#g1Nz0LPpsIxCo&)#|rnjt~a|(C>`SG5S0d?t?mR
zTZcm3;SHuZ!=GNzyMC6GzGZ!omKR>1kTok!UnMWJtYW}KwKLXH0(b(_wmvgh)vN!o
zwz`}sXzX5HcaL!7FpUy)l&nkgUP26Q#bi#x#lfdc?~0XA?FVRk{dAq7zE!ysz1k&F
zT{$b8ZV#fLGTw}L<{Mveof>O<ddMYT@>kz7{938+%IG--4hNX=e@bF~s-Qguibfu8
z1YU`gYv26YaI^#jT<p=BTt$BML>7FUGxWr(#Vhgd6G>_G$$3<Ws<YAremG12-GNnm
zJZ!;i-r>cMCWSVSzt>;A42M>i8EGlFzAczDYT?~#*ZjrPcV%2|%W@%?Pw8#E4025C
zOexwQR4?=+j)ivv!9~focbW({iiWg}`mXscl{W;7tT?$vZgfXG=wv)A$fh%^{f7yA
z)NcqLi|Xa?W}t`!Y1;(dWE7qavSvc0><FZQmW(w*hunpU5(N0*J(+>nx0cF!Bwbw5
z8|jfSxEbu;>ceu*w;Jd07U1t-U6~E}9JaXZIRI8-Ce}Fp!iE&aA1sUPjBKk)%V8TV
z+-cVW+()fr!;{@x+2Hgx$wDDVxeU*ptbb7=1zxrrvh02t@KS@3+FA-X1JCcqv>)Eo
z^?xtbv5#-T$-S>HUWz>=FL*E$$Zn2fv>YJ60dM_vm`NGQxtUE#FaEccc-G{Q?E@=#
z*H{3vl5&kI(2I{5R!DEJTzq{XNZBPMJ-IRc9R6%NIFrqgm9*T)JN|B}+O3*E1mp9L
zflXTD%aaUKi%iY9c})$zxnxaC6+0C@mAxuOvb1nrB2SSa8w8*K!m$SD)Djw9zM;Vm
zHrvEkGqP-pe=K7O*>(Dd@`2bypu8$S4w0qX;<MD&wS^CZX#HvSs7Bv47gQFZuE?X@
z;W`aD0RrVB6wGb?GL=tXB*k63uk|t;_Vqo)Ks?JhH^;<n01wx^b=}la5ykYR-N<`$
zbN{ZEX#23lbPMFuU`ypfquzweuT5`UY08cf3{!X}xv)1~UrQw3iX1%c^gevKskQ1w
z9HS69mpGkd3S&Rr!;0)gzdS;~V+cCU*kPT$DFMVT>l8$ECm+Y1s*|e9?$Fjg<;M=+
zlMz;b;u|sEPMgU!61rg%+#fw~i1!s4i&jn#Jz%2*Q)wA?{-O_EHDT%v^A*gkO<J?h
zQ7-qRZq_NHCE>0HR#fV#MUHCRhWo_;0t6#QC354$&{xcyB6eD{u_AWu0w9H6?oh8P
z_a>Ll!kW!VWSVob{Uf`YZi`|XKpPIM2;nI;+oFxyCxVKub}n@eAO<`~+4l?Dx7&29
zOklpJ0x%y)lqh<U@V)lNo*Ax2K=6h$D;L^F{xu?BEa@?E6Y<Yqqs0Aqe`~9bQqCGY
zA_k>C_U0T{#`mEbi?er<OhjT(f$4IhmbPA<H)vvc9}2o5X~pl@<Y#bN^pZ_L-dMk+
z+1pi<Nx;*GnzdB_9H1!O!3$2DI-=iLZ~eA!yj7I6*l<UqiheBYE_uKQBoL!mTmxE3
zK*XZw2P{??;{{O5!iKno$Gtq`9@A!PLFKru+K@L!8avLu>1Xo;zz_auw`=|wL3^Oh
ztAt6`62DW%K!FmFG=r=SKf2bQ+Oi?=OJRROTv3gvL=INcpnZY`6{ZeO4yv}k1>9Gn
z4*wD?uL42WvVTbw3!8R849+g~>WO_R!o=B!4Qc4#oSB<|j@$g<tZOXC3*WFfQZo==
z2&9+Ne6Kz+WGGsGBvZk<EBS~<#k}84+KQkwJ`Ee*FT<S*rG%5VS-o}Crvq?XeP3E!
z!Ug^V*NZrFs;0+Wzgv21ltSiVx0V<e+^-ovSV?k^keiG1dGv?Wdu^szJVZE^Iv&e3
zu=Q_qqjnt_rAvu75RQRxmygFPugc6{CG-kEh>uY`y9(O+byn>`{~_EWbbV{3xZ`hK
zVj2f4_(rB8yoNvq(QE9!deL)+;*a{n-gCkeg4L9`;K5^4CNFqQZ-skeE5}GwOmes*
zSH}0Bj4~SmRKXwmosK@t*Ur`j^PVUv$Wk=w-y~ZHK9dmLZG_8F03tix(6<-VAL9Kb
z_u4F2#W}PoZ&dEWR_x+ZTHHQMmeNGMZt@IQYPFY*jd|@F_6Og)vz{BFm=7~sljm=%
zQY}0m*>$f?Bg8uJFm?0Qa+wU?rL?%=l$g?vMXVK8M?cRp+M_=;3e>N)x91G>212|A
z-!M>PZGQ*u{z=~p7G9>U3FIxiZhr!gS!X%G7(VORfaXG$CfW!k>e0nNhy|`ZX>)Pn
zw#0=FMrF^5Pb)mGU)psWY_vh(&S<`kg!W2>Krb3~K#ts@(mpasZc~GI+osYc4qF!8
z%>4I3MhQo=*$w}E{LUH^THt*q_!C*@j#Ies!-8e&^*$jNb}aE%LBtN&_!(R9aR+>M
z7-<Jx{7ZBBr~CZB2GIz7{p@N0(DIwyL6=4WJcuL+oIHQ5zs_+jziT_1Z2Q(W82CT@
c7Hhx9_L&zvt{JC9O90<Dr(Er+wt?4v2M5m<jsO4v

literal 0
HcmV?d00001

diff --git a/docs/source/assets/design/v1/prefix_caching/example-time-5.png b/docs/source/assets/design/v1/prefix_caching/example-time-5.png
new file mode 100644
index 0000000000000000000000000000000000000000..b80dd5b9949dc331eff4e6efe14141368d4c7dd6
GIT binary patch
literal 39727
zcmeFYcUY6%+ApXZMO2CiN(fb|BAw8yf}w*5ihzLh-g_58YC`C}C_#FWCLN@P9_hXJ
z-aCYOP~Ug&Z-3vMnR8~&T-TiI{1exO$+OnF`|o$(PvA=hNxVDYJJ+sV!;_Yhcy;X>
zR_L{B*FWH31AiI236H*Z?ZY)`iDxR#TI)%-%E_8*YDw7BNy+pouan7by=SM8<E=T;
zlG_44<aulSCf$IKdpP%`zN8)bj?YgfxNV7^4ONY%K;lOR)F@8g_Vyf!ppO!fBWhiy
zh9&V<qtLK>&v734{x9(nz%z{rXK-=NN*j}@%t~vN1*->6P;?R=oW~i#8{EYSBo?Zx
zQqxMA3Ud<4qvnJcsg|+w;q4|ns(&O9zeqygEWGSQ&!W$`ub0lUvK)?i)7v}Aky3$V
zPbsp`PYShF1!E-Hs@a?wnKGKS{|L?)%oXYt(oSoXUn_<9!np41*@;m)*yGWquG8)%
z4Y7o`GeqiLvYN#0lZSc}|CYBsvmv$DGSy=aap}6CcGTqz1s62BL|3}lc}7I;Ha(@X
zC?l_3bcu{gva346MOM8k?W;_Dk05ppcPOtb>U~b8t9v!kWQ2EOMH;=VdTQEAwVnt$
zBkPGklOQ%6EVb*4mZ$7mq9FRcTR<iD{Z8L=D`{oH89n4EVf&?OFRju@XT^INOGG;=
z@W_i^Q@NLZWexF^1G!uQS+`@vRgi1!VP|yT37@nL`NEr8bYeS_mR5iqb-sKMHH3{X
zH}`IVLefhHVbBxdO-}L}VYTkNMA}**Q_01o9h2m5QkR>ylH|<(IW1V`)JO#`TRTZI
z0Ct%YQ+VB5V*(Ojz+)uHXIrUD(pz3q+9<vITb7lHLRkFOX%omIj+x`N?m`awi>G*(
ztV{`fTvVb#rjp&1@{K)IGN;c@MU@Rt#x$__6;(V%8!*70jZ=L?v0S*6%%Il=@Dj`5
z_LSyH0$S`YP^+r!XkX85C5QRwmAt-4Md?<~(Ae6PuQLJY1CphK4y6$Y_>gjWh~8tN
z+OsEciL5oaZ>Uu>!9@oQ6)CJB*t#5p;EIpot!aPy?I$-M7X{+4oxJ%*BvT#k=SA^4
z1I|kFT7JRPWO#X>+m#_3++l&-9TjVEGovn0(B_gsK@@)U>tx~>k*It#$%1{ZMB0%^
zED*K|Cw`k$(CH!?noTHR#&Yol`{Bzrvcuf5?~3x_IqgM;y@NjRMvH^|Qi`q(iBX1Q
z5a*g|qSB34P7TXqKjdsJVEcD|%S}QPtL-r{c7rAX@8j$c8P3#&1}V}Rg0BS#S#4S<
zizX<Jwz>>Ed?l*dvz_K^Y~i`hscY~)_o<mm@}kR2>AM*yO;<B&(v$PiFgT0`%E93Q
zjrKlgz}p*Q(xtJW3(cyqHa!J*rTASpW8?_gTrMD$5-E!skyP4H(RY3%>?Z<m?=$3O
zGhyHj{}W4My_-tl%q9z&0?iQ)!A7rjqt)ny-zdr{f09jN_lh7%7L|66_J-^kB}V$u
za)n;b`5FZFynKJU+((M)Ia6}M0wc??InD6s&>u<;U}<{C*x?8MJYc~d8vfXdGx^g~
zzkP1oj21&3?A`|kmDOZfO)^R&`lmK=;Ld(6FTC{M$4HfJQ^yJ%DJB%L_qBsPiF?@n
zc-DecxBq^(&1`qFAE}%Py-<f3rs=j%ai6t5cJCl}k>gLrDg8b{Z@9fBB3t>Ld~UP8
z$^(WB?;C@(w&Xs>Rkf$s_la8f)f+aY8sAOA+EH(paO5;f?QdzMk-x)g!&zkOyVSO$
z{o47aSBYLe$dq^Y8BFRgHKlv|4j@-cBypQ{O^!>HT=avs6METwBvskrz0h38z-$J6
z<zV6{IKdjzZ700>ITMhwm}YIvm4x206^sa>416+$Qd=SwK!j0!lg0Ou<{w9P`X~F+
zhjz>H?%JhH#J*6(4wwA=c9IipnlOEgeSf|9{q}f|NH04Vs$au;h_k3Kr~aVmOOG3U
zUW{KHpkt;b{x#u^@r^z{*ayyL$E8f6Q(QTp1!413=B+1xGD8LSeUJ-hV|`{6e3|1F
z6trlVktw9R5;B>T+(J9<2)z(s61}fGk}1IH{|?J!jf8NGSQ5H`=Q6&MZAgu9YB)|Q
zVOKFMkEOXw)H?+G$7Y0|PA|KMuU@!|FZQ>!+?pj5dW#r8qZx238`65eZ-Vr|d!HO|
zQ}a+gn>I2LRJ4*yO{yj`PB4E!WNt|51k4L-XF#w+xk#kuDifBkEgb29TPo(UkOe#*
zQ3nXxzVcw`Yb=w+*S{ls&q6;}C@fVcq;q_DGdmXK6qNH6H>m2Qy_VF5T`YHH&TB}F
z^S&`MIe%PUt5ZzZa~x$;t{b0i!Omr-qNTio;;29r{}``Cy=7~~(R}#AQRjtRVG^(K
zts-2ygwgQ?zcU(9kb`|a&X^syN8wE944D)t!S9Q5FFQ1-c$Z>_k8q=b>9;7Q-be0y
z&#dldzYDG93t>fM3p(#g;dblz#1<fTU8SHK=E7g@A~xRRQSML%wv!DFhvemZ-$+-`
z1qGhDt4!iWniM+XZJz0EhJ>`Q9R{H19_PcdX@ju)csXQq8R;k&V<c6~H=r{75GK)%
zaKa`@<f_VBDyYy#d{L7hAIMVU%$WUEhYeK2BlaJi#`I)ho$;-ZY+8hf9EYL)J)d?X
z-BUJ~g@%y4v}SD*XuxeF;jb1=J}yS;Y{<}RDvCSv3jzD(e_Xr-CWQ5ug*@-4{(GHz
zk2CbIs^U(el8R?Z%R|idKgsgXZ~s>*I{x=P{<{@=hrMyX@1l60F**&CW|Bd%t*LTt
z;8GV1>4QCsB{prHes=ZGGP92ld)AxcZqDPWt0J{0z3K93jetH&JJ5AQb)WpB-Pf1x
z8M?1T9z^VB8HGufmN5Yqmg<4ySx%SL_oC+TsVjbc#jD2)${bHpf}~ISi(8SS8c|nI
z61gGzF1&vwRqO15u>bVCCy~2Zv5|_+Y3{3P^GX|eGJsBvsXkJ%O0U{06<h=*i(BUt
z%bcjl{o3w;c{lU$4L#t^B?R>ZF?0@<nM0m0(2w|3?|gM<V)(V4>q8e6x19Tc4~+Ux
z`{&fH)+ewAS|wS(fp6CBq#paTO;I&jI9;rfP~FgRah+}np1V9g8GGjJ_3&V{V445M
z5>IMM-I0p>S(Fju!Sa@<+f%+|`dP2DnPP4ouant>G!e{nvzyPc+UepM`i8^Fj0S6P
z9cDU?dJ`U#{P6NHBIe^ZwBKALuDqr0VjfN{B&(X}@h&{$@}#S(-pJtqQ|xqWDc<vZ
z5h+bBN?vo+so8cgyw=ez#DG~E+Pbj=?wN1Tz_2bTqiGPPR<D73NSxg}w?0s1f!NkM
zV`l7{%?bo6Q*TP*Dtcy?k@J3kq&z5dv7f{^yAZ7o!9n*rkR&C#*J^w2yIbdq2BX&p
z_Jls#1>f1@05sp6kXr_=p4Uar67F>;fn-FJb9htAS+ra}eVx0!7&tO(D5%*^bDl2g
zYstwkH@8-ZPA=bTpXci4MJMjGo-9g3_CD_-c@D=M*5WJOhq#xT1uZL-wkhG6aqgTt
z^UhSO1zw#)NvbJiTO|YK^OvF*#{=`#{DujoyK0)6*6lO(Ck&l-lciL4)0;cGcj39!
zB6Gu7bIOQuwt%64-QIB1ufGgof%TV?N$Uwzy&hoi`idXjX&{ZGqXNCyMU*1h39zu`
zh<vMbn|6-H4}&t2T}??_Pj4(eBt#lF`r!vMYPp;4o_cwnJgeTR%t2{b6<OXJdeCy2
zRh56IK~>$d?YP|M2$m6Vn*AIZ3P6E_p1V=(aw`<7Nkoc*@5S7EXU_V4^&+)gmRqJh
zg0jm_d_e<(3jr6iCh1Rxnv-$q9GtpGjNqbPb7AwPq*Bn{i*n&CLadV^wy6LkDjo=C
zq)HLnCYXsWPbkTAAG4DzXs$(@--s|0IBF7~Gpcoxmj@lO2sC=`I$pRRG`DUH54^!`
zAT)tZStL-CQ<Pp>Miu?AK1$c(j#r#`yqoEo@XMTougsYVEk*;iGVUB%6ZP6rcW)z=
zq{kBg9}dQHU+nBj?|jOj3?r|@q$<xMv?|$1H|9Ww8~b|r0d;392~o$Qt_NR^6N&J%
zTo5Fibs{3h=#4oK$4REzv(df#M7k)<_DCAtMUB%ar-<%3>yDt}RGe>L`Gm`wO&|vy
zoNdxIcTrfLEU#F-`nch46R)OYC9b{)U|a!EK_p@`QWEwE8DLpa<%+3Y3nvsWOYA|h
ziH5%0im?)<EU0L|vva-5iX?IEX@^*RCW@<@-J>f~eU*v!tLJ#?s)fiS=n;w5drSju
zD6A#_HK)X(s|=}B)AGj5*#>zNOLlchk%TY^qU_Q?JYXf_$q^8o$*32t%>=43?2mUN
zJ9=`N=$b0ht25(|9LpDas-+Xx>|<TB3Dyf=X+B4*tbcu|@fd?rh09-9(4|zqHu1QM
zoulD@WGdo#_JA8tWiXh~DXOQVDJI3iTzA`ex#n=0rOgJ4^N@m@#-zL_6kN8s+NM%z
z5W}qa^kTYV>n2{kzhy1Nq<?I!I7IZZ3tEq0l(u%cgm&)cB2_X-Jf!E_JJ+30a|eU(
z_-l8^=ZM7X1Lpdiv^9stYB9rkJ6>LAo}BT`rLAI%_Y*ZF<ec8=_9$`dfM_ST&d^gt
zO_EhsWoo)F&$nq&Tk$ZlT~WW3ZkQ|bWoEMX`FvE-BaabMQW}`RUKLdhc@tae<RX#l
z+>XZF<>_I~aq$q9^kfDjR@+f`QnUa+F5MKFcH}23YO0EZBIoaFMz`hEmAJzd?PUnP
zrZm52SFqOk)R$Z=lF`w2T7oE)?e&FR#}?X_O=!*d$T&iCdrHmh@aRU(P%PmJkW1m!
ztT6ZI?g}DqHA-)wDl5K)L;YNaF~>Va5*PZTB~&665)_A$A#V3wA}5hRybkzIdpxsR
z&<+5C6(pg<H09W9FdTNQgKw-dTmh4TX*mYYdqM3$(nVTM;MZok)$Tf2Z36u1dp*m=
zbC7QKK*;Gy%AU#on2p@xoRgdshK=*n03Fr{ZGFWH2?{lfucHS+Fz}vdzKvZoxXp}}
zFN*;t0BxLV;0WRB;uF7zRu#rF@mq5nO47cK40&~#>F;??0~lQelP)zWx51rFyCW^t
z+iNCxbYyo<w)8l$;uxpIcC~zu9QE>`yDPP1tTMA(g%^g>Mn73p6V>d{HqA#Q$Ym=r
ziCg_|$O<ANx2o;Xak)+h=q_a?thN?8d+QNgyeY9)BrDsyQ^;g+8)N<_hsLzF+@Rbo
zlA~GqA#JV|NHM#iRVO#<+xtVQ=vbSEqi%wfL|uwqHqY&M_mVS8LavaDh4gmgfc7Wv
zcOIR#(t3D3V(|{q_4Q!Nbp5=#!FFyzVO@~1dC!M2_r}tQtej1LqHg~DZ{BIonv8$q
z#U#Foz=l)g2Se}CCfcDnG<3~5;x}2dIYWj2P7DDnh+6OSi;M!gmD>Gjyl3v<W6zB1
zy1u@yFD^doz+ziuC`$NXK@|tSPl^t3=L@`!b!|Jf`#K=69Ki>3BuZ9$ZLbNj+5!Y@
zKsH6}K&gMKPFYbDLhe{rDm3fW0H*LxBMP*o@wza3nhHp=sqyf;YkaIBhI5VDjEfD5
zI)@jRD_WyJdk&x3c<lk=10F{FdNU!nrbMsSJ8g1L1U{}8v`92X#v?$Br&V__zftdm
z!?6)JLUR~@21&a6j?s+g;<$DfmyR}m90hPO*f=^{A|6JZ;N-G3&IQTINx_|lv`uwB
z+ED+ED&>2kiY62P^!@gf1{o_+WGD=`XZgD|^a-iOK5v;=u800yp!nfr;bM)+^Z-{V
zgs)SqH&X{v?`74bE;ak}eQJGYJR{gh+>la*<Ho;=bGbSKKRD*CetkSUIH6j5aQQ(x
zLJtGs8Kl3TOrQRU$iK&$H<57N75hF`2Zt9IrDM>+(9FZvzolq$Fpw1V*8a}ZHANQ0
zamqL7kXY(U23afwbFod+F8fqgE=lzC!$df`9VxS1GV6ZSLzCVY0;aMmUU3DYMpiu+
zI#MY6hWPoq@DRgN@tg3uSZl5ko{P!X05DZTy?97YxIVo`CI#u3j+>Jvm?ExLOa??t
zaVJ(+;6}M9XY&}s=gm9Iws?wAtJ-sz^X0^M`E+x=A~#o<QnGiNJ4mxRk*hQw-5vgq
zXhQSv_3F>u+rvWWd8l=pqNm8){q#Vdo*G30I7N|z23G|k%Wco{(;RoKz-66`ojFVP
z2Ky&a%(-*!g0AmJZqZyfdxYNOPBC@bVGjl>vh`T+v`4V(`kG;9l#n0nu0ZA~wjR{)
zbxwPRG9ztGZZ<kxb)f|OeScn4l^)t?5j(0UyvswZ)0N|Jk2<|rDFh?+$N|<9q4|A;
z&zV437)yFCdhf!)eXo<>jdOJabrl~Ya%ix_(c<J#7v|hd@6K09KrZyNM)DENsnsPx
z%h`JhQr&3IliY|2hcS86hVR41xdVG(is~W!Hk(z%r^5U}FEmg9px$1>3vDyJ4N0X8
zWg(oV8jzG>A=p+|oBvkAIU0wh?_`26mS;21(gS60Tes-t2<>)zn3<LAEwoqe6Mjkh
zsf19glUh0Z;@CY#EyuI~Eqd}TUOp~bO*bsQ=fG8_L*c3pmi<VTkI81n3N%)UAoy+w
zGx3C!JOAV>u&{rB@|8qZtM2$mlYTpCRpu_3_Cp01R^SYB#{ljCW#J#frNa)(D&E&0
zi>DZC^*`e2TZ)&O9VO-&^xwg3?q>?Mi5*H=TN3Krcj>>8@My#g9FtqMI~NZcFjN`Q
z>RS<8dNJ6-2vN@<<70Q~BjXAMu_T1Iwel`{TMrIZ>}1)=b(JV}K?la%ZC^KuaT<he
z`Bg=uap~5cUzlV*?vO|TbBDTXA6pK%p<RF3ZC(>>B)?`owfzJG<qvq0X30px#ZZ2W
z<7*!rg+qZ1$EZv1&)hwtgQB&@aMGy=bf`Bx8;!waej|$_YS`i3#6$Z>X{jeon}aaw
zQ8T$TQTdoN7FxGMaIq{YXR&Wc<QH*lMlOE778p@4l!r55l~ii=-ld)Bq-F8JQ`E3|
zAU>U&=%A67`>9eJ=Z#=-Ng^|qjRQYDRhDxKxv47>`j51Qu%yx42)rNn-XF&5N}vVt
zAaalJJ}J>I!bLqcX4?577!ye08~D<7&YCt{x>OsfNp&p0p-t9YIW9V3uq31(6j+pO
z+2qhWy(G+$(Ns+704}R$+)@-rUkc^xQ@hdem1V|q23${r4XXt9+~n!qnTw0<ZNUku
zF?4-DS$qhpNk5;MkkX9v-hV+m$cqR#(CF_=?OSr^j!;ae!M@L7yV!k={VF}ZEH~{J
zD_SeGh#nwY_!>pe^W%|-^6W{<85Y@$eqJ8ZC$RLxVZ0LGV9DtYdkAdUU?WII#1=if
zBNlnnk62s})%J81a#@wI{660g%`K@L;=Dm7)gJMr5d#ZyWr|k>L7=D1in0IbZp|U{
z|B7QTFl@G&v`HJcoYu&9Rz=G-;GQ&~DuY=%TZ}g&!tPD)0-QlqBq?;W^ZITZh4;Ku
z!v#fFMv3PYyZk$eHA7TzQ>j9XoqK3EcqVIIFIV)L$b{F_@T?{gbFp0MOaRr@L)2{p
zy=y$b?YU_K&6{+e{Z6MaSUk0U!4U%cScw|Wi?CR8GcB>9OnJI?dRF`g|CJFcSq=LJ
zhJ_{X#!B8}Z;m?sVT`{J&izVXSMq1BtGDsnGg_MiXquX63q(q^Iy1hDvB~Mfn!Qtd
z=Y85U`~2aMv}jeAY+iNaDKk-W8@6e|ML!*AfyiYOwdGZj<j(tqNWf<~x1n=h-fF(q
zx4y&;M|t@oE0KZce0C(kldoa={i~I4x{z*7v-ZXJAO?(4<+rL9M30OBW2}YuGhNhM
z!4}s>yg@B`sT`mf$S79i<8#Sp>A1H1twY@`aK#9ZZt7XOX)$ZRa7flZ)tAPL7$q5=
z@Abrca#{_fZDk=zP~aP~p0}10KfAMEII_GWTRE^#VgC=<JFO+kfcop;Ul{%Dl~B<m
zGq{F!5MG?|I?pc7b!}yVl;ME1GP05VH!vxk{N2HC50$hR`@Wp5N5(!8b+--dW+F2%
zmw<idDZ&r}=)=CGFoa{PqKsMaJ)=47dsn}8bLWS$luBA0qtmC#eri+3Eg>PG<gT_$
zkonr@&y6z?(gbh1yApzAF6uuN7sd<TO!|5YGkWU!AF!-6To${iM&b`m;PC`X9OwGh
zDtm4Iq4IL<_ho<H)F(-bvHXs8>n0_(wn0yl<Q^O}k^u)LqPVtxKGqkKTJwOmKu-0b
zaV=2oo)Fd$!IpG{U<!Lkt}>fIrK%$A+-SphEO+=FqZLF*_Wy)du+$EIV&S5!eD)kR
zIgI_d=JnIvKs=S$><#~;PwbQ!KS+&|_7YJGgVlV_gLvnita|$%n$=+U)N?S0vGS{a
zJt+Uh_P<^0RYvgtBzmO3-CfsdeXKx8Fm|&Yzx;4ryQyTc_U&CYm4Cfm{+nPp*YG}a
z#6d$@y=LUcOoS?>lq1RdMmm#|wwu=ZP6xq_8MWE4n`zB<Y!`Yp4+xyLR+MP+?)_mt
z6r?BYE^fwEAw`_~=<Ds)?x6`5Mv^_9wR56Qn&w!YS#<_!6)nr>OXhd*JdbJ1w4#N0
znhCbdN;y-GwYF;dqN9xX<A*_vX2^s&{7J46?|y=m`=+l&c0dtPs&55q&vW%6BtKRI
zS*yKpj$kCGMJ(B7-Mg*W=i(^#)qk*%QuTG?cAwIVZO4arZ<0%w{EyP$Lz==aI7cpw
z6c5Rt(olZ(n8}j7x{c0mq>~lQGC3b0F50>JMy1^q>Qa~PB?^A^J$dBu{pbk6QZyZ)
z&-jWLS6o{2i_RMA*MW_PBWkU1T;4T=#@)I}0dgk^^aALNn_B}zrUoKCE6`I|e#H#I
zgu96@^Tf9^tLn#rV=;)ESs8B@`hX;_F|vP)Yh|){q<KJq#by2$dLi;xbFm00c+{UH
zSA2RhJK}&TbbLtgW%1d#r?W5z4PO^TnUVJT-A>VJRoeVKo=~CnB!9iVZHvpdPn(za
ze=%mt&xOz-MqJ^wR@59ajZ5)J2ACx8HuVuu0}j8&-i`9Ngc&93oXTOfNsX!Jgyi3z
z!*Cpt7dHcRNZ)H9OZtWFbw!Lzx?@S-LY&FTL#vmlL{zIg?+4o_c|&aK8rV6pJ`B~J
zFP}KaSnw!dE;lo*7|@RlWO8uRfQmRw9zYs0J>uq~^8tD11y#%QY2Lolb2_61BDQu5
z$3s*?MA`s_(`G}7=_HnKFuC#0Ve+;#!Jf|9rV1DL%8;l#hsfHoPq^fhMB64$X>!W*
z&O*kmN~TGprMz&$m&vybmwx_C?LmuA2co$z_SSZ>P$_j=mCnrz(J&GU?$q9LqmpHS
ztdL4pT=atWRWMS<$NqzoM|k7Ib?H&h#N>XM_xz*LcWearj^=3O?xuLV>-t8byOzIQ
z=JM@YXXLfqTq=9@NBd(#x<QAr3o+R+&<S?<(#{Dj)PKE3uX=BAZuUX6TD2K2-2tro
zb_am+DEU2y<^}Z&;u`oZCkEz#dO8Ir=NRlgFVe)tmBKV38m;YG_(P19PuKYzh3-(E
zZ=;3v9grm5x8k=nR|QLLnvHjq8*vH94qqX|wz3Py2?5Xj?;R0mMgJn!E`UL6s}ynT
z0#iUu7n9t&n;vr{!A;!1J1)UdR&b+u8$aj%0GGMKoV-{eNRr=DcraYXb(l?HqxDxA
z-u)%)2Y>7<=ws){yo=$Kr&;OoOp}g%2N4Gx8B{f1y4Y&8dPDWC1T}|M7G);x3l%6v
zW!$(~2_NPTu}S-p6=|$%5r9;iT!day(sr1E?Gp0q4DS^%9t?{0@}>4vj9{C+hJ9m$
zdI#1ur6R-89<w@bD7$VEv)zgbXCb4|^E{*IjFR%rQPX5uYan)>me!Bv(48op0vJth
zvuY-_-K-tH)tusaZmLQug}dkNW=`V8xl#|<>d}xUql`b{lsK*{FKH|CaEhgk5p#q`
zhcv$87#FzBF4sl+^00fGq?nHsH=FH0^b|@Nfs<VAC2UicJeNF4GSXUUkD$L8K_Y+=
z%#pwfTl><ZrvrM?z#jC_?}>gqSXt!uQu&y%08n1vFGf91`@|8%;k+0Z4(DSJO>6{1
z1Zz)Y7iOCug4IZqkdK~`yG;vX+ZIriWi}bL_u=*_l86EXP}>?b<C}u(ak}$rRz^r2
zP@l|muYHg0jxT*AC4A5#wj0MjG-h6@#d&i8VZ@1L!a!13ao2E6(+P?Y8NxJN4Bgu5
zVX~PRVA6`29<L=P5KQqH{;7Y;na~(oYes#$@q9Rd5bBl8Z=@*j#Cd`RJ6wTI;O#NK
z5vf!yuU!{MZqXy+Bbbi3e^W{gMryZG188vlu%0UFCYut|a$W3+I&JXNw3m1v={y0)
zw#|VLWpy&o2=eTGCXK-NzhanfpKX8I?IdjK{l{j({Fh4+<>&KZZpdHUVIaVBz2@vY
z9Ae9<V$r3JYgug3Bf2r`aXQ-g0Bc2o*TA>abzVLfPD5#y-6=L|c*ppxi#Nb&%CflK
z%&a87z7aralTD{NNP=OOlRALmAZCKTJnqCH@_r<|vh5K*H-Eimnju}~WXs!CkD3FK
z`;;r<qdD(eB0-OMZ~F{fu32%=$x1D+@By#3Cp6SBr-@tr=9V`gru#`Ga)Ek#_*#G%
zaIdcwtaHQNp=}!uydbIcuUu;%;BwAM2GnsMO{P?L3Fja)iE~V-s%A0Hy^jLA>HZ<y
za`!2O$)~K>vwzW*=V)9!eH)JC@uwO)HA%fE`noD%TO?c(%Mg82kt6{|@=Dcvs>*MH
zVS)g60A0OV2^1`l^Mp)1i2}6({CNpOa@?~)Zs5q0O|dPDyf`3w+DU+K$^=Or?c4@Q
zIZdcVd%ItbRGCfMMH~zU5DeFts{rg+ta$pwE~b`U7k0<<<ZzCACRXIq<E0fYU7wN~
zNpruJU`u9x0wqH;?tGOA7O@WVZM_?V9#&9ylf)B~?=mY(kDmiR6_}L+bN=b_fn6*O
zZU3f2aB!0IWA`Q6eJg(c7+=HuEx*6gcY~OBkhtuPIxlTgnMESA&!i5eXxK9v4``<a
z7f3E#-hpu|_IVBJFyVFg%t3rk?#p7Rt&B17)rM<vkM{BCj)c-NV~g*EY*FwrIQD%T
zs*H$EfybcI!<5^B<pdguZvupDH<hoN4jP2#)}=C_=lLjZ#_!m@6JPcZP?qxPUcKUp
z2Gl0jx;PU@h7lCA93KZYuuN)B83x<Gc@tdn0=6a4adwi>Gwd+Z{h<~lOaZi(xcd}H
zOVf0gQC;DrQt9R#6OlsulCL$nb*U>@oib2wg3U_qOk&knczhKs5Gm8O2DluGn>@M_
za|7@xz)ha`Y#JTn(H(d#ht{_pr5wY$&rbSTuzO<!KSgbwg=kn3H*q-35f3WdcbT8e
zykZ+2X^v-1qlW2xyDZ*P8S8w<-Yw!nNNe@w)b<GE(4!US9iqLPKFIT0zyEp^i_KWG
z3lf1DTj9+!A!C5yC5NyU#i=KXIeL;(PQ`c9QN0~q#jDJn5<*kc@gQN%F*k^JpqA|*
z)##|aTQr&SH)r|DPzGcSrlNJ~Qxv*m|GiWDx13b$@XOnKDH_8vHo|Y}-KhDTWZD3(
z$wS{mzXnxd-$?vM8$<kNlVDwU4P`gLqWspQ@L7n1eNWr_s#ivVU5)D8W}gR6`ft-9
z#>swl&J=*oSswA)9Tye9nwCtC4TKU2E;XJxxX1i?!wx_a4Cg)Ykra<uv0iU)kCzSs
z0(+66-xK6|cY8oWl?L-zz1}9De^^+x=GzMFhB%ouzXWF?oUJ)ay7|>uET=NR7c}1!
z%k<nS8_q4x<!im0zHu#dCAF_+XL{?a1U$*boaCv*i@w0=PI|SAN2h$-&6PX|ciz9v
z@`q4%BVs9A15ZCg>Z;TCuc7+s3j{wR=GzcRN{+UV)#aV9nc=-PdeXS!36Q3a?eOOo
z4{SPJiEz74sYftMzb9<?*x^a9NY&<rJsN}9tz(tD4H}8Nk0c|Q$K}T!q<@@i+C#)S
zQ_>GjTmHk>S#gkugL5Ut2lQos=yW@7&(U*X6sRoJVl`9TF0PtBMYU+rx~M;`#_=al
zc6*^&r2OCUgN2kbI55BKTK%aPo7bx`#|yVUhLCE{BdhN2MZS8vmlg5v9m)JRK~NgO
zyQ6x7onogu=_SDD%my6{r;51}T3*n+S(dAG(P%V4p*i-f-|SZf`t;N91}V_r6b=^3
zu6cbMK*Ag1K#^rySbrB9aA`nI))z_XFf$Unxd$10@qhU<eBJD9L5EkSiiNcKRD<~c
zU-eLB!v!?|n`diGN!=xgqB~$zDhU33sx5cfjT7<wPvbXevJr5||J|wo|Gf-)w9H@j
z`i4_JUda#der&Q65Ar}MDDl@cUSN6rU{&JJV}`St*mIkH3RB|l)-CmC5PT{zP_nok
z64-5!{W#YuT{+dZCb8^(RlA=OH|17O%<$1R&ZXG(^uPwRfk1iMK_8!T&;`<V)Xina
zrBk`LgFY##Hl2C>G)As5-JxMnzR|U@pI&~ey0$9KAGv$>YiO}V*e+(@P};^1qb*G$
z=`m&PA;gq%WOpe>wXH(zsi5ttXPIkfF3VWWHq4uyUK;5nXU`hTSeJZqsw(Y2eEUhv
zSJb^0B%2E}Q^sDD*XL_k$nCx3(`V9lqqjXe{gwE;hR(70$4R2*tio!wxIN&GQjLbT
z4c+pMV1t89cZ;X|I`-7t$tXdcUlwlB)=BR{`kL&8cBH+AG&*{}PT^SE@i|ow6Bd74
zPY(UX1plJvVxdAgZrU<lzW2GFipJ4oaQ0N$<<qPA)MW{*le5gz5l`ZQN!jG@M5nP(
zT)JZXfU}%T0hxFMYNN+1YbvO89ry?S(j$93x+LfM9<NA94iW2>hxItJR~qBR2d9o=
zueQ@)&<Y=Fs%&~}JyqACzT5z2T~(#Mg4<D3whh1&q_AJ@V-Jbx_Pc*aWTO=A8w7jA
zCbuZa9z@B1_1H2j5FTfQ=u&UN+iln!8j5iBi`NNXPxWz!Za)X-D@g%S?0E?b#Q5@l
z6d4Ol$dFrs^ivf*Qvt1RN$+RIi==RjAX5sWCsC!BNr7CiDw8}W?15}BFdr>EB*sb=
zHAF+0azuZjmaRnO>|voo*)}5t#b;$R2_Spo541%*$)gZ`pSfDnby|5t%_8@uRBEEF
zkh`V&a5`pC)KE;kH2r0rYVWy1A~RN-y2`GvP%R@9!uQ90rHY=*p-l3WUFW(q&Az!*
zkL5NoR`S;fs#Rb%Rthu4#^hCoiKD#ql)fb~qTG#&+IN+{B#N5b)f9ZnL?%kxA-4J8
zwImV3uE7IfkIVV9DVGE-t+n}U($J~9PMA6wSKy9q-MV|>TCnV?M}{x0YdsiYK$_pm
zT4+;O00t`cCsG$~PAU6r9r!36c($K$XvR3iws<QYOn!X))X)`_d&%G~W1o7J2A=N+
zEu@<2VK*vA!c%ugr38mAs2X{z!Q3nIWqP#?0y~{zoS!01-fg<kfQ?!{X9}@&hv?kc
z@B&+GE)uJiW80?Df)ONXs8GRG_Ca7;k&gC`A<W}~nMqNe>mkdEBI(XO>|vhH9ROve
zYG)MiPx0)(XBkQ9v))bO8_a~dech38n+8h>FhbK)i9T#a!o_+>#p1}+u<!dAn;jN4
z1;m@tWGQbRH9c7K0NR+A_g`nE=r*}WEWSdE?T&EBIl9+E5<@;KCR(e$+X^!gnk(6V
z8y$<i-W~qflzPSd5oJnWjgvlUYc6-Fz50a!-WfXcTo^(dxXx8xEG*}4g$Jc$N$kFH
zC5eFVdc}{;=ZR-dGsj<^@@0n<ZIHg6GDAVkB~&cjd->9pgdj{?yD}g=2`+I%YQmOq
zKXN=SGy7hoRl;augx}8OSenk7#DG$)E0XS|K2Z`8Q(*^8zA+3aHD^YMwL51=l`3@P
zwBobeIpKHF89K4m2dPD*!~EIt1_>fdU@545-XmBcWkkz$C<Mo-sRgQK1Zzj~&Xd6{
zdPK^xITd3iVZpM1ddpPq>g^{ad2>+YXO;A@)gs?0PQlx!WMMzy3ge}H<@w(FnxB&-
zDaaFB+8Vh-Q=kJAGH6))%4i`~`p%3DW8xO~kY89TZ2)_%WJkJ9OknoZ1L#PF$gFm(
z-=j%1Ai08&_8pO<hG4MN)4Q8ShUaZ}H@V(n$zpv_YTrYyp1_C;;wZ6ds}4^<Y7ehM
zj<KLIv6|i@`_@}RknlC=4X0pJGU$h}B6hGBHbXEUgxI3#2QFRu&m^Xfeux`^S>gzU
z9XGBc$%Ie&bXBQIOn0{vr}6DlEU8q2tsmc}Zr_2i3MY-MmfGC*9Y)G^&E9k+hmv0I
zP`S0@OI`&8*%=>d_{C9d=6GgzeC=a(MM5kSgUt{z13=W)yQc(78=0gn3o$Dkop*<E
z$kci9d~}pfcEI+pV)-FhZLNn^eV3$~Pm2r-TUm2K`fhUg6wi5dG4H^J%*3xH<v5(#
zA9-{g8S8@$f0{IK7_tvXjsxyh61b$G=AVjftHHSLlR19Ubh>kW#fHR?`RzL|Ji3?3
zn{K~T!Gkr)ftF6S$-?BfEKCRq&Wp|zFsc>w7A+Fbir3MIO9y`)gY2`0c&YG6@?@V>
zWjZo;SoK+hzaF$|V71vzf!fZcRYba7pZEF}nt(X5T$E4amG0LSE`v?e8F$Sk-7HQb
z{LQE}PiS)h9wzAA8G<QbGWl+*(t}-&eIIG~J`D<8Fbc&y74Ei|1ER^OA7m4!Oa~a<
zBuNabg>LTeGIxqq&Yr!|=VBZZ8;vw=lKT{sx-0cXUh|WZ5;vwzp?oJe)7|{l>FL6L
zseFNhO~&{56IN5|HnUmb`A;O=p)hjJS<@K?9?GgmOrqMoKz!qj@YYLY{2l(kEp~UN
zi2mb!qEauJl8b#ux^!=&CHym+OeF)8XjONE`HgUSOXg0^gvb|@thth3XK=VAX~*+P
zi}|03L*A@~8BxQ9k=w;EhlWnE$>iD=G4;mDVR~V|mKOu48=sS*Zors3_vm9fyjL2G
zChS4>hALJ3{<rP6v}24ZT(Em@66x7=I_r~43EXzRkT@M}<_$GUFX#z?FcSa+zF;UL
z-sKt6POV=9EAFP*M;1)rB*!=WFRFb*pl4cVM>u@k*|kG7CU2FnPTr`5NY+*mbBF$@
z*-6VZkKQN`Tr0&dNybfc(9&F@`c+98lV~B@@g71+&$02=9$0`A@{09Jng(A?D8bda
z;@_JUfH;cT)7wa4`y2}}B~IDmOz#_JZ%^SyP*w9JePLJ5(9M1g7{{;6_P#G+_nL^0
zGwOZc|EWies}{5T51XyP?2|>{T@Gt~m2NLBuGN(WsK9q942`gH7)mI77Hmuq1d^&H
zd5Btd0~i58kBfxN@7)nCLn|DEgbi9NW=)kwK3TH`(1p`7N3bFGZi<pihRWdVNQFk4
z&8~6380SH>lIQH+h0{#&Sfo`xfq54L1<z(C<+p?<sJD#c%fy3{NZV?o>TRF~P<D?q
zxaQ6ncxkEv?||U}dLJboEWW_{Rdj4Ls4F=%k_<9(^z!jADc0LdmYPpcruQ~jDmH$A
z(=d<Ak<j7};^@OEp+S3fOHCVa?~<%S*f;hCVyMKz8l&*k=Mx~GC@L;VE<hinTnd_d
z?XHq9u<e4mzy?A_U4V&q(S)vOg<u9^jt8~9t%sG`g|yZ4s>+PN<XHu>NP>X#v(tF9
zR2>2*AiByr1<k~n@TC-FJPFoqp8-pCCJpdQ#1-3RQyA$_0fXh6r(<JC+gmrMSe*0{
zeR|7tDR_`AIHU&;_m;cweLE;Mq^^_Ew(I&X%Y@W3>@EU?_0hv{m8oxFWWk|6^HkGj
z?ab8#6i*x__$FYhJAl3T_6t0^1XV(WNhT|>;$nC1IvU=1Y$uqSON-(ee`bF;Z2ZJ}
z>)Tmdx5BUSX^F9Vh&&gEnyy5!gNR=q>GSo&z@D@(M(Hbwru{N=%(5V9Bq5fx&+wq+
z7pv4&{M=YBvfHi5RtACP`+|?!c|xaov^;F!v4(ohnXXB-7*Ld?;iC?*qPD<1w_@y_
za3z<%dkyS;k#}1fyPMw!9oBP)9{7&vc@%{iP;-z)BwB03vMc&x)&`W*3yQyk+_7$x
zrRf9^?g#<YS=QdB(@2bHX;k+k(U28nrXyB95Cg+r&xUQ~KLdgRNkBGYX-1@7xJQ$y
z?V<%V=SvGKmbdzf)!yKu_&c5YX=26Q54v6q?Hp2avAg4Xq6w84z+<^bOALEIem@Gm
zlY!)*3FIg_maEbWiWWQw$%;!{<2__o_N8?i1sjI<+~v}TG-<(2sqDf~c}fSu9zEMg
z!mjQpRv7bJa&SK@M2zvpz?1B$7Yk^go&z1o!bzs-cT!=xsfn-v3Z$vJWqAV0TDlHI
zfPacRy$;rXqG44cCFjC!@9AOp;?@rfC5NW2_Z}(D9LQTB0@#cq(sV@coZTh%H5If)
zG&tL4rR1{@YJ*(&MufeVQ^RyJB#}^VD6L}K<NKPvO8(>AH3NYv`6-EY!%8?=n@&pR
zIfx)w7Ii1ZXX3Nw@cRI>mA1|z(kS@PO4@)(9}mie(~g>-4I%Q4@4sYLdkz?~QE%FH
zc3SVn?9w7O&J@Yhs~e6XFwQHU)OYzYyNIVA^Ifp=i&;`gnmKK9+wC<(5x6t&{TtEG
z_;zR4<)4d>^PZ&&elGsO&2;d?nRB9)qCSylSF`0@9^YL3B2LTM>b-!CqJj6wVM@-4
zm`bhRT#faWW`M`vvuIjX8mTUfu6V%C9o-{qcwfKk-CxkK2RbZZKE~l7Q{<(haK?Q3
z;m>~r9QruGB2q_7;mSdQN+%pN!oqBSy*BsfvA>@E|K5jjsAv?i>?@*F{0BNRL#3%h
zSKK>r{Krz{mltQ@{box3=Q2^^0B0Gre}4DRF(Hyax1-ab5)`{nUjx(M9#4Ar*u<{x
z3}6%TXN?-+U_yu10u^Aw`tYk1y6)O<kx1Cxz_XGzmFRy|_iEu@;#drg@Z~@L+^UNZ
zQca&l_&<3wfk2>_vWiwZ;gYgQD|yF2#<X<UKhL`Kq&3oWGG9eZSyA{vJGCviy#jCG
z)&jG{*!}-}s%7?Bgy#;`tAs$?ox9o6A0cUj!B;DB9FgFDS0z@@XWY`Ee!o|@d$ezZ
zCnwU-qR#A|d!IBuZVT01t|slIWmsh{&y{V#FSiGRpH*Al|4UkFQ|I#)bSYj(#Uvl~
zgaZMIFSCE7V$3>7eRuxRG@Kgm_iFXtM>I{TR%EdDOZoe0kAAPZ*bg9m=iE$n8U0W&
z-^Bi9fW#d96qD-Zb@WWg7UL3|9SM4N=^P!cuB-Ag?5C?9s%gsoQ2$;XS>)m~!jE=q
zE9YGJff*^D?}bdgUFKHO$v`Qdv&Ym5!ePl?*D0Xeb9Dc{CB^e?wLjPYry%~hBD=bL
zYD{uDD<#ibsB^K6S<9`BGEHkCSx_r2!Q|K7(}KE0;=1Ewhn>{BhyW5bW6ZIr*X0{)
z1uv365?Ne<FIk0TBsq$m&yYDW7LcAaOVn;3mp&U8`h6<vzHoK6_GY$BhTFZNzgPH=
zj+tMry{I_8IY9Re{GV3v8+comJ*;DhV5dn-<ng2i8ev2U(K|nys-|q(Ss;_@Q}jAE
z7P6i$*>Q+T0g<n5rQ)ANG5JrA3vCqcrSB|~6}%?;-pE5nXRwIsB3d~i`Kfb*m*xMK
zj{d2w38|mko7fz-e2ut$&4@`m6HZgUj?lQdwF15;0ZUKOVyOp1q(oK~E2F^s3V%z(
zWaKMJprW8oUHmsYML|9--NKtb<t|r#BmkREr~e<(4>k0TQHq7}xu};XuZzbIk+G%@
z=2GilQ?wPAlVeR!nXBI3o1A*_$JbrrosP+O9*|IK5<%m-xR<_BTP6c8{sEu1+(bH$
zWMP&>CE|vO)bEx5J@<$IE|-6?V(3RdCtl;`g_ul^k5+Z|X0IMiy6z0lABcJ$LC+5|
z6jk<@thA*Y)<|1Dj)pn4EaP5$+{Uo(cKwPhi?^{{7Mf9mz8uem#=fb2{HGYAzPcrQ
z3QyXgFP;`0@z2LP2X8p#8J|rY1*2=-b<gWny>54R_3mwXky1JLw|V1>9F|a9@&^5J
z=c4_7jTDB36Abz?v07v`($1{iB|-BrNU>tN{dj}TlPXa|F=oo%s(L`dt!ZqdQqjHd
z0(}Ej_4>*iuN-36{%21{)6n>i_3mX2x>ij&<>zrD0zu`zQ!`ynlS=t2BiU^$c>3+%
zJMNa89XHO7QF%N$N!cpb?!HB3XL{bWMs;gxfat)$U$_OOvH4BIedoXl`_DH2W=31O
zsMjJ+(T-y_5p>;2yvdqGp4-P85!<fVw(If8xjY^+GT$SouP!RSfDMPC<<uM+Sw)t%
zoxj{%x;wl0C(0zeiDs?ZNp@~!7Jphm^47S=`M`lE7gVQtButlV5mGU0AXz|Ow`YJ}
zoNeRvB8IRp#v4sK_tOO5vZ+$~6N$pURwX-gOOLf%50nlD$7t9@$?$L4{2<(^rhOP7
z8KO?-xqvM2*(<gm{iKR2?i66=NJb@kJ`<Xg<7!r&!o0j!K=wlG?B>9u8!7aPvaEsv
zD*@A&x68(<6a_ba#48|6ho<<F4S#$%u6}BT-@PFnE1AFGO)`lW;(%<uTlYDhPKs7>
zybhgYt-O1gMWPjTuJx95%WFPxCw5?PvGB%Dsgp1`G2v@fiaU3O>d!3a?gzY0d+>KJ
zDPUQTN&fsEUA?DvKB;&adw$tsR2P!sc~%^xby|=1Ieu|)no$>&;<2<xU7heNyj+D6
z3@a|RV0tAjY`K5#NS=~x2UVbL>U?8&44oq5h{sYCQdAcxFt3^oyh*j!<i~HGHs{CD
zP}R&(O19SjSMbPVHXv;!>aHze=<cBMc!1^@656>gnM9m86=CUd{K@%er_UTp>*6ur
zApB1vfM5Nh24A3v0Us%Hm+iZL^*~6~ttBIqPaHYT{wdCH77n`IsEDn;tqcPBh1g|-
zL<MlhXrU8qTR&1G+*xb#V)hmPsT-(W`vnVvmSDEhce?@1q^(7N%K>QiBij(-w}>g=
zTbOx*dJb(jN@t1c4OTZ*Ep5)7d^$W)t$&)Z#I2tPQ*J^hDM|+24le|C&IuWts66*H
ztu3S3zu434Tx3{w@*iy<`<*4<nRP(+3T?Oo&hIr?huK%)#M@n0#>WlW`FVOBD_t5|
zd9IhvjCE3P=ER_^OG3^sgjw2}!tGx9XB;l%+We`8&1lfDHg&Q7$$9XL&zzg3Fk3(Z
zBT$HZZp5)vR|AIqdk7hmh|qOEGUvVIaJH=ahIX4uR3{O37kU#+yIsFBM(Zm=tU%WC
z(?RF-XFg2I(OG0L$m!0{a+Wd#?}TB-{3Vt9*^zT;Az4$^hLiDsFwYl2M@;*Yh4ib)
zT=-h}5JZebO?QPlIj}ps!XHf~dcJ#$Gc@FHX#(F)NV(+Upyt8yezKF3SL1n)P1c{%
z3tGK@=jqLviN}9ZSZjn8b)17fpzPzZ<K93g#=2X~c01^<UIjNQ5JJlUew;^j$?D=U
z%OBIsML^lR%^8UK^;=6iW9K6yVf&%xHpHl^s4ESh-|&m5TSJ#;RRkaPaImD4Pt-l4
z4CYci&for>^7r;HF^8VJ?%fqj@hp>OLtS$-d!1o0k^7tfOl*0~54b}ixarBRt7*>y
z-jcMrN&>HvDR-xYLPX3blKL4%$)ADMm`tfnz7Zi-<f>a<F2X7MhPCb63}!!(Xfwr~
zwg+6@ogrWdPD4;*yGr~K9i9P<x3bjrX75biqKG<2kMho*+Q^yZ@v@2>!tO=@-Ljm&
z>|E#pM`!>(G`(m~)Qq7EHosC?MJ2n6T_;LVv+rqLP4-uscgZ%QmM772a|05${f?RF
zslmwscNzWK^o6tW(FFmbz&0B4p1dOYEX7SQV9bJs>ipyCRsQjMh&I!2Bo`gu@1PPT
zQgl7_JFdfQJ6hxJh`EJS&5{E~{+Ii%-6pymN2;QT7!%1w_qJc-#YsxIVs6@l>gAux
z0%#NouQsAJv~K!%w;7e>qI7Svi-zFhs%nEXbDL79xX(pQ#};TF^7C;Z`oHWIRMM03
z7h&s{1q<!BXDQ=|$R-DYTOYj95*`U?COB$VALYnF$U}==Sg_O{{7I|xS()DRo?i29
z3#Tt7Vq0#PiG;#bUd4b<ELIBsRbX2e>xylahGZs_u1msqJ<15{h{kC_d4~vxge!}z
z4EsF8A`K7t6PC_ir7dZT`4f(p6sZ7g#fAPM{}O(~UK8{-Fl;vVb2P*9-k<&Tf&H6s
zURr*Sd!9UO>`!P?<}JFD%s9JZ6kE6*Qgfj;U-~Bp0uL(kJoPPRz1^V6@gihzq-Ti7
zxL&O2Zpp^A(XM|ztNYT)=8>V~>rH971T<OOD#M==8k!8SE)s4txg8AHGY9+2X#{g>
z{#}!E9-lQNu1Y%~_Wma@aa9CGeWk$;f3YXa!YeRB+9N8e2)Q<rgeKT>{Oh^VfS4Un
z7L)~ro6u+FY|zIpqE4tmNv$VdR>KZx#1vo~romZyzZ3$J5_qs|{NBm%GWKb@_*oUN
z9yQ9c+yr5kSM}_#oCnx)_*|d7sJMZto^YXGNLHvNXtAMp+3>FouP!qB4>=u!Ioiv&
z$(C;O&Cbi;M#52;V^rm@s^O{@{C7C@JBqb)hOVc#?bY7H6L)+yck4%T+Z%m6ItJgR
zA15;ttbfIfSJB?%#=#p*3!UmwU1ek3q%WMOPn?*R7V^CH<7b`+vxW(YNA}!G=dq|B
zz;CmEAqrGG)rY?zi)^wxRvVH$SLD$kr|pzA@0(Q<ytJqCI(dJUL%Z{2QiiGJ3!Myn
zFzU*LuLA&qYB?$A<gwD2LSI0@zqzk(7nO=2SVjM<QahPv-P-IvUFfMb7{B7}uHHWd
z>YZOO=?W8s_XlK(dwy+y#8<}l;%^TUstSg`Tt8?x%Oyf{ZJj?T<iA7U=IF&9JK0+3
zN)TeC+3{j%g3#Bg>aU<GYyM6wZlu3ozHFo-J#plkkLGL5y?{)zSgFg4d&tnIdEejk
zioM@A>2l!BhTN&r{JTIaImAw&^ZR&d;#_%fyYN|i+mfb|T3<kj$j13Re=)|*z~tDS
zc2<({f*NNF`nMLNNrf3!jOV=u1<n>a<%qoYt+#v-A-rRbG1E2oC=|vvzFYOGarb?W
zi*V?75;>H)$`*exh|6l5Sno+I*g$~l!mhZ<ZKvvop){t@@>jXc`(J|0j|tKLjwT&&
z>W#ww20O-|_NDyQ|2`d{#uD8G1qH_X&@aU;$cJjxoJ6+=#w3ht!|t~1b&GPw#%tN{
z7m5Bil^|<c^GCSizgGgj)5U*k`F~dRz-oSFT2C=&z1Cw73;l+t`1yJ$tl}J_>XpaO
zxWz%ap1GF1wfc#`^M2D7&(#SXX#X=Q@IQ(e%ILUhRztP3Y;n3bWd9#v!2dFM{P!ZL
zH4xv%2QSG5SZJD-$@CACEOBAZE=ZqTPbJTLv2ZO$qVbadgpdC#qTrvDfdg%QEBwE@
zV*CBEVk*amxMzF_yZ8RwqSZ=;Yw=ZP8cKx&f=gO1V?OnTNJU{bu4<xR;SoT-YP!n4
z!URGzI2q=xOT40&7S_3h8?~1+^|WF^rWrN(0qBZZKk7sye}hj>AFuR&rR{l>oTS56
zcLmy`h+H@~xM!eDG*U>imw=6C)8q>6oNp>(UJ_wC1ykr9COmSbG6Md_O@j!pnE8?7
zEq6-tv9r&}S+XrR0URp{A)3q)qZ^xU%Y|*&?4Hii&U8^#w99V<Hz`wWF~044HX1Dd
z5x-xRGP18||3clh#D5@m9`goJ%f}IT)h&kjM_ANcYy)p4`Fr;r&vAWoD8Dp+mdw%C
zm~&w#@XVdzcQG%Uq3P^U^tF;IWMxiR3$vUv{gyV~?ATp8X{RyRI9koW8S9n;$yKe!
zJ!AiVyP8kblQBT(;7?@4#*O8vJ?xhJxc#6dH(H2!lE6yBKgHg$deC+5<2KD~a?8?Z
z<wm@^r8wU3ZGyTJR=$jhW(T*0er7=St_O2g5hYt5hh5VkbV#%N$qd}M>;hk2faZKV
z)pdOUF{@MD6((f0y;3H6H{>kdbu}x(XnHi{^BK9VoY2wsNyO9-$K|)w1%E1t9vxYX
z$MM0`M?HH=h}k#oljXJJ14XX~zPvP!Rvrft+BaruhoA6l6_*76w^Mw7fcLy#C;a5(
zs1A3w@9|vhC9MwjZWr1sf}HJ?)&l#!9;aiYyjci<FNQq#J9ZIJg*oe9g1WY9OJ5h)
z!wITeGaipbB7QMXVPAn)Rb1k@S3lq|icSD;gbEG8?JhPZ*KT9AodEc)a#2#WLyYrP
z`&r-g&eScX%ia8h7RfN^;`}|A)jx;?m<(juDo#E#n$kR)1Ud}02~9ib4x*!yip4nI
zUtsT0Qak&quixA9_+-mxJsBmH=+z0H=n9!6AshmL4G@p?vy`v`m8cab&sL~>j>oyq
zx6;k1J6ui@rmI~?&!&+PVNDJ^qs$yW7#nSnl-uB!m3opXSrOWUMuh0sW(VirI!oOP
zDOW=b$C~`?Y|E^rnha>MsPL$s(m~yvdC~0QQMqCiK%)JBRQJ|#QFdM1u!4dpA)wNs
zbaxLW(j^8R(jXuVT>~Q0HImXQrP2-3Idlw4hYa8d2;&Uh?>WYGU-uQi=l!1VpYQWe
zXV|m%I`>{{9mhKMI(-j5zvPp=zRw+C;=l%mm`ZrC{EKt52C}8x*Zl62j_B_PN}a)B
z1M#4{lL3yrsor`Z$6bk~ixj5Z2jx@l>6x4{!i3A+MjOy=GNCkai?$F>vEB5|K-OMl
zQFl6357(1szEu+$uXfEO%aS1Gc_kx9ied3F4RnSZiFQfwsxz%0Vka&^Sm5T%YY6V9
z<H#gD2BNfw<HlkV-3oD7F#qV9qifXga?<@utg+q(HM%3_RO#;xEnjKc!e-L0(e`C-
z10ve@h7@e5s7pmXgVlzG@mzv6Yqf^*iXyzu+??59?D+6H)Ku-WZPL{Yua&`GpH272
z6LN6qf_WTTm1+nG+USAcuufYu83Bpi`Qg*VM%nvo-h8n!wAC(o0j@D0Nv|Lcoq_!4
zvm|RlUDmY48s#r6pdetgQ?6Qnx1g?o)^&EZSD=q5p$8IzqF)K@Xwqhd@k66{>Kz+<
z$kf?Z1~2XGGikleEnhQS@0!`H@PdjWvTA^k-YK;KdF0ws{;mj+G^*Uy9@PlO>soiJ
zu<l~Fcq|IU>E$$moZP_ft_fZEqcxTZY1fapdQV*9aAVnLCB|vMb3dqCB_Kak8DQAc
z?SuMtx31>LX*Yd;zf2huxYp_DHjp%HD-kd}R>}rUslT{R4bxQRDsx&C#bwXkxPMR8
z=g^*Mlb~`~!V5zKUL{&khkrQ;4gl#H;K5~>_QYgQ6K_l`{(1;=M9V<pL5fX7d!>&z
z+02My8{&h$(LM|P#}L;?COb5nsAudkNR94r!iy(8M;$dD{>sbewPK12wNGP~S0_%A
zh<3EFOUcd`3r2y&?eVRp=8vv938t2@0wUMR<rLs8yh6~I6x7!!lPOxx#+F|?FnUcP
zKYOoU*L@%D4=>9ph#yrF;`LXe_KLO`(FdKpafq*5cdmE{cXNs%mLd7_R#x<a0CYLZ
zY$*B=O;4#3eS>Hf3qmqF*l*$a!KC-JP07!Iq+j*4fOrJ`$U#TIx^qR^?v>DEje0l2
z4lN0!rfj>hvEM|FDpLqvE|u<tT*c;fp+`AAMNMyBsAUS{)(`Ufu}o;pwR!7zBq*&_
z<mi>S(`E#I;z1X6*f{$d>Pmy9fg^Xh&9;|(??Ur>Kqj^D*5}Be&dy?k$NN(~f|;7L
zdh7*Jr7c3M;eravbJKQPP6=lRy!R+Jec#OL=PEd?+2_BDzGH#2>Ak)3ISmRl^^mF;
z(>Z<K0JuJN+vgs)`7zbUYK6KQ9KbajXu<rUun%N5m4am_@r=P|<*3)Icl>OWR8bOA
zZ<?|WWYGZMU19a*afG$*rM^@UKdvg0lqtw_J_2c+OsRBQU>xBLV`pe6xGQ>m3X)vy
z0PY#w<e(_zBY#paTe}qDN5VQEV8ot%rL;*p9C;bnXY)A?c}En#R=Q)8W%dF|<c~}p
z`x{utZK_#cp0%Cp##h=aCi(QG<j@CCuWTIPI((=5g{?Ws!>TO@8zx$hzIu<Cn>%sD
z1mi{VC4sX^gjUF_|5V9%29SRH0tI_1p^1faFa}*AoCARK(mJhJz0BkvO{^bxO9_V@
z=5y++NSG@%t)MOKaQArq`wF!RA)S3u(`k3F08sB=>v@~%#~U+!pSP~Uf^lv&wGkg_
z@zm~Ir4ByR>+oSkf9mkl9SvUeX?52}VIVTL%fTlx{i%a&-X#4@>9>_6v>K5SR@>n(
zDBJDq1nzZ`NYKRBq2huXzo*S`mI|Ca$tlko0QzAN5m8^EBm{J2LcSVh?=E*gm$bu!
zD~XKkHlXw*Qgkj;^XU$S>ujuV25{FU_p+40xC*p@N`MdIr~}eodY=*^1oiiNcVAdg
z0LT3T6pslqOHl_as~_Vbkv0?Bpu$}pivCQAyngT`Eok}7h~A$!wa<6GgYP4y3U!5X
zfgk-spyd1gY$e{9hf+3Uv=S+#4<yrO4?DT91in96zGgp>cy9~vt-XZH^2*s{i9$>P
zHjEq$gy%Db{Yj=K@tAr9Hi!Ke*i)n!vLB4ehmp4VPdiA4tMHXp_I~oEaVAB5K;2B5
zaA%#C4ig?=zP0Y%Du+2`UE|MG-x=sneBf)6kN^+~sX@6N-GgFxB+`OF$DNh&FD#TU
z2*gF-Hd4L`-j~kT%5d4uQ#~mgGTdBu=SEAvb_-e{RXXN@Y#vbhoqf1kY4ug(4KD_g
z6%a}loF@p}QkDw)S1!QnGFMaEHYY}*{Tg#X9fhf5Z|;j$@Bp54tcK>TxugmG0>OLB
z&TkYH=$Q}&$bJ!MV*n;bJ6`Eh({V}Aa#)uM^11jBNhXzwmoza`tnfLi!DY(sPLx6I
zyA;Sxco9e57Xo<Cy-|)Ljhoifd}F&WO;wiq2$_|hj#^UBXLQ(AXdZOQO@B1MgUeKC
zYTXO4P-K(`ul-GMRhzbyDa41WbcLXCExR3k0!q!35ea8h^=~>l_-q3?XO^!`+&9vo
zQvk^L2f!-wV|?42f?J~Lp`kH&V_%m8+Ux{(h9n6pL`N4v(HYrUd=qlFqAwDeXfUIK
z2tZ%9uOLPXi*(%RCnDQg{U0$pEt8oWC%CEa-Cny9W7#4z_14fVyvjlCoeyv<9{@cZ
za>|ccACbL`k2L=r-~=B2;Ar70`)Y_^FycJR?7|&n9>Xz6HICFmR$7l1xffAavfuQ&
z=xdov$LiP5jYS+q8@b!QY;3^M!1f{$g1U=;U^*MvFOQ{^3_BQ_{wiQIoPY5?gRYrv
zN1u=Tyg<dOnR4=1X8y`vuY?60n+yAB4MAw8!oP6{*tRDI7MUbXFFZ4LoL|ZZaKEgv
z7`uCL)8g>e$s=uEpk>3SWl8TqA?S7}hUrUIKM6HaqAy<j`pVYm+bT<q5DVd(4!Gez
zPm~SzkpY4U(?>P49F`has_AWFXL`<gm=8R5&{HBgHqi%~agvU`UZrfEO%Li!qy@tz
zM34fKkF9%cdxW4GM!V~mVrXa{qOSbm$FW^N9M4InIJ&?1ph!)rqEx;y$|t=4PDuFs
zyQqXQIn2DUq(F2dK3@D`Yg31GoQHd~?pB~QZf{fA@Yk&Vru?3-cFu~^B6%M{fC&L2
zQs86sz3H#F7HpLw_p?!XnmT#25z1lbJ>CGFIXR#M8R!lQv&Z$`$R?7@?V5tnDNf0d
zq{Cq{`~a8%PJ4LHNy!Mh?~;s=$^Ky7*AG&h3x0&m4j(t9Bl?|VvXlAJ(*_1^&Pb%h
zvX9fdW%wdwl0z04sd|QP{u^ik2+VDIAx<lzC)6>t9n6^E+i`*pl;YhVg+=f42kQ;X
z-7vwmHy76Wy2>^?pUn|n1gm1W;>&7qzvDwL@EFeOiDjf>n5mN-;cY!e^TgQ+3dY(_
z0wS7FAX5SODIB}WDL1LqaYfTK)m*TR^$>|vjC+2hYf)AZ_Xj*UQI2CB5cP#0aL^Qz
z6lGfyjprl;jN}Aaef(I=)JWkVy;WlC8ovIHCqEYJEP>5pzidglpY7N2VM$aQ6N+L>
zhd)9%*5ItqQC8nF)yAvnp;i&&UZqtn;KP{MUZ#yPfiE_<M_&N$TPWTP06K9<=WI?T
zO08Yb&Y6F!Qh945?cU=oz;gj+SZ3HGm=O*`UEj0MC*Df)6UhwNg)q^#uw_gn1PuVh
z(w@ZnRGdcKvLR+%s87zO5XhgG#HY?a1CCB4c6MTJ;W{SMPpDGirOqR!uv1n^fOK`p
zKgde+xwAwfE5o6z)MU*lmHujRPE!)`Dsu^2K|O;xn~}s?uYN|)z)~{&?p%!(rKz>?
zKii<Vuy|tfONwB0NA_$P?H#2#e!1O`><_ppkmS%YSVPcG6_FFvdPd2gpZ#6own9Xf
zGPS*^V3+W91f0EKB2^@|X6t4HCA_D0Pa3oAmcU_s9Eh|mWk#h`X9WpMd`ZLEUQU-)
zgTHy*>JUxHL@NwZ^&4eLc_++GwfI@uIJl3MD%uJo+Z`%G740d|6|1CWre&u1n_Xm4
zGOm!kOi9HUH*n{^XU!PL{h(FKN4arzKn!bH7JBpIiPTrYPf3!NsD5gpLOoq#AE{Xj
zjT5@7PC~2>?jw_(iTsMORaDl^oSA~^be~V=NL_*EAP%uYO;AaTv9ff?IbUrobBC1_
z#Is>W`NZyo1#2qiy+{S+=UJUniHKv%8?Gx-kyJUsPqnU4pUiSO;<80YG~kiybPE$7
zWc#M=P2z3Z3qLw>L2S{Bs&${hiie|(r<yEEgJX*X*fm}wjc2}xtxnH|z17Z$BE>L}
z2ul2B#tabi8GCbG%oTouXO&g00SqtuBJ1-5T+Y7Kp2L+7!(ACT)Ui3|LhqwtI^zc~
zht9^`tBf#z4VrVWzj1c-#G;sK^~(lz<dk_+F~dCXZweB-*%FDsv#`t&;#b`J*|9jy
z3!c$)U?)wmGZC{?sUce)S3r8jG>-~ih1(^pNWm=d!13LpgcHc3xqc~vT#Go?);=jI
zId*Txt&ov+P|)hjK71>du>GjvH*?8$g4ilp>bgNiMUrXl;!?!+|Co!y)~K9LZu&9u
zW5Uq|<RMzplG>C!%)IMgQ_=0=G7cYo;<5;>##to?G#F!BVCOA-x#`GG^C3Nixf_Iz
zt_-*zAOFUm4My&5sbQ?4->b-Xfx4^Bi_hN4R*VY2<uuZB-k&(oMGE+^E3ve^xE$3n
z|4s7=QW^mbw7uspS>}8h8ou?<@(BYAPWCYcCpRuV4ZZ4}rU6!S8~lD;et!)yBq-f=
zfcu&arNB~f6S?DqtW0m`I@G&9Kf#An`87F3FmHaRX$Wr_m(6A9lksgEo36V1rkXXT
zwRFFkd_a(K>&hTQw%>FcT3g1K*#Nn-t6qx-_V1Mo{mldU8k$daA*7r^Qq6ngTR``V
z8q)t(8{oZ^iHZUiF_@LD_&@7XUQJ{-a2RwRZ6LK-O7+aFr@Z4b#fT6G?Q#~5Lm>Mz
zxiM|oyLc5cZS_0bD3pxKX`h_h2J?S2RfPQhHz?2_NYkWcqyCvF{Y9Q)Io<|&KU9rk
zg->i#^Xl4ta@_p_XH#OthjzIk=?O>lq=+&;HUA^(pWO{#5axc=`n8*XX@2_7b=ZPQ
zOKDzDZB_n*)acI)qI^d~%Q~vXN17hEklCw(Fl~5}WjDh!p~?B2suju5kl#&f3SJyl
zt;;V9y|wUD-bLE`56kOT8Z0)ar4jK8xbbJ|`IpY-KZ=_l#Ye9>|3P~+p^yfJhP;W#
zfFQa(=Zki}R`U*`__|w!eiW=;5c>M}=?S-DT#uR?n9c0FN^-oh4l}Tdu&tO9qBcR#
z^7?IB7W|;8v8YEFs4qQpo%dK!K+Mql2!wNq{80N~H9iIwZiFo|Xiy6ld3>oagfIWz
z5LOW|i~W^>9Oz`o7(E%Erpzyz{Ff{XOGh`fMD4^%In|n9J;fwUuq7cvV5LN{slcIm
zVf(W*@ned`bx68$xxpvaj$?y2rtVK07gbFgssb%H$C^YIVc8zw#@Wci3uq}Ff0<vD
zBU^@tDhS|wL{=IPJR4YNaw|_hFql2o`dxjf=oF%jP^YUntlDc+W*hdaLWZg(Xk!a!
zfU4sK3w@rPvFE2M*Dqqhwtg5w6WQjE89ffa#s+bvARmA5MfNY?qjP(iu$%E`-4}vh
z?C6ax)RuL(c8PwWOixw5^JV=449E-r+v44=?;VOMo^QP}Z3$`}pK%V<Ini0f4`oQ}
z6+t`=$^fWQblDWL2=s~`(D}oeo%Vc<Nr%uyrmi*gNJ`nX)AHsUze|At#fRH?;zD27
z-2U>@<96>y+dz>wACty2*2{+(lr`)7@lCAg001pkGXY{5#d`#S-@EoW&Hab@Z#`+m
zQV@4ekJP4aX*an(PW}OIs$k+TOX_oGZ}=YMlyaJ5g$MGfGJEqBBH-M^Tfe!`xvc@3
zWY3snV^8<v5p@(kI`2#37v~9b;l`uNmysXPc!gbOM=5*HEW>R{D(Z)@av~Q(I#9sT
ze!N@UrS?&qrSWJ~%gHmw#+ImGHPEQ=7mI^prd*d$s(%zi6Kxr5XU%K9H+ICdPR+c_
zSrR51k%BoeW^)aWOrP&i@8!3Oh?@$w#URTFTcsGFIG?s>2+1Vk8um|na!7@6W1GE}
z8wNBb7*+`J`Cb-sgok-K#usi&EzC|Yvk>mxooW*aV+inHne5TI!Z!Rm&13RgZ#Yc_
zQXS1`m5(H*7gTqhqm@GSr3bI~j1^`d-@4@RVEOeg<`slnfA)f{JBqhn19L{b{^Xi-
z%?2-87#EM3N%eVVz^F*C9qxf*vjC2~_(6A1^^xh!w+g*mrwrwvZ`TihU&H%RmjNnD
zz4x+$mfPl^Z+%A<J_%$$;N`;-S(DF=Hsi8=*T-^oB#$!O*fJH>OUv#7ZGiXJ&T75r
zKf8YXLQTJi^Kzb$GPvzg+WE0U`}~j*zp%R&w}qh3=NsPDUn76Nn*Z|UjZ2SO-O29R
z-GpAnG5*Y2#VppVX};N<_9<)1ObGccYsYs=XyFL7;O3b(swUJDn`;Ii@c(kkP+I5n
zc%{}GHq$B}yhK$RNXg(eeSm&;g^w&ThhE^kO~<sNam!;N5e=wvGKN;=37GNt)Ydcx
zAnVj7JV{L4UtVeOnNC0_Dw~C;=s%{S=zh)!A>2TYl|ruWG4PkIoj@l2=MQeUBtY<K
zJvBg&yB<Dsz4o6W8*x)Nmt0O)8z5(lgCKBm@#{zxexOL}dxx$9&;l|#2`L8DoO4nd
zv9l(?kH7wQdpzk%Ng6E)HE42fxKltJ%2fnn4cHG6%bc``9NW(+2v|fEXa#mjxYx>j
ze{lS@zXd#F#t?8yL;-tRVb*fGd9l+U=3_Nt-XBEy{QGTl8_>}n!&U7J_rfdMQc%5l
zS2K=M2~J$-QPy!ijlXxnHGJ3V>IpBtJKg59j_VssF}0K3U74b2PO(dQmdi&@Br+R#
zZ;wNH)D0&Vw(J^rh#TsI;$jN*CcCKL%Rd8EJja!Aj39mJ*=&Jn+MI^0e_xsfCM_Pa
zH@u%j^L5GF4MKU9Y<LX^H}Ko=<Bo4`gb~hZhjmewN$~_1Y4MxQ+y|YWT($4lxn@q`
z@K*GucnY@<etb&E(P8J=#+R9IU)L^4U2tKP2vxmwdPV@~z&;@|_fB<JE-pF7Y?JHH
zCiI`B4qKvviA()kkmwYX2CIIZC>Pqj!*MUfd0d*aGpD4w*lw*DvgH>nC^@YUC#R8_
zVEU=o?a!|{aT7BazNorVMN=d`7RxWu<$PD~Erk)Ptl?;A`SUzJgHEqyAt3nkhrOu(
z7TWORsX92k#UsG<+)KudV4Xuhy<)4}=iwzwcJ^vpR@1EA)5DX%Jn)pyYkFOdLe%4M
z*jV!L3=PXNK9E)3KGL|lbrA1>0*YnFA70Tal1_Y_d<VUN>#X`-`QuF^pbQBYy}xXG
zWyIz}jBYZpWKkZxr|zfj#I(V<5#-1*{&THUk9OJ|wqJ@A(vmLdlo{52@uj(VVACrF
z^=FkHBAVeCBW5OXnS3(QR0tmhJJn6;ZVC}EuTq$pL|9sXEU+HwkYhD@GUU{*YKw3E
z8o2EFaSPFvVmv+gici>Gz{}|yc;~%;j4i7?y#dfXZnEm-g-OVnvlKGUvTUX~jNU+1
zVeU~iJEW;Qc4oTx9$Cv+7Jg^LH#g|f03@;K-n4(zdyL2v<!5fa@m`E$7H{m5K23NA
z>%_JNSy}Xw+ycvp#|Gt;Y9E(kw5;*o-lzAO-(Y`qB+(+ffODn(e9>Tot`F=50b9%E
z$8q))Nu}rk<{l>X`5=ub6XvSb00a@f>!^7Lmp$v5-3K>zuxMi@TZVsZB@Y^O%_%4(
z^n-6(jW;Wo?H5tiVqrbf<rjG%{P^aSh})+OQy!f|*G^%niPrJSbZk{eY#9($(CNzr
zgak5eMw3%8Lhda7^ej)7YA=Rv*7lIfQ)CKzf`Mm!i;NHulhf;jIRY93l^Kl}**tRT
zq{4;<eKYN8H?+p<W$T(TkQfh9cU(N(LpWxLVW{(7KQ~e)&d_`%&6XN?Aapkhv&8-F
z@#ACrrQVx%9v$twFtV9K)LCLbk+f`hPOT_66ercp2IPH;N|MkWs|#)N+aticyK$>!
z?%Ilzx#ksoOwZDAkw5Z02jTX&oi{^Qhvi;u6g$dxmM?RB*cFR<UqLke;<Ir@Vr_^B
z4{c8f1(oiB7UX73v37Xo721wn#Tw<x;ujln0c3@y<!arnwc6RcCudF}a3{PUs~~1}
zsy-XiwPqzRiXkl=5s!5=vw65Rpw9X*&*_eMtkh<GOaDxFbyT0{%hU@#R={qXMaa`V
z;zH*eIU!5(vLAH?S-DQh>OD0%uW`CRr*w%89>?Z6b2bv05I#6{qOkCIi}y|4`1oB#
zW=fzMEB%Ie3b+OUtx3HlYxGo<$qvv872@{likxMJt(RG|KnM%*1JXGw<HT<`I<=8n
zHsqWPa_7;h+`;dv(b;^rP0Y+?y6W^K{2`uf<tl9tW?V1PlW~HoA$6T{%zmnS4Zqa>
z;Chc@7!T>N$R6kkzu~&bhQ@KI$b(>D^6zx<F>lEwOJqGPPwwd-Qr2)4)E;Bf3HJ?>
z%1B1oE6Soo9;9E*P)~mB!3f4V@tVem?|b5uM}jvJ!?<D*d&7-AbAE%K%IgCkxo9Kj
z)PpDGgKgdR)F*tOm?`V*{)!^NA5W!yYpGnxoS}r1%vm(#EX>E&IcMT~7gwKlCmkUh
z3d6Ge=H?2OvzpY1HH(qE9@nhJJRc`A#$lb`oMPkqkkd@P$9dVEvAC)XYb{%x6i>h^
zEr<{{{;@ssVccr{6I=MxUL0DLX`*y5(yr>7P|jOztw!-{z-<X#Kp<K*c<o>cr;18G
zCn;}J9104G+fbP$Z}ZwrT9QIhZ^Oz&;_;{Sl`E6|7{|<u9)o%30X5hiNuF#9pDh;(
zo2G;LRNP)Q_L;{n)CWVKQ~Y@9y~dJRcreQ{nWwmT7r;A_Zsnnmp}>4SU_n;b!`3@a
zqWzIsdNo`5wU;dUYN(Br@9)VCxl0Xm0`B0R7p+RwTH1Pdx?^?mOnzUV2rAYpy(-QY
z5vt18nXJc9;>T#~3%LyVRp6UzcqAj{eoUe^I4|ff(>PWru261Pc8@-a5+WH#FYe3(
zs-c9o%E8UP9_AZ?c_r3twwRi-{1Ds-x)ZM~j#f3-!yEX^u#4gN-@gY&x!JQhM-^4j
zUDp1Wm>c0obkaeD>1+y4F_q2Zj5e^$AfQ#=^lOz@g)RB>dv9yY?H~dOP*8n=BOka=
zie88S<;}pFv3-6UnJKkkCRM6f*>!{(?To|NjFpSOPifv+Z18h()q1c$eCV#;T9I4v
z!{4E@(?@W|xns<t;l<og|2ZNl*QRge&ZNO^x?YXAs4gqHX<kKgPr$C^fX&E^;BP4s
zP=2?<w-Bdnn<;8x=({|uG%sI1Y8@Wm+KyyK$k@^cBoSMdt+74(z%@ssrwHUPbvBb!
zVZ+@%nSm>sXr=VP9M0$^d)zW1i<QWusWI3H@0MJiJLN`Y;GtJB!sA|%3uIji*2Sb0
z>pGLXA+T}ln+4G)M!@~A#p3psFl9|<o}Njs!yRTt_0KYA)Kxl&>CZ-_-*I?8QGXUx
z%lwIH4O$G`&9P7Cz*Jte6MCiu$bPU#2smm0T>Z5aKxp)9SZPJ^MA0IX=O#t3sj<F&
z4$Mfp*SiI}U|OjF;_6$alXEn&!D-WCUcD(PG&kg=`p?e{xEz2T8p=rWZl|4h{I1h&
z7v6j8AwNe2<VL?{U=?vRF*QCtwRL$iEv-4}VmWyb&wEh%(~0l<6Owsdn?c((y15hH
z1MX6_EdP9V01eNtN0&Q(r9oeznxL)z*Sh}raU!y1;MNTGYcAWHpVkZ6>#xRt<k{KW
zSi76Doz~@1oM9$KE^PSM`2kJ_xZ{IOd4O~RK*7H(^z&<xDrK%rsypG5R=}kw`DvTT
zW5@H;(XU2<`zinRz3JzljepKeL<Ji5m*F1X`QJREzs{4C0B=72LEKw~8sMT;u2X)`
z@Qqi4m#B-m!S|1!9MK`>Cz6Pf)@YE{0=-$O>ET8}o8@5?_wQ}=0rBPCCcv|ro|SPP
zM`X9G&yRNb9XO`>cCPDU=1~TB{>b@Gw!N|&d@aoup0Q^5YuR`=!7b0E{@5rWMRF-K
zsg*^>&CfZ_*;J;bR!HLivfaNemP_4_!M#puLLpYBne+HVL%_HBO1rpHy~d2w#H}-~
zOMLIKfeCnqFY+w4Bqmmx=1TnZLYB5k=ckrt97q$02-%&yb-F}EJReg4hY}uzI_}M6
zLOop}h+VDo5TmAuligir)Rg|P(N-`gWs3bS;ER7aMu7uZc_aU6s@Q1*=Mx~xi_O*u
zl~;ch><1(u*C(cbCeNLBP{G4Zo2)oavvFp}9tGOANo&~6{jc7fg;oU*A9A!C<s7)5
zs^*=Hd(h_>=$WY@J%q=1_QV}Jn(&9w#GzY!b!C-XfphNKETG{ae*i<wt>vC%f?|%k
ziCr1*Bhn0iY-wyrucF)SLKzS*1E>Z{csKCO`n!?A;bJC9m)oF_^HtLaSlsgC%-Z9-
z0{35vBl}$$4az%*cD*p}r%7U17jn5hinFBpXPjOAVz2$7kwj`Ukuf%43BF!%VR7ec
zkgeI3>F2ki9Gr;${dtK)jh@s-`MpEIb97yVUOYF#cZF~4!LIN&s(}1E;b^D?GP0$7
ziT=se-f<X^ymHPXp-gk&;YPGh<K%Z16Wj8M+U6zGTLW_6XUMe(F}!bny8{Az*H0o7
z9$NIC+eQY(xV_9cqF+`<wg#kHjO4%Q+vF?983@h$r_;@W9nFowpFYj?E$OgjTnB)d
zc$Q^P23pIHj&j-L4p_7JHTK|DEYqpXBSSZa8?ElK@NUS#v&uZuJ_z`C_INj6yb)R2
zxFSQLOl=C<tgXxTT}&z&Lf>=KW$@cuB?mC2z6m*%^h0chiA57Yx?K9wzck(7qH6>O
zq&kN|nzhqIJ4D;~lTH|jc3mA)ob1@tSgk-<AWC?mdDaD`ji^Uv@@-C(wn$|WsHF#v
zcK+UWK>l#*+n2-lplgs2AJb4^b`_9?OfojUMHm_>Tz~QCGtWObmOfwxU@{kOVo8i2
zF9=u*dRPVq?PEkA)XIRlqUJdoS5DcMfXM$r%Pu{0H+n)JC?Ki@>oOK%z3!To{Fd~O
zRDIcW;zfV)*G-38z#wEWwGOXlzxPocs<YlvPMlmM1Gk1nk7&_~6SFrM+&jJI#)Vp{
zRk@&_5NXs>)mPf;u^W+4w8g9Y4pJO+h;cSHo(`f798g8W`+3d4^!g|H#O3$8yw(N{
z*PXv@26v=(>fAKO{yng?Pdy;fS6B$&2!N&Gv*X(e6vY}cd2~I%t)t0mzTxJB@}q!5
zMZJdJXA}#kIqO$|B51JpI-RE*<ig2i2p-J-Vuj)F2SVn#I@jRU_-VJHJ4e^JtNKJ{
zND`_cxC!}Aisn6{y>|AwyMb%5K(nz=`hHtrfzvCxmzbU{w4|k&TLPXkaAyYUMT)h|
zzao7sWZSJAC_0x**W<?9GDLKh>Z8wS(C6^|1@rYQ0G7B4-@DTrRCELI1@9ADQry8G
z7gg|nB;WqcON+LdnXAVDwh*<w+IIOx?G`C4dvViktEB_j(>7&+!%KX6?JIIXhdje_
zeLqqlorMEgvoiLsNaJaDdP7|mu={aIR@G|hij)q5zT7WuMrDvq$3jRnJp~vgei`)A
zMpq_Fv2A-Wr~=5a4K!^8pI)~{U`LA3kjdW|$h{R9VXF>;T=L9vflnnZq~IFxlvJ1B
z&-n=wws9xfo92W!<OF?sp5U;CYJre&@6oQ5x3ubKZI3a<B%p^=sqSA#+3I?40zZij
z<o^l~fZr!2F))X^>^O~a5;G6sb3qBo#-^?Mops^T$Hz2J`lqnT&e|TZXl$VVM6lWt
z9Urtv<X`J!#jzoGi$1NY^|r#C)%ljT*2QOqtPS$n)j}8&z0dO0gp9iSJ{q4+99<a;
zAJ*KLBKv8sh176L(2OIRio(ofMGm1qk~MiK42BmcEm-VCE?8V$?^p&5yl(mJ0axx3
z&?bp@m4t<koTV@x$ez@Ir(4kEVr0O!U<78g#}^Mx;&MPbY<ibrj2yumW3|qo#?pYP
z7CM%0^0(q8{&WEc1LTR|3kDL)e$zpn1fAgR)wd+uRHx_F0a_`#;d&_e`MOvi<atfm
zk0oM{vWf7?VhsM$Dv4I)*&#ZbkypL6MVRGu#jO?}h%vvAfXS+@(31$~rp==Lu2r$F
zvjapPW3$j7K(>OIc_sm91XDTg`;ZIzJP)O_*z0)uTI%_jgkw|3=95<M`Rii3v4x+4
ztn%Xp)jd{akDIn&!xtyU`Xr%_dL}jpNg0_3M720RdrVLGp#g~bZL?#UX8X0JB-HHD
zhc;HD&W>bC9g%cy{0c+sqblItD^|Ud(69TIyq*bHK60Vt5FoNFEx)CYFis==gMr7Q
zdqceS+Z9jxnHC%A-w5H~+fc0CMqw8C_8m&?lZ$X8^zD~D@pZG|Tu}1T<~&Xw(e2oa
zdwmw#({T@V%^DE;JwZjfQ!;Zws?SK&KUC*-s8N%%XjucE0ob~4_-vm&4QQ8w9uH&u
zK*CW*j8|)mw%0xty0#Re^!Bp@mh);kTiAf98pDXkuBTva>{Pje2hNshTI%N&7Y*-+
zmq^7%iOtvz0zUN@loi!B3)p|qcq&=g={6uEPTz3)(mSg@tdpx$SnzZ|Q#sgj>&|bE
z4_JFRKFbJ)H-0=D`;Y{HP}4bwZVh!~a$udnwYZm2w(4X;364f>K30Wf#H`@iqxPa)
zNT03F4^R7iyHvS16355@IFwd&D$kdCHH2D%Aa$q^e8%!yownb472HLq9vgXqf~9LJ
zWp*pQ5bG!(VS0X-yVl8RK{s(DVS%R&jHp}h9Sb~}0j|wj0$N3M+(N!dr$S#=)ngH*
zH@~Ay2jKf}`k#Z0I_dn@d`g<jpH&~7E_lKO81qgRAyo#YH`vqa^1HRFZ-3ZRJA9jK
zmg4^4W9wyRS0e-E;?M0(xT>=4JVf*Mt>%veg9ucouh0&-1-djj+;;(;K*q(B#o2vO
z&`)>d=MLZJwj|4*g|t*3pXuYr7q3i)AZY>1)MPG59aT6K)^IX)U&>wDz3v!;o7(Jr
zkJzfZZ<hnq*4m(Q_;o`W(F{523}dKp06k7+dA84ub(p_)z0cw4^Ap5kM`(AT|DM3!
zNfb#|w4VNxJB??i3xu(Q<iF7|3!bkZEx*hqfS?D6n+MHfZTw8V<P=orTq~suFM=vE
z3q12{IR&^@Q3(=@-_&Q{CkNW8kN$WT%g#Dtn?azRXI!(KuZd(4M+FURjEDG^{EuuO
zlF=~f(tvrX7#{`d=0K)TVA?L|B39$$j+wa|N<?I2<|)Jp)z^WeK*09QSIFa#>)|5E
zF2XOn+3PT5Qz}?*jR03|4WGpR1Hd~P({dTiimmG9xF~^nkEI`c6srN{qpEq{V-uU0
zvQH;p-lp%#A+B23N}cc7B+m#8<l};zFYFONnM{>zjm1M;vQ_Lau(pn;BLq0vhsTp#
zEgsqS4#Ec?p_vdv-Bj6_c$qg}B5>B_Zk=kUKC*f<A{BgmmgT)^on)CWeWJc6J79H{
zTxE8>XZKSj@5$>1J^CY8+Kuxh6O{)y9$Fh+&jL;fb7o*SDY}9w490MLc(e9$udc3N
zbP{dP<6qQSdI1`LgVxM_X!M~u8?B!l$hYf7Te;6~3g{<_`J`H#)y_F+Xii~S;CWM=
z;wQWlfsfqR&okfxK*d9*Gj@AlRZC58;5b*%s&rcMw=B}Bcv1?;x0c-646(vYYTsC5
zb~S6tj<zb~B9E4#8`hb0I#UP+MY5N?E5CmefHR6@LreBBhHo-)Gv3ACDRiP&V$0)!
z*!2s`u8-oamr$b`Px9rxvwp!H;7UF=KK%#lX<iyq`~ztIgoN*3gtA`Nw;%G=6Oz)N
zRLYpQcbN&+h8;nU*!PGZ=~7QwxANj-O%6&Ao{HC0#wW9VuTadKIQc%;;_#v8cl4uB
z96nw4^~#<^nPz@zUI;jm0Jj7{8XAYSfUW-vgcU!W&6a#5ceallp1skz67jzUhh94F
zpJiscEp_Ryw;W9i7l;M=dV#!0TK9_*U=8~%5KPewd~Ii}z}a&V2#t!rsuetG-n%+O
z<t<y?Zozd!zunsZB_#i8DJ-~YxP(Ze11%<O)f)Pjhb#pKp58ki=i1W^DV6j!NYTX}
z^ofES&#E#BXlQ*u$D+Sg62&q!9#t?lbNvooe}$_51El8v&im|1-P(g%7EGlqNay<L
z@se0wq`<~dRIsoqu)waJpXHb=7y8--n;h@2?OU+YfjW`!QpI1SJaCQbzd+|F)(c3W
zcgBBF1pfizSn4>6abBQqsfGrB(e6M;z1t|pNbJ(ypZ3~TXK>7yEC-e2UA}xgaP|Kh
z=X+^SI0poj`gspQYR#^7zK$Bs{7fJAjiJOc1h?0t($I1AVz6b@E6l;fZK$Za8RWnf
zgoI87uSX@*3vi3eXs-x6sb@DsTEYcOf<781O@|J@NmT(}hb0pyh<RaHSYa~#LROIq
z$yBX-a=3HmCQQx|WA3f@2pad%aL`dI<x<A4ql!)7_S_-cyG)y}#m|nnLl}uqR&okV
zK(HzC?L+nZ@99gHx_0&a97%r{zmiXPrlQ)acYMKznr&#TKR7o^<_(h2(wn#J!@j-S
zeB=&h;(jn5$*CXnW15fZ_p<)XMaWe>XHq|WfBd7fO_#+2J5|D*Gt-@;zti?l?FdlG
z+kq@i_DnyT%PiGPt94tQ)sVh{{GH;E8*m+ui{B`mL<D2Juq|h<g~jDq(Y8Lx?a;`u
zyZ?7$<!WVyk=o$`dwG`;i)KkI&}Q#JMtJGDbJT09cjO=3yhDNy8;$Fq7X!c1{HI^L
z=WRD6`zBTk7razYpO2m3wcwjNJL2bJ-b&uC`UiF@Jo%;(wo6Wz;?h%krN7f;Z>)Q;
zt^V$SN5x--k^ZfQ@zjp00&83yV`1X2=6^3iF&c(C{#Z6e46ZNP$<dCcER|~fn(o=@
z$y)^zc!74df#JmAuE#+{DgNZ09}~HQoygy5t2_Nr9$-1)h&Nz45~jcL|K~;pt+0*w
z-qhLAj&xl58`mUPML6}NhY&9{{x<(V*qukA2HQy}GOGK$boAfB?f=Jqo<jA$b2=pn
z@V3t^_bCE-<K!oehr(X|hS)@L-V(`!(6gDPrZv0pyWW(#neD3DI6uwR;Ly0Wo8Dd8
z>|*PpVf*8h{KasJ@EhqR<b1M^d%Zsjs$T#b9JY0$x=)Vw++uSlRw(ExO6#XDqaB0s
zyITA_459f2QNP$lkTkmW6L_H+f{3tXxIXN?JiQOa^$bU*`l%eFrQ+~lU$7D8uZE;h
zM712;oh{Htc^?KRWK_1B`9)tRUVbHy`|LZPtbd;b)HcK0?&`$Y@3@ggJneks9p?PR
z_~0~~&omO5<`%o3*02358T;#o=8YVZ^zm=p4maQ28c#2!Kvi_uv<c|?5HnMM_<oEY
z4cj1nzUTgjoth6ZD}6}W<|BQO%cHaX<{7@rvHjR2?{DKibJ0@bE`RHG>i~Fx$JSE7
z$zeA`(}{|UqoEJ+vSxyo7jb`6ttn;ajyrh%2j{t=Rz#NVJ$oKa7DW;Ri1=t~$?W5L
z|9R}mlIaF7)P9o~y~_en_eIW1d(OWeASbpeHEntifEg6XP11Iy6;e}dD8dyf?)9VS
z>zS85SjQrR7@0$S(lg;TeKN(egpI{68rbyXuP4BZv}$<M2<AqdI#>MIho|Q2Gc+QK
zhxY7f%PPrXOLIIHRwCPNpF^bUW^O<-TE7(PhVTmTF@q}W{C2s;7AxSfnDMuhuWtaM
z6mSIv(?8m3T?<_mm8%j1Vsk+h{yZVQF#kNHU=Vn<iEQ1iO|xZRjfO1?ui2@B=+e&2
zfcBfb+|E}VUUZ36{tz)_&3hm4)<Q(+JbQh}0dk6(j5eJOC<<T?Cvg6S0Ux!wipY!u
zL05oOoDiXFXuj%Q2wulq1Y$3sF8onnO)v16pr%88SgLzpChavPXzH<D@iR&M-S^A#
z0xl}6q*~WZU@xLD#l)qkK}-+@iT(2s_?sU-Hd%z=e+7wd^zFYz`_<_cgZxn||1c1@
zE?xZsI#s#!3Sy)HCB2B)qD{&^{;IYYdz;48>W|G;H$LN7FOSVw(ONT3@eY+BtlfUD
z?2W)bIHmur`L8mMpPz(x|AVLd)%1+=@1=;JpWLMW2lt7!E#T$>v;0*>!1&{n=XU?#
z6n}nlHRkVeK`ueRHn&`Y`dh3WG0+^>$4MVPOE*f~94BHvXYH+8INeJ4h1)(Xo|yY=
z<!Issks9@5Ty?_ecn8rGg6vdoL-?BLUF=&RD%`K_o|$$?yi(=99&*5A8nN*d+=nAt
zM#G;^MZ<DprN^%{Rhv7fPHm28Cg#2oGQnFc@@8n!w?yys8;mHJv-~3C-$KPxRy2&g
zmrK1rKQ?{JpEl$G@(|AQ&Qg>sY)>o1BV|n8cNC9Wi*q$l-M2J%l?8GSZl*3N0~wik
z4z#<T$4l0~rB*W4Eo#B)>aJQkBL`T{sh8wUne48lh{x=}7dZO>Nz()Pvv*4}F=Wam
zI>}M-3ubVo+*SifVQx#(wy|nUY`pMZvas`7ovxeXHPhOvfHANgt6$=_;z1>Y821?I
zT*vBIff%lq-zf7to1}H1{<HSUA$h!dzR7`WLgO4h#!uAc##3CM+iOQp<9DOuw-zdB
zMY-ryGCEsQs=PR({YiLNvvnLwbZmRD?}ZW&Y7IKBC(5|^S?;v&c?|FBT>KN2kU>s-
z*9VCjx3pv5dojdttk&uXhJH=BeD-04OKoWV9L*@f>h&Ik1eE1j63t(QCCndO8If`E
zq3IiBMI%lLku#ENHf*T`G+CGT_!GQb6UA-0QpMNhd}X*->o7=eeAlG-W-~Yki&Ns|
zy|N}U{4iH18WtiheSBB%L`Q9d#$1Uvxnn&I#I{PkYRlhNP{zU6rQXmPGf|t9wD{aj
zObp&ATis~oN~wCdCN%=B<Pp(4JiIp~ydw^+R8*zf=jmG>?&$E;lcYzK_fCHr)j4BN
zEoMHYc|ofJ^i)*+h>+XVhnK}S)3HF}P-By)YT26Ejz|3ZZ*!S)a?kc{K8~^}B;yiL
zByCrI^jep5_3OW>gGt|B@>J&}hf67{f?x+mk^o^Oq_Xe@M&^~gttK)1%<?EZ-jgfP
zmJmu$avoAZm*Plf9hcy8mqY5L()NX`e%cEi*pA6Urw<?n)75}1oXxRI;FC-|b;Fl{
z#y=?ibMpC0)qO>-<RRQl&6i0nBXlOG^K}ECT83Or#{4}Qn=M!Bm-@sF@40FDb+<qU
z#LO{EXNq)toQC+Am+at-lDK$c)K+{659~ZFVK_0KH#(l@tyuCBKvEGQ8G!b?Y5{&J
z>{Bx7UU3&QSyx`+UX(QS8)WY<SsCugv7+IgaT(%R4Pl=mTMEps02!TrtACek)St~U
z7N>T|{=T_+sSka(5LY#9TPx8$5}7QVle>`AVeXwciCB&2WnXP$S8dUgphvzCMkTw0
zRr!>A>wAGs7oH{0L(B|02cb@A+}0GA`$qj{IDa@V@DZrZ+2`xnnK7J62hLU*sm!oM
zSH+DVzmUg`%xeec97_X{g0P<~Ip|ajUQ@oVau(&ff1q>Erf<l~dpanWuvW2GSDUqG
zb$znN_f8^-k)ed%Bf>|eRF;|VR))nrwpSA~9olWX*)#})`K~NoA!#UexhK8dv3qEm
z9JyyD0WHdp-y_$F4{bTF8E%<0($6sfy-o4`FD&p%b%b&Bt+p73%Ss%Y$TIhKQxBe^
z1OMvlf(P!iOH2*vxM?JGeb|xF=#G8o`GS>7cKgXue4TibN~^3YT#3fqK*fkkq&Z6*
z{{ka|Jx_|RPd=+7hZsLTlnnnX=B*`b;&siT0wKHolRi5=V&*-y5q&>cio<PNWh9P?
z1oTK*)kj}*Cxen@Ww#{)cXeGAC`I%-!&eW8)3@W4>0(7H-}vuM;N&azN}SU=%YH-L
zuoFebc8@q0=<81C;_@=LYUv#7GwJQI;XA4fw*%Wa8BcC;#nWe{%VkF|^?;*!T*#zz
z%?H}vQ(g@VUWT2&#I||B4*m)Pq(!}28Nv&;<%c~NI3Vn0mY2d@?;c3CY<VVr1Iku3
zBq(!><Ondzo0tzc5q1@ICArzjvR*54o3t!a={@}&%;H00)@yx18Ek5BhH27`jN2uA
z4GnM?p8et~Tp!Ow@pbz$2IM(c#*M#8OoR!jx3cf_-UzEY=6vPFX#WW0xYbHUlMZjz
zah<))lBoj^O!v-^r&h{xyetm&QdX7RE(YvwM|)BeS%s_Gp<<ch>HZXt)OJMdX}_Mb
zUaP(3%EsJVUkAbwQhxPmD)*9k`)>n5xkbW<12<wdBR*--8h`BzvZ{IUu)myRgMtOA
zT!1SM9ca#vlXom#tcywL$VdnXkqB+W*;Vbhk^SjSr_ahUJ!hDugKp*+E!EFWjswAo
zjiJN;mMIz9EY##+cf`KxP*-g3s3smG4xX29Z_|LuUS~PE{8_G<!6O=X#DL{QTh%Km
z?mSA)^+?h8`k<S4V($4vtpKJN2C;oX0sB5?uW55?)7Y&9q`@)2TE;_Qj*U0fbZ^7*
zJ0+m_5GP>6Sh64_CdxU?!-V=^-T-+;!c}4U#)>egW8ghkPf*q0BKQci3uCcRVz(6Z
zlpg05+M1?+n-GWseeB@3j<jzogNk^H_&L;c1#fQ1Ron~3wKM117}UWS62zL+Nq)f5
zqCwLhq23<OU#!U|L)Lm2>Ju<GSMn4IcZx&ZU1<AjX#F<#QsM?(8QGNS03BqC&&0%n
zx?%qfU0u3^0bTkdskr1g2A$I(@BkccFC_7qPKA}iMQ&__pzo)jC^pg`D>TPf?0sIk
zVRW?+LZhVGa@)*ysUAjJQB&5=suS;-iSa_o9X5Lpy4Hn7-0#ED_!u8}W8U79jLJ3=
z{WR>MJZm6ZZacTOa&M;?aK`n3i7eylAQETXlpXF&0d<~2FUF~p1TRQxr)I*69Og))
zj9hDBs8e{tkk~5q?`SXW4j>=W&9$Q)BaDqtUp-Qa2=0Vjw72_gH*#DP56A(F)5Mjb
zo{aSrPrEaD679cU`WhXo+*=;7@=<|aT}d_7-*OmT)LNd2XA>2FUByOInkmO>h8L+Q
zZlspli;Dr(A5*cD6G;;@BYSJ;B`1Ux0Qay{#a_?HL^PD=#h99DO=C~p_qCJ{h+Q}{
z+D^8?2^+PrQ*pJLD}xiH8<H=>`m>!Q$=IZ0bQwdgzCi*W+DB#W$WsULjZih@Jmf0*
zyLtHv2*yH3c_AGsCs?j(s4LMX1&wC->TsgPN>ZD`zk0byF5lTY$XO}E?Mr{BuF(_l
z>*G!z%9%#$hjBu6#i5w8hKOF<7KKO`R@<>z(ubvTGRcv+H8U8aXnS^CYjc0Zl+YTW
zF7?8s=P&`6WOn7ff#7r^c~E(MyrZ*hL5CS1PW(5~W5QQJiKZ|Fxzgz~6cOJY-W|I^
z=179x<8BSmf%6~YarOr-@99Ifhwe`YEuR;jr2#*X7y<%Y=n6{s*zx*~BRvZde+&~)
zJal+^zekit_XKpzOE+e&6VG4-X5d)e)4|X@_j4(M@7?&!eNT&4<yBDr{q0QVOgyau
zl)lSxoo=%^lAeH8A0-DbbAyak7P*J#qUn^2iI;bDTxG+(uBKXBE4_5XrES|5p`mBV
ztVZqY3w-%dSwTp6LtIP(3iJ~e0@l#`@@c^O?1RZZ9e7;Dt?>IDcLd4b1RxT6)w^j6
z5?gE~pxZm*78;aWRE@X<;__oT#B6a9CK&S_l9#i2MF9itFQ22t{pW+W0k@iBE;zN<
z6|G%o5|SzYeALX<>8l@AdKpgd5d9?IdtRnkftBqO5HaI~MH9yB(X$NUEhW|L40<-8
z3HZnkac)a<bTO##&SE7aBVfEncx;udSj#icf3?14m+n}gQm<}LN=~kh#_(lKy0E+B
z>sRH-NrHFYv?_sF_JiS)x@$l%{`_ij;PbiB5yTxjvsheOy@0(G9mMAo>9roA4LKYm
zE4#)qBZH}3jE})Avh^hR^ys3FGFyiRD=|GW7joz^B3&m8xBhP8?9q4C#KfgN9e5j$
zX>38q&1k&SUW{3W4d4YK;I-qOm3&A^X?9GAA)SgOM_+g*M^p>R$A!r_OO3FuNK7I-
zxCq6;fUE}+Z>fpn6ZZ0VKN34CcUaJcLuSv}k%c)5k4A+TGK7=wRb^GqltCvW2|6$?
zITywa%5yJR=N5qS{h|)$f!iTFtAlJL8Ego3)gmc=M((Q~_`+&sINK~xmG5yiT`exs
zm~AFyfpC+zBxaW5aAh15C-k>o5IA8u(2kQl1f%3|AZ#b%5H?GiZlZ7b7?<K6<U4i_
ze9LpdCzhOk5a-BtTY10*iLWllbcB-+i|Tft%OL+}uRI>}e($r9KA4)uV}LQA>N<;9
z(~>xk9pVw1eJVc4?iIX<{NVy+v;4JFJfdxV^3W~C0C&~^8XIcK3!4vfjptRLE$Y-I
zg$@la-2eJIfr^4<RV*g{lY-3+`=?`KU)%<FBjT@1+S~@4o@g-ht4AdCp&Y4LP<d({
z$saS{<dVXAmkhzuWBcp<jyX-D#-*=W3E5GTZ>JYw=h>nc-ZauCggVI4(uvYZ*~5UY
zM&8E)#tMBEzMR0z;qM%^Z;*l}^Mqb%46V6+iHnzsJPisMxosB4bmtft;^1{0D|hkr
z9GAv6ahAE33c)&$#A6%Z5WYbr9Ww1eErv_m&XzfAGSG4Y^8o8n1qNl`L;FM9Lj_&2
zCS1H@dbD^*QHh}LcjA25-qQ9?fGcRQ8}MX(`<5MRc3u#wtz{Fl;mNm45f$_q=5Qw<
zcn>(Ek0zh4ua;&qh(QC*0az@p>U_Grx!+x=9VL6Pkt1)+ljb_+gXYU>;<+>oS9X>~
zU=Am2`S8GBlO2Ym1J4dm<h>hyb6G>%(^2;kKt23DT&ks!Pn=g9pzi;Kgy#~fPDE*d
zH{Wi@X&-I|0fQS_eZ@hqveA+~w<W+2TR%vN%OXaMV5<$N<^$kb&s>+p%n!A)P>g|3
z;@L)Itc+va=Q>c**e_IWgDqwhgg=B~nIvpwtMU8otu8G0{rP1HGk%2wc<2X>G)P`p
zhl{@($B^!6eO0dz%k2CI@i!EFed87zn?l>Ha=a3z-4%>a<UXUbAz<>36d$AA5;&U&
zIX*3%^^$;APgI*p2P)4N>Xq(q?G@nvEE@a?(fdUvGgMnyudih_I8r%RwLNYFMrj~k
z2KWuq(nfd>mUQV3Jj+HJH`4Y@uTKxkf|yKQf_xB`#0y-e*4u*U$sJgy*Q|?{i}4Sl
z@jny;D(e+RR^4dvqo$LncbiAr0q%K6Lkb?>$}`_d5ic{$d-G!no6Xbyv!k)CB<~-S
zC0Lje+FWSKPCr+-H-2TVDGpG#$CjRyCj6?2|A`QhmJg#XQ@~PwNd;aNG44IUJBpP9
z{Jbd6YPGXGv^93xRm)p~J}k+Vl+9j39{V3_0llhL<i<@W0`}H~_pXh*PnzSS$oK$C
z?C)HU=fqh6Mgv>*SPHq3Ax1CbXs5U}EanB8$Z!)!qC0BocZoyCdh$Xqwgdw|QDKt2
zQR(-sQSl*QrS7$9pcV2uMy+(slQp*b&z#m1FU{sJgta(53XbqaM_X%}bq~D^vRrvb
zzm<2zfhXQdERLJ?p><*CNz2M*x*plHy)pDw+`{J?W34793l>vis|^pQV|gi<o{6jf
z!$}sTrc1)P-O0ZYOkNF9_$7`FJE8J*aYm3&E6-dylgDHrzA*A(nLY^YVCfKaFe0)k
z#bxH`1zAt>3$E&s2}#<hD0=3r43yF~JYqK9D^O&~2AmCs&F%qoVD?P&;R(Uf#h={n
zpLWlTLCeXfxLHoiteoyKX$fRXb{cKx&6>gr*B*hdPn%i{B96Y1XxEjgret2tRTvMl
zVo4~j4B#rnZYAncQ>t3D;pPq(S8<s;$Zq7Rcd1G3d!HjtEMNVnZRB}FYo?)8(~a*<
zEjZi-0lw>)G2!P7`kW!3G3ve_*zC<AvsZ-@XZx5h-&hj7aJtr?!euU=`TSsE<y0fR
zWbQYmtarFGya}%PK2b*h7eX$vI56Pwc4P0P2tqr+=*0cgo8KPdhnDtF3(oT|@3ZTO
z<o&ad`A?kX|Ihfl-=ySu-@VNfgG9nAmUl*+6R&w!KNU_C=qMf9yc>7@@glKb&g+*U
jejdXAH~$y@Z0k(<7Lq?8A3ksn_@}C<b*JR^qi6pIyZ(!V

literal 0
HcmV?d00001

diff --git a/docs/source/assets/design/v1/prefix_caching/example-time-6.png b/docs/source/assets/design/v1/prefix_caching/example-time-6.png
new file mode 100644
index 0000000000000000000000000000000000000000..fbd7138596e8919b174a3d5489881838da2eac59
GIT binary patch
literal 25462
zcmeFZXHb(}6fSD{Y$&1vQUw*2CISM|L8T~NKtXzm^w4{ag<dW6DqW<85_*Rq1PHxI
ziIfm}fB+$ca^7J1%9%O$oVjyn&X4;CogMPNYp=cbD$jZrLC=(*kdx7q9Xoc6T<)o~
z+OcCNVZi_E&z=T;c3gYdb?g{OTu%DYb2t5!q%*O%UQ`|+PPxRG1Sy0obU(V<UZl`1
zY!;p97+G|k?a_M~nd0joWmF%%z4`g|mq1!xJ(*h5+Q}bXQwE2-NsIlCyQ0}<9$8i8
zrr9VIt_JCp^0h!+-M|l*{9Nmf@1@6QZ~R~NV_tBMb5o5$Tw@;Ow{24(-k_yb*&Va;
zmdOsJG3Q(_hxKjO+JmHWQ&koAnD|9o7EE|b#6=Mx4fo8OHy>lEs%qA~`@ugTZYnI+
z>(^J_%YJ|Q658{&(G`Z5O_h5T|7*jcB6p}dM0;uv&X7xtKRq<~G<iN5(NW`LPUF;Y
zQ){pO<$h()?!g)HxooXCjJnIC6CBreQVtDL1sE)<O-~YQ9a*d^udlq7{caaq-;?+F
zP{Vxi`hHX1_?+`UUhx<d64dm=ZQ}VnzHwuPq5e%}??_z~`JOH9mcd_$x>ZOI$R7lb
z=;&~+zmky|x{{_fx*O_Zl|dfhSp3G?AMVor_)u?NvP5|7E7z|>YQD)2Q;9VxdhwuW
zZ~D478>reYRjGIbe?HFSYE_kus<~{ci4Ki$l?j;F=f-)eU3U<jcA7Kvq*S@H#sT+C
z*ZuQ{ZL(Sfh@yqwz$+p6wy0L+YR)6=j**M-8ovjf9p2yXDzv2~LH#fc3!3pxzifk6
z;6mF76MaTGAB?s!ptv{+dAJH86MrAY6>=l?9vtbvl*Vn6e|ngH%+&<_@<siDOxPyX
zJ31XVA%>TA+V3T}ICbQk_jfUQj7uW-UQ+OUV~nUb$zS3Ncg-{upC1^h6J;2=2r>|7
z2aOeQZcO<dP~DX(ZJ3NBkKOd_blA*=-(Sm$8M6K@O_VP@I_Pb<#e8TUqYeQdW`eTL
zTau&Gxx=(;&vI=acH=O~+VF(9)0u4qj~9xp4pQAC7PgfKHL2h8kWy;mkA>=`_AoOb
z4mcS`1hs|eY!@N;lE;U%UWaZV@rKzt+Pg%Bq3^8(S1k)M@g({&g%StxZSD+p5F&Q!
z&?>`Qfb-Cm<HDZDK?S)9*dl*>0VbgN>l(?PN>XIn8qNbhx&R1dO1oWeH1vwX6p?K(
z_*93LlU3$s*i~tYdX^P~l49arU-)(@+^i+EOX@Pc4*Y!mz$H^|{z8gfW-2e}2e<@y
z^y!R{_ty{MFslq^#*D7+)15m$TJGSd*{8<(bD;Z98U<!%Nvi`RyU|03-?Vi)3cqz|
zY>2-rZLGQ(&=$d4|Lbz|E}jkJn$R#HYuQmpE#k3SOD)puAY!fTcl<kk<d{}vX#J)5
z9Or%cQ<Nod=w6FYju`XTYPDSdey@>TB#xSZ1~=^SJ{USgGFxZ3&zr8qOiVt-osP8W
zFba9#RRFzka;5sf#S)C_?AT)BT>unz@-4sdlXCMaaDVIf`?1hd+9g_7!=YDyiW8{X
z{0syh{?;)(Rg`$|YpAVXB(&8ntCMz3S=YAswT4s**4QWWfg8wbuzgl86`HBA?oF<z
zH~%ruIm^uOBU}9eyD8n?4bB*-N`oT8E1RJV{T*zmgAnW!5rFyB76~u_|Hk^*R^^)$
zOqWZ|*g#7&DxJUXkMRI+B-HOo$s1?d7&3McVn?kn`C^|K(iDAv=nY<=E<I{bp4_d_
z@~Ju4p!>A8Kd-ssLjl@%VNfNg{(~Y3tUY<-T@PHih3_x}3No!pw$^sm9XFW+-Qt3U
zz9C;281djKa{6%a=t0rNG*96Yq2_1s=pplH3)5FWjko2?D>e|qSH9G#n2{1;<#pWz
zyg%|hv}gK}h@RViXrmL-Pl_JZ%Nxf3nc=<#?L*1)C4#Wf4<>KFCefo*g`x#p?ZUY9
z{10z)4nw#Y%v1d*7#8ODWWQU7VMA$5o9<BPecivMR!0<YvIOrRE|ZsLtAnalHJFC^
z7*KCweJNdxrl*##b0yNz^Wc2$Znm4o{dS0Ck3mEP_;c>PU2rPuFGMXna4ZwqAN<W<
zm=vd`m%JHzJhY`}Xu-8vm_`W5<Nu=AGONR(<+=s0a?yBY*vV#5Dt7-K>{_z`%+7IW
zR}u#?det7nU&T_Q*U)*O!m>ZCMd%)VSM1=xQ6Fa4m#G|<kNv8oiFvOaw!#&Lja3Wh
zY73kp#L0^2HRarTO-XMt@Bf(fL7!2k;b_*Qs^3!5KL&}$`b3h4QUc<gs-h)ct$PM|
zV4mafyNPi^k+zqCWfKh(TedR^*W7QxqYHZKmP)Bjy-MD^xW-qqv0vn(H*1hP102-q
zhp46XaFm_eSLHA+uBDp$?W+{VC-Us}*5IbXpJWI@mD@HMM!x%hz5IQPz8Z!udY_wk
zVXh%XG==5}-{c%lx5_=0?xe0!_jNAwF>|{-OZ;2jq&$R2ptZ8CRyu!WMm;K{zbo1a
zo~rx+Ay#^Uc(~wJ0%qIp=x`k>HgGDLd+Do}CpF*XsiYBaH-xDGPZZ0xZ)sI>hgw~`
zQ1zM!&7X2Hzp;H7ftBwJ?Uvw5tfZF;eUlj#?fuyG)k3tU_7E>Fui5v-q#`kTCV&Cu
z_e)QK;p;Q5yv@#2)gQ3NG=D<86jVdQrQC;gX*sEO!n8S$zYNv2*lzIa^A_6IHsi4f
z@s+p24HH^%<=(@EzaDqCm$&B7MI&{8`7KkkRro!2r+B^V|5v3AzXkoDlNn=WUTn<_
zM18M4vo~{nhIQ}3(#^0H8N$;o%Vc-VJ`sE|)ymuH>*bgNUZbWKlu3|RhKGPnvb#@G
z&?Mhsh8(2x%JKJ$FUoD5Ui38EIayBceL8r5PBm#S-{4S*-sI-J2hXU3Hr_AN*hAke
z-cUVI+yB?nmD(F3J3rS1uB@ayuMZ3%e=alG9_k(~>o?i%q2OdAw-r(`^6<s24ymsO
zEv=`&21pYW{=Fo>=|BJD=i1=YQpkT^v3OrK`M*p5yMm;NW$eo6f<IR&MrQw#?e~($
z)5!Y&EJ@o2Uab4M7A8$(9ps)iKKbGU8%2)KWg3Pr_bAcq|5;+Xg697B5?e1r+PhzC
zEWHQ+-8(|sUH;#D|Nl}Q?A|MFkj@k$JR$)_*z+sgFUqmqTSoTUW_%zW2}#%NwFCsj
z1=GY(zmrQm*l>jmI)r<ni4f-fEcM!*AyERS<eqipB=TQ}PC5~GapLY6p%_sZ3t7;S
zbVN}kxPK+CmUS#zp{rLNpB=366-w7%Qyli5ogs)YfD^gPo8$>~0ZjQzH8*~$H6n7=
zeQ(rj@oTPNy$Z^{JX)x*ZrkIYvP%8-yZQ2!%>hmnLU7bL;cAj|`(UrfJ{$10L8-Jo
zb&oy5uwGvMLBC`r)|1K34Iy0gJ$|X86j`&AKVL4jpi{ro=9RnR(#`C(h&I=%5B_O>
zESaOs5HSPWbc-%5gDvcZLgKKl>$|-l=_;M|J8+eB6s%wHV3nJ2xh+M1Tx!S;zeO>z
z|Fl&3==(vFm#*zavU%OD+$`?GQa#?W{vt4>C0S2|nx&z#T0FM#=Ba)(K|?D~1j5_Q
zZ+x$h<SEW0?$Fe))-k)yuB&>{7&t8K?Qs@{_>3vjYLh)W&s`PUf3bQcseU_MQhI4C
zoxZI!LE8Q4k##u7u^}UQSo4U>!;slJka78_4~;ALlk!P?j#tD!MHj1LqGbJHJ53rY
z+o}H=#axJz*3J5vDI_Mvn~?9-pKn^R@_lPpmC=gfOR%(ihSL~pN@tr&?8u(-)aL7E
zvj;O?I6VrC4!ByXZa%VA^g;PBWK+T{8ih|4jo;nLuBXi(jv$@5_^l4f+ls79eV-fg
zT-n%IY%%^#m*bCUPo}PNlyjlioA&b$Qt(OhgXNGE+j2y*%@|%{-K(_5pmm+c{;PcC
z6M5a3$o4%vS~D@8*||@vO9-_FRheb5tum?Gbl{xjt|7SgV%5{e+Xzkjj59S2v+d0V
z%Z$0Ccc%$TpOMUrgw}5OM?9#<y-ttzf7}H^dUNg8QghW9SuqJf1}-J}Ohxrp9ht@p
z;JRf--iL{<*4a7|6i|dr&K|pPU9YV(>8R{dqEEnVzw7sjvJ>i*f$idRktON>P>{53
zn0mK&VXixdSXP2{KuM|YY#~GkpGdAR?9G2dc1MVBF;aV+_~ixS)H+61T}OC(Jv^5o
zFBLc&PNsO~5Eycu1rwtUocmV*`|$5}$Ci<7HoOdQhiPJJ@^oV*_q+`dBDQrn6@qL6
zcy$#i5%!h`?^Z3|BG83v$Wjmw!Wl{vvVkq+U{+zG%kj3tpzJw3Q397~8gY@c!%g9C
zS*WNPk7!jJMIgwU<e4HO)!^H~OOk!X7t!}TEEqkO+gC-A&IMbA2R{URx6PemT_*_}
ztol2qd@Juk%JLGdyQ+W-KJvHO{E^VDw2=00&sD5M{lO>6s>k=>g9T=-GVd%1+l2?W
zwTF&u0C4LD@=|m<h35;a8q4ZDW*iAy%-vY8*&K6mr>!p6*_49~hAu2Cpaaj?e_GvU
zgPDPoJ8|kvf_1H4-LCf1oF}uBZ@&$ZX<6IxK`bmMH?_Q8C8)vyB24VF+UUPmanryq
z)o;SxXn5&~eV#SRX`9I>(d%?axYjP|5zeCyDv5PX121%SS3H~?83mDItAqyU?vk|r
z=t-YTZQoCSy{XE??raRcOs)NiM)#X|$>p*7h3<f~txH@r&%K!4o9GA!0n7=Iodec1
z=iL{Kf(%~(51yiy4c}&wVdT9ZxwTWbb~8j4{%xr}rY;G!HBU2c<}ueka`BJ8MR;NJ
zX}Pve<SJg<=$hwP#Z?p~CS~9wyf*1xm&=3?Ra@6D^$z4flbmO)3#{l;=+xU=A3ka;
zpb5S@Ntk0W7~h!;jo!*7g-RgK!~pM)e#^TgzOkIUDhjIbx6Cveb8^<Tj6>_<WvXkK
zTY33^UM6;sD6SNzJ(P;}515^5OS-y0xtJh%7~pjv_eq;W(A0YkM7WQLqUiIXA!!2$
zHd%lanF#tN@y$)maXhA;@M*R}>jKwO??xN!qS&<1aKcP6MQ1qRw(P;$&ClZiivp?(
zT1wc0j!j^;ppeFMC)JAD<tm+t+P6*E+r#6GfM=Md{r+(4J=-DaSD%ZNFyT+vIoi&@
z`tm;1I#<a5j`EA3ji)*4AW`CEo9^vf)GvMRMGe%5)P-Kn>b3S>Fo}r(UI$#Iqfkjw
z{*d0W88~?^+BsW%+i_wT>>>FlIyqZ{7rr-nFng|*^cK6MWu-)?L}}{qK{iK+)6l@}
zN)1wf0o<;|mXVvC>|8jpvjpYbdmxu(4=1aP^9&e270<(b{4Aod)S=WyG8nitgC=QW
z>Wc#-5;zZwmjPAEl81xXs;#CeRxva4Q=61MTO~pd@{PVK0<H_NJh)7WlC95a%86d7
z=65;hNuET9rkB)hJx57QHc7V*@msMkyYDe5-iPH+)5Z(ExZS}5B>yX6<&r<>Q@~X(
zI!5lhN7}Bexc+m;(W`QyDMbq_(yjuQZ<!97G}BG??~<Y-$%g@t-QUouw_6I!7`rj?
zJo|3lt72FYs^zy80CL?4H58XjpFShuWqVLr=*6my3~r+0Th4-w0*4bX<eUJ657Ob-
zsGYSO_Q1$QFu?*A)-CGF1QXxX8?@S+ldA)IPy_Uk>`EX$(KXBzNZDV=aJJ10nKAqs
zWP#86x*~(w4BSU^UV(L5p|Ce!pB`ytjIz6i$G~cWJl!c*#pl3ec<D}FQ5ubz5a#jG
z!Tk)C!0S7cD;A>r(>Gm=87Jg(^R8y%tN&^8_AjXt{W%8g9A8~|TO`wvFg?}TBBar3
z|0{9;+w`X`(q{!N*UlGraGn{tqZhfd0bH=2mYgw?Wd}h`#>Gm^cipf0_XTqnF5Zd6
zp>mefbgGO|pDLcXX8^z8y|sd>WTK{WTfyeXUIKSqQ79{2<TZNvGI`#N<jjw5KPx<%
zC^><6a-ThxWWYm53GdS!N)RG}y}~EwTXB7P+Wk*Pr>FiZuA$HSbVF|cI&f6CJ*1r0
z)B(d&W1D&-l*s#Dk-s_5;z2Enlx|wy0ulhgRnYdo7=AgXbln(|az${$uJhWb7d`Zw
z8+e~<`D3=zFULq(G09x=Z7Y{bo%eGz>#>vGZ9TrqkbVl%$47a98r)@N>w#sruTo=-
zRf2{Xd~g@V%W`ixe}buLK+W8k@oFK&Xj@6=^{OVxYjRvce*glMLl{wr?;I6qF1A-B
zFY(oMg^PG0?USzA_v4}k^(#Jt6lb`EU21)(ex>#7nW0>~o0wyTzNu=#sgK0)`&c%_
zI@7CaaZb81jek7%J|-vvm&&JNkH_G_mDVN2AFPtCY9|tfE->gEP9#(~o{e~zC%TX`
zNb%1gj;?h&Fj;a^|E#dSZl>F#oBgRho$Z_XcK=IKMIf8DufT2sj^l4hVPQ}n>vUG1
z(u<k`SH4qXl4*=7P#%A!B96T;sIyublR3O1_(vMRZ_0<Qy}0x)h>mZ;SOPcTD6ra1
za@9w-JWVb5L|EbTw5E2XZ`gdkRD9AoPaaUnm$MTdL_OW9g|}M66xBL2FR1|)9GUib
zeRJ@W)mo!O3W;55!P#%HuN0=>IKyDvYs@jMgpHu5gCr@HJaT<*-Yn~zIHn$Ln&gvW
zV>z*grnog%Qmne6y<zR~C*{2D|Hd~z=Q)SJJ$yv%GId>*EdY;HbtW{F)itjYOD_m4
z{OuR62#TD3PsCEmHpS%gv4<u-e4TI;aoQ8gH&$BzNp|AYB_Iw<Og=YX+ox3VsYEkA
z%5i1nn`$uRUdx)pwv^m+pl`1jhjRQ*u7S+pbGovfUO4X;MGmRP01akIoPebuJo?NH
zfx{XUk5_rw>iJBPx$E8OQ2$w)0rB%^r3EB@2LD+i<zv6rfb;rvR1hAi4S<<k`F-~O
z8dd;R>uozxUzx(1yPIVJrZZ_h4`B|5Mgwv2`Hb7<M>dt8l@ySZ{>*a$RYLx=MDiY9
zKiBAkwpIQk#Q!T%1AdlvF+m%Vzoq}RL=m(R^Y5j|dGg-IUuz%C4}VsXz}32ORPK@N
zEAyzZ{Cf$gO#ZWU<RX5q0S-a(KN|(^_W!0j_99~!w*71m#pE%5oiEI|(OXQNZ|RlU
zCv)5c{;DtQ?`AOt!3vXOb)<j0o>;}emm^Nh28WBUKYV3apVjq72h@l4<-0}62uT$t
zWR|ZVbobu;X(@kKw{cer&5cg*Bpi$j`YdllCzhB+s0)2&${Zfl8np73{k$Y*db<+%
zRqX^HJP29gP2cV)LRIKhLWv-MQX>+)Uojh>kZ$sBp|yO{zFwiLQ{}rDuA3n<JzZIh
z4~2i9p0;Td<ZVf=t`fAN5`>S~kbjcUo%?x2_Um0CM!?%l#(?`psSDp5NxGf-tY_M)
zLnDL%ixhH$7$PJOJxamKA^X2V*YuW0f^CU+qz|c(Ij^YCBhMVpMp4Y1a=7T4Zj!Y4
zeblwu+G#^q2sfR(XYXWuMPV}Avy01Bc4;URUKb?%5OChp*PKWtP~>Dm5g((G{AfhB
zK0&_}%ti<pRQ6n^;hL$<p)uHRWb?MUboePqJqbwF`jR3fR<_6L0wVFrA0$SIp`yL;
z1na1J{C>IgLQ-(RMuq*1h)4ONlnNvheP^My*{8H7CLgLY@dy1lO3GICxj|zL_KH8f
zb)UH)knY3EJ!y6=tqLf)NXbMvkY34FL($@h#FGkKCFAcg>7mx1VJ;v<bypA$o?t`A
zRnjz40PWuG@;^J7-B}wtV&FDN-n;7@IAioHIMJ`buQoSTJ2ji!hv~5Z5eayp`1DS{
z-f=(8z6;I;?jYhI^IldQ++F@u$H0r>z@b8^Wdo~N4odWgE2nf<&pb#PveO?|u30Ib
z3Z#Kct~Uv$Zz{_mM2Pl8sg#gtt@ZjfyYhu}Ig$_mwXc->Fxr@qf;wJFY%qm1HhJ#5
z@hEy2GO|CE4FbCluOF3E%L;||&~*gctxDH4&AZQ#O4G65e}=*-TAfuwORSsEq5{fX
z3&mBvw@&~nvJQiT)x!7%Dt=*B3AZ-ZiH(5zwZXaj$(R=5!IKE?6x+IWf|>qaG^ZPi
zn1e}k{%#=zk8TnoHr6V_T^_Q7JW*zKkp?_V9;^i`cHZ@&!!^A#9e_X<?kW50Z4+fo
zlPvn)MAy}keubX7i22)3mn+-uLI>^>0#<!)X|qNGswZWlbOh)b$%GWo(K13SwwnEL
z8DT3BqdJ-XaXJyNzr=ATh<RahWaBQT&gUR%xNMxI0#At%L=m#tAe#fZC?qDywQkSR
zfDm3ix3b?#2CANv?N8f86jYb(7~IiZT^S~~7XyDLflxj`@;Sy`x8A=;^6O<^fjTo*
zG<Cl?rVfYL^X232bRG9b?Ew{?r9@8+w07kUvcjoTYNt-&UK&ux+HRa!qsWyqOa91R
zJQ5krP#^#s_`ztm)9Dp3fw!lL!_pHoZTCCHV<p#TV<t|5hrKU$Na~I|HGA1qVtT!h
zCodvhM~My5hdG+lko)#uKbaqrQpLV5e4mwMB(@H9V^vpZzljr9xW5x&L&poRSuGwu
zPS8KY)T^fT28zD%t+GMbVS1EC&{kvIO%Ae=Y5RnT#?9{iNXh|faB)de_3iu^$T&6*
zXJ~8XWZg4b-WmJl<Vr6@ND$-4vWqT>Ug4^n?IT>HfJn$=pC_I{+$}dgSjB@cbCpa(
zwCvF@(l>i=h3Akz2b{TL`{Yt>9yGH^TCivaS#e!*?b{442&d9fSaHB&Lk9^WM+TmZ
z5W1IA+Hj)l?QMAU3GQJ`<CB#6Ns!`=V3Pw0r1<!pq(<wZ)%5eIp9PTElc3RdjQ(Qs
z7L^20>9UuH5|<XO@|Ox^R?+HL4f^QLMH}7cQQGtO6svEZExPbdiYbx>d~lYs_5hBx
zPr-|Cn&b0eOQq^ShO%+mzdgnCKI~*M@c)5<7n-if>HUkM@YaniQ6?vZP((#}vQzi%
zf;9U^zpLFTJ}YNz=!S*yZ<6riDC>6}b9j_Tgbti9QS2LHE`mpjGru^VI9{MjFAs;+
z0x4!>zPUgvzk@g>-}EvkZ(y|nyf&diO=1HcUHq0?6DP_cx|77L)>CjL#H>YQY-L%(
zwth)}HO_GO++g=ev3G&>o}b=>Y|`d$c}SC-;_2d5|Nil2E-DbCmr+1Gzw+X{aFIsw
zA7652{O<!L-S~z@g6+z^6FzNnPu~Qbi4}y6|HWG>_Uz@H61l(QCoWh!YNyp8eSd?3
z(<YX|y126X*j{nZ$fO(b<mzK|<5O~GZ*1ia<0dAbIcm)YV{e=I@r<@EK9z%SC52Te
z<F_%o9$MuW-c>KsTU95NO~lv)AGWvTC<7^CdxC^#xSiCE35GH8ZOH*=uBkUt<jfT8
z9Q-fmY?TL&=|jf$32EwAk%foz-ZpgbXxe0#1N1#>^xzQqst`|0BFjORh0M}13U!;l
z@lxK(akmu@{gqHI4AEFUI<V98n(^VU8W?C>@9!*{>Dx+f$6E<0;^EO`AEv@<%@Sfc
z3h?WccVJwM?(HKYKq=WYQfwn?1g@-O8~1Iy_Gt!-pn9%c{UdnpJ_mHwd)ggO_eq*Y
zsy>vAyTHAgc|Q$qzj6lvDU6=JEKjOnGB}tl)O!<Kf8aW{QfyYjdIo<6f|70N!#90a
ze7F>D;9t2iAhfZJjz{z1J;*vHc_=cKL%H&$oNQZ12qycMV6E~b9+;S9d14(1QJd6p
zy2Y&H<&=<lhyLV+^}826JM2QumTR7rGPB#O2+gEvt@7a+J(kFaCZZhtCk~(O+kmwM
zZ`*<wf@BieQACt#wB*vq+zE^yjb3Nj+(sOR+dpzh!xr3Z6=c@j%Sti%<ux19K75T4
zBg#cwMjF;Oey7usj^hH~%~c2kan2p<ZBz+5bCu#2Sy-x&Q4KUR_ncs2QIE<af68!V
z9>e7PyaW7QFN01U(Ihsd*)*AcB_Im&(%uoB>U(3G>uV}h&t$7imoxS7n!n%G<J!k~
zU|^QS9dn@i>EI?dfbo~Ivw9O8H`|f*%i%KB@mV&3`xfNt`9NI$C<xnTOf7!r1Z<}J
zio^xF!-lYr$bo5fmiY*-`gIB}Ci5xb#+DNUSI73B2I1acy(OXk=&ey!NTMDR!7w(l
zEj}{%fTf$2O!D2njdJk5)t>@cttd0opX_Wnz7LWA9<qzDtuUe0)Rr02c9p^#&NeQ9
z!l9YBQ&7rU#_XCov*(%jlEwC1@<vvhLhdelfO@{F+)mH~0!Thf2jA^bF1@Pd3`^Z~
zFw1hVB8@5sTcv)Cfd$%YH@v!XQ%i7ij)S~%(*2&V_+1{@bEaMFcpDv`2|nGR<#+Z)
zt7B|5qbS2NwnnYAmwDnN8CZLw!X`p&>O%Q)ML*It-)`L1{d9m{x3}i2!=z{?ZODqx
zOTxK#-y(3ciwRgeOggtgkrE6|9d84`f34SzEZI4IYnyimM~v?-3qPzG^|ys)-VDJ=
zh)<fU&IoRTL3Bv>s*g6XEti6uEO!l{Os1_6jQ*q6(g_Lxk;HloSL@20!yKEYvmcN%
zt3->vYjXTSusvv@)8P<aIeZ-P=pe>N5Vi>}Zp&J()1lXqju7<VRTG9H8=qA9A2)a;
zhh!+QP~<c*h1j@(5Jku9uO*)`LYs(AopMtyVs44WoW^`C9GFf#KchHm!~;@jj>2Ak
zmMz5e8y|MjlK|uS$rAZgM3IA&#v*PN_LRnSthZ^7`RRt{3Kma}IP`CfO`Z5*WPvLo
z3*$mnHUcN=2^EBiWmM2P^q+?)gxxfy0a>wzk2oO~R?A~KU@CZ4o0`BO**^b8oPi9u
z{<i!U2?I@Xk;!UkB>z~z?&Lm`HN$%5hmq9OTbwU0%n|NN*EX`dY7~e&(+|cs2S273
zl!IZEIO@j?zh~J9`_VJ&NV}TYf`w!DyC|iWs@1|yLm|IlrUQ|S5p*}B!6!iqojF1q
z0+qV&sO)GZ5w0GZI{{P_52vx!%<2y(O@W$qT<^T0QE(s~;)_5Q&BK?TjKPi_<p+*F
z7X)Ck@S)4-xR15No_=~Ca`>A{9Ta(mV=4TD+(01}wtcuaj<1Quh0CR%o~&!ZVYxt4
zU_eguRe^mn_W7~?n-L3{FD4mue0ygP<a8~xS`FXZutrSl&I9NMQ-jm4r_glVLo_@7
zoQInqo~2u1BlTG~)?4ewV8}bOW0>g|f)lIVdZ?R$fEisJq>h`CvMM~Z84eDIW~9Z%
z0s^q<^qvf#^bvcf72qW5PKAFBI5Fxw&m;XX$HdQkVMGf*#ulwHHaySabntZUqmJu1
zZlFKyk(Dcq+$h43<4<=VUp>cY<uR3UCt{j_3W8=DC&*)Yx+5F+aj$WFp?t_47r4up
zf)z)#<$$11^OpfP1i&{T>{A(8xyQ>y$M;|pM#ItPU!UNn@n?=`47(@bKJYY*D=YXd
z^{JdJC>N6lt)^%K7lkg<t8uT<7`r$@7%d$Pd9rWFl5ob9TRIsyswh$t)H32wqT|z=
z(_PUc9(#tuR3LHdBBgW)#SD~JxODm>pfn&)ayU?ZdCqEwEflKq+{JFy7s2LMz7#AL
z-*QkUog+eW{--UJwgl$%oi3X()U^(kH#m^<Cj0NGjw#Tejtsbdihe9CG}h5z9TjJI
zXKSs@xcIW=oOfy*E^tU6(!EpAHMPYFS_<4y1olW=SAq1>`L0!^<}M7iZgrcHRZhKn
zbYWp_xQ>KnW&Odj6#PyNTZ|OT$(%Z+8Qt$e1MBO<wHgpvcws(G*F{w?NyWLrJo_1g
z_YLEaA!g)6mCSNIK}Hawgq0C~9VGg>XQXdho}zM1z^C+kuX&?cxXwt(Ob<xK1wjzI
z0zy>Eax-gVo1E@&rd<(OUb*}$IpKfIW0b``ydKkWp+jCw-2zI^x;vdu89V-gUT1DP
zRm_Ife@1tD)R>iJQ=U$z<DfG{>Qty+{l$Oe!L&M633-*Pr@KrNH^&1jj#UtN#361U
z`Lf=wxb%<Q+{O2kJ+*>jgB4vv>=e&XO0Qj?F%pc0qHkuM;$4P!o=iUf6b^flM61Je
z)9WHDA<d<`JbB~Ug|^3r@_WJpfarCG85Zs3h%Yj8S>XN;q)h)ibsY48Zh(_amJZKp
zIE_t?kx3ujo!BYCf$m*iH>%iXy0D;YpEO7I^4J)mT$~$*!l%-Bl4ub}2>}Tatd3we
z+@r?Jbsw+FCj`~R#w9+!9HkA8K-Nt=@&FZFHv3Li)rXtfXJ0Z+deZhilsFLO@));T
zwte!?BSX|Cn%%)$t`ey<7TD3A8qe(IP66BQuqv>SoO1!5BmP?G`?+}@uF{Uo<jUvC
z#9&HJve7;6=i$B6GpxyvJCoiJrc08j<8E9imF++I2&jZop=b|GtSWC;<s&O`%xi4Y
zBLr7la>rTiO$&>yTE5eKuue;)*_dcB{7UyV$t4>$4rJ4a|3|2GA#h{ScO_dQ^j|zz
z<0@Ud*<}@Q`c{D?N1+AVv`8k}3xKu6eTBxX;!{s=HWi_tN~7Zng`iC%8+qq++<9l6
zUSLhI9aDeTrhuC6k6xh=Wg;8s#2J1rn(jJDamL0b)Y*xmY5K(Hr44{s#6XW|j(=1b
z36-H$?W={NA9eU_97Df-Rl_}@{qg~D#(aXZmVZ^M_S1KDC1~HR=ANJ;PF~6M2-@Zb
z(xiMVC|CFK?%Nz<wC`kM>}+1;Ax^$$;FzGPT&Q|<7)JlSF_i1nrH!*~k6EgyacfFS
zJa2Ov-7=Yg{@co65ENbEsN@GS+Hdy9y&EBD>vvwF(6rPb*qF%xgfxfxbDja$CGUhp
zOIJ&LggOIowDj%IDLl8RhAD>c3#@pVO+VD0&FTCRB|!z=8$Ta=6Fvaq(Za;*tU8q~
z?uD;w8M*RGe1e;4FMn&E#58+@+}*H^6~2YEpes%KsPby|-zZ*%#@^kb)OmCR@Ai7r
zmkqJZ&+WIkOIreJS>_ubp#`F}rrhNH@A#E4@ge<E*siFnYxtI5Wd8iNsjeFt;wj*6
zZ!>Cg!E;LZv7kkj>aHV|YC*$m99=s6!QTB1Gsy}FZ(@^#W)<mmN3HcV#Yhi4D^fK)
z1QK%GgoWdMg{vc%I`32=Z}L`6>t`=1c<SotYI$=R9X-<=*E&~$nB=!+i4tDrO{yZI
z?>E8m!e{)(J5zS4FJn-?QOnLjzgq9NlS^`hrIu@x@6!%nLfRIX+_Awye+VmMML77`
zhKceP1y90lg@&#*FJVuKC1}|c>TI*t82F~0__SYaqF5ig;y0a`rZ!NZ69aS*j<7kE
zpa6}<+V@oaO~a^Xv{R<CZ+ru2_Q&?+I@-Slbt(DBC(Zr!7k2NH=<((NIzbV#n_f0`
z*stpk$<ZiVb6fM-Rv*8U!<DKIZ@%O6?WaaV627;rm5bT&BKB-w=-)%-{O(j-(W_CL
zcvE5jVb!c2%`H&6dd<J}WaQ&Zi2b{*1D`kSW*h0RItjZrwVU!t*U9)hzO%lR&11`<
z^PO5=2(_a$V$vNFz=;4`Pn%!)jWd&+QLInX#b_g^9O<<}%x9}hQJT-6+N~G^RYwhA
zmY9nacSS$lw}&_E?mbq$(UhAB7MX^mKJia#_xyrqM9PtnqkmCacD1Ow)v+_l{lB{Q
z{|oT8G4yfCiFI$G&TrZf$yyVO_<x|AAFfc#=Bz&}wdvVV<`kq>7?|uk6*_gcbh*c*
zi{Uv(*nDz_@69}^usU1e>;%Y2lYCc^VAjCKr+{lQ3MN`RUkjg=-sMdmhMl-^KqDF=
zSj<)LOhU~61$Azgn4=k%qih}J{~yUf0u7<N=s;}q)MDNiUbFO`^4|=j-z=VgY2hbI
z`@bgkywe(e%`W4`eDOXEBu$dw59a-Uj4e0jfg2)k)=`|H8{_?-z|{XEs?eaJY!)x9
zukeic6IuDl|6`n5C8%9{xii(}M2zrl(>@b2*Mx)RCEJK#*N@IpzxyjeqnRthkMowD
zR@GcdIPu@WHK`+NoW?ao-fGnoyYRifN^UWkzO4cvqhz$?&Ezpy0=R@rNo>TN)-*;#
zAbdUXCO`_gvkRJdc3v@gS%1_V_3*|E%^6zPPXKu0R(N;Ge_F4o=4MEF)W}w*r6nVX
z_Y4Wq5*u)na};u%-&v;|@62-C6<1xd5$e+O+TI7%aM~jlL}izdVI=cCA{^A1g=-f0
zV$PnrY1&F|P<YGv$$!s^R8Hab^mArgtw!A_c(8({OA3R(cJ|>4JlYIQYc{FW>4{Ot
zo{aiRazsas8y;9qAD7sf*qmg|bi#14+yv$CYe}m!`TP<*&Xp2pDRHQ5ndAAVfpt4{
zWi!deR&J&?Q-QnM%coB`MtTXaCgwy!^t^zk2Uso!x<Uu^ZhfJd#Vm^H{wc#nd55b?
zX@CMqPXQQoPZvmVPa*M~jM4!YUW`7U^TiRRjzpz00+X7&kv{8#%uc8DB~Zi`+^PUa
z#*G4q<$*i;ihq#n72=Sd$Ha)Ezox7`^As*1q`pRSVs4oG_rrsPRE@0r*-!2adtGLj
zb{)#ZIu_2{Tv?LbSNCb%Vz#fx6hfjdM#a~(iHr{&(N*LNmn1gUNpPfcq(Ob)-|FEk
zgqcSAEV{>_a|>&eiJjH(6t4v(yu>a-f@iFDI&3KkU+;KP*JC9hzVPLY_hwCi#Q0UL
zcy^O7QgDLs3?aHiRQh#Rq>W)KbRxT#@Gc|Pws^u^HIvJ*2-<3OCgZYZD7XEc$(o4d
zO=5~yV=XfvSZ#_&_lwsXan333;y%M(M^gRlCab}(t~md=+=YL#QH&eKOU!km?Xy#`
zeN3LKRt35A6w;Gc1QJ)OuQ18m)o0YX90m?~OyOs0^S<<0Dd&J8I6`DyahQB{zkIS^
zu^fN^hnR@Wa~pHq*zQe4#plXn?vi&95yEIN-f$aH`4E6t4a%2$;<9JC>$a3jnQKXH
z(yu@t2;lMPNH_Wo`AAT@)?s1T#?_js({ePX)9uhZVGK8x99Bsj9Imk{TyOS7S0oan
z?~-4w`h9O#=C?}p0F=J_o85Blm(~=sNq0=V+sGtSx)$l<BM7~FWCQ#XXs%C~kx3)+
zi`43X7Wzb4XwHOXb<%`5LwYqf^$A9SyZrG%Qvw9zU;OM59sF%a2PLxVn|Cr4$J8w}
z`j5Aj@xfZ`NwkWToG-dQnNfOiKN?uV5eCFw<_&<qh7yPH@>4|*>mN?6#s#Np0UdHK
zDK!l}(8^rxTNL4hBRBu|Lg(Mffu$LMb5+6!HCx-&0H;D|Fo`BIou@2Oki_TmX%`U0
zR4;pf_Jjyw1t4oVF?#M{xE5QhnJ!odvVKDZA7>^xH}Vgg1v<X$5L+H=byq!{cjgNX
z$65!TswYWK8V7x@J`38oN=cB7Aj;H}MzX-R!{sh+%mWv5xZFWjB6(0TcCU5^eCmlL
z>01A>5cQ_!l^wo&b|~T0%HAaJ1%YlmHKKCNY0FJoEBB^2nyRb%Rrl1#o|Hc@=*R7<
z9L&S*>`u;}`0+S(8?|~S?R|^V`?q~0+(tWGjv|apK#@cuD!Zf3r#R^aS&YI~qC>bV
zo$UMdnpap&z2eXs;HzJFYL*4z3HuT|^OO_s+?2zQZsd^%1AxUSGAf6iP}SQPpWo`q
z4+B*J9H^=eif{qAE5H@B=jB8Oox+`wAR0UBh$*U6{~*H?NMti)_4G+qpZ#AEB_fgs
zmYZLx9NFzWeZGb24qb>0*PbJ4$KLT8M<A_dmz!IP?D@$U)UFHyM(Sa7@2$n5a1(Qa
zSD*%G7<u_pZ1;A|N6kr1?jzbJfK_rEd)2s%cr%+R+t%zKFTZ#S74~4XhQOfxidUcx
zeR`dtKu<!w0_`=8L!kDfsv$d`PW8Viyezk;<<ajV)=A(vS>X(;q7zqs-aBZf+SsAL
zyzB*zQ(ASn3m8%4vp%;~Vzk>TZhY8NT;r3KcYSlXoWv*DPo%U8=(ehy162deL!AQW
z3zvp>+nDz9bOOxQfiBqhy)F`*z)@DW?h3KUsx?Dc>s5v##W29djv*?>XineHX1E}G
z#GR4>C@f7QYJ?vkMY>qe`zH1sSU?u?>J;_K6c}Ns3rnj#-Ocw}pFR>LLF#ne*Pdm!
zvPzCp11{T73Y;&rHPC;8!d*zUA@gCA2<;djcimS&5$3#ir(C{_*6;072wK^GX}^*5
zSp92JB7ibZ9wR1LnS)0S$evAMzZi)C?X(HAdk?A<2qwA%V^o5c^-MH126N1v%vHn>
z<^~?ONr^m()g#?#GG1ZNg*nR$|F$bn7HaLLp5d3QaowQtjcKxrehI?`OwWzB#90aT
zXK_F~t4rh1#=LZRj^fVmN&tXbIXH*L%;Slj&QE1%$1?f?`3rdAx{msa0QN|uSun*Q
zv3;)mV#+?uWpwG7rf8=bQ#@6owNG;|jXv>F9Xd2gw=vZIN76xI`lIFCAoX&fbrR&c
z=^KQrp5)i3a%ft|(T){eYPof5+^QLsS34=&<#{aMWeHpaB`PW~WBj|ygPR_2kN=UV
zLb;MPfH(a<Z(`A9kV4I#YjPRY`({*6htzp-M9#}?ljF&aw8{y#<(mY}x&&M3%Ei_)
zi9$*LUXgUUFz-Zlfb8fCMeARKe<BMnsxHcXC2_bhhd+5SWg!_qW$UKA!{lAseT7EC
zQG&2(BRA}E9>7jiyEDNI@KF8x&&76}d0<8nWE`AdGR{JzWN49)X!XXh9Yr*axUhBI
zK8XnNlccjT(bcpX5F2GT#~;ci+X~5^OHKd@#a)_E^=b@YKpznx)#o=S)ck=1_VTG>
zE9whwH~@yQ>Qmcg$9*g5aB)r1h3?X&dj!<RL({a-=%C^|iiznH(|?Kt_|=(9$t=%x
zee(qX9X^v%{8C9CAwbPm`o1JFUJ_LdY0O)ye&A0pg>V-#nr<;49skGmj_tDBUeaRT
zi_M)+b3MIuiB%hH9+gU@6X(~B2m&ZU(fwBMUg8=u9yV>wRe$am33+48b{u7K`>R@9
z<4!(-mFq{i=F5?&(=pf$>|g<>0~>&@sUL_&QEm>M#hwd1C;zgW@Mx*w6Sudh0b~Y0
zJ3Q`SR@$Z9>fW+OJxuMAJ%N+Cn)ES(K~SZ1ay(6zu}Ti$d9465R!t)gT}15j)BU#x
z;iCrqrA@fyY#k2Fd)wXxSd*1^vb$5hD~D%3fNn7c7Y_(yuYIw5m3&H@xGZc3G>9D$
z7D?zjWeQP_M|<Liw>E}F^peX*jWmZHZO6r`hE9@LvCcBErjh&ILy+uT;pR<nqPOiu
z4iWYe`}`REwqUyTHtC-KEce(4sYl5`49cgQUS5DJ`|}mtfIlxzM26PBnm%jemgJ&v
zE{rQJMe_76KzcE!XNFs<Dli^|$FtaG@WQBI`#=Xu4G|~@BZ{+{LGJv{GJkH82_R^9
zVV!icTj5MgB#z{qh?rqqo^N#Wl5Mr*_c^=S>NJPRUYX+lLu>wE@A98S2wENO?&glO
z9JOq-I|YHd3UyYJ#)-S}OF->(J!8%l=%U)*%XUsiQpHw_kJ5tn=dU{J8dYT#6jw1&
zfQRYwNvRu9a`0t-pw;pB;Nh%Fd3siLN@xi_7LT$R9K-xz(Q7@^OtJhR;7U~Pcs;xH
zdVQxL%1gCT&yw59Qq~wuAh$3oJWtk@>Yf_KgloQZy}H(>!(gc`*~6x8x+6Di*Q)k%
zfUry?L>@r-_D(m`yNG69xC-0ur3`U2%$QJ57~VaBE>o5ffn;{SD%=9Zs5wTiIwc}j
z-F$93)-Y7aR>-=<+;cxqma&ELy2uWDSj$M?gfHbdjiq^I>g{{WeTe5X1II>f*kco1
z{1~mk?y<$!m<%Tq`>NW)Ms43kAJ;ba#^(U_tRGmCHW^E6Lxx>}?`=OO<vJwFmjy7u
z#5)+9QIx;{a%f%+6%)%jm!<Jw*~V9HYvPCKWvoz^Gs;yaO5~F_EsSJiR_*1JH!n~O
zgGRj1*(Vhz(%q*^kn)<4zG-S)X~UVwW>cDbIY9pkI<D_&C>Lm+S8Q|AX#UJ6GUJEM
zs|dw`Hh~4s2Q~fx0u3q?7L3s|y6t*TDUrYZnrQXp_|5H!pZsB9m-kaIAfJ-8hn8h%
zUc|_c^uO)6JFRG9T9L1>!X)sIgor)}x`h_)Fn+Gvi%vWpP}CY8e6jnuKb!9u{pzf0
zmz6hRBkC;>BzwvxNnCi~7<Bt}$M~<<EAh3w7P6yQTUyg)?be)J>otwP6R{|lC<6?S
zq>zL+{dgt=g1jM6779JfYLq6M;(gZ2#bWCQS553=El^18<HUmZlZk2%vsz=@^S&k2
zj|c9*>;^P#8WfPv{G_4v+XYCSfTeQaA?j!A6j1e5)e-bmOJ}ec70_WFHH~@Ea_jMP
z?Gd_<!ZCZD-brM9UL&3AiH#=HCCZ28y==eL;S2`93#tu-G8>?sku%7uVDiHx4TGxa
zUK4xe?Qq(a-)4LpK1?n!6z2(p_yy3w-|7pDE6@d?t78M28ouANq6%*E+_Y=?>vd<1
z#K68e|DSZi|Ei&~wFjFq4(R^EGI}Y`_sb{7mGUNDzo^R8z&{o=9VkvIKPvV!e+DLp
zvRjt4Drm+wpFMXP)vV@O4#b?q9K(AN=~=aP>3pcdr=%7CS~NK1*R8aw<<YDf(h1yn
zdhS!ZBK(0kRTSeCgUlr~+w{{!f)$1%^+Hz}(?uz%j8W??QeN^4R?_720>~$UeNw$`
z5_x5h2Kstr-o#%!&~^+wj(%iWo%!Ch9Nj9l7Gk@{FOPYf`l8#0b@>q6O3QpGA=A>C
zJ7qhXGA2RDAm-P)BhQgYrT~qN`!?AFAH^4<>z~i-nANfkIONf>P}f~)IDhh0^;gQm
z0hmZ{1ML_g-NZ436jDfu4PaZhSDg;$qV#o#Lv6Ef2H%_Wxke^fI@h~shD(2eoa;SG
zH31Len8#MrhP3`w3jRH*1(G@1{?CPf&uO7ve#o~-O!!kRQ?~Hl>B|a8GYKCJZ{`Q`
zO0y>vXFa5Jy&acCZF6Xi#`6WBPR?INU>-JCe<hOt|3uc`V{xP!o;N2UY2hmGjq4?e
z-_ITUN7<h%xufLZAUkPlXjT&Sm1OuwI0PwIw|T6T&@Ip{Y{+{2CqhA@1K*x2`;kxi
zAW757zi{zm70aJA#^Xl`_go3q`>s6Z_(;BM4=;6%(vs#M3$yX<1SfjfUSDW-$ACo_
z80jvNYSyR!K$8>~Ncc0+VP=J*Gy~-O)NIa+m@pL>x9HCGUJ0vduc(yx6(A4y1)1s1
z>K9(E3qHTTI;C9B8|xM!PT4cM=|kN%@RRvIi<0sHBD|FxWVejDVn2WPE=E8<@*Ef-
zzJe@&4yIfbp3Css^9}0F9SS4@MHPL%r0%->7at$nd{QlPWRL9Eyf%YYW!pIInw<2<
zplfL%GD{SiKnXbX9^1dj`e$n#5-;&zD3PJjACw!vWl=n-Vfn-W{afO{IhpDkaF<^8
zkn)<4QXTj<FzHl<&m23FyCsK%)OiDIxke2RI~2~~7AEe)I=#B9c}Z+tkmdA%0x($y
zusK$NKK~DGTW^o@(3S9N?9!x#Wk2x+qdH~wR{#)`b`N$QhT!jTg`EbzZ&gtJ=-@yH
z8`KB<hSjfu1^{}BPs)tZlYju6OY1CG#1Tucz@;q<|7YUk3f!fR0I&Y;Lo9ZNsSb`X
zc`gbQlFu+QW#a#IfP#j1z*XqYFS*}ws6Dy5_N!yF2E=vdbkv0>P7%e}q}3w^J1~?O
z2Rsd+xO568!nEp|Z9E10)!{DNZOljtV88=EUV#t7=+R^OF%xhxzC91B&u$Rj!nfL<
z`>xd|t`bISh^mb0KgYy;p%YHIbgN~$<Br}{0x&9H5|~nlz3~M7gc0I_6UT3k2(RoR
za7X03s+xk36RY`nZw><+zo?CT*x0By%pIRj>Ny^bjfJjwFeo_|nvCIl(i0ZgK?)S%
z0PX3H<mSky1f*Nl-4tM|ugi*ARia{gH)7URVk0t_e@e^96SDb71wfIwa<Teox=nJa
z4=>Q45A~j*fpnQoxarj4A3d3UlHvvP<Q89WJI=wD>^7Cq<?J4YQgW4;H5N4q${a6w
z8#@N<?@zfWecS|?o^w9U?-h^<?9WD+5My9(S=U&qgrk^Tmy78}l$0kzKXI<DCGpaO
zF(0jLx>ZClAHDicoUeG?A0_C;-%-u{hu`mJ#{~=n5`a0J=t3UnsJ*R}3$%=|DQ1uH
zTPhOxKCeN6Ar}fEn@}GwCW<ynWL*V8w40)v5~j4mE7-A5A^8A!*dJOgM(@BUMsV0M
zHXM3GJa30S=u4O6%3IE~U&c!sc<`N?jn90>XMZ5q#q?lk#6jFKU*BaSzq{0-Ygb;O
ziuqnePt&UKI%#Z@Cbs5tHH=q=$_W5ah+j0H{e3TaU?i%M4<=ouT3+7&#s`})F7%th
zVSVoU#jLTUpw#Qjoct<6xF#<`l!goPSSL9Kk&djJt4hEnV#>Y#bHV@*HBNG`lao+`
z>mFWKwhZG!?ow;&4@QlBDb;=jWhVq+Ax&JOhaF4fWhydgV0y1B=_Fq4Aa880ib2^C
zGco~q7W72KZ~i9$>7qG9Nf2@TX#sxN^QSf?`}Yd^J`nLa7`?gZ%j(fH<mPdix=&1~
z@lMH?TcQC1fqj#Ffe$Gv{d6hz8)|%r<{*vA2!0#=RH#XVu%NKzBKz!`Va8Z|VO%D$
zH?lRxNl=E#S8Qi~Zev;pTM;nfr#bP1Ozr{pXdL1Q=L7h2O&2haM)KcVyMYu|?v*u0
zh!MYX_Sn>nYIOG$FRyd^h^O$_`)D@f0XkA5Qqtn1RV`?mzMIqXux>LyMdsI4AR)i!
zKnh;&ul=Z10j7MJ@*~Woia6#@%8f3S2hxiB9?!juTy#&3Y+loIby+=6k^%6tmcEKb
z=g0wI9*_e$o?D6KTL=DRLlE<+3;--mC5X-$evayfpxlxBxCPPG=)(GSPu<3m>|9QV
zKbNPSK}SM;Evag8adni5UWcqKc5+Q$(S(Yc(km<Nqo)|Jo@phu-l<Fj6}=BSBs*L@
z5mn%93nnNwH>)A;VhV6QBlkZ5n8nNnX|Q$<OUqKf?ZHrqq)p?3P250Pquxz;pOl&%
zR@a=>_)TRd70{E;DepF%AB>FXRi{E*KliOsMRx%VrBJV!+KYB63#pF?NuI*!{US_e
z)y;BJd*~;CS3~OOX55a}h9}Jfe-*>|>J0t%pd>&uo#+Zfj?cp;eQ1Weg3jO0YgGka
z_$M}y1a6FeOCeBtaF}Y?A}r-W>)E$3r#uyerHZ#$H?W8r+;HTshE*n$-u62>@xV$8
zdh;(hE}3<d!K1Z*w4c8hi5zjK7<Q)P+v{9H=Y6FDXIa;kRkPYDKikQK9|BWyE@cO_
zsYTZZkR)pjdw<BjneaH}u9J4Eg_hQ_@C`+Fz+jc1-4O83p3ng*a9S@@I?UNLXMcW(
zzM+r1vZ9`In5oGbpqWnH)_KMz_O2x*JrGf-fH5cPXUCOf)Xl+WzyM#tiRL!ieAt{w
zf#FhlGvVRw3dn(H`v{CA`rxU9@*Wh8$90CrGT=zHr+L^suenKwKx@&-Pm-YI$Twv}
z6FuUn%0iqDhlHD~9mQEQwP?}{ovu4I8?z_$TnEOD04hhZ@_A8{ZJ;DRo&M%VC|{J`
zH`@SNvhGv1F15g{7s<o@WFZ550of<{;{aAZWDEwZfP=Z>YA@HIErv9ybSJfVLe2!N
zGbrSe?gv2Q0K3ZgG0z1KE=TaYx0fg0?<8KqoUWfqeVR&BGQ<3yM8gQ1d2oe;+6(CA
z^X2CPW1&eQz^;2wd}6Jpd$x}?kHUla+ync_9x7bqa&vOtfKK`h*ZKpDq7zpL^4FFQ
zsdsf|aP@2DpGM(Qppbca#Ww+~aBwq&4jIf<eD{=QKXWkfMvxd6HB%xpRp8z>u$jR?
z<kKBwUS7>eM+Pty2B>=$B=47wIt`Du@qYlH8Yc^_&$Lq{T=(*RMh(if@*df(&$ACB
z)+gyMr9x#&d(`Eo3V~}ur(uXZK8r<fF6)?2$;3~n@Bevv#AD_T<97_e*p)f>Lkg>(
znU^`Z5<5mqyg_y}E4qYJ%%1McMgWIc5i1nLbQzc=2mW@_{QZto532-WiQ@yEqbUEh
zwYTh9tq&57bpb{FE|J@TQ#R!NlE?zk!-%sAsC&$zeo$yVJ}ooi>NPUuQnFnq(s&l5
zQlg08TdY~}<*Qy&AjDs4u~gdHQXpclir4qBF5|sxGY*%iuTk0UxzK)ns34B%1jeIH
z_`ECNE_oq9?!&LP2Mau8KHt^RAs-Hm4(35+o+}#KjGkW4-il8BYs%^|ZE!I#uT+7&
zAf7640I<Mk@6e{5Pk@Mzym`WeV*GghMX+c+Q6W{5dRHs8q{OIY6kz%ZN->+&@%g?Q
z7sHh*(!#|XG!=OUUK3AHqsB`t%C<}cm~<kk!FBszWoxR!Q1{>%3crGH{`MDN0*p2N
zec{(o^dwUFk}_F)h3DMMDL%ez9HSZA2|dk)_*G2b=lh@Tbw5D)ykOB_d4t-QN`-Jc
zxjrzoOBYishF|S9lu4|GnuLnsH`lz&F*SH?!SG~cC}K+r9vc`uvb)8$WFSf`U?Y8B
z-}oYFv>({Blh)_0K?WVTw(lpU>ts0xvH|PslmR_uL0UZCToNhle~oz&|9LVkUEz*z
zt=Eoah+fIB5WBT6qIh3Qi5c-_6yw!2uEv&jqGfoK2?9T*su|;Rm6SS5?b#mY_h8BN
zK9Z;RP}1wPI4dOb@Bv4QxcfzSX`n-gaN-`<rr0GZ$9G4T`$Djx$L9uvk)LeG$Q1?c
zA0z&P!c@UI4YBUu-MequGwqOZkSXOMv23$yYwLw;X-fcY+=WZnGN0i?1)M;P*eh$}
z2PWvZaCdyiuG?IYo7N9cMQhI`cX{tkU`Ly}G8;Tfe-trs)p=nUf5f+)B_&H?J5C9Q
z9Ge8N@32S5NYzpqTQ-zj(w%B95+eX1b;yR#zth5L`M0l&J5|&>9_$3&h?7ZhcqlZ$
z7Fn=$IcUh;V;bDx>Zzy>w8}VGEA^YTKG3l$`Q0)(IO36G<MzUa-}JJf%~Z1YWY5>8
zj3y_)EdAaw9&Ono*LfVkKS;xi>FNKYy7P=`Vq5#Tjb>;bq(~4%lp-||LKPGcrF%q!
zp-2g#29csj4WJNI5WQ3pL8_ubNq{IV0qF@4nh-<?f=D=k2n1<?+(FK{?}zvMyYBmC
z)|xeI?Y-xj+0UB&-_QRy@vt1xr&w`?Gum37RvtMg%rZVpw=y(Db!Gz#k;6Nif282R
zKsy#=)A^st0&FN9;&LK>^}AgydpVW*4Ppf|?lP=UyG)<jREv<Iv&-_l9|Q|j+%q5Z
zPVFi*cr2aJB<6gXQ?*z<dBb;*eq7p>H2x5Oa<Gz~pt>C5RS1&H>%b!VmH7YU6=c|2
zxhdk-n4ww|{Q{?MoqIV885Q!^TMc#i<`T8J+NF7_Qt`l{3ogAKcdQ3NE+Q>3^TL>W
zVFf3LjN~?c57A_8H1yzBP0z@%h9i1L+QvTUnN5op{$T&B&IuCL<8idv)z+&S(@hPV
zwogO;m7#CguYY%uWuT6h-g*U05$j^Vt$vs2rj;7}%oVeZ*-tNf3)*f)0~X~km*o0B
zEL2?A*?VkSXtcj{ZT!f&H)m)e&o2D-E8gNIZwA5*$zEPfX#A96SC8<s?v=m1>Mx{q
z-6`Z7+8EBf-j8c8@=7=(L;v0tdS=dW8U14E_*`Q@&9~X|_apg3`}Kfjb=aLVn=3L^
zMfE@D^J)#(BqSKeE%UJtk2$4e%W@o(K(m`2sNGxLQTOi0l}5c+nvgp7>#HdQN4=xc
zEn+kxJ-uf(zAW^MMiZltOHC?sh_)kJ9R1jQoUF=ng~i=Jf4OyVXn-JKWG*bi3)vP+
zeOUeVlMTr;ljlSkEX02GUCJVt65|#mlE>ZXAyQAcCRg}xD$6I?5Tm1u5{^FIJ6U$+
zU;j7@1@l2miEVU?V0d#&vpFmAf0Jg8UY@xs9HLdIj5u|)$&xC?C}p{~?{$76Jo?b5
zwB-QuPU-Lpd8+DM<$xXEN59-V7k&W#*ZWJ%9TWTiP+>mFraR0Q=_&*3^_^0xUEiNx
zv!sLV-%IfSCQIRezr)g6zT0R<2=NrF`JfWOf?&F9+P`s$vn?~)FTA!Fk?-aJWFv@A
zEZ270$zSJQ+b`Q@v)XVEt*+n9$F%>kLqGa>?>?Doy5M7AP`C#;Wv}toJwjdSp=Wx_
z6o+g1#C9CMTx0I>dbeiiH|(jI{(aB0WA~U@?N@fWJ$wLe>A11s*`y;GYOH{G`C%^O
z#QY=LH@>_hm{QBHyps$(1i-UQBNRD6TIaqUD;z8*ghHLWd7<{~E^`XvIV`Ij)j*YZ
zPOKJm@Z-oS+cJ4Np?|Z|=u-(h07ML-Pn;Jr1g-5MdllnR3AVeJO))=H(XJ9HlGV*l
zpepFtKmn&qvm5@q;l${eAS2bPF;LG#yE3BtTWl&~x5HkScAi8?yp8v68yiCFdmtq}
ze7kr>Q;w|6vDU+S`MVe`rqQ?k%~{3@mAi1MEo@|rgdOwVF2=5a^H@k2dGLA9E**>|
zv$WUUDqqB5xBG1qASg;9#-QKb^kV*Bbn2dA{WnG{DeT++{}a4dR>W{kPZBJfImi@%
zxscQi%IT2e9-_Q|5zfKe<XM($ukoh19<j$TUd$Vzrzeq_-i9_$6-=!NI)aI4p~S-o
z_kYLV{jK|qiW(=PYyoJfG;-*M+R+%o<|$a<Ju8|;0Oatjm>!|$PeedU<HfI-EXX}>
zWFyYWD{@l*`kT~AsQu5B_$Am!{$CF;lnFMlgt*nH2Y5|s;}N1aPO%EC`6cX`pN%=!
znd0%+Pq8wMIZn(R*ZSTDHX^J}@IB}tteb*j&x<aovqYaM{*K%CQ;#|(mOc#GHHMFt
zIFl9*hliF+qgH3{E?iy9!?DEX8&>PG(`W1dWa=l=T$C%#%CC>kJ#A_0I_T2>$eIvz
z^pyBGxgzKpa$nN?IxJ8y27g9<%`m^G6KBvfqSg1k2)}hUAEO#__Ax9Vzk<+rGw)E*
zP;ye)@8|c50zn{d6m{w=m`~NZ!(l4*qD5VRzLe7CA{t)Zvvh950^xHT<LJG3EJ|~6
z(pdX*^SN?WViXFUc`C#ke%q)X3SMg~<)gm`$fn|lvp-)Hdio@ts;*jjZnmJE9=<IL
z)_9_Xto6yK5`tVjE7gL^^TbgoQUuq`ApP?4$MaQI`6mg`S1-A!CWcp-?+`wu*tZlv
z0hQZQ&#SKUJ`~kEqr5u#0axXy&1rhn7)mGgd~aM0J9soJovdK^m5B92iz|ir!f(v3
z;n-2U&e>Z*?kl;2EyG2|3LaBS&!I;gccxV${w&4S!P5h!n;SH95`nnQZLm+^Y?_b|
zg+lP-<^HP~{X#`5!)Rp6a@B>OSu4pX!xtNL>7kc9aVkyNaogJ++!R_CDn*MJ*-G6*
znxG*Np2QyO%<DO<xdW7MX`3cLxz_FqQ~K{@tcqoOW-M~l!I56PvOXX#AF1i!iY3gj
zx*y}<Gbk}_bjGGDK%K@dDG<xS)<qg@Q0O;O)8$zT@g!#^31`5Ur^;GI_(;eovfkr9
z;i6QOF{@vJ;${=b)!a1vlxcJO+x|5`Xll%V^K(A_Mi<4u++PUk4kS&53TQ*li%?ib
zd~3!$2mgfI30+AE+FSC1W+X1N$zw#d=$HJs5E>BaxrtF^S>&&ZZg}P|c%l4-JJncL
zlNHky!wsCc*k~o?T4ZE-k6irrNoC~TCV;%L#t;oxRbHK%jR$awtYWXZJ)>oL$|1M%
zr$Jj}^oz?e$fbH?Bb5*n_-*R3A5h}&c^0fa&+pkT5>HA`pZd#1?6Zn87^#TGC@F&t
z#=0qkHVj@<lm2(0s6u?sk^*-pxc3T){mTyFcSKi{$j(?|3K0k|Pohwgxl9L;1exZA
z(oKgd=PfFbtfVAI-Vpk58;r}O`ZtTZT+wlyL%-T7{_#DEtNr7(t19SaIRxU7P?Qj7
zv%4|$k+_c^jATorM70Ghk_rBlze;rwNJ}cT>C~c(RINfq-^(?2TcrIrGL@1lm!=(<
zrL-O9>tyiK14VspbnNDNxFp2)ns?NR@vk1%l^gDQ_0^N^u3$E|+kp;jwv4Sex|DC7
zY+MVxw6U@RCS4s#v4!wvllcSdm*ggrRpry`-tr9=awn*9&%fY)5b6EtUVf75`prMW
zbogfAvF7o>nW(^=Tp3B)#tL&T41iX$&v%~x8Q<zPj`?c((tXccip8W<oGiI^@oBa^
zU`OaJDB(w{AeDF02P8Du3Z4OW>2`=}uxr$laHJHw`5i~~>BlWp>bNV?d?H^uZ5qRR
zC>PgL@vlj^+2!J-ne%cIBfO4K6J&%V9Cr9mu@!@?bODW<7vYlaKUcFxA|&(b@z2xK
zIkq~G!}3cX$7|y=`Z?_ud8g9=E$NxrarR>#5o$mr2+E1jM^Yh8S$)U>6H91vkOG&F
zv>#fRn6@$v>eJzV2Qy3ojPc%Z>3f&nz%n9`wN8`zHwo?oA6InEuce3?3;tN>3hLCL
zZpAj;bbrNOdWs0^(fyE<B`oP^BVl6<(z-Z6!DXh8<b5WD>`78hNb`^Uq*8b?b<3Ye
zL1G7py9(^nT5}>x66Dz&NM>|r17LOdtnT~aBg4+IiE}}!4&RbP(<s;%hmvsI=nghm
z=yG_!QjNnc9?aM8AR@wYN|`S{fk`>(Wj|yq=P2}pJK3Uh^paZ9my5$U;4dED1qx#-
zi^Aa)3ggn45H*t}SX5_O{dADR)44?RKpl~cB(-r-At`CLkauII*wzdN)I+U#^Khq1
zGBho4Bo<%9rfh6JLrF~h#1M0wnPka4ER-mYDmEA5tRgE2R^48%GC(;}c=XcKg!ebW
zcXHddg!bX^E3OF;JUfH_GeTl{$B)&{UM95&diQQxi@jeu9vAzhKx<IV#4r)4kTD-y
zUc_m_ftYPz9Zz$IJTH!DVBx;d;vL1c8|eh8ru&Ouw62-CL?yjh$n{%PkeIXt+<(;g
zmYR?eB1d18_c4^{lsHBwJx$+FpI_)bg_w4JGV`FptHz!_A6bJpJWX@^pj(PanWTjR
z3Zaq4wyaa#e<pdG2MPiC)bi;re&x)1%whri`z*qVRHQ=HfMsB`@i}5wyff8DlOd1O
zv}84jn2rfCOBb_g_11OLV2i4&nUc%oQN`H=)$Fu-qeGGJ`2m4yku*l{8&CKX7@OWJ
zyx`_sM4PcU+Q)n1wCBLKA-ZJ+5H-ZVVm!ceB8!D9POUIYu%oZPw;TW06D2dxQ|}F)
z_cPWFgg91u8|x!2jo4(EA-46`8t#jI4fLMIFY(DLIQ)l#bMhG+^%tCGft+>p^@%Nd
zb62}@!242#X{ayF3{j0?GeV+X6c2?)jh|R_R*hk#0<@P~I_GdRMO<G(jmC?<9RTG=
z#Ap+<#;|dyv(mGusld4b`X8VgYpH&MS=b_OV68Kr9Wp4%Q1?U)3667zxRmqjl>8@d
zX@``w9qt~9gA};6S46_MJ<T3-vpyH`98)D$UQ1|_h$XyK2fe3e67++~oxYJk4JEKc
z0_S&2JU$xjlT+ZQnV@z+W-`Lb-At&R64iX7-(o($O6>Gs3a$Gj@x*S5|D<)KiO^Kk
z(l-vPoIFOKkYVy|yp3%Utc%a1VHsG@K6^1&^r1Y4hzd_%rlyZ;vTCy3n#Q)RkcqI;
z^4%o=LElJ?D+49MXxatPmXdPv(b!|I7r3cDAEz7EMLtW8bRdSasj#|IK&n)b{C-+9
zEd)_gd|0Hdqlr!{Cv)5oK65&J8bl|lv)30qP_sqTaX5porGbsY?tPFu<Sq)QB{YSb
zA6jLcyXv{WRZL^6GGt@1IqR~`?HY&u&WD)_wt6??0uAc%XJnN*LqG4eI=Q!Ek?2@8
z5QEC?P+0v!$0kH*Z|liTJh@>($CEAisGS1l*K#XI!)>^a{gWO0yIm8mAwi5z3y~Qx
zPVr?wujwRD&)Ma>=y6e9+y&5Eb(fN*mc-GV^r;_Rl0JaQeho-w&ukOEKs>rL&6Ti|
zU^X2=>>_M}8A{WWs^d%?>m<wRK;7PCROPn~sQMV!eyjCoL3IJb%kqa)4y3L6bgi^z
zaBD3n8DKr9I8|<w`cnZM%vU$>LxA;tK}tV77@99Oo~JQ%NARH*Q7ZLjl4je($po1_
zM@Dd;==;iYrFdK8f_@Hn+cXAQDSHasT{VN}dY`^KCE-W}Klss5DI6{$Ch7QMeLDbA
zZCzvJy8Tg&FAJKas4(P*Sa(#D0s!8w1-gxON4RWUleDy83m#J*_Bp&5wGc~~M(W6Q
zM76)o39E^U<drR=24+sK?OQSKi@f$84hGN8L{vL(3q?~o^FYn+*_?4H#!v-RNKFN^
zSD=j^lsdkKv>6t^p*KKZlaz6%1x_CyU%S-QB#;CGfB3+RZp63tRLH1%{qvIRr===s
z3l7oy?3ShxK0S&q?lp>*;pDSVpAz1wgX|1nw%n?24a$2bI-*;K7X*p~TTh1{nUJN+
z3#evIl_pQcKuS6{g5ZMw&4qXNiv#z}?p}bjH;^D^SzE3nWZ%e=IvVkp5AMS7S~U*M
z8?LJie)EIo;B~&_$%830#}9!hA!}TFG~Ix`Wac%Z8Qpsgi|3N0x69adaV&Imqn7J0
znBefE>dMP<E|Mc-9TaZpBd}SR6_8K%h!O7Zm5M1ve8~}1_Clo^n$fh&u&%FlVs#oK
z1!WnfX1BouKX4v>-cv;Z`VO5c`>hy{5bJNt%JuhFDH+D(iW1hg?Ahc>(*+mrhq$>v
z8t-r4$5mBH1Y1ax%<kW6AGxbL)tqmZI8UE!s{#HKAYy~ZjID(;NmkPAcWy2FMzVeT
z44iW8z?d^Q8cFJ^H&rECNK9LSoCF4sJePd?INiBUSH38iL`N#^d<e*XS5u^QwNC{3
zw$~tryXeM<U^963fHgq)r_0Z%<UOepRA8-#_gV(}iY@S{=Df+$w33<fqm18<x~8fO
z#z-_-s9VukxGJ2r-%=9#1-2N=)(CH(j=Bkl@vq<N$mCWC59!ka+q$RhmEJGZthXi&
zQmsuR&lK>_xiC-uJTc-mwYV$*S85-UE-Xca@z>GLo+c2IQWz5zs|QCs@jN^0``;t|
g?>QvOu5)CrIbsi+6`KJ6)7WEv-p2H;v1{DF0nA$VYybcN

literal 0
HcmV?d00001

diff --git a/docs/source/assets/design/v1/prefix_caching/example-time-7.png b/docs/source/assets/design/v1/prefix_caching/example-time-7.png
new file mode 100644
index 0000000000000000000000000000000000000000..fc33ef50d4fdb415de61cb9e75a085d776a971e3
GIT binary patch
literal 33144
zcmdSAcTiMa^DZi)1j(Z0D3Xz!qml;+5|toP7?C*SJSszmA?G9+l#B!=$q+_KL(Un7
zoO9-EUf=KBU)8Ns_f(y8?ybuoQ^Ve~cdyl}SNH1Y=?;0Nu0-&F>cO2mcL+er@>+N9
zU~=ENgTao21zd4ucN4#J=i?oa{0kiq<IVK@b=1fo%_21x5^kHkv9W^P-65^r;ciLE
zR93ft>Qq5u*0F+i53FPIqnx}QE}9$db}o-f1y{nG#^d!?-o&^bdS|YjjHRRx_a;+r
z`OS8VRDqtrh-GmY|KI*FfA}sj#<;g1lliUI0Eg6%dVdn;9vf1DI-ZaXd?}Lw0_H7)
zfvAirOE*=o12r4Cnyoshb2KaM?@(O?k%>zWR_m`}7=-ZlQRXc$(Yxo;h8lXJd|3xh
zs*bkH42_ginQ>)uVgdimZuPw^GC=>c_FXBRfo4wRb#3lA=L)_KnTpp&HVG0v3UX~s
z*B;Q|K=y54!{p&%e^ui9#KlOD7dcm)dvZ#+OflCT{>&OT)}Hb5!*EOjhc&-*adUT4
zTvxdZ$p0IWE}y$rZ%}1gV3RmE{RAfHhw-KR+sC=1&PMsFolqndvzi55KadqIOl>$|
zG#sQw<!Ypu_+3lux~V-@v|H_;B?zw3L_x+`*QA;ZqolP#8raB&PYwD}NM}dTP5R;T
zg&@1ci|YfQc%S)dvf|v9Coj(H7<|9RY!kq<W|`5|%#3)6hcC3!i5iUul2kJ6_~DaI
zZ$5}|S1~uveyUpd<80(eRm!=x)3SH-va`x~q=iyv00eSP&mN7bbYA}BWK>#`n>EN`
zE$aO`;Se=Paqg=%z5G|>-y`}U<4@SB_l-Fd4;MT6WIGx(C5v+-IoEDfI<XE={ysK(
z`xt>`H-6#!xXf2Yxy|?1d?7}>GbTm3W^>KShY648gHL^AN^-BVa}8rEs}61eIkeq(
zj*yTue)dGXEzJQ+W?wzA<F1(evZWv{WI!@O=guI6q6t?TsyW~i-{5HMxJl03m>Re7
z5tYe`KE~LngW36dG84c{a_8&H8C#!$&b*Z`G-9^;9{gj^{aT^iy)wO-uguFBrcvei
z&(^%-1!*WF-QPE<d4WZ&a0WC@XcE=@1v@djX-qw014%It?wvIVU>dTZiAynKjpdrl
zvdlftl|UOuOWd*Ox^SX_H}w8h_ZMS~gJ6^-NzCkJB2NgAZ9yHL*n7~J8DU&na<|BX
zSyw|X+R_tq=BsfAUmcJ;3;O!SSkhmUvdha@bh6RP@WkEg#W2%nT;yB}ABg0#?(GD(
zalQE6-%`;e$}LLkW7PrPSD~`i0Vz2%TVm2!WT5=40$n9GSS-Puxn=M!D08DBjfQe(
z6~oMrFFsA(QRuok-d)O--qQZPDwU>E_fj9lZx^Z{&k?Ks+|f0Row~W_XS}>tPMbpZ
zYF^E;iz2LO{>7)b|0s$NWI9@BT1=O4xXsex6({E9l<4`*sW`#$ZUTYyPI|_8S4~TW
z7aR}khX(y?kT;id*Wj3nv6ZeP!zM(wQWiU}4E}P2Dp5anhPb+!1uet{jOLW~H;ayR
zkWuvEGI-w4qp7S`vSHD@+B1orJ?PGLwiDMQLbjcc*2foCe_c;un5&T_EB0D3*bJK!
z^)24T3Pb%5iK1B8!j*gZbD4Ljxi<-m2(B>j4+dW&<_LuiAFJu`#+uis4?(gW$`&2%
zs6da1lBMJ%I3CPK^RiesVR0XXKDdZSG1ju6EksGWztR|-g^y)Ddz$9?xeqd5PjZ9I
zs;pA`&XY<tS9r;{CPg&;c!~5MbI*RZB*qORZKS2CXAoU>&wnESK|V@-^`VVMllECS
zL^bK5=gwX-hok{WX$FEzzF(#X!o>fODP0=l=7ko(lA2`nc61thcZU^rMAgl}*XpP<
zAmRKxxz$x1R?>3!#RNwDcY`4(qrzrf(9Vgc5?2?*tOqv}QrMw0@X@i}L>JyqR<4c~
zK!og-cTrcplTq=SrCy{>lwn1;Ukw$>jPAH7ij69F!l~w+Gniv%dI*z99{wG82{7>I
zbqTY+VXru45X~yg1X4+HU_IpvHRi30cppb^W&&*g=^EacveH~nPh^0F^j1->&y`xt
zI&Dpd?!HrZ5VSEWZkDM^6(sX5cV34V^Aq1T%5wRmT`2CP_*3PRENxiZ*>_~@eoA^D
zd!+{wO$72}<>sLwp}!7Ds@Q)g#@!&%&Xcy%KpVS%_d?`zgDuOQ?w>yTpbJASw{k}^
zR-J)Ux-=0S@)hs*JOel;t>b2D;bm1OtZ4VGR}Qc<UwsfXhV-x3e$QG>Lykk<-<){J
zeJw4bOcd}Uynn;2gULm*;nJ<;w*)sux3n$C5EH1(Or|u39L{^zkc2j~LO81_((!~l
z-H7b*%sb5QRm5B9dm!@?-Z;`Xr3rLFFs{kcB)p}c*z(+4xm(b_EH}@BqYuZcGV{y2
zj))hu>P(WVnhhBH=~ba3{WmgBU<pvw>a>Xy<7LSwK5g_&8s4={ij(_=W8qbu`!aT&
zoWaW@OtKd<>^vD{W@s)O+W?{A=3hx?O$6LB8%lRy6AS7aeoP#0ZzY<P{=A_rnwJkx
zfTFvuAtP?q`?J7zrH4D^?VH$mH?u{T)TPxGMsYf*@31Fv5w4OAPX*rP<cdT}b=0&D
z9d&_zl-jFx3h3*<LXoqglgX~>(jGMvR0fSxl*)<89hc?K#vR&I{W_TsZ;FePn;m<S
zT#~%if&XvdeC+w3;3lie{~JQ`_p<8#fBCxmxufE-nx?D#|B+KcFevB`OX_CVfQ)(!
zRty#a)uFD|$NAR4EAc;~CSON>lW1Q~=7WZ*_}`VU7k&&Rp#JK`!Em*_Zu8spYB}`%
z*sI&mwnweH8m>A&d3;k<@76(Y)-gd;ud+=h;Tdu+rH@`7>6m=YG1roM9J^lBq@&cH
z>&3w`tnI6*)VQpD5&VD!l+Jx#y|pSPzoJs1qj8icetVm*C}aJltB&f|0^rGEa@Qdn
z>FUTr)0m;P|DU}WQx{C*{l(BC&+_Z{;}4v5S4g`JW6!00wwv5y`UaQgi%HZ>@IYge
z#|gW8ADu)lbTQDTWT8}S=iW(~!%XBpflhy!of3*syteTAaPjqYNYEB~#K=;t_NZf{
zP9&pFsMF$Tt6<jIrsQYC1v1g$b)Jk<SG%>Ll<Uglelh)@^VzXEO%2ml8$Z6GBxbIf
z%VyU8AYDiFeea*7Mw)~-<&Yc|WHcB<FUe|A+=fbP`q|QOsrNo{eoKD!`9~)!D3dU_
zrS<IFS_pZDmwX(2gt|V)do@37;HLTDkk)?GPB>&=Fgrw3$=%2^Ve;Md@Zj;31@hSx
zxJ-%$U6I|xBfV9~=38qg0!b~c4ZK(x-BCbR3c8^!rx$BY$nw1Hs{V*?FYZj!=ZSp1
z9vQSt%$)DAtF_MeJ3c1W^y4|lZys*G%$;&SwM+B8Y+2j!7l)pga^^S}pVHF)JW;p4
z*iF5e@gF^*S#(FbU!7y~$gdnJT$QjMj%e4`Z;6=5Jli?l5^<LCTkL0x9L75_*&v+q
zy<kgU!S>xx=u!`Z2b^!W)GKX$KAyBoJv^;PO|7|&TP?0OG`%+;(iBlXy0yXcY`9lX
z!7<5W+b<thN<^ISdPHjL&jtp)Tw2(J>W5#795Kpa)xV~16cYz)4HcA0#wDE8W;Q&c
zmCHr(FCN-o3B7hBnsR(`E}|~w6(@225u!QsiQ`|ZCnF((;5QEI8BIvJP3CPEUb#EB
z1`)XKNO`}G6M00m$=Q2t9|wOSWChpPR+^ye8o&+OM&~Dg_%3nw+t(X5uSTh-?*}?D
zk%3mR?o_X2uDVS*`L9Y)X_{>O!Bg^HrYvyo;;L|T>0XbN@oVpqTCpuUpv|`sB%~G}
z>Jhr&+U|6QhxK+1RKDbn5$6%6RpawXs=q9@@RnZuJq5iw(`?FKcc4x8UdXcVjAqiZ
zNSWOtqb^OZfjqNF6VINVlMh-OIw@pVs#_~apOEs}VD9(SPqD51Gv!L2_9)N0@${Fg
zeLx<$^UXz<Is8?|xuJ$WyzA~kZ~S$m3G~!9Zg%72-sF!exJ2EGKy#u3`6b+Yw*zlF
zc~**T%TQwq;^~qxIt$%PF!Z?g)MEX3EOr*`JGCSiShuoMpU;XqBF~5ue{ek2RM>nZ
zPe14P%M{}(<yx@SWw6e~S7x@toY(f+bJ;0V)q_{ut-nYq-%|W&^(x0AtCA+MKS%K=
zTBTVb6YO8D2RSizH+Hel0Vc~xGv*-eC9Q&b=u3~}RJGuJFy2tz*0>36@)A+G+2NUS
zYQEV-XP~B#lVWe4PhalmtYo`<%WOESyO<{O?%IlX=k_h{7w@_#<<d+*&H8Yulod3M
zXiGt$+~+6P^Gc-r*c63T#oj!a+Q*bF3_V09B{HkS<eExSBihs<4}@O@EPk%A9;9Ty
z=uHx1nP7WW)gBS+*5z#E@yqn-{+A=`?z-?KbC2a7Cx+1T3R<b4)u|`uE@X17w*Is=
zV+=xsw#H?VCVmV;GzhmBGQJxS<D_Kq=M<3&isL{Qux%#}C*2BXS{(E^RTb&c*}mYJ
zCP(vpZ5kmJF$iyuX3AIYGHip0U5#N5jNeU#V>TTw;Qi2sPoSRF^!fDfaS1KdNjjhC
z7GX?{CjV&~i5?9f)JPTQxfUSG^;}7j5{j}j=DDs}BpMe`k+I!L9~@6;berK9dbqX6
zHSBEE_b`krit1drk>sd0qnNi6a{56mT?V@H$7QzzHMD2Av_lYqupp!{se+onU&NVv
zb0KK;wr1*mNd`*tr6ZY)&wR$}rQ-0cm-u#+)YKEa-pPaHi#u?~X9YI{0`T63p!s|K
zGT>Ctj-T+bVYS4xlo8+eMpEuO!$tmamcc^V*CYLwV7gfG2K>Vwfl=Ev+t?n`Q1FH)
zD{aDcrr)o*Ih~w*)te^Q(Ki*HYrjbqrMG@Hn~9&$rp-8hj26Vwvzt*I9`!<uuh4xM
zl+z?yI@{1NDN{i%mp8Nek>m03I-E-)lF<&F3xZ8oLtFN%OR=we=(I(Au%W4mP?m*{
zi)@Sc#&*8u0S1v#UR#Gfh&;V2{=WGzI@YXuD}UqXDBU+Y`I8e-YOxko30JWVRe@aJ
zq!woA^y5VR21amMA&2X<TlQxH9kdwt`?EZ<lAmT4clc9!TO$aoPPh$quXrJJ6u!CF
z%c<VIE;^~yd`Jo*vqy2(UYYbeiGnf{?aB|bjIM)_?g3}4l0Ft6a`~!0_alSFL=@~z
zU9@_WookW>=tn#@D9p^Lh^AZou@w{#o<Ba)p6+oD>3x>XbUe-6h#O!Xes3^CxQtw+
zy!cg7Ea9vK?HT&!CHK#d5<Z!4vnBOB;dtU6DB6K0vV0Pj$_v6&CC{+0Wk+qwq&Y+3
z??}H|)4$WB@DrV`xp@-K8}8wUdC48t9VNU_=UBDE_=?l$Yr?sKQZP0@eB(8}q;|YI
z)KnO?tQ5|%FJcM<91bHPzdFtdNt$_@g?E!K2{J%<sp<V7wI+1^&x1uvG&(_<7RQ_G
z5n~`e*C{G7&F_1vZ$k3d{f5gv0_*V)l$mZio;lU8)iOEFsOELL`Ny28b)g#}$r4tq
zzN>7_3K^E3S;j-m9T&c%uTikw?l(U3XVVennF|R9wk?4)<QeW1$FI{3x48Oesgnu1
zwV~_5yfRkZ4OOxii?5a_y>kVkejTKvrV7@)DgNY|84|nMqww0LY~cD}l`dLwCW@Ow
zjRCuN_fBLHja%+Q*-b-(v3cIY$4XMjfE5D5FE^@_3sYteV;52+Jg_^1qpPn|9e9P~
zuX<m>2nM+?39=Ru;SYN8Zx-;s@XSJgu88__b1qdRoNH-RxA|gSM2(YOTOpWnS*y<)
za30s%^@MYy<U7g4x$!fcGEb^<@WtmEPQ`z&OgLZAT1PW?MjNQxv2ct%e@sNt@M`uC
z36nhqe8@lZXNQOBy#RGousHk<{660dO#!L-+?)sRXCy%syd6gp`6GJIcM9jKnsSS!
zhni(7_mqUxFjCHCg72b05%7|nH@lAN6#B~-xC9nYPoo;$ROw|YM14jtP}9?-Wp7`)
zybHIz@Td5l?f2EoQ@a0$g1kvNBO_1zImT=v_|#GR$Hscqw*d-B8^%$NNB9?2iKEr8
zugbWFos8<R&^StgYm?!p-@Swx8;k)1h-RVlINm`PA<K^kZzxbBrri&T-j-4Z4tbh=
zhr#i()vD6Eo~+x@qC>FBW|M2L;n<{h9|mzP5LsDQMbPk);v-o-1sd{gF;Q7h--cio
zE2;ibKI2PY0gWjcN{r)7KJl$zn1N3%Ir$BpR&>F6zW14idZYN0V45?dvnj6u>k&qj
zcYMijK9DdC{6#&ZCyDnXKm89ls)IT)0?_;juSAMa)o0CiNa{@Rxwg_Fh81(GV7Q8v
zo1JkRK3SF1wZO_lvK1L;sKr&f6l!BWl}BY~>#~E*eYI}acFwP3Gu_o_RcUHN#%ph8
zRU+(bzgIR9?`c4*Nf%_V<`u=)pM!eT`75tP=C3j&#~9|Rc+=ixK##o}c{?age__ly
z;Lwu$f$_H|oEyp`*izH;3iA{<iZUFp5idk(Tx}&%QDMA8eaX#Np#p>6k{cB4hNrcC
zuW=wpnUH&8qh*;#{vQ>vN~lQC$}AYS-Unhaw?oh_mqbra+qTkbX1c`QFkV@nge?Kb
zCVDP8&AS;&5|j<cTc}UsCh9^y;vk~%OQWEhVr#?*YFS>a-3SN`Sg5D4D`+^%XTy%+
z*fN9xL2SY!eR#l&VDL>E>SSEg8D`ebXY)wP8#TOEn2^a`pvKqs25Q;Nr1K2fE>Sa2
zMWSe_ys@EHTz2&6Yw+Ud#lS4PK?n;X+D{RRJ6Uu>J%`chkFbCD@pQ`&8Ne$PE^r~G
zAjr4=JIwF%Uf274spSC87li?8U@`_8>(;)dd9VnqoMWMG*D-fyv+)~=TDO-u%g1rY
zRJsS)a2x)Mr9leHT1A$V*ZYvN+F6p&mAlktbEms$zeI5(1?qcG4w*ZobpS)?2aj~~
zErjufd{U-Rndu<A%=b`-S$8X%>UjGX0j7GvfRd-N^~N`IA@Ta_w1Ror)|pkAiuucA
zG@<hX=`9S~&nUve;{@sz-lR#P!W<Q36B~Sq#ksG4JP$5*H6pI+@DwOh7<w-?U%2pV
zkLD}7j@a59Ozc?hSd%pV&Aqvxbey6!-+Rvx(Qmbz(9Gy!<blQirD6QsVi3$oa(@J5
zHl3Zm_3hRK3Qv1TwmW`#u3iQr6z9h22`BoMM9IOCl>t!ttB7$ntQlPGB@>CLp1B98
zx85b!!A?(O?`XDQeccSli|CC=Ne*{(2`|S~RFq{3isoDo)qJJ+!M3g~RbxhP4AO$R
zaudhgfED<FNPeI6Wmhm@jUreJF;~j^ngFOs<2NMfabDGwMXGQGOK-ped_Y@xnD6S@
ztJq>UGph)Tc^%<}4uqgbT~xg06Z_vZ$XECmzxAlg((^~1j6B9nVSzcOQ@eH2S3}D@
z*+4`sd;-VIae>(dT2KAP6>0gs=i25F$A2coDEiS%>g;U|74BoXa9a|j!vR@;TOm{1
zEMhjSDnUqVt39Q`(f2;$w1Z^X54{npsovSah2c5&gR7WRI@Y)i@*?gT&8oEy>9>r_
z=Pw$gSLqwCl~Z`|i!8{@h94(<?L-QE@`2$BU{Z}lN7B|J-Z1z{rA@zkl(JK_a%Wf@
zx`AtfwbC6Fx+iIR%;b>ZU;Q4>53Gs;hubRFx00A{)bVZ*k7&Ptv+oUV6LEMg5pJBJ
zc6&l9fICVCJMCr((@R3#&l`GD{#X(R+XS5GWS~XMD58S<Z@dJcY=8&obT+Ev8K*=j
z4k=4jwA`z5netv2<F<8dxf80UbOiV{Mp(;aO9Pjs7&kuM06nCXGH&jqdu2BxB-S)@
zuo`$u=*dJRC0E9gr@!=6W~-1ja6l$+wvS6!!__Xh8E>ILD6qQa68vZTTb4kwNclc1
z7ba-5vbFOu9wI|2z>3lhW@`#;0C<Qf;rFD70V0kh@#aZTn8I4_<>}dcKj;ds4LDhG
zaoS{k8bz<oy-(ch1z!_aNYf4FMawqtk>yYO_K2+^PK$|2?0bzk)AktR{IQv_-cumH
zuc#a5dg6AO#Ywt+aI6%oO<0r-m&)QZquLwy0XM!y?GKH5J<x4~v=Dz6pqq+2v#V%g
zE@lmPwgMMA$aW*VejZg04mwl28lBpXVm}@RR$sA^vfOkTNmpeK>xb)^`|#f}Z(VpJ
zhbC|x*@d~HM-Lyl8r6}bBeKZ>TVaK@v|aV&8+ryk6^wGU9gkw}4$65Uy<H!k7UbG?
zmENx^u{)QZ?a23RLtNRS9FdTBd%k_oDuMa@X4i(Vq*vx__v^U&L=ykIKN2S}?VzJa
zR1ueOn;r^&ir1W~g|{(bN~BJNx3k9~;}Z$i-AWpXbF7mv-iYQ%HQviBx**KfSd?wj
z{KnI%KZHl^5XNr&mEZZ~y8T|}3B=%G)CfRs@W!o)2-5-qk|N3S&eJLUPc^%)f!fr|
zC?}&=6vw{>;NEXUMUk$>J+EIXY!sE5U-)kC=+_5oMvm<50v0ruG7VkcK_(FFIDe<F
z8aTc<&*_x<vL%0?y0Fv9h*-o#JeK}P>wD7&qWC3W1c?1w<U**$(^+UNnXm>WxM5Ly
zm<v{mkrqChqoUkR{^Qf0Z|d6J1wM4f`Hg|rKBl$$WB4cR0CFhw4+2A{S*i`?uaf*S
zlw9mdD&Qace>%2eDgF*&to76!3CC9C53}4l<h}}u?rey<AGcE1r{VdFf53e3hu;ih
z&}-dBDgbcRyd@6gY6q#+0(ilnFgkTNuBd@nE2`cNn`lrNniV&1#HwQU!^+KA#dLFf
zkR#aj=ABFI{_uoBb?~Arbj2G{{_QrC*w41(qp|z>xPt&nEub$=Q1JPCA4wK;#0e%U
zJ%~Td=-xeXa00`8=q6=h0{k4&*KSQPc=~MYg$R$LPiv1t9eDhc&^b9{<&jmahwy&G
z*XsWJmQ`1^nn$&8;uCrL8F`p&o&7J)@WSWaiV!=R*O|+-dRPxwYZgD3M7cKee9b;;
zn<K6vfYX>0!<tiXz*~I^g35KI=7j+BB5q~Jd*UH6I8eBV*y$=AUC5x5Ci)<Tsn21c
zp%Cy4+&X$XSta~?&euZ!m>pOq3J)_ku5KmofD<t+#MoTBp9xf6EC%$k&=gY_IF<9k
zxGI-eLM?s~1QYWwy!Cv3AUF4EdGU{@<9s{6kh%wPdKCMw-Hr}hY#9Tq*Su73f;HY=
z1X1SXN#29Jr|4mtN$7XC|3XpPrcU~;m;0%azs*JEg+lHWP1W&w$Cr6g5gdtqgN?j+
zOwi@k@U&zi5Y@AZi1l7@@frr0I}ZApd3<?Xu{^Jr8$O1%Yw2mlD_}wGb$dnPQA;~6
zT3PjO(;@LV*R+qlLxFVRpRnE^^kU6@wv}P23aa^zjqdXhdq<$u@-Uzd2X3CjtPEh|
zd>7bS8~qW`Iq4Zv{kVJg>=gsqo_1hyv>)BBkIV_BcXybV6&BK12H)-rbP#pnOuwcf
z<q64L_UpPDr0aTk)hm85)l4~MhLwuJB-`X8`Zj?Vz+#u(JLaNI7EPh8-A;$DBiy&K
z-8GRStN72-kI!apcixVu$Km*=+5~pg#S=X4iX}=d5qc$S!ddBW{tWQkI2SBazo9H`
zM962Hhka$5hpAe-sX5KCovV3$wW~nP6#ezEMZ`hCiSzhjHn5nYtn<qX6xT5F{hQ))
z-Okj}KmZ2MGMuW~>b8^u{I=-wyy64V!69hfY36E@qA{1z_l(fc6;RzcUNg4UX{D@2
z$e1a%dMX@n+)p?W%@O1L!Zo<FmI0)6$9o}+ozbdyZ+9}nv6DKh6=&|nvI&bKD}#KP
zCLz8~+MRl<kMLH^&?rXP9)O)ql@YOm3vjd`xSD2BF>`5sMRn{^LDA=qkHQX;UFXj)
z`}9?*ntlbRQfKiCKe`hw5-vfgyOS^eO1`<~A3P#^!K@?Y=>!p`7>Z#R!EkJ&Q}?5f
z&XZ~MyA1Nciquiwc9Q>L=ljNVb){~y<)f=7>%fYby(1g1z{BoBiSy#M-0f-zK!=O+
zWm|zu8XOIH$85zHyyb#$7BhAO8s0Xn7s<HF`!w2V@H8)`Jw~_p?f3Fa>^h>#&l)Js
zEY)NaFg{aIRE#Rl+Z%N&kMDk09M1IGUyxQDFBI|SdO4N3^~4<Ehl`ip4525y>6PSD
z{*=@0kWtQHwX#)TLBVjqn0|HZ^{v(^qjDNVt<4@iPryn1TAh@9-bwYW2En`Np8Qxi
zjN(m8bg59Ew)|BQQyTHe81dRK92us;>KYVUpJtmlpuc;L>E>Zf+Z?#Wo!b-e+Gby3
zTR2jUb$m+-yt2&0S0}5@jYlAilA-#d`$6*o)$3E7U6A<+{OvvJww5O<ExielNQTU-
zCds4fjD2|=c7JS9+XjymjLqVcr;N?gd^Y9$0AKYg@>=PNkD=3ja@_67<9*q)bB<2U
ziu4H;9o`}c9|x5<x0biUzbj)ELZW8>U19c%L`54LRgS;!v?hLaWd;Sim05+FLk)#p
z*-=z!f!;_oIgxyQ{o5*7YbzUg<h>cw2PLV!Un?T7-tD<SVR8Wi<FmBwuO!w(s?Dwl
zW;vcHCfe9p8tPn?Fj=|rx#@F0KXu5+32<a9mg{u*guys)fR$XFRAR;=VDl1!rM0<W
z7w(D(tF-!jyU*KxeI%HC(7qGel2Ht%$}*k*Vf)z>?5t^AsLzJw^T%c~=mpC<{n-mJ
z4;G6f{6KcS3JCUJnXm1w{NB{79IB1{$|kS+a9vH4h0~ATsTG*sRP6lts5k7b`|n?5
zSr%8G+u#uKb)n1&rCWvkfh2$bkDXeDQmaO5{s>`L%%#x%z7|m!f7C{hc>0zab6R!p
z`8yZha3Xu1l42$vd~8Lpi~5Z0U|!*^Vg-{SxR)$!<QP0s5zPBJteb;O6VSvs(}>cs
zD{$$wgztIOQOvmFaHsf_ZftBs>9_YaV>201OhEFNqOT!2xNlWG$v#3$p4@{w-0j8j
zM3#tk<O?kcVZ>&kp3abF+1ZeE_GeH;STwHWhg9xrr9+#^e&^s`yL=pLbG&!;Sb6$S
z+Cq~q8{9=6>3-V?A^!QK;*j)Zu|R8iH86=`*QIu4NC|MdW8QWkwFpAE4U;zi2-!3I
zP0F<91Jx^3G8roMGVjrdCQmrlEpAz$wG?jT`QS=hp}w9*N*FSb%A6*tIKTB$omy95
zBV?Qmv5AQIF!9Wa<xOO~5UI|~p5lV+CuxaCTEHQd?@r6y^BkLy(BS{57_{#4FKh8S
zG|%fR&#)%I=Y)a)Ca9#83Be56wDqcLlPG@k+H#Ouw`wD#Pg%Cg)~jBg!gwU1^=N3*
zizB-!LF$peKrvqB8mZiWSBF+Xn4g`57h(SoEiQmUDuUm>^A?}iqP=C8k^uThf%Ygx
zV~dF+>8b0_)zO-q72<Txc7Q@kTrUJ}1LV_x9nJsqi$A#2R+O^aSm4iin-Ag&9601U
zR8PFs*sMvc4x=gyDL0(1BK+Q8%})AqtC)2p54Q7V&%BXvC!%s~i8-z>U&Qb~BB~7<
z7z&yVZEFmC$DtXuJC$X1zLVA&9y~ZaEr8htr{2M1)Ss(0TdNPX9}lj}+Fj_nMR3JC
zW`B9}>KV9&xTGpK^G23jzj-t=+YY;Wd^O>ANR#bs$b7R6FQz_EtLmX?|M5^xHN7fo
zYT2P?LSJK(u+=-xR-8!6_@2$6CoAdEyTQ~Fw-FK|DPiBbopTRPwHj3`PPNqQX@7T(
zSv$*r?vkGH`#E3VFUa!XY<R2jv8CO&D!JYkU6g**W<{i5pR@OQ?EZ2(rwW*JqBv1P
z#Bi&asjdFUs)%s9yKBSY+>x%e?FE3-Xll;r6#~z2+cvz3zEDeo_xHeq=e_gF={;m<
zcS<T|2Om^dWz=ba8&#9Ne@>s`UWA^s!g!wB#TVun9~Ak@-?z#0XfcA;p4(K9fg5=Y
z=?^ONZoy^WXx*8YbFyaTdox79Uh-IVV)9t;<s2X<*&7*oV%W5RKG5?x5Ayv(tw?;~
zf8fO7rz81$47Zhj1^Im<^rez_X{}b=<X_b>LIqaNM}iS`<`Smfggj+Wp-jCiZCdr~
zxGS()MsTB0%3I`S*91ktLY`5B=FF{Db(L1zT8~xAP+_~9e8wEVzkrgC=OW1S&$bL_
z4YPn<hRL_-*LJkE{k{P|s>FUv%$8N<+Kb2DHgG~$!&a{Mpr`4G#>{6jaf<H?|75p7
zJ`ef#GhkJQNzBXRy($6PoA;3U`p9~SJ<v)7K#jy9M73}{Ow|SoF;$BVzkRpZ#m|^X
zhfo_ZK<%$e`uq^`*XerpL)k#X$!LNh+#Ub>PI7bz=xyYb6QmW6S1;+fxPT$*Gu9_d
zTiOio+K8Nukyb@K+dKWV)VMYh7M*>+S%Ae=d6gbO6uZ(K{SZ>8lo3y%bv<k=3tZzn
z_VH}B|4_3vvfnWsHkQvvAoFV7*x|FK99#J>)zxOpo?qKTj2tug9-P)!(U8+PNoaQ&
zPXC!zgX1}XmkH)XZGk7IN%8q%n9R9*3Dz^;r+{1Pr8hnL$beb{3kH}q{#U-bUz5f4
zKx}+&8GV064Jro^L@D36$ffFQ#^0q2LPdbN4!LLiuzFeh3eZMBMHa^TMjS@7JwS-)
z?WN{KzCi|n=bj~|j~pc1RZ@<j-NyhSDB*0l<1TO(!upwfc9f|h?z8jVucMpK=FreK
zpfD$NYe+BqbG3@-o$w(T(}5^IIRFZ$sE?Bwo6dZdDxv%X9s*AkDHS-_q8Ym<qmrvz
zFwtqZ5IhG<6uWIxY_t*_B|eD1=01SI>sU9>U%5=b-B29qyUDlv2Q~S+@x@00-o?Yg
zp8&tkwI%99R8f4HNv<&7+`sV?PhEuoN73w*#KLr3Qt!&uh$1@w8X+itKG)*^p-J9m
z=WUG&D$n@rLJrVTm_TFK&E926|5iJJFF+*12Mn1zY2Egz<X40Zr@C?PTP(vjCdHSS
zp=KrvJ+%U=B5L9vlx|lAM|a$AQf&Y`!_^3<Z1IG?L&?N>26t_;&Qk(XE6R;eX8Ih$
z)Bf?+=(1r^XQ(L~+?>|nCqO9^IhIA;&H>LTpzM=H-fp-z{mI!$MkA+hsS9vorJk#u
zc6saGsd=FQ$1_9bC`A-|d5Z+HDvCK=Y_lK(PL6mbes$*oxwA>Yj&8O(-$2FVpMM}`
z=?~mUP$lWLGi*STvU!M-K7WMCekn~f3r;L`GP<m3hiw@k(Et~0BGeat6%x_yYXX0%
zIB$@3%O0h0EHSpMGJ5A7Ab8D{&nhn34J32<7I@X|8#fP!Jn~yfA0G86p$d3z5OXaR
z^8zc57RmN0MFW%{%Vd0ng$f}!!PISQzm%8qH^4YGGal)O2k|R3iQ)p0vwr&GFA-rg
zyr&S^Bv(@my3D;mzI(PlE^H&8PepxqB!>(OM@^6c=heNfN<1^EZj#4uYtKsRilRgS
zRBb~d#~zk%O5;Qmv>S^{q?g{sh41=EddYyjQ}-Xn#4djoDT;1`2_V{zfb~2_Fc6L6
z^Hn*{fP(UFSfL`K+?OmXDZUAZCu!(9Q?`I!8NTMrj<q0~4Kk$V<#Fnf<Pxy4(n!<D
zZ_WUIOT88UG9}60(TIW;{N>|?=*jz1dLZL%mbCbd1aF_=yQRCSp4~5;UZ^9M?QRU|
zE8{|<`we+O*t(ZsEFFsQR$lZQVDSOAUhiCT#Sf6VyCAn7c%xm=JwZY^WC!m`pe`9c
zs8rvmGnLTPxEAnl7v;Kao}?iM-W0IwNAvC)^$Wu{B2nN#|DV-iT!`43tngGe@}1d}
z%JdB8)mm2Kbu>ay7^V@}C(YaAPra(cI7l#*9vlnWZq!CW5cWyeL=hk8Y(+UP>5=W+
zp<ptE1V%()=)9(YT<a=rCObV;yHUjTWE9fEW|pp!6!=Lgb=0Y{0U&H!k7vdKe)^4p
z%#=?sfr)qZ;{i8ZrDjT!6(dEQ&665gqxzqzD-Y8rRUrZ=S0cbkTJihH*RB(v4i9tx
zdw$q}T6PUt;{Nd1_W<x=Imt}xjc<3E$i%JlVF=*8OP_5`OMZ?Ym`DH1Z0ixGxQtJA
z14Qw5v%n!Y%Qt{u0xAen2?<Y8_z!t)P&*N*>AAX7SB~qM7yyB7Qla3yDR9_JmE5TF
zA4CDdc*lO9KbF}A(~@tayPxht*B>noG?!PV(vaJ7Sv~N11R!PQE0^F5&+ZAnS3Q7F
zZ?qoM8^gWKBo-&zy|8dH;(svu0Fu)H(DB~}Je0>o1_wN5_I^FL8tS8OtvktjnT11Z
zAsBPxZ1H1vh{+31Qm=aYItkkJV`Y;a4~{4BV1BQwSCU2O;VCo+j#n|<EnO};8@7WG
zJ-|C13xG}AH#pus->BOyuOXzUNAxYrcwI#gS>u;klkZxIaVu>#8=ilBP|LYwr7m3`
z_Ml2-%VUnI^Dh%<mtfy;8t3BH7qkZCrrn)pqI*!Xe1WjJh+@F^1fLw`H~_e0i^nhP
z<(o@A$g$6FYrxjKX-GSllxCv<6i!M&wC&~!;j(RU?E&0@)jTw4dC!~)82h%h-<^nY
z%(Qw(rDnj=$4)J7Ad|;Fy+DngQgiE5MY3j)Ly}t<0Qh$ZRkd;9X)^ekh87a1BD<t*
zQcLtkNd>fy9Tfc%?qg1u*>xOOoDm;qUCmg18$No}3B=Umi?^duCG<S?`ifP$KkO$%
z9*l69H3SjfUtlz=^=@}Q$zXxMq>u@%`h;oPObtdxR_;9~q8|fr%Nk<BNCC^B{y#U=
z5F-kLe4ovq3Zlv6z4)$BRP{>!8LwXJV%n{*U{|bv%O#*A`sk8vtmsJtORs7YV2oL+
zZbLZ!(Ls8rjs=(9{M9y}q|w4xl98)ZRw2X^rHRZAqW!NS+-EXoF~XVTBHkBQJv!gk
zLmX)@Y6G7vlP?O;pqHf04?L&c&M5nZQu6eMN&cztX&&x!c-`fYQtfq_DFhWZDbx#e
zsj1AsVXT3EOKe6~c_h8KdksZop{K_y-(i1BDbWMYDeT7%1ntkCz-csbuW2#@0D)OU
z$5cI9-vD}~PxlEyLong|b?c4iGG9H?UVn*sN~F8%AVEm>au&t?U8SUy)ylKAIsG>g
zjm5}Qfj42=3)ptaW<&=M5h(F*j@nszMSr#3onL-D0VUkP;qyS6vI%$}*QXS9?lb_H
zqD_2K#tD%g4^3iU^1R}wz3V<T^1RHFMOn6oP`SE}&waQfxE@NmJaD{uv3bbCS~C0m
zO*l87|3f+ysX8H6!(04Ny*FMJDc-Yyr!aFTT^GR5X{8`+ee!`Fz{By{S_Q2LPpPkP
zM9t-se3oiwCm%qzz8Ys}xY$?8`v!{8os*0fT)$yexC+Qog-r&WiK+XMwOKV5eog+M
zE5%p&&Y@3o*F%K(X2mQhmcD5;0NW@jbCz#D{BpMJ@rM8dU8P{kdxjbmueznr$Y)8m
zmz0E(Ar)so3g+(|-00cAo$2(dR3xcqbn->GImQ%mj;Nj|V-DC#L5LcUFG+LM1piqE
zJBJ%9+0T&GQyJf&6;>OTE3qNNBzkZ_3+MF?xYi&`Z&)(}@rv)9m}$w=|IYLy4y(t3
zp%9Hq#?7op9D3K?0;MCrQf1q3-2`74qFXtsL(Y6Cj!A0;7L7-PlV42PGgs$o)c4b-
zIEnjX>kr0oEbzxv5XHrG;o@h;rk&Es+<A?uHe9K_UJ^W1k%;0SeE|e!#4FBvA<hNt
z59tg%_Qys@Zrc+Cq~*l}utI}}sOhHDTN#8!J0>n`zy>L@XO$T1ZNGjL7xI1uCl#bY
z)RJO$h^M|7=BdoOgs3(Artm{e2E`;^t6Q06UOZmt+Pa<aijR|;L-Ea<5^6UQSg0jC
z{$q<rgiP!{i7;K@fBo@Se2QFKSbTcZ#2i+Za_u;cR%-D*E5pyS%=KWP#fDv|73pOl
zp*|vn#ic*LR(7kDRo38&%0$@sPnAA#Rpz{UWr>c&Mo2_#`QRmZvticf(mW0oM%m!j
zqoQkDQ1kF{v98x*h~Xe*fTK=a(7mwr<OOgG*`BvwEjjE{cnggmk%SxX!0&Cn;4SP{
zAqtO+>*dd@G;R{+`=|P}G~}hN$~<P|Z8w1(5*?y5<?dAbG~}J_++6cr{<8CL%wQzP
zW|54&c%xrn2;LS*%O_LDG^x=ruU{BU^a7I7f+fKedn*RV1l!@iApR^?bebpg2p{>Y
z@2zb6|JF9ze`|YUy`Jc`EU@+N!i`Pw@VA2;JCI$BkmgML1qS$akj;ALqy)EM)m&Du
z_2@}#;6R8D4A1#R=SNS`A)m+<&w%`TxYx^^_HHwy%$r|Tca=t$@!IF8{B?yJ$3K9j
zDoAPTo%f+wsva6IPq}e9XP9+3y}syw0Mok0{F@}N7moUTpx10MU?+nFOu8=AmD}a@
zjH=An!3Gy%D{-o;2({}KF1hO|$CFd^LjPy8@;96c4NQ2yeZwng5*H9J<{CLl!ZmoF
z)Wj2-c#RVN4={uOt9tam=Z4k6Xp67OH&w4)>&aZaD@OA4PUGWvXkN*-`Ca^Z*<bNL
zKcfFlIKlr40KxLN?v^|FDFFomo#g69?wy=`nzM_ii#|mLgX&VlP}>xp5;Hm0*R+rO
z;+T{QGakGLGLvr^>+e^W&U+sB17*6m*HVfOQ@<5x(O!xDTS)wWq-6O2^Os_V>Z?#0
z4(KsIdp)qEtCWh2V~!|#|9&_+1htxbVxULUWnP&pf)dnKVv%l*nX!o|6n}z6Reipo
zV+~JIUk@8(-kS}uDxq;;9$HBX#d=Nq2fP*2!Q6O~D;BeVHh6*0Y*m^I*-3kIx>2)!
z%?UI_xI79=IP?Mat`afRM#n|uSydVic=s@OSA_ILgD%|z&s{;YYF}@x@I|4Qn?SEq
zamt}F9<~;~(W%||HTYry0`fDFWL0*+`u6l{BWx6qzjP$SEEV^?H$;qOt?^iO`3)J~
zQ#5}QS?(1ddnVK->R5$&tC68ZQ&{cmhI@1FWjYee`nEV+xwGTXY|L0Wj;pT5_A)QK
z7`HL&1?fXE?rgCkXQKx7J=Uj~NXXuKaW4BMu3D}p61|q2=3c;HXD-3rEhv*3hmx7f
zb=J7viLt^P0LuD<ORe64W}l+5S#!d$pQ59PWgpcjd(jWQKkF3h`SKRt<py%a>GERs
zH55B85Ok*m1X04^UB|$LE}c{ARSU$qU$UZ`Z8CCw9L_+FwmxFs+SY9l{f68d&b2aT
z0-}K`nndPUCyy=u7Y3Tk>Rw%s#JL?`GB=Lfaos8wUzFpz#WP6^i{hB(;wd||9wT+U
z3rNfFS2n3i$~O)r9+<3Aa@+x=*I+n^329j!Q>xy1cA{Kq(Fo>Q%gs#-YA|GJmz|JB
zTDm&pmx4Nx*Xf7GwHb?uleE;hE+<dWh8}(C=W4kX=0>!Yyt}faBnl$N#va*_<LQ5R
z^zx_ZnjQAF^tz(zIWQ}#Wr{wT4@d1>ISEax7v#Pq*ldwrzI?zFynnXC68a#?Ei^f^
z=N54Su$I_8(2kV<?3p_PAInc*=)Pwen|14YD|>_}w+IW`$369s3u)=2RT|XjFZkwb
zu0ZU}EV>9Z{%5b-WFQaZkeuW>@BKyqb4mcjY9!*40q7#Q6djT~%1dPl$LP=+xjUB{
zGlOl<CQGe{vv2nCT}k?jQS38WB>F`x(H?D<_GdFhwQUo5?0HzQjTrZ3Nv0mWkt%mZ
zjN7a*XjlMbR6wo=*KbUY12;2tLByn#@gf?iKOerjiEo3ZCLI={>r~T+cEYk2wZ8Gh
z*k3H%kGIe}RA|BjQmnzxf*X#veQI3KPmqocQd2T-CM~*-nn9ABEe=;jPkY0RmA{IJ
zvaQ#ZJ7$jS?QdDaf%Ct9mia~EVWx@~+rXRVyVPJBB~i&=9R$}B9|&N#9^-8*Dt6yL
z8XCqgyyp9R;g@ZCT*O^`yaD^2Kl;M9%1oOJ6S<P&lvJ-Lb@l_`U9&I6rP)!true6a
z@f@Cf;@4WB3o}qX3%dO<P08T>JxdaC?q>Ei@=w<$%vT#RWQVLz&)|5_ZyHc8cBlv*
z+;;-AQGc-CHdCE;Gvg3m$b=6nhj(SPtW3rjV-sQ`3!L9t6$$gKg{e$n$NQKMFuNF?
z50J|tnK#2CS<yR$*chkHtlxMl;XMWOrv5-VL(}i0^&Us7fY2<V*nO^2ndQ-FUdOrC
zMT#U-x8uSXzeh`GzeQZ^R~bNe;lb4ls%DS|P~YNo?jtN&6TFDn45rsHa}1nih6kaL
zzM(H3SS9?BIkf@W=fGCWs$P31U@PZ;wj)v|7-~v1^ifllt_>Cj3+yrkoz)wc`YdBy
zN2(Z5IPnRx%e3+N2!G2B0ARG>AbreC>86-K#>GLL+FeIFYf&NnEX#~1VF#41Ea4qM
zmmD#?UG=fGyY7Ib^w7O$dZ2?NIa_EQEoTS0BDy6Wbz4Z-+01bR6)ciQ1;sE3KcvtG
zW%96_2y}C@sL3LkH4fYRJ_4B+urD0-;MLT>-1o<i70&I({%T9~p2GR-==QT&jZP>j
z?5G07Oi;ISTJ6y-i@~VdBq_?tIREjZiMQ4ekYBg0R2gcKLyW9Sd=%ZEHrgDs@mS)D
zJV}yR>#+gHe$+RF&&S=iz`aHB0Jc&QO-W8f^j4OgR~<ppKgxmhRKED`%VF&a6!S5~
z(b14*s-NDUdVLn&@1(*IelOEKT4jby7I~lbosj~}k#|5<RSZc%*^c{>3&~7VU1Q^h
z<GzRFy$jjB);|l+(F*H>;971(^-+@%1$p{u4)lb`z7!AWdxgZMeBIv!^2Fla44VM~
zbrXmtWgOrj7&dqD#5a#DYw=;|{^Km5`OS(!RDWJV(`S{ViaWtk3dc8d+~CI3pMidC
zTp8FLH0f~%#QB!sxKL5<T?lhenA<zI0JjI&x<FOV$y2mNcgH{5A`-f~aPM6S0V9Fj
zbdWU$Q=l2n14eTP)<(#h4{JEVK5wZ0?MTVXd#Q7)qVFuV?*$p;#E66$*ch4G4tUd8
z-~wA#?z3P$DHUGGRRwA6{$}h=_vb}}H^}zLT#N2;5>xz_vIxYb@d?&hTP~<Xi2Hep
zjGsR9Gaxr-+_RqF!A8vXfj~!Ru%KatICu5AIWR|Aip8<q%SnpZcg^odQ1^VTh@PTn
zqi?vITAMfS1ytocuZx%II0tIh>WK4I=n#!u3bcuQAkQs=-pJ<s(IC)HnEeL8S0~UM
zRH*`COa8S#<05^UcajAlTiuv}D+DFVRsXaxtn&)fM;3jM9gkwezqz}B9*++vCdZ*o
z?8E&HK!kbdgUs0VEepBfiGYHP#fMZ<ei14Oi<NJqI*v;jH|`_d%5p1AGf*&}0N&Q`
zmLNQ4G|5aK#EgIl>8l)Wf^A_YNVgcjf4;$>IN6^2QVMAa&poL#$pN4mTk72YMA;<Y
z1Ei11r6Jn64(pz+2|g`(8ZP{z+SZR&_1TAKy{Lux+_c2~aYqe@wgW@cgF65*Qh)X=
zFuWv}Nk&3v2*N1ic`mTHkj%`E`l9J3Q|^D+Xsa5P0JK{kiI+`a-p*VEfjLFZS4^;H
zqi%YYd-5WpVy>3Sa-PT9L{A#CinqBW_acakZ@2kdE}!`-*u^91!sg-eeISZlZVV#I
zE45>t3jdse9J+nTs%a8EW7eapv^m!R7%Ci8MslJ!KwYmslgPOSYu!CT48{v8I{hFl
zKjPxM&V}lO@My`KJu#{TeOTI%L1miYwqMW;9qNGZSu*t4Dylz$j>pxD`Og=F9KieV
zeEWl-d^Yn?8&BWp@jC;nVMf@iVf4BK&BcJD3EK&Ktt0I$Mc<U51i(o1U!-p;UWjL9
z&M-ntV}$Jn>OWy*>G<6VNKPZ?Sc@ZBAqo}eo}b`LEdxBC)cv5S-SORPe#3!_#_sj!
zt4R|rv>xp2^!0;uw@tV|<F|lDRbp7xJbj<>%=^1;=X+~hW9`v6t?|5VQdzOPu3B`O
zNA*|`H5sP9;6;9pwYRDMnDRITVUVj?;_Zw$1;G0RQ__-6$R;e_UAey&7eX0X_W3)h
zS5>}ZI1z0ND3{w0Xf-ALx&$c1D*sLnYA1JD=?`S$@`O*o->l=v&j^%5Tg1t?ru7aY
z-MJAq1Ho|1*k9v%x8hEGtRPz=c}RQXon;~wqjRFrOvd73<YV$q?E#uG;zhWD;CkKM
zWzYuNQ8_8=knn+xFWJVQ^rv8*f&Q|X{n<;)D6*MaKpAldH!)64j;8kjXW&y?Mvvvp
z#<aD0pt<80dyhu-hI;uVbZ||C^v-rN&B4}fk7ICdL){S=wBtV=Mxsx5$|#xC2-$20
zr(s{C*j9$(8(;>eMrX+(yWYsx`_Gx+c)WM1sYH##JI-xZG}|DIgi?YGKvHTI@9}3!
z=JR4ON}s6^HsYDnf!7*PPEDTx?0}CZg5j47-s?RQf?2#CdZkMwvD^wYi3YxWE=E|)
zmQ0@J$L&B)4}UOV)3YwLUUx@;)UV^|qZ8K2g0oZ&HLqIP6GVTDCZ1d)Fuz~;d}r@7
zqk!7WSG_!(YgrWU$E!sQA7GN0(aBV`bERnDr3>9o>!~Hvog7aeE#p&_-_a)WA6h&X
zPU~_v4~ZkyXDS|(bdY@JtCZ8~)xd9b<>r0bp|>O_{jr)udOA|n+mUKS*~{JBoHgqS
zaR#5ToX==dOg>XA2IhNP1?Sha?5*8cd?#`KM_Br^^Nc*M&yCk%LPCsOUUY^eIg@nj
zQw2N}x!NYwtc^$IZV$_dh+|o*SRpd<;fIl+$_vdg@|7ZSt-vFCE?6doFW;qFkZ-gj
zS0>%D0$$cT3(gPUHTd6GIBRiF8i2MA&In*o=Y-ft*V{I}<TQzKv{<qla*j(KEW=QN
zRK_SW@x<q=v{l8|?tiZMZLs2eQlFwFFA2WW+a|_!JwMI(+j=f5&l`o{;;Yh~iGDBS
z&T5H7zw&<mZ*C>&Cl@ZY^@;3PxZu2l3F|OV@uS<rKVTLdI29IZFYCsYzljR7OQ^v*
z|F(=X%B#I$kYQmjyRl)0eT@Juoc#XxRlr04+rGUg^2Z`L|9Ti1o7?mO0GXhfk4(qP
zvzk@-dmR!2iRh0v{x+%NQ>0x?Uc?N6`-5x?I)To`<OQ#qJO7L$+c^YZ8uUT_ka4oe
z>9(4oF5}t9e}=ef^u%8yhKj`Zx7$~M)=p(F1>-UNGkWpF-*d6Kit_Z8=^Q|#lU3-(
z_}#zS=w;?VeOg5FzbqyN*_YL7@N>Fa$D}ooU^HP47%SGJ=oVH`M6mQb30<Jxpy$zO
z)qP^McHhGzC@41SJ?W$}@FRIvQc9ngq2N`y8f`T3E$bj<c&A_INvr;cvfH1OWb;pw
z_S^++X<O>);4q4({GQujNq5;aw(m}>`ne<JY3IWAGHAQmB1L9sA*^==F7tutWYnbb
zA$kYvKeI|O_FEUPof^QaU&~MLEU8?0xLuNHAF@C6@ShPe^QYw<hYD9j49RBiD{nov
zGp7atk|F>}WLF174nHXKGdrHu4<YO~dPYOxVCw=q5l1{8(a8Fv32kqeiM@-T)&-R5
zRO2_N#}TV(pSaJXwse4A+~u_cB6>Hbe2Q&ui5s?)cGk%HMflWo1f9@;@4;e0PJ`#|
zqGlJ#2gVE;_p{VQ8z}406*Xq7*)riW$`_DO`b=kI?}*B{iYD15`|LArzs{5)MOPpo
z`re!Sn&(6_4M+ld>7lnCBWC>-|3&J@##&dK(k5HoiHBi5JL=clGnzA!|Eg6E^jEFV
zo^nL~6$&HH9t_5BvhP>iC|Piugjk$Fp}wddsnO!K?vNgf_?5aZ>9v!6CFPshn&;2S
z|23`y`O|b)Ey5Z1i>aTe|EMu8_kW;9EWUbvkKHdWwD&VLr0nTME^GyULT}Ed*&GTD
z;9X~6nuUBSDa$zNK81fRRFVYYzLkf3otN&qk71v6e;K~N90C|^aQ;I|6`&3)bxiRC
zqo(Iyc~4=)>5A`mUHyBd(Z5{Gq^h3zSkbn>i&a~u`*DHzD<f}(b}bN}k|vMH<zK51
z(a_cq>YVmD`lD-2p5|GY3huW3FM48s$(rVzJbCqz324*Z%bERY`P!gFh1$L%U1Fn>
zdOFDD>Y2ER#?Iv?(RS3Lx_eg{|42zyKiT$(l5zOHOU|w7{@-=n=X21zW|aal5OJ%v
z^b)ikdsTZq!I_8Dr=i=Xsp%oJDOdx}C>BKYD^{tl9c;c|!v5Vmp4oI|KUc@Tejn5r
zLp}Ss3zSm)_Z4M-xp*SeT~af0zIJ1Bb(s1+{aE7C!~MK{G=IWwR&IK=DBq3Rw#m-L
zKEqKjK_8;)-t#u{2a~#F7eS^u%-aQ_q}gOo>S{w>6d;d|p5UBl`)|#CWmuG3)V76y
zh@dDbt)ev2tstEeB8?&-&Crbkf=EkBcS%SOHAu)PASewp<N!ktF)%dmGZ@En^qlwo
z{(b!5$INv-&)#dVz1Dr-YwbOJuK`_wGO~&P%jQBke-^);sciFT)6jzJm$bVpX5y}W
z2Rm^bTGSr9=^ece0C!ekD}4s@Ufj$;1HI@y$H8Z#Vx(OmJIpA)3TcyAr^K%(+MdXS
zjYkRcPi~q3)rFe9e66?Z@FTcxy+gCU`^LdMvf5`|7}J+1-kJ2G2Ki=CT_zB<Ztg+r
zURbx=C6ie{dShXLPmLg`Jt~{cdFV5>W9O~;Bl8Yzv($1K1=ez3-O4^KyQk$NVk(^r
z)>X@0n(mVasnR>ON)<uY*XaG{co_U_(=7M95^J;L9&!R_P{$#$HZ>mT^g1bMJ&@|1
zNf`Ui=YFK2n#&*>R*sPb?Df#0_0+-9es=xroR9ZWPNMPF8#r&=RkE4SB;O-PbeRTl
zu))$6{08=9F}df+&<!SOE_RhX5%n>u8)RwyF5hE0u+g4O&m*vIY5;e1&v0L`eK0hm
zOOdMbb)JZ}j&U+!uQuPI4~&r`+XS)@CIctLh%R?Q_HR@lU1S2^k_mMdUV-=kgCW(i
zx>emvgNyE*qTdbtJZPO^WBnsgoYwZ)+kA^XTnxXCq`62fxwiy~N%XGGSC<^7krZGr
z?-cP`*r%&nG8x6O)U5@vJo%9eL=Gl9;(FZ2EXpf;C`z0L^_tkqzK9SuFa`eDZraQX
z;3d#Rq|GN2EyW}8Nkoad<dz@D5#<cbt^0IuC6`0U8>WM?H)e?~Y>>|Nm7?y(&~@j;
zJl*3Rpb`tTkHDM<$M-aZsJzF^ZF5o$kCp~khF!d`dXc%Kg2ijnTLS8s4E6+s2A2Oe
zCGnlPNk^U|7sGmDYDw|KiFC9;@a8fX`b#o07b>am-Sl;ZEY%eGOPq)7F3GvH-WASW
z42#FvTkcK?|0~7=E7b=Y_Bv|mRi#g~sa!7h$bMu()POeF-T6%GvpAdNG3Hnl{26v^
z*2)NAZGhhRv>h^oPHMa9p!uV4f%zq#=+4=@{uH+*9ac1}YpL5%ix<QvI}13QZAgLr
z^F3+&GtnN$EUXa=YYuOSra!K1y<if|)}W_3!tO!c_hstvIrC4)22OjW`{`OWTJ9d{
zaI>2kxO-7nDU1tl52=qf2Xid9bqf2qhkYO8_Z;zu+$niW0jRW+PLZrQd<w-#BXzmT
z#9&t)G_xO7l(jX@F7w^5K2A0%tXG`$K330!A%LUTx#PR(b@NR~cd>HzSF25#%7Xmu
z-;D8}F9JFkz|^-<#!X*ZRX2s@M7P>8y&~GdD_r)Uy4PHM4xx@9F&O>$0owBZ*v}~Q
zN$oxq2U$@9kR{uS8rUPuK?eHx%l|ZC0p?VD|KQkrFHA^qK~gk~)<aM<!1!?^(d1BJ
z*{r)w!hDVk0c#n!Znd^9LuB+O5XioT3jkDe+e!u~rgXIqZxjvn(ZipgN4h|8$mCp7
z%iZdf`ig!8h~MB7HeAUaLjr7c{G=9(u%D`l%PqHsCOJc3?zyAA@0ujTg)@;?s}#17
z5A~h)TINj&kp^zT9{c^YV10~W>t}(D!uc*oKHKV`+RD)H$x2R*@kf;5{;+9v@}7Fo
z`Rm_u|FQ3lY@}tv@y!eq4pWmSn}_)g#A-%uJaTm(1tODNHmz+AK34@nIu@HPQH~01
z)vpu1ifek6C{tVyqHe12W3kNt?LFU8=_QGm%8b9`!2d!ffAYFNL5s|I;2ML&m}pzm
z*L_TcRD5W0xsyO1-bY<S?3?D>i+&Yi_!mebU2y#$xD-1cwb&uTSNfS&yPuHy^UkAt
z6E_<!t}!T#4L|Cw(*y#27U`uJnQstB0`IH&y|by;cp1LqF+O#@ZqhqEF&|aid*t7i
zEIW@Mt=>!?&5kuRv5FphS=_J(;%`wbzc)Y1{M~gJE4BVg9)I)=6YOC`GKI1$E2Aon
zvtrxpC7MuI8C+<J+hTbN4+^}xE^nX{)VyJ^Gaa|`0^>ue?~(QcYCbVvGl^BH=BS#7
zE!79Yknd2^^PzJU!NpEg)-_(hhz3TC)T@VjI;0LtQsxko!JLxT_5BxCq6vz_dgc?$
zyTr_PA3yN0di?Ug*E^f_3P^8kXP6e=Zdr&hO?6P*6$_s?OB!aYoxZ&?ntH?~3K_eh
zTO)`};rHr>YV;~5rYH!7b>w|juX#bL+s;<A^eUf(Bt<5kGN7zs>kbwmAMb{q&hoAl
zh*eJxf^&(}dmLB_YL5||NN)H`yRh68=UTKEXZ=tQO-VMJ%hfoDWLr+=6)oKkYNz&^
z`@7t>M>nm@2gUcC9Dxa-s{YGivsov>ih>r7LS<K9@z_2wa2?gRc`ikY0tpxZcn57g
zS1q#d>oHXsJ~}}!3D-^aNu=?Zc;NzPwk$@9ZGSLS17u`9FJ$IC*I2BWefzCZCyYp>
z_bl8a#Z8pC_s$ikN8jdyjY?**&eu^+7_m21)#B3P>C4^V=~~Cwu3Qi3%nZtBzN=w9
z7QK&wY^!&#?WOv_MAz2?zg9Nby#$GE39QOix7AWfK`jBNnAwtPjEj}TUQX1<^yj^$
z7Hg-+Ifu7#PD(bMn%ZYQRAn@#T6Fq?=*oG1tL@AApxxQ<MyFn7em~vEZ?3>ga}DE#
z7{wY>LHLcvK=1+~^*|09VK&4+mO~AAYd*hE{H!slO}Vv;vRM!3p<qB{_Yzgt;tgHY
zbWVahrAw=vSuWI0&osq43=JH47a_s|N1Nbfm(b!+yAaa^6F!%(n=WxluZYylD4@-=
z9qRX?CgUyP4#!#EMvz?s?|$m<Z1Q`OyTly8efy|6xQ4<krN0QscPsBRRg|eOT_p(<
zKNukDy>FLvskm-^$Y~TNzN0@o<!NdIK%Ja9_ffiqwr|n;`qfL#rAz1GtGA@l0_;GT
z^{$=$kh-yoX>YVORby|Mhwx;5p*7|+-9H4n|DYfWF8eHx!-mwXRQ!V}kzJxx!`(Ro
z{z+UzDH92S&ghQr-HVTbHZQ33&N=|q1Sj&M;PGxql(x%ts1zRk!k^jaw+t%X50c*Z
zeHsGAA{5INcUJunS|JAVO#e4nz-q4-@*U-?01Fu=uvX~!;QHlqmPdKy>CgE^x08e?
zlRDILpP_tKM!S0tIqlo;&!5`$3m4@jq(&nr!0!fb)sIa_5wTrn2X{&9W|a$gOWW0r
zZYdAy?5LJ=b2QK=-;(g>9N7SL;BjrENwh5(fQRlqGhl!<d;NjxA4BVW;1G9{{7v$@
zXIur44;Sicu+6j7`aV;eDr!stt0gEeba3A)W5`l^+}PVi@WghK{h=n{!LE(x8Mh8M
z8oKKvDO(=Vh?RG|5p_|!{L{B0RZBZ;+~^2T>RQyqz?D6&MhJ0u9+ji^1w_L($^fXE
z<z@P1qRA%IiO-SNd$YRp!%ksL!Js!<aD5W}PSBt1V8z!GCA-0?K4lgdKk0_dZB!Z5
z*g>Ur+kihfe|2HJ8n;8L!*j3~&x<Y>veyG>saRywppyGe{vtddx{UDN+~NLO$|?#v
zy1o*?q8!rmRY@y(iKY%S^$SmuLI-jf+RIRvSw<VE{EYT$kz}funp<{Hhe%u{N(vn%
z5?~k|i$#!)+lA{s2{P$HEK#;S_U<V+xbx{_H4&Ha<dVYZ7Gf)?c)4;@xh{8RH5K#<
zIL6%Bf@HDr;uQMTEI{<vpW5#Ny_DMY6c0E|R#CJw(|0!R<}%e2XnTwSuFjN6bvLMG
zZ_6nUEj<w90M2{x!*Y<^+<b0cp95c&PQDk}V4K?#+MeUl{}C%{<vQ3J-cgBKWC^du
zi)I>YkC<VK;O9EnDYG+Uv>w8jYhbg#Q1Qii+HuI_fW89`KXQp%2D<PEqKsuJO+hB4
zFW%i5`yAC(c%CwvJ%LFvc$2fU0F<<;+N&gPTsxbeg7Dpl*c7{BDjDBYI)9AWH!ySl
z;TguA=kA9gx@%jhU1mSbAHAWUCg&FL7r%50inIeA7KPWx2b7F%PO}`Z+X-OZ*mkFv
z_uvuqexp)mv@51>+}tsZKAytu#@C_ONxy3dmh+?d)pgfN6UUXfK5z=MC*S=W*1A+~
z#V?Nbj1KE<61sF4-qd~m>?6(n;Em?<xG+42{e4Rt6-i>ju0q^Lep%eT&O>Wv54<?I
z?Yu$Wz_I4-M{`g78TLae*#&NGk!N%UvsGGA_5_?p^n&zA^_~Q!RPqL<+L?C8j=ubi
zE6dmyVoBm`+UW6XDs>9JAa2K04e;2S#>0h}J^{#4l|!b;1AD#4KZPY}CBbHD=l9Ah
zJsV-sh#rbxrJNL-f!#zK{mQdB)W1qPbf#LS@!pg!qbAVYRF@Iw)71l%9a1}mc6y{b
zMk|iiA^jNy-7L^-zOOZv(}Tdy-NCol6Y~40`udofV-am{(bTA<*65*K^q2)!H~Jyr
ze35iXN$o1zpGm}@M<QwKm_zjs*KmoX+$1N#s%CDl5o%GuQPS6l9<E=<`47n{Ep*V3
zH<`W$(6-b|8HZLUC40YI3-j6AK=zKd)e9)sN__Y^|CiHX$Mcct*N$P*Z_sP}4jH{H
zYrC0DY3W<G-b|d?#119y%{fP)DlFz!zc6%n&nhm3<u88-mJL&Y5>v=)kAN@-Z^uHP
z^Nm?RYjykNLxcY$&QZ+?A^ZH`=w7Ae!=1qDf*zmsuPx@_q0d@zW?E&}{_tEK_ZOP*
zp-sLOw?%=+lk1lmrO``U*OadQQF}YBzu{x1?DBpLCO!W&R5Q-GgQHwFdgT}Y{W;!d
z&@g5dQP6{hfAf1HVIn(8=p!qsy#eHSE!B$0h|+eG){IEwc5oeH4PVpzWp!@(w%|Sl
zId&kTWUQ+V2}V`S%$@x{iWFA1in-UKwrMiu-MrPhE(o^KgYJeaGcmhNzbG>i-;^0t
zwL$H5p#Yo4Lo|AOuVf5h{(yVlH_TC)xd_|`060znfOY}a(!9roP5FHwx6NMLo+`wL
z=e>&j4On>#9!z^E;ziU}zris*0DHp1-_O%qOHkavDC)8t%IAXE60Shj*;%-LqO(<o
zewwV4kjYJK9k&_Divjr1CF}a`N8?Jcb8Frt24Zr%i>U`U+Br7$|2Nc(4>jqvc3$Dq
zD5+mgM{}n??hJE<&ZHSD%WX97ZHU)~R>8%bRQEgak)h;*Jq=qehBY0wUSB@*bXbW(
zUDs%BA0UqsK<10Kbj(2VRqbk67%1Q|?Kn$B@ftE0S^aT*r}YeGnkuyxNTXt2cdxm*
zGMz&vd+jK#d?50U&H37e4(0bo!%|Sqw;*eWW<3?i@jbcS9CzISn~Olo`%fC$45Hw5
zFG%^4CfL^5yTRue<^lcCcyN`grJ1Uu#fhsWmpl?e<wsnGMcDd2D0TC(4{_RK$(~~|
zU!Cgi_hlY)6eb0r0;xdj6<N$b{TWj&x~{em-^B;jv)K}af;UQPM3oXP7R-CzP>k`Y
z9Ie`B7(Ok{E*R7C={?<@<2H`P@>p_c&>fqudk+vp|AWD)#g<8YkN|E*tz;3i#%Aak
z-a&=XKGm^dtQay%3PUZ>_-x|NcM@izw>*9spui>AO08D7<SU2Lxz`<Fu8l6#*xr;%
zepIuIzDqZWD6;b)83&#YP5!#^Mw?86@Qwb?3I*9_CpaB|`$%W<M%!H<iS87)J(>Fy
zwNk*mzl(7H!lQsDWrC^}?PyHncDe9MXK-&Nho|@S3{lVh%?2XDHtVuY<*zPP@*xYb
z3g=hu6~|)z6TpvBc1V|oDHAJ|02Q<_*BYc3aQ=-*vNnQy(!i6S+is8A+p}PFOO!{+
z1Ast1#AT0Ka%GQH^61JvT*Nmm97k;~J3SeZ1@!)~e?zsv5p#QVJ~DgKNNaRWwszoA
z;3!6+rBg!YIWd>1?nA^xuiex5H~51himV{(^N+9IbZ$zp383h8qTn+WmJ%H~ma5C5
zuP~FIgPKf7t8aOFFU6p8eaN~(UG(brb;zm)#Q<2^LwI#o_vzO;;X2K}#Q!!{5SeO<
z^ViDzwPea?8ByLBtH>5fhqTWJnk*CYE3`B>BMQA9IvgnV*wLptqquFe#(B9Uq2xQ_
zQ?2Fjmyzl@ULjw-4r3Mwkj(>TKsCDjW}h3mkn}d$_-E-0o~Fwr7YGg}^VGXSA?3Q7
z=D2`%`t)DmCgGEduUq*b<YQ4PViiz<>Yb5%`8Oj{8Y!$my_0T$r}W{`YbulzmY?aG
ze$Z5B`ao;u!bOTM-_V;o0UQn}ciN&HW%w(u{a!A*emlBW3cvyNx^AIPy%^|_@$yel
zdMV6*{0Y5+O8r(T%rs8)ZO=Z#e#`?y;pF=;#CyifxQG<cCnu~tUm_oc=ovT!ieV%#
z{$-U%Z<s!^(h>e+W#NAjYkrC-VUJfQ?gH0`^K;8BL`jEJ?xbd&0PdkYaCr^#2AT+5
z-T$BEj(;l*{Yiegr2vR&ZneoEur=3|*G&?{OeAZx&rFn|^!!gP?EfHWm<O3WkcG|q
zJFvf%7sm>ZfM`H6V|#Cj^@EG}#(>$^QeekCIu8GTD;E55>B{^OWvf#Pxu{v!h0nBG
zS-|m}W63ov9M`}#mnk%ka18yvT$7v6DtE7SF*S?hde&8>z8CCTb&HnpJ^gQ0o-!VQ
zQA)w6GxfKGK4Qh#`|gZWCh56JKAO>EMZaLHKU)PLktxp4bxl5R!q?x1EQ9U<cQaIc
zt{NTw%-PFogD7%#1QM+arL^=e0qUqvNp#t9#lM_GX~V#?MbxJKex^FqB-Dvt9_9t8
zH7+h9V)V7nfRgv_TL3agYpCbW)T49u3VO^j$rusN_VJf<r`1EqFJ`;E%`?N+IR9OS
zQd<Bgd8c~-{BRTyTQ&vg(h>GuSNL<4gALzG;0>{sf1YOtZ-t?`yUua*byg_yey$L;
zHSjqoAJ1AYRxK^dBlxbd+v&yXx;gP5gm7zpeyt6iY2Y}VX0KT@z*d23VYCMZDncB2
z^~;fQ1?!`B<GsgOaH|NQwYjO|JM*YaOP9oN_1J$O>uVRi?W24$gI<!`z@0>iHsuZp
zM2!8G`7;j+&fOZZhk|U@KLzF-?eYk+_h+zMF&@j9&7hivt-w)<V73{)5&OLq^cSF+
zKiM{2s#lE5`=|w;<j}q7Jml4g9%AJ1nFvtfVsNt97O`oR?5!v9UX9W9S|)@8noiGf
zU0maliQ}>j$^Vv}&ry?AEa)uO&1f04P`jc(iH3FPP+2X+>N!hQk559i<^jocNf7mt
zq+1^5ijoXpmWx8Du`M)L+-CtQOV_WG4csUVGoSnNNc4j29C)d=n|{79L>!mtOdzI<
zseSSpG}TE?$k+kUNOI=|xU*O^3U&ShJ0&o5w|M=G_u4#S1t>4kP<QkgLf%Y34wv$8
zHt7B+Zo2O-mKeyv0)(o6;cD7tVZlTc#+OHh+3S`)@^p))zxX4QorXhKikS@Mj(m<b
zZ0jyE%3uKvJ+(C^UBY&MuG(Jj)P1t5$b|QbeCZNE&;_#m5iL*`v#F&pl{ZrW9DO+c
z$^-rK_Ml|**kO1iQ)=&qXlL<xN}ij^A2!VG^$M`@?mKv>Rf;pTb99#!cRZp7NT<-^
z^l5k}H*lZ`@MGcQUW3mhUDMmi{7!zjME1Q(!kQKlOOOuJ6`!MR=V<=F==N*ia<z1$
zBouWV)L_Tk!tU|M^&@Wf8g?gD&U;ws(xaNza6wGyvjPn%3Q<vi^dYtPOInMHfO=QQ
zARD*F(fytea!c>AYWp1ci}#X1^&}}wQ;^-k$LzTy0!R3?3*w*~?b%&RW&37~$(1%j
z1-pVHtrd|#)$%I9J*{;tf^gsu5{qVXQY@nPpw+hbpheSQji1F-p-r$CT#;LOQOpDN
z9K8Xx9bL>xwXtw%3)+)$+i3kR$7i!zi`%5~WI=Pvy<cqNA99zo)aTg`3?mg={2$`p
z6(YdM`CyPfisHsg#f`&!E7KdrisH2P@Oc~Hw@Do0#you-Ly!G6AKxytT7FkSZTb78
z(r^tu`3WhEr9S{SK3%Y>hK*?=mp!&YD~HH2`J-^-4$})uifxK;U{A#7kjUSFW^2V%
z0jm^t<Mwu~94*8-E)h`Gd+e@BZ`sx^Oy3NTxf*Bjv#d>3RQ*bLYZU}!?xltzUE}`j
zUpfuca$kM*Jfz8gbii^;m{VbOc)&8Y5aBzN_k5!PI5tMdZIe|lNlvf<(inQN^od{0
zi81%;413+oa~{+M1V+isdPsliD?V*@IVZLN;sKC}3?OzeTLa|=Y9*isA9Z2GOt^YM
zULofQ%4uLT|KgZ17yNL2;3z3&M-6qAHivxo&l2kQu7NY)Hva1e4OwqrpEc8qmz$Kz
zd_>6+1~(4}uF&O8Z5tTG7O{mAs{m0e8%7u9-7FZFeIc|PIxwue%riEMo^Y5q<MVoG
zmdWN$y5v&MY<>Ovn1T<H7+Y+M#<ETQdKP-4j>JaAb_vyB?-9)(4Os5J*>UTMu7=CF
z=Y!g@8EkJz*vvmwlv76>^H55%e5VX0MJk{q20iZ2@URtxg#z~o*E+<g=srnzgQwJ4
zk8SL{?M_i_`S>MH+>=2G>z&^#5TqBEc^v1aSK0}}gTng%**0;euw^Q!{ckC{M7Ls0
zhh>+_Eym9DaK9t2SA<bU@oSXWlaCWTN?{6>%$w@KosaH9d(o%WbF)8=FtYrpvHw-H
z`dgMBB%BguWgV%<cCMIe5C?C9RDtG{Jf2ijy@en9qp~$26ML;QlBVId(1UwFWVFBZ
zhrbVz%5fKe58ESiW21pC(Ym5m_v!PROh*oE9{5p6XBR`hbevs0&_cZoyK?16?fhTb
zMX2U>8yg|WSqg65+RS*-0IbG`K6|vTJ}3nx23W!eXs4iMR-@0Jd$dnlaP7CACf0+!
zZd9##4o$KUdzCl;sNLpGpA>9s)KeP2vqXnNw+3r?f}XBM&^IcM$UG5H__fUkcp<FB
zP)V}Yb1tqQpwi0jBDNJrN7XJgb`P$YQKjl9yvkVD#(zaDC7zKtv?m=(88aj$ZkfTb
z^-}seC6~q0i@CrCI>Xkncb=pgm+!t9G7k6EZw;WNivQ%+l{B2}HSV!8R&M8vs@dE3
zX+%OBxXo5NMUmQG<G1QK6l>^<Ij&d43=Y;#(3=-r)_|!s)1HjM^e?U?a%c?Z;valE
z8Ih2tn>8Hi-hNI^Gw=Ny3&Y7b9|M6-F)2wL&&x4Vs}Ve!=L)P-Bz){oWZifO1)F)i
zy2;f4!I7l!P*pR1K7d%_T-dqoo6N!A-yrhhaQcL-5@#p4u}e(-0rBnBriC%B$k>Q0
zK)cuo?v>U18l+8<JsP+2s@%>Uzu!nPi<ZHQ)*Lab>EEffzcBVx`z{`RmY3>DS}Bu@
zBG;5+d@T}lzocG0k&ka0pHswv;VWTy0`cu{KrwjHvjELz6-3MvnR%?ffm!)#s@5Bq
zccV_zy};FVnN7%P#d*svlBKdf^#I|yrJltmMNRJ?;e6-!xtUIsU$<_8=G>A?xwe3P
z=IP&`$$L7L?Gshu%E8lwOL4Y@S{PuMqxXLwCdIauobKI{ixRR|(oc!KYG-hjO<>09
z=Tk`+7d)ZHy*m85%GZF=Kf)p%0`U$r*EyG)*QsOo+(rOA!k{nYa)C$_S<J3Kbtr4?
zYo;ixd*|DeObc%P!^)rEn<<Q(a}On!D8T!JVV`%$R$M_-()(*%{PSGQS>|{p@bVcn
z-pkzAK0_p@o}D%_dkf8kd&lCe7;uK`i={uYnpEDi@QgA%n%6n<cPl@u&#7E^)_2~s
zDyt}M5Sad}KdjRKb^6Q_|FR4?4bph@n*P7L)!!%Vbj@qK?V##WROQmVkB~-xOwUFX
z^uu7_h{eX{@<m*t(r!^MW&QDs-)9!vOG0E|?E*9J*rp*8_v8@XD8lXMo$<%|>pvfy
z?AAV6)otG~B1LNO(R*_eQD9&(<)zhnaxJoD>s@XLIpty&;=;Sd^OQz)PKy?~qI?f3
z^PMKWWw9pAG4DC8HO#Ir1Ve<6b&uWNvZ~u8r5d`j$7a@;EU2xYc;oLTzTNopFnEO6
z{D~#+rKTsBJzX^P`yB2ITPG?l?F2hkt5=Ub>{n14w|w_ObdObDuX()&O8ZoXt?Ffi
zaphrwQW49RD!#3p$=V3vfz~_zqMqut10v{QSIS8K7^G~Yq_5Jvz)|l?!QI29Naqjj
zrzGr;oo`3#l7{xH4EuiBopPCxD=Y+fpYXA+G)orC!d^6x{I<jjy0uK*nBw-uX6^ie
z@ZM5dF+oh#)a}Iy`mK;+{iPNvLN#j({n9{rSA_yzgG8@wgEje~x$7PyR9Y*RK0=dp
zPD%;P)LJhrBE&ti^h?X0MI?Rq8MD(uzL3K?P@Ns-#S+-^u;*cb1G5Lw@B@9S_EeCz
z?Xo5n5wfIV_Sv>b+j~u|GQ)aIo`w2i^FajG39aRbMxBC29R`b**BjzWVN&>tv8Q)`
zcT7W>Fa4P+VW(mAbyg;raZ9_xe(%x@eWT~a&I4%tHQqp?(S~7LB9Fc7wd+L4aG^c&
zz4ysSmPpUTiiKQ8JdC6(Ghu&~n%Bl!*7govP^6RjaTGKqzxBgb*lp)@#=5?Va_wA@
zicbA%VpEsk^2hCj#w5?D<+T`th7vB2+s2M?pP9495AVQdVKIb=mugeIZiRTc=Uet-
zJ@B|?Ru**sq+!wGrkQQElatdLb5LQ+xz6srt*WCGQKkO7ih31x;>RJbm&Yi^d+n9h
z!zyU8?l9pXRbH_Y2{!5-x8#11!ttyuCJ8{<da%`HCGUOE&<y0h0%pRDgGvkh;=xZR
zz427C(?@dV+k;ig^y{2U<3e|h_hrOOM|$$r8yYYJjRn7riU4_yUBCg%9QdxMti?I;
z1FZE4wLcvtQh&^6Sxj^AKF+uB5!=ffmt=)o1bL<ZXve_0H}vc<E{d|K_!bk^=Z|Ok
zD)_gJ{d4yJjQH5LYP?(QzlX1%6a7byI{rKh{CmtgZezYo_FI@d{ybtT<F`0{+!3v&
z_gidKC9ajyp_dX5F8LUU6S`93Vr)&KAic|>JtQ9+-ltC*n+Awt_eQwXeC}(pvD);c
zcB{3GR@cuxfp07@x9j>E^rE@@mw7z{ue3iERHuROiG5mhM~pn#Bm|NU)o(MsiML3_
zzoq8R3%2ntul#y&*H&7G3A{2Zt92HId{M$c82RF41H-5_TwpAeN7U^g&HNms;6*uO
z!wmwGl~BjGtV!Xl)^IOv$p;m81WOt&>G3z__gPlot7UK%=jaR`j6iD;?2&toOL&bq
zn#-M)*}df!>Q8AsU{!F#KY}@@xY#D_>(OLp-mCr^b*3Znrn0`l!Oy(cN08aOyY6VM
zd!C#-1lp0@V)1D1WdfO9lq}IbcNR5$wXp|51g5c(d7>odF=<zgC5F+m!8kL^N|7XS
z7HgER{e2Q{lvHX?sqvJ&=zQgv#L9a5!8<+$18xw#|K$t3>Pml4NUFrl_%}+EO=e^*
zWrTWoKI0iLcC$xHlk=C42;RaNDSUNq>ql9@t1gD0zaiSF-0pD$WisVU=kQjR>E1{O
zm`(jsX9n#R^LWtRDw(rUehfu%^<pF7-Bov@cF0Sf+iu}TR=qh3<euS9QWU4YGBQxx
zz(vtU9g31!QhvA3l;0$%{%W+?;~g0mT5(_XLfH-W)1h@tfeAg_yRKD%QoDYj?i1bY
zXHN1#I*!N(#Y$SUFO^u^>THdix~rdU5EZ0gU_fOv``L16b|BV(;8qNH!oc_~Y%THv
z*UFAtJa8A#xzOMC#5sxUP1D+uInukgd!TJN0(dC~;b{9T%h1HT-poA=a)~sNDIsWO
zHSy2jBEdteyE>e1ceCPrF^{IoRI_9lyvWT%K&jz(&fP6Wsv%A_HcBNJ!Q;KsE}xCU
zdj|=skXoln-SqjisHSI<(DvfQ4m9lo-N!c3Rk{=VV_+llU9BvyD4}b1+bfUm)LN*`
zqHq^Dw`3SYL(y_sZ=-}Fu7mCphm^uE&pRX9d?&{IJht9|#*YHB4!hF4eGm|oD?*Iw
z;BlebHTG?T^g+9ByRa$3%!Ys%64iD~FyvB619?3W37IP)29h+$&K={iY}qSRLN5pW
zj$p=RcfwDz`&*Q4s^{K-jXGeChzGCiP_+CumP5*NqSdc1{?S^#L^m$7GT7nIn3+d{
zJGpEgkU0i9>ShJyn9ov`7voiR$&RI(>8;NBDRl0S(ayI`HxS*(B_A9#Zo?@U;-<Ap
z#V3i2V(>CbpiP5xf$vb3BNC~|rU~&DE<PKNtlw&yZi+OO8EG|qqSv^+&-0F9Lw~(C
zyA|iTpf7LdoW~bbb!03j8*UUh7q1yaB<dAHAPb71qI(Y|&xs@6T?6*QELZHilG7ms
z8j(+?94>0?^`CerljXy-DSOUt{GA$t!hDtubPLJr$uZA+O_xN;t%N=(MXTgR5JCMa
z<f}mocfeqTfAl(K+c}-}k)S3I{Q3YhJRNzxiA1-}k-D7vL5U|*zGtXE8G<DOms7TD
za1{0M=wC?xhOls&aHBFq31lT74N%a5-00m1iI?bJ%Dn3(#w-8W%``|CQE$+fYOF_$
z>&fW`F<&5he)H7rGuhsI%JE>j)mBzFysC)_hX=15i$QnIWib=HzH)`+bP>(zh=^{b
z9iw$S#3_q2MQqpEp;Y`N<HS!pWCd_56U_zVTMNYujs|#+Ld-%FXK?XWODW$O;!kSt
zw9fD#UN;2Xfl*?1VAlp+i67dS!bPP;#~A6_4%Q7$C@^gnDe1Ml5sr6w;JKDe4)dL9
zW|k2wncS7H4@iS~uqo<I@aoY{HB@DQQh~Huv^C<D<7Chh%dt&hV0jDld~nr`@Ok~<
z{+41$4X;5R52RM`HfW6wboUb255!^s9QXV;Q>DLN7M9g}C)MG1hsav&wtE$=uImX?
zG*>z6BK=a43Jl`sa0XAn%ZTg-`@^QYX{5ee+gClqxujE<@$Pa4vE2`s*{w78RzB54
zYTDNe{s=CzNz<6R1Z+UpQYRU2+o!g@_MG!1!4%Tfn5Ju|JPUbZ7~ss#HK20Tx&vKf
zGA_nF2V^Y${#Df+E&e%Q8#q6{@7IAF>@)QNsD8>4`n1?h)$0a(#_M8O6HufD8u)41
zc~PbhP85ByTpf#_XgFgac?7(2-Glq6{wxP#^L!@E@U<Q-0|}Yu*#`-n();zs*^7cL
ztuKhx7=KHdX>=Nw6d6En0<Pi9!s6sEMuJbcdMosisgZ}5`%d;vQbv#}p1Ot?AA|d7
z>b77{-gKoIH9`!Rh5g~J2o`p{*ZU#-zH*(c9G37`FIB=$TySDM(Y8fN1R1wz3#HNx
z?ow!vd`L&$5&}j(>AkGTKrB7Mv}VR$OTj<;*{?P<>NY%l^Yx%O(Zb0~_;3$Z^C{n$
z7ptAb@4*LN(DQ;-FDA!#v(9sG&Cwnlw1ZKvtc__>6{lmsPc}z%rENFf_F1mxV|3!d
zMjvLHrz@czge>gl29jkb07p7}`pFV`V@8%WYfw6&y&=HbX=9<1_zf6tqE}WYp&HQ7
z;Y!Sj%|n#Gp@qx-^1gC!z|ZPJ30C$=Y0n_K<8!p1c2YK4C-`!oF~GCugq@eiiHeV*
zNr*O9xPnJQcE&v3P3FXOQ!KsJm1d-^3B0pJoc1K~d*i|rP16z6yj^ZaIh5tyC(Kw)
z@3}Yg;CRVOzavD_wk5l)9_$)*;~If+<4v{z_i(><x{e8T@&Ui<EX;^ifOev3sj89o
z;q~`?oQ1{P&#o9tyz@|8hxD|5z|Ehez`JITcVmwV<Mz<${l@7z+d`1_7p7B&XWq&#
ziMU?Kvkh*=G1?M)JJ^;8{Q8rfS{R!qouZx&q7oLeNLDdm$syz3t+B!z-J*4f!~r(l
zj!_|bFoy71iOxEu2$zT()`j-1(u#89K!JOpE}Z``*C(|g1w({styz4|vkOhesaf72
zMAb(lf=d_V7{9bWxU`gyD;6J}U<@yKMU>IpigOn&+2(blt5=bieNKlZ9~Uo!cp^L)
z?Wh)hg4)07ni%akzA+l+${7{5EQ2Sz#^GynCTVEh?}WAAw9pb#MifHyx|E{LI&Q7$
z+WYf+dWfp7iT8XD2<^sN{Z3FHz|G)qF6)t;9elWXXLY+CG=8Otu1a;1=9xSRnOy+L
z_^x1EaA{IR-=iGrliVvUm2IL-A{=KRRM!WsY~g}Y_-&&;dVZx<no~W%27JE-$?LOx
zH08S&f?8|{=(Z&&nbbCV!HT976Mj3WOt`?ZM%hN?PKekvAczbicx?-pfeVXJi7=rN
z4WF=^?&%-Ur8~I`IlqV}+dP@MllD`1Cnl`zpmtDSaig*K6tN@y(|e-60%ziBtoSX6
zn-`vqk0B_G5{~ro-n{^^aL1dl^V(R=H(3eowk}oSrqXt$ahU5hPV%03-jBL8Sx)Rq
zV+yz3Z3dq-EpXwpxjHoEx9$`rMCAL>^K3E2&4u@-ZT*jgNvq6x$q%Y|SE=oe_IYVU
zNQP5!5s{xxt{|QVHEW)F7Xk|0J#aaRmvmrp;!cD|1n8|S7)h98{`Nuk^m3(jXT2LC
zrjv;#MZO*=EG_ZY+i<SX_*b-9ZY!Cs4+k1-baQ51Dp`I6KagY~ac4X8X~{mrN)oYh
zvi>>O-d?DgZ)t|@vm&#b-3wF_c;@^X@3j<m_eW)x;yaW}IaC_7`!zH05geuAaECX{
z9N8$bH<vKHmb+&(aYGyDPwh=b@fWOT5PXv>*mB8o;6}6!n03>H!|=6*TApJC#vu#G
z!N1e?So}DJeSYm~;bfIUT$ghq8^4BA%+oz<zEWYr!))OBZ)`=$aXZwR!Ze+}BmkVa
zdw1hvLH%5_ODvX(vgMw9i{3~b&5L9VC|K-?t~e$v^t`_3wk_mNYw3TYacCW@f<VR~
zn^XD)kn3C^bM5V}WA)2?*Ln5{tj2*!cqiRbLq8kKEq}6?k_SA?rKMHH#a$50f)}XA
z4vxZT4pwBj$?19TFWvH3nU@(}ED;m+?rzPtFA+VMa{^7Rm%LHQtCh#<Z$J5zVbR<d
zKj-p98=jlqq6<sLORp7JZ5mT~rS!Fwfvh?UP~FiT{zND&S<HRoIXERqVaA|tDuZPC
zTnDC%iQ46v_0SVG;qAf1kz*Fe@^=<zaaZ7G<Mw0GC4lo?Z9JM@4zwBsgn{v5AtilX
z<svz~6whs)y2UWvA>OlF;d`5#ih9hL5qjJe3#@1&b9Do@O-8fSI5^>2H%^V{QXkC0
zDw01L8AOM;%8Y*1Yd$}5YWyCKHDp-GIdfCXw%)Ye-!kE#t)RwGQ8TqSp%n-J+@aS(
zB-iJKH((%YiBy|0>5ms$b?aAGy)79q{4TisB%c;{br0?JX~E1phH~&dGX-9j)PpAD
z1qYodaA9MYg>!I#({dYC;Uk~jvRf37r*7LC)U8iNE%_2StIr%u0DmphT=T-Ysw<;j
z8!tY$gC2U%XS=t-BA!sqC5AH%rvzi(MbUr@O3%OL*;}beuC0ppobvUXA#Z6OB`YIK
z609#hwzshT5Y1L?#}T}6sUqq^sn;G_?VpCoSd6|XxvZwJ6+-l}qgMQCWyx}>tjpe}
zO!!+NytkaVE7ivJYU?B1)x%OAtXAFPh`F|1-L{mXs%LJW#+kPU9l|OJ*uc-c5H&Ca
z=6DgrJgh9Wma3Bu9NSvfJm4)RaoWu(?o*sj)(Kdp^(XIUDLWMP(AkIOaT;?hrfD5{
zTkf;G=4i0<FYvRe-wN3#7Jp!6XFH(l(Qhi(g;iq$2}kuS{QxXFZRY2M%V$GjJf&7M
zxT-*Cha%N>P2Z+43MRiW7bh?M57{H~#p$h~oEO%HWnGKOS}UdHIV!3%2C{eT9@6^Y
zN)L=WbmY1AOUFveDu2NcrHTzd-0BYuhnnDl#a$1Mmg7(=8q2TPzv--1SM@n`Pn<Z(
zEqCv(#&uC~H@$t<F>mww0!ulpNc5KyFyBmiBk57(47K*JGWXBb4VTxdiAf4${#CvG
zH&x*OUunZvO-qZv1$-hrLnpf{S-8QBISm3iq*l-HGLYY8Jf7_Le?4EfiNf*DhFeX+
Rbxr_(a#G6oiY1JE{|6ePQ%e8<

literal 0
HcmV?d00001

diff --git a/docs/source/assets/design/v1/prefix_caching/free.png b/docs/source/assets/design/v1/prefix_caching/free.png
new file mode 100644
index 0000000000000000000000000000000000000000..cbc2f22222e0443072e70f7d8ac1ee8c5daf3590
GIT binary patch
literal 17933
zcmdtKbyQSs8#k&C0xG2<U4jZCF*Fi_2qPd2NK5z7-BKzF$Pf||LyUxUBV8ih!q7Ez
zNO#P@oIU9KocH_VeCw=r&N}h^GYf{j?|sL0U-vJr&8JsNvLrWYZ(g}_g+yLXM)k^-
ze_+7R71s%X&yvV?$dxPLA$b`obx(uMWc;zIw(5$du}|z9Zv))3jRON2+)aWKl3J|d
zY3QKPWr3bwJWn<p;T2YDA;=h|2_zDEw6o@au>G{QY@i|iJb(I})2hDTSHJ(zm&58?
z<jnX$dcuI>-5bmwZau&LKYW~b#e|s1|9+;H)@RJ*-UzmN332xtfU8F98)Re-nZ;2R
zut(l{9vuD&yIULaAjd8;$TrhDz^D-n6JXaLS8o$Y)u{m*b(MSd<E1JG3)#5hFhP?a
zjM7TArZgwi>Up^<@X*!IU*CSo(3D{;O~@sAe@@9YnBi)!MuYrdy>{pM^#Nw7s~qwL
z!Qy888g$ptTsj%@Yh9UqnKy<9AAY!%CjI^&%_yDQULDX}q3cJw-{qnQzd+0t$#cIc
zTxXUEx`v<spqypzbFaGHt&K{{NY-y0iCyum>I!i{w@Gi@d@XMcUSv+pnYcIRFu($N
zsU`G<7m^*z4a@?=ox8<RLm?f~LTcqw#i%qeJf_h0u_^o&mp(8{q7NP!J<Or_pA0xw
z9=p3#yUP(VMZX%iEx~|2)?&IgsSG-c>qiAk`Yxp!dC`FXj=3V&|NJy@H&J<pvGH(-
zVHf3ps8rmb5t(@*H|ika)wDEj=zpMBT*z>!Ra578@>_a0j!A-!(*b<`B@=tT8MGS&
znxayEb3l1aSG=yP@ox{iAJ~7-;Dwa+iz;A#K&bFh?0G5m#}g+T<YYHub<g`9B!VlM
z8lCETm+7vpcV+8{i#{B4;HW{vT^6*VjFb@umKUB4Udth}J-msvGw!vTGH_AIsC<xe
zXb^D<yMamM#|2~QpceF2MFFg6x{%^f7U<r=#Qoi0%mEAC{i58jM?=o;LmJN)TRp!m
z#kw}GgJrpHkueDgG#sDp2W;x7{8v{UC>yc$d33SZ)BVxp)cL@@eo0@h69by^<`c;L
z`Oz5n>y#lT$;7)3Pu=l6`uJeuGGWuR3D6Lb*#7I|gozwe(kWyDsWsi^kv14EDo+J<
zYm*9?uS7H;eJ@VMlv~Ss`CNR~mn^PLR-`E*m%s(*UELsxex7FHZ#*z^k7-r|r}2ji
zjhKAevVCQdDH>Zpa`(yne*?)jVW@{Qt+Rh>bEN)gtl-c*z9SLzluP1#3~xc!p_bD!
z_u{O*s7+YU)!L^iN45pgc>V#sB@3Ea8QDFya-kn#JB~2$*$F%eT_6o95<UKK;vwmo
zV?3?JBxxvl(Oju_2fI>)a_L5S&L4S}e~AqnHMAK%xu65to`h3D-nsM$+0@O6=82_0
zDvO7-n0B_l5F8pi|6RXb>l!J$airaPEG9p?e{9Hv@WM2abhoL``;Rr$68ssg+q^Nl
zO8iBE1x?SqPF?{ulkbm|!~Vv2jJkj8t3MkB7+;)r-N`|^e!=x~s#8T&UgUmg@Ti{K
z$CI!1?xBt4tJ$9Qf?A)jo3Kv%a|;J?%6_rqBzqd+jnkE6F#~94>pU`i22~(=9*;NW
z5#2s!8sxN_A{Op}MK8})FDFfJ`cJj3x0UtrO&vy?o-3o?ObyeAq0H(}KKr_dQ_+=w
z<qyUlt{*4A%lH%{z)+3dYO1~$x?);sxO`!O<%4amrzV2jy#g{JwcL4fYW|r-NNGq_
zgj4PB2-g9&&&y|$n4P7;c(9*8jI-_;rWM^^g`zEQ#-w1+V3qIf%5wkR)#g#cFuJ1S
zP0#J*%3Rn)<yAa_Dq4xg&569CRrs583wu6l0qfhRb;D58q^9(QsQ$WV&-slt%jEW%
zp^jaIs5)ZDpQ-2;1ult+bN?;*pw_(nVpm^=-t<Jf;@G^Ud6!tIB`Bfw=;Ipe7HZsg
z(nz0!1#%u)i1I&E6}N6WooeV%kieGbDIvj|VryRWeH>FOrSV|vIboC~LnT5gfS^Vq
zyDs0X(gyr*g5-+M*Pb#@EGcw{R)3?6@ZaeQINZEAh?egYPjUO%MB)H3y~P`AD~)9{
z_#n51fX?MZoob!vst-HQ4gCk$xqSr^20rv+#T@Dr#7H4CoxYu%NAkre_Yb`7b;zm7
zHM)U}9Ld2}X`=F(x_sV#J(o7w%|_W`2*+;${Y5K-@4JX@6H2y?s&C#UOF6Yg&ic>g
z?sHweFwSH6HxE}RzO)IvSX?61E07-a9Jtowo>y2E%70USz$qouvX<#X$$B^ER>L`^
zTUpgsrB*mqlEaJ<Hkj4EVs6v6haIFxO#dXnrs7EYZO3KbG{O9!<@Zz{L<K{F1%S*F
zRkId_)GN?tTU&pifANj(Y;wkLNn1&;Y`S7{>TtJw(n5pH>iN*fezB~ia$_j%YsQ-q
zl4jdR=>OisF;L3eke{55`?+P)7BlgSP}9ov8=rM#FAy$A+S9$F)r_WE!mxJ6)iY|=
zQKW5hzIF@d;V5=9&S5n1n->gu9GNb3V?hqlwyz{H#TBi+5pC31u=*{?-c_*ufN;~*
zk`^>LBwu4+z2R!n71f<BXo^@=sP)x!i{iF!+zcR<Ug4B@lj>QNTtQsXpQ|+Zfm%0m
zd;eFQUc*x6dM&LiaOr<ds5O*wAk%cj6_of(9?`#^UlL_gcm(?XOBhDFUuXN%0$#qV
zwi={%dh!iTcgAVpav^VDMIm{Y4AMPh#ERy0HeK?TAZce5Xi9ix9rA=!I&$YZ(fVot
zxQy19=Cp3TBC!_!(wosLO1p>qEbvaHlJ(a?Aj8X#g0mQw2A2rySPge}nqI$mpVS<(
zs>$#AMSQ-k#@g-@5#L;&x*2!Ak<2#TObH4a-~8|0rD_Ph%`a<r$G7+dvoOjOMrB}>
z!5=(yWv_yNq2jb50JHR>x0`I}bupt?SqJR406#htIx;XPaVjJLjgX2xm4J-UqZ+5X
zbd!ieliKU?A*h$uHlC9cOiV~Q@(0aziklA!6IwS!4?E6(Lt8`qzPY3G_t3@NWg$V7
zj0T=x$F1@p$?Uq87hd{UaIGCKf;G)~V5<q<g-&D^#Hr2OadDlitR_K?*8jb~Fs}Z+
z1j|cd%UT@kzR3H+tL@uWQ-`f-{;^CezhM1!_%HVbA#HV1o4qV){>%&aX50=%`o#G-
z$yf!d$>2R>*r%h(RK&f{_xJBt8vGtHb&PIrzvs*?06v>&ho*@rA<xap%RY~--K!U+
zDNYsu0z#&ThA;~Aqbf1^K;r<&Vh_QF{iTY>$I6m#TF)=447S~Q8q&1>Jt4j2M>t;=
z!2XYcP(|E>va#y_31IP-7r5^+sLp;kNh0!pJKp(!1>ms(Iip!irQmB^hP!~}uOBEG
z20r9hsm3tXBmImgMXAyR*d}g}4juvViStQus^R1kIOO0ncWs>|361#4F#ulwN5K1k
z_1BJ`_>jKrbBfD$(%vCCLTvm$UOvR~8C2!+Spwky2Y)cviEjlzp74IGQ`F#A(nGF+
z+~=xg`KZq^(ZapK7AQ09nh(|b{=<X<lD*myQjur5+&0X`|1~Y_v1oqUQx15elUsXe
zNbZVwt^NQ@hRtzEc(!GC2-I4S3xM^(O7WCXr?~8z{svcN-*pph;oQuXD?sy-ng>bx
zW6DN4^6d}Z!35CkRd~onQ5rU9^;pHt(7(_U+2UTnqXmvm|Lu2Co`wy}MN3Fjs}K_B
zuGVMQV3&GoG{>^iupznV)0GJnLhHMX2_24?xc>Mq07Dg6x{@2e==yPBL`fEwi>j|k
zn=3b`E;iIWI1YI+5?lWBzREEDmL6I>Kg~P+x6@g=`{BlFB4Lnw?|hWXaEpocN&(ko
zahiY8DtY5&0En9Ez*_@0L+<cBjc<*3eCDQJV66CDO#m(4!HJ-yF2ZUJ=tHZO7wn*3
zdHEUxl4*T5lo6kDQSJ9ws~akkpk9}KlN*3K^@gx&e18K5X&oen6e{I2X$^3wlv^Fb
zvaO9MA^FFlg{vdkI*Dx+MOxn>&H=2RWpa{@u=L-@{<;I_Dq}GTP|CQFTtWNB+*LA_
z!>R@$?#^E!O8KkEY<L5c?6A!idh}ZY*f8<<`H%<V<=NdWJ1R|Ta*g9oCtdTo5K6x8
zY!axy-imv|!(4*}beeUvI~z1|e;IAsySHxLEt|N63rov@v|bkIXFLD98W7rYi16@;
z7ypj^CRQ!onuOLD$z#O`zmHW%CQvY_4k!!=vlfHuv@;B6Kxx)?!!#=8jZE#@i-{1t
zGC2|&NJ@Om{$tUaG>^337naaNqwA>fhMa=*%zoiDiC2?taB%2+o=UkSBgpQ1vxbTr
z;(WkX+yhJ|I}Am+0gGJqG*{x*j;cZy)ahNw#T^YBP2VJ;h+N*etJ1iwa*kVWNmy=q
zpk_R8gfI*N)(E&JkY#f1LM(SAmn5Hy5H8mkPTPA$Fv=hHlU6pA8vXhgr}BL7d>ln@
z<m(1i_06(4pWE$lur(oXOegJ=!3#5W-*L0niPCMEVbh%o`=~^lZe9iXGC8gXD+f#}
zuWe$#9Hv{J@fP6$z5B7MY7=7B`e<vqs^(!uL~fwh!o-SFFN<RsFJdFerq6J%C%-cc
z$C(Etkb|5i8JQWuRJ`IdoT@hQi#rp0-*tGg%b?!0A<tcZ9m$&V^xqPmS2Jj4I8)5D
z)ICem8egE|hcmSGSabchOD9$y5c@aRxY!^NA-N#<7_&Z4OGadzYMu7tgpoT>HL)&|
zbO@uOc2=7`ku-JVKCl{GD?fGHu-XHRog{oecZ3#$ILNz21z|(1evBIZhlANV(dTmN
z(`@C)<aje&gbu~WLQB3tE!#R3-dmZ7K#=n3pDw;tY)z^RQ}0UlM!Jg`F1Uh+=p88A
zCn;2h$BO$RK=A@F-Qkf<eRR?_wGL5Kiz3P5&`jSUG;-qE1q)nV*E}eDg1Gy1QX($-
zxxaJ|jbKVaU2nKm^A$Dk9?z&0V75wrsH181BKnc&_`l%?KwU+pJJx}(`xMwL1O`^K
z&saGpjp;Yu6g8Y-yvpLCJ=_tw>&tn*N^meXb2p{DU!Gd}v)jQgp26zLPU3J5ux+@;
zDOJifZ<UGZI;ZD2K3BI}KEHRy+IvnSSe#K-WMiG`z=ldyUd{MW#m(v8n6|R6YY{ME
z5PDeV4p!gndlr3a>VBD)b+YoI16i=jZ(Ijl2W5V*yxZLCdW=&8BQj?icA!rFwJf<u
zPcfB`>S^8#XP?#c+aRJj88k*uX(2P8muoAu?@D{9s87cU!y2@`Cnr`WZ4g;)A64AY
zY-tW>eicN$t4S8{cSq&#TDUkkWQ-d%hH|tqvg%=xAN|q=;`~vCxT8U4J!Y7bFNU;x
zQxP;YF_594mhI2{k(Q%}`g}bq>5j%R@4#hH+o+@`&@pI`1QjuS$k2YkFmz$;>B;=?
zlS4yh;4L4~jqb;FIlayXC&doZ&C=!Oev-XoY`OOBN1bu+Cd}2<3sS$AyYp)vI02KT
z)IdDMoh&5HPb~}*8rM+|R5WR<V9Lk!X5wo;^fpgOer;`9Z(8p7&mUt4_6!=mB))hw
z3yuL3Z{RF6qrVznN3|q^iGKyC9Dal+r{&&QT#8*h36w9peo%&K%(m8%!vkgTym)~2
z7ARqWbQ;PGQ;Cy?=B|nH<}&UNXIrZxd*<s@+^#d7*$E{|(xktvsnf2z53c*Tk=oBZ
zX@WWXiD*Ag`(2(jep)_Jy67m=JrrB61{>|+OwrTr(_Qo}kx~!i1<mXXk|1aruH$vm
zO%({t5dl%2b_4m7(}XiqX6^ix^FYLG^7{n8^)_|Ut!Oaqz^KX@rX^k}G!+$oVN0AJ
zc4M6gt|xNwAzPF(4J&bep($@m7`oz-r5$*CpMcnSsxLIxO1acX-Is&brP;kMz9}9y
z^XnDIOVR9)iqC6ar_fjO-ViL#-&SZJsVpQFk=frI>R}O#v3M~|6@;la_Ow%^ipUKJ
ztu&?N?aSyVec~slzF8|GSazM-)f(<g`$^0}1|1i5nL(I=ZYvSJ0MCW8Snb4at!D2Q
zOlp1=A}hbS51L&5CE%Iv={n@q_|v`V!Jd$pCQn#;dIQ>TaKomS$hC`sV$+i9=&JOi
zu<(WiI^E0MDN`8_9*|6;(MUW*6ZLE-s+g*IWbxb+6%4N+%f&79&1cY(%CE3z{v!<o
zOfkkE&dATCxE-i&0+YxhNgCWh6Y!aelLv-5?0%)w_zagot`Fex2*UlDzb4pcJKzAo
zb>=rA4ze239mSxL2P}c9+NBNUxNkqKvQv+4cRv#NWS~4+c^CK1t*4(HLX`6K%{D~;
zG}U~PX_?upKno^#Liy(crLxS``a6@ji=PKGwh4pz&*}UVDhi6cVlMZp4B1>_TKz^{
z%jM;1LGnUitX9U_@MT9nGzo)ImRgrLUzbw*PB8bU)tp~<eGBv-{kcZg;TbMQF7N-p
z=OgVkl={g|UKir^<2V8m_*$h3DKUf4#`>eB6IG|&){I*O#MC?HF;T2JuSpoi7mros
zX@q>1C10>x#fBhfnWeF}+?dPJ296S{CAp_9k1iJT8WkR@amjhzqSt+_+0sJ5F8lsK
zt`$}__UudWy?WIW7FHoS{pXI{s=F-QH<jd<0;2JnO1_zZ6>UE|Kah)1uM38esDHXX
z$U(tjViGX)@Y)Is1!z8PHb!0IiOBz2jdI~zG^xN^JR~kk(b#xoU&M-`<6KQcdC%T%
z=0YXR!TaQN*1OmB<da340%yG*W`eBI`nBINc~Kk8*JjNKZ4gDKM0%}V3F$#9#y#VU
zlAgl$Hy(RQZ*)?j3_PUc*d$Ko4c~7IArFtmZx_`p*Vr@J6f`>dj4l*in><Jh{zRf0
zer?SvQ*dPbo&@&y<P<h;#??VOW%juCqAjsK4(05>Q_op*R46;`r}Sif-bf!5@^>Ar
zCf;N{oxiUnF?sIak1P^gj8lPeGZsCCMxTVjieQo#hbY{G9}4?{49vsI7x1YV863*?
zQ+7EJKDTX=u>VAX)tjAD54{GjZ7P3fM7v$3g0!EG{Wpz;P4`JLzq0}+e3TvQULR?#
za;Pk*LZ<E{s=Ecv<n$bKQFK9XI|^grP6!A90+^>^?cH`Q_E>VS#k12~E7-wnM>`AV
zP$f3w6ydud6#xw_syci(TWd9e{(4`Z<+`i02%1jF$FtZM)Je!j_}(<@)0ln9X5SXp
zV&=5x8(I3Z{HhYuyDKRk4W7*<x&$Z)Qq94Kw*Se3;x7IDWoO|?ZHvU|R?Jd_AC2v#
z&HTi~y(PKNDNx$_vC8>v1J80ac8bFx+ydbuQ!ie%v5cPgC3`BAVzu-6WDE(q-Y^;G
z>G=Q@87m!rZeQF$)mmQs?(uD@H^}dOfr+i;WH?!F@;<1_rDnGwOV6)*?<i=#S?S6B
z6!Vesd3`p<yFS7YABP`rulCW0j++rxHh$4E<QOXi+Tr2-P5ItM9(kPdrZ;(^RuY?z
zRy4<#_!%d;qURLteG+dMF0#O8fPQe|2j_|3i1t<a)QYX$axp3f*48Fc<Vgp-b$CPv
zG!{tL(s4=^Dhb~!+TvHfqTFy1U?lcIyT!3j067~-dp)65&wrz-1PRDd2*3Rxl1cHS
z-$8v>Oc+mlqIJsP%PvKAt(yx&M1{@>-bA0}OykN;K_>=U=@<^FH}Fn)tJjM@CR>-b
zhokM%fuu0QbF&b%V$=CAqUeCls0p9M<fI$-xcFciPF%KMU&?%*5ddg|u8Ye)UVjUJ
z505nF7i_zxzaMh(iJ!pTaCM{GAv8|Mz4c^Qo~gFKyQs8gawQRQ&OKH!+T!+iO4Q)m
zE#40*PglDe+idcmvd(x;2_2d**2YO*XcS9!Jv?gnJHgC=up*2S`#oE!!}gP~`3W;p
z(k>H9L(J@oeY*hvPJ+>9>G9Pj_v>X(+}9cOT@He)SHcX@2c!1YJtl>cN3&yAz~#4v
zKQJ6)!qEu0taa>$(CUDeL@*&K?cIoJYeEaY(Md~AHT7y%`&zrAPc}9IG53K9VTWli
zraGPUl(kUlxzwT+CDOXzP4~elZw?lpZ=tIiFK0qAsS7s~=De_A)@Rl<rwHi!M=EX3
zF;;ds%Q#bcluzXs=*F6co-SPmH}jJ?+=Z)yblXm3c@5rokK^|XYXrwgBOU4p=iUXp
z%m>H4eta1+ef0D(t(+{x|47aI>(y;WuP*hn2*;Rf=?xD*D%OMYjxu>WZU~qQ!o;Z^
z)n>6C5?E0`_dSayA%g~?f%TaE!s``l;Ne196<jLegx=;V87keq&6d1$k;S9rT`KD;
z^_AVbjH@!jp$2Q^!C*$2H?Q;OFtF;frfjzr?K@;x@VB1;)nUtw5;Lksi$RL-w$)JU
z^Kas*-H4j1GuZRA>#)_8Q|{^}y9wyq#}8}uRZniuG|qY#xThu<w3c^V4q?7;oLiyY
z-GTBH0cTAT_f0mVbU7~}Y*$hPl<zDP1KZ;60!Gg*yW>|<lDFZ}vD(HI$EOVzDB=nr
zH0|tg$B!~UFiux$$=@+90ZW<(V*6age6X<p?(dpS1@goB`=wqP;jFxEpO)c&RA3N)
z%Ua0gV&aagsLXum?iL$c^(Or3R~_GzR>P_0$0%)&^pe9Z*Lv<k+2z@<oY39KIFyIj
z(Yf52>L4eJHPee8!3b~F810_Tc1C(Jh;}c_d0=p!zJ2+MUr@F0)Ws@ni{F>-v~IE`
zAi&NWq8HS5Yw0DBae#R93R(G9nYZQn<aONLCH(eG#4Qk~5$&lWGHrY0SNFV5{F5)A
z?$zTWGsiaGqXB=Oyuaqb$;FWCh`Df$FQfC6Ra1=t`Djy1w(j-5hqy;OjP6G%#1`J5
zZVnSdb!~bC^tRRd!Q2xKC`KQ+IKHAR>)63}6|Lw-n)R|YrDc9feGN=C`f{o%%Yt{)
zUCw;_>u&p?tbbkv)%yOD4eM_xn)rYhejPW{|49Ns%x6gQIn<$<x#6FdnY+kAf%7a}
zM4SSTC1$gu-){ngB%;f{b2~#AqTMt9adc)*Cq8rJAk?zei0S*wU9ZOG2oJ=t>+3Vj
zAQdi1VA?GkD&ZS-qZdub$nWhQq;BYpkJ5r6Ve>V>HZFxmCgMYar+7$3&+ck<Ac<!U
z?WJT>e8OuZ$9$P>C~t$re$lg+#ZN{&ec}!u>|uexS`dA!`E1Zm9dKLsA{f6GB;&&G
zpwutkop0YroZM`27C(kt31GX?-xx$Hj&R<Q-fK>Xo0;tj2&){MXcZWL`0-E{A$rJa
zB`7T%Vq=V_!Senflf9QybzeH=UZ3t(acwuKQ>WEt%O~T4fn#KMM7%&H^t3bJvEblE
zFWP`|bc7&B0k<Yr(8w+2#fN)jZILAgG>yo{@9(RazJ;xz&nx{mFeGTjkXVvhJ5a9s
z(^7mwyV~w+7Rx7yhL52UaYIe*NxuU{<@~ZhWu60v9<5m^{^rF=yiq(%?q2^MeoU!M
z%g{lx24a(bPxVpX?cKob19Kpe5zoBO9FO#Sd7x4LUx0!2rWq5z0cWw7$vks=3t8iV
zL9<6Gpi#ia+P_pjrOsam-=_Mza&^C!b?;E;wk%T-oM-i1_9B?t8de%`5u&?U6;!)6
zY4=72K#6~b*^(f>fO5{9|GAtM-B-=&x6bBod1Am{=Gkrv!biz!+dRzjxE<Hi6uI~1
z68Xf2Q1F&Wbw%ELibIn^uO70<6{+Xra82B;Bp41#zE@K{on3t9Xq)r;Hu>d3-3QR5
z$>mDCYz>JYyXSVJsTzk*A3b^a|C2_6XqIN+X}QsyIp!8>KD1WM7FyC8uj}qb$=h(p
zv}Rmof+n}ApLfD;uc`ALa3oDQZp7|(t{5<KTGaI_S&SLw7M72op1hvP!=W&dI~i;c
zVowDx$L(W7vXR>gP-Jnp?(I`+?0u!{vB$Z8;do-^l9oX)Ni{Uip?2MXAtKA`3_Mzs
z>awwCfLnv#cW|f!e{_lwhcla;p`X+BOq85z3n_SCq_<(tU$i`Qaij)?*&&x-o3)(V
zY`+nBJrhFBkvAQA9A{wvFiuF4^edGxy<+%u1Kr{!CqT+b_1n_Bi!8h{9@^Z~Y#Fn`
zJlXv7G~9|?nY^H^I7AAvmg7JnTaueoaUBKjc;@OQ6R(Ha*!|gUR$E>xu>ShJSR)aB
zD35ob{!B<V^=t0{NWfD4<G%zWDig_F*VcOgjpHB3LwlafHr^O1XUSxM&_$GJwF>sq
zCu#P~!<aBrkja)SbtB1T;Z0TAh$%ocA(Oy{qf;1?zQDV`>p(te>sgfx>c36UZvj<8
zBjN3dOrC9<2&oUzXrs55_HRc~`yX^22tCYo9uvGZXCNP)6ga5#adf^1WE^1s6p{zH
zxFl1*5)o-+#mk;?+>W-_caCLppfp6!X>l9sIWHXlzQ2EenZ$3k)g!=Ex#-{JD~Jc*
zv|sjS+p3f}&}**9Pp`N^9*Nm}Q+9HoE9s1?1yCfd4(XPkvnpn)R+1$AX?@6l<_9=i
z7b}&=rKZ>wBrx-!uyyODV#G3;SJ}y6P@`kJ5x29X5&xUjkDY~XJ&fLO{#|P|4(Kj`
zwyH7CGa8jbF}!{Z(SffB9&U>O>fzRoeiw7&RsjMtb9!4$W9fji&}w<sOc16{)-Qyn
zx4&80TRG0o$mbeQr=RSmH#@#UKL;lgnsoLS7QFXSVx0#D<7_aiJJ{o>-6mn=6OdT0
zU8#1Pdho<uhn(OiXOvgGQiv4-19@mg$%fsq(XxX?i`*~c*FI-ms3_>aD&jiEQ<-8F
zFfwgvEAIH~>q$ir!ejl>Te-={Hu;-y#dw3<X`NsBe|HCPEG9s0=D?J&(bwLgwQYL*
zp?DvKBsrJY7SH&y)&UUg$}hC@w1?&N{n|pCLt`Ggwq>w;n8+JSYLZFi4FImcBo_x$
z%$_ba0o2V!9|BD@?K{ht5O?KuI2bE%cQpTJ=^bJYudM3VDtNkw{=&OcJNb3{#;c<-
zWM0+CW$JQ(1HvNlwBCNGt3K?J^b79d&m8=(+nXOCgvV^_ID$IKFBsoxXDFs(oKSff
z#KuJF@*6t0DgDeu8Gt<3e3;c61avs1F|1=8RR{SZduH^3{&EL^VTy}unVkCP{UrtO
zK_k&rDep0z$Cv=_vU*2ipLb6$4j%UjyExWq+p;=PR<pn2Uu<lT@EoVma>FA1KU>XV
zL^IF!g$a9DKK_(7ycqU{`pMdh`st=z=kXEJzg{MXpkPH<aJQ@V)&AsChPMKoP|U!2
zoMH+R-fzarcUzgnr;U>jQGjBDj-ZA*EITLoQ(kn4BTJ#pf0);evW^O?z0p8gQ+v#X
zm;>gd(!N16<#$4NG}G6jhhH5&^%E^#U0(9}9i}{OVbY8j?EiU|5YxqGc+CoKpix^0
zz6~U>8BU>SVOS4EO43Y(4)gNoW&hay+T(TZ`n8#pmNJkV)|9e(7mZqX^N?I#4m~D&
zj~Nl$j2nXF>W`M*0T4z!xB>z+5QDCTh*w=O_@h!eQ{CSjm?#X54$!*(P+_aaR`<U;
zLXWWN+&^LD()P)Wx-TNTb5J<pw)p}`edN$cyTni4*VWFs$~Qtt%HONsI>nT?sCKul
z82ak^P6Oa4yYho`(Zu%7!uoplk3GM)BYc?PBFo?S!-X~#vnNbk3=`F_<c*V}dVlpR
zU2ryFV$svl*qeEUbJj7h!xMhoo!#Me#nJn7u_=M@Cu{j<7VrReJzLSi#a<RHvSdbB
zmaUfsExGZ5pSGc$q|+m$jZ^jH#0@2~@gz>_1d%n)n;7l_9kf~2hPpvAg}BZiG9}Sp
z4^L+8VG;JDL+?&*j~Hd-bY<lrPWB!|Xg}zG<tEwRaXO8p9ToB_;Drs{eHhU$xc!~U
zwi0)4;`pur;C~0C_L5KiithbI?F+}x3CjwMA0dd{zalkWGNH3xJrZPgOzM|7^a$PW
zN;K!W?UQox<4HZtMcC;vIly;iyp$M=Pd|=)HRBgB$M5Z@Zg-m8obO{OYxn)=?2$mB
z`H4(oSzQaZ*chO#=ev2hj_}H+<}rH4BkWZaYy-WNhoXQ12)+{n?Up`3%Lz#%Kd${;
z-*R1!K!+9?v-S_brMmGmBYjpH1A^q-6XsGP2EMPYQaC~Q3E6D;4^q$!%PBiTjFk|d
z4yP=U$}PYfFQV<XH2C@CY*`Qep2!#9GQ7C`r5XUz;gLC-%XTjvD6O7LT=c+b8-h+#
ztCO(2u&FwXXv2Chiy-q|9r?20E$`*9d;d;vCMPg8&*0Ho2EJU!(u;(_G&>GGdqc{^
zmW*kMP0kzzzZGRCh`8@w<AZ&?1w^uTV(O_vQ#amxLlEP-VyZAKnD!py?+&7=DPOxX
z-=>drSKGbOS=hVuMN}=-iGQK}ATeZ<Iq<V)W<9C+VA$uTi*w%~T64y;H9o*o4&^p9
zCPo@KGD{w+8m?qiLH`JFR$ahdG#|;n;@_5*V{9S=Oo@h62TrnYt_tZ%H8?%R?FgT0
zW~3cCws06ym6;F_-3o-|0BBmMGfwTqSZ%r+D{#u{z<ofoBlz;%iqh9bpy-!(nH&=p
zpW>dk_yD8lLFX2>p(jo7rMe{38_e1k0TelC_&4vCj{e}VqRbIY!v-138W>~swBTtu
z>~e+oQ9B2$Wn+&D6E4*$c^r95O%%A^0$(<}$dZTuoG4oGw@y0+>VJen{u4(4fgRKm
z5eX$FgKr=%j28t_cGr!sY0L7b_n4$takIXd1{6me1M+Hrx5uUp{XDu#8l;lpK2d4E
zmjIX6={n)WJrZ|H@xlb8oFj8oG90s@RxMNeZA9GS6LzxLJ8)*N`;>T15s9>7BamLs
zL5E^(NQxjV`1!{J;Gq5C;!sOV0&iO*l2v;o7>JW5bWw5Ii&k;_q6<;$`g33uejoek
zk-C5PpR|Xg{P@=@7P!FHpLrLCI0rc){xa6&yrBm~sezHKR}4ZD;M$8dOdf3?n@Z}l
z-+fh1#43_gc4I7hg1RL?LG$=v|Jj8AX9Z5$`d^`n;$BU}(sC$`(B`E3%kI7uo#f(k
z(f%hhLp}2ngiI8ME-Uw&N^XOm-`ZXbCdde)3&kojnkU-xB9dx9sU=(=Oausgq$aa9
z(OT3q1^n~rPEZWfqF~pKOJP;#5*3^x=XM&|IA9T+o1UgTt)TJ(i9E*xS+jPF)u?_p
z%rwWB^8XK{#__QSDRtRh(yK^^i(eW$j%0f&0TVWRw8y^UH+(acMOXML3qfUa1`j|2
z>-@daJOz&~_$A(LMmuarUo>o4nUjwSxHx7PEPROGJTn+#RdH#-3QuFddj|yj$q5Lf
zpC>fDn7Vgzb+Gcop@8J+Pq@wt`#nEI-IY01EZ8>za7;Hp(+B^2pq&WLoW-XPhZwe+
zF|mg#?JfT#38iwxSe!k7mAyM{=RQ+r&zJS(FAO^=`L*4{Z83HgtNM?ym9=B_3rw+R
zVv=9>%)u@ttpi`F-^Pg8h#2{Px{^EVi$w-R7QlQT$|=J=)sk)CLZNp{GIo~y1AorH
z3yU|U9Z%W&=^={U`lwZFe<MHZ#)M>OR8f?$M-7joxa8gF9+Mnk%2|n18A^$FXM;r+
z{JS)|2=?&wKXgdCQtgrL4N)3D(pJV`6r3>2yrG@vr8A@Sot*9@0NwqBero<!U#U@Z
ze$g<{e8U``#v8bJwP7WU9we@^{5kGX00vePoM}y*i4C1Qc%ceadHiRL6@Lg4ApAy#
zP&t)c>W+WpcN~Jqm<Q|KO^WKej8TP4f&?Ub#vISTj85xP)5ozj@MD7(a1(pZ<;4Ae
zzEK?eC<>7(j5g6@2~0B1zFAv8l<;@WsYyHA5KecIft*pI<bV03mC1FkM5bG&(eB+l
zEYlDEGR90RKwQ4QN7Nk2OHeqi`CIW{=Bl`EvwvrymaokP)H|LO7p$;6dlGp-<M4TR
z@7BT=uN{-5=YKshzBKY`mG8!FQx#si_V0`?rK30mhtr?nGbg(a`l2*!11p4Wnr5^@
zUDu#8#i+;$s?iC8T$yWY5TtEc|6Hd0;pxYwvjo8a_UEXxPNw&p-;Qt&$|VSFIq9Ts
zSv(Ie(N3u5vfI0!!;j<4yU+nr8L!p?vMP}jnv6JFe1g)9TSl6sl5e@oqk6KPJHt?U
zWU-z1YTe&z#-6GmPf7RNCR`6f{!S7Ptdqw57XSbSQ%qa*`gm^HKwlf((lgm4lz~3*
zWp%R9cK(bMWAyGM34u>Bb&sX^{D?qhrfU5=usZF7eLrvBc!_Q%#6cvTQDOpOEO0!@
zXIx4|%CBl?LhQHKTPgG>uTssg4f#?19b$gBjt%aP4z=Vc0;2}-mG?7ent(KZX(GJi
zXvq$a7&P(M-h&d`X2U(QvPK?}Yuz1eizIu&s5ry6r-7SM{Ei3kze<1toCoGU0&00S
z-NWCHC0<pckx8k&NMZXu$%ExzBEzdj2kEfh_`30O3uf+j&u$B%C}I^c%7HT%kbq(9
zwMJrplhrS~r=xVJl%wWTZNdnQPEu{7N2on%*Pfg#TzJBVLm1qTV~F~mtnZKcMead2
zchoj`NyzxWjkz8K)ufJ3{muG-`16R2PGX8;J~XKxD}c{)J`YgyEvS*Ps4&znp%^<?
zNYK=u#sft<k(5>7MD@bhs4+di?(;1S?+qaa<V*7|@}%0oXCr@E2OeH`j@`06LZ=NU
zsYe*t-iTxBtNpDh+<U*QnkTO*xO&;C_p`X|-fSf2P=c>90IHm@+tn!J-LAYMij4F}
zOUZ^+nXF3YEOHJw-77PE8=3rZ?L59AB1eMVAzm;bgwPrmpB5HNvh~m}6SO6`c)Jm_
zM|%1x;U^FIqF7=e-($O*(@L#^<$qKFHU2WW>Yt&UnErgO#oHC8G>jk7eXFevOHh21
z$MR77)D7mMNW-Y(Bnqu_cVB`hd-l!**#_iJo%BHAM#3FJt?>BZL?#fv$fOY^FX4sN
z_NP$^Cl`K`YZ$+ZDPs?+R~PNK2HW3aZ#-}b0vO9Vm%W+dal77#yZ-opgoC-T$m5cB
z;=<Cli9*LLEG;zBS4DTRQ9Am8<V$SA8aCb~8uN}>&nQ*pkWFm)H)HKEX_He{c?Ws?
zcmhEjo3bOrA2k?Wr)`!CL9m63F6mLJ`Hk8JXfGaP0SfJgu(Ez|;cC8%o;%(q^2iKG
z_t{zJ30ma$-e3XyjeElEoI%q6(z>cSzLm;;kYbZu_4`KvUfh7`&|{MUOH3c3ba%H{
zYX3fV-?!^tqdn%RR#0(Y+uit1=RPuRV@`D9Ql84_lq5c>`{17J`I*DPa@dTnhh-0y
znr5>257F@X4KfPx9@V*4thgnQKJU6^r{fo@%)>k_upS>_dm{Lz<4q442($fUty7Ol
zDVAm3^Kk|ugi08i^mIMJKY^C;qW!_0j*CJ_8o^tyEMCDOOQ0a}S8~vFq3vIwmiElh
z@*wSStf+eCMPV0;?kccK(qH<;3wE2AmbXnKO1^e}O1!%6RXn>4GtLlV7HQo7e%EWh
zn7lu91#V#`Zr1V2zW^vsQ{{U4m#T5sW4aE--^sV|@KHSmR{7k4VL;}I(T^)yN+;q{
z>sPsmF<UuaEsCn95Lol<OIyq2;&z~BA}(F&X{ECZ9K<MUTO}5hKbq6nvisK*F{bq8
zBGUn6odvvb(HsK>09+AlGTrk{cz<K(^dsfXSQVGvu%ykkk+)aBn%@_}+G~H5r*Awj
z?0XP}C^cx*Y>}Xo9ufq^d)A)$U&hV9y>9FlKl31HBzeI*?jKnS{q!x?Zs&fCVFP|N
zA&HLI3Bv6ltGs<v1D9E?Mu?N|3SVV?cZ^85$9Rk&Cfy@%XvF48)@HIDLB1YK&VMjM
zY=*FDD9-Q9@z=TnE@J4#hi^H-)Z^<3z`Ze`v9*8}i;F$4tQ%<jT?-cXiqseII5W%l
zKYNak1Kxk>CvUd)df!lSH1*AOTznv(MwA_3<0V+1-sFfUc$WKk^K<X)#xCC5*n1RE
zf`<gy%=f#$b~Q)TbF0?a`bk;!{*pJQ9B=c_1c&%S#24f5HJ${i+OGyBI|i=bEW>o=
z;iEYdv+p$xIU8K0O2dJBtZ_pEYNRZ%343Oz8k%H#gWyrr(g@Nb^5#QE_Z;ffx>EFp
z#_&APW)F+60u(SN1C=`Jq#)%**gAAWP!*f3_1!4Yvx7L#-T4NK8OL7jb%#>0IGWz2
zW&&LM?@k@c)s_Ls5h2Ecn4mne!5R@4pM8dMAFbEp|31C)J*uC)UF+tZn|Jpfz4&3T
z6;l=dSFhXz-jYB~-8=7AUIm<%R@o#U_w#-E6+KFay4I1x)}~_!hmJ&JlIzj8T~RuF
z+I}5g0X)LxM8ki#q>dnL>U)mJ;>T-1oxMfZ&BwGSJtWC~Rzs>OZ!W$xNpwH5?z7rj
zoDs%8PuH`7O=qg%<l{?Qt=yE-YW$_U?ladsoUryA;Uxk^Nk<eM84leNce@OTc24IK
zUp7^E!dicu0;xP+6P<#9^gtt5HRzw%x=#mL7~Ki;*gxLAni%wUfKo5-Zl<GI{tL9}
z5`G|!MH0K;-XYUa+<*Y2?0HIaQ4IV4o{}a2`!CD8ZE_(sQfVo4816IhiB;a(z*C-E
z8yWg(F?EpfPfGufrUZ;MSO<Le=uB8A=joKmWb*oYb-SZ!Rw=WWLmEMmF8ucSb>?&S
zf2C0O9v0BZyd>t9<9A2Zky$B>khtTlY==ju??8PxUpHgAmJE<tg8Z1jub?mFtZD#r
z`+b&}f!|jzufy_D4_~;YNhvD5=xqV`$yO-Q0zY>HV8d$t&CD$UZ^KknhbW^YHy|<3
zQ<!L%9p;;43CZZF12eRr9X~~9PHLq<OBDZAHm7ETRSfYQ`2}L_B+{xA{K5{r^u2pn
z3wbK1$-*TLr_OZu4Pxj7geOc7LjI#vvl`e}udI!}Uoxn>)hX0z9&`1;cSR~%{)#8_
z?|MT_jxim9<&l13T{hPq$l&O*3<9tV6yU){^$xF9`go3PoM{A+VjB_z(-EGI<;};t
zw-l?Px+0x%_2O6Yapx4f;s0``!vA^u1Aw+u=Ph|#MDB7#Etp?N6o;Cv!5Ey(Ra~d_
zr<@15n6*~gJkHZ`1l6@?vPFfAt*lY$Pp%Xz);|CO4QF`|3ak+t*0^T&*0gZmNbXzf
z@l0O4QCp|gT3+#|6ryE=dZA+S;p@JnecYcvD~7gO#Q}E2A9F_mPQ4#@I?m*`Rv)-T
z+}I)!A2j2aMZs$<<IHnnK!u}@b=pa8730&hv7K(`#*_*7zpEj6pZ=}(z-f-efpEPU
zCTOIW!MI13<jxvckBB^7{KjB304ivH#EwIv*3X(b`F!d4UB&>fps@V1EGZFCn<fpM
z*6~QX(ZF9nV>!$EC^@$^oRaPSpzjabX1c6yCp)mlp}6BXZ8Q4UnH&|}-Wqzyo~9<?
zg%%^n%|jvD$>KgZxaR?0PKxKzH<AEdzI*En(Gu2UTYy#_VR*3`w;L<Av1MI|`C9M<
z-OlDqpOOBbJp?>}m66p!xorpz_NyZP-ut7Acz)A{1>wK#BAlIr?RnPY!+JzK1rd$E
zmLfH^S#oKl;yg;+$k^56UR#f!9oPVc9;5yTbQTwnX1D#f3eKFZ@uLtlx8f2g3@%Qz
z>|Y8)JyLy&+qb|@1$KW*FpMcXLHCzaDkYZK05?^tlAZjKbo_8DC~K;EeOHUNV*m7`
zH%Xb(jz|KPH%=7)<LSjbj1hC`<>!+)I6<y=BYRj3-Zx=s>ZVu3pcl+Vbq8~x-;;BY
zM-lN5V)B}G3BsfAgz_OaaFHi7oP0$bl3j~2zCd~Nm3SME#N^{pwhjL(U5~Y}<0`kF
zj-N=TtMfrh#R<a!DnkiM;a16*?2VqIpAxpBZ@`z;Vq$=>o9RGTwpQi&#i2+|7-*^^
zpOG3r(=#K}b8Sg81gEU?2i{%n!GomsRVXf1&1Qp>M9f0u2BpZqFo-nbN^>p^MyoyO
zZYmXP`+cg{*~UU2Gqf|Vw9)gR_|+_V1?gDdy2Vf^9(II2UCeUhAWOBCWi--}>k7wJ
zF+Fhy3TytFQ9#)B(8Z8-vZJE_LszbcsG)7bRF8`A!s3n4h|>b#h`g?cMrjMicT?g0
zAO<Zygra8WS{Q3D>ER`}`=GrjA=7%TcYykNtZtdy<~=nzzC<#=_ujwmgBrI->;P}>
z5H1gOXZDY(tm^N5y6wa5kPHx*rV?)zTm<eyBe4lSbjahMXh|1DG&GV)Sncq1`?lgg
zi)>3Xmiq-s?~Lxa**`uIG_HoH(k%rbBYJG8hHx^m110Yn<^+?JC|~Be*tt^W<?oyC
zb<Zdq)$LG?@@+Wllbs46Kt5NMUbBfDCZ2v&_>p8#z(}d~>x(}^v8zAkh@B22J^(By
z$A`x8bXmjGAJao#p0oU?L4@F&ho_XU$A-B}^?H8+6&I7wD4xlhUl%rIyj7*`5|Bl2
z<Tc2tni#OVk77osKu-!u71<Q*?wn6n6RLPMIpYj=Ko0v78r}sMW~acTYO6(7d2ykW
zN3k?Gt*n&mA3>=3IPC=CH$W6-ARvKh;a{N8)NfvT+L#QC^wQ=7R+aKCo22zWCmTRv
zgR#FDWURmU1}s~|EWyEVnf=yiADNg)Q9ht4`Z%6NVu6<-9&zpjm}Ye*f-fYub9Ix)
zy#bd3fah&j(q~O0D7Ae$16f%^o8Ml=n>TInHt<cr$&r+-FOcuC`IgRL;I#(8$b1P0
zOYoz9&ns{*EE50@${c_H#<CG)`G$2+Va<YZWeTgb^;dS?C`jmKElT1OimT=wVJeB{
z!NM?%e~Y|D^~-Bpgz{xofG<@o(j0yIabnfX&}tc;MO?jlR{Oi#jFJwls2JKs;I;NG
zWQ4yvz(pR=vsI&e@UHC#0;a*2yW;Vz`*#Yo>OAI-10pjvQlkP&yFqc<ws&!fGzxm#
zDSGzBGr<C37^kWKkm!J7a?Ccz4@lq-6QL*vK0t`=BFtjiMPpDQ2KkmwjZ=(b5I_gd
zX2{I>{CuX}F3e3RSQwDlZ+Y*9V`sv>{n!|vSbp~YxsU6a1`PSVjcEn@_{;_@EUQ6|
z)4+jJ3NdSfH5;IKzXfhCq<lil+{1z|IH?clmxt%woI<&jd1=ff*vmI^5<*^_E(z7+
zrcsB<aygyo<sj#vvKxB(_{JX5Gy}%Z^U=e`2k6$W%iH)Cka7lRYG5jv;fXe!V<rGw
z?V4nAZu@oYpCw*13!hvNdC^0$_-)abxa<i~;kx^tD8jx9+i`ifZBq&K_P~Q5WUf1(
zQ~NkoNfh5bLzbbI$2$?#tQPPGr;7Xn=Nd<ndorV##c9ktS;SN<paC;S?q=i=nf^dE
zYJ7i+owR|o+EC_zmzl3FY%f%-w*=@RUeEzyn**8838y0{S7_KjbJ@Na`c@#>leQ<V
zdv-j5YTO_7xWB3}!pU-$uJYp5Gc=t@<E|Oko*2S^6YlNP?rfdO8|NyK<*<;4P;~0f
zsqbs%eBAi`o8;(4g?iuvz}8UsN7%aSmII|QeeJh(*lJ}eh0V{avQ>7cJx?v(eSA?(
zNc5N<Lv;PX8x)|e?q{t%#B=fONb;0j?bNiFW&WY|-W~L;!QSltc2!Wwq5>zN#k%?o
zu@6fi8))Eu2l35|Vt49#5;l}Z*K=FhCdr=ma@H)bx(PXo?fFt#8Qe1(dSc*lY&x9P
z4!EJ~o=vU7PhX|~AmVQLe2?euKYq65K!xN;{WW9c0LNa_n(T$jKo&)^+CUoxNgfv&
zkbZy?o%6tx24R?h!oD*(8DXdj7_dB|;L@Un<lZS8W3fz}AOT>p(zHL(W@ntPYB)h`
zN)lpsWUxUYjM=N7Lo^F49b<SUGZMhx7Z}ksOEa7XKF!m84|dR*Uo&q+Qqg;^bxI1H
za6cZRHl{p55EAX(0W}vFJoN)AyL2M-oL&X`(3fimzAg)Xv~b$cNal!OR(|F6qhfN4
zFV#TFh?qBc@qWe9TnyFS)ng^e4jKYd*Ykb@z`6%=FGLTiW7~C;OV3c#)qCj0X*875
zDm)WLhu)zeQ#bg9h-9^@;hKf`gfWAv@__=1zV=r+V1>1B!qe*`PRgcOPipTCKn#od
ziqA3FLKmW^SDOUup|iXoxi9l7TPkDw#&V<%xQAKrCUMq$<$|EwO+0&X%ny%5&*F7-
zi>N(!yn=U~K_r=y<UEnP0#MDjTfw2c?JhD+7}YC9EcUGTO7p#eq{Xl+g5Y?mMh4}G
z_?Or{B5AJ3h^XYq%d^~y_E}RBE|Kr5E7hHBvc{>&?Gv~-Khw3}`p&vz)G7VdU-!$%
z$HjYC;%}`IOx}mmS_Z{jh;s!{Jts~)y~6N?`M_M6G&#^m#Bt8eV<c{ukLVb`>(<D?
z-Vf(9t_dCME^5iTJ8jXJ=QL&!(>6E;@Gw1nkC3S;;GMpbwlw1oS3=DFMVgtSA5YG1
zlI`7{d?<^WZHfh6nN?b2k)7Eq|6I>KA0_fcwc(F&@u$=rC~uJkl<L+?@$}1Oy%4Te
z1*d(hIKF&*X@PQGmYU;GDNg>rEI2jzQ!fM%J}R8{jh~hAzg(ltPt>fja!Z3TkC>i>
zKLr&0OqtfE5<6fH1Rp*IW%OnOTEAM_)(6)~V75kM<o>*w@ZWDAtV&);*0gTB*LExC
S0{`aaiac0JrtrDZ`~M9-Us$C8

literal 0
HcmV?d00001

diff --git a/docs/source/assets/design/v1/prefix_caching/overview.png b/docs/source/assets/design/v1/prefix_caching/overview.png
new file mode 100644
index 0000000000000000000000000000000000000000..14fb985adca032eac15543d559fc2f725a2b3bcf
GIT binary patch
literal 33028
zcmd?RXH?T$_b%$*?rj4EMJdvxE4@RgiXtFQL6MGt5PAzOK)^=tJwO1ZiuB%zQU#JA
zUAnYHY9s+dsOJZ|_xt|umwUe4amP643xkpTva;4(bFTT!=b0;EkF-^(F5S3v=FAx?
zb+!AtXU_bIJ9Fj_jSJ_1S3E^IKAk!9@{IcZyH9+~)-ui`uBpr%Mclmk{66Z^xhJUC
zpzKV`Xt;=_%ymw_5AXgA4`qa4E^=av;ShNp1$?s}zS$q&Nt*FnKpt<GWTurAuCUmo
zPO~J#TC@6z;~6U(%b8>?PbJgCB0j6PG|Cs)|G)cl;|^O<*x$0~l}AMiC3Jjc(;a5{
zb}hA7h6Ma-tj}F40q4Q~;LGA<oN-RG&Hi_ilDyehN+#oJn;NU-ZJzvP1v-JG5{5)7
zz9jTTh1bXQqT~Rb3%#fFz0dRpinhr?rcULCOz9cz?0;w$+#O!r_Z2mJbk-qti`bE7
z!53v`JWqleHotpGE>^7CGcQ`wu(r}1@)LN*w}1B$R?A;wd~Bur@i;%vEXntA!>KWB
zIh7$tOhtyTS}!CY4%rtDN`1Y8CYu+x-d^5+$DDzewzWDqvKPgvrut9wzNzGZv^Lo7
znY2H&7MxQ~K+Y;<6V7TZRb{)Mnswm8TkHYWmJwl@mdr+%=zSle2r{Z|897c&K9n*V
z)!(&B%>dyoxAS_&rj>39i{ZB2i*8Hda^(;G?8X}h#C{EO4uwIu*rz3j@Kk^xGw78N
zeZ0Zp_==9cu;KcR(re`*Jwv><BS$(#B&}AMS{#UX45qK1^O^v+$1=}|Y4x=aCwCSL
zi8S0jN*VFi_ckh_D_;1lHq?8NHo~TLWVoAE$$%^yPbFILnB$RJdd{G0sl@mB%T3=t
zJUQ19t?wNjwtf7_SBtd?Cz~(}FGpOXo3XMlKiC`;8>Kz<A16&)Oiv3d!vzS*-j0z+
zULs5d+uSswmq*~p7_0thWl}Dn54X9RPHcZzHqg$4WVemB$0CyFW|NW!l-C$-2dn9Y
zM<;8WZ7%04nDwuAJy-a-R3(lR0Jp?s6F4Adi4et(MQORj!|BD_!pBd2wLpDyExLvu
zcpvtCN9i(7Qk>W+R4PCD@_VF|UBM>~NPvGK<+Qcrx5bjiYL0#PNUq`+gk+=M%(@%T
z3EDCJ$-QdQ?<tib+I*KvdIFj<^E}fyC2%om{%`3ddRA0FSDt^k+Af|a7Cpz6knU-8
zL@rJo`g7go$q%*V*|U0mM>K-Nt^P0UOn+Jf)0BrkZ;{I;7@^5Yd1KWFB)+80L&9I*
z&la_7Si`ppv|2X;;mw_y3#2t&qu_JrmQGEUP4%G@!D;l>a7^z7re*xF_ere{NgNuY
zn>;W`Wl==t%M&vgUu-8i*^M+k!J^5+nlqPB$7>PE`0>Ymwt`48t(^Ci<$Hm71pbHC
znOE}MJ?<AqGMzIcDxl2y&K{9$j-uX8xhTm&v}I{lJ9U`sC2>O<Wyk%CI`<iZ<NNjR
zLqNFlT0397MN-7CfO60iEU|uCw3**itk`yr+Y*8)4{YU@*Iqi4HF@Z{EJ<E(wqbIw
zEt*3^Z=qJR=N_WJPT30DCXZZ8z@_-LCC^X4aW~*Hz(zIo<MN*>+T{17xZRuEoO~vZ
z^JjzGk1{a{7L6c?4KOg~(Ui<UPl^xpV=D1gzMXOA)zd>LRO*Eni}I^v^W+5VoR+C0
zMm^&v9^zCe&{0r((7ZLlq^xQC45=&r7K^B+%#zbu>00}c3^D7N$+v?=`d%m$2pE5w
zSBXqEQ(Am>$>-~0Uz@)d@^B5+pfmkBGGy2u^9oZWohnb&Elg@`v8(xrMZk8QeB5uI
zE`K7{2-P6nUHxu<IO-eXY*%U}^&7F93Z3Z&v*WIo%aWJC%P(p*9$aYnvItuJPU>$p
z2vA>^l%^<38MTW}P9wS=LUODZoA#}{lY1FVgU4pmdY`Z!k!H_l2q^^Y1A)i@$=t?-
z^%`4Z_R|mcUwam7OF=JDuMXTER;u`km2VFFS!|a!<7cqM@hj*jfMqrl?s1Yw&9`jq
zm~M&TW+XV>^F#@`nXy6(eI&jT4b}VR4|pHxYJ6cwmObf;dWnZKcwGnz($dqGn$*w9
z!!oz;HS~-Xa6{{KK|D!`G`;G2n}s5|8M`E0*ylgFMYrAwE9F-U?N*MbLXN9BS}i_~
zAcmF`#uk@DmX$~;z2qbLBZG$VIa0Ytd1Rd3fuAsA;_py<H+uQK#}CTxZXq!o94q{A
zNu@E*?rBKBobaI2XWt6YdD*Bv&xNgo*yJ?GH<X0u^QfR#RFE=8!LFEM-P~?X>xpRL
ztPBvi*lm>anNqGk1fvz(Z+u+ULtH(~eek1r(AV|Bqi8egWHO$d{!a7MKdP@p=Dyyv
z*RJN6wcraGJNw|Q1xT=(oncEuYiM_F=9>j!J~|7o-!-svn>qAyvd18$oPbz0JKoLv
zQJ1?>%I!c*E`I&JUeU3D+mV;#ZnM1P^FKKP9pR%TZAlNm&g)!?)M30kYpf+g{)T5i
zGEjz?bV%_8vhA>4rti5WdZdudm*m8nNX4J@8N<WUjquc_9DT*Dx#asEgp(^+OZz=b
zk2@kqAL!Il4Wxt+8sF;=%A>2^q$jw3=>Y|$Q_1#QM)r7?op)wEmnMI{<p%sv<nQ2g
zUL`+U8RCauxj5r;R)4^~=64y-i=jKmZq0Xo#RleF$u_cjFTF=DwN_d&F9_jv>L%>6
z3j3+@I{q)3-$k*rEEnX9^_!o5;Q#+WC`QvJ&;7i^1a(M8t7u8+y!_Y-(@^*=3?!@3
zbM=Uyb0R~?w9((;(mcDQzG;)!A?5<RjmFNV@G(2unnTF#-3r%u$gt0q6q!e!@oV?&
zi$m3!xdw!|Dq?R4&tE<N@IqYt6V}4b|7)Mj<S>Pdq1ND{3w>pz8&HGVNyYQYp5L!@
zTMxN3D2<n<DIV;&j5nm1;%k)->h-GVmG+>O+TiPMVP~K6_9%vXkkR6mwEgm9hsSf$
zfhD1$vb!H>D;+lcxyDQBS~fdfrtn-sc+_wEs>bD;!i=T727@+^>jDV-T$m7Mn6B4C
zWI_D;$K@m)WM8y`wK8<S+k9Wbus}Q<-f{Qpy7JWsY&D);)taPu*tnlXG&W)D(uBt=
zkY=9sc%he9<XFANzDOevOjXDfYzL^8*(USj+vE9Fq4gcFt}5qpB^9U@AQU8-nn!(?
z#uR*adcCHs%)#FVW;j)?VS77huWv9maO1*ygvU>69S=N+Jh)mlO;WgJ?Dgr=_@w!v
zqG6eMI55`7gcoco9LcLLl|lQ*hvo<4Ea&NI*>DOwax$yq4gubCVRa?>(yhBrwVQO4
zFDQg3gbhbiJ4sCEKWfLru7x$$jZk}e93u%i=8X#sZr|sYWbm@{pB!7szAw(RK>aJ7
z;zV|bXV<-EhNfL1w4(+1KgUsqY}~6}Qk&CV|JdQXGDTDH#UrJqVq#S%oq44ReYCuV
zW9`mYs_E_Z=u~0uNxJ35O+Aw-(L)O|r*Po+-a_F`sSFkWqg&d_p?}8r>g58sW@(-J
zX)*CT1>S~8+Q`TV>_qqMRG97U(r?VO)s~jD?!-E_{r!w<)$76Uz{ycl0U+r4%t8a)
z2Ctc0MbWl;j}}R3J#vz-sEixJ8itp?{3L^G|CVkX2#HUX<{M?-00OtD@bAOdMb7AC
zGcR&D6A@iIqsZjB27fV);c1<eQOh<}vm8KJLRu$ota7b>6{$G5#NWUhzFBDiCmv}C
zBzdmRC^T7gB`mq8LPpNB+jAGnt9<N4<S0<fNKE>t&utv<XTkcOc4T)?A}*vD*^tA<
zgchrtyMcLcwsij5u#@NKnxei0;~VStk4HnYyjLYtlJD<zsEkc7Yc`^wdAl>&G9wz5
zF&rb7m@A&Cvgl*^kKmT<0^?i(h0QMTb!++C>}YPYx~ed|KzN)jTfBipff0%v!4{C=
zxjdLR<$LrdW5*Tu*+=0mXx6pLV%(xT6CqeN_0rup0@bkchNdR|NhZgLJ*Hc$^-E1W
z;kYR4uvcN8Cy(7MhK^7(YK+WY8-@toVrs_-0Y}oQw-o9_0;aw}DxZ!SOdj^@(aaxg
z^dG^C=^tX{ug|(Le~?w*d7Nv621%ptNlgtO>R<%jO7oUt75Nl=w@|DB%ZD!v!L+kZ
zzm7Uv?HV|wS`FE0a)aj>qR1~R&EbOS5j3}0dRhg$tT>p|hbU6K@R(nH>9%eO_FxMl
z+>>l#(~v7ax||@488r=D$VD-|`k#|)TTZA<(q}{@K1)DB`C4aBkJlWvnly;4eVOEo
zH@VGerA;`|6qkURC26ic*_>VHg_~hs>|rVp03QfXlm&Y2R2Rh)3}z6-W78O9d_mn5
z=`D&Z{FLqOhooG>HNXPe=PQjUcM_MSZQh(KGkpBOp;($S!?c6@x2Kh05l=z{A1-Uu
z{6DQysDV~!6dnZ8o2|=YO#Kf{dnz$}|8Y2=?vno_pSr8YTPaN-;(8jSgZ9?mi|4~2
zp;QQa$C?_SP8{&aVzyAlu-LddI5WkOje_$IZh#bU<gz42K*z|snN&K1ul;idK8XMY
z9um#v5)>>}wqESp$AfhK4f{em$PuA?Rwl<(<Gjn;LiSc~#Xa}RV(|#P)a7_v`rp2k
zswdP(XW;sNEhFoQ-H?tSDBkLdWo#wAotNrgV1j6>I04V+6T|UlijFf1-PaiRWpa$C
zGO>HqK-8!~KqPEyBr69BGMA2!q{6Yv8Q9(naWD*CJjSSEGJj`89q{ezsSsI@JsF)Q
z?2aSaGo&%@*?~bzFDMJLdD!(UP=Mj9efMQ_S_pLXR=P(1(NJJy*}uKlCq^=O5B%P#
znv-P=PlDjziGM?g^F<s}i5p#EN%up~Hf7(<PDT*RBOjNlxy=1%-t%oN`H1IjP)(f5
z(dJPI%X(y*dB}lmvclmGu4kWJHX?3fu<ZD1X^vKjphw2tfA;t4gQ)^h|DO@aCmjqW
z%_}QHo?%SFNVI}Z>q;_uZ*g+*P<bF*)(7pGK)?w66E|VkDiwB=Iu832Vr4MH8myfm
z(xdPz$=9df#-+0&&u7b@gz>*wr6cMVjC(f=LD=<<9I)vqbY+r-ZYZt57h7n~vaVI*
z0@OiKY|Z9^C3xb;Pz~l=?Drq)LMM#jpX!UNB7SEC-G$#arY@}D;Bkw+-xS-SY~TFv
z<GEe`r#%_o@l%(wfFZ>^5WnC_ow73@Vh(OvhdiI#?Wr1~b@9j6wj7Vw#ao|`l$G-P
z-cXl7h`#yH{w#0x(1Z2zXA=|6-6r2S7thu=bNjB8^jEb!U`<XO;|QMS!T_$DC@|@_
z_~Ei#rqn+%)**vZsq)e7ssuh@S^DRSnTc7%25irh9_O<=LXA93=MTiL3&%TQy{;tu
zN-U3apD7+<4r|*iUbgJAjL8*_8V;2T6g$*}zIO`rAuy_g7K+#uu&CT-&i_IRa(K@>
zwX30L69_~7EnLRH5t%?pzR=$3CbF#6?C8}<tT1#o?kc1LGpQWX8Ot~ErFB;4g!W(S
zEHnd>yxr<tQ~l=30)bXLYmtX7yw8}U>mGjG;V8G6ZrXL0c0<n4??*7^Lr@=_c`^BX
zmR9ZZg?aJcnaXkso1JdoJKv*c#MgDtyxRO2_>`&F5Ntzk+?H5U71O%!*MMO1?gh&e
zZ7R`ILeX-+($Tdrrn{rCgBqO#Ym<9I^(&*f&D;~vvcsy!IXddS4p$%%u8x|Q=#qyX
zjXuUjeka~{yQ+LG7RX689SCs<S4pnsF-uv@D;q<Zdq%bxyUyo4Z9Yxdw;-@h8+Wo1
zkDUcjHVOx95kKOO5B8Ouyu>oVvxn+)V0(&9*l$>59|i(*P+1yo!N{|BWX2!ZV#uuK
zyWhb8K)VRR)N1NW+9+H1Imdx;cA7OL-egoHV+yAcI6_g+_{8|9&<Yr%p!8~AHooup
zTw7Qez^gcLRfN_#Xv2xE6aqgw?hCaxgCN?llAWXeW}<cHTf_Gv)+$+rHJxtVzu*x^
z%`gH&H7Cs`eU2UDu({ot4hd-Ata0&bLkKsZ+{cw81dKN20D|mjVg>vodVA*?R6!{1
z*;}Wyea#ZV)>WBD%AtwBa!<f;*g|CxommI^^=Ok0oAmW&Y1;xV1Xf$@>@%ZIQ~wy*
zWsheYXb`@?r(9>Ybv%|zS}>~vBZD7wtmsV9+=)Vf@S}oFSd41r#xxER(|K6=6=Br{
zBvUJCE%9?PsY2>W&&PwtX5vc3J#rk|wd&mW=U_gl<yqV-+-#3Nb2Tt0c=QQk@>}no
z<?S9!o8J%pg}I*;fxyX+*ubbw>?EbI5}5%uF@D`maT5hG1!0?2V=~7tH{lm-f-;ca
zc*g|6Ize+9ZFwtqspp+Z2{29+`A-U=MVsZ-fvOaZjGujgozsSHtWtXJoBIp|jDN2g
z<u~`>HS>sy5Unut@5!5fGBHOPs9A=tp6<JB@YxB8zT8y%Ls^#hUrc|JPv8GSv=B)3
zx^`1amdSIOHW@vg+uaDg{$0UsbIaTGY)2$J2UX_4v<Qu6m$x8u_hsv<FnaS<%vpwW
z-U4wT@Q#IzXr2(nU;gpu(5grxif)JWMX}zF7h}=4BuK4*Q|xy?`^=-eM>ATbDzi3O
zX5*hY2XN3>;R#YQ<=i!OgQa_IJ2^9ULAPr{>Ir$thBN4Q>v}&$aPk+8yvVOM>a<`N
zjN8V$ft;(c6D5L^kabgFu06yo5xya+U|FpciDgX(GlvNUcZ*8mqU8Ono4nXpG~plb
z!E#w$QR||5(zBkwQ$hG!Q1>H?+Ito@%^>9V9Bx`=qK^6NB}@=;kO}WeucTQf-GQi2
zHgtaJ&_KXW;l4iZYR%U~R#b0<VD5BxzikHLj)C&wn`wE=I0c&gHA>n$>y`dlZKDuY
zNPLqFWB}JPe!L{z@46aapzR<JT`P_;BzAYLvaMCC20>jY+~gl4R=CGi<3t`ajLBe3
z41j6uMTbnQ&nT(93%Ze`b$r(7HhH7@@w`nKK9xxV1?qO|HN-Vpn|!Um_56zuT8jpm
zv<r`w#vLH1F@81<RO9a)ca$NRza2X6s@7Y>)$+gbEQP_XFd(Vu1W!Di952U8sesgp
z%vF+dmZ-m&J~Xv#)OnQi3qM<YvWtsY&`g!a4V5-bx9mt~Ggz;Pct-E);y)}1EbQaX
zCGWig;bgwlSTkZn7^X!Q)JbWxf58~$t&{;6s5Gn8t-J=fqlxI%^T$8$*G0lMDoJYI
zM6?$i5>4Dm6>c0Gn>n-{&C{SC3qS6nN4iO0JNg#79~&jrcTxqp3zTsGdnWdz69H(g
zR7g!kkL5UG?9@Ij?yWz_^3|^$<a;xH+#Si7<<zv`yksrm`t?!gF&u&^Y-Rg9>N?yO
zDYPx%=y)LrgllPVdv_VSz?=qgmI58mZ13`KY}1w<e_4-VGBLBlj7>gISljLoTwm$?
zIsKrXWX(8cfbnY;KJS@ucx`*#PH@w-`=1Jw!B{8&XXXV9lG%JKh(ny&?dy55CrD~a
z_-obe?h@CX2)dFi6Rtz48N@z?LN<4D^6;*VWe`9vLM%wUDlQoH$KU58FA^s$=o(&H
zF^J-3XnfotuG0!J4Rq&fJ7X<nY3!~~z3uq#V*+qanoc)tFqb2U#Oo_-Y574^K4r=^
zMqF+x*6n?pZBUE>xs)uV4&Ah7;GCT|(gG-`{Fmu(5-{@pEj!qQ9_hff);f{xP$q?)
z9v4c;hH$=_$#AG>1(x(Fz#p-EMnl8EU!df6^zYK|s_2aSBul)E9$kJ74R}Qe`%2rX
zx_8s`G3MChM_ILJq$)Omsa4MM>JF8GQ<P^VQ3%NniZ%eg)sBAK@qnE23=VP5w4<U9
zG=@_E?Con14vQAHG7dOP-LjjxALTx!*cGbZB_mlT+3BVtdUgHY)d&tK1ViR3r9L=n
z1{Dm1te!){FxFG``?Gv!d0Ycz1ExCGTER0Pd;&}3_F3JZkR|&-V;$X(xm+;hz^TQE
zsfMJk*slmkd-($EzP_WX)nI;mE#w&RcSU5RQ^S?Hz|w9BueaN;&XxMg1wEtvPvk$*
zFruK>Fe>*sh`KoMCK`-LVD4-9RE1ClBM>L0rUuik6`Ed7d2Nr)BVZqyPgoZDmZie6
zUMxcOdvhTCI8x@a(TE7H!md5BB68HjNN1kz`fxHC%ItEj!1uJg8#J<2=l1GIrBxE;
z4*F}M-91x=#!rYVx8i5h=U95r3*6|pS1jibJ1?O@ZMWiQlpR|@1xpu(=v?}ItK+RO
z_lAOfu4z*6a}NJhbc=*CvmnhK{^c<~a+d_9Gwb?=GU~8-&5~}W*uMsWL3VfG5K78f
z(c2=eOY$2u4?hkpY7mKcB!LBJH7?55=@^|QJHjCW6!LBt>lUUy6V99ajHN=1K<!I#
zNG9s}78EIiF?OocI}Jtt-|(bxw3@DzJJ)q{azR~yislHL5qwEkL#=FnnOO&MK>NA=
z-Yw+4(J<1}Twp7zG_1}cM}S<3f`(snRt=;IY{fFK54q56$L@VO&Yl0cC09yKo3E`C
zombQ0&GL)E1IlYlObnA>bRXm8kX(+uR}=z*4o4MIZ8QV2Xcm}58l)QcM@S>>RctIR
zUm+cI)ov)~o{`eG=m@AJ1ha32X`iw(+7<QNtf<c4Rs3Bt0Yj>?$aSUGl;^O}i&BaM
zng=#QOvvD%zcVu89x!GH)t`cQnt8EPb(z5|mA2N|krb|y5bci?`GU=J^WJN0*XxpG
zYHw+8$FBjEF&v`%dbjmJPYCr#31*nWQs+iG|1q{~V}IfHUnM_E>@Zh!21M@pOQAr5
z-{I-yv*&eOKP(KR0g0<X{y@K79~Qtlh3f7eG@7sfT}hu*sR(+RXS>zX9Uyc%<i^E=
zonflI#d5(i<30qZvu+`n!PW&st+?_R>4o@IG(-aDe&mfkNSo{fvOXrEvOQX%!O~dQ
z*yqpd$u;@Ar3-g1vg@8uL`EpkGM7gi;ao_w_Doijk8k}@xa{HR`6)3%lsfc%ji(p3
zc@zN=M*Ss7-(1iS!u{DFwh<*9u)6c+RrmAC;xtgV&sU>1g9%5~RXIGZ7siglZ7s>-
zV>-Z~0fY>ZA&HJT3mLcqqOV`bOQ0>}7E;Ld;C$dLc3Zf*7K1^Ir9n1S8n5!i!uEk0
zRNG>rfsv~-Sf4v9!UT;Ns?<>x67uLk%wFV4qHs7pf)v8_T29y}K`cU%c@frh0ONOe
zga)pNNY@O5XDcYJ+R#*GU@*o34I<6kM0~4qL4lB<!52SBpBsXizyWp4{HuV(BtP9Q
zF~G?3Ci-*18xd&^z#RaXaqB@gOwAIYTTmd}`2hUPqPxO^G@ACF*^oQBp?mg=duuNB
zx#NCd*6|S5WVhBupB4}a_f%IHsx?8`t+IEktSA-IYu7Qa;Dd%H7Cd(vzxBC7^&bpA
zaFs?$_FiuBuIe$O+(#3u>iy1&m0m%KkAfHE+X#8Tq`cvmm_UKqgDV`jYU(dQ;(DG&
zg=`Y%Eww_>^~_dizZsc@+oUwrEt5~6&+Y_vwtnx|pKo>0Ls)fu+q>2M-e}bcxI|pM
zKwOwq{<pT4X5WlW3{|Z?3jV7deKCvT%&aJqN|*KEg8=+FiqVZ{>9e^>{Vlf~+E!ug
zc)FDC<=xB$Sk5FX^z)K(*V5FK8|9Iuas>6}X2txKayH#KiYs<E=5`_{7F2iNSBX8y
zT&F>%l)K#j54h)8|8T1;Q7n1$*s3gGeqLpdepYllDbCBkWw8svBs{N~LBfaJjZw8?
zzZ}3pH%ep~w0!JR=;sDv8Iw6T`yPQQM0oondeXOHR6j1LS}E5nWc0&$rBM{P+k^0L
zqqC^aeMiB6OFu5>qg|&?T66XQ-mQ%JK#Yz&$kNpLa>LHR<IGi79ZQhXpvY%X%+gV_
zJ3w_O?RVb{2yvv}Xr~UKafmsjcB8!#WNji|r~?opVx=V#w>HzUQ%ufP%lb+m>KXG{
zgY8V@slJ<FBvYMWs)Myxfg+Xrt(rPt%zp_DCk(&|O$W#l9bCVt9<?a?|Mw?#`9)Cn
zmhVdVk};W1noB$B&xA4sCsTXZrAtiYjlD#YCS9I1)IU|TWS2Apwlk5217Jn}i-h=(
zlwojUj*>IMqeyAl0{2Zv(CicEzz0k7<L+^VQ%A$SomJM2z&~$$J4=+WQvA|D-qPGw
zUy3`GMoxIhUo$=-`g6NZ$awp|dD0$2_#v6`|C=0#(W|ih!Kq$E!T+Y<lB@q5{uo1E
z`=85GvAm_wb^UTDO_*jlP1)q=MBuRKlA-E7(ecNSd%B(y|B}=vJb#$@(6@aH-n`L&
zHHr@AKiv?i_fgbxw`(&0(SL4Tg*bP3zGAoTN6y<ww+1%1KFRu*v<dH!X^ZUcg^!qN
z-fZU*iqvB*n@zTyvhI?$$^TT25VasFEPi-#7=7y>UL$OCHZ_quk_hl}^IcWWeTECN
zMZX>3F;^D^ocvh?qp8^@|Cf_Oa3GjgH0%x6L=YF93L)XU=pU(N6sYCTyN8^Y<>d{{
zKkE38xxG8xh}&1O@`{f|+gTG5ch!TNGhrm&>qGCv@19Kvew#o!!ifS^f!wQTTERf|
zbd_<3g;nQPzL9h3Q#j+TaVa4e>VNG5C*F%zEX<BfeVpnhiBLPc*%DOMNH8V9g7k7U
zM*bxUEih1ytI47{d<k(P;|~omivkuD9M}vQamI%St-5@cSI?_gQ>Y7sF9HQuNjV}l
zC-q4vqb-b493{5=sGfEEje`pCT*ZGS5)Mc-yeot8>u%O~j~9#EyCQ$o*AO=wCZn7B
zM%Lx&m&W^_bVbI?%WmP@gE9>d;4P82ze8jEvU~-{+1e1qFBRs;+*1osbJjEn>aTs9
zQE?lq1qQfXqBX)1dJLqm@fLdV#*%Wv$Bj*%KF}ZH30zgIVt<Q^w@s736)(!94e=}O
zS?iAKy7~XkJ^;%Z%7~vz#O?SkwfG~*t*H}fV~Uu=6a&7dxmMmOFQspGG2yA!WUZfa
z_1MzX@w5Fc>oW@j&lI(H#;(?Mk*05LlKI=8NCVo}lNXqQlaf^2N328vzqBB~@=es4
z4!!O@&Ts7cKFAc&U}1&TueG9Ub4}ay4%j<Sda`~}#d!L}VA90e!DW<jw!HBM*9t)p
z4CO$gRkMx)?oVl=2u`S11}sV|G?x4af9aad=!*vyrGlt;Qzd?+a7I=2x~(csNX7)9
zfAqL{&}|+*4OYMK_`WU_s7ApbJ&jN_fGWIIYTA?>#ensnWby7h5q2;?+~c~p_Vk08
zY~+{omGYP7l0z~f^w3C_J|C@>zXYe?R_?v&e$CPETTeKPKIg{%ygK>awx&{f30yV7
zU2H4(B~1J&>9D``sT8g_V!XrSxzv{i>**5#L5mY_tO>{XY;vavu{l5*fl_!~tLMmB
zX|pmpd?Df&YXOKO4vQ8`GD8I~movh|={a364P*|?g&{?N!_9cVpa9>vk%L(+at{2G
z>N%T+S>;YK8*RzY%a9>@a4JR{4X4d|=X7{8{t`$oLIS?Vb*C&Gd@IU&k)tCgw>nXh
zUZgXF(4!CC>|+aDaBfp~%MKv0Zk!uWyB@h)?cm&>Y;KW9*eCjafB0tV*ii>zD6ANb
zx!KzNn@0eO0uYkC=Sre2djY5_62=Z+&N;K7K*9T6j5-YtY)I>X{RJNL#bn}NaS+^>
zcGae_bjJoKFF*?E-Tz(_Nccjvx=m8dWL5=V`A?w$Y|8Yb>PjuRY)kWHEHiP*Ji$wO
zbr%rNOzBFTs-F+(Vmc2_=z>+t3)Bprj+L_~HO;ZydV@Krg<AsWP*NA*wYM0-U%Rjf
z-GYnv0WD9)NXP!Omj&@z>3@w9yHOXRfQDGl6uN6*Tf-u^+~(Nu2Y+MY8D|;$L+zr%
zEtWbSz*#%II~w2D_iXC0G=r#-6q*cOf|260e^*$;Pz^{1s#&M|UW*s45)McCj$uPa
zBW1_)<5x8Vhxh=aKiP{mKG8ng5LM0}09Is<b*XA9oygeVY&a7v!|HZmcik-?Lu;&S
z=`*;PCE`?p{<+p6-mkSY$@p|HKxIFnBU^0-5t4l+Y_?nn+JP84Ggmja*Dm?$0h>9!
zk9)^<)Y4FCD_U~*rxMK5_Wnw-PPAoGs}*~P<B1fLrr~h0Gia~0+x#{%KPZv*8cV<O
zz{f6kUESJgJMS@$sKORNaEvDj=-OgL9Xu>^M&u-L^3Kga8s+$FbLbV%0aF#pLiBgt
z5}EP8Z069iSPw-pwz3s6=5~|^sOrO`znU0Dq#OsZLdkF38A}!cu_@Kxe8>g%Nsvu)
z?;2?u3CP&fE{mIEidr*ubepg$;T;`~fCEr2_IekCWXkCL5*oVuAxqzr@*y+$c+nAg
zFYw0-P05-(`qBA$lRSpd9o{gN({%)a3brSaSalj)Wxdd@XT9N4S{GVShgEcM)6Yx)
zzvu#pC1%XT`-7Zl&}wYpirlcP7y{|-;+<{m8C8=&a?pMcQq6t*RqQjTnhOScplH*U
z7``6t+}z-RF0#QK?wh-}D=b-86NZ$PyGM5o{56E2%M9JD3Vw~G=rM;vpP1i83Gg*`
zhDv@DPp_}KaV`v{*6+H}LPo3kr*r0a6^uEKVZw6HgB<5n_L9)P&1R}sx><ko+}bFR
z&g~PbIegpa<9*+&$3v=b18#N{h~tL4=KUgnXSb71pt3e(>&}L%X;ygoJ$dPz+^*d*
zlL%Y*_X)3_ZEs%uxMAmiia2x_Na=7&W9si|YUr1XmuOiow|u!6gOO15rXEQHuyvdt
z3cQI$crY1lz!YW470hg)VW4%6vaqIi#UV*&bn5xfFPX`+y9UZ}XIt7gI1zVaQfi(s
zoBuATL4c^IZ<!2>sCRivU~#-+6DuQ5*COl?g@6pE87=HPC+-R@{F2K~W&*T3OecuU
z4n*ytwNmg7FV&*hF&=9>*1aH?ZH9=6lac@!QGT5}D9GXO+es2rU)?dZ%E-&m)#1N&
z584LW5+S8~H}4CcG$6pu2wDq-M;=GH?RkM(5#ljJcGGJoO#@;%Npe%c%`YNNZ9A6N
zwKb}xM=N})1Bhq)pCk@shri!xmkr*>2EVl86nt$B|7%7`o{>2IMI0BX-R$u;=V{i0
zO$VL!hfOz$f>8#AxBpP-mLAcre<2J!78jc+c(dBd#R*h`>i*^(3CVe`P5#xd?=E59
z+G2~Tr~L*#^UdgJ4Br64{Qf^$p=h|a&pqY{CLT-LndcTF^{=?uQe=o7T-w|7`3!GZ
z)2<i|LxnYco3k)C|AhqitsWm3$4G8wJ5-wPuo^zz`Kf_#;;l9@r@5WxcJ}gF3TDsw
z`fxE!Qlvf|KIcEZ8LFpJ*)O1cf@e>f2x4#W|1<6{_3X5Z<Nv%TfxZ--)^AY|<`nT!
z5BVp<h|sy8a6=IxeDS~3x;Cp)p6h`_*OE2eXnk{@rHq1-UDR;BOpM}o@mj8UzeaCa
z{rcw@^W~rc2p;xS)X@e^_b<4duj}1iocl}JjMDFm&ypKdqtjr;%`ncn@z_opm7Vno
z9ntWYOqZ+nt7tv|$DX!AWpX%GwU?IaYBldpgcEvF%<x^3(PJ)!3}Ct(wiNHvBkEMF
znE({2=~m&!4NSPZGiH>r@o{B<M$K1_2>r;Q+3}N@KJ8<$E7>0P-jC)bT1M`>RwjY2
zSRqKulXaGr6&Qh5+&_tzr^jk~>0Bwi<!}VgpDYT#=P%?~2oU8xT92-!(Axp6DD_t3
z0Vyqfy{WK7_oJ>!4~b0>Upv>^TO5Ec-iEMxmwH76(OmEaAs$9u<wB?BJ&T|-v9(Ag
z-ubuU#Uf<HDI9jWa=svt`&CMfGTpQ;PxR_GrmZn&|AUhu^C-$z3k5+j90T1v3e~u|
ze=SVCRpxRk(Zrwrm$-Y9%k<bDuqrlM8UlLOa#j|19bJaoOdjx|!#&}+?H}VsmdVpt
zfNbBKS5?xPIWKC#yj}aDCc{w6a5S_4?jQY6D-IU`;B75u;PcZd-B^TOqaBMWaK2~E
z?dW|b{s0gpy&Er^nK(a}_(QRKKlCTp6Ft`_B#j+6xvL7F&BU)Y{a0I3D?@cnu;X`|
zxUHoM5L@8Ul2E21Ccx{d&E>q2;qJ}26eU;j5;3bE!Rb_ejULPKfFx76Mo0&9?QcZ>
z>un;q6N$@agbK_bi3Z&kLRGFVXim^BG2Ec+*fM5wEld5odhDR33g7|WPw~ZU;!#71
zew(Jho3sGj63`CQsqcE@Iy(6kk>J|^^g-1JQ<Opl(l=4*&WvMS*2zqdw3lRmhAvEL
zj*d+e;Z3Z%&wS7)nmyM^Lr;&}&C-~nGKG$jDwX|hVd>Zo>;cf!)Ntr_?=uvhs&+`(
zobw=S-iDv0e7@Hil(h`N3_<)Gh%jEk_Gg;&B+wW%j3zL%X&8s~U+Dy3@tspxtVDQQ
zv7yaE?DXhL48a}%0-M{AFXR<=n?EZcMX>4M)7k;Qvsanj#ZR$Ts4><?i&~)FENRwU
z5SMgf8>25*2Ubt<4}^nPHQvC?XPxB)7BAXZcARL?q%^xS(J7!`Znh=UqsMdO^%hFb
zT$U&elmknoXMNDTP>pK^Mqvx90ISPIxpmU)vEhe}D)u9hD~s?x-;zSG%vM@3>oj*W
zCE^afP^mP=r?-Vwn#@7(?c6N-XRFR@BMg71_m%&YUpjwM%My0Ox01gi@a6l8<N8Y9
z2J)OX(`J=cEgff%JF}`JM;X)B1HTOjJ&p7w{@`b8+MZ8_U`kB*7Z&aSVoRn(tYaN9
z^JE#!OxV4|s)9yq`9fAe2h(ouLJ;5jon(hqiqb~O)q$J>IcH1{qG{Q<P0X36eiQpZ
zK{2`ZY8GdB0!{WyKrih{ha4$6OeB>O^ak!h0#NYHjX>EN9{^OoZHge_@9~==JL$Z=
z*AbMk2B6{Uq%fu<2p9EfCWL7KCIQI!EPXwrFx$9`Xpqi55pN3wbnk8m_pxGiBQ01+
zL|%D&N$g}RwAdZ4xcMspXcnkIkBP%pK*8Y^gS-c6C4G5}dGgi`OLF`4!D^ZSNaam_
z;SUf{zBVh5Uh7lib*fcU1GVe5G)O?xlr7Na!BB@vhhQuu^uK1Wul}iX7JBtZKN#+I
zqf`d0of=?8KD2_EX`Sd&*fcut9kw*c*bU@Qj-|ce4JhF$(3vCt9MkexRU)%ou;i!w
zrU{^D@~>=#o~7;&cC0pZ7jWW^&J)vA4&^u@Q~_f9F?N%c-pxoIAnCag{vhr9?+-ee
zH-E&!1}PN%15=3)H(5Gg*|R$y%M}G0nf9;xu&DsVK5t;At1IMs8WG5<%P13Ou+XI=
zm{8NQB8l-qmX}xOejno5$7=<kSL;VFw%PQh)97a20<w|5l2Lzs!zDC%wB7r(nTrb(
zxivqFYwmM#&9Yd_rRcK;EXvpUuv+PtP0VH$i7MWijpSEgK%q#a8+||1vHgeD+HAyF
z?xsa3Y#fmWk+5mI2HpXID+5uW+i%W3yYZxg<?xe1^dC6+W}?}CO3LFdu^WcAciO*o
zXIYG0(W2sa5{%}ijHk(l15R>V0vD<}CEsZ5@<yu{%gia|v494Onhp}shS+!vecNSq
zhp;L0>ZEWd_yTOXwg7Y4u;n3jpo3^90C%g;4Oresw19A3_I*y?AHHq8!p(S*mkxAz
zISEmA7QC1_&MAAW$!3PtVcE!R@#viAK?~i<cTO1&8i!U%<Mx^TbNv$8oK<h2PAJHu
z?w8M*FDqcV#w>St9kLqTfgMP)<eJC=SebA%=%kGmA<FAS0jT2_mv36IT{0T7>)c_P
z@6Yr~>f2;(U7fiOTH1y~X74DsECPaDwU^$kzb_SsP9{;J+YpT)9`Ro`DzVHO2CsPf
z|4(Q5hTznG;D|?qk`T0i6$&-yLvHdowGA(CNU~V`s8{ZjBNz6SmFq(Px*+=|6EZg<
zc!O+ZMmOsKL`~ZcaA{{`ahV#3fotiI`_7z`evs&fofu=NmbMrVnk4v~LBw60oaGNm
zY1`V)npy>#!RndSqF!YboqEZPbi-{Iw>@1)OdJwKz#{1GuT=HxEd8mCdo+9T+Sy7#
z2Lm?4zRS<jK;(kQ&wOfSc?nbSliT{)hw1>}<$z$QcuIy6_leAQW(QrOU*K4_O#fiN
z*P-gzi4cpf<{_ES+o&<&JuuLnc75z!r^AX0<SbhUmYIB{#7wb|Rj1Fm%V-!3vuYH@
z`>lDG8za*|;1~3|x}X9G3-(Z^_NSeJafDT2DBv}*fY*dIOR!dSE-li9<j8$xv<XUj
z=2_y3DE`rjQU89zVAvoq?~1!s5b{s6c3y_%429zt2ggqcts0s^79+<AuL%f}fKH^R
z=ZG};=KZqzdgf~yFlD+@!~QPv=jZ+@2TgM0stY2+7xOn@{(NkDSx8ew1Zf)RoJ&(4
z<(S{~J?Y&|OcD{Ma~r^6Q>(enyP}45`vmaw%|Ejbdugu?G^D4y%fg6h>4LHy<obAy
zdx&&#_6+q7uHW+4MJug{I+r^?E$?6JEo(kxC#5lI-GJNPEl?EM3lF@c<^a-pdssX5
zX;3+eymav@?qTL-t9&SdAv*S2_})B|l=1*M(Q1>kGByUIXP<fBa;m}YCl#xv5FIPU
zZII*YD&JYD*sd<{_Z}YU(HkWyp$##)gkQw%h%)v|wfZ{${MF7v{nMqWMP2^~9{%Et
zPX!;&Q!4ov^gii||3&@&U+^S{?{?a89Z#9Sph@oOH3FkJGa~6I<My|hVT0QkV(Z@4
z`o1b~#{i(y0>Gzt3>5r!_M)yRd)zo@aM<|~e0sCu$Gc+ZZM@ri8fA~aW-9lTim2xQ
z2JUCwKc8DS@{GsDlTJDGa_43J<I24+D{$LX;EKk;l?L?6*aNJ|g@MmEqn7*<vYG=N
zJ@u9>f%_QQ?QS3J8f$wH6zsCsXp4_t5E90eN)$^?1b{n69=G<4fm?gXNK^w&_M5xB
zk0_zvm=2IpvO$;F-~RX}@LY0a1;>Ldy~ut)WFc_j&y(@qREAz}cXhUM&k=e{!xFAi
za&(0rc$$VK@vy|ED+rj)V;S_yX#FGLy2p=qBK4oue=UXK{El(a`_mhZ16SsJcg`nx
zQ(gy_(%q@=eeyA2fiK?mgglz!=A#E*{!CLeYa~b~d~yRtmVw_G=H%&kVjkH)r;8P<
zWpA0C+>!DS8I>kGzEka>04A1mhxaRPUioy6qP;7m2ZaK_IFt$01BNz1KCe#fbocq4
zvkz|USB%)7+++OlNZ#Jl#^lz?Ks0Jr<gk%-eHb4-u-NX`-2vuC+oAL)S5_dPt(3Iq
z1c_u=wly%u9DH_#UVpL+_)6u_2E<)`7h?sN8qI3pLXa%iGlOZCQ)@s(O-s7q&1J)G
zr{7e*W<$^?2AytXhe_}Agt}y;V~yqMhXnf4{43u_{eDkPrwF1sV-(<V`kuseiCNVR
z`co@ll^N<^rYf>k8wnZl0QSygD$Xolx%%rp4*go+-W7_RSHQH0w+f75wj<J~CPHy<
z)iw4m-WDh@LO*LTxWJC3+Ns$awO{^V0xS;B^qg>$(0gp_+3*zE+o^m(6knb+f-fEo
zKV3Rie#(ftK2ntYvz7me5a4)Fib#-n0aZkA5U}_MSJMcp#iOULL}V4i0y~A3QZu)>
z${xH0`}PI6mnC~~22Okie2=X}gt3+=RJH$nq^@8PO-5)&R3vb>HMHT)UHvgbzy<Ad
zQWb#LBXf!aoNDf##(}MBu(5(Op(|TmKjNtX;40dm8uXQFjBcN<tNJX3LB{j#ype8w
zRe1A40kDUB@ON`kMc$f<&^ITWomp%}2zzscUI{2d^U+hV-^W>8EE>~Pq<o716*c-I
zlM;oAcq15?aPh6gKn6{=Ue)^%VXXk=wX1?Bw@4>mN0m5N?nU&3C{KO<=+Jos81*7(
zYB9snE9}%hC8#?^Ra!G;Vg3X+o<MhgEDsRkMAd!X1g=7_?Uj)Tb@~;HfTGos)^a1s
zlTA~urlT-qvD^OG)6Ih%wB;&q$XqK@^5;?4rMTY`#;;3ZFtj3G)TA}TX?9{YtshMu
zFGn<skbsB0Gu2Z3`SQmZuC9CWxhvT-9oGvVM2kZEI^S=e#^q&(6<3#pn9A4efxRm<
zIS<}lR=1SrQvWElqGb|i)1J-47PfRYp}+EE7Q+;Ri=3KATy*!2Uia*a30^Jbl4T9s
zjO8k9{2=Y7^0940f7aO4a!c(o5SB0jM*^K&RwV(^1L>7i>XN_zpK&k!0DG48_;osP
zoIi<_-b#H{AR657VG_Q*(V7|8L%sRWbZ@_IMw+KL*C65yPeLbIWY|8tXNdjyg=a`l
znPA$7QrcJsYW4V!5x%s~a<5i9%i9t*tx~_=1q|`w?;Gx35@eAkWaVD(igJ-)6wmn;
zkAH@~za45qsL)3--O*6(KXGFA`)v>PyNwmBwxx><^$QrOn7=NoM}}~zkEpa<T=^<e
z)UT!JI1+yM<aYIyHrJ5$l2Q?9AlEsH`tikOz8i!`iT^xZVbrzQ{tRYCxN<sYc#YQ4
zizEypP@1fn0X8b3E#<nec47^CGq%Hd78lI&J4oqc95Y}GEdgsn+>?#!y^<c|FJ%=I
z=ntMI!)lg_T<Q<EJ6#lecuYO?YroA$1PS6!PB>|bv?-s2kA&2bEG<P1UaJ^tM<cD!
z33bg-F7>2FY&3u6b$wR2lcM9$+U7IXXb8~8uk69<W7OC(^0z)dP5$YUVUe3YlYyt5
z(bG^6rzl4}*HYu=j6)UBwkB@u6!o;qn60_OL>q9jc)<A3tEk;t9vv$#^C?`ExY!t-
zzCxP*^=wZy$}lcZuW9*(M#l_x2XGN3YoaN1)w;+1#EF1iL(y(Yl0*6QN@l9}LPl2g
z^@?l7SHN5ouR46xbipHnB=f8iIgfS6ulD@lXEW}0I|;MLw+zoec9{?70*>f=l^=Rk
zrz{`fyfvM>z=m3t6BD+{;DIw&7ar$ID+O}<k=t=;Bk4++NsdD^d%Nl02@8B|Sqeu!
zu&v&1;rb;u|3k0J;+n;nR|^p-BOiqWNp?G5C`jNG6FI?juaU&H&tN=S@p!wt$5*!x
zFtd{91AnyV*q3u9jqXy1`5I}GOJ>9A^&A(0!=FZkt&<%nar~n%@f<@Z?3;VQ3&ULQ
z*>hZ7Icn+74HeQ{K@-mt-uFnYmlO{5Yen-9f0umL#|k5V4tF~E!erIYn9ywQP^gh2
z<je+N@i7#%Z0bxgktfSj2v@pF%^irxZ|6;HEek7s!L&a2ubp~nU9~OE8gJ^2OR56q
zk;iay{x(_c*vQ&tzSw>_;!+%7E!n;(4iOuzmvs$qi0<$KYGGogf}c&FK%GZ6i#J;)
zU>2YZi`Ni-tXTW}M^o|f)kA)xa-0_=fRM_Y7}H{0S6b@nz0U4nJ&jv66H;1(--}<*
z#cuDUlv#IYcxjHhRH#&~dmr81`BX*XT19E+;emsuV@f5kNTFCr44S!%G=@16+`6d6
zZcIG#paC`pD5yi6CAbWw>TXHka^v3i3AG%x5h{yZvdo0C($QqAD%Rdcul1E$>*`qg
z*}(kfwHh7s-eAXiGqU)!ByseGU!g9wk`m{n!|`<0-}FW|<-V66R^n!1e1--3Nv+;M
ztVF%h-(=V4HJG;Et{E{e&a4K~?G^nWQ@8JjaEh|30K|X4&)$Y(J6a03cvX4Q#6D)N
zA>8CqyGH1%kM<G>By6XSQ+rME7+247j9VV=TTfE5B`Xl-&-UzVw#+v>M%A1L6G@q~
zQNplAx11?d<fiZS$tR*XxFK>a*st_<iZmDl@=_G<R-;(O8oMM<lfMY|D1u>ICbgtL
z{OalU$i66?d$QfSP1_Bmwj(Ah5Z43rLQA>N4#jn|Wi}U#9ffQx)aG<He^HB9RXRRs
znUZbyo}Nj9?Go#z_p|-7>T{ZfKr5{WhgmI8g#xkY?y7aKiP6qC{UFCC!K#qVrZP(<
zx%G-SlhN?qN>bpS4f)W=#az2cqPu0QQUgSb!Uu`AEm*k-!*&is#)F<^;WMRv$}vkX
z53fy^ndyu=6h1_91EyT>zNEizXzILOI|87*ipH50*P@YhAhfL#7-CMeKzP3!SVQ*d
z-gYT+-@eWauh>$h2-{Dj66N{XGiH<AXVChUgQDTsZ+pcs&d9ax+rZMgNI%@{2z)Kd
zKI109$B?7wn?_%}Wt?@NM4OTcXHy`-XcyoI14LbqV9EpGu$esCzEum9XmtQ6tLIp0
z{!>fuY7q9IMA*!;`;)_izo7Uc>G=M+paHOg(U%2lv#F`_%UN&67<2+D$7Z0`P=(-g
zTT9Gmt#y!_2DP7F!%P{a?=F@Tl?pY)GaHMV1*HuV6E4m7)r<bb<6N45JdtzF`=d0n
znn^eGKvqvEJ(1bZIDD2{TCoFM#O62`aJB}UFqS=ty9(rPDUsK`(fVLR1)FWvB76yJ
ztt{G}zesVZCJ5fG7O5Kd(epy*i~a3pkShG4kwsZOS+_27{Q7P1fvwSJMsBOI7H__?
zKG`ZoU&YnYmUMn%Fi7I_h~~_i>|3GwpRY@o6sLHkaQ!WZFQCUwYc&btP1RS2#IzWt
zFELZEZb(_uKYPGmp#z$O-D;2BPp5Oi@S4edo}T)tbnY;!-%MNa-mT~qPtrH<x)-Tq
z?^W5>M(4U-;j6f@%tO7alB0&xHT6{r``M8UWG4qnT#gQCqQ&gRfJ$YGqR(hjhA6a{
ze}U(F0)9p(8Y#J*V-AZo^=elcw=s5_kDmKE5Tc*^XHEXYEK?vm9s<A~weqCsd`W=w
z_Ol}T5~1peA$vPyB~1o!kYBUY27Y$H>4Ftj%LUH*3R*yUR-<pVAw|=p%uw2qn+;?i
z_)PoK1Aj-e_n?BLj6@0}cXiO_K9AN?*fX}<F0|>8S^JpOjy3YOjMVIFojzIX)}`a!
zitgmGoTr&n*sr0!kPWnyWybEVRUBPgqmu;&q<swC#T@!Z+ANF_m)U@8e-GN{B<%zP
z6bAhc0@@g*jP-I=SbTQ>C|%0h;tW=%%!;lK`n8n4o9F$-6Jj!zGiB~E4%!i^5(<i#
zpV!z!o!Dh>hi%@{Edk-)gGeO>KY~b^LMhU`vUm1I$FS9VMsE^ezEAr=f!hi6RX|_=
zpkRCTKG3wkpKN7Rzd&LuZzDJZJ4e}$BT16yA@QE~pb*we;*4eV4tY=YJy!LZ^4(Y0
z>)cm=zIZkE9z+dkNe!M!vBE%=udwL}1fHEklYi*N2qD=PCQdwxWG9Ngt$Fd|GyikJ
zaf`t(O$Z4XnVC)*v$B#Wdzt_ONCV$Zs0)p$Xas)!MgQ1hyL99!Pd}~s<VOcP82kOO
zU)uu8I12-H^`x+Qbo?Q7EITTv1VyI`jIvWJm_>i|6e_mC3~cPLtt7{hHu@|f0iWLQ
zH;$dvXO4B$`S@2~fY1CDXWg5!_Ki-wnYtCMEQ;-8Gl+x)?QZ>}0`4s410S~cJ0M$^
zIwA7pj8A<2=b<x^Lg!NPVV*x;_7T1!#4Dl$h4bH$mKY|>m(tUtcKvq9n~tk!@&GT3
zVSrL}y9*vozN)cgRgoY>i=@w<y?elD%n|%)5oXJR>p3T_tCHa{c`vUYC>uGzq6xUr
z1Zs@Wym_jBK3cjY-<ZYzMMuzw)L#pQejAU&_~a%&3h4OJ-n-=Stz&E86>U3H?ayHH
zTtS8cj~r~!u1AgI{l`=Yszf>!Zfwq)4tcQi6My9fa&@EeqshjQ_?Hpvc5r&B`G%uf
zFQv7|$L58(bW`9y0a#06sKw0X&~wV+fn%7`y<0}5hqa4~R?Q$Dm)9_`HZV)+1}C-7
zP3wTbAKyfk1XBEl)@O0CY^+br@R{9sS82A6Hxi7!`evTob*wpqyW7fDcyxnHC$;gO
z-Fr6AHTH3GicTp5cfNjHW*Zgb!Ve;}ZrZ}Z+flwDbj(R7NwEn4Zx8e<CMNAaj?n5O
zYxbl8bC}yp4%$Z%CX_?zPR{BP2h2d&LNsKl`Y&-eOl|~*kLkY4lOAyrXFzP}os~ar
zW{|kfF|*jUp`r%$&_{41F^k^iB_~Tr^fGaopJA;<bz|B61B^m4gX*xBIU=w>YJGEv
z4`Aj3*OZQqIP1*h6Xy^6r}Tu1cDT;4KnRbQU|;9^ABD_J`E`nJcK6%fUg&4Sq3Zhg
zAIu=K#OAn2Y5M-7sUDl-8-6(g65klCdUqAS^zcS6w(pC61E|!p+3w3tPI$@&{Fy!y
zv-Z&wEkHq+v($7a>`71fw@*zSZ=qjIKPcP+1<_cV1rIzLt&^Qtd6<9;I-Xu(DoWMI
z8LT_kAZG7e7?3@|U?mn9yRhxT8}Gkx1|F&kCyiNkBwjyX@~GnS0~#U|UU+!zfx~B$
z!9r1HU12F)s0sPo8BW<8v5)HYI<jwqt2{X<_fRQmUCuFR&;+!f#ne!~+A5A=sk30X
zBdhxuIdg6`xjUkO(>8`xp|?+DdunkP2nS;BYl%!5-pq?JZF8dPNDF2UB(r!-1<Bf?
z+kA(1gUK!76P5ZwgEEY;?pLaUzz5N}@BNK{eclb9?VQRT1k(c8I_MRT^aQ2pZtO_P
z2}|>Jg)YJSJF`uVYilD-wxBwYXnqu{UwUXO8*zUnh|OFc5z2cl<DPJ%%}4PfZzZoL
z@vu!<$3DMu4ELCJ(GQae#Fx)I`<FXA)N&hklh+<a;?T118MiH&T?b8M&K<?1_=$eL
zA|6{gBS<|JyM`-Ob%E@E-}N`#RTzoic=d`muM_8D6?*vfl}AShO@i|L7;jXF6|TNs
z-SU4`_uWBFwcERPR6uD`q$y}<(wm`LDAEN4q}R|1LMYM|R60m+0Td9WBRx_gDxCx|
z(g`R8LPruHgaEml?>lF{GxvA?{mtAvlVQkAl6UR5thJtJt^Mo}B=TwHQ$kOe(rwd1
zG*g_TlWpaEbn5o3bttoj?db%$Rp0titBV)I=pMV|aW@6Lf`42>bY7$g4^|!HPb~nq
zoXB29RVCGbn#5K8rnBs2pDXzw;rCkwWT0ah-oil7C&^>f!{pd4U+(Cn(}m*B4-b+W
z4t(ArcKCAp&0g%c>A%qF3kC<VqO6f%QyKtt5`@+xya3=9SXnJgZXkWFWbKn@yf-L%
zjP@*aN-AEP87MFx(k>a?p)ph#=oa1M*7`f8S3A9APcVGwx?bQw;E>@^JW$nBX*>)w
zov#dNy!vob<Gf^68UQeJjZ>KJUuCSCPjI;3p_uR5r!J5$WP>b@17PP30CV48thX4z
zfex%NPG6`4E7EEBScAH&_Kl?`^353lgh>T{!#%bjY&CWb?lzB=h%PCYkYvaY?2Etb
zqMe_04C4WW)o^O?4fyBEoeDP)@0A7G3aT9WhD!FIzS6!IfkX1l_Xc&2ucthA*pmGm
zEUP`hw@jGM1PmlgR?&tELEWoE@&B4(i#ri%frCojILxT3*NzTQ%?WcN_@!@|hUB!t
zN_ATRE(O3|scglI5?dIKw4P6yTW8s9zM~5I_cqzAz>orM6Svr7P|tKP=Dd-i#Arya
z(k4YNEwKBbex$H+rtVYc3QI+Pm1!?vm0tkU{B?gD$y>OQW`jUCp#<5PRs-*>Qr|x*
z{PiNxjs}O@z35wiW5N8)n|dlM7t(elr26MgRX^_YroAr#2(^@v6w|!NJ1|RtN0UB2
z$AiDX&IhqtVCMbTqtATevl=tAAGrV-xxj)^n8SIEZXsU9Ef)DsCLv+nTi;sd1qy>A
zIqMIjY7|EqqStT!Js43Y-onet|Mdvjl$57*h~qUy)(2x(G%7H<Dmg=T#riq-V!$^5
ztl-d%=GWlOOqYJnuqn9{M@)rOz0;{~{f>Igk)_iZ&*}|2qp(w%R{y)z6aPvEjwBsG
zork|Bfp6r6V{Sr=N}np0Jw8~R2hi%$AG^=;wF;kc(vTU*PE()p!y|{Cf)Qm<VN<wj
zTHv(RcQ2tA3{?R9kCPEtgy-DI>5>v7Ik4)eW`w3Z`1l55v)C3OS-r<EuJQ{6G;Y5%
zmv^JbQ>Gm9P}X!g?Zmz@7toY;WBhFBF|y`9rQiSsc$O!xZnTZzN-49t(9f#1x%Pz7
zt^0ZfufE6u=_~YjHbo`zcpIr<jxSJbD+HjA{mA{lUr3ke-fHN*H}lI$q^hn+Iz621
zA84H++JT#cnp1D?x1yupf(6ziS%GFIg|uwLw4~h7&FEwmnW<n(RUIVD7PLB2@qU1D
z_Sj8|fU6d~Lw9}a+8-5aBdq~&sA*6!QW+pcX4%Qg4ZVF2gd?~1S$59K2GQ=_NV{zQ
z;8VYY20;BW9fL)n!Ex`!$6^t`gK<ZZLBI@~9RI?a*40P79vVb;AgP*=6yRO3abykI
zUP{3d_qaE9rffShs~Tr=kIT+QZ7Ksj1=8J#EQImO=(?_bQiTMbZbIH4P>qsAdjyjz
z#_DMb+Iy72w|%C4CaU=Z(4*X+np(H)eG7pUS$Qg~(ApY(dzM1nRLW_K{wX&n&kLMt
zwbqa6BhDc2Lr@p)QW`!q!kh4k46DFbRm+nFg%U2Pd~?4p2hf{{d?5X}0PFZErSpkV
z$zh9U1}@EsgYpNIjqkSU2q?hx;`()XmGZE=-}SBxTvp!y<+;;HxKucJtFPnB!aOKn
z!JxG9^*l=N;v%J?zQ>7QbG^_@IyBpZZ1d#p*=fvKsFmwk-mw1RV+2=-db(0wUUtWb
z67%;SsM6p~8+xVH)1OvV-4Of*C+%=FfV<Ba?n<~2B-W%b>fSQaZ)%+g<PJ}c2@?y9
z1)zi?<Z_FnvnxK+d-NDsO)Cs?^_9proN~OgoE>u=GEqQT-_W>pfz<ioI(HWwGkG{Y
zW$?-Hp#&C4f}|BU8XW7|cW1H6+>c#)CpSRRn~uwnEc7d=%s0J77;Y=t#l==)rt#TO
z=<@Tm!@`c7wq_Yp%$|uU|8+Im$K&o^71mWr@`z67ZRYM1)b^2x?CBU)H~);0GEiZH
zOZ$%|fKWF0E17XeXK~h6uEsp5cyenrKXeN$=C!btmgJ|Au3y-2F$C5zd!64(p!*50
zJT}m}A`eEWou9wopzB!4`1QVit9u+~kJycn;Ys2wZlB-yiZwoMq|<WJ-j}C?1}c|b
z5)H|Q64ob@NG-n$JnATg2!XtvVPfCb18$vf3gw#jVn7cRuP2AtR(v!y%L`rGG^jdq
zCzwlcQ1i?sYw8uS?tqW|=`{ERpoM-OA2I-e=&qj2qpNxx?MK&7j+hsf>j&<`3+p1-
zPkH53Nj-f{DKr<5Y~hU{LW9Vc4U~$+U`gOb_WNWvykm;lCdNNt+$+T88)KP#+mH~m
zzOP2!TN=qf01fgrC%DlsP%I-{6UYD`aN2_g>_NP!7w}EXN}JmKvXO|K(4Wi+x61_C
zVckbFJ-DwdVo-p0dvx<}9z=DY5C#3lKqzq&QgNAln$;Bf5ds$N^K)$@j1JT(@^pX|
ze|-<Lu8<buxkX*0T&@QiY~fYNEe3h_0j>AEg`%}~vln3<Lw7@3KYp5bOexuALXrVK
zOftE)ZKM<=k8#vhnZJ3T_Q!8R7KpkY;jxoDf;Ba+N|Gcm33^;jq=>#?@IPMA6m;j_
zxDPmA$<2JXxaN(3P1ra8i(PioO}jwqD<<y9wPD!9GnWYvmd06WsO+h>9*SznKot>f
zU^8$@VeZ7(sUga=(!2s`64~=Y=zZ&N_P?#!0aZV^H@M*U@v3X%%Rb`=uBeXbmAe(q
zx@lWSRee2&;oorFl3F_UMm$ZNcEH3mfd8SbnTnqK$s@LSoC!XyeZRXOJJxbwC}SWh
z>Zlu+wx;ClUp10rN|V*ZZ}O7F2>4SwAlPXc`Ts>us&Rft;Y#x)+euAVy$I%_-qPgT
zGpbVLB@oqPCvl|0l$4`%Wq~ElKoW`Qo4tRZ?jlgHx-Q9+dc%b6FPj@q%>GF9agOA2
zu2UKUekm!V(EP9yhm;152F>E~MZ6nwU1j^-X)64yG9>`Kw<z=<qrd3u`DN3GbK|4q
z?`5Ac)`fnuZP@>Ktb|Fq-ouGOJ)TJjh+I~g#4&G5&|T-2J|=(f{m1tg%~|U$Mr0OX
zg|4y_xK|XRGz0j3?LKODD4lm{QXypLSzk1x(iHCgC8?;JqS%C!00z4!qw^1suNHkO
zYOprqt1c9t^E~zXNy+J<oHh)u8I(*ctPFSX{Ti0sr)VM7BtNz8Nr;7=yXs%c0?eTH
zzbSYi8I++@P`D%$RIx2labLGU|6cBS@MOYp;zwr<Z9;P__t&$j4)@6Grdbw=@_M4v
z_w|knj*0qKU^6{Fc9Vl89C1Da^IrE7Rd$v<H&5?Aq1yUSEkOGzPr0>L9{iOJHC&C)
zEww)xD2=dJ8Cc;<YxY$2Aow7VANK<*s3M2dkA0|JZd0?IX*e&0z^L=eoW9AT8(>+1
z5?<_9Oqa}#u6eS=q^C5Hjx5xL6uRa<`|yCxT3VJfVy1;a1r#`B%r#RLQa_vqvZV{e
zoM|V@Hsh;P-&_qO^uC^)0m>SOh>;rIqxPc-oHON0MUT$vaG5W>uh}J?$+?!xb!6;#
zEFydC)`wES2la3;JMSc9g>A<N)@7-NicyaFY>15%{zu!jl{;1`i}t~J)8}hkeA7^C
zsnI!Yko{9>c`l&VEfA-pp2u!P){)EOJDJLlEJ&f4pIWJmdaul@0DJ`OT!6t@1Oz&4
z`{j6>+(sKc%-$a-0|4b@yi2Z84H(dh=V-!aZcqUU0&KUXXZo6E>Y}qD&0L&J*4Nt5
z(|RgrDH0V?y3nM)`fjILyIdm;i65%6dKd&1g9A~De@KB4_E;@bm+XVET^T)4$eTWa
zq2pn{alK|wb_Z^HnOdv$ZyH|Ybl#+{iIVo4qM`kOeqOG17{!nDDHn{~o20Fytx-{(
zNWOtW6<q>KRrJdc-><E4ee{0<K=R)-bEh9uSQve)B!_CUr*(^|E(FKJ>h1l;9}0n4
zO>eK{k@nuYX>)Km%l5j$y?{wAx(>MI+_*t9HW^!>Ohu<HjFmLr8b#X5ZGE}0;V%ml
z)8aP#`MiZseGpit#O$-a*&{q4`Oli!(+tHPdXNp-{Yjk0zBp)h2&r9z6c%-v3}{01
z;WmF8tFgax${sN-b(^(L0O~<NmHc@9_+Uq!W8N+R$v<!L-OK{gRcSD*BG$NYobm(u
zVQx@X`LykfJIP2w?dpldr-XS$<wBL^hBYAf$=Bd&$*MSQDy;|2bW7}3;y1eUc$fEX
zj(3%FnC=d$v=7efAq=Zwl-9i@L-tGqhJ3N)bi?^wt^@4oxLjfh!^+l~oV(nATq4^2
z4kJMdLrLMswW2<_j{oYwY?y@{dZ}!TS2WMXudfl)e{TPqPY;+F3Zz{1$I;xanX%Q0
z6Wa6~6TDhu%yX0`pm>mE2r#_^3l-HJ+Chf^_1tuzqM4vl>4_dfhStqyX8y4U4bAgB
zX37@;d<)>6vcEBy7<-~^L#Z63{Y)CDU=8uO0s|*rz!(0u8Gr{`Z9Tdn)3dZJ&r>1M
zyR_e0BbMhR6EG(Pk&fTxLAXpT{V*Trw3B&?8WHYBZ4MV}PTa<H-nNo+9mx=B>I$Wh
zl!4@9x<g|5==KukSF%qxCNN9&YhEETHFPxdgJ0<Jv@lfZJ1CU{;=_eUgec&~0akR=
zOnAi_S`*q=4Un@gP5WZs<rS9`=CKMRgJgjD%3c3N((enuS~V82ZRDVhe<a>U@^f%@
zJg#9~S(F?7xr{f>#`1B8;(F^0>D67Z-@g|f*VS}E>a<cH=^Ft(yc?^4Jd}1Hg^~KC
zP?JZ#d~Uqd#>a+SZv~L4R|*e5nqM{vwJi*z+A}31^xkc19-CP-0)Tp1dl^Yvy27Oa
zo}k}Ho_k_b;Ivo6pvDCtZ*+DkP#hJ#EO0${>!<a^bxrQaGXc{j#?~D`*ZU~T;_IOa
zCeijaqgbJQWBSKFcL0OwXZ<PP9z{>$-?TGJ^Q^AXb_<m9bjhDuLNpiZLitq4yY0Bv
zv^bOf{NwTOH7NW?6g-|eSNi{m4!{S_4`2NcSN)gGvJd`M5&a9V0zSp#pYkh!gMlL9
z=(YcdZ^z%My8BPWIR4S=7E#C8^O5zxYHENX=l&lt<R|JLt@s{W8YjJ0s{12g?^BPn
zg})~2|JOf7Al6z7jQi;d^<tn8<*yYAuoJBis5+Vwrrj<bVeJg|>T<#~7(f{&QwMYx
zT<8F9U%oOxq$a<<&CLRjb2ay<Pjj0bYXVwQu6FV2TG+M$ZJu!>yWtD^cLFQss}_^l
z)SetAAjQSe&8khgPO`RcVM7#<Exz$s$=m=?NY6d)qlxLRjT%wa3;z9QpSR+$-=mox
z77r%vN0^3C=#M*zMg|dMR110%=it{$@12}JRpJ`jY-BL#L+jzWo5uolMQL>jBeuHX
z4&84GT^mzJ%1wck$H=@}fi|PbjrJ-8t!Y6|`wdU6`B9gQodiqai|Xf5WPE8I`VVJ&
zQv=-=&dmlNYD^&u%izcI`Zu8l6*bxex5s=pc6xz6CpCcu2aMnEae0e#7`=`)dDAJi
zftIC&`rOAqR7{mcMqM3rDz*S!@!J&;5jKIv(3p`k74=ne7e;B9z5N^dW{ENtBO+|_
zI<9E5q|8p>CcWLBH=Y5Dp|vAgAk|FCQUV`meNo?-y>$mNQJ78M15K(^XefCXMewiE
zm(=rNw;xe_ET-o13q_vVieyriB@>W6w7NEhLo(#ev{`SM{Qfo!+>Kl1hK>xl8>e|Q
zH}!5B3D1|=o)3cvs@@Q$xysd@d(aN{zGaAL88kJ0pBJi(zN?t0^s?(~(4PyKN18$6
zN$<gc=)#fXC273@ut6}OTCJ{o#BnIujbCIyzo*G5C4S{GRTdr<l*8ZG!;qcl--P;9
z@atkI+#a&m9s}uLk$z|eiezeC1o_-0C0A=*wIQv?mj2MzLUgpD-2D5>MC48j`wO=3
zK(F!rUzTg_XfKPA=g)7dDp}#Fm11;e?yuYz+iFI}-a!n`E}(LVDyEmu4YWONizw?u
zR=kb{88F9<A+a)5mOy22t7A<L!%`6+NqC5qLU1q^HGXhLx8@wk_xM6_$_J%2A{o(h
z-SRR;pY?7dR(}U8wFkf1rVi*;<X+Z>)SQF-T9xy^^&&5H#c6MbDiIVF89gK+Z6TnP
z6#j09N91u*3N-0hF8b$lfvjw(0c;VjM2g7Bd|4Fn-KaCTHbY2XbN`Dw!mfQBUgJW@
zyVc*aOpuq_7W%XBwl^Q|tPS|S{O`gGdQV){Tza0DE|-L6^@+2-HSc*4ydjkFk=IvP
zaza%+!{?suy9OIqG~xk3QjS*4X7sm=2V;<tBgm}&2nWI;);u`zl1SAU*IRHzkGtu$
zi0hR0KFhDlD8Mu_LoyM(VGt>06s7c=)EzEhq0F1UrF_p5eOW@!C-@r<bgDOGXH`!c
zG5l-%$Vxb*tf7FR>Uj<4bQ{mrn9TY2Is**yV>|_}WM5r5{~gQ^kA&L>$Y*(Tvzh&6
zt0fV-q8a4bBcQ8#<)!K?ylyz1XOD;OvirLoSL<C!<<|gwcBwvLrX}?U!K;kUz5qP4
zcpe3w#a7<QcIrp?{0gBHlEQCCZEbd;7z#;-m~C#KjV*2+$rTky3Q-V$9(v{ge*ro4
z9e!DYOrJU1^SN|nFz_9LO*dRcaeM`9#34_oBUdF1@VNTFaO(!!B#nXc&Mwb}KFlMf
z2HZaFNm0n10lKP*^p|X&Mk+0jpWLNV*Yhv5uyi3_O`UxLVP;^tdzZoR4kLrvXTeCm
zmTT7{uT{hqwfv5}CLGri`Jv_cTvAMndhs>k>XvRS``N_YY_(!7=xB(y^-M0=#l5`r
zvND{*pofASA}ULaSSu^b`hV)!r_}HuMAvhPk`q$w_;l&s=W6jB4|{XivIQTaPncZM
zVtp~gcIA>x*C5z1kb?!=Wi@@uHboTAg`lT%nB;i9AO}uS=S#`<2l4*`?ig>UgVV+M
z*HNB%>_)5Q^s&A^_gNAbcI6!xOZ?}3zMG!)Ye!bh-=4IlQTIJ|?l-~xZv7I=<%;s>
zJ;m%4`?RJ8%tq`zrH@d2`8-3>slFb<=3)zj>c&Bsrsex_(XEE-J1B56^Vo~9clR<K
z7Og;X+0L=J)9Zpg3M!V}-y+AXiWQs(o}IYh<r9&PAkz3I+H*Zsr9^B%TTWgq(B|Lt
zfu$o=eCa_M)nU7hMS^ig^bgEa;zgTbIxjop4%GRq=&m_D@*aTVBSm{u1ua>lJ<yg4
z3)z7}F0aq<{cP+0mHfmnqzQiEWJ=3zD;WB-l4!+1kuHlipN>b7uEix5X}sE}cL{<`
zwB+A)iRZbZ-ZTu!qQkqMpb$Ur9%A#02CZr2)fc@DosB+=hN@a?8kWar!v;kTb$>km
zzVbmroFn_iz%EVk7vzXR-9pKWlWBnyW7M#h@^uTVj_91hIE0HWi7T7MxI?&A9I@bg
zDQ2i|9zqTw@qbzMKqHLhP{Waev(<3?^9hAtd9cx(UZ0l5Pk+ESM_#sO|54kO9p3Q2
zzBt`J^8~80VSRTSZDGx3l25D_j4Mh%VR)mD_eQD_T@jyJV@$vWc2mAv)si<Z-elXe
ztfwUUH&ISECwri=T@G}nMoDgG36t29McEI+VvEb+u-n)Bz1jF}2H`p-^axs+=jsJ@
ziLG=E#cd#8lPlHWKy@`8zKv7EL<dGakYGbuQ9A9iODas?34Wn!8<AnC!bFROd-*WU
z_q`|>gMPF8mBcs|mjjb8r_*i1&OagbMi|)II+4z@R%!RcMG`j8yC8RD@m;=XL{o1g
zCf{>!5w70o3%^j8_BG4UnRET@bmF}1kfALb;ix+MRtsa`xKMv{yExi7k|VB2P+Rtz
z)VG%zF|jzCPYf?1io>|Cbr515gwBEeiI!-Jk~%afGH-QRGc2~D!N;a`ahiLrrTx}|
z1A0MjN7L}PW1m%D)s92u0Gyb9YC=Wc(sU3`tVuWQ=>l$RrFs6KL+zcbYL>p?B9Js@
z&gBw^iV}ogkELd9d`V+#`T7~3D#$-~I4x(Iz5*=K=QpZ3qMCuXr&<agkl}&{oa*fQ
z*;f`wJM;y*-PSO%?klHPq*N`%7U8SUN=8$<hwEb=oQBy)N(%NL<ohP35a)sg+#m}w
z_*ziEXCND9f3Xd+D?4{+eOp{|yfW$}*=BFXs)hD57!fE~BDUWBf?BcW(TuWgaZVZS
z4XM?5fR48cQ#WLJm&?VqAYRE1=qJh*jWVgv^_}yT%o1p82`VXR=_Pi0_hm`~WT`)F
z(o&sliJC^gG8zOo8wTlJjm|Se*>!GB!*?7WLC}2_R<E(IZU{7GgYmKFjG81Y&#5(Y
z{KRoVu2&qmURXXs;OO(nP%?r7(iLdL$R(-ReYyUK%h}<ed(#~OuwhM2^1d?MSH0{K
z;Bh3tSvw$T;X7}Ptq=^enEj!yGzu9Mq+8zl-RDsgT`+i|+R!RRl;2|T%*nl0X6bc}
zUkjV@t5dHgq>j3JhY|=#_kQmB)*{_t_`nszBQ9LoO*SxO!@BEu@h-;_#<Vt6&Ub)m
zK&$Ctj8}3!<R3~XSuWZbBa$v|AOd?cW5HSJY8|I1qVJquV}exISdp*?Pkm=0r*4eF
z{+ctpKe&4do#B%G(-$+OZoFb?+0o`;^W(!?wV6Q%UgMO$IJ^%vPe(kJuKkni+IH)<
za2MQIlkg?B94PUyFiRd(8;ZU1Kuh*l)LYD_7jG7|k;R?ScXg%t*pA8tiQ}?Dd6}~#
zVn0)Tb<cQhr5bm`ess#>r9KdHZYIVE-k|f}<&W&jkDD(#$UjSm`BW4fmzsVXc8R|z
z)Ygi50b^;!A9J0TV*&H%HSRHt<f7M<fb?R`NGqDtfB;*I+3%a|Vu_%mXV;BSi_`g)
zQYjDu5V$#}I+e%H8agY}zdFQt=H&5wDY4$|s`OO>TnXHx-(7P9e?O|;|4lYHP)~Zy
zuo|LMe1Hcns2R<kZ+I}tNA(qW=235@Scux7aX<%I&`JaaDOz^pT?j*s4_OJ}Jdgr!
zeNM!D&k11>;4F?r({9}~R3WU#Nl{H9$%4#`0VIAr7ZF93EI2jHElI3C;6eji?a5S*
zYe$6DIahg?cJ^v;G6wLL3M?+@pc>75FFw%4>GG@x`FC>YN~&4aOjPk)?6-x!K5ZDv
z{3ua5zSWUTpmha;5U@W_rU@S=YYE%2#A;9u(~!xqI?8!$@t<be>^5J`SZeEHC`GVP
zL%@O?5{`ccyycoZ+PL9G@2<XuoPEX!%DDfFDrK&r#42@Z4BivFS{9Kn>WqH=Ug1is
zDQuk|()F_S)P^-Zqe1lVT+rSk*R)4XsF`jl4jNM<QcnTgtRUotdg6SMt_pvO>(Qp|
z+RIVHqp{tXQ6}>Vp?=j1o!0E<tbU5eNKu%VGnM@9n+0vN?i&5_NjBOjoj%c=-J_Y~
zsGle$q~IIhD*Q>~rGXa%eY_O-1AIWE4#ujIye7~+4jF4y;O~di8p^6#cfYinmM$Lb
zpzFN_hy8~2hagmizpOh7ZO)|=t#uO!2G7_zUK+GLOkcUTe>Q9=|3>SRswdRU=~$r%
z7`qag;=1Yy`|TZUls=J6`Z_(3FQ`h*@%w08G}xh*PUeIpp4}Xh?Tc<T*qF@+9-{J4
zEMg!=(Ws=ATsxaCh&12|1-Y@=X1n;##AAQ2ut9c(<oXuti8(>u=+69al-It36uzRV
z>9Qx+ue>5TMB?tPa4nQ@U#PtBt2&=6+3u{6I51(rtDky$ZV>U;zrZ_HDR=Z0;P?Tr
zn>Rmz&5r;1-+gJFOyrc^8<>np&~8*IxAh*_EOM$$iu-A1VoT5YlcG{C(dglQ0C|LK
zHz_Zf46WBg#xlJLTkpY~<MgZr8L`~VU?<|oOv-nd4tq8qN}lmISzz>9l@B;8K|LJd
z6dnnVSHT6Avqyd^?~iHP<+L?S_hX>j6*e&%jySk<<^Dr66Q1Fwa7VLec6MfE+Kgen
zn~#ovB!kgwUB078aQPbh8ZdAXb&p&4)tl;ar6RAx?&=erpR0{B`1W7VmN}eXDHrCS
zob(P=H~eu|V!Yy!mO6Dy@e$o!Ym*?gsr^LjlabG@-wCU4Uq>2uG5j<!(P(I#-HR(K
ziL$FTD@%<#nh*SNY#t$|X@|4*)S1(*+9q*J_ru@EUH?!~p5W?%+`6K!Mbl|lV|4S%
z|8<S$&jn<ZvkXTc=>K&k9K)mG^CEGnmm)iD-f0W(C(ph;Hm{$5a@2e6A_Lu;u!&I7
zWpe&sS0YtZb=Ymja+CW^RJ(S@qitT~{bTboG#jdB$FE#tA2%sCQsX5knjBvVM9q?t
zSR<&daQ<9KhH%-EYIyXqdBVxAmdKAHcO+^n9=(hCZxii?sfj@R+@AKnOdRICd~PaJ
zHMkhNo2=T#;wfTcpc)#*@x(Rxjgo!@^-yQG9Zb`FU_0C4(4D08g=?nfJ^9eZfMzFL
z`(M)!Wm4Y#+}M<8&F%A8T$=i-Y%!w4He+fN=M&|bV~c!rNK6)=B0W7MD@J*BZI9BD
z#t)H4BWb%k_eoY9aPctr(Jp3GAu?R$gYBh%FM5reXvi-R#<52E6#WtgGws)PcCpa5
zjrFHiMF(Xmh=`VskMtdebEKhqI}c$sL$~dp{y`)wuQ0%7^|EOEG5_8KnMEBmYsTNj
z+IN@T^?6kXL-$u45n%<^z7^n>v6;KroncwFd<eQgLP{_hlkJ!+VP>EzP1^r_NV;`<
zwjd->QXrAb$tEb&{$ca<KPwY9xxE`t8jtTScVRm>wUHlDhTXNDZDRl?FK{*>)~&(h
z@ObhLvD%awcvL#j_Ug!mPs|8B>mTkuM`R0l|2XoKmfwKC@m{PO>^EZG_o{3gvds8>
z6L@pb5OKJ>JkzJE?Be+l`AG8joK=>x@BP}UY*VH1G%OVp$kSAVw+U)0Gx3;@Br5w1
z-Z4hbWR_Y_t)qBN#c7iat+$SBEX^YA>}uSwuzARC{p?dR+4-muQw41g&gD>Cdr&{D
z!I|ll?QRj`=k8tIzE}Ci%5QSq5hb>6fw$rPS=<_?ypym6=ALQJnt1Y+9<_D<+?gHB
zwGNTLj*6(dfAjS8_6EYN+8R`-^rOk79+Wt^IcsZAG7nwNa3-SU=vS(fL0qci+vbYb
z{|wAGghU67h7PPMG<r;3cnRM`Fl**)g!nbg-aMSBvDN?Qc&K>o8%W|E2AiQjzPAwx
zVVy^+>RV*mR^wZDu{W4t8(Fq~M;A)(9)3cMxk=&kpcVq--a8@5+1-g(vxj@=&o`Nv
zwIRnwGk{~_Vc~#|*u%ZFn-VIH&JMPOGN!*bQEY^o*MWI|A_Hq@t;dFHy$IhUE26w)
zMbQJR#y)5Kbr=^9`e_Zs1u65<Ll(@T1184ns|`8CRWW5?r0Lq?LFYWELpX6%Jko1{
z=^s~wDK#1-oAIjJC#^+1YyV@EGyw`0q9Avt?_s=>tv#I*msah=NX`D^pv8kZA%War
z|K{2qEuFo&sGca2dNr8Mu|u!Ed-3ZS8x2Ke=g&^*e6`ow5gm_<J{NSgwrcZkh-Llq
zM#P~>ninDh-Vi%9zx}TK{0hhr4rnpgVf2Vzc{m%&;=w3_Ejx2cc3|iEGCooKK~!hx
z=Eo^?043AX(;1meJ;IcKhDRJMGGXkqxKV@4@6X_$rRKu?$m+~q$Q%Bz#_C=2-7I9h
z-B%r_0hPN?zqr!k_M!L9VvhHc2?YJVFs!J|x+0T2<}Fnwnq6;y`m$^u)Tn24q9(+J
zI9f;XrPOoR>u`4z$Ab!;jOhw&WQL#u@0=!c1X$X7PErCyVm!sNm9d^ktlaLNr99Nt
zSua0*O5<B*?=EYew{tVIP=xaqD!>vMwJYB!H#06{r8t4(Ivk1wtZWDz(_iepAHkYF
z*048yR#~{<4OB?j8BN`v7#LleQS>eiThAlfA_*T7ZHR}`atEGBWfWydyw8h@=@CIK
zy^Y3qsPavDiz8>DFccYy48?XVD(<bUzmJ05*8)V=wIVXKrZmOHWGOZ4X=_8^)N0~J
zeSB2#yO^Q5f5#jQ2$1*jX<a_G^x-7UxR(PuWPP@aYrG<lyoS{rB-yHX&@)U)E_iy+
zR%ggMuzTnRW?`rD-fEIQPD$X`bgFNkqo6VZ^SCXbjv939Y;a~KSa^>f#*6z15Zxxt
z-!4(GLx=CJfQd5|@q_wa7P-a-O5M=+T*b>*>-D)OyRUwci5*(|d+}4UiBov%Z00n9
z!OA>+TrR9X=P6b?;+5=|%A23x*h&N?V9|jC0h8R)M?dPwrS?&u7ISmnvt<))+Z(^>
zH{n8rk9RH1VA_)u7BRv{+z=s6oy>$ea;7!w$(4D*Mh}BmnR;${uEYT$N)~7^VrLn=
zvcLB(8pRD0YtjhZ`y7r4>zl{c!)RK?oeSXWpv0@W-ZLTH(9MV^1{lOVdY3(0c#vau
zh@aMFYMs3~)-g%wDK$V5S~$RM<43&Zu@+-RUBF}w#?g37CSDzt9|Ioo1XcB4dsuJK
zjP+#Mrr1`=2yn@Nzxot(Ila*xQKjb=BklJt8>TXqH08~9yb-GZim}Zs_P|4)W(2ZU
z)FzoVSRE<NvNhvjR<I<W_lM>cf4cGvCO7T=Jc~lfS;{x1$?e)Zs8jd;yo?@f=2dnf
zQUYB+R*`$~6~2H(2AVWk7bHD(`H;wTQx^YW0NRTCk^W!=ycnahdl*4MOHBzZ8;9o`
z{S$d94O)ZX6#a;g+|GHxj*uMB7B18A3!YDumx|wnLu}=-!6L9POT~k>;d$n85}77B
zXEt<?Xk|?}=r}ycu?nNJ9z(B3!R|Sr^JKl8CO5{vxFg#Gn;9PmBIl!b<dt{VtsVwY
z*R#x-d+wRCGW#YCDaj2%=ZOw<9S4Fi8#mTMWUXQs(@#CVL9l<b_eVW_wHgUj{6==p
z)Q~J5+ZrIj+)Xch`%-DnUCQiiyE|HC{dT=emNjjYcsog$xHmTehN)~$64ze15lR?4
z!3Y4e)t`GFW^2>&S&51hZK~PokP>qI(f#5_O|zKKU}R=BUFT;T4B(JyF@9*F9!pGD
zcny^gcKG^%z!w(Bj>oR#1xtg83vblw7(t13C5xdoV)>rct*!Y>hiI>WTB=LYv*A?r
zST%L?6T&HnzV8~Z>4wi&T|KiPTeU_?djTBmxlMQ<N1!CbtaOjk#!`3_cA`fXf2*kL
znq%++`*!}hot0cfR({?p?~r(Y;Zzd10KD+Xk_LvJGolMz%|y|%7O)x%EyA@H45N8m
zsk8oGRQuFFt$|rnW4e6hxo2)LuK#nUOO1Y5?2$?*@Du7$A^xkb7~_^&mD)ePt@*to
zjM{t?lugWp6?k5T6XndgIE`}DFthS4*VEjZWdj~a@s{^o-jY&J#`&rknX7{(ie=Xv
z46K7r=^<4p6R<_<`@+!5WtiQ*u?0in+c)xE%s=&t9Na{yl&>C(<vIcr1T(W~`vHJg
zo7188iLUj4B{=vq+zvkQN0EU2;4xMWQQuEMCRYG>aM)+}rf>3fUO(L3U0J-Mlbv@x
zZbKweLF=33^s8!y;}s$B<FlXe#b*DiiO5O5Gpj=3YvoNbCY7<{N*q(4`qE6@j1!w|
za!dQR2zO+x3P_&q@Dfox`SqE!<Ty01YMo*?ZeFRK8_87xU*xJEZ!#Hp(^lH$BEDp}
z1;j=}K&aTjCj8q~f4;f402KM9OTO#mX1cfKa7V}nbWBXbZ64k-|9dma)q5lNjhteG
zM8wffe5Xpawj`cA0P0w{@rBBZTvaeeZe;F$8uKDqd*xn9JA>30nuC%5mBmsTgv!7I
z!k;XAnnrKgDtJW(`3=i1CVON~2{v#-x)Nrae;IKOceAh0xCwad#fhHRb3khqzOLgi
zHmKVaFnsa?%=>h2s4|7vB{`1oY;~T9{WjYXG^{ZPR;3>P5LAh<4~)C{Ax{lpTIMKx
zk@_LTNm+c!hiN7JLFA!rXmE$J*@lJ$W1MBS(Bi?gbW(GgQbJRRK(`sNm*Bqpc_A(@
zdtRUsbGI;*XB(rveRv`Dwg`J;vlqt^^2z|1`jtjRH{L665^iU7Nsu4)Qqfo`*oSOU
zys#3Lp)*LEe5<x+i)}{MyUG7KST*S9>&SIAEo)fP^WEg+2Z46HXnYyPTgD|{=1Zxe
z-6hFHPm%ZF&GC*Zj$o%*U@g=+L&AiU{*HUY>6q^9s;fWiseveRSDL{1E{P;S@BX93
zepe9(pYHu6G6*iawS9A+PDuAir5cE!e7NnYp!!l=|AQQdkK)VHdU>7@fzZ`lx<@xv
zvi^;I4Dx_a#wLAAmbd}jqvT9#6)%vCN?!4zPw~WqCq40ufdK7_op#|XqHHwgC8iHZ
zqx&*8r_1tX)9ZK*fBwFI>7(=OF`ULg(H%N?`g`lp@b}^->JQ~(=LW&FvXb@9)`hXJ
zTNe+W@mQliM(wk%B`+_uE>OVKO2dGZ_pM<+Uc4Q>)(Xal+D<S08uy`<P0NLi-tio%
zwGoRrWYfV<ZVLzUj>^?Kf2nB|`zO=AB8^|_3-PWEbM8jGA0Jbzn%G1{LkHHsN`Kdq
z4cbP75ibL|44NXER<<G=SbtoY?#gAC<$LD<Jm*yBk?FUzyAmS@)VA1S6jt6Uck0#K
zFHf=Hz*k{<ECqN->y9-L3Ut->D(ha$>dm!XY<&deqecJ~b9oi}bq4i|S`B`G`Hywz
zQ8zOpEC=`TS^8O|+WQMFN<Kq7h;<JeDq@`%Y1Q$;?(Wi$shW>tH=Aq7+V1GsZ0Ti3
zNy$Llr5X`l^!w4;H#tqg>wT?JpS0A&{tYh+<$$r()U`ywkOMnn-_+%at^l<z^Mf9Y
zqJ(7|DF0>eF0BF9`O<QGa$=L2fbRw%8ShApi6k~$NuqRmTpp)YewES{t=8A)!Xvl$
zIx?7D-biT9NbElkLjeq^mHlXW>xTKH4qbodfhNx^(<r@t8GDsH-=NCq-jp`^g#`OC
zUhGxKclO+Uy_Ga-1M5EH)Z7nCfU>57@jcwI4`HgKx(_a1iN80o3CCzo!rmB2ylusu
zN`woTz`>gA9CXb%oWo$u2Vz4K8*e*SBU93+<ZCTAaqyHOs~?lSfNZxco;_@kOGru;
z8B%R+_dvf6P_w@z@kFi5HhAq4Y)Y1GQc)3?U!U(=P#;D`WliyBfvG<wcMoufu|IqV
zdq%ISQ|sS2EKY}^!FYoi>F(FAHMXksbFF`YWvmen;K)*1Q6!<X>tk?f!+8qmeV!J5
zd{B;OO85;!w}PMdCYcuVreR4vSdJitc#5Dp$gQNAVPSWO9@t$Qz%w8^Z?G0hk&BVN
z;YuSeV*!ejeP_aARP<HCrYSf6gO~u&K=}3aHfz==bx;0$k0LeFZa!6@MSkxMV2xn<
zIP-BNpmgUF3YE<mjpupyc#Z8|Y>SRqOHGUIdG%X+PMnOsHjYd|-}v#39+XizaV*uu
zIQF34?z^guMHhMhnD2lxU7@`EUYvaUd#R|8fZ8rG;yMdV?@a!#v!vK;bcj{K{UPzh
zy8NRr-x3H8NCu#UQgpL>(j?XPd>5Sgcb2ALT)|^0fxsl)6FKL*)t4%m{$gg%pWDsN
zJh=Da)`6e;(QRa#V6v(cFP-G8%xOoy-y6j&lICW*c}}q8iRd0aCcP^#w6V(5ix*4d
zmFx!P<Mo5I<GG?@l#|VHd+fCnS@3)vx3?<iuEM$k2Dz4HjM$8me#h%*nvaRUaL|wp
zbxX*O{}-8J8FyfN9>NEkiz!~8JT=inefhAUA6SZI(QMx%an;9w=6R9$u<8SsFO>sV
z&r;d$nQwl^gtOWNYCVCvS|b_y)lzx&gEtX0v@47#Ue(Yy?E`#dqLOmAYPirph`qhs
zSuOo(R=(xg1vl3JLD%dSezJm9mv{VEerMlD-QqN25qCqp#mAyn8TCI&rz7L6Tp}Uv
zzY#6vyk<&TFScn2X;}4Q>V^OIT{hJt!Xk!rZlQdK`Ttv<%Z_gZcH+bdq?X#<N5|~l
z|32q+(Y3=XEkZL#^<hT2ldRkHpBh6cQ%&`iit-v~J<dbra*4d3WkmnK8On0Az<BTC
rGp@Fjt%lBXoRzWvg+9GGa=`2BzW8ad%XAR9loML&`fAnp>>~dUw0%u9

literal 0
HcmV?d00001

diff --git a/docs/source/design/v1/prefix_caching.md b/docs/source/design/v1/prefix_caching.md
new file mode 100644
index 0000000000000..dc8432baef9d9
--- /dev/null
+++ b/docs/source/design/v1/prefix_caching.md
@@ -0,0 +1,228 @@
+# Automatic Prefix Caching
+
+Prefix caching kv-cache blocks is a popular optimization in LLM inference to avoid redundant prompt computations. The core idea is simple – we cache the kv-cache blocks of processed requests, and reuse these blocks when a new request comes in with the same prefix as previous requests. Since prefix caching is almost a free lunch and won’t change model outputs, it has been widely used by many public endpoints (e.g., OpenAI, Anthropic, etc) and most open source LLM inference frameworks (e.g., SGLang).
+
+While there are many ways to implement prefix caching, vLLM chooses a hash-based approach. Specifically, we hash each kv-cache block by the tokens in the block and the tokens in the prefix before the block:
+
+```text
+                    Block 1                  Block 2                  Block 3
+         [A gentle breeze stirred] [the leaves as children] [laughed in the distance]
+Block 1: |<--- block tokens ---->|
+Block 2: |<------- prefix ------>| |<--- block tokens --->|
+Block 3: |<------------------ prefix -------------------->| |<--- block tokens ---->|
+```
+
+In the example above, the KV cache in the first block can be uniquely identified with the token “A gentle breeze stirred”. The third block can be uniquely identified with the tokens in the block “laughed in the distance”, along with the prefix tokens “A gentle breeze stirred the leaves as children”. Therefore, we can build the block hash of `hash(tuple[components])`, where components are:
+
+* Parent hash value: The hash value of the parent hash block.
+* Block tokens: A tuple of tokens in this block. The reason to include the exact tokens is to reduce potential hash value collision.  
+* Extra hashes: Other values required to make this block unique, such as LoRA IDs and multi-modality input hashes (see the example below).
+
+Note 1: We only cache full blocks.
+
+Note 2: The above hash key structure is not 100% collision free. Theoretically it’s still possible for the different prefix tokens to have the same hash value, but this should be nearly impossible to happen. Of course, contributions are welcome if you have an awesome idea to eliminate collusion entirely.
+
+**A hashing example with multi-modality inputs**  
+In this example, we illustrate how prefix caching works with multi-modality inputs (e.g., images). Assuming we have a request with the following messages:
+
+```text
+messages = [
+    {"role": "user",
+     "content": [
+         {"type": "text",
+          "text": "What's in this image?"
+         },
+         {"type": "image_url",
+          "image_url": {"url": image_url},
+         },
+    ]},
+]
+```
+
+It will become the following prompt:
+
+```text
+Prompt:
+    <s>[INST]What's in this image?\n[IMG][/INST]
+
+Tokenized prompt:
+    [1, 3, 7493, 1681, 1294, 1593, 3937, 9551, 10, 4]
+
+Prompt with placeholders (<P>):
+    [1, 3, 7493, 1681, 1294, 1593, 3937, 9551, <P>, <P>, ..., <P>, 4]
+```
+
+As we can see, after the tokenization, the `[IMG]` will be replaced by a sequence of placeholder tokens, and these placeholders will be replaced by image embeddings during prefill. The challenge for prefix caching to support this case is we need to differentiate images from the placeholders. To address this problem, we encode the image hash generated by the frontend image processor. For example, the hash of the blocks in the above prompt would be (assuming block size 16, and we have 41 placeholder tokens):
+
+```text
+Block 0
+    Parent hash: None
+    Token IDs: 1, 3, 7493, 1681, 1294, 1593, 3937, 9551, <p>, ..., <p>
+    Extra hash: <image hash>
+Block 1
+    Parent hash: Block 0 hash
+    Token IDs: <p>, ..., <p>
+    Extra hash: <image hash>
+Block 2
+    Parent hash: Block 1 hash
+    Token IDs: <p>, ..., <p>
+    Extra hash: <image hash>
+Block 3
+    Parent hash: Block 2 hash
+    Token IDs: <p>, ..., <p>, 4
+    Extra hash: <image hash>
+```
+
+In the rest of this document, we first introduce the data structure used for prefix caching in vLLM v1, followed by the prefix caching workflow of major KV cache operators (e.g., allocate, append, free, eviction). Finally, we use an example to illustrate the end to end prefix caching workflow.
+
+## Data Structure
+
+The prefix caching in vLLM v1 is implemented in the KV cache manager. The basic building block is the “Block” data class (simplified):
+
+```python
+class KVCacheBlock:
+    # The block ID (immutable)
+    block_id: int
+    # The block hash (will be assigned when the block is full,
+    # and will be reset when the block is evicted).
+    block_hash: BlockHashType
+    # The number of requests using this block now.
+    ref_cnt: int
+
+    # The pointers to form a doubly linked list for the free queue.
+    prev_free_block: Optional["KVCacheBlock"] = None
+    next_free_block: Optional["KVCacheBlock"] = None
+```
+
+There are two design points to highlight:
+
+1. We allocate all KVCacheBlock when initializing the KV cache manager to be a block pool. This avoids Python object creation overheads and can easily track all blocks all the time.  
+2. We introduce doubly linked list pointers directly in the KVCacheBlock, so that we could construct a free queue directly. This gives us two benefits:  
+   1. We could have O(1) complexity moving elements in the middle to the tail.  
+   2. We could avoid introducing another Python queue (e.g., `deque`) which has a wrapper to the elements.
+
+As a result, we will have the following components when the KV cache manager is initialized:
+
+:::{image} /assets/design/v1/prefix_caching/overview.png
+:alt: Component Overview
+:::
+
+* Block Pool: A list of KVCacheBlock.  
+* Free Block Queue: Only store the pointers of head and tail blocks for manipulations.  
+* Cache blocks: Mapping from hash key to block IDs.  
+* Request blocks: Mapping from request ID to allocated block IDs.
+
+## Operations
+
+### Block Allocation
+
+**New request:** Workflow for the scheduler to schedule a new request with KV cache block allocation:
+
+1. The scheduler calls `kv_cache_manager.get_computed_blocks()` to get a sequence of blocks that have already been computed. This is done by hashing the prompt tokens in the request and looking up Cache Blocks.  
+2. The scheduler calls `kv_cache_manager.allocate_slots()`. It does the following steps:  
+   1. Compute the number of new required blocks, and return if there are no sufficient blocks to allocate.  
+   2. “Touch” the computed blocks. It increases the reference count of the computed block by one, and removes the block from the free queue if the block wasn’t used by other requests. This is to avoid these computed blocks being evicted. See the example in the next section for illustration.  
+   3. Allocate new blocks by popping the heads of the free queue. If the head block is a cached block, this also “evicts” the block so that no other requests can reuse it anymore from now on.  
+   4. If an allocated block is already full of tokens, we immediately add it to the Cache Block, so that the block can be reused by other requests in the same batch.
+
+**Running request:** Workflow for the scheduler to schedule a running request with KV cache block allocation:
+
+1. The scheduler calls `kv_cache_manager.append_slots()`. It does the following steps:  
+   1. Compute the number of new required blocks, and return if there are no sufficient blocks to allocate.  
+   2. Allocate new blocks by popping the heads of the free queue. If the head block is a cached block, this also “evicts” the block so that no other requests can reuse it anymore from now on.  
+   3. Append token IDs to the slots in existing blocks as well as the new blocks. If a block is full, we add it to the Cache Block to cache it.
+
+**Duplicated blocks**  
+Assuming block size is 4 and you send a request (Request 1\) with prompt ABCDEF and decoding length 3:
+
+```text
+Prompt: [A, B, C, D, E, F]
+Output: [G, H, I]
+
+Time 0:
+  Tokens: [A, B, C, D, E, F, G]
+  Block Table: [0 (ABCD), 1 (EFG)]
+  Cache Blocks: 0
+Time 1:
+  Tokens: [A, B, C, D, E, F, G, H]
+  Block Table: [0 (ABCD), 1 (EFGH)]
+  Cache Blocks: 0, 1
+Time 2:
+  Tokens: [A, B, C, D, E, F, G, H, I]
+  Block Table: [0 (ABCD), 1 (EFGH), 2 (I)]
+  Cache Blocks: 0, 1
+```
+
+Now block 0 and block 1 are cached, and we send the same request again (Request 2\) with greedy sampling, so that it will produce exactly the same outputs as the Request 1:
+
+```text
+Prompt: [A, B, C, D, E, F]
+Output: [G, H, I]
+
+Time 0:
+  Tokens: [A, B, C, D, E, F, G]
+  Block Table: [0 (ABCD), 3 (EFG)]
+  Cache Blocks: 0, 1
+Time 1:
+  Tokens: [A, B, C, D, E, F, G, H]
+  Block Table: [0 (ABCD), 3 (EFGH)]
+  Cache Blocks: 0, 1, 3
+```
+
+As can be seen, block 3 is a new full block and is cached. However, it is redundant as block 1, meaning that we cached the same block twice. In v0, when detecting block 3 is duplicated, we free block 3 and let Request 2 use block 1 instead, so its block table becomes `[0, 1]` in Time 1. However, the block table in vLLM v1 is append-only, meaning that changing the block table from `[0, 3]` to `[0, 1]` is not allowed. As a result, we will have duplicated blocks for the hash key E-H. This duplication will be eliminated when the request is freed.
+
+### Free
+
+When a request is finished, we free all its blocks if no other requests are using them (reference count = 0). In this example, we free request 1 and block 2, 3, 4, 8 associated with it. We can see that the freed blocks are added to the tail of the free queue in the *reverse* order. This is because the last block of a request must hash more tokens and is less likely to be reused by other requests. As a result, it should be evicted first.
+
+:::{image} /assets/design/v1/prefix_caching/free.png
+:alt: Free Queue after Free a Request
+:::
+
+### Eviction (LRU)
+
+When the head block (least recently used block) of the free queue is cached, we have to evict the block to prevent it from being used by other requests. Specifically, eviction involves the following steps:
+
+1. Pop the block from the head of the free queue. This is the LRU black to be evicted.  
+2. Remove the block ID from the Cache Block.  
+3. Remove the block hash.
+
+## Example
+
+In this example, we assume the block size is 4 (each block can cache 4 tokens), and we have 10 blocks in the KV-cache manager in total.
+
+**Time 1: The cache is empty and a new request comes in.** We allocate 4 blocks. 3 of them are already full and cached. The fourth block is partially full with 2 of 4 tokens.
+
+:::{image} /assets/design/v1/prefix_caching/example-time-1.png
+:alt: Example Time 1
+:::
+
+**Time 3: Request 0 makes the block 3 full and asks for a new block to keep decoding.** We cache block 3 and allocate block 4.
+
+:::{image} /assets/design/v1/prefix_caching/example-time-3.png
+:alt: Example Time 3
+:::
+
+**Time 4: Request 1 comes in with the 14 prompt tokens, where the first 11 tokens are the same as request 0.** We can see that only 2 blocks (11 tokens) hit the cache, because the 3rd block only matches 3 of 4 tokens.
+
+:::{image} /assets/design/v1/prefix_caching/example-time-4.png
+:alt: Example Time 4
+:::
+
+**Time 5: Request 0 is finished and free.** Blocks 2, 3 and 4 are added to the free queue in the reverse order (but block 2 and 3 are still cached). Block 0 and 1 are not added to the free queue because they are being used by Request 1.
+
+:::{image} /assets/design/v1/prefix_caching/example-time-5.png
+:alt: Example Time 5
+:::
+
+**Time 6: Request 1 is finished and free.**
+
+:::{image} /assets/design/v1/prefix_caching/example-time-6.png
+:alt: Example Time 6
+:::
+
+**Time 7: Request 2 comes in with the 33 prompt tokens, where the first 16 tokens are the same as request 0\.** Note that even the block order in the free queue was `7 - 8 - 9 - 4 - 3 - 2 - 6 - 5 - 1 - 0`, the cache hit blocks (i.e., 0, 1, 2) are touched and removed from the queue before allocation, so the free queue becomes `7 - 8 - 9 - 4 - 3 - 6 - 5`. As a result, the allocated blocks are 0 (cached), 1 (cached), 2 (cached), 7, 8, 9, 4, 3 (evicted).
+
+:::{image} /assets/design/v1/prefix_caching/example-time-7.png
+:alt: Example Time 7
+:::
diff --git a/docs/source/index.md b/docs/source/index.md
index e90e81c72860a..ee25678e2c418 100644
--- a/docs/source/index.md
+++ b/docs/source/index.md
@@ -153,6 +153,13 @@ design/automatic_prefix_caching
 design/multiprocessing
 :::
 
+:::{toctree}
+:caption: V1 Design Documents
+:maxdepth: 2
+
+design/v1/prefix_caching
+:::
+
 % How to contribute to the vLLM project
 
 :::{toctree}

From 89003c4082db880e103e84f5015424e79f9aa762 Mon Sep 17 00:00:00 2001
From: Chen Zhang <zhangch99@outlook.com>
Date: Sat, 1 Feb 2025 05:13:04 +0800
Subject: [PATCH 447/462] [v1][Bugfix] Add extra_keys to block_hash for prefix
 caching (#12603)

This pr adds extra key to block hash, to generate different hash value
for two blocks with the same token string but different extra_keys in
their parent blocks. For example, it can generate different hash value
for the second block of the following two requests:
```python
request1 = make_request(
        request_id=0,
        prompt_token_ids=[_ for _ in range(6)],
        mm_positions=[{
            "offset": 0,
            "length": 3
        }, {
            "offset": 3,
            "length": 3
        }],
        mm_hashes=["hash1", "hash2"],
    )
    request2 = make_request(
        request_id=1,
        prompt_token_ids=[_ for _ in range(6)],
        mm_positions=[{
            "offset": 0,
            "length": 3
        }, {
            "offset": 3,
            "length": 3
        }],
        mm_hashes=["hash3", "hash2"],
    )
```

---------

Signed-off-by: Chen Zhang <zhangch99@outlook.com>
---
 tests/v1/core/test_kv_cache_utils.py | 34 +++++++++++++++++++++++++++-
 vllm/v1/core/kv_cache_utils.py       |  6 +++--
 2 files changed, 37 insertions(+), 3 deletions(-)

diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py
index f4081766e39a2..0a5ba1f98221f 100644
--- a/tests/v1/core/test_kv_cache_utils.py
+++ b/tests/v1/core/test_kv_cache_utils.py
@@ -192,7 +192,7 @@ def test_hash_block_tokens():
                                    extra_keys)
     assert isinstance(block_hash, BlockHashType)
     assert block_hash.hash_value == hash(
-        (parent_block_hash, *curr_block_token_ids))
+        (parent_block_hash, curr_block_token_ids, extra_keys))
     assert block_hash.token_ids == curr_block_token_ids
     assert block_hash.extra_keys == extra_keys
 
@@ -227,6 +227,38 @@ def test_hash_request_tokens():
     assert block_hashes[1].extra_keys == ("hash2", )
 
 
+def test_hash_tokens_different_mm_input():
+    request1 = make_request(
+        request_id=0,
+        prompt_token_ids=[_ for _ in range(6)],
+        mm_positions=[{
+            "offset": 0,
+            "length": 3
+        }, {
+            "offset": 3,
+            "length": 3
+        }],
+        mm_hashes=["hash1", "hash2"],
+    )
+    request2 = make_request(
+        request_id=1,
+        prompt_token_ids=[_ for _ in range(6)],
+        mm_positions=[{
+            "offset": 0,
+            "length": 3
+        }, {
+            "offset": 3,
+            "length": 3
+        }],
+        mm_hashes=["hash3", "hash2"],
+    )
+    block_size = 3
+    block_hashes1 = hash_request_tokens(block_size, request1)
+    block_hashes2 = hash_request_tokens(block_size, request2)
+    assert block_hashes1[0] != block_hashes2[0]
+    assert block_hashes1[1] != block_hashes2[1]
+
+
 def test_hash_request_tokens_no_mm_inputs():
     request = make_request(
         request_id=0,
diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py
index dbdda51aedaa0..2b6557ad3ce66 100644
--- a/vllm/v1/core/kv_cache_utils.py
+++ b/vllm/v1/core/kv_cache_utils.py
@@ -262,8 +262,10 @@ def hash_block_tokens(
         The hash value of the block and the token ids in the block.
         The entire tuple is used as the hash key of the block.
     """
-    return BlockHashType(hash((parent_block_hash, *curr_block_token_ids)),
-                         tuple(curr_block_token_ids), extra_keys)
+    curr_block_token_ids_tuple = tuple(curr_block_token_ids)
+    return BlockHashType(
+        hash((parent_block_hash, curr_block_token_ids_tuple, extra_keys)),
+        curr_block_token_ids_tuple, extra_keys)
 
 
 def hash_request_tokens(block_size: int,

From 415f19474dedc69934cab79cfb8f5bdc19e2ae0d Mon Sep 17 00:00:00 2001
From: "Kevin H. Luu" <kevin@anyscale.com>
Date: Fri, 31 Jan 2025 13:39:36 -0800
Subject: [PATCH 448/462] [release] Add input step to ask for Release version
 (#12631)

Instead of having to create a new build with release version put in as
env var.
---
 .buildkite/release-pipeline.yaml | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml
index 51618a2955fb1..829414bf8a3ba 100644
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -56,6 +56,11 @@ steps:
     env:
       DOCKER_BUILDKIT: "1"
 
+  - input: "Provide Release version here"
+    fields:
+      - text: "What is the release version?"
+        key: "release-version"
+
   - block: "Build CPU release image"
     key: block-cpu-release-image-build
     depends_on: ~
@@ -66,7 +71,7 @@ steps:
       queue: cpu_queue_postmerge
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$RELEASE_VERSION --progress plain -f Dockerfile.cpu ."
-      - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$RELEASE_VERSION"
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --progress plain -f Dockerfile.cpu ."
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
     env:
       DOCKER_BUILDKIT: "1"

From 145c2ff648ad0a300f880ac38811d0d8a2eb3e79 Mon Sep 17 00:00:00 2001
From: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
Date: Fri, 31 Jan 2025 18:28:47 -0500
Subject: [PATCH 449/462] [Bugfix] Revert MoE Triton Config Default (#12629)

SUMMARY:
* previous PR for pulling in block configs also changed defaults
(https://github.com/vllm-project/vllm/pull/11589/files) for FP8
* this broke L4 MoE since there was not enough SHM for the default
configuration
* this reverts the non-block example to the default

Signed-off-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com>
---
 .../layers/fused_moe/fused_moe.py             | 41 +++++--------------
 1 file changed, 11 insertions(+), 30 deletions(-)

diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index 39607dc4ca11e..c966be99ed24f 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -660,36 +660,17 @@ def get_default_config(
     is_marlin: bool,
     block_shape: Optional[List[int]] = None,
 ) -> Dict[str, int]:
-    if dtype == "fp8_w8a8":
-        if block_shape is None:
-            config = {
-                "BLOCK_SIZE_M": 128,
-                "BLOCK_SIZE_N": 256,
-                "BLOCK_SIZE_K": 128,
-                "GROUP_SIZE_M": 32,
-                "num_warps": 8,
-                "num_stages": 4,
-            }
-            if M <= E:
-                config = {
-                    "BLOCK_SIZE_M": 64,
-                    "BLOCK_SIZE_N": 128,
-                    "BLOCK_SIZE_K": 128,
-                    "GROUP_SIZE_M": 1,
-                    "num_warps": 4,
-                    "num_stages": 4,
-                }
-        else:
-            # Block-wise quant: BLOCK_SIZE_N must be divisible by block_shape[0]
-            # BLOCK_SIZE_K must be divisible by block_shape[1]
-            config = {
-                "BLOCK_SIZE_M": 64,
-                "BLOCK_SIZE_N": block_shape[0],
-                "BLOCK_SIZE_K": block_shape[1],
-                "GROUP_SIZE_M": 32,
-                "num_warps": 4,
-                "num_stages": 3,
-            }
+    if dtype == "fp8_w8a8" and block_shape is not None:
+        # Block-wise quant: BLOCK_SIZE_N must be divisible by block_shape[0]
+        # BLOCK_SIZE_K must be divisible by block_shape[1]
+        config = {
+            "BLOCK_SIZE_M": 64,
+            "BLOCK_SIZE_N": block_shape[0],
+            "BLOCK_SIZE_K": block_shape[1],
+            "GROUP_SIZE_M": 32,
+            "num_warps": 4,
+            "num_stages": 3,
+        }
     else:
         config = {
             "BLOCK_SIZE_M": 64,

From eb5741ad422f04d0bac60c9b6c07183e0431ce8c Mon Sep 17 00:00:00 2001
From: Tyler Michael Smith <tyler@neuralmagic.com>
Date: Fri, 31 Jan 2025 18:29:11 -0500
Subject: [PATCH 450/462] [Kernel][Quantization] Integrate block-quantized
 CUTLASS kernels for DeepSeekV3 (#12587)

Integrates the block-quantized kernels introduced in
https://github.com/vllm-project/vllm/pull/11868 for use in linear
layers.

Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
---
 csrc/ops.h                                    |   1 +
 .../cutlass_w8a8/scaled_mm_c3x.cu             |   8 +-
 .../cutlass_w8a8/scaled_mm_entry.cu           |  15 +-
 csrc/torch_bindings.cpp                       |   7 +
 vllm/_custom_ops.py                           |   5 +
 .../model_executor/layers/quantization/fp8.py |   5 +-
 .../layers/quantization/utils/fp8_utils.py    | 146 ++++++++++++++----
 .../layers/quantization/utils/w8a8_utils.py   |  10 ++
 8 files changed, 160 insertions(+), 37 deletions(-)

diff --git a/csrc/ops.h b/csrc/ops.h
index 346898964010d..e39d4ef3188a3 100644
--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -153,6 +153,7 @@ torch::Tensor ggml_mul_mat_a8(torch::Tensor W, torch::Tensor X, int64_t type,
 
 #ifndef USE_ROCM
 bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability);
+bool cutlass_scaled_mm_supports_block_fp8(int64_t cuda_device_capability);
 
 void cutlass_scaled_mm(torch::Tensor& out, torch::Tensor const& a,
                        torch::Tensor const& b, torch::Tensor const& a_scales,
diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu
index e6f06d72fbfd4..72d549e597df5 100644
--- a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu
+++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu
@@ -58,7 +58,13 @@ void cutlass_scaled_mm_sm90(torch::Tensor& c, torch::Tensor const& a,
 
     vllm::cutlass_scaled_mm_blockwise_sm90_fp8(c, a, b, a_scales, b_scales);
   } else {
-    TORCH_CHECK(false, "Unsupported scale group shapes for CUTLASS 3.x GEMM");
+    TORCH_CHECK(false,
+                "Unsupported scale group shapes for CUTLASS 3.x GEMM.\n "
+                "a_scale_group_shape must be [1, 128], got: [",
+                a_scale_group_shape[0], ", ", a_scale_group_shape[1],
+                "]\n"
+                "b_scale_group_shape must be [128, 128], got: [",
+                b_scale_group_shape[0], ", ", b_scale_group_shape[1], "]");
   }
 }
 
diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
index da77312bc4b98..6bef55088682a 100644
--- a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
+++ b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
@@ -81,6 +81,19 @@ bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability) {
   return false;
 }
 
+bool cutlass_scaled_mm_supports_block_fp8(int64_t cuda_device_capability) {
+  // CUTLASS block-quantized FP8 kernels need at least CUDA 12.0
+  // and at least SM90 (Hopper)
+
+#if defined CUDA_VERSION
+  if (cuda_device_capability >= 90) {
+    return CUDA_VERSION >= 12000;
+  }
+#endif
+
+  return false;
+}
+
 void cutlass_scaled_mm(torch::Tensor& c, torch::Tensor const& a,
                        torch::Tensor const& b, torch::Tensor const& a_scales,
                        torch::Tensor const& b_scales,
@@ -212,4 +225,4 @@ void cutlass_scaled_mm_azp(torch::Tensor& c, torch::Tensor const& a,
       "No compiled cutlass_scaled_mm_azp for a compute capability less than "
       "CUDA device capability: ",
       version_num);
-}
\ No newline at end of file
+}
diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp
index 1846d9ac29943..186e9c0e81b77 100644
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@@ -324,6 +324,13 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
   ops.def("cutlass_scaled_mm_supports_fp8(int cuda_device_capability) -> bool");
   ops.impl("cutlass_scaled_mm_supports_fp8", &cutlass_scaled_mm_supports_fp8);
 
+  // Check if cutlass scaled_mm supports block quantization (used by DeepSeekV3)
+  ops.def(
+      "cutlass_scaled_mm_supports_block_fp8(int cuda_device_capability) -> "
+      "bool");
+  ops.impl("cutlass_scaled_mm_supports_block_fp8",
+           &cutlass_scaled_mm_supports_fp8);
+
   // Check if cutlass sparse scaled_mm is supported for CUDA devices of the
   // given capability
   ops.def(
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index fd94134de0219..da237da2eccac 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -435,6 +435,11 @@ def cutlass_scaled_mm_supports_fp8(cuda_device_capability: int) -> bool:
     return torch.ops._C.cutlass_scaled_mm_supports_fp8(cuda_device_capability)
 
 
+def cutlass_scaled_mm_supports_block_fp8(cuda_device_capability: int) -> bool:
+    return torch.ops._C.cutlass_scaled_mm_supports_block_fp8(
+        cuda_device_capability)
+
+
 def cutlass_scaled_mm(a: torch.Tensor,
                       b: torch.Tensor,
                       scale_a: torch.Tensor,
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index 57dd6e310297d..adab1973b40ee 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -21,7 +21,8 @@
     is_layer_skipped)
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
     all_close_1d, apply_fp8_linear, convert_to_channelwise,
-    cutlass_fp8_supported, normalize_e4m3fn_to_e4m3fnuz, per_tensor_dequantize,
+    cutlass_block_fp8_supported, cutlass_fp8_supported,
+    normalize_e4m3fn_to_e4m3fnuz, per_tensor_dequantize,
     requantize_with_max_scale)
 from vllm.model_executor.parameter import (BlockQuantScaleParameter,
                                            ModelWeightParameter,
@@ -133,6 +134,7 @@ class Fp8LinearMethod(LinearMethodBase):
     def __init__(self, quant_config: Fp8Config):
         self.quant_config = quant_config
         self.cutlass_fp8_supported = cutlass_fp8_supported()
+        self.cutlass_block_fp8_supported = cutlass_block_fp8_supported()
 
         # For GPUs that lack FP8 hardware support, we can leverage the Marlin
         # kernel for fast weight-only FP8 quantization
@@ -359,6 +361,7 @@ def apply(self,
                 weight_scale=layer.weight_scale_inv,
                 input_scale=layer.input_scale,
                 bias=bias,
+                cutlass_block_fp8_supported=self.cutlass_block_fp8_supported,
             )
 
         return apply_fp8_linear(
diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
index a7a3fa6601639..ccebff341a7ed 100644
--- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
@@ -8,6 +8,7 @@
 import triton
 import triton.language as tl
 
+from vllm import _custom_ops as ops
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
 
@@ -21,20 +22,34 @@ def apply_w8a8_block_fp8_linear(
     weight_scale: torch.Tensor,
     input_scale: Optional[torch.Tensor] = None,
     bias: Optional[torch.Tensor] = None,
+    cutlass_block_fp8_supported: bool = True,
 ) -> torch.Tensor:
     assert input_scale is None
     # View input as 2D matrix for fp8 methods
     input_2d = input.view(-1, input.shape[-1])
     output_shape = [*input.shape[:-1], weight.shape[0]]
 
-    q_input, x_scale = per_token_group_quant_fp8(input_2d, block_size[1])
-    output = w8a8_block_fp8_matmul(q_input,
-                                   weight,
-                                   x_scale,
-                                   weight_scale,
-                                   block_size,
-                                   output_dtype=input.dtype)
-
+    shape_supported_by_cutlass = (weight.shape[0] % 128 == 0
+                                  and weight.shape[1] % 128 == 0)
+    if cutlass_block_fp8_supported and shape_supported_by_cutlass:
+        q_input, x_scale = per_token_group_quant_fp8(input_2d,
+                                                     block_size[1],
+                                                     column_major_scales=True)
+        output = ops.cutlass_scaled_mm(q_input,
+                                       weight.T,
+                                       out_dtype=input.dtype,
+                                       scale_a=x_scale,
+                                       scale_b=weight_scale.T)
+    else:
+        q_input, x_scale = per_token_group_quant_fp8(input_2d,
+                                                     block_size[1],
+                                                     column_major_scales=False)
+        output = w8a8_block_fp8_matmul(q_input,
+                                       weight,
+                                       x_scale,
+                                       weight_scale,
+                                       block_size,
+                                       output_dtype=input.dtype)
     if bias is not None:
         output = output + bias
     return output.to(dtype=input.dtype).view(*output_shape)
@@ -98,10 +113,7 @@ def _per_token_group_quant_fp8(
     y_ptr,
     y_q_ptr,
     y_s_ptr,
-    # Stride of input
-    y_stride,
-    # Columns of input
-    N,
+    group_size,
     # Avoid to divide zero
     eps,
     # Information for float8
@@ -116,12 +128,60 @@ def _per_token_group_quant_fp8(
     """
     # Map the program id to the row of X and Y it should compute.
     g_id = tl.program_id(0)
-    y_ptr += g_id * y_stride
-    y_q_ptr += g_id * y_stride
+    y_ptr += g_id * group_size
+    y_q_ptr += g_id * group_size
     y_s_ptr += g_id
 
     cols = tl.arange(0, BLOCK)  # N <= BLOCK
-    mask = cols < N
+    mask = cols < group_size
+
+    y = tl.load(y_ptr + cols, mask=mask, other=0.0).to(tl.float32)
+    # Quant
+    _absmax = tl.maximum(tl.max(tl.abs(y)), eps)
+    y_s = _absmax / fp8_max
+    y_q = tl.clamp(y / y_s, fp8_min, fp8_max).to(y_q_ptr.dtype.element_ty)
+
+    tl.store(y_q_ptr + cols, y_q, mask=mask)
+    tl.store(y_s_ptr, y_s)
+
+
+@triton.jit
+def _per_token_group_quant_fp8_colmajor(
+    # Pointers to inputs and output
+    y_ptr,
+    y_q_ptr,
+    y_s_ptr,
+    group_size,
+    # Num columns of y
+    y_num_columns,
+    # Stride from one column to the next of y_s
+    y_s_col_stride,
+    # Avoid to divide zero
+    eps,
+    # Information for float8
+    fp8_min,
+    fp8_max,
+    # Meta-parameters
+    BLOCK: tl.constexpr,
+):
+    """A Triton-accelerated function to perform per-token-group
+    quantization on a tensor.
+    This function converts the tensor values into float8 values.
+    """
+    # Map the program id to the row of X and Y it should compute.
+    g_id = tl.program_id(0)
+    y_ptr += g_id * group_size
+    y_q_ptr += g_id * group_size
+
+    # Convert g_id the flattened block coordinate to 2D so we can index
+    # into the output y_scales matrix
+    blocks_per_row = y_num_columns // group_size
+    scale_col = g_id % blocks_per_row
+    scale_row = g_id // blocks_per_row
+    y_s_ptr += scale_col * y_s_col_stride + scale_row
+
+    cols = tl.arange(0, BLOCK)  # group_size <= BLOCK
+    mask = cols < group_size
 
     y = tl.load(y_ptr + cols, mask=mask, other=0.0).to(tl.float32)
     # Quant
@@ -138,12 +198,13 @@ def per_token_group_quant_fp8(
     group_size: int,
     eps: float = 1e-10,
     dtype: Optional[torch.dtype] = None,
+    column_major_scales: bool = False,
 ) -> Tuple[torch.Tensor, torch.Tensor]:
     """Function to perform per-token-group quantization on an input tensor `x`.
     It converts the tensor values into signed float8 values and returns the
     quantized tensor along with the scaling factor used for quantization.
     Args:
-        x: The input tenosr with ndim >= 2.
+        x: The input tensor with ndim >= 2.
         group_size: The group size used for quantization.
         eps: The minimum to avoid dividing zero.
         dtype: The dype of output tensor. Note that only `torch.float8_e4m3fn`
@@ -167,29 +228,46 @@ def per_token_group_quant_fp8(
     x_q = torch.empty_like(x, device=x.device, dtype=dtype)
     M = x.numel() // group_size
     N = group_size
-    x_s = torch.empty(
-        x.shape[:-1] + (x.shape[-1] // group_size, ),
-        device=x.device,
-        dtype=torch.float32,
-    )
+    if column_major_scales:
+        shape = (x.shape[-1] // group_size, ) + x.shape[:-1]
+        x_s = torch.empty(shape, device=x.device,
+                          dtype=torch.float32).permute(-1, -2)
+    else:
+        shape = x.shape[:-1] + (x.shape[-1] // group_size, )
+        x_s = torch.empty(shape, device=x.device, dtype=torch.float32)
 
     BLOCK = triton.next_power_of_2(N)
     # heuristics for number of warps
     num_warps = min(max(BLOCK // 256, 1), 8)
     num_stages = 1
-    _per_token_group_quant_fp8[(M, )](
-        x,
-        x_q,
-        x_s,
-        group_size,
-        N,
-        eps,
-        fp8_min=fp8_min,
-        fp8_max=fp8_max,
-        BLOCK=BLOCK,
-        num_warps=num_warps,
-        num_stages=num_stages,
-    )
+    if column_major_scales:
+        _per_token_group_quant_fp8_colmajor[(M, )](
+            x,
+            x_q,
+            x_s,
+            group_size,
+            x.shape[1],
+            x_s.stride(1),
+            eps,
+            fp8_min=fp8_min,
+            fp8_max=fp8_max,
+            BLOCK=BLOCK,
+            num_warps=num_warps,
+            num_stages=num_stages,
+        )
+    else:
+        _per_token_group_quant_fp8[(M, )](
+            x,
+            x_q,
+            x_s,
+            group_size,
+            eps,
+            fp8_min=fp8_min,
+            fp8_max=fp8_max,
+            BLOCK=BLOCK,
+            num_warps=num_warps,
+            num_stages=num_stages,
+        )
 
     return x_q, x_s
 
diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
index 9977804188a50..3af3b3e0ea942 100644
--- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
@@ -30,6 +30,16 @@ def cutlass_fp8_supported() -> bool:
     return ops.cutlass_scaled_mm_supports_fp8(capability)
 
 
+def cutlass_block_fp8_supported() -> bool:
+    if not current_platform.is_cuda():
+        return False
+
+    capability_tuple = current_platform.get_device_capability()
+    capability = -1 if capability_tuple is None else capability_tuple.to_int()
+
+    return ops.cutlass_scaled_mm_supports_block_fp8(capability)
+
+
 def per_tensor_dequantize(
         tensor: torch.Tensor, inv_scale: Union[float,
                                                torch.Tensor]) -> torch.Tensor:

From fc542144c4477ffec1d3de6fa43e54f8fb5351e8 Mon Sep 17 00:00:00 2001
From: Ryan Nguyen <96593302+xpbowler@users.noreply.github.com>
Date: Fri, 31 Jan 2025 18:37:30 -0500
Subject: [PATCH 451/462] [Feature] Fix guided decoding blocking bitmask memcpy
 (#12563)

**[Guided decoding performance optimization]** Sending the guided
decoding bitmask in xgrammar to the GPU
(`self.token_bitmask.to(scores.device)`) is a blocking operation that
prevents the CPU from pre-launching the sampler kernels. The CPU waits
until decode is complete, then copies the bitmask over. This PR changes
the operation to async via setting `non-blocking=True`.

(Current) The CPU is blocked on a `cudaStreamSynchronize` and only
pre-empts the sampling kernels after bitmask application. Below is the
Nsys profile for one decode phase from Llama 3.1 8B.

![image](https://github.com/user-attachments/assets/8997eae1-b822-4f52-beb8-ef19a7c6b824)

With the optimization, this is no longer the case:

![image](https://github.com/user-attachments/assets/6d5ea83f-f169-4f98-a8c1-41c719b3e1e7)

---------

Signed-off-by: Ryan N <ryan.nguyen@centml.ai>
---
 vllm/model_executor/guided_decoding/xgrammar_decoding.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/guided_decoding/xgrammar_decoding.py b/vllm/model_executor/guided_decoding/xgrammar_decoding.py
index 2d8594cb8aafa..ee30ce96f0a1e 100644
--- a/vllm/model_executor/guided_decoding/xgrammar_decoding.py
+++ b/vllm/model_executor/guided_decoding/xgrammar_decoding.py
@@ -307,8 +307,8 @@ def __call__(self, input_ids: list[int],
         # Note: In this method, if the tensors have different dimensions
         # on CPU device fails, but on GPU it runs without error. Hence the
         # unsqueeze above for scores, to match the token bitmask shape
-        xgr.apply_token_bitmask_inplace(scores,
-                                        self.token_bitmask.to(scores.device))
+        xgr.apply_token_bitmask_inplace(
+            scores, self.token_bitmask.to(scores.device, non_blocking=True))
         if device_type != "cuda":
             scores = scores.to(dtype).to(device_type).squeeze()
 

From 60808bd4c7a27b6d28f82657e38a5b303f7534a9 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Fri, 31 Jan 2025 23:38:35 +0000
Subject: [PATCH 452/462] [Doc] Improve installation signposting (#12575)

- Make device tab names more explicit
- Add comprehensive list of devices to
https://docs.vllm.ai/en/latest/getting_started/installation/index.html
- Add `attention` blocks to the intro of all devices that don't have
pre-built wheels/images

---------

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 .../ai_accelerator/hpu-gaudi.inc.md           |  4 ++
 .../installation/ai_accelerator/index.md      | 33 +++++++------
 .../installation/ai_accelerator/neuron.inc.md |  4 ++
 .../ai_accelerator/openvino.inc.md            |  4 ++
 .../installation/ai_accelerator/tpu.inc.md    |  4 ++
 .../installation/cpu/apple.inc.md             |  4 ++
 .../installation/cpu/arm.inc.md               |  4 ++
 .../getting_started/installation/cpu/index.md | 13 ++---
 .../installation/cpu/x86.inc.md               | 12 +++--
 .../getting_started/installation/gpu/index.md | 49 ++++++++++---------
 .../installation/gpu/rocm.inc.md              | 20 ++++----
 .../installation/gpu/xpu.inc.md               |  4 ++
 .../getting_started/installation/index.md     | 15 ++++++
 13 files changed, 111 insertions(+), 59 deletions(-)

diff --git a/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md b/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md
index 704a16233981f..f3b0d6dc9bdc8 100644
--- a/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md
+++ b/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md
@@ -2,6 +2,10 @@
 
 This tab provides instructions on running vLLM with Intel Gaudi devices.
 
+:::{attention}
+There are no pre-built wheels or images for this device, so you must build vLLM from source.
+:::
+
 ## Requirements
 
 - OS: Ubuntu 22.04 LTS
diff --git a/docs/source/getting_started/installation/ai_accelerator/index.md b/docs/source/getting_started/installation/ai_accelerator/index.md
index 88352f639567b..01793572fee7c 100644
--- a/docs/source/getting_started/installation/ai_accelerator/index.md
+++ b/docs/source/getting_started/installation/ai_accelerator/index.md
@@ -5,7 +5,8 @@ vLLM is a Python library that supports the following AI accelerators. Select you
 :::::{tab-set}
 :sync-group: device
 
-::::{tab-item} TPU
+::::{tab-item} Google TPU
+:selected:
 :sync: tpu
 
 :::{include} tpu.inc.md
@@ -25,7 +26,7 @@ vLLM is a Python library that supports the following AI accelerators. Select you
 
 ::::
 
-::::{tab-item} Neuron
+::::{tab-item} AWS Neuron
 :sync: neuron
 
 :::{include} neuron.inc.md
@@ -52,7 +53,7 @@ vLLM is a Python library that supports the following AI accelerators. Select you
 :::::{tab-set}
 :sync-group: device
 
-::::{tab-item} TPU
+::::{tab-item} Google TPU
 :sync: tpu
 
 :::{include} tpu.inc.md
@@ -72,7 +73,7 @@ vLLM is a Python library that supports the following AI accelerators. Select you
 
 ::::
 
-::::{tab-item} Neuron
+::::{tab-item} AWS Neuron
 :sync: neuron
 
 :::{include} neuron.inc.md
@@ -99,7 +100,7 @@ vLLM is a Python library that supports the following AI accelerators. Select you
 :::::{tab-set}
 :sync-group: device
 
-::::{tab-item} TPU
+::::{tab-item} Google TPU
 :sync: tpu
 
 :::{include} tpu.inc.md
@@ -119,7 +120,7 @@ vLLM is a Python library that supports the following AI accelerators. Select you
 
 ::::
 
-::::{tab-item} Neuron
+::::{tab-item} AWS Neuron
 :sync: neuron
 
 :::{include} neuron.inc.md
@@ -146,7 +147,7 @@ vLLM is a Python library that supports the following AI accelerators. Select you
 :::::{tab-set}
 :sync-group: device
 
-::::{tab-item} TPU
+::::{tab-item} Google TPU
 :sync: tpu
 
 :::{include} tpu.inc.md
@@ -166,7 +167,7 @@ vLLM is a Python library that supports the following AI accelerators. Select you
 
 ::::
 
-::::{tab-item} Neuron
+::::{tab-item} AWS Neuron
 :sync: neuron
 
 :::{include} neuron.inc.md
@@ -193,7 +194,7 @@ vLLM is a Python library that supports the following AI accelerators. Select you
 :::::{tab-set}
 :sync-group: device
 
-::::{tab-item} TPU
+::::{tab-item} Google TPU
 :sync: tpu
 
 :::{include} tpu.inc.md
@@ -213,7 +214,7 @@ vLLM is a Python library that supports the following AI accelerators. Select you
 
 ::::
 
-::::{tab-item} Neuron
+::::{tab-item} AWS Neuron
 :sync: neuron
 
 :::{include} neuron.inc.md
@@ -242,7 +243,7 @@ vLLM is a Python library that supports the following AI accelerators. Select you
 :::::{tab-set}
 :sync-group: device
 
-::::{tab-item} TPU
+::::{tab-item} Google TPU
 :sync: tpu
 
 :::{include} tpu.inc.md
@@ -262,7 +263,7 @@ vLLM is a Python library that supports the following AI accelerators. Select you
 
 ::::
 
-::::{tab-item} Neuron
+::::{tab-item} AWS Neuron
 :sync: neuron
 
 :::{include} neuron.inc.md
@@ -289,7 +290,7 @@ vLLM is a Python library that supports the following AI accelerators. Select you
 :::::{tab-set}
 :sync-group: device
 
-::::{tab-item} TPU
+::::{tab-item} Google TPU
 :sync: tpu
 
 :::{include} tpu.inc.md
@@ -309,7 +310,7 @@ vLLM is a Python library that supports the following AI accelerators. Select you
 
 ::::
 
-::::{tab-item} Neuron
+::::{tab-item} AWS Neuron
 :sync: neuron
 
 :::{include} neuron.inc.md
@@ -336,7 +337,7 @@ vLLM is a Python library that supports the following AI accelerators. Select you
 :::::{tab-set}
 :sync-group: device
 
-::::{tab-item} TPU
+::::{tab-item} Google TPU
 :sync: tpu
 
 :::{include} tpu.inc.md
@@ -354,7 +355,7 @@ vLLM is a Python library that supports the following AI accelerators. Select you
 
 ::::
 
-::::{tab-item} Neuron
+::::{tab-item} AWS Neuron
 :sync: neuron
 
 :::{include} neuron.inc.md
diff --git a/docs/source/getting_started/installation/ai_accelerator/neuron.inc.md b/docs/source/getting_started/installation/ai_accelerator/neuron.inc.md
index 145cc9d668efd..f149818acafb8 100644
--- a/docs/source/getting_started/installation/ai_accelerator/neuron.inc.md
+++ b/docs/source/getting_started/installation/ai_accelerator/neuron.inc.md
@@ -4,6 +4,10 @@ vLLM 0.3.3 onwards supports model inferencing and serving on AWS Trainium/Infere
 Paged Attention and Chunked Prefill are currently in development and will be available soon.
 Data types currently supported in Neuron SDK are FP16 and BF16.
 
+:::{attention}
+There are no pre-built wheels or images for this device, so you must build vLLM from source.
+:::
+
 ## Requirements
 
 - OS: Linux
diff --git a/docs/source/getting_started/installation/ai_accelerator/openvino.inc.md b/docs/source/getting_started/installation/ai_accelerator/openvino.inc.md
index a7867472583d6..112e8d4d9b256 100644
--- a/docs/source/getting_started/installation/ai_accelerator/openvino.inc.md
+++ b/docs/source/getting_started/installation/ai_accelerator/openvino.inc.md
@@ -2,6 +2,10 @@
 
 vLLM powered by OpenVINO supports all LLM models from [vLLM supported models list](#supported-models) and can perform optimal model serving on all x86-64 CPUs with, at least, AVX2 support, as well as on both integrated and discrete Intel® GPUs ([the list of supported GPUs](https://docs.openvino.ai/2024/about-openvino/release-notes-openvino/system-requirements.html#gpu)).
 
+:::{attention}
+There are no pre-built wheels or images for this device, so you must build vLLM from source.
+:::
+
 ## Requirements
 
 - OS: Linux
diff --git a/docs/source/getting_started/installation/ai_accelerator/tpu.inc.md b/docs/source/getting_started/installation/ai_accelerator/tpu.inc.md
index 6827afc805fd8..c0d50feafce56 100644
--- a/docs/source/getting_started/installation/ai_accelerator/tpu.inc.md
+++ b/docs/source/getting_started/installation/ai_accelerator/tpu.inc.md
@@ -30,6 +30,10 @@ For TPU pricing information, see [Cloud TPU pricing](https://cloud.google.com/tp
 You may need additional persistent storage for your TPU VMs. For more
 information, see [Storage options for Cloud TPU data](https://cloud.devsite.corp.google.com/tpu/docs/storage-options).
 
+:::{attention}
+There are no pre-built wheels for this device, so you must either use the pre-built Docker image or build vLLM from source.
+:::
+
 ## Requirements
 
 - Google Cloud TPU VM
diff --git a/docs/source/getting_started/installation/cpu/apple.inc.md b/docs/source/getting_started/installation/cpu/apple.inc.md
index 0808b869fdb7b..3bf1d47fa0ff9 100644
--- a/docs/source/getting_started/installation/cpu/apple.inc.md
+++ b/docs/source/getting_started/installation/cpu/apple.inc.md
@@ -4,6 +4,10 @@ vLLM has experimental support for macOS with Apple silicon. For now, users shall
 
 Currently the CPU implementation for macOS supports FP32 and FP16 datatypes.
 
+:::{attention}
+There are no pre-built wheels or images for this device, so you must build vLLM from source.
+:::
+
 ## Requirements
 
 - OS: `macOS Sonoma` or later
diff --git a/docs/source/getting_started/installation/cpu/arm.inc.md b/docs/source/getting_started/installation/cpu/arm.inc.md
index 08a764e1a25f4..a661a0ca5adc7 100644
--- a/docs/source/getting_started/installation/cpu/arm.inc.md
+++ b/docs/source/getting_started/installation/cpu/arm.inc.md
@@ -4,6 +4,10 @@ vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CP
 
 ARM CPU backend currently supports Float32, FP16 and BFloat16 datatypes.
 
+:::{attention}
+There are no pre-built wheels or images for this device, so you must build vLLM from source.
+:::
+
 ## Requirements
 
 - OS: Linux
diff --git a/docs/source/getting_started/installation/cpu/index.md b/docs/source/getting_started/installation/cpu/index.md
index 2f549ede0cf48..d53430403583c 100644
--- a/docs/source/getting_started/installation/cpu/index.md
+++ b/docs/source/getting_started/installation/cpu/index.md
@@ -5,7 +5,8 @@ vLLM is a Python library that supports the following CPU variants. Select your C
 :::::{tab-set}
 :sync-group: device
 
-::::{tab-item} x86
+::::{tab-item} Intel/AMD x86
+:selected:
 :sync: x86
 
 :::{include} x86.inc.md
@@ -15,7 +16,7 @@ vLLM is a Python library that supports the following CPU variants. Select your C
 
 ::::
 
-::::{tab-item} ARM
+::::{tab-item} ARM AArch64
 :sync: arm
 
 :::{include} arm.inc.md
@@ -44,7 +45,7 @@ vLLM is a Python library that supports the following CPU variants. Select your C
 :::::{tab-set}
 :sync-group: device
 
-::::{tab-item} x86
+::::{tab-item} Intel/AMD x86
 :sync: x86
 
 :::{include} x86.inc.md
@@ -54,7 +55,7 @@ vLLM is a Python library that supports the following CPU variants. Select your C
 
 ::::
 
-::::{tab-item} ARM
+::::{tab-item} ARM AArch64
 :sync: arm
 
 :::{include} arm.inc.md
@@ -92,7 +93,7 @@ Currently, there are no pre-built CPU wheels.
 :::::{tab-set}
 :sync-group: device
 
-::::{tab-item} x86
+::::{tab-item} Intel/AMD x86
 :sync: x86
 
 :::{include} x86.inc.md
@@ -102,7 +103,7 @@ Currently, there are no pre-built CPU wheels.
 
 ::::
 
-::::{tab-item} ARM
+::::{tab-item} ARM AArch64
 :sync: arm
 
 :::{include} arm.inc.md
diff --git a/docs/source/getting_started/installation/cpu/x86.inc.md b/docs/source/getting_started/installation/cpu/x86.inc.md
index f146ae0918b44..1dafc3660060e 100644
--- a/docs/source/getting_started/installation/cpu/x86.inc.md
+++ b/docs/source/getting_started/installation/cpu/x86.inc.md
@@ -2,12 +2,20 @@
 
 vLLM initially supports basic model inferencing and serving on x86 CPU platform, with data types FP32, FP16 and BF16.
 
+:::{attention}
+There are no pre-built wheels or images for this device, so you must build vLLM from source.
+:::
+
 ## Requirements
 
 - OS: Linux
 - Compiler: `gcc/g++ >= 12.3.0` (optional, recommended)
 - Instruction Set Architecture (ISA): AVX512 (optional, recommended)
 
+:::{tip}
+[Intel Extension for PyTorch (IPEX)](https://github.com/intel/intel-extension-for-pytorch) extends PyTorch with up-to-date features optimizations for an extra performance boost on Intel hardware.
+:::
+
 ## Set up using Python
 
 ### Pre-built wheels
@@ -29,7 +37,3 @@ vLLM initially supports basic model inferencing and serving on x86 CPU platform,
 ### Build image from source
 
 ## Extra information
-
-## Intel Extension for PyTorch
-
-- [Intel Extension for PyTorch (IPEX)](https://github.com/intel/intel-extension-for-pytorch) extends PyTorch with up-to-date features optimizations for an extra performance boost on Intel hardware.
diff --git a/docs/source/getting_started/installation/gpu/index.md b/docs/source/getting_started/installation/gpu/index.md
index 0a61f889753a3..f82c4bda28620 100644
--- a/docs/source/getting_started/installation/gpu/index.md
+++ b/docs/source/getting_started/installation/gpu/index.md
@@ -5,7 +5,8 @@ vLLM is a Python library that supports the following GPU variants. Select your G
 :::::{tab-set}
 :sync-group: device
 
-::::{tab-item} CUDA
+::::{tab-item} NVIDIA CUDA
+:selected:
 :sync: cuda
 
 :::{include} cuda.inc.md
@@ -15,7 +16,7 @@ vLLM is a Python library that supports the following GPU variants. Select your G
 
 ::::
 
-::::{tab-item} ROCm
+::::{tab-item} AMD ROCm
 :sync: rocm
 
 :::{include} rocm.inc.md
@@ -25,7 +26,7 @@ vLLM is a Python library that supports the following GPU variants. Select your G
 
 ::::
 
-::::{tab-item} XPU
+::::{tab-item} Intel XPU
 :sync: xpu
 
 :::{include} xpu.inc.md
@@ -45,7 +46,7 @@ vLLM is a Python library that supports the following GPU variants. Select your G
 :::::{tab-set}
 :sync-group: device
 
-::::{tab-item} CUDA
+::::{tab-item} NVIDIA CUDA
 :sync: cuda
 
 :::{include} cuda.inc.md
@@ -55,7 +56,7 @@ vLLM is a Python library that supports the following GPU variants. Select your G
 
 ::::
 
-::::{tab-item} ROCm
+::::{tab-item} AMD ROCm
 :sync: rocm
 
 :::{include} rocm.inc.md
@@ -65,7 +66,7 @@ vLLM is a Python library that supports the following GPU variants. Select your G
 
 ::::
 
-::::{tab-item} XPU
+::::{tab-item} Intel XPU
 :sync: xpu
 
 :::{include} xpu.inc.md
@@ -87,7 +88,7 @@ vLLM is a Python library that supports the following GPU variants. Select your G
 :::::{tab-set}
 :sync-group: device
 
-::::{tab-item} CUDA
+::::{tab-item} NVIDIA CUDA
 :sync: cuda
 
 :::{include} cuda.inc.md
@@ -97,14 +98,14 @@ vLLM is a Python library that supports the following GPU variants. Select your G
 
 ::::
 
-::::{tab-item} ROCm
+::::{tab-item} AMD ROCm
 :sync: rocm
 
 There is no extra information on creating a new Python environment for this device.
 
 ::::
 
-::::{tab-item} XPU
+::::{tab-item} Intel XPU
 :sync: xpu
 
 There is no extra information on creating a new Python environment for this device.
@@ -118,7 +119,7 @@ There is no extra information on creating a new Python environment for this devi
 :::::{tab-set}
 :sync-group: device
 
-::::{tab-item} CUDA
+::::{tab-item} NVIDIA CUDA
 :sync: cuda
 
 :::{include} cuda.inc.md
@@ -128,7 +129,7 @@ There is no extra information on creating a new Python environment for this devi
 
 ::::
 
-::::{tab-item} ROCm
+::::{tab-item} AMD ROCm
 :sync: rocm
 
 :::{include} rocm.inc.md
@@ -138,7 +139,7 @@ There is no extra information on creating a new Python environment for this devi
 
 ::::
 
-::::{tab-item} XPU
+::::{tab-item} Intel XPU
 :sync: xpu
 
 :::{include} xpu.inc.md
@@ -157,7 +158,7 @@ There is no extra information on creating a new Python environment for this devi
 :::::{tab-set}
 :sync-group: device
 
-::::{tab-item} CUDA
+::::{tab-item} NVIDIA CUDA
 :sync: cuda
 
 :::{include} cuda.inc.md
@@ -167,7 +168,7 @@ There is no extra information on creating a new Python environment for this devi
 
 ::::
 
-::::{tab-item} ROCm
+::::{tab-item} AMD ROCm
 :sync: rocm
 
 :::{include} rocm.inc.md
@@ -177,7 +178,7 @@ There is no extra information on creating a new Python environment for this devi
 
 ::::
 
-::::{tab-item} XPU
+::::{tab-item} Intel XPU
 :sync: xpu
 
 :::{include} xpu.inc.md
@@ -196,7 +197,7 @@ There is no extra information on creating a new Python environment for this devi
 :::::{tab-set}
 :sync-group: device
 
-::::{tab-item} CUDA
+::::{tab-item} NVIDIA CUDA
 :sync: cuda
 
 :::{include} cuda.inc.md
@@ -206,7 +207,7 @@ There is no extra information on creating a new Python environment for this devi
 
 ::::
 
-::::{tab-item} ROCm
+::::{tab-item} AMD ROCm
 :sync: rocm
 
 :::{include} rocm.inc.md
@@ -216,7 +217,7 @@ There is no extra information on creating a new Python environment for this devi
 
 ::::
 
-::::{tab-item} XPU
+::::{tab-item} Intel XPU
 :sync: xpu
 
 :::{include} xpu.inc.md
@@ -233,7 +234,7 @@ There is no extra information on creating a new Python environment for this devi
 :::::{tab-set}
 :sync-group: device
 
-::::{tab-item} CUDA
+::::{tab-item} NVIDIA CUDA
 :sync: cuda
 
 :::{include} cuda.inc.md
@@ -243,7 +244,7 @@ There is no extra information on creating a new Python environment for this devi
 
 ::::
 
-::::{tab-item} ROCm
+::::{tab-item} AMD ROCm
 :sync: rocm
 
 :::{include} rocm.inc.md
@@ -253,7 +254,7 @@ There is no extra information on creating a new Python environment for this devi
 
 ::::
 
-::::{tab-item} XPU
+::::{tab-item} Intel XPU
 :sync: xpu
 
 :::{include} xpu.inc.md
@@ -270,7 +271,7 @@ There is no extra information on creating a new Python environment for this devi
 :::::{tab-set}
 :sync-group: device
 
-::::{tab-item} CUDA
+::::{tab-item} NVIDIA CUDA
 :sync: cuda
 
 :::{include} cuda.inc.md
@@ -279,7 +280,7 @@ There is no extra information on creating a new Python environment for this devi
 
 ::::
 
-::::{tab-item} ROCm
+::::{tab-item} AMD ROCm
 :sync: rocm
 
 :::{include} rocm.inc.md
@@ -288,7 +289,7 @@ There is no extra information on creating a new Python environment for this devi
 
 ::::
 
-::::{tab-item} XPU
+::::{tab-item} Intel XPU
 :sync: xpu
 
 :::{include} xpu.inc.md
diff --git a/docs/source/getting_started/installation/gpu/rocm.inc.md b/docs/source/getting_started/installation/gpu/rocm.inc.md
index 131ad1704ea11..c8fd11415cfda 100644
--- a/docs/source/getting_started/installation/gpu/rocm.inc.md
+++ b/docs/source/getting_started/installation/gpu/rocm.inc.md
@@ -2,6 +2,10 @@
 
 vLLM supports AMD GPUs with ROCm 6.2.
 
+:::{attention}
+There are no pre-built wheels for this device, so you must either use the pre-built Docker image or build vLLM from source.
+:::
+
 ## Requirements
 
 - GPU: MI200s (gfx90a), MI300 (gfx942), Radeon RX 7900 series (gfx1100)
@@ -13,14 +17,6 @@ vLLM supports AMD GPUs with ROCm 6.2.
 
 Currently, there are no pre-built ROCm wheels.
 
-However, the [AMD Infinity hub for vLLM](https://hub.docker.com/r/rocm/vllm/tags) offers a prebuilt, optimized
-docker image designed for validating inference performance on the AMD Instinct™ MI300X accelerator.
-
-:::{tip}
-Please check [LLM inference performance validation on AMD Instinct MI300X](https://rocm.docs.amd.com/en/latest/how-to/performance-validation/mi300x/vllm-benchmark.html)
-for instructions on how to use this prebuilt docker image.
-:::
-
 ### Build wheel from source
 
 0. Install prerequisites (skip if you are already in an environment/docker with the following installed):
@@ -112,7 +108,13 @@ for instructions on how to use this prebuilt docker image.
 
 ### Pre-built images
 
-Currently, there are no pre-built ROCm images.
+The [AMD Infinity hub for vLLM](https://hub.docker.com/r/rocm/vllm/tags) offers a prebuilt, optimized
+docker image designed for validating inference performance on the AMD Instinct™ MI300X accelerator.
+
+:::{tip}
+Please check [LLM inference performance validation on AMD Instinct MI300X](https://rocm.docs.amd.com/en/latest/how-to/performance-validation/mi300x/vllm-benchmark.html)
+for instructions on how to use this prebuilt docker image.
+:::
 
 ### Build image from source
 
diff --git a/docs/source/getting_started/installation/gpu/xpu.inc.md b/docs/source/getting_started/installation/gpu/xpu.inc.md
index bc01c6000bc07..4116826789e5c 100644
--- a/docs/source/getting_started/installation/gpu/xpu.inc.md
+++ b/docs/source/getting_started/installation/gpu/xpu.inc.md
@@ -2,6 +2,10 @@
 
 vLLM initially supports basic model inferencing and serving on Intel GPU platform.
 
+:::{attention}
+There are no pre-built wheels or images for this device, so you must build vLLM from source.
+:::
+
 ## Requirements
 
 - Supported Hardware: Intel Data Center GPU, Intel ARC GPU
diff --git a/docs/source/getting_started/installation/index.md b/docs/source/getting_started/installation/index.md
index 0f5e013ce071a..c64c3a7208eeb 100644
--- a/docs/source/getting_started/installation/index.md
+++ b/docs/source/getting_started/installation/index.md
@@ -6,8 +6,23 @@ vLLM supports the following hardware platforms:
 
 :::{toctree}
 :maxdepth: 1
+:hidden:
 
 gpu/index
 cpu/index
 ai_accelerator/index
 :::
+
+- <project:gpu/index.md>
+  - NVIDIA CUDA
+  - AMD ROCm
+  - Intel XPU
+- <project:cpu/index.md>
+  - Intel/AMD x86
+  - ARM AArch64
+  - Apple silicon
+- <project:ai_accelerator/index.md>
+  - Google TPU
+  - Intel Gaudi
+  - AWS Neuron
+  - OpenVINO

From 44bbca78d71330909dbfdde232debdc73a4d5a81 Mon Sep 17 00:00:00 2001
From: Brian Dellabetta <brian-dellabetta@users.noreply.github.com>
Date: Fri, 31 Jan 2025 17:38:48 -0600
Subject: [PATCH 453/462] [Doc] int4 w4a16 example (#12585)

Based on a request by @mgoin , with @kylesayrs we have added an example
doc for int4 w4a16 quantization, following the pre-existing int8 w8a8
quantization example and the example available in
[`llm-compressor`](https://github.com/vllm-project/llm-compressor/blob/main/examples/quantization_w4a16/llama3_example.py)

FIX #n/a (no issue created)

@kylesayrs and I have discussed a couple additional improvements for the
quantization docs. We will revisit at a later date, possibly including:
- A section for "choosing the correct quantization scheme/ compression
technique"
- Additional vision or audio calibration datasets

---------

Signed-off-by: Brian Dellabetta <bdellabe@redhat.com>
Co-authored-by: Michael Goin <michael@neuralmagic.com>
---
 docs/source/features/quantization/index.md |   1 +
 docs/source/features/quantization/int4.md  | 166 +++++++++++++++++++++
 docs/source/features/quantization/int8.md  |   4 +-
 3 files changed, 169 insertions(+), 2 deletions(-)
 create mode 100644 docs/source/features/quantization/int4.md

diff --git a/docs/source/features/quantization/index.md b/docs/source/features/quantization/index.md
index d972dc85fc23c..1c98620aa2145 100644
--- a/docs/source/features/quantization/index.md
+++ b/docs/source/features/quantization/index.md
@@ -12,6 +12,7 @@ supported_hardware
 auto_awq
 bnb
 gguf
+int4
 int8
 fp8
 quantized_kvcache
diff --git a/docs/source/features/quantization/int4.md b/docs/source/features/quantization/int4.md
new file mode 100644
index 0000000000000..f8939e5bf0150
--- /dev/null
+++ b/docs/source/features/quantization/int4.md
@@ -0,0 +1,166 @@
+(int4)=
+
+# INT4 W4A16
+
+vLLM supports quantizing weights to INT4 for memory savings and inference acceleration. This quantization method is particularly useful for reducing model size and maintaining low latency in workloads with low queries per second (QPS).
+
+Please visit the HF collection of [quantized INT4 checkpoints of popular LLMs ready to use with vLLM](https://huggingface.co/collections/neuralmagic/int4-llms-for-vllm-668ec34bf3c9fa45f857df2c).
+
+:::{note}
+INT4 computation is supported on NVIDIA GPUs with compute capability > 8.0 (Ampere, Ada Lovelace, Hopper, Blackwell).
+:::
+
+## Prerequisites
+
+To use INT4 quantization with vLLM, you'll need to install the [llm-compressor](https://github.com/vllm-project/llm-compressor/) library:
+
+```console
+pip install llmcompressor
+```
+
+## Quantization Process
+
+The quantization process involves four main steps:
+
+1. Loading the model
+2. Preparing calibration data
+3. Applying quantization
+4. Evaluating accuracy in vLLM
+
+### 1. Loading the Model
+
+Load your model and tokenizer using the standard `transformers` AutoModel classes:
+
+```python
+from transformers import AutoTokenizer, AutoModelForCausalLM
+
+MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL_ID, device_map="auto", torch_dtype="auto",
+)
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+```
+
+### 2. Preparing Calibration Data
+
+When quantizing weights to INT4, you need sample data to estimate the weight updates and calibrated scales.
+It's best to use calibration data that closely matches your deployment data.
+For a general-purpose instruction-tuned model, you can use a dataset like `ultrachat`:
+
+```python
+from datasets import load_dataset
+
+NUM_CALIBRATION_SAMPLES = 512
+MAX_SEQUENCE_LENGTH = 2048
+
+# Load and preprocess the dataset
+ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft")
+ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))
+
+def preprocess(example):
+    return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)}
+ds = ds.map(preprocess)
+
+def tokenize(sample):
+    return tokenizer(sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False)
+ds = ds.map(tokenize, remove_columns=ds.column_names)
+```
+
+### 3. Applying Quantization
+
+Now, apply the quantization algorithms:
+
+```python
+from llmcompressor.transformers import oneshot
+from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
+
+# Configure the quantization algorithms
+recipe = GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"])
+
+# Apply quantization
+oneshot(
+    model=model,
+    dataset=ds,
+    recipe=recipe,
+    max_seq_length=MAX_SEQUENCE_LENGTH,
+    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
+)
+
+# Save the compressed model
+SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+tokenizer.save_pretrained(SAVE_DIR)
+```
+
+This process creates a W4A16 model with weights quantized to 4-bit integers.
+
+### 4. Evaluating Accuracy
+
+After quantization, you can load and run the model in vLLM:
+
+```python
+from vllm import LLM
+model = LLM("./Meta-Llama-3-8B-Instruct-W4A16-G128")
+```
+
+To evaluate accuracy, you can use `lm_eval`:
+
+```console
+$ lm_eval --model vllm \
+  --model_args pretrained="./Meta-Llama-3-8B-Instruct-W4A16-G128",add_bos_token=true \
+  --tasks gsm8k \
+  --num_fewshot 5 \
+  --limit 250 \
+  --batch_size 'auto'
+```
+
+:::{note}
+Quantized models can be sensitive to the presence of the `bos` token. Make sure to include the `add_bos_token=True` argument when running evaluations.
+:::
+
+## Best Practices
+
+- Start with 512 samples for calibration data, and increase if accuracy drops
+- Ensure the calibration data contains a high variety of samples to prevent overfitting towards a specific use case
+- Use a sequence length of 2048 as a starting point
+- Employ the chat template or instruction template that the model was trained with
+- If you've fine-tuned a model, consider using a sample of your training data for calibration
+- Tune key hyperparameters to the quantization algorithm:
+  - `dampening_frac` sets how much influence the GPTQ algorithm has. Lower values can improve accuracy, but can lead to numerical instabilities that cause the algorithm to fail.
+  - `actorder` sets the activation ordering. When compressing the weights of a layer weight, the order in which channels are quantized matters. Setting `actorder="weight"` can improve accuracy without added latency.
+
+The following is an example of an expanded quantization recipe you can tune to your own use case:
+
+```python
+from compressed_tensors.quantization import (
+    QuantizationArgs,
+    QuantizationScheme,
+    QuantizationStrategy,
+    QuantizationType,
+) 
+recipe = GPTQModifier(
+    targets="Linear",
+    config_groups={
+        "config_group": QuantizationScheme(
+            targets=["Linear"],
+            weights=QuantizationArgs(
+                num_bits=4,
+                type=QuantizationType.INT,
+                strategy=QuantizationStrategy.GROUP,
+                group_size=128,
+                symmetric=True,
+                dynamic=False,
+                actorder="weight",
+            ),
+        ),
+    },
+    ignore=["lm_head"],
+    update_size=NUM_CALIBRATION_SAMPLES,
+    dampening_frac=0.01
+)
+```
+
+## Troubleshooting and Support
+
+If you encounter any issues or have feature requests, please open an issue on the [`vllm-project/llm-compressor`](https://github.com/vllm-project/llm-compressor) GitHub repository. The full INT4 quantization example in `llm-compressor` is available [here](https://github.com/vllm-project/llm-compressor/blob/main/examples/quantization_w4a16/llama3_example.py).
diff --git a/docs/source/features/quantization/int8.md b/docs/source/features/quantization/int8.md
index fedb16f4350e5..b381f34bccd34 100644
--- a/docs/source/features/quantization/int8.md
+++ b/docs/source/features/quantization/int8.md
@@ -8,7 +8,7 @@ This quantization method is particularly useful for reducing model size while ma
 Please visit the HF collection of [quantized INT8 checkpoints of popular LLMs ready to use with vLLM](https://huggingface.co/collections/neuralmagic/int8-llms-for-vllm-668ec32c049dca0369816415).
 
 :::{note}
-INT8 computation is supported on NVIDIA GPUs with compute capability > 7.5 (Turing, Ampere, Ada Lovelace, Hopper).
+INT8 computation is supported on NVIDIA GPUs with compute capability > 7.5 (Turing, Ampere, Ada Lovelace, Hopper, Blackwell).
 :::
 
 ## Prerequisites
@@ -132,4 +132,4 @@ Quantized models can be sensitive to the presence of the `bos` token. Make sure
 
 ## Troubleshooting and Support
 
-If you encounter any issues or have feature requests, please open an issue on the `vllm-project/llm-compressor` GitHub repository.
+If you encounter any issues or have feature requests, please open an issue on the [`vllm-project/llm-compressor`](https://github.com/vllm-project/llm-compressor) GitHub repository.

From b1340f9d55cd36a92aff713213e95f354a1bd1b4 Mon Sep 17 00:00:00 2001
From: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
Date: Fri, 31 Jan 2025 21:32:04 -0500
Subject: [PATCH 454/462] [V1] Bugfix: Validate Model Input Length (#12600)

SUMMARY:
* avoid crashing the engine when we get an input longer than
max_model_len

FIX #12567(*link existing issues this PR will resolve*)
---
 vllm/v1/engine/processor.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py
index 43419d2ff5381..6196c11052078 100644
--- a/vllm/v1/engine/processor.py
+++ b/vllm/v1/engine/processor.py
@@ -206,6 +206,11 @@ def _validate_model_inputs(self, inputs: ProcessorInputs):
         if prompt_ids is None or len(prompt_ids) == 0:
             raise ValueError("Prompt cannot be empty")
 
+        if len(prompt_ids) >= self.model_config.max_model_len:
+            raise ValueError(
+                f"Prompt length of {len(prompt_ids)} is longer than the "
+                f"maximum model length of {self.model_config.max_model_len}.")
+
         if self.model_config.is_multimodal_model:
             max_prompt_len = self.model_config.max_model_len
 

From cb3e73e4c8142b5ce8ac34efc2fa04d90f142dc5 Mon Sep 17 00:00:00 2001
From: fade_away <1028552010@qq.com>
Date: Sat, 1 Feb 2025 12:52:07 +0800
Subject: [PATCH 455/462] [BugFix] fix wrong output when using lora and
 num_scheduler_steps=8 (#11161)

FIX issue https://github.com/vllm-project/vllm/issues/9688
https://github.com/vllm-project/vllm/issues/11086 #12487

---------

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: weilong.yu <weilong.yu@shopee.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
---
 vllm/worker/model_runner.py | 4 ++++
 vllm/worker/worker.py       | 3 ---
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 160c0662ce976..322d91d62ce46 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -1346,6 +1346,10 @@ def _dummy_run(self,
 
             self.execute_model(model_input, kv_caches, intermediate_tensors)
             torch.cuda.synchronize()
+            if self.lora_config:
+                # Remove dummy loras.
+                assert self.lora_manager is not None
+                self.remove_all_loras()
             return
 
     def remove_all_loras(self):
diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py
index 24bba79fedd75..1d2884d3ddf51 100644
--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -264,10 +264,7 @@ def determine_num_available_blocks(self) -> Tuple[int, int]:
                f"{(available_kv_cache_memory / GiB_bytes):.2f}GiB.")
 
         logger.info(msg)
-
         # Final cleanup
-        if self.model_runner.lora_manager:
-            self.model_runner.remove_all_loras()
         gc.collect()
 
         return num_gpu_blocks, num_cpu_blocks

From 1867c258bda3bc6adb07090c508fd85e3ceed547 Mon Sep 17 00:00:00 2001
From: Eldar Kurtic <eldarkurtic314@gmail.com>
Date: Sat, 1 Feb 2025 06:07:46 +0100
Subject: [PATCH 456/462] Fix target matching for fused layers with
 compressed-tensors (#12617)

Without this PR
---------------
Quantizing models with llm-compressor and a recipe that explicitly lists
names of layers produces a model that is not loadable by vLLM (i.e.
`vllm serve <model>` fails with `raise ValueError(f"Unable to find
matching target for {module} in the ...`).

Example recipe:
```
recipe = """
quantization_stage:
  run_type: oneshot
  quantization_modifiers:
    GPTQModifier:
      ignore: ["lm_head"]
      config_groups:
        group_0:
          weights:
            num_bits: 4
            type: "int"
            symmetric: true
            strategy: "group"
            group_size: 128
          targets: [
            "model.layers.0.mlp.down_proj",
            "model.layers.2.mlp.down_proj",
            "model.layers.3.mlp.down_proj",
            "model.layers.4.mlp.down_proj",
            "model.layers.5.mlp.down_proj",
            "model.layers.6.mlp.down_proj",
            "model.layers.7.mlp.down_proj",
            "model.layers.8.mlp.down_proj",
            "model.layers.9.mlp.down_proj",
            "model.layers.10.mlp.down_proj",
            "model.layers.11.mlp.down_proj",
            "model.layers.12.mlp.down_proj",
            "model.layers.13.mlp.down_proj",
            "model.layers.14.mlp.down_proj",
            "model.layers.15.mlp.down_proj",
            "model.layers.16.mlp.down_proj",
            "model.layers.17.mlp.down_proj",
            "model.layers.19.mlp.down_proj",
            "model.layers.21.mlp.down_proj",
            "model.layers.22.mlp.down_proj",
            .
            .
            .
          ]
"""
```

To reproduce the vLLM error:
```bash
vllm serve nm-testing/eldar-test
```

With this PR
------------
Models are loaded correctly without any errors.
---
 .../quantization/compressed_tensors/utils.py  | 41 ++++++++++++++++++-
 1 file changed, 40 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
index 8fcbda377428e..5bab5a02d83b7 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
@@ -103,7 +103,8 @@ def find_matched_target(layer_name: Optional[str], module: Module,
 
     matched_target = (_find_first_match(layer_name, targets)
                       or _find_first_match(module.__class__.__name__, targets,
-                                           True))
+                                           True)
+                      or _match_fused_layer(layer_name, targets))
 
     if matched_target is None:
         raise ValueError(f"Unable to find matching target for {module} in the "
@@ -152,3 +153,41 @@ def _is_equal_or_regex_match(value: str,
     elif target == value:
         return True
     return False
+
+
+def _match_fused_layer(layer_name: str,
+                       target_layers: Iterable[str]) -> Optional[str]:
+    """
+    Match a fused layer name to its corresponding individual layer in 
+    target_layers.
+
+    Examples:
+        layer_name = "model.layers.0.self_attn.qkv_proj"
+        target_layers = ["model.layers.0.self_attn.q_proj",
+                        "model.layers.0.self_attn.k_proj",
+                        "model.layers.0.self_attn.v_proj"]
+    """
+    # Split into parent path and layer type
+    # e.g., "model.layers.0.self_attn" and "qkv_proj"
+    parent_path = ".".join(layer_name.split(".")[:-1])
+    layer_type = layer_name.split(".")[-1]
+
+    if layer_type not in FUSED_LAYER_NAME_MAPPING:
+        return None
+
+    possible_layer_types = FUSED_LAYER_NAME_MAPPING[layer_type]
+
+    # Look for a target layer that:
+    # 1. Has the same parent path
+    # 2. Ends with one of the possible individual layer types
+    for target in target_layers:
+        is_same_parent = parent_path in target
+        is_matching_type = any(type_suffix in target
+                               for type_suffix in possible_layer_types)
+
+        if is_same_parent and is_matching_type and all(
+                '.'.join([parent_path, type_suffix])
+                for type_suffix in possible_layer_types):
+            return target
+
+    return None

From 35b7a05507e32f6cc60aec7148d3df0c788f7373 Mon Sep 17 00:00:00 2001
From: "Kevin H. Luu" <kevin@anyscale.com>
Date: Fri, 31 Jan 2025 21:22:23 -0800
Subject: [PATCH 457/462] [ci] Upgrade transformers to 4.48.2 in CI
 dependencies (#12599)

---
 requirements-common.txt | 2 +-
 requirements-test.in    | 2 +-
 requirements-test.txt   | 5 +++--
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/requirements-common.txt b/requirements-common.txt
index 7051ca8cb50cd..e5248572ce4d4 100644
--- a/requirements-common.txt
+++ b/requirements-common.txt
@@ -5,7 +5,7 @@ requests >= 2.26.0
 tqdm
 blake3
 py-cpuinfo
-transformers >= 4.45.2  # Required for Llama 3.2 and Qwen2-VL.
+transformers >= 4.48.2  # Required for Bamba.
 tokenizers >= 0.19.1  # Required for Llama 3.
 protobuf # Required by LlamaTokenizer.
 fastapi >= 0.107.0, < 0.113.0; python_version < '3.9'
diff --git a/requirements-test.in b/requirements-test.in
index 13ad17b256734..229d743ec802b 100644
--- a/requirements-test.in
+++ b/requirements-test.in
@@ -27,7 +27,7 @@ matplotlib # required for qwen-vl test
 mistral_common[opencv] >= 1.5.0 # required for pixtral test
 datamodel_code_generator # required for minicpm3 test
 lm-eval[api]==0.4.4 # required for model evaluation test
-
+transformers==4.48.2 
 # quantization
 bitsandbytes>=0.45.0
 buildkite-test-collector==0.1.9
diff --git a/requirements-test.txt b/requirements-test.txt
index df7e904bb0d34..e032aac710dd0 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -2,7 +2,7 @@
 # This file is autogenerated by pip-compile with Python 3.12
 # by the following command:
 #
-#    python3.12 -m piptools compile requirements-test.in -o requirements-test.txt
+# python3.12 -m piptools compile requirements-test.in -o requirements-test.txt
 #
 absl-py==2.1.0
     # via rouge-score
@@ -617,8 +617,9 @@ tqdm==4.66.6
     #   transformers
 tqdm-multiprocess==0.0.11
     # via lm-eval
-transformers==4.47.0
+transformers==4.48.2
     # via
+    #   -r requirements-test.in
     #   genai-perf
     #   lm-eval
     #   peft

From cfa134d2475096eae47a58d14a6dec4c3fba9294 Mon Sep 17 00:00:00 2001
From: Tyler Michael Smith <tyler@neuralmagic.com>
Date: Sat, 1 Feb 2025 00:41:35 -0500
Subject: [PATCH 458/462] [Bugfix/CI] Fixup benchmark_moe.py (#12562)

Fixes `is_marlin` not being passed into `get_default_config`

Also allow `--tensor-parallel-size` in addition to `-tp` and `--tp-size`

Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
---
 benchmarks/kernels/benchmark_moe.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py
index 5c8bf33afebc8..068830f02fb5e 100644
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -343,9 +343,13 @@ def benchmark(
         op_config = get_moe_configs(num_experts, shard_intermediate_size // 2,
                                     dtype_str)
         if op_config is None:
-            config = get_default_config(num_tokens, num_experts,
-                                        shard_intermediate_size, hidden_size,
-                                        topk, dtype_str)
+            config = get_default_config(num_tokens,
+                                        num_experts,
+                                        shard_intermediate_size,
+                                        hidden_size,
+                                        topk,
+                                        dtype_str,
+                                        is_marlin=False)
         else:
             config = op_config[min(op_config.keys(),
                                    key=lambda x: abs(x - num_tokens))]
@@ -536,7 +540,11 @@ def _distribute(method: str, inputs: List[Any]) -> List[Any]:
     parser.add_argument("--model",
                         type=str,
                         default="mistralai/Mixtral-8x7B-Instruct-v0.1")
-    parser.add_argument("--tp-size", "-tp", type=int, default=2)
+    parser.add_argument("--tp-size",
+                        "-tp",
+                        "--tensor-parallel-size",
+                        type=int,
+                        default=2)
     parser.add_argument("--dtype",
                         type=str,
                         choices=["auto", "fp8_w8a8", "int8_w8a16"],

From 3e1c76cf3a87854396d9e86a56a335e7d750c85f Mon Sep 17 00:00:00 2001
From: Rahul Tuli <rahul@neuralmagic.com>
Date: Fri, 31 Jan 2025 23:41:59 -0600
Subject: [PATCH 459/462] Fix: Respect `sparsity_config.ignore` in Cutlass
 Integration (#12517)

This PR addresses a bug in the Cutlass integration where the
`sparsity_config.ignore` list was not being respected. When only a
subset of modules were configured as Sparse24, the system incorrectly
selected Cutlass for non-sparse modules as well. This update ensures the
correct scheme is selected for non-sparse modules, fixing this behavior.

---

### Changes

- Updated logic to correctly respect `sparsity_config.ignore`.
- Ensured non-sparse modules use the appropriate scheme instead of
defaulting to Cutlass.

---

<details>
<summary>Testing Setup</summary>

The fix has been tested on top of [this
diff](https://github.com/vllm-project/vllm/pull/12097).

#### Steps to Test:
```bash
git checkout -b my-test-branch origin/rahul-bitmask-additions # compressed Cutlass support
git revert --no-edit aa2cd2c # revert Tyler's commit to turn off Cutlass for W16A16
git cherry-pick ca624cddb # this branch
```

#### Additional Patch Required:
```diff
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index a54177c1c..f916dd0c9 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -9,7 +9,7 @@ from compressed_tensors.quantization import (QuantizationArgs,
                                              QuantizationStrategy,
                                              QuantizationType)
 from pydantic import BaseModel
-
+from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
                                                UnquantizedLinearMethod)
@@ -27,7 +27,7 @@ from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
     should_ignore_layer)
 from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
 from vllm.platforms import current_platform
-
+logger = init_logger(__name__)
 __all__ = ["CompressedTensorsLinearMethod"]

 SPARSITY_CONFIG_NAME: Literal["sparsity_config"] = "sparsity_config"
```

Apply using:
```bash
git apply logging-patch.patch
```

</details>

---

<details>
<summary>Models Tested</summary>

- `nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-partial-24`
- `nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-full-sparse24`
-
`nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-partial-24-entire-fp8-compressed`
-
`nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-partial-24-remaining-fp8-compressed`

</details>

---


<details>
<summary>Example Output</summary>

#### Layers 0-5 (Sparse24)
```
Using scheme: CompressedTensors24 for model.layers.0.self_attn.qkv_proj
Using scheme: CompressedTensors24 for model.layers.0.self_attn.o_proj
Using scheme: CompressedTensors24 for model.layers.0.mlp.gate_up_proj
Using scheme: CompressedTensors24 for model.layers.0.mlp.down_proj
...
```

#### Layers 6+ (Non-Sparse, FP8)
```
Using scheme: CompressedTensorsW8A8Fp8 for model.layers.6.self_attn.qkv_proj
Using scheme: CompressedTensorsW8A8Fp8 for model.layers.6.self_attn.o_proj
Using scheme: CompressedTensorsW8A8Fp8 for model.layers.6.mlp.gate_up_proj
Using scheme: CompressedTensorsW8A8Fp8 for model.layers.6.mlp.down_proj
...
```

</details>

**Note:** Assumed all modules in fused layers such as `QKV_proj` and
`Gate_up_proj` follow the same quantization/pruning scheme.

---

For related tasks using the Asana app for GitHub, refer to [[this
link](https://app.asana.com/0/0/1209227810815160)](https://app.asana.com/0/0/1209227810815160).

Signed-off-by: Rahul Tuli <rahul@neuralmagic.com>
---
 .../compressed_tensors/compressed_tensors.py  | 59 +++++++++++-------
 .../quantization/compressed_tensors/utils.py  | 60 +++++++++++++++++--
 2 files changed, 91 insertions(+), 28 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index dd2dd02eaf723..37981ed918e7a 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -1,4 +1,5 @@
-from typing import Any, Dict, List, Literal, Optional, cast
+from contextlib import suppress
+from typing import Any, Dict, List, Literal, Optional, Tuple, cast
 
 import torch
 from compressed_tensors.config import (CompressionFormat,
@@ -44,6 +45,7 @@ def __init__(
         ignore: List[str],
         quant_format: str,
         sparsity_scheme_map: Dict[str, SparsityCompressionConfig],
+        sparsity_ignore_list: List[str],
         kv_cache_scheme: Optional[Dict[str, Any]] = None,
         config: Optional[Dict[str, Any]] = None,
     ):
@@ -54,6 +56,7 @@ def __init__(
         self.target_scheme_map = target_scheme_map
         self.kv_cache_scheme = kv_cache_scheme
         self.sparsity_scheme_map = sparsity_scheme_map
+        self.sparsity_ignore_list = sparsity_ignore_list
         self.config = config
 
     def get_linear_method(self) -> "CompressedTensorsLinearMethod":
@@ -98,7 +101,7 @@ def from_config(cls, config: Dict[str, Any]) -> "CompressedTensorsConfig":
         quant_format = cast(str, config.get("format"))
         target_scheme_map = cls._quantization_scheme_map_from_config(
             config=config)
-        sparsity_scheme_map = cls._sparsity_scheme_map_from_config(
+        sparsity_scheme_map, sparsity_ignore_list = cls._parse_sparsity_config(
             config=config)
 
         return cls(
@@ -106,20 +109,23 @@ def from_config(cls, config: Dict[str, Any]) -> "CompressedTensorsConfig":
             ignore=ignore,
             quant_format=quant_format,
             sparsity_scheme_map=sparsity_scheme_map,
+            sparsity_ignore_list=sparsity_ignore_list,
             config=config,
         )
 
     @classmethod
-    def _sparsity_scheme_map_from_config(
-            cls, config: Dict[str,
-                              Any]) -> Dict[str, SparsityCompressionConfig]:
+    def _parse_sparsity_config(
+        cls, config: Dict[str, Any]
+    ) -> Tuple[Dict[str, SparsityCompressionConfig], List[str]]:
         """
         :param config: The `quantization_config` dictionary from config.json
-        :return: A dictionary mapping target layer names to their corresponding
-            sparsity compression configurations
+        :return: A tuple with two elements
+            1. A dictionary mapping target layer names to their corresponding
+                sparsity_config
+            2. A list of layer names to ignore for sparsity
         """
         if not (sparsity_config := config.get(SPARSITY_CONFIG_NAME)):
-            return dict()
+            return dict(), []
 
         sparsity_config = SparsityCompressionConfig.model_validate(
             sparsity_config)
@@ -127,7 +133,8 @@ def _sparsity_scheme_map_from_config(
             target: sparsity_config
             for target in sparsity_config.targets or list()
         }
-        return sparse_scheme_map
+        sparsity_ignore_list = sparsity_config.ignore or list()
+        return sparse_scheme_map, sparsity_ignore_list
 
     @classmethod
     def _quantization_scheme_map_from_config(
@@ -352,7 +359,6 @@ def get_scheme(self,
         """
         compressed-tensors supports non uniform in the following way:
 
-        ignore: List of layer_names or nn.Module names to be ignored.
         targets of config_groups: There can be N config_groups which each
             have a quantization scheme. Each config_group has a list of targets
             which can be a full layer_name, a regex for a layer_name, or
@@ -370,6 +376,8 @@ def get_scheme(self,
         # need to make accelerate optional in ct to do this
 
         # Will be empty for models with only sparsity
+        weight_quant = input_quant = None
+        sparsity_scheme: Optional[SparsityCompressionConfig] = None
         if self.target_scheme_map:
             matched_target = find_matched_target(
                 layer_name=layer_name,
@@ -379,19 +387,24 @@ def get_scheme(self,
             scheme_dict = self.target_scheme_map[matched_target]
             weight_quant = scheme_dict.get("weights")
             input_quant = scheme_dict.get("input_activations")
-        elif self.sparsity_scheme_map:
-            matched_target = find_matched_target(
-                layer_name=layer_name,
-                module=layer,
-                targets=self.sparsity_scheme_map.keys())
-            weight_quant = None
-            input_quant = None
 
-        # For models with sparsity, assumes that the sparse layers are also
-        # quantized for cutlass 2:4 support
-        sparsity_scheme: Optional[
-            SparsityCompressionConfig] = self.sparsity_scheme_map.get(
-                matched_target)
+        if self.sparsity_scheme_map:
+            is_ignored = False
+            with suppress(ValueError):
+                is_ignored = find_matched_target(
+                    layer_name=layer_name,
+                    module=layer,
+                    targets=self.sparsity_ignore_list)
+
+            # if the layer is in the sparsity ignore list,
+            # we should not apply any sparsity scheme
+
+            if not is_ignored:
+                matched_target = find_matched_target(
+                    layer_name=layer_name,
+                    module=layer,
+                    targets=self.sparsity_scheme_map.keys())
+                sparsity_scheme = self.sparsity_scheme_map.get(matched_target)
 
         if self.supports_cutlass_24(weight_quant=weight_quant,
                                     input_quant=input_quant,
@@ -419,6 +432,8 @@ def get_scheme(self,
         # Raise error if device does not support the scheme
         # (e.g. fp8 needs ada lovelace)
         self._check_scheme_supported(scheme.get_min_capability())
+        logger.debug("Using scheme: %s for %s", scheme.__class__.__name__,
+                     layer_name)
         return scheme
 
     def get_cache_scale(self, name: str) -> Optional[str]:
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
index 5bab5a02d83b7..34996b08e9c91 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
@@ -12,7 +12,7 @@ def is_activation_quantization_format(format: str) -> bool:
     _ACTIVATION_QUANTIZATION_FORMATS = [
         CompressionFormat.naive_quantized.value,
         CompressionFormat.int_quantized.value,
-        CompressionFormat.float_quantized.value
+        CompressionFormat.float_quantized.value,
     ]
     return format in _ACTIVATION_QUANTIZATION_FORMATS
 
@@ -68,7 +68,7 @@ def should_ignore_layer(layer_name: Optional[str],
 def check_equal_or_regex_match(layer_name: str,
                                targets: Iterable[str]) -> bool:
     """
-    Checks whether a layer_name is exactly equal or a regex match for 
+    Checks whether a layer_name is exactly equal or a regex match for
     if target starts with 're:' to any target in list.
     """
     for target in targets:
@@ -77,17 +77,64 @@ def check_equal_or_regex_match(layer_name: str,
     return False
 
 
+def _handle_fused_layers(func):
+    """
+    Decorator to handle fused layers by mapping vllm fused layer names
+    to their corresponding unfused layer names for quantization/pruning schemes.
+    """
+    # fused_layer_name -> unfused_layer_name
+    fused_layer_map = {
+        "qkv_proj": "q_proj",
+        "gate_up_proj": "up_proj",
+    }
+
+    def fused_layer_handler(layer_name: Optional[str], module: Module,
+                            targets: Iterable[str]) -> Optional[str]:
+        """
+        Wrapper function specifically designed to support the
+        find_matched_target function.
+
+        It handles cases where the provided layer name corresponds to a
+        fused layer in vllm, mapping it to its equivalent unfused layer name
+        based on the predefined fused_layer_map. If the original layer name
+        raises a ValueError in the wrapped function, this handler
+        will attempt to resolve the issue by substituting with unfused
+        layer name.
+
+        :param layer_name: Name of the layer, which may be fused.
+        :param module: An instance of torch.nn.Module.
+        :param targets: A list of target names or patterns to match.
+        :return: The result of the wrapped find_matched_target function with
+            the resolved layer name.
+        :raises ValueError: If the layer name cannot be resolved to a 
+            valid target.
+        """
+        try:
+            return func(layer_name, module, targets)
+        except ValueError:
+            if layer_name is None:
+                layer_name = ""
+            parent_name, fused_proj_name = layer_name.rsplit(".", 1)
+            unfused_proj_name = fused_layer_map.get(fused_proj_name,
+                                                    fused_proj_name)
+            new_layer_name = f"{parent_name}.{unfused_proj_name}"
+            return func(new_layer_name, module, targets)
+
+    return fused_layer_handler
+
+
+@_handle_fused_layers
 def find_matched_target(layer_name: Optional[str], module: Module,
                         targets: Iterable[str]) -> str:
     """
     Helper function to look up which "target" in the compressed-tensors
     config that a layer corresponds to.
 
-    Recall that a compressed-tensors configs has a concept of 
+    Recall that a compressed-tensors configs has a concept of
     config_groups, where each layer can be quantized with with a different
     scheme.
 
-    targets in each config_group will be a list of either layer names 
+    targets in each config_group will be a list of either layer names
     (or regexes corresponding to layer names) or names of torch Modules.
 
     First, we try to match the layer_name with a target
@@ -107,8 +154,9 @@ def find_matched_target(layer_name: Optional[str], module: Module,
                       or _match_fused_layer(layer_name, targets))
 
     if matched_target is None:
-        raise ValueError(f"Unable to find matching target for {module} in the "
-                         "compressed-tensors config.")
+        raise ValueError(
+            f"Unable to find matching target for {layer_name} in the "
+            "compressed-tensors config.")
 
     return matched_target
 

From baeded25699f9f4851843306f27f685c4d4ee7c5 Mon Sep 17 00:00:00 2001
From: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Date: Sat, 1 Feb 2025 00:52:51 -0500
Subject: [PATCH 460/462] [Attention] Deepseek v3 MLA support with FP8 compute
 (#12601)

This PR implements the Deepseek V3 support by performing matrix absorption the fp8 weights

---------

Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Co-authored-by: simon-mo <simon.mo@hey.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Zhuohan Li <zhuohan123@gmail.com>
Co-authored-by: Tyler Michael Smith <tysmith@redhat.com>
Co-authored-by: Alexander Matveev <59768536+alexm-neuralmagic@users.noreply.github.com>
---
 vllm/attention/backends/mla/utils.py          | 220 +++++++++++++++---
 vllm/attention/backends/triton_mla.py         |  18 +-
 vllm/attention/layer.py                       |   4 +-
 vllm/config.py                                |  39 +++-
 vllm/envs.py                                  |  12 +-
 .../layers/quantization/utils/fp8_utils.py    |  74 ++++--
 .../layers/quantization/utils/quant_utils.py  | 116 ++++++++-
 vllm/model_executor/model_loader/loader.py    |  24 +-
 vllm/model_executor/models/deepseek_v3.py     | 154 +++++++++++-
 vllm/worker/cache_engine.py                   |   4 +-
 10 files changed, 580 insertions(+), 85 deletions(-)

diff --git a/vllm/attention/backends/mla/utils.py b/vllm/attention/backends/mla/utils.py
index c6c8a6034e20f..e8fec234c0225 100644
--- a/vllm/attention/backends/mla/utils.py
+++ b/vllm/attention/backends/mla/utils.py
@@ -1,17 +1,29 @@
 from abc import abstractmethod
 from dataclasses import dataclass
-from typing import Any, Dict, Generic, List, Optional
+from typing import Any, Dict, Generic, List, Optional, Tuple
 
 import torch
+from compressed_tensors.quantization import QuantizationStrategy
 
 from vllm import _custom_ops as ops
 from vllm import envs
 from vllm.attention.backends.abstract import (AttentionLayer,
                                               AttentionMetadata,
                                               MLAAttentionImpl, T)
-from vllm.distributed import get_tensor_model_parallel_world_size
+from vllm.distributed import (get_tensor_model_parallel_world_size,
+                              tensor_model_parallel_all_reduce)
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               RowParallelLinear)
+                                               LinearBase, RowParallelLinear,
+                                               UnquantizedLinearMethod)
+from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import (  # noqa: E501
+    CompressedTensorsLinearMethod)
+from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
+    CompressedTensorsW8A8Fp8)
+from vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod
+from vllm.model_executor.layers.quantization.utils.fp8_utils import (
+    apply_fp8_linear_generic, current_platform_fp8_dtype, is_fp8)
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+    scaled_dequantize, scaled_quantize)
 from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
 from vllm.vllm_flash_attn import flash_attn_varlen_func
 
@@ -25,11 +37,11 @@ class MLACommonMetadata(AttentionMetadata):
 
 class MLACommonImpl(MLAAttentionImpl[T], Generic[T]):
     """
-    Common class for implementing repeated parts 
-    
+    Common class for implementing repeated parts
+
     Main reference: DeepseekV2 paper, and FlashInfer Implementation
     (https://arxiv.org/abs/2405.04434 and https://github.com/flashinfer-ai/flashinfer/pull/551).
-    
+
     Deepseek's MLA attention works the following way:
     * Use a single latent vector to represent the entire KV cache.
     * The attention "simulates" a multi-head attention, while the compute is
@@ -46,7 +58,7 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]):
         * V: V head dim.
         * kv_c: latent/compressed KV
         * q_c: latent/compressed Q
-        
+
         #
         # Outside the MLA attention backend
         #
@@ -55,21 +67,21 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]):
            kv_c_k_pe (B, Lkv+R).
         2. The kv_c_k_pe is split into kv_c (B, Lkv) and k_pe (B, R). cq
            and kv_c are normalized.
-        
+
         #
         # Inside the MLA attention backend
         #
 
         * if prefill:
-        
-        3. The q_c is then projected up into the multi-head version. 
-           * q_c goes from (B, Lq) to (B, N, (P+R)), which is split into q_nope 
-             (B, N, P) and q_pe (B, N, R). 
+
+        3. The q_c is then projected up into the multi-head version.
+           * q_c goes from (B, Lq) to (B, N, (P+R)), which is split into q_nope
+             (B, N, P) and q_pe (B, N, R).
         4. q_pe, k_pe are then passed through rotary embeddings.
         5. kv_c and k_pe are concatenated and inserted into the cache
-        6. The kv_c is then projected up into the multi-head version. 
-           * kv_c goes from (B, Lkv) to (B, N, (P+V)) which has the nope 
-             dimensions for K and V, which is split into k_nope (B, N, P) 
+        6. The kv_c is then projected up into the multi-head version.
+           * kv_c goes from (B, Lkv) to (B, N, (P+V)) which has the nope
+             dimensions for K and V, which is split into k_nope (B, N, P)
              and v (B, N, V).
         7. q (B, N, (P+R)) and k (B, N, (P+R)) matrices are assembled from
            q_nope, q_pe, k_nope, k_pe.
@@ -112,7 +124,7 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]):
     From @tsu-bin's calculation, we only want to use the absorption technique
     for decode. The prefill algorithm should still use the up-projected MHA
     for less flops and memory usage.
-    
+
     """
 
     def __init__(
@@ -162,8 +174,19 @@ def __init__(
 
     def _v_up_proj_and_o_proj(self, x):
         if envs.VLLM_MLA_PERFORM_MATRIX_ABSORPTION:
-            return self.o_proj_absorbed(
-                x.reshape(-1, self.num_heads * self.kv_lora_rank))[0]
+            if is_fp8(self.W_UV_O):
+                output_parallel = apply_fp8_linear_generic(
+                    x.flatten(start_dim=1), self.W_UV_O, self.W_UV_O_scales,
+                    self.reqaunt_input_group_shape,
+                    self.reqaunt_weight_group_shape)
+            else:
+                output_parallel = torch.matmul(x.flatten(start_dim=1),
+                                               self.W_UV_O)
+            if self.tp_size > 1:
+                output = tensor_model_parallel_all_reduce(output_parallel)
+            else:
+                output = output_parallel
+            return output
         else:
             x = torch.einsum("bnl,lnv->bnv", x, self.W_UV)
             return self.o_proj(x.reshape(-1,
@@ -171,6 +194,12 @@ def _v_up_proj_and_o_proj(self, x):
 
     def _q_proj_and_k_up_proj(self, x):
         if envs.VLLM_MLA_PERFORM_MATRIX_ABSORPTION:
+            if is_fp8(self.W_Q_UK):
+                return apply_fp8_linear_generic(
+                    x, self.W_Q_UK, self.W_Q_UK_scales,
+                    self.reqaunt_input_group_shape,
+                    self.reqaunt_weight_group_shape).view(
+                        -1, self.num_heads, self.kv_lora_rank)
             return torch.matmul(x, self.W_Q_UK)\
                 .view(-1, self.num_heads, self.kv_lora_rank)
         else:
@@ -179,8 +208,91 @@ def _q_proj_and_k_up_proj(self, x):
             return torch.einsum("bnp,lnp->bnl", x, self.W_UK)\
                 .view(-1, self.num_heads, self.kv_lora_rank)
 
-    def process_weights_after_loading(self):
-        kv_b_proj_weight = self.kv_b_proj.weight.T
+    def process_weights_after_loading(self, act_dtype: torch.dtype):
+
+        def is_layer_fp8(layer: LinearBase) -> bool:
+            return isinstance(layer.quant_method, Fp8LinearMethod) or\
+                (isinstance(layer.quant_method, CompressedTensorsLinearMethod)\
+                and isinstance(layer.scheme, CompressedTensorsW8A8Fp8))
+
+        def quantization_scheme_supported(layer: LinearBase) -> bool:
+            return isinstance(layer.quant_method, UnquantizedLinearMethod) or \
+                is_layer_fp8(layer)
+
+        # TODO(lucas) This is very gross, we need a more wide scale refactor of
+        # all the FP8 code with a more standard way of
+        # defining schemes/group-shapes, we should also potentially force
+        # quant_methods to support a decompress function
+        #
+        # returns input_group_shape, weight_group_shape
+        def get_scale_group_shapes_for_fp8(layer: LinearBase) -> \
+            Tuple[Tuple[int, int], Tuple[int, int]]:
+            if isinstance(layer.quant_method, Fp8LinearMethod):
+                if layer.quant_method.block_quant is not None:
+                    weight_block_size = \
+                        layer.quant_method.quant_config.weight_block_size
+                    # per-token-group (1, X), block-quantized (X, Y)
+                    return (1, weight_block_size[-1]), weight_block_size
+                else:
+                    return (-1, -1), (-1, -1)  # per-tensor, per-tensor
+            elif isinstance(layer.quant_method, CompressedTensorsLinearMethod)\
+                and isinstance(layer.scheme, CompressedTensorsW8A8Fp8):
+                # this is hacky but we always assume the for
+                # CompressedTensorsW8A8Fp8 the input is dynamic per-token
+                # we ignore if it is static-per-tensor since we are going to
+                # requantize after later anyways
+                strategy = layer.scheme.strategy
+                if strategy == QuantizationStrategy.TENSOR:
+                    return (1, -1), (-1, -1)  # per-token, per-tensor
+                elif strategy == QuantizationStrategy.CHANNEL:
+                    return (1, -1), (-1, 1)  # per-token, per-channel
+                else:
+                    raise NotImplementedError(
+                        f"QuantizationStrategy.{strategy} is not supported for "
+                        "fp8 MLA, please run with VLLM_MLA_DISABLE=1")
+            else:
+                raise NotImplementedError(
+                    "Can't determine scale group shapes for "
+                    f"{layer.quant_method}, please run with VLLM_MLA_DISABLE=1"
+                )
+
+        def get_scales(layer: LinearBase) -> torch.Tensor:
+            if hasattr(layer, "weight_scale_inv"):
+                return layer.weight_scale_inv
+            return layer.weight_scale
+
+        def get_and_maybe_dequant_weights(layer: LinearBase):
+            if is_layer_fp8(layer):
+                if isinstance(layer.quant_method, \
+                    CompressedTensorsLinearMethod) and \
+                    isinstance(layer.scheme, CompressedTensorsW8A8Fp8):
+                    # NOTE(lucas): note sure why but `CompressedTensorsW8A8Fp8`
+                    # seems to store weights as (input, output) instead of
+                    # (output, input) so we need to transpose
+                    weight = layer.weight.T  # standardize to (output, input)
+                else:
+                    weight = layer.weight
+                _, weight_scale_group_shape = \
+                    get_scale_group_shapes_for_fp8(layer)
+                scales = get_scales(layer)
+
+                return scaled_dequantize(weight, scales,
+                                         weight_scale_group_shape)
+            else:
+                return layer.weight
+
+        if not (quantization_scheme_supported(self.kv_b_proj) and\
+            quantization_scheme_supported(self.q_proj) and\
+                quantization_scheme_supported(self.o_proj)):
+            raise NotImplementedError(
+                "Only FP8 and UnquantizedLinearMethod are supported for MLA"
+                ", please run with VLLM_MLA_DISABLE=1")
+
+        weight_dtype = self.kv_b_proj.weight.dtype
+        assert self.o_proj.weight.dtype == weight_dtype
+        assert self.q_proj.weight.dtype == weight_dtype
+
+        kv_b_proj_weight = get_and_maybe_dequant_weights(self.kv_b_proj).T
         assert kv_b_proj_weight.shape == (
             self.kv_lora_rank,
             self.num_heads * (self.qk_nope_head_dim + self.v_head_dim)), (
@@ -198,18 +310,35 @@ def process_weights_after_loading(self):
         W_UK, W_UV = kv_b_proj_weight.split(
             [self.qk_nope_head_dim, self.v_head_dim], dim=-1)
 
-        q_proj = self.q_proj.weight.T\
+        q_proj_weight = get_and_maybe_dequant_weights(self.q_proj).T\
                 .view(-1, self.num_heads, self.qk_head_dim)
 
         # can be W_Q or W_UQ depending q_lora_rank, the former if
         # q_lora_rank is None, the latter otherwise. From the Attention backend
         # perspective though we call these both W_Q and rely on the layer
         # to pass in the correct matrix
-        W_Q = q_proj[..., :self.qk_nope_head_dim]
-        self.W_QR = q_proj[..., self.qk_nope_head_dim:]\
+        W_Q = q_proj_weight[..., :self.qk_nope_head_dim]
+        self.W_QR = q_proj_weight[..., self.qk_nope_head_dim:]\
             .flatten(start_dim=1).contiguous()
 
+        # W_QR is small so for simplicity we dont bother requantizing it
+        self.W_QR = self.W_QR.to(act_dtype)
+
         if envs.VLLM_MLA_PERFORM_MATRIX_ABSORPTION:
+            requantization_enabled = not envs.VLLM_MLA_DISABLE_REQUANTIZATION
+            if is_fp8(weight_dtype) and requantization_enabled:
+                # This assumes it wise to requantize using the same group shapes
+                # (i.e. strategy, per-tensor, per-channel, block etc.) that the
+                # weights were originally quantized
+                requant_input_group_shape, requant_weight_group_shape = \
+                    get_scale_group_shapes_for_fp8(self.q_proj)
+                assert (requant_input_group_shape, requant_weight_group_shape)\
+                    == get_scale_group_shapes_for_fp8(self.kv_b_proj)
+                assert (requant_input_group_shape, requant_weight_group_shape)\
+                    == get_scale_group_shapes_for_fp8(self.o_proj)
+                self.reqaunt_input_group_shape = requant_input_group_shape
+                self.reqaunt_weight_group_shape = requant_weight_group_shape
+
             #
             # Perform matrix-absorption following
             #     https://github.com/flashinfer-ai/flashinfer/pull/551
@@ -223,25 +352,44 @@ def process_weights_after_loading(self):
             # latter otherwise
             # basically if q_lora_rank is none we are absorbing into q_proj
             # instead of UQ
-            self.W_Q_UK = torch.einsum("qnd,lnd -> qnl", W_Q, W_UK)\
+            W_Q_UK = torch.einsum("qnd,lnd -> qnl", W_Q, W_UK)\
                 .flatten(start_dim=1).contiguous()
 
-            W_O = self.o_proj.weight\
+            if is_fp8(weight_dtype) and requantization_enabled:
+                W_Q_UK, W_Q_UK_scales = scaled_quantize(
+                    W_Q_UK,
+                    self.reqaunt_weight_group_shape,
+                    quant_dtype=current_platform_fp8_dtype)
+                # For FP8 save the transpose so we can use
+                # `apply_w8a8_block_fp8_linear` directly
+                self.W_Q_UK = W_Q_UK.T.contiguous()
+                self.W_Q_UK_scales = W_Q_UK_scales.T.contiguous()
+            else:
+                self.W_Q_UK = W_Q_UK.to(act_dtype)
+
+            W_O = get_and_maybe_dequant_weights(self.o_proj)\
                 .view(-1, self.num_heads, self.v_head_dim)
-            self.W_UV_O = torch.einsum("lnd,hnd -> nlh", W_UV, W_O)\
+            W_UV_O = torch.einsum("lnd,hnd -> nlh", W_UV, W_O)\
                 .flatten(start_dim=0, end_dim=1).contiguous()
 
-            tp_size = get_tensor_model_parallel_world_size()
-            self.o_proj_absorbed = RowParallelLinear(
-                self.W_UV_O.shape[0] * tp_size,
-                self.W_UV_O.shape[1],
-                bias=False,
-                # TODO(lucas) figure out how to properly forward quant_method
-                #quant_config=self.o_proj.quant_method,
-            )
-
-            self.o_proj_absorbed.weight = torch.nn.Parameter(self.W_UV_O.T)
+            if is_fp8(weight_dtype) and requantization_enabled:
+                W_UV_O, W_UV_O_scales = scaled_quantize(
+                    W_UV_O,
+                    self.reqaunt_weight_group_shape,
+                    quant_dtype=current_platform_fp8_dtype)
+                # For FP8 save the transpose so we can use
+                # `apply_w8a8_block_fp8_linear` directly
+                self.W_UV_O = W_UV_O.T.contiguous()
+                self.W_UV_O_scales = W_UV_O_scales.T.contiguous()
+            else:
+                self.W_UV_O = W_UV_O.to(act_dtype)
+
+            self.tp_size = get_tensor_model_parallel_world_size()
         else:
+            if is_fp8(weight_dtype):
+                raise NotImplementedError(
+                    "Currently fp8 requires matrix absorption")
+
             self.W_UV = W_UV
             self.W_UK = W_UK
             self.W_Q = W_Q.flatten(start_dim=1)
diff --git a/vllm/attention/backends/triton_mla.py b/vllm/attention/backends/triton_mla.py
index da09bb70b4f1a..95dc119a47bb5 100644
--- a/vllm/attention/backends/triton_mla.py
+++ b/vllm/attention/backends/triton_mla.py
@@ -57,14 +57,12 @@ def get_state_cls() -> Type["TritonMLAState"]:
 
     @staticmethod
     def get_kv_cache_shape(
-            num_blocks: int,
-            block_size: int,
-            num_kv_heads: int,  # assumed to be 1 for MLA
-            kv_lora_rank: int,  # passed via head_size
+        num_blocks: int,
+        block_size: int,
+        num_kv_heads: int,  # assumed to be 1 for MLA
+        head_size: int,
     ) -> Tuple[int, ...]:
-        # TODO(lucas): remove hardcoding k_pe size as 1/8th of kv_lora_rank
-        k_pe_size = kv_lora_rank // 8
-        return (num_blocks, block_size, kv_lora_rank + k_pe_size)
+        return (num_blocks, block_size, head_size)
 
     @staticmethod
     def swap_blocks(
@@ -83,7 +81,7 @@ def copy_blocks(
 
     @staticmethod
     def get_supported_head_sizes() -> List[int]:
-        return [512]
+        return [576]
 
 
 class TritonMLAState(AttentionState):
@@ -624,8 +622,6 @@ def build(self, seq_lens: List[int], query_lens: List[int],
             self.multimodal_placeholder_maps.items()
         }
 
-        num_kv_splits = 8
-
         return TritonMLAMetadata(
             num_prefills=self.num_prefills,
             slot_mapping=slot_mapping_tensor,
@@ -645,7 +641,7 @@ def build(self, seq_lens: List[int], query_lens: List[int],
             context_lens_tensor=context_lens_tensor,
             block_tables=block_tables,
             use_cuda_graph=use_captured_graph,
-            num_kv_splits=num_kv_splits,
+            num_kv_splits=4,  # TODO(lucas) add heuristic
             head_dim=self.runner.model_config.get_head_size(),
         )
 
diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index 9b804a29a485d..b97165f625e51 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -200,9 +200,9 @@ def extra_repr(self) -> str:
         s += f", backend={self.impl.__class__.__name__}"
         return s
 
-    def process_weights_after_loading(self):
+    def process_weights_after_loading(self, act_dtype: torch.dtype):
         if hasattr(self.impl, "process_weights_after_loading"):
-            self.impl.process_weights_after_loading()
+            self.impl.process_weights_after_loading(act_dtype)
 
 
 class MultiHeadAttention(nn.Module):
diff --git a/vllm/config.py b/vllm/config.py
index f6bd8b1ad8f14..f998502eef0da 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -739,18 +739,19 @@ def get_hidden_size(self) -> int:
     @property
     def is_deepseek_mla(self) -> bool:
         # TODO add deepseek_v3
-        return hasattr(self.hf_text_config,
-                       "model_type") and (self.hf_text_config.model_type
-                                          in ('deepseek_v2'))
+        return (hasattr(self.hf_text_config, "model_type")) \
+                and (self.hf_text_config.model_type in \
+                    ('deepseek_v2', 'deepseek_v3'))\
+                and (self.hf_text_config.kv_lora_rank is not None)
 
     def get_head_size(self) -> int:
         # TODO remove hard code
         if self.is_deepseek_mla:
+            qk_rope_head_dim = getattr(self.hf_text_config, "qk_rope_head_dim",
+                                       0)
             if self.use_mla:
-                return self.hf_text_config.kv_lora_rank
+                return self.hf_text_config.kv_lora_rank + qk_rope_head_dim
             else:
-                qk_rope_head_dim = getattr(self.hf_text_config,
-                                           "qk_rope_head_dim", 0)
                 qk_nope_head_dim = getattr(self.hf_text_config,
                                            "qk_nope_head_dim", 0)
                 if qk_rope_head_dim and qk_nope_head_dim:
@@ -969,6 +970,32 @@ def is_cross_encoder(self) -> bool:
 
     @property
     def use_mla(self) -> bool:
+        if self.quantization is not None and self.quantization not in [\
+            "fp8", "compressed-tensors"]:
+            logger.warning(
+                "MLA is not supported with %s quantization. "
+                "Disabling MLA.", self.quantization)
+            return False
+
+        # If using a "compressed-tensors" checkpoint, check that all groups
+        # have fp8 for both weights and activations.
+        if self.quantization == "compressed-tensors":
+            quant_config = self._parse_quant_hf_config()
+            for group_name, cfg in quant_config.get("config_groups",
+                                                    ("", {})).items():
+                act_cfg = cfg.get("input_activations", {})
+                act_type = None if act_cfg is None else act_cfg.get("type", "")
+                w_cfg = cfg.get("weights", {})
+                w_type = None if w_cfg is None else w_cfg.get("type", "")
+                if act_type != "fp8" or w_type != "fp8":
+                    logger.warning(
+                        "compressed-tensors MLA support requires fp8 "
+                        "activations and weights in group '%s', but got "
+                        "activations type '%s' and weights type '%s'.\n "
+                        "Full config: %s", group_name, act_type, w_type,
+                        quant_config)
+                    return False
+
         use_mla = (self.is_deepseek_mla and not envs.VLLM_MLA_DISABLE)
         return use_mla
 
diff --git a/vllm/envs.py b/vllm/envs.py
index 2a18e3b9bc51d..25098070b00c9 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -79,6 +79,7 @@
     VLLM_V1_OUTPUT_PROC_CHUNK_SIZE: int = 128
     VLLM_MLA_DISABLE: bool = False
     VLLM_MLA_PERFORM_MATRIX_ABSORPTION: bool = True
+    VLLM_MLA_DISABLE_REQUANTIZATION: bool = False
 
 
 def get_default_cache_root():
@@ -519,7 +520,16 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]:
     # storing more weights, W_Q_UK and W_UV_O, so can increase memory usage,
     # the is enabled by default
     "VLLM_MLA_PERFORM_MATRIX_ABSORPTION":
-    lambda: bool(int(os.getenv("VLLM_MLA_PERFORM_MATRIX_ABSORPTION", "1")))
+    lambda: bool(int(os.getenv("VLLM_MLA_PERFORM_MATRIX_ABSORPTION", "1"))),
+
+    # When running MLA with matrix-absorption enabled and fp8 quantized weights
+    # we perform the matrix-absorption in float32 precision, after the matrices
+    # are absorbed we requantize the weights back to fp8, this flag can be used
+    # to disable the requantization step, and instead convert the absorbed
+    # matrices to match the activation type. This can lead to higher memory and
+    # compute usage but better preserves the accuracy of the original model.
+    "VLLM_MLA_DISABLE_REQUANTIZATION":
+    lambda: bool(int(os.getenv("VLLM_MLA_DISABLE_REQUANTIZATION", "0")))
 }
 
 # end-env-vars-definition
diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
index ccebff341a7ed..850820f66ff90 100644
--- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
@@ -2,7 +2,7 @@
 import functools
 import json
 import os
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Dict, List, Optional, Tuple, Union
 
 import torch
 import triton
@@ -10,10 +10,24 @@
 
 from vllm import _custom_ops as ops
 from vllm.logger import init_logger
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+    _normalize_quant_group_shape, scaled_dequantize)
+from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
+    apply_fp8_linear)
 from vllm.platforms import current_platform
 
 logger = init_logger(__name__)
 
+current_platform_fp8_dtype = (torch.float8_e4m3fnuz
+                              if current_platform.is_rocm() else
+                              torch.float8_e4m3fn)
+
+
+def is_fp8(x: Union[torch.dtype, torch.Tensor]) -> bool:
+    if isinstance(x, torch.Tensor):
+        x = x.dtype
+    return x == torch.float8_e4m3fn or x == torch.float8_e4m3fnuz
+
 
 def apply_w8a8_block_fp8_linear(
     input: torch.Tensor,
@@ -55,6 +69,42 @@ def apply_w8a8_block_fp8_linear(
     return output.to(dtype=input.dtype).view(*output_shape)
 
 
+# Unify the interface between `apply_w8a8_block_fp8_linear` and
+# `apply_fp8_linear`
+# NOTE(lucas): this is quite messy, we should think through this more formally
+def apply_fp8_linear_generic(
+        input: torch.Tensor,
+        weight: torch.Tensor,
+        weight_scale: torch.Tensor,
+        input_group_shape: Tuple[int, int],
+        weight_group_shape: Tuple[int, int],
+        input_scale: Optional[torch.Tensor] = None,  # static scale if one
+) -> torch.Tensor:
+    # View input as 2D matrix for fp8 methods
+    input = input.view(-1, input.shape[-1])
+
+    weight_group_shape = _normalize_quant_group_shape(\
+        weight, weight_group_shape)
+    input_group_shape = _normalize_quant_group_shape(input, input_group_shape)
+
+    def is_dim_blocked(dim, shape, group_shape):
+        return group_shape < shape[dim] and group_shape > 1
+
+    if is_dim_blocked(0, weight.shape, weight_group_shape[0])\
+     and is_dim_blocked(1, weight.shape, weight_group_shape[1]) and\
+     input_group_shape == (1, weight_group_shape[1]):
+        return apply_w8a8_block_fp8_linear(input, weight,
+                                           list(weight_group_shape),
+                                           weight_scale)
+    else:
+        # Despite having linear in the it doesn't conform to
+        # `torch.nn.functional.linear` which is defined as `input @ weight.T`
+        # so we explicitly transpose the weight matrix here
+        return apply_fp8_linear(input, weight.T, weight_scale.T,
+                         use_per_token_if_dynamic=\
+                             (input_group_shape == (1, input.shape[1])))
+
+
 def input_to_float8(
         x: torch.Tensor,
         dtype: Optional[torch.dtype] = None
@@ -75,7 +125,6 @@ def input_to_float8(
 def block_quant_to_tensor_quant(
     x_q_block: torch.Tensor,
     x_s: torch.Tensor,
-    block_size: List[int],
 ) -> Tuple[torch.Tensor, torch.Tensor]:
     """This function converts block-wise quantization to tensor-wise
     quantization. The inputs are block-wise quantization tensor `x_q_block`,
@@ -83,26 +132,7 @@ def block_quant_to_tensor_quant(
     The outputs are tensor-wise quantization tensor and tensor-wise
     quantization scale. Note only float8 is supported for now.
     """
-    block_n, block_k = block_size[0], block_size[1]
-    n, k = x_q_block.shape
-    n_tiles = (n + block_n - 1) // block_n
-    k_tiles = (k + block_k - 1) // block_k
-    assert n_tiles == x_s.shape[0]
-    assert k_tiles == x_s.shape[1]
-
-    x_dq_block = x_q_block.to(torch.float32)
-
-    x_dq_block_tiles = [[
-        x_dq_block[
-            j * block_n:min((j + 1) * block_n, n),
-            i * block_k:min((i + 1) * block_k, k),
-        ] for i in range(k_tiles)
-    ] for j in range(n_tiles)]
-
-    for i in range(k_tiles):
-        for j in range(n_tiles):
-            x_dq_block_tiles[j][i][:, :] = x_dq_block_tiles[j][i] * x_s[j][i]
-
+    x_dq_block = scaled_dequantize(x_q_block, x_s)
     x_q_tensor, scale = input_to_float8(x_dq_block, dtype=x_q_block.dtype)
     return x_q_tensor, scale
 
diff --git a/vllm/model_executor/layers/quantization/utils/quant_utils.py b/vllm/model_executor/layers/quantization/utils/quant_utils.py
index 83055d6000d83..95e785dcc4078 100644
--- a/vllm/model_executor/layers/quantization/utils/quant_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/quant_utils.py
@@ -1,5 +1,5 @@
 """This file is used for /tests and /benchmarks"""
-from typing import List, Optional
+from typing import List, Optional, Tuple
 
 import numpy
 import torch
@@ -20,6 +20,120 @@
 }
 
 
+# Normalize the group_shape to the full extent for any dims that are -1
+def _normalize_quant_group_shape(x: torch.Tensor, group_shape: Tuple[int,
+                                                                     int]):
+    # -1 means full extent
+    return (group_shape[0] if group_shape[0] > 0 else x.shape[-2],
+            group_shape[1] if group_shape[1] > 0 else x.shape[-1])
+
+
+# Useful when treating N-dimensional group scaling as extended numpy-style
+# broadcasting in numpy simply stretches dimensions with an extent of 1 to match
+# the target shape by repeating the data along that dimension (broadcasting)
+# , we extend these semantics to say if the extent of a dimension in the
+# source shape is not 1 and does not match the target shape we repeat each
+# element along that dimension src_shape[dim] // target_shape[dim] times
+# example if we have:
+#       a = [[1, 2], and target_shape = (2, 4)
+#            [3, 4]]
+# then we would expand a to:
+#       a = [[1, 1, 2, 2],
+#            [3, 3, 4, 4]]
+# NOTE this function this function does not explicitly broadcast dimensions
+# with an extent of 1, since this can be done implicitly by pytorch
+def group_broadcast(t, shape):
+    for i, s in enumerate(shape):
+        if t.shape[i] != s and t.shape[i] != 1:
+            assert s % t.shape[i] == 0
+            t = t.unsqueeze(i + 1)\
+                .expand(*t.shape[:i+1], s // t.shape[i], *t.shape[i+1:])\
+                .flatten(i, i + 1)
+    return t
+
+
+# Quantize assuming once scale per group of elements with shape group_shape,
+# example group shapes:
+#  * (-1, -1)   for per-tensor quantization
+#  * (1, -1)    for per-row quantization
+#  * (-1, 1)    for per-column quantization
+#  * (128, 128) for 128x128 deepseek style block quantization
+#  * (1, 128)   for deepseek style activation quantization
+#               (i.e. per-token-per-group)
+def scaled_quantize(
+    x: torch.Tensor,
+    group_shape: Tuple[int, int],
+    quant_dtype: torch.dtype,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    group_shape = _normalize_quant_group_shape(x, group_shape)
+    assert quant_dtype.is_floating_point, \
+        "currently `scaled_quantize` only supports floating point dtypes " \
+        "but could be extended to support other dtypes"
+
+    finfo = torch.finfo(quant_dtype)
+
+    # Reshape (M, N) into (BLK_M, BLOCK_SIZE_M, BLK_N, BLOCK_SIZE_N)
+    assert x.ndim == 2
+    assert x.shape[0] % group_shape[0] == 0 and x.shape[1] % group_shape[1] == 0
+    blk_m, blk_n = x.shape[0] // group_shape[0], x.shape[1] // group_shape[1]
+    x_blkd = x.reshape(blk_m, group_shape[0], blk_n, group_shape[1])
+
+    # Permute to (BLK_M, BLK_N, BLOCK_SIZE_M, BLOCK_SIZE_N)
+    x_blkd_permd = x_blkd.permute(0, 2, 1, 3)
+    # Flatten to (BLK_M, BLK_N, BLOCK_SIZE_M * BLOCK_SIZE_N)
+    x_blkd_permd = x_blkd_permd.flatten(start_dim=2)
+
+    # Compute scales
+    min_val, max_val = x_blkd_permd.aminmax(dim=-1)
+    amax = torch.maximum(min_val.abs(), max_val.abs()).clamp(min=1e-12)
+    scale = finfo.max / amax
+
+    # Apply scale and convert form:
+    # (BLK_M, BLK_N, BLOCK_SIZE_M * BLOCK_SIZE_N) to (M, N)
+    x_scl_sat = (x_blkd_permd * scale.unsqueeze(-1))\
+        .clamp(min=finfo.min, max=finfo.max)\
+        .reshape(blk_m, blk_n, group_shape[0], group_shape[1])\
+        .permute(0, 2, 1, 3)\
+        .reshape(x.shape)
+
+    return x_scl_sat.to(quant_dtype).contiguous(), scale.float().reciprocal()
+
+
+# inverses `scaled_quantize`
+def scaled_dequantize(
+    x_q: torch.Tensor,
+    x_s: torch.Tensor,
+    group_shape: Optional[Tuple[int, int]] = None,
+    out_dtype: torch.dtype = torch.float32,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    if group_shape is not None:
+        group_shape = _normalize_quant_group_shape(x_q, group_shape)
+
+    if x_s.ndim == 0:  # scalar
+        x_s = x_s.unsqueeze(-1).unsqueeze(-1)  # convert to (1, 1) tensor
+    if x_s.ndim == 1:
+        if group_shape is None:
+            raise AssertionError(
+                "if x_s is 1D tensor, group_shape must be provided otherwise "
+                "its ambiguous which dimension to broadcast x_s to")
+        # unsqueeze the scales for the dimension where we want to broadcast
+        # across the full extent
+        if group_shape[0] == x_q.shape[-2]:
+            x_s = x_s.unsqueeze(-2)
+        elif group_shape[1] == x_q.shape[-1]:
+            x_s = x_s.unsqueeze(-1)
+        else:
+            raise AssertionError(
+                "if x_s is a vector we should be broadcasting it to the full "
+                "extent of one of the dimensions")
+
+    if group_shape is not None:
+        assert x_s.shape[-1] == x_q.shape[-1] // group_shape[1]
+        assert x_s.shape[-2] == x_q.shape[-2] // group_shape[0]
+    x_s = group_broadcast(x_s.to(torch.float32), x_q.shape)
+    return (x_q.to(torch.float32) * x_s).to(out_dtype)
+
+
 def pack_quantized_values_into_int32(w_q: torch.Tensor,
                                      wtype: ScalarType,
                                      packed_dim: int = 0):
diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index 62babcddd61b1..4be511d12838d 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -398,11 +398,13 @@ def load_model(self, vllm_config: VllmConfig) -> nn.Module:
                     # parameters onto device for processing and back off after.
                     with device_loading_context(module, target_device):
                         quant_method.process_weights_after_loading(module)
-                elif isinstance(module, Attention) and \
+                if isinstance(module, Attention) and \
                     hasattr(module, "process_weights_after_loading"):
                     # When attention modules need to process weights after
                     # currently only used by MLA
-                    module.process_weights_after_loading()
+                    # TODO(lucas): see if there is a way to unify the signatures
+                    # of process_weights_after_loading
+                    module.process_weights_after_loading(model_config.dtype)
         return model.eval()
 
 
@@ -439,6 +441,11 @@ def load_model(self, vllm_config: VllmConfig) -> nn.Module:
                     with device_loading_context(
                             module, torch.device(device_config.device)):
                         quant_method.process_weights_after_loading(module)
+                if isinstance(module, Attention) and \
+                    hasattr(module, "process_weights_after_loading"):
+                    # When attention modules need to process weights after
+                    # currently only used by MLA
+                    module.process_weights_after_loading(model_config.dtype)
         return model.eval()
 
 
@@ -633,6 +640,12 @@ def load_model(self, vllm_config: VllmConfig) -> nn.Module:
                     quant_method = getattr(module, "quant_method", None)
                     if quant_method is not None:
                         quant_method.process_weights_after_loading(module)
+                    if isinstance(module, Attention) and \
+                        hasattr(module, "process_weights_after_loading"):
+                        # When attention modules need to process weights after
+                        # currently only used by MLA
+                        module.process_weights_after_loading(
+                            model_config.dtype)
             rank = get_tensor_model_parallel_rank()
             pattern = os.path.join(
                 local_model_path,
@@ -1272,7 +1285,7 @@ def load_model(self, vllm_config: VllmConfig) -> nn.Module:
 
 class RunaiModelStreamerLoader(BaseModelLoader):
     """
-        Model loader that can load safetensors 
+        Model loader that can load safetensors
         files from local FS or S3 bucket.
     """
 
@@ -1369,6 +1382,11 @@ def load_model(self, vllm_config: VllmConfig) -> nn.Module:
                 if quant_method is not None:
                     with device_loading_context(module, target_device):
                         quant_method.process_weights_after_loading(module)
+                if isinstance(module, Attention) and \
+                    hasattr(module, "process_weights_after_loading"):
+                    # When attention modules need to process weights after
+                    # currently only used by MLA
+                    module.process_weights_after_loading(model_config.dtype)
         return model.eval()
 
 
diff --git a/vllm/model_executor/models/deepseek_v3.py b/vllm/model_executor/models/deepseek_v3.py
index 0b44f0d062c40..f6ab53c85faa3 100644
--- a/vllm/model_executor/models/deepseek_v3.py
+++ b/vllm/model_executor/models/deepseek_v3.py
@@ -27,7 +27,7 @@
 from transformers import PretrainedConfig
 
 from vllm.attention import Attention, AttentionMetadata
-from vllm.config import CacheConfig, VllmConfig
+from vllm.config import CacheConfig, ModelConfig, VllmConfig
 from vllm.distributed import (get_pp_group,
                               get_tensor_model_parallel_world_size,
                               tensor_model_parallel_all_reduce)
@@ -333,12 +333,156 @@ def forward(
         return output
 
 
+class DeepseekV3MLAAttention(nn.Module):
+    """
+    Main reference: DeepseekV2 paper, and FlashInfer Implementation
+    (https://arxiv.org/abs/2405.04434 and https://github.com/flashinfer-ai/flashinfer/pull/551).
+    
+    For more info see MLACommonImpl in: vllm/attention/backends/mla/utils.py
+    """
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        hidden_size: int,
+        num_heads: int,
+        qk_nope_head_dim: int,
+        qk_rope_head_dim: int,
+        v_head_dim: int,
+        q_lora_rank: Optional[int],
+        kv_lora_rank: int,
+        rope_theta: float = 10000,
+        rope_scaling: Optional[Dict[str, Any]] = None,
+        max_position_embeddings: int = 8192,
+        cache_config: Optional[CacheConfig] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.qk_nope_head_dim = qk_nope_head_dim
+        self.qk_rope_head_dim = qk_rope_head_dim
+        self.qk_head_dim = qk_nope_head_dim + qk_rope_head_dim
+        self.v_head_dim = v_head_dim
+
+        self.q_lora_rank = q_lora_rank
+        self.kv_lora_rank = kv_lora_rank
+
+        self.num_heads = num_heads
+        tp_size = get_tensor_model_parallel_world_size()
+        assert num_heads % tp_size == 0
+        self.num_local_heads = num_heads // tp_size
+
+        self.scaling = self.qk_head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+
+        if self.q_lora_rank is not None:
+            self.q_a_proj = ReplicatedLinear(self.hidden_size,
+                                             self.q_lora_rank,
+                                             bias=False,
+                                             quant_config=quant_config,
+                                             prefix=f"{prefix}.q_a_proj")
+            self.q_a_layernorm = RMSNorm(self.q_lora_rank,
+                                         eps=config.rms_norm_eps)
+            self.q_b_proj = ColumnParallelLinear(q_lora_rank,
+                                                 self.num_heads *
+                                                 self.qk_head_dim,
+                                                 bias=False,
+                                                 quant_config=quant_config,
+                                                 prefix=f"{prefix}.q_b_proj")
+        else:
+            self.q_proj = ColumnParallelLinear(self.hidden_size,
+                                               self.num_heads *
+                                               self.qk_head_dim,
+                                               bias=False,
+                                               quant_config=quant_config,
+                                               prefix=f"{prefix}.q_proj")
+
+        self.kv_a_proj_with_mqa = ReplicatedLinear(
+            self.hidden_size,
+            self.kv_lora_rank + self.qk_rope_head_dim,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.kv_a_proj_with_mqa")
+        self.kv_a_layernorm = RMSNorm(self.kv_lora_rank,
+                                      eps=config.rms_norm_eps)
+        self.kv_b_proj = ColumnParallelLinear(
+            self.kv_lora_rank,
+            self.num_heads * (self.qk_nope_head_dim + self.v_head_dim),
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.kv_b_proj")
+        self.o_proj = RowParallelLinear(self.num_heads * self.v_head_dim,
+                                        self.hidden_size,
+                                        bias=False,
+                                        quant_config=quant_config,
+                                        prefix=f"{prefix}.o_proj")
+
+        rope_scaling["rope_type"] = 'deepseek_yarn'
+        self.rotary_emb = get_rope(qk_rope_head_dim,
+                                   rotary_dim=qk_rope_head_dim,
+                                   max_position=max_position_embeddings,
+                                   base=rope_theta,
+                                   rope_scaling=rope_scaling,
+                                   is_neox_style=False)
+        if rope_scaling:
+            mscale_all_dim = rope_scaling.get("mscale_all_dim", False)
+            scaling_factor = rope_scaling["factor"]
+            mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim))
+            self.scaling = self.scaling * mscale * mscale
+
+        self.mla_attn = Attention(
+            num_heads=self.num_local_heads,
+            head_size=self.kv_lora_rank,
+            scale=self.scaling,
+            num_kv_heads=1,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+            use_mla=True,
+            # MLA Args
+            q_lora_rank=self.q_lora_rank,
+            kv_lora_rank=self.kv_lora_rank,
+            qk_nope_head_dim=self.qk_nope_head_dim,
+            qk_rope_head_dim=self.qk_rope_head_dim,
+            qk_head_dim=self.qk_head_dim,
+            v_head_dim=self.v_head_dim,
+            rotary_emb=self.rotary_emb,
+            q_proj=self.q_proj if self.q_lora_rank is None else self.q_b_proj,
+            kv_b_proj=self.kv_b_proj,
+            o_proj=self.o_proj,
+        )
+
+        self.prefix = prefix
+        self.debug_layer_idx = int(self.prefix.split(".")[-2])
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        kv_cache: torch.Tensor,
+        attn_metadata: AttentionMetadata,
+    ) -> torch.Tensor:
+        if self.q_lora_rank is not None:
+            ckq = self.q_a_proj(hidden_states)[0]
+            hidden_states_or_q_c = self.q_a_layernorm(ckq)
+        else:
+            hidden_states_or_q_c = hidden_states
+        kv_c, k_pe = self.kv_a_proj_with_mqa(hidden_states)[0].split(
+            [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1)
+        kv_c_normed = self.kv_a_layernorm(kv_c.contiguous())
+        return self.mla_attn(hidden_states_or_q_c, kv_c_normed, k_pe, kv_cache,
+                             attn_metadata)
+
+
 class DeepseekV3DecoderLayer(nn.Module):
 
     def __init__(
         self,
         config: PretrainedConfig,
         prefix: str,
+        model_config: ModelConfig,
         cache_config: Optional[CacheConfig] = None,
         quant_config: Optional[QuantizationConfig] = None,
     ) -> None:
@@ -351,7 +495,11 @@ def __init__(
         # DecoderLayers are created with `make_layers` which passes the prefix
         # with the layer's index.
         layer_idx = int(prefix.split(sep='.')[-1])
-        self.self_attn = DeepseekV3Attention(
+        if model_config.use_mla:
+            attn_cls = DeepseekV3MLAAttention
+        else:
+            attn_cls = DeepseekV3Attention
+        self.self_attn = attn_cls(
             config=config,
             hidden_size=self.hidden_size,
             num_heads=config.num_attention_heads,
@@ -428,6 +576,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
 
         config = vllm_config.model_config.hf_config
+        model_config = vllm_config.model_config
         cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
 
@@ -447,6 +596,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
             lambda prefix: DeepseekV3DecoderLayer(
                 config,
                 prefix,
+                model_config=model_config,
                 cache_config=cache_config,
                 quant_config=quant_config,
             ),
diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py
index 08316ba74aad8..c427b759b2e97 100644
--- a/vllm/worker/cache_engine.py
+++ b/vllm/worker/cache_engine.py
@@ -110,7 +110,9 @@ def get_cache_block_size(
             parallel_config, LayerBlockType.attention)
 
         key_cache_block = cache_config.block_size * num_heads * head_size
-        value_cache_block = key_cache_block
+        # For MLA there is no value cache, since the latent vector
+        # is joint keys and values.
+        value_cache_block = key_cache_block if not model_config.use_mla else 0
         total = num_attention_layers * (key_cache_block + value_cache_block)
         if cache_config.cache_dtype == "auto":
             dtype = model_config.dtype

From 1e3698393fca22c70dc03539cf534181466d1d25 Mon Sep 17 00:00:00 2001
From: Russell Bryant <rbryant@redhat.com>
Date: Sat, 1 Feb 2025 02:13:10 -0500
Subject: [PATCH 461/462] [CI/Build] Add label automation for
 structured-output, speculative-decoding, v1 (#12280)

We have `v1`, `structured-output`, and `speculative-decoding` labels on
github. This adds automation for applying these labels based on the
files touched by a PR.

Signed-off-by: Russell Bryant <rbryant@redhat.com>

---------

Signed-off-by: Russell Bryant <rbryant@redhat.com>
---
 .github/mergify.yml | 37 +++++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/.github/mergify.yml b/.github/mergify.yml
index ca4bd7ee2b87f..43bc5ce623d3c 100644
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@@ -35,6 +35,43 @@ pull_request_rules:
       add:
         - frontend
 
+- name: label-structured-output
+  description: Automatically apply structured-output label
+  conditions:
+    - or:
+      - files~=^vllm/model_executor/guided_decoding/
+      - files=tests/model_executor/test_guided_processors.py
+      - files=tests/entrypoints/llm/test_guided_generate.py
+      - files=benchmarks/benchmark_serving_guided.py
+      - files=benchmarks/benchmark_guided.py
+  actions:
+    label:
+      add:
+        - structured-output
+
+- name: label-speculative-decoding
+  description: Automatically apply speculative-decoding label
+  conditions:
+    - or:
+      - files~=^vllm/spec_decode/
+      - files=vllm/model_executor/layers/spec_decode_base_sampler.py
+      - files~=^tests/spec_decode/
+  actions:
+    label:
+      add:
+        - speculative-decoding
+
+- name: label-v1
+  description: Automatically apply v1 label
+  conditions:
+    - or:
+      - files~=^vllm/v1/
+      - files~=^tests/v1/
+  actions:
+    label:
+      add:
+        - v1
+
 - name: ping author on conflicts and add 'needs-rebase' label
   conditions:
       - conflict

From 4f4d427ac2cee0f8ff7f79103001f6617fa8989c Mon Sep 17 00:00:00 2001
From: Simon Mo <simon.mo@hey.com>
Date: Fri, 31 Jan 2025 23:46:57 -0800
Subject: [PATCH 462/462] Disable chunked prefill and/or prefix caching when
 MLA is enabled  (#12642)

From @mgoin in https://github.com/vllm-project/vllm/pull/12638

I cannot push to that branch, therefore a new PR to unblock release.

---------

Signed-off-by: mgoin <michael@neuralmagic.com>
Signed-off-by: simon-mo <simon.mo@hey.com>
Co-authored-by: mgoin <michael@neuralmagic.com>
---
 vllm/config.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/vllm/config.py b/vllm/config.py
index f998502eef0da..a13700aba3435 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -3252,6 +3252,16 @@ def __post_init__(self):
 
         current_platform.check_and_update_config(self)
 
+        # If MLA is enabled, force disable chunked prefill and prefix caching
+        if self.model_config and self.model_config.use_mla:
+            logger.info("MLA is enabled; forcing chunked prefill and prefix "
+                        "caching to be disabled.")
+            self.scheduler_config.enable_chunked_prefill = False
+            self.scheduler_config.chunked_prefill_enabled = False
+
+            if self.cache_config is not None:
+                self.cache_config.enable_prefix_caching = False
+
         if not self.instance_id:
             self.instance_id = random_uuid()[:5]