From ca0d92227e3a5e5880dde67da9d96c6d06454328 Mon Sep 17 00:00:00 2001
From: Michael Goin
Date: Fri, 25 Oct 2024 15:40:33 -0400
Subject: [PATCH 001/113] [Bugfix] Fix compressed_tensors_moe bad
config.strategy (#9677)
---
.../quantization/compressed_tensors/compressed_tensors_moe.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index 733eece4b5fa6..c21aaa40ff2cc 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -245,7 +245,7 @@ def __init__(
config = self.quant_config.target_scheme_map["Linear"].get("weights")
self.num_bits = config.num_bits
self.packed_factor = 32 // config.num_bits
- self.strategy = config.strategy.value
+ self.strategy = config.strategy
self.group_size = config.group_size
assert config.symmetric, (
"Only symmetric quantization is supported for MoE")
From 228cfbd03fd1ad9b26001817a6d414cc9f2c22ae Mon Sep 17 00:00:00 2001
From: Rafael Vasquez
Date: Fri, 25 Oct 2024 17:32:10 -0400
Subject: [PATCH 002/113] [Doc] Improve quickstart documentation (#9256)
Signed-off-by: Rafael Vasquez
---
docs/source/getting_started/quickstart.rst | 98 ++++++++++++----------
1 file changed, 52 insertions(+), 46 deletions(-)
diff --git a/docs/source/getting_started/quickstart.rst b/docs/source/getting_started/quickstart.rst
index 80b19ac672936..f0e6cddf09ef7 100644
--- a/docs/source/getting_started/quickstart.rst
+++ b/docs/source/getting_started/quickstart.rst
@@ -1,38 +1,50 @@
.. _quickstart:
+==========
Quickstart
==========
-This guide shows how to use vLLM to:
+This guide will help you quickly get started with vLLM to:
-* run offline batched inference on a dataset;
-* build an API server for a large language model;
-* start an OpenAI-compatible API server.
+* :ref:`Run offline batched inference `
+* :ref:`Run OpenAI-compatible inference `
-Be sure to complete the :ref:`installation instructions ` before continuing with this guide.
+Prerequisites
+--------------
+- OS: Linux
+- Python: 3.8 - 3.12
+- GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.)
-.. note::
+Installation
+--------------
+
+You can install vLLM using pip. It's recommended to use `conda `_ to create and manage Python environments.
+
+.. code-block:: console
- By default, vLLM downloads model from `HuggingFace `_. If you would like to use models from `ModelScope `_ in the following examples, please set the environment variable:
+ $ conda create -n myenv python=3.10 -y
+ $ conda activate myenv
+ $ pip install vllm
- .. code-block:: shell
+Please refer to the :ref:`installation documentation ` for more details on installing vLLM.
- export VLLM_USE_MODELSCOPE=True
+.. _offline_batched_inference:
Offline Batched Inference
-------------------------
-We first show an example of using vLLM for offline batched inference on a dataset. In other words, we use vLLM to generate texts for a list of input prompts.
+With vLLM installed, you can start generating texts for a list of input prompts (i.e. offline batch inferencing). The example script for this section can be found `here `__.
+
+The first line of this example imports the classes :class:`~vllm.LLM` and :class:`~vllm.SamplingParams`:
-Import :class:`~vllm.LLM` and :class:`~vllm.SamplingParams` from vLLM.
-The :class:`~vllm.LLM` class is the main class for running offline inference with vLLM engine.
-The :class:`~vllm.SamplingParams` class specifies the parameters for the sampling process.
+- :class:`~vllm.LLM` is the main class for running offline inference with vLLM engine.
+- :class:`~vllm.SamplingParams` specifies the parameters for the sampling process.
.. code-block:: python
from vllm import LLM, SamplingParams
-Define the list of input prompts and the sampling parameters for generation. The sampling temperature is set to 0.8 and the nucleus sampling probability is set to 0.95. For more information about the sampling parameters, refer to the `class definition `_.
+The next section defines a list of input prompts and sampling parameters for text generation. The `sampling temperature `_ is set to ``0.8`` and the `nucleus sampling probability `_ is set to ``0.95``. You can find more information about the sampling parameters `here `__.
.. code-block:: python
@@ -44,46 +56,46 @@ Define the list of input prompts and the sampling parameters for generation. The
]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
-Initialize vLLM's engine for offline inference with the :class:`~vllm.LLM` class and the `OPT-125M model `_. The list of supported models can be found at :ref:`supported models `.
+The :class:`~vllm.LLM` class initializes vLLM's engine and the `OPT-125M model `_ for offline inference. The list of supported models can be found :ref:`here `.
.. code-block:: python
llm = LLM(model="facebook/opt-125m")
-Call ``llm.generate`` to generate the outputs. It adds the input prompts to vLLM engine's waiting queue and executes the vLLM engine to generate the outputs with high throughput. The outputs are returned as a list of ``RequestOutput`` objects, which include all the output tokens.
+.. note::
+
+ By default, vLLM downloads models from `HuggingFace `_. If you would like to use models from `ModelScope `_, set the environment variable ``VLLM_USE_MODELSCOPE`` before initializing the engine.
+
+Now, the fun part! The outputs are generated using ``llm.generate``. It adds the input prompts to the vLLM engine's waiting queue and executes the vLLM engine to generate the outputs with high throughput. The outputs are returned as a list of ``RequestOutput`` objects, which include all of the output tokens.
.. code-block:: python
outputs = llm.generate(prompts, sampling_params)
- # Print the outputs.
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-
-The code example can also be found in `examples/offline_inference.py `_.
+.. _openai_compatible_server:
OpenAI-Compatible Server
------------------------
vLLM can be deployed as a server that implements the OpenAI API protocol. This allows vLLM to be used as a drop-in replacement for applications using OpenAI API.
-By default, it starts the server at ``http://localhost:8000``. You can specify the address with ``--host`` and ``--port`` arguments. The server currently hosts one model at a time (OPT-125M in the command below) and implements `list models `_, `create chat completion `_, and `create completion `_ endpoints. We are actively adding support for more endpoints.
+By default, it starts the server at ``http://localhost:8000``. You can specify the address with ``--host`` and ``--port`` arguments. The server currently hosts one model at a time and implements endpoints such as `list models `_, `create chat completion `_, and `create completion `_.
-Start the server:
+Run the following command to start the vLLM server with the `Qwen2.5-1.5B-Instruct `_ model:
.. code-block:: console
- $ vllm serve facebook/opt-125m
+ $ vllm serve Qwen/Qwen2.5-1.5B-Instruct
-By default, the server uses a predefined chat template stored in the tokenizer. You can override this template by using the ``--chat-template`` argument:
-
-.. code-block:: console
+.. note::
- $ vllm serve facebook/opt-125m --chat-template ./examples/template_chatml.jinja
+ By default, the server uses a predefined chat template stored in the tokenizer. You can learn about overriding it `here `__.
-This server can be queried in the same format as OpenAI API. For example, list the models:
+This server can be queried in the same format as OpenAI API. For example, to list the models:
.. code-block:: console
@@ -91,17 +103,17 @@ This server can be queried in the same format as OpenAI API. For example, list t
You can pass in the argument ``--api-key`` or environment variable ``VLLM_API_KEY`` to enable the server to check for API key in the header.
-Using OpenAI Completions API with vLLM
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+OpenAI Completions API with vLLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Query the model with input prompts:
+Once your server is started, you can query the model with input prompts:
.. code-block:: console
$ curl http://localhost:8000/v1/completions \
$ -H "Content-Type: application/json" \
$ -d '{
- $ "model": "facebook/opt-125m",
+ $ "model": "Qwen/Qwen2.5-1.5B-Instruct",
$ "prompt": "San Francisco is a",
$ "max_tokens": 7,
$ "temperature": 0
@@ -120,36 +132,32 @@ Since this server is compatible with OpenAI API, you can use it as a drop-in rep
api_key=openai_api_key,
base_url=openai_api_base,
)
- completion = client.completions.create(model="facebook/opt-125m",
+ completion = client.completions.create(model="Qwen/Qwen2.5-1.5B-Instruct",
prompt="San Francisco is a")
print("Completion result:", completion)
-For a more detailed client example, refer to `examples/openai_completion_client.py `_.
-
-Using OpenAI Chat API with vLLM
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+A more detailed client example can be found `here `__.
-The vLLM server is designed to support the OpenAI Chat API, allowing you to engage in dynamic conversations with the model. The chat interface is a more interactive way to communicate with the model, allowing back-and-forth exchanges that can be stored in the chat history. This is useful for tasks that require context or more detailed explanations.
+OpenAI Chat API with vLLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~
-Querying the model using OpenAI Chat API:
+vLLM is designed to also support the OpenAI Chat API. The chat interface is a more dynamic, interactive way to communicate with the model, allowing back-and-forth exchanges that can be stored in the chat history. This is useful for tasks that require context or more detailed explanations.
-You can use the `create chat completion `_ endpoint to communicate with the model in a chat-like interface:
+You can use the `create chat completion `_ endpoint to interact with the model:
.. code-block:: console
$ curl http://localhost:8000/v1/chat/completions \
$ -H "Content-Type: application/json" \
$ -d '{
- $ "model": "facebook/opt-125m",
+ $ "model": "Qwen/Qwen2.5-1.5B-Instruct",
$ "messages": [
$ {"role": "system", "content": "You are a helpful assistant."},
$ {"role": "user", "content": "Who won the world series in 2020?"}
$ ]
$ }'
-Python Client Example:
-
-Using the `openai` python package, you can also communicate with the model in a chat-like manner:
+Alternatively, you can use the `openai` Python package:
.. code-block:: python
@@ -164,12 +172,10 @@ Using the `openai` python package, you can also communicate with the model in a
)
chat_response = client.chat.completions.create(
- model="facebook/opt-125m",
+ model="Qwen/Qwen2.5-1.5B-Instruct",
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Tell me a joke."},
]
)
print("Chat response:", chat_response)
-
-For more in-depth examples and advanced features of the chat API, you can refer to the official OpenAI documentation.
From 6567e13724110fac2042d06a9e4c01fd822e8909 Mon Sep 17 00:00:00 2001
From: Travis Johnson
Date: Fri, 25 Oct 2024 16:42:56 -0600
Subject: [PATCH 003/113] [Bugfix] Fix crash with llama 3.2 vision models and
guided decoding (#9631)
Signed-off-by: Travis Johnson
Co-authored-by: pavlo-ruban
Co-authored-by: Nick Hill
---
.../guided_decoding/outlines_logits_processors.py | 14 +++++++++++---
1 file changed, 11 insertions(+), 3 deletions(-)
diff --git a/vllm/model_executor/guided_decoding/outlines_logits_processors.py b/vllm/model_executor/guided_decoding/outlines_logits_processors.py
index c28bd71c9f682..e1309c31f77e7 100644
--- a/vllm/model_executor/guided_decoding/outlines_logits_processors.py
+++ b/vllm/model_executor/guided_decoding/outlines_logits_processors.py
@@ -15,11 +15,11 @@
# limitations under the License.
import copy
import json
-import math
from collections import defaultdict
from functools import lru_cache
from typing import Callable, DefaultDict, Dict, List, Union
+import numpy as np
import torch
from lark import Lark
from outlines import grammars
@@ -77,9 +77,17 @@ def __call__(self, input_ids: List[int],
f"Unsupported instruction type {type(instruction)}")
mask = torch.full((scores.shape[-1], ),
- -math.inf,
+ -torch.inf,
device=scores.device)
- mask[allowed_tokens] = 0
+ # The tokenizer may support more token ids than the model can generate,
+ # eg. Llama 3.2 Vision models have an `<|image|>` token with id 128256
+ # but scores.shape == torch.Size([128256])
+ # Using NumPy is faster for filtering token ids
+ allowed_tokens = np.array(allowed_tokens, dtype=np.int64)
+ allowed_tokens = torch.tensor(allowed_tokens, device=scores.device)
+ allowed_tokens = allowed_tokens.masked_select(
+ allowed_tokens < scores.shape[-1])
+ mask.index_fill_(0, allowed_tokens, 0)
scores.add_(mask)
return scores
From 067e77f9a87c3466fce41c8fe8710fddc69ec26c Mon Sep 17 00:00:00 2001
From: Sam Stoelinga
Date: Fri, 25 Oct 2024 22:05:47 -0700
Subject: [PATCH 004/113] [Bugfix] Streaming continuous_usage_stats default to
False (#9709)
Signed-off-by: Sam Stoelinga
---
vllm/entrypoints/openai/protocol.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index 733decf80a711..a212c0d608ddb 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -127,7 +127,7 @@ class ResponseFormat(OpenAIBaseModel):
class StreamOptions(OpenAIBaseModel):
include_usage: Optional[bool] = True
- continuous_usage_stats: Optional[bool] = True
+ continuous_usage_stats: Optional[bool] = False
class FunctionDefinition(OpenAIBaseModel):
From 5cbdccd151ef50e3fc040690248a8d86d3b93c2a Mon Sep 17 00:00:00 2001
From: Mengqing Cao
Date: Sat, 26 Oct 2024 18:59:06 +0800
Subject: [PATCH 005/113] [Hardware][openvino] is_openvino -->
current_platform.is_openvino (#9716)
---
tests/kernels/test_attention_selector.py | 3 +-
vllm/attention/selector.py | 4 +--
vllm/config.py | 4 +--
vllm/executor/openvino_executor.py | 20 +++++--------
vllm/model_executor/model_loader/openvino.py | 4 +--
vllm/platforms/__init__.py | 10 +++++++
vllm/platforms/interface.py | 4 +++
vllm/platforms/openvino.py | 31 ++++++++++++++++++++
vllm/utils.py | 11 +------
vllm/worker/openvino_worker.py | 16 +++++-----
10 files changed, 69 insertions(+), 38 deletions(-)
create mode 100644 vllm/platforms/openvino.py
diff --git a/tests/kernels/test_attention_selector.py b/tests/kernels/test_attention_selector.py
index 8bcee98403775..df3e770e260e0 100644
--- a/tests/kernels/test_attention_selector.py
+++ b/tests/kernels/test_attention_selector.py
@@ -30,7 +30,8 @@ def test_env(name: str, device: str, monkeypatch):
False)
assert backend.name == "ROCM_FLASH"
elif device == "openvino":
- with patch("vllm.attention.selector.is_openvino", return_value=True):
+ with patch("vllm.attention.selector.current_platform.is_openvino",
+ return_value=True):
backend = which_attn_to_use(16, torch.float16, torch.float16, 16,
False)
assert backend.name == "OPENVINO"
diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py
index cd3c642b8c8a2..10d4509b38279 100644
--- a/vllm/attention/selector.py
+++ b/vllm/attention/selector.py
@@ -10,7 +10,7 @@
from vllm.attention.backends.abstract import AttentionBackend
from vllm.logger import init_logger
from vllm.platforms import current_platform
-from vllm.utils import STR_BACKEND_ENV_VAR, is_hip, is_openvino
+from vllm.utils import STR_BACKEND_ENV_VAR, is_hip
logger = init_logger(__name__)
@@ -193,7 +193,7 @@ def which_attn_to_use(
logger.info("Cannot use %s backend on CPU.", selected_backend)
return _Backend.TORCH_SDPA
- if is_openvino():
+ if current_platform.is_openvino():
if selected_backend != _Backend.OPENVINO:
logger.info("Cannot use %s backend on OpenVINO.", selected_backend)
return _Backend.OPENVINO
diff --git a/vllm/config.py b/vllm/config.py
index 25f841231dedd..a1fba98233b80 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -17,7 +17,7 @@
get_hf_image_processor_config,
get_hf_text_config)
from vllm.utils import (GiB_bytes, cuda_device_count_stateless, get_cpu_memory,
- is_hip, is_openvino, print_warning_once)
+ is_hip, print_warning_once)
if TYPE_CHECKING:
from ray.util.placement_group import PlacementGroup
@@ -1117,7 +1117,7 @@ def __init__(self, device: str = "auto") -> None:
self.device_type = "cuda"
elif current_platform.is_neuron():
self.device_type = "neuron"
- elif is_openvino():
+ elif current_platform.is_openvino():
self.device_type = "openvino"
elif current_platform.is_tpu():
self.device_type = "tpu"
diff --git a/vllm/executor/openvino_executor.py b/vllm/executor/openvino_executor.py
index 4a39839a03199..d0c0333854dae 100644
--- a/vllm/executor/openvino_executor.py
+++ b/vllm/executor/openvino_executor.py
@@ -10,6 +10,7 @@
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.model_executor.layers.sampler import SamplerOutput
+from vllm.platforms import current_platform
from vllm.sequence import ExecuteModelRequest
from vllm.utils import (GiB_bytes, get_distributed_init_method, get_ip,
get_open_port, make_async)
@@ -17,14 +18,6 @@
logger = init_logger(__name__)
-def is_openvino_cpu() -> bool:
- return "CPU" in envs.VLLM_OPENVINO_DEVICE
-
-
-def is_openvino_gpu() -> bool:
- return "GPU" in envs.VLLM_OPENVINO_DEVICE
-
-
class OpenVINOExecutor(ExecutorBase):
uses_ray: bool = False
@@ -32,7 +25,8 @@ class OpenVINOExecutor(ExecutorBase):
def _init_executor(self) -> None:
assert self.device_config.device_type == "openvino"
assert self.lora_config is None, "OpenVINO backend doesn't support LoRA"
- assert is_openvino_cpu() or is_openvino_gpu(), \
+ assert current_platform.is_openvino_cpu() or \
+ current_platform.is_openvino_gpu(), \
"OpenVINO backend supports only CPU and GPU devices"
self.ov_core = ov.Core()
@@ -163,7 +157,7 @@ def _verify_and_get_model_config(config: ModelConfig) -> ModelConfig:
def _verify_and_get_cache_config(ov_core: ov.Core,
config: CacheConfig) -> CacheConfig:
if envs.VLLM_OPENVINO_CPU_KV_CACHE_PRECISION == "u8":
- if not is_openvino_cpu():
+ if not current_platform.is_openvino_cpu():
logger.info("VLLM_OPENVINO_CPU_KV_CACHE_PRECISION is"
"ignored for GPU, f16 data type will be used.")
config.cache_dtype = ov.Type.f16
@@ -172,7 +166,7 @@ def _verify_and_get_cache_config(ov_core: ov.Core,
"VLLM_OPENVINO_CPU_KV_CACHE_PRECISION env var.")
config.cache_dtype = ov.Type.u8
else:
- if is_openvino_cpu():
+ if current_platform.is_openvino_cpu():
ov_device = envs.VLLM_OPENVINO_DEVICE
inference_precision = ov_core.get_property(
ov_device, hints.inference_precision)
@@ -183,7 +177,7 @@ def _verify_and_get_cache_config(ov_core: ov.Core,
else:
config.cache_dtype = ov.Type.f16
- if is_openvino_cpu():
+ if current_platform.is_openvino_cpu():
if config.block_size != 32:
logger.info(
f"OpenVINO CPU optimal block size is 32, overriding currently set {config.block_size}" # noqa: G004, E501
@@ -198,7 +192,7 @@ def _verify_and_get_cache_config(ov_core: ov.Core,
kv_cache_space = envs.VLLM_OPENVINO_KVCACHE_SPACE
if kv_cache_space >= 0:
- if kv_cache_space == 0 and is_openvino_cpu():
+ if kv_cache_space == 0 and current_platform.is_openvino_cpu():
config.openvino_kvcache_space_bytes = 4 * GiB_bytes # type: ignore
logger.warning(
"Environment variable VLLM_OPENVINO_KVCACHE_SPACE (GB) "
diff --git a/vllm/model_executor/model_loader/openvino.py b/vllm/model_executor/model_loader/openvino.py
index 88b7ac46e5541..8ada2210d0d51 100644
--- a/vllm/model_executor/model_loader/openvino.py
+++ b/vllm/model_executor/model_loader/openvino.py
@@ -12,12 +12,12 @@
import vllm.envs as envs
from vllm.attention.backends.openvino import OpenVINOAttentionMetadata
from vllm.config import DeviceConfig, ModelConfig
-from vllm.executor.openvino_executor import is_openvino_cpu
from vllm.logger import init_logger
from vllm.model_executor.layers.logits_processor import (LogitsProcessor,
_prune_hidden_states)
from vllm.model_executor.layers.sampler import Sampler, SamplerOutput
from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.platforms import current_platform
logger = init_logger(__name__)
@@ -136,7 +136,7 @@ def __init__(
ov_device = envs.VLLM_OPENVINO_DEVICE
paged_attention_transformation(pt_model.model)
_modify_cache_parameters(pt_model.model, kv_cache_dtype,
- is_openvino_cpu())
+ current_platform.is_openvino_cpu())
ov_compiled = ov_core.compile_model(pt_model.model, ov_device)
self.ov_request = ov_compiled.create_infer_request()
diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py
index 58912158139bd..7e9f8b1297b80 100644
--- a/vllm/platforms/__init__.py
+++ b/vllm/platforms/__init__.py
@@ -65,6 +65,13 @@
except ImportError:
pass
+is_openvino = False
+try:
+ from importlib.metadata import version
+ is_openvino = "openvino" in version("vllm")
+except Exception:
+ pass
+
if is_tpu:
# people might install pytorch built with cuda but run on tpu
# so we need to check tpu first
@@ -85,6 +92,9 @@
elif is_neuron:
from .neuron import NeuronPlatform
current_platform = NeuronPlatform()
+elif is_openvino:
+ from .openvino import OpenVinoPlatform
+ current_platform = OpenVinoPlatform()
else:
current_platform = UnspecifiedPlatform()
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
index d36367f2bc9c1..7c933385d6ff6 100644
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -11,6 +11,7 @@ class PlatformEnum(enum.Enum):
XPU = enum.auto()
CPU = enum.auto()
NEURON = enum.auto()
+ OPENVINO = enum.auto()
UNSPECIFIED = enum.auto()
@@ -52,6 +53,9 @@ def is_cpu(self) -> bool:
def is_neuron(self) -> bool:
return self._enum == PlatformEnum.NEURON
+ def is_openvino(self) -> bool:
+ return self._enum == PlatformEnum.OPENVINO
+
def is_cuda_alike(self) -> bool:
"""Stateless version of :func:`torch.cuda.is_available`."""
return self._enum in (PlatformEnum.CUDA, PlatformEnum.ROCM)
diff --git a/vllm/platforms/openvino.py b/vllm/platforms/openvino.py
new file mode 100644
index 0000000000000..35dbe22abf7ff
--- /dev/null
+++ b/vllm/platforms/openvino.py
@@ -0,0 +1,31 @@
+import torch
+
+import vllm.envs as envs
+from vllm.utils import print_warning_once
+
+from .interface import Platform, PlatformEnum
+
+
+class OpenVinoPlatform(Platform):
+ _enum = PlatformEnum.OPENVINO
+
+ @classmethod
+ def get_device_name(self, device_id: int = 0) -> str:
+ return "openvino"
+
+ @classmethod
+ def inference_mode(self):
+ return torch.inference_mode(mode=True)
+
+ @classmethod
+ def is_openvino_cpu(self) -> bool:
+ return "CPU" in envs.VLLM_OPENVINO_DEVICE
+
+ @classmethod
+ def is_openvino_gpu(self) -> bool:
+ return "GPU" in envs.VLLM_OPENVINO_DEVICE
+
+ @classmethod
+ def is_pin_memory_available(self) -> bool:
+ print_warning_once("Pin memory is not supported on OpenViNO.")
+ return False
diff --git a/vllm/utils.py b/vllm/utils.py
index 0e9b241b6f9f6..fba9804289b94 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -318,15 +318,6 @@ def is_hip() -> bool:
return torch.version.hip is not None
-@lru_cache(maxsize=None)
-def is_openvino() -> bool:
- from importlib.metadata import PackageNotFoundError, version
- try:
- return "openvino" in version("vllm")
- except PackageNotFoundError:
- return False
-
-
@lru_cache(maxsize=None)
def get_max_shared_memory_bytes(gpu: int = 0) -> int:
"""Returns the maximum shared memory per thread block in bytes."""
@@ -757,7 +748,7 @@ def is_pin_memory_available() -> bool:
elif current_platform.is_neuron():
print_warning_once("Pin memory is not supported on Neuron.")
return False
- elif current_platform.is_cpu() or is_openvino():
+ elif current_platform.is_cpu() or current_platform.is_openvino():
return False
return True
diff --git a/vllm/worker/openvino_worker.py b/vllm/worker/openvino_worker.py
index bc245d19663d6..a420d390c1ae4 100644
--- a/vllm/worker/openvino_worker.py
+++ b/vllm/worker/openvino_worker.py
@@ -13,12 +13,12 @@
from vllm.distributed import (broadcast_tensor_dict,
ensure_model_parallel_initialized,
init_distributed_environment)
-from vllm.executor.openvino_executor import is_openvino_cpu
from vllm.inputs import INPUT_REGISTRY
from vllm.logger import init_logger
from vllm.model_executor import set_random_seed
from vllm.model_executor.layers.sampler import SamplerOutput
from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.platforms import current_platform
from vllm.sampling_params import SamplingParams
from vllm.sequence import ExecuteModelRequest, SequenceGroupMetadata
from vllm.worker.openvino_model_runner import OpenVINOModelRunner
@@ -99,7 +99,7 @@ def _allocate_kv_cache(
num_blocks, self.block_size, self.num_kv_heads, self.head_size)[1:]
kv_cache: List[Tuple[ov.Tensor, ov.Tensor]] = []
- if is_openvino_cpu():
+ if current_platform.is_openvino_cpu():
for _ in range(self.num_layers):
key_blocks = ov.Tensor(self.cache_config.cache_dtype,
k_block_shape)
@@ -141,7 +141,7 @@ def _allocate_swap_cache(
if num_blocks == 0:
return swap_cache
- assert not is_openvino_cpu(), \
+ assert not current_platform.is_openvino_cpu(), \
"CPU device isn't supposed to have swap cache"
# Update key_cache shape:
@@ -285,7 +285,7 @@ def determine_num_available_blocks(self) -> Tuple[int, int]:
cache_block_size = self.get_cache_block_size_bytes()
kvcache_space_bytes = self.cache_config.openvino_kvcache_space_bytes
- if is_openvino_cpu():
+ if current_platform.is_openvino_cpu():
num_device_blocks = int(kvcache_space_bytes // cache_block_size)
num_swap_blocks = 0
else:
@@ -322,7 +322,7 @@ def initialize_cache(self, num_gpu_blocks: int,
num_device_blocks = num_gpu_blocks
num_swap_blocks = num_cpu_blocks
- if is_openvino_cpu():
+ if current_platform.is_openvino_cpu():
assert (num_swap_blocks == 0
), f"{type(self)} does not support swappable cache for CPU"
@@ -366,7 +366,7 @@ def _init_cache_engine(self) -> None:
assert self.kv_cache is not None
# Populate the cache to warmup the memory
- if is_openvino_cpu():
+ if current_platform.is_openvino_cpu():
for key_cache, value_cache in self.kv_cache:
key_cache.data[:] = 0
value_cache.data[:] = 0
@@ -414,7 +414,7 @@ def execute_model(
blocks_to_swap_in = data["blocks_to_swap_in"]
blocks_to_swap_out = data["blocks_to_swap_out"]
- if is_openvino_cpu():
+ if current_platform.is_openvino_cpu():
assert len(execute_model_req.blocks_to_swap_in) == 0
assert len(execute_model_req.blocks_to_swap_out) == 0
else:
@@ -466,7 +466,7 @@ def get_cache_block_size_bytes(self) -> int:
def profile_run(self) -> int:
ov_device = envs.VLLM_OPENVINO_DEVICE
- assert not is_openvino_cpu(), \
+ assert not current_platform.is_openvino_cpu(), \
"CPU device isn't supposed to use profile run."
import openvino.properties.device as device
From 55137e8ee32509b2fa3b83d5caaee018a929f82d Mon Sep 17 00:00:00 2001
From: ErkinSagiroglu <52523336+MErkinSag@users.noreply.github.com>
Date: Sat, 26 Oct 2024 13:12:57 +0100
Subject: [PATCH 006/113] Fix: MI100 Support By Bypassing Custom Paged
Attention (#9560)
---
vllm/attention/backends/rocm_flash_attn.py | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py
index c2aec4aaa74e7..30859dfa60634 100644
--- a/vllm/attention/backends/rocm_flash_attn.py
+++ b/vllm/attention/backends/rocm_flash_attn.py
@@ -21,7 +21,10 @@
logger = init_logger(__name__)
_PARTITION_SIZE_ROCM = 512
-_ON_NAVI = "gfx1" in torch.cuda.get_device_properties("cuda").gcnArchName
+_GPU_ARCH = torch.cuda.get_device_properties("cuda").gcnArchName
+_ON_NAVI = "gfx1" in _GPU_ARCH
+_ON_MI250_MI300 = any(arch in _GPU_ARCH
+ for arch in ["gfx90a", "gfx940", "gfx941", "gfx942"])
class ROCmFlashAttentionBackend(AttentionBackend):
@@ -662,7 +665,8 @@ def _use_rocm_custom_paged_attention(qtype: torch.dtype, head_size: int,
block_size: int, gqa_ratio: int,
max_seq_len: int) -> bool:
# rocm custom page attention not support on navi (gfx1*)
- return (not _ON_NAVI and (qtype == torch.half or qtype == torch.bfloat16)
+ return (_ON_MI250_MI300 and not _ON_NAVI
+ and (qtype == torch.half or qtype == torch.bfloat16)
and (head_size == 64 or head_size == 128)
and (block_size == 16 or block_size == 32)
and (gqa_ratio >= 1 and gqa_ratio <= 16) and max_seq_len <= 32768)
From 07e981fdf43bb7a7186c782a5ad6b99b36c2fc19 Mon Sep 17 00:00:00 2001
From: Vasiliy Alekseev
Date: Sat, 26 Oct 2024 19:29:38 +0300
Subject: [PATCH 007/113] [Frontend] Bad words sampling parameter (#9717)
Signed-off-by: Vasily Alexeev
---
tests/samplers/test_no_bad_words.py | 185 ++++++++++++++++++
vllm/engine/llm_engine.py | 13 +-
vllm/logits_process.py | 119 +++++++++++
.../guided_decoding/__init__.py | 3 +-
.../lm_format_enforcer_decoding.py | 3 +-
vllm/sampling_params.py | 32 +--
6 files changed, 339 insertions(+), 16 deletions(-)
create mode 100644 tests/samplers/test_no_bad_words.py
create mode 100644 vllm/logits_process.py
diff --git a/tests/samplers/test_no_bad_words.py b/tests/samplers/test_no_bad_words.py
new file mode 100644
index 0000000000000..4190cf7cd7664
--- /dev/null
+++ b/tests/samplers/test_no_bad_words.py
@@ -0,0 +1,185 @@
+"""Make sure bad_words works.
+
+Run `pytest tests/samplers/test_no_bad_words.py`.
+
+"""
+from typing import List, Optional
+
+from transformers import AutoTokenizer
+
+from vllm import LLM, SamplingParams
+
+
+def _generate(
+ model: LLM,
+ prompt: str,
+ num_prompt_tokens: int,
+ temperature: float = 0,
+ bad_words: Optional[List[str]] = None,
+) -> List[int]:
+ sampling_params = SamplingParams(
+ temperature=temperature,
+ bad_words=bad_words,
+ )
+
+ # [([output_token_ids, ], [output_text, ]), ]
+ output = model.generate([prompt], sampling_params=sampling_params)
+
+ output_token_ids = output[0][0][0][num_prompt_tokens:]
+ # [0] first (and only) request output
+ # [0] token_ids (not text)
+ # [0] first (and only) output completion
+
+ return output_token_ids
+
+
+class TestOneTokenBadWord:
+ MODEL = "TheBloke/Llama-2-7B-fp16"
+
+ PROMPT = "Hi! How are"
+ TARGET_TOKEN = "you"
+
+ def setup_method(self, method):
+ self.tokenizer = AutoTokenizer.from_pretrained(self.MODEL,
+ add_prefix_space=True)
+
+ self.num_prompt_tokens = len(self._encode(self.PROMPT))
+ self.target_token_id = self._encode(self.TARGET_TOKEN,
+ add_special_tokens=False)[0]
+
+ def test_one_token_bad_word(self, vllm_runner):
+ with vllm_runner(self.MODEL) as llm:
+ output_token_ids = self._generate(llm)
+ assert output_token_ids[0] == self.target_token_id
+
+ output_token_ids = self._generate(llm,
+ bad_words=[self.TARGET_TOKEN])
+ assert self.target_token_id not in output_token_ids
+
+ def _generate(self,
+ model: LLM,
+ bad_words: Optional[List[str]] = None) -> List[int]:
+ return _generate(
+ model=model,
+ prompt=self.PROMPT,
+ num_prompt_tokens=self.num_prompt_tokens,
+ bad_words=bad_words,
+ )
+
+ def _encode(self,
+ prompt: str,
+ add_special_tokens: bool = True) -> List[int]:
+ return self.tokenizer(prompt,
+ add_special_tokens=add_special_tokens).input_ids
+
+
+class TestTwoTokenBadWord:
+ # Another model (with a different tokenizer behaviour)
+ MODEL = "openai-community/gpt2"
+
+ PROMPT = "How old are you? I am 10"
+ TARGET_TOKEN1 = "years"
+ TARGET_TOKEN2 = "old"
+ NEIGHBOUR_TOKEN2 = "older"
+
+ def setup_method(self, method):
+ self.tokenizer = AutoTokenizer.from_pretrained(self.MODEL,
+ add_prefix_space=True)
+
+ self.num_prompt_tokens = len(self._encode(self.PROMPT))
+ self.target_token_id1 = self._encode(self.TARGET_TOKEN1,
+ add_special_tokens=False)[0]
+ self.target_token_id2 = self._encode(self.TARGET_TOKEN2,
+ add_special_tokens=False)[0]
+ self.neighbour_token_id2 = self._encode(self.NEIGHBOUR_TOKEN2,
+ add_special_tokens=False)[0]
+
+ def test_two_token_bad_word(self, vllm_runner):
+ with vllm_runner(self.MODEL) as llm:
+ output_token_ids = self._generate(llm)
+ assert output_token_ids[:2] == [
+ self.target_token_id1, self.target_token_id2
+ ]
+
+ output_token_ids = self._generate(llm,
+ bad_words=[self.TARGET_TOKEN1])
+ assert self.target_token_id1 not in output_token_ids
+
+ output_token_ids = self._generate(llm,
+ bad_words=[self.TARGET_TOKEN2])
+ assert output_token_ids[0] == self.target_token_id1
+ assert self.target_token_id2 not in output_token_ids
+
+ output_token_ids = self._generate(
+ llm, bad_words=[f'{self.TARGET_TOKEN1} {self.TARGET_TOKEN2}'])
+ assert output_token_ids[0] == self.target_token_id1
+ assert output_token_ids[:2] != [
+ self.target_token_id1, self.target_token_id2
+ ]
+ assert not self._contains(
+ output_token_ids,
+ [self.target_token_id1, self.target_token_id2])
+ # Model dependent behaviour
+ assert output_token_ids[:2] == [
+ self.target_token_id1, self.neighbour_token_id2
+ ]
+
+ output_token_ids = self._generate(
+ llm,
+ bad_words=[
+ f'{self.TARGET_TOKEN1} {self.TARGET_TOKEN2}',
+ f'{self.TARGET_TOKEN1} {self.NEIGHBOUR_TOKEN2}'
+ ])
+ assert output_token_ids[0] == self.target_token_id1
+ assert output_token_ids[:2] != [
+ self.target_token_id1, self.target_token_id2
+ ]
+ assert not self._contains(
+ output_token_ids,
+ [self.target_token_id1, self.target_token_id2])
+ assert output_token_ids[:2] != [
+ self.target_token_id1, self.neighbour_token_id2
+ ]
+ assert not self._contains(
+ output_token_ids,
+ [self.target_token_id1, self.neighbour_token_id2])
+ assert ((self.target_token_id2 in output_token_ids)
+ or (self.neighbour_token_id2 in output_token_ids))
+
+ def _generate(self,
+ model: LLM,
+ bad_words: Optional[List[str]] = None) -> List[int]:
+ return _generate(
+ model=model,
+ prompt=self.PROMPT,
+ num_prompt_tokens=self.num_prompt_tokens,
+ bad_words=bad_words,
+ )
+
+ @staticmethod
+ def _contains(sequence: List[int], subsequence: List[int]) -> bool:
+ searched = False
+
+ for start in range(len(sequence)):
+ end = start + len(subsequence)
+ current_subsequence = sequence[start:end]
+
+ if len(current_subsequence) < len(subsequence):
+ continue
+
+ searched = True
+
+ assert len(current_subsequence) == len(subsequence)
+
+ if current_subsequence == subsequence:
+ return True
+
+ assert searched, "All subsequences did not match in length..."
+
+ return False
+
+ def _encode(self,
+ prompt: str,
+ add_special_tokens: bool = True) -> List[int]:
+ return self.tokenizer(prompt,
+ add_special_tokens=add_special_tokens).input_ids
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 1dd0f097c74ff..ede77f04b1db9 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -26,7 +26,8 @@
SequenceGroupOutputProcessor)
from vllm.engine.output_processor.stop_checker import StopChecker
from vllm.engine.output_processor.util import create_output_by_sequence_group
-from vllm.entrypoints.openai.logits_processors import get_logits_processors
+from vllm.entrypoints.openai.logits_processors import (
+ get_logits_processors as get_openai_logits_processors)
from vllm.executor.executor_base import ExecutorBase
from vllm.executor.gpu_executor import GPUExecutor
from vllm.executor.ray_utils import initialize_ray_cluster
@@ -34,6 +35,7 @@
EncoderDecoderInputs, InputRegistry, PromptType)
from vllm.inputs.preprocess import InputPreprocessor
from vllm.logger import init_logger
+from vllm.logits_process import get_bad_words_logits_processors
from vllm.lora.request import LoRARequest
from vllm.model_executor.guided_decoding import (
get_local_guided_decoding_logits_processor)
@@ -1963,6 +1965,7 @@ def _build_logits_processors(
logits_processors field. Returns the modified sampling params."""
logits_processors = []
+
if (guided_decoding := sampling_params.guided_decoding) is not None:
logger.debug(
@@ -1984,7 +1987,7 @@ def _build_logits_processors(
if (sampling_params.logit_bias or sampling_params.allowed_token_ids):
tokenizer = self.get_tokenizer(lora_request=lora_request)
- processors = get_logits_processors(
+ processors = get_openai_logits_processors(
logit_bias=sampling_params.logit_bias,
allowed_token_ids=sampling_params.allowed_token_ids,
tokenizer=tokenizer)
@@ -1994,6 +1997,12 @@ def _build_logits_processors(
sampling_params.logit_bias = None
sampling_params.allowed_token_ids = None
+ if len(sampling_params.bad_words) > 0:
+ tokenizer = self.get_tokenizer(lora_request)
+ processors = get_bad_words_logits_processors(
+ bad_words=sampling_params.bad_words, tokenizer=tokenizer)
+ logits_processors.extend(processors)
+
if logits_processors:
if sampling_params.logits_processors is None:
sampling_params.logits_processors = logits_processors
diff --git a/vllm/logits_process.py b/vllm/logits_process.py
new file mode 100644
index 0000000000000..7716ccd27e253
--- /dev/null
+++ b/vllm/logits_process.py
@@ -0,0 +1,119 @@
+from typing import Callable, List, Tuple, Union
+
+import torch
+
+from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
+
+LogitsProcessor = Union[Callable[[List[int], torch.Tensor], torch.Tensor],
+ Callable[[List[int], List[int], torch.Tensor],
+ torch.Tensor]]
+"""LogitsProcessor is a function that takes a list
+of previously generated tokens, the logits tensor
+for the next token and, optionally, prompt tokens as a
+first argument, and returns a modified tensor of logits
+to sample from."""
+
+
+def get_bad_words_logits_processors(
+        bad_words: List[str],
+        tokenizer: AnyTokenizer) -> List[LogitsProcessor]:
+    """Build the logits processor that bans ``bad_words`` during sampling.
+
+    Each word is tokenized twice, without and with a leading space, so that
+    it is prohibited both at the beginning and in the middle of text
+    (related to the ``add_prefix_space`` tokenizer parameter). The
+    space-prefixed variant is kept only when it has the same number of
+    tokens as the plain variant but starts with a different token, i.e.
+    when the prefix space produces a new word token.
+
+    Words that tokenize to nothing (for example an empty or whitespace-only
+    string) are skipped instead of raising ``IndexError``.
+
+    Args:
+        bad_words: Words that must not be generated.
+        tokenizer: Tokenizer used to turn each word into token ids.
+
+    Returns:
+        A single-element list holding a ``NoBadWordsLogitsProcessor``.
+    """
+
+    def _encode(text: str) -> List[int]:
+        if isinstance(tokenizer, MistralTokenizer):
+            # Mistral tokenizers should not add special tokens
+            return tokenizer.encode(prompt=text)
+        return tokenizer.encode(text=text, add_special_tokens=False)
+
+    bad_words_ids: List[List[int]] = []
+
+    for bad_word in bad_words:
+        word = bad_word.lstrip()
+
+        plain_ids = _encode(word)
+        if not plain_ids:
+            # Nothing to ban; also keeps the comparison below in bounds.
+            continue
+        bad_words_ids.append(plain_ids)
+
+        spaced_ids = _encode(" " + word)
+        # Length is checked first so that indexing [0] is always safe.
+        if (len(spaced_ids) == len(plain_ids)
+                and spaced_ids[0] != plain_ids[0]):
+            bad_words_ids.append(spaced_ids)
+
+    return [NoBadWordsLogitsProcessor(bad_words_ids=bad_words_ids)]
+
+
+class NoBadWordsLogitsProcessor:
+    """Logits processor that stops banned token sequences from completing.
+
+    Single-token bad words are masked on every call through a bias vector
+    that is built lazily on the first call. For a multi-token bad word only
+    its last token is masked, and only when the most recently generated
+    tokens equal the rest of the sequence. Empty sequences are ignored.
+    """
+    _SMALLEST_LOGIT = float("-inf")
+    _NEUTRAL_LOGIT = 0.0
+
+    def __init__(self, bad_words_ids: List[List[int]]):
+        """Store the banned token-id sequences.
+
+        Args:
+            bad_words_ids: One list of token ids per banned word.
+        """
+        self.bad_words_ids = bad_words_ids
+        # Built on the first __call__, once the vocabulary size and the
+        # device of the logits are known.
+        self.word_bias: torch.FloatTensor = None
+
+    def __call__(
+        self,
+        past_tokens_ids: Union[List[int], Tuple[int]],
+        logits: torch.FloatTensor,
+    ) -> torch.Tensor:
+        """Return ``logits`` with the currently banned tokens set to -inf.
+
+        Args:
+            past_tokens_ids: Token ids generated so far.
+            logits: Logits for the next token; the vocabulary size is taken
+                from the last dimension.
+
+        Returns:
+            A new tensor; ``logits`` itself is not modified in place.
+
+        Raises:
+            ValueError: On the first call, if any banned token id lies
+                outside ``[0, vocab_size)``.
+        """
+        if self.word_bias is None:
+            self._init_word_bias(logits=logits)
+
+        last_token_bias = torch.zeros_like(logits)
+
+        for bad_word_ids in self.bad_words_ids:
+            # Empty sequences ban nothing, and 1-token words are already
+            # covered by word_bias.
+            if len(bad_word_ids) <= 1:
+                continue
+
+            # Not enough history yet for the prefix to have been generated.
+            if len(bad_word_ids) > len(past_tokens_ids) + 1:
+                continue
+
+            prefix_length = len(bad_word_ids) - 1
+            actual_prefix = past_tokens_ids[-prefix_length:]
+            expected_prefix = bad_word_ids[:prefix_length]
+
+            if tuple(actual_prefix) == tuple(expected_prefix):
+                last_token_bias[bad_word_ids[-1]] = self._SMALLEST_LOGIT
+
+        logits = logits + self.word_bias + last_token_bias
+
+        return logits
+
+    def _init_word_bias(self, logits: torch.FloatTensor) -> None:
+        """Validate the token ids and build the single-token bias vector."""
+        # Code based on NoBadWordsLogitsProcessor and SequenceBiasLogitsProcessor  # noqa: E501
+        # from https://github.com/huggingface/transformers/blob/main/src/transformers/generation/logits_process.py
+
+        vocab_size = logits.shape[-1]
+
+        self._check_token_ids_bounds(vocab_size=vocab_size)
+
+        self.word_bias = torch.zeros((vocab_size, ),
+                                     dtype=torch.float,
+                                     device=logits.device)
+
+        for bad_word_ids in self.bad_words_ids:
+            if len(bad_word_ids) == 1:
+                bad_word_id = bad_word_ids[-1]
+                self.word_bias[bad_word_id] = self._SMALLEST_LOGIT
+
+    def _check_token_ids_bounds(self, vocab_size: int) -> None:
+        """Raise ``ValueError`` if a banned token id is outside the vocab."""
+        invalid_token_ids = [
+            token_id for bad_word_ids in self.bad_words_ids
+            for token_id in bad_word_ids
+            if token_id < 0 or token_id >= vocab_size
+        ]
+
+        if len(invalid_token_ids) > 0:
+            raise ValueError(
+                f"The model vocabulary size is {vocab_size},"
+                f" but the following tokens"
+                f" were specified as bad: {invalid_token_ids}."
+                f" All token id values should be integers satisfying:"
+                f" 0 <= token_id < {vocab_size}.")
diff --git a/vllm/model_executor/guided_decoding/__init__.py b/vllm/model_executor/guided_decoding/__init__.py
index 368436aa14613..d7b67425fcbc0 100644
--- a/vllm/model_executor/guided_decoding/__init__.py
+++ b/vllm/model_executor/guided_decoding/__init__.py
@@ -1,6 +1,7 @@
from typing import Optional
-from vllm.sampling_params import GuidedDecodingParams, LogitsProcessor
+from vllm.logits_process import LogitsProcessor
+from vllm.sampling_params import GuidedDecodingParams
async def get_guided_decoding_logits_processor(
diff --git a/vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py b/vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py
index cf2162ed7720d..a17e75a80300f 100644
--- a/vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py
+++ b/vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py
@@ -9,7 +9,8 @@
build_vllm_logits_processor, build_vllm_token_enforcer_tokenizer_data)
from transformers import PreTrainedTokenizerBase
-from vllm.sampling_params import GuidedDecodingParams, LogitsProcessor
+from vllm.logits_process import LogitsProcessor
+from vllm.sampling_params import GuidedDecodingParams
def get_local_lm_format_enforcer_guided_decoding_logits_processor(
diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py
index 9993cec13d649..bac32c991a0e3 100644
--- a/vllm/sampling_params.py
+++ b/vllm/sampling_params.py
@@ -3,14 +3,14 @@
from dataclasses import dataclass
from enum import Enum, IntEnum
from functools import cached_property
-from typing import Any, Callable, Dict, List, Optional, Set, Union
+from typing import Any, Dict, List, Optional, Set, Union
import msgspec
-import torch
from pydantic import BaseModel
from typing_extensions import Annotated
from vllm.logger import init_logger
+from vllm.logits_process import LogitsProcessor
logger = init_logger(__name__)
@@ -24,16 +24,6 @@ class SamplingType(IntEnum):
RANDOM_SEED = 2
-LogitsProcessor = Union[Callable[[List[int], torch.Tensor], torch.Tensor],
- Callable[[List[int], List[int], torch.Tensor],
- torch.Tensor]]
-"""LogitsProcessor is a function that takes a list
-of previously generated tokens, the logits tensor
-for the next token and, optionally, prompt tokens as a
-first argument, and returns a modified tensor of logits
-to sample from."""
-
-
# maybe make msgspec?
@dataclass
class GuidedDecodingParams:
@@ -139,6 +129,10 @@ class SamplingParams(
stop_token_ids: List of tokens that stop the generation when they are
generated. The returned output will contain the stop tokens unless
the stop tokens are special tokens.
+ bad_words: List of words that are not allowed to be generated.
+ More precisely, only the last token of a corresponding
+ token sequence is not allowed when the next generated token
+ can complete the sequence.
include_stop_str_in_output: Whether to include the stop strings in
output text. Defaults to False.
ignore_eos: Whether to ignore the EOS token and continue generating
@@ -186,6 +180,7 @@ class SamplingParams(
seed: Optional[int] = None
stop: Optional[Union[str, List[str]]] = None
stop_token_ids: Optional[List[int]] = None
+ bad_words: Optional[List[str]] = None
ignore_eos: bool = False
max_tokens: Optional[int] = 16
min_tokens: int = 0
@@ -228,6 +223,7 @@ def from_optional(
seed: Optional[int] = None,
stop: Optional[Union[str, List[str]]] = None,
stop_token_ids: Optional[List[int]] = None,
+ bad_words: Optional[List[str]] = None,
include_stop_str_in_output: bool = False,
ignore_eos: bool = False,
max_tokens: Optional[int] = 16,
@@ -267,6 +263,7 @@ def from_optional(
seed=seed,
stop=stop,
stop_token_ids=stop_token_ids,
+ bad_words=bad_words,
include_stop_str_in_output=include_stop_str_in_output,
ignore_eos=ignore_eos,
max_tokens=max_tokens,
@@ -298,26 +295,36 @@ def __post_init__(self) -> None:
f"got n={self.n} and best_of={self.best_of}.")
self._real_n = self.n
self.n = self.best_of
+
if 0 < self.temperature < _MAX_TEMP:
logger.warning(
"temperature %s is less than %s, which may cause numerical "
"errors nan or inf in tensors. We have maxed it out to %s.",
self.temperature, _MAX_TEMP, _MAX_TEMP)
self.temperature = max(self.temperature, _MAX_TEMP)
+
if self.seed == -1:
self.seed = None
else:
self.seed = self.seed
+
if self.stop is None:
self.stop = []
elif isinstance(self.stop, str):
self.stop = [self.stop]
else:
self.stop = list(self.stop)
+
if self.stop_token_ids is None:
self.stop_token_ids = []
else:
self.stop_token_ids = list(self.stop_token_ids)
+
+ if self.bad_words is None:
+ self.bad_words = []
+ else:
+ self.bad_words = list(self.bad_words)
+
self.logprobs = 1 if self.logprobs is True else self.logprobs
self.prompt_logprobs = (1 if self.prompt_logprobs is True else
self.prompt_logprobs)
@@ -468,6 +475,7 @@ def __repr__(self) -> str:
f"seed={self.seed}, "
f"stop={self.stop}, "
f"stop_token_ids={self.stop_token_ids}, "
+ f"bad_words={self.bad_words}, "
f"include_stop_str_in_output={self.include_stop_str_in_output}, "
f"ignore_eos={self.ignore_eos}, "
f"max_tokens={self.max_tokens}, "
From 6650e6a930dbdf1cd4def9b58e952376400ccfcf Mon Sep 17 00:00:00 2001
From: kakao-kevin-us
Date: Sun, 27 Oct 2024 02:53:35 +0900
Subject: [PATCH 008/113] [Model] Add classification Task with
Qwen2ForSequenceClassification (#9704)
Signed-off-by: Kevin-Yang
Co-authored-by: Kevin-Yang
---
docs/source/models/supported_models.rst | 22 ++++
tests/conftest.py | 19 ++++
.../embedding/language/test_cls_models.py | 53 +++++++++
vllm/model_executor/layers/pooler.py | 9 +-
vllm/model_executor/models/qwen2_cls.py | 107 ++++++++++++++++++
vllm/model_executor/models/registry.py | 2 +
6 files changed, 211 insertions(+), 1 deletion(-)
create mode 100644 tests/models/embedding/language/test_cls_models.py
create mode 100644 vllm/model_executor/models/qwen2_cls.py
diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst
index 98d804052b575..ff893b613f150 100644
--- a/docs/source/models/supported_models.rst
+++ b/docs/source/models/supported_models.rst
@@ -361,6 +361,28 @@ Reward Modeling
.. note::
As an interim measure, these models are supported via Embeddings API. See `this RFC `_ for upcoming changes.
+Classification
+---------------
+
+.. list-table::
+ :widths: 25 25 50 5 5
+ :header-rows: 1
+
+ * - Architecture
+ - Models
+ - Example HF Models
+ - :ref:`LoRA <lora>`
+ - :ref:`PP <distributed_serving>`
+ * - :code:`Qwen2ForSequenceClassification`
+ - Qwen2-based
+ - :code:`jason9693/Qwen2.5-1.5B-apeach`, etc.
+ -
+ - ✅︎
+
+.. note::
+ As an interim measure, these models are supported via the Embeddings API. They will be supported via a Classification API in the future (no reference API exists yet).
+
+
Multimodal Language Models
^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/tests/conftest.py b/tests/conftest.py
index 6adff5e2328c4..2fce2d772c6ed 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -343,6 +343,17 @@ def get_inputs(
return all_inputs
+    def classify(self, prompts: List[str]) -> List[List[float]]:
+        """Return, for each prompt, the model's softmax class probabilities.
+
+        Every prompt is run through ``self.model`` on its own, and the
+        softmax over the last dimension of ``output.logits`` is taken.
+        """
+        all_inputs = self.get_inputs(prompts)
+        outputs: List[List[float]] = []
+        for inputs in all_inputs:
+            output = self.model(**self.wrap_device(inputs))
+            # The final logits are turned into probabilities here.
+            # assumes output.logits is (batch, num_labels) with batch == 1,
+            # so [0] selects this prompt's row -- TODO confirm
+            probs = output.logits.softmax(dim=-1)[0].tolist()
+            outputs.append(probs)
+
+        return outputs
+
def generate(
self,
prompts: List[str],
@@ -688,6 +699,14 @@ def get_inputs(
return inputs
+    def classify(self, prompts: List[str]) -> List[List[float]]:
+        """Return, for each prompt, the embedding from ``self.model.encode``.
+
+        NOTE(review): presumably each embedding is a flat list of floats
+        holding the class probabilities of a classification model --
+        confirm against the pooler output type.
+        """
+        req_outputs = self.model.encode(prompts)
+        return [req_output.outputs.embedding for req_output in req_outputs]
+
def generate(
self,
prompts: List[str],
diff --git a/tests/models/embedding/language/test_cls_models.py b/tests/models/embedding/language/test_cls_models.py
new file mode 100644
index 0000000000000..d8ca6d361f0e3
--- /dev/null
+++ b/tests/models/embedding/language/test_cls_models.py
@@ -0,0 +1,53 @@
+"""Compare the classification outputs of HF and vLLM.
+
+This test only tests small models. Big models such as 7B should be tested from
+test_big_models.py because it could use a larger instance to run tests.
+
+Run `pytest tests/models/embedding/language/test_cls_models.py`.
+"""
+import pytest
+import torch
+from transformers import AutoModelForSequenceClassification
+
+CLASSIFICATION_MODELS = ["jason9693/Qwen2.5-1.5B-apeach"]
+
+
+@pytest.mark.parametrize("model", CLASSIFICATION_MODELS)
+@pytest.mark.parametrize("dtype", ["float"])
+def test_classification_models(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+) -> None:
+    """Check that vLLM and HF return matching class scores per prompt."""
+    with hf_runner(model,
+                   dtype=dtype,
+                   auto_cls=AutoModelForSequenceClassification) as hf_model:
+        hf_outputs = hf_model.classify(example_prompts)
+
+    with vllm_runner(model, dtype=dtype) as vllm_model:
+        vllm_outputs = vllm_model.classify(example_prompts)
+
+    print(hf_outputs, vllm_outputs)
+
+    # Compare the per-prompt scores within a relative tolerance of 1e-3.
+    for hf_scores, vllm_scores in zip(hf_outputs, vllm_outputs):
+        expected = torch.tensor(hf_scores)
+        actual = torch.tensor(vllm_scores)
+
+        assert torch.allclose(expected, actual, rtol=1e-3)
+
+
+@pytest.mark.parametrize("model", CLASSIFICATION_MODELS)
+@pytest.mark.parametrize("dtype", ["float"])
+def test_classification_model_print(
+    vllm_runner,
+    model: str,
+    dtype: str,
+) -> None:
+    """Smoke test: printing the loaded model must not raise."""
+    with vllm_runner(model, dtype=dtype) as vllm_model:
+        # This test is for verifying whether the model's extra_repr
+        # can be printed correctly.
+        engine = vllm_model.model.llm_engine
+        model_runner = engine.model_executor.driver_worker.model_runner
+        print(model_runner.model)
diff --git a/vllm/model_executor/layers/pooler.py b/vllm/model_executor/layers/pooler.py
index 3455a4ccf282f..0a1df9cb699ae 100644
--- a/vllm/model_executor/layers/pooler.py
+++ b/vllm/model_executor/layers/pooler.py
@@ -28,11 +28,15 @@ class Pooler(nn.Module):
normalize: Whether to normalize the pooled data.
"""
- def __init__(self, pooling_type: PoolingType, normalize: bool):
+ def __init__(self,
+ pooling_type: PoolingType,
+ normalize: bool,
+ softmax: bool = False):
super().__init__()
self.pooling_type = pooling_type
self.normalize = normalize
+ self.softmax = softmax
def forward(
self,
@@ -64,6 +68,9 @@ def forward(
if self.normalize:
pooled_data = nn.functional.normalize(pooled_data, p=2, dim=1)
+ if self.softmax:
+ pooled_data = nn.functional.softmax(pooled_data, dim=-1)
+
pooled_outputs = [
EmbeddingSequenceGroupOutput(data.tolist()) for data in pooled_data
]
diff --git a/vllm/model_executor/models/qwen2_cls.py b/vllm/model_executor/models/qwen2_cls.py
new file mode 100644
index 0000000000000..e10c6dbbb6472
--- /dev/null
+++ b/vllm/model_executor/models/qwen2_cls.py
@@ -0,0 +1,107 @@
+# coding=utf-8
+# Adapted from
+# https://huggingface.co/Qwen/Qwen2.5-Math-RM-72B/blob/main/modeling_qwen2_rm.py
+# Copyright 2024 Kakao Corp. (Kanana-X Team)
+# Copyright 2024 The Qwen team.
+# Copyright 2023 The vLLM team.
+"""Inference-only Qwen2-Classification model compatible with HF weights."""
+from typing import Iterable, List, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import Qwen2Config
+
+from vllm.attention import AttentionMetadata
+from vllm.config import CacheConfig, LoRAConfig
+from vllm.model_executor.layers.linear import RowParallelLinear
+from vllm.model_executor.layers.pooler import Pooler, PoolingType
+from vllm.model_executor.layers.quantization.base_config import (
+ QuantizationConfig)
+from vllm.model_executor.models.qwen2 import Qwen2Model
+from vllm.model_executor.pooling_metadata import PoolingMetadata
+from vllm.sequence import IntermediateTensors, PoolerOutput
+
+from .utils import AutoWeightsLoader
+
+
+class Qwen2ForSequenceClassification(nn.Module):
+    """Qwen2 backbone with a linear ``score`` head for classification.
+
+    ``forward`` returns the output of the ``score`` head applied to the
+    backbone's hidden states. ``pooler`` applies last-token pooling followed
+    by a softmax, without L2 normalization.
+    """
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+    }
+
+    # LoRA specific attributes
+    supported_lora_modules = [
+        "qkv_proj",
+        "o_proj",
+        "gate_up_proj",
+        "down_proj",
+    ]
+    embedding_modules = {}
+    embedding_padding_modules = []
+
+    def __init__(
+        self,
+        config: Qwen2Config,
+        cache_config: Optional[CacheConfig] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        lora_config: Optional[LoRAConfig] = None,
+    ) -> None:
+        """Build the backbone, the score head and the pooler.
+
+        Raises:
+            ValueError: If a sliding window is configured and ``config``
+                defines ``max_window_layers``.
+        """
+        # TODO (@robertgshaw2): see if this can be moved out
+        # cache_config is optional, so it must be checked before it is
+        # dereferenced.
+        if (cache_config is not None
+                and cache_config.sliding_window is not None
+                and hasattr(config, "max_window_layers")):
+            raise ValueError("Sliding window for some but not all layers is "
+                             "not supported. This model uses sliding window "
+                             "but `max_window_layers` = %s is less than "
+                             "`num_hidden_layers` = %s. Please open an issue "
+                             "to discuss this feature." % (
+                                 config.max_window_layers,
+                                 config.num_hidden_layers,
+                             ))
+
+        super().__init__()
+
+        self.config = config
+        self.lora_config = lora_config
+
+        self.quant_config = quant_config
+        self.model = Qwen2Model(config, cache_config, quant_config)
+
+        # Maps hidden_size features to num_labels class scores.
+        self.score = RowParallelLinear(config.hidden_size,
+                                       config.num_labels,
+                                       quant_config=quant_config)
+        self._pooler = Pooler(pooling_type=PoolingType.LAST,
+                              normalize=False,
+                              softmax=True)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[torch.Tensor],
+        attn_metadata: AttentionMetadata,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
+    ) -> torch.Tensor:
+        """Run the backbone and return the ``score`` head output."""
+        hidden_states = self.model(input_ids, positions, kv_caches,
+                                   attn_metadata, intermediate_tensors)
+        # The second element returned by the linear layer is not used here.
+        logits, _ = self.score(hidden_states)
+        return logits
+
+    def pooler(
+        self,
+        hidden_states: torch.Tensor,
+        pooling_metadata: PoolingMetadata,
+    ) -> Optional[PoolerOutput]:
+        """Pool ``hidden_states`` with the last-token softmax pooler.
+
+        NOTE(review): presumably ``hidden_states`` is what ``forward``
+        returned (the class scores) -- confirm against the caller.
+        """
+        return self._pooler(hidden_states, pooling_metadata)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        """Load checkpoint weights, ignoring unexpected ``lm_head.`` ones."""
+        loader = AutoWeightsLoader(self,
+                                   ignore_unexpected_prefixes=["lm_head."])
+        loader.load_weights(weights)
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index 717615988a907..f6713ab0898f0 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -96,6 +96,8 @@
"Gemma2Model": ("gemma2", "Gemma2EmbeddingModel"),
"MistralModel": ("llama", "LlamaEmbeddingModel"),
"Qwen2ForRewardModel": ("qwen2_rm", "Qwen2ForRewardModel"),
+ "Qwen2ForSequenceClassification": (
+ "qwen2_cls", "Qwen2ForSequenceClassification"),
# [Multimodal]
"LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"), # noqa: E501
"Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"),
From 67a6882da474a45dde0d35b3789e096e7bd0fd4e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=A7=91=E8=8B=B1?=
Date: Sun, 27 Oct 2024 12:18:03 +0800
Subject: [PATCH 009/113] [Misc] SpecDecodeWorker supports profiling (#9719)
Signed-off-by: Abatom
---
vllm/spec_decode/spec_decode_worker.py | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py
index 316db43502d3b..9f7ef2f8d851c 100644
--- a/vllm/spec_decode/spec_decode_worker.py
+++ b/vllm/spec_decode/spec_decode_worker.py
@@ -1038,6 +1038,14 @@ def get_cache_block_size_bytes(self):
"""
raise NotImplementedError
+    def start_profile(self):
+        """Start profiling on the scorer worker if it is a ``Worker``."""
+        if not isinstance(self.scorer_worker, Worker):
+            return
+        self.scorer_worker.start_profile()
+
+    def stop_profile(self):
+        """Stop profiling on the scorer worker if it is a ``Worker``."""
+        if not isinstance(self.scorer_worker, Worker):
+            return
+        self.scorer_worker.stop_profile()
+
def split_num_cache_blocks_evenly(scorer_cache_block_size_bytes: int,
proposer_cache_block_size_bytes: int,
From 8549c82660cfa59a13cccd622f8afcc29cbd4281 Mon Sep 17 00:00:00 2001
From: youkaichao
Date: Sun, 27 Oct 2024 00:19:28 -0700
Subject: [PATCH 010/113] [core] cudagraph output with tensor weak reference
(#9724)
Signed-off-by: youkaichao
---
csrc/ops.h | 24 +++++++++++++++++++++
csrc/torch_bindings.cpp | 3 +++
vllm/utils.py | 9 ++++++++
vllm/worker/model_runner.py | 42 +++++++++++++------------------------
4 files changed, 50 insertions(+), 28 deletions(-)
diff --git a/csrc/ops.h b/csrc/ops.h
index f737f50c2ec96..c50eb39a3dacc 100644
--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -5,6 +5,30 @@
#include "core/scalar_type.hpp"
+#include <vector>
+
+// Returns a tensor that aliases `tensor`'s memory (same data pointer, sizes,
+// strides and options) without owning it. Throws std::runtime_error when
+// `tensor` is not a CUDA tensor.
+//
+// Declared `inline` because the definition lives in a header; without it,
+// including the header from more than one translation unit would produce
+// multiple definitions at link time.
+inline torch::Tensor weak_ref_tensor(torch::Tensor& tensor) {
+  // Ensure tensor is on CUDA
+  if (!tensor.is_cuda()) {
+    throw std::runtime_error("Tensor must be on CUDA device");
+  }
+
+  // Get the raw data pointer
+  void* data_ptr = tensor.data_ptr();
+
+  // Get tensor sizes and strides
+  std::vector<int64_t> sizes = tensor.sizes().vec();
+  std::vector<int64_t> strides = tensor.strides().vec();
+
+  // Get tensor options (dtype, device)
+  auto options = tensor.options();
+
+  // Create a new tensor from the raw data pointer. No deleter is passed, so
+  // the returned tensor never frees the memory it points at.
+  auto new_tensor = torch::from_blob(data_ptr, sizes, strides, options);
+
+  return new_tensor;
+}
+
void paged_attention_v1(
torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp
index e704ff629fd6e..b8185c24d5628 100644
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@@ -18,6 +18,9 @@
TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
// vLLM custom ops
+ ops.def("weak_ref_tensor(Tensor input) -> Tensor");
+ ops.impl("weak_ref_tensor", torch::kCUDA, &weak_ref_tensor);
+
// Attention ops
// Compute the attention between an input query and the cached
// keys/values using PagedAttention.
diff --git a/vllm/utils.py b/vllm/utils.py
index fba9804289b94..1f75de89d0cc2 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -1479,3 +1479,12 @@ def __iter__(self):
def __len__(self):
return len(self._factory)
+
+
+def weak_ref_tensor(tensor: torch.Tensor) -> torch.Tensor:
+    """Return a tensor that aliases ``tensor`` without keeping it alive.
+
+    The result shares the same data as ``tensor``, but holding on to it
+    does not keep the original tensor alive. The work is done by the
+    ``torch.ops._C.weak_ref_tensor`` custom op.
+    """
+    alias = torch.ops._C.weak_ref_tensor(tensor)
+    return alias
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 8b74f06e77be0..4a287e3741d0f 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -50,7 +50,7 @@
from vllm.transformers_utils.config import uses_mrope
from vllm.utils import (DeviceMemoryProfiler, PyObjectCache, async_tensor_h2d,
flatten_2d_lists, is_hip, is_pin_memory_available,
- supports_dynamo)
+ supports_dynamo, weak_ref_tensor)
from vllm.worker.model_runner_base import (
ModelRunnerBase, ModelRunnerInputBase, ModelRunnerInputBuilderBase,
_add_attn_metadata_broadcastable_dict,
@@ -1426,12 +1426,6 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None:
dtype=self.model_config.dtype,
device=self.device)
- # Prepare buffer for outputs. These will be reused for all batch sizes.
- # It will be filled after the first graph capture.
- hidden_or_intermediate_states: List[Optional[torch.Tensor]] = [
- None
- ] * self.parallel_config.pipeline_parallel_size
-
graph_batch_size = self.max_batchsize_to_capture
batch_size_capture_list = [
bs for bs in _BATCH_SIZES_TO_CAPTURE if bs <= graph_batch_size
@@ -1474,12 +1468,6 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None:
input_tokens[:batch_size],
"positions":
input_positions[..., :batch_size],
- "hidden_or_intermediate_states":
- hidden_or_intermediate_states[
- virtual_engine] # type: ignore
- [:batch_size]
- if hidden_or_intermediate_states[virtual_engine]
- is not None else None,
"intermediate_inputs":
intermediate_inputs[:batch_size]
if intermediate_inputs is not None else None,
@@ -1762,15 +1750,13 @@ def capture(
self,
input_ids: torch.Tensor,
positions: torch.Tensor,
- hidden_or_intermediate_states: Optional[Union[IntermediateTensors,
- torch.Tensor]],
intermediate_inputs: Optional[IntermediateTensors],
kv_caches: List[torch.Tensor],
attn_metadata: AttentionMetadata,
memory_pool: Optional[Tuple[int, int]],
stream: torch.cuda.Stream,
**kwargs,
- ) -> Union[torch.Tensor, IntermediateTensors]:
+ ):
assert self._graph is None
# Run the model a few times without capturing the graph.
# This is to make sure that the captured graph does not include the
@@ -1799,20 +1785,21 @@ def capture(
intermediate_tensors=intermediate_inputs,
**kwargs,
)
- if hidden_or_intermediate_states is not None:
- if get_pp_group().is_last_rank:
- hidden_or_intermediate_states.copy_(
- output_hidden_or_intermediate_states)
- else:
- for key in hidden_or_intermediate_states.tensors:
- hidden_or_intermediate_states[key].copy_(
- output_hidden_or_intermediate_states[key])
- else:
- hidden_or_intermediate_states = (
+
+ if isinstance(output_hidden_or_intermediate_states, torch.Tensor):
+ hidden_or_intermediate_states = weak_ref_tensor(
output_hidden_or_intermediate_states)
+ elif isinstance(output_hidden_or_intermediate_states,
+ IntermediateTensors):
+ hidden_or_intermediate_states = IntermediateTensors(
+ tensors={
+ key: weak_ref_tensor(value)
+ for key, value in
+ output_hidden_or_intermediate_states.tensors.items()
+ })
del output_hidden_or_intermediate_states
- # make sure `output_hidden_states` is deleted
+ # make sure `output_hidden_or_intermediate_states` is deleted
# in the graph's memory pool
gc.collect()
torch.cuda.synchronize()
@@ -1837,7 +1824,6 @@ def capture(
}
else:
self.output_buffers = hidden_or_intermediate_states
- return hidden_or_intermediate_states
def forward(
self,
From 3cb07a36a20f9af11346650559470d685e9dc711 Mon Sep 17 00:00:00 2001
From: bnellnm <49004751+bnellnm@users.noreply.github.com>
Date: Sun, 27 Oct 2024 05:44:24 -0400
Subject: [PATCH 011/113] [Misc] Upgrade to pytorch 2.5 (#9588)
Signed-off-by: Bill Nell
Signed-off-by: youkaichao
Co-authored-by: youkaichao
---
CMakeLists.txt | 4 +-
cmake/utils.cmake | 6 +--
pyproject.toml | 2 +-
requirements-build.txt | 2 +-
requirements-cuda.txt | 6 +--
requirements-openvino.txt | 2 +-
.../decoder_only/language/test_big_models.py | 46 ++++++++++++++-----
vllm/platforms/cuda.py | 5 ++
8 files changed, 48 insertions(+), 25 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index fc4ac10b7669a..1a6a311e97633 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -49,7 +49,7 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx11
# requirements.txt files and should be kept consistent. The ROCm torch
# versions are derived from Dockerfile.rocm
#
-set(TORCH_SUPPORTED_VERSION_CUDA "2.4.0")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.5.0")
set(TORCH_SUPPORTED_VERSION_ROCM "2.5.0")
#
@@ -507,7 +507,7 @@ else()
FetchContent_Declare(
vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
- GIT_TAG 013f0c4fc47e6574060879d9734c1df8c5c273bd
+ GIT_TAG 5259c586c403a4e4d8bf69973c159b40cc346fb9
GIT_PROGRESS TRUE
# Don't share the vllm-flash-attn build between build types
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
diff --git a/cmake/utils.cmake b/cmake/utils.cmake
index 24bb7299338ac..40430dae10c5b 100644
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@@ -424,11 +424,7 @@ function (define_gpu_extension_target GPU_MOD_NAME)
# Don't use `TORCH_LIBRARIES` for CUDA since it pulls in a bunch of
# dependencies that are not necessary and may not be installed.
if (GPU_LANGUAGE STREQUAL "CUDA")
- if ("${CUDA_CUDA_LIB}" STREQUAL "")
- set(CUDA_CUDA_LIB "${CUDA_CUDA_LIBRARY}")
- endif()
- target_link_libraries(${GPU_MOD_NAME} PRIVATE ${CUDA_CUDA_LIB}
- ${CUDA_LIBRARIES})
+ target_link_libraries(${GPU_MOD_NAME} PRIVATE CUDA::cudart CUDA::cuda_driver)
else()
target_link_libraries(${GPU_MOD_NAME} PRIVATE ${TORCH_LIBRARIES})
endif()
diff --git a/pyproject.toml b/pyproject.toml
index e0c56ab79cad0..e78f5652f486b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,7 +6,7 @@ requires = [
"packaging",
"setuptools>=61",
"setuptools-scm>=8.0",
- "torch == 2.4.0",
+ "torch == 2.5.0",
"wheel",
"jinja2",
]
diff --git a/requirements-build.txt b/requirements-build.txt
index 6144a56da8c47..ea2b688bb3108 100644
--- a/requirements-build.txt
+++ b/requirements-build.txt
@@ -4,6 +4,6 @@ ninja
packaging
setuptools>=61
setuptools-scm>=8
-torch==2.4.0
+torch==2.5.0
wheel
jinja2
diff --git a/requirements-cuda.txt b/requirements-cuda.txt
index 3b3c2f876919e..92fa303d687a2 100644
--- a/requirements-cuda.txt
+++ b/requirements-cuda.txt
@@ -4,7 +4,7 @@
# Dependencies for NVIDIA GPUs
ray >= 2.9
nvidia-ml-py # for pynvml package
-torch == 2.4.0
+torch == 2.5.0
# These must be updated alongside torch
-torchvision == 0.19 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
-xformers == 0.0.27.post2; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch 2.4.0
+torchvision == 0.20 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
+xformers == 0.0.28.post2; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch 2.5.0
diff --git a/requirements-openvino.txt b/requirements-openvino.txt
index ac54cf0c3288f..7ad0d1e7f704b 100644
--- a/requirements-openvino.txt
+++ b/requirements-openvino.txt
@@ -1,7 +1,7 @@
# Common dependencies
-r requirements-common.txt
-torch == 2.4.0 # should be aligned with "common" vLLM torch version
+torch == 2.5.0 # should be aligned with "common" vLLM torch version
openvino >= 2024.4.0 # since 2024.4.0 both CPU and GPU support Paged Attention
optimum @ git+https://github.com/huggingface/optimum.git@main # latest optimum is used to support latest transformers version
diff --git a/tests/models/decoder_only/language/test_big_models.py b/tests/models/decoder_only/language/test_big_models.py
index 75625b35209ce..fcfc159e4f5a0 100644
--- a/tests/models/decoder_only/language/test_big_models.py
+++ b/tests/models/decoder_only/language/test_big_models.py
@@ -8,7 +8,7 @@
from vllm.platforms import current_platform
-from ...utils import check_outputs_equal
+from ...utils import check_logprobs_close, check_outputs_equal
MODELS = [
"meta-llama/Llama-2-7b-hf",
@@ -43,18 +43,40 @@ def test_models(
dtype: str,
max_tokens: int,
) -> None:
- with hf_runner(model, dtype=dtype) as hf_model:
- hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
- with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model:
- vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-
- check_outputs_equal(
- outputs_0_lst=hf_outputs,
- outputs_1_lst=vllm_outputs,
- name_0="hf",
- name_1="vllm",
- )
+ if model == "openbmb/MiniCPM3-4B":
+ # The output becomes slightly different after upgrading to
+ # PyTorch 2.5, so use logprobs checks instead of exact
+ # output checks.
+ NUM_LOG_PROBS = 8
+ with hf_runner(model, dtype=dtype) as hf_model:
+ hf_outputs = hf_model.generate_greedy_logprobs_limit(
+ example_prompts, max_tokens, NUM_LOG_PROBS)
+
+ with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model:
+ vllm_outputs = vllm_model.generate_greedy_logprobs(
+ example_prompts, max_tokens, NUM_LOG_PROBS)
+
+ check_logprobs_close(
+ outputs_0_lst=hf_outputs,
+ outputs_1_lst=vllm_outputs,
+ name_0="hf",
+ name_1="vllm",
+ )
+ else:
+ with hf_runner(model, dtype=dtype) as hf_model:
+ hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+
+ with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model:
+ vllm_outputs = vllm_model.generate_greedy(example_prompts,
+ max_tokens)
+
+ check_outputs_equal(
+ outputs_0_lst=hf_outputs,
+ outputs_1_lst=vllm_outputs,
+ name_0="hf",
+ name_1="vllm",
+ )
@pytest.mark.parametrize("model", MODELS)
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index 30bbf5107475d..9c5212ace1346 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -7,6 +7,7 @@
from typing import Callable, List, Tuple, TypeVar
import pynvml
+import torch
from typing_extensions import ParamSpec
from vllm.logger import init_logger
@@ -26,6 +27,10 @@
" and cause errors. See https://pypi.org/project/pynvml "
"for more information.")
+# PyTorch 2.5 uses cuDNN SDPA by default, which can crash on some models;
+# see https://github.com/huggingface/diffusers/issues/9704 for details.
+torch.backends.cuda.enable_cudnn_sdp(False)
+
# NVML utils
# Note that NVML is not affected by `CUDA_VISIBLE_DEVICES`,
# all the related functions work on real physical device ids.
From e130c40e4eba63ee8f04d493d83bca8c59b5ada5 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Sun, 27 Oct 2024 17:30:03 +0000
Subject: [PATCH 012/113] Fix cache management in "Close inactive issues and
PRs" actions workflow (#9734)
---
.github/workflows/stale.yml | 1 +
1 file changed, 1 insertion(+)
diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml
index 2418c61bdcf63..81e7c9b050760 100644
--- a/.github/workflows/stale.yml
+++ b/.github/workflows/stale.yml
@@ -10,6 +10,7 @@ jobs:
permissions:
issues: write
pull-requests: write
+ actions: write
runs-on: ubuntu-latest
steps:
- uses: actions/stale@28ca1036281a5e5922ead5184a1bbf96e5fc984e # v9.0.0
From 34a9941620d00879599a51609225452b705bae89 Mon Sep 17 00:00:00 2001
From: madt2709 <55849102+madt2709@users.noreply.github.com>
Date: Sun, 27 Oct 2024 10:46:41 -0700
Subject: [PATCH 013/113] [Bugfix] Fix load config when using bools (#9533)
---
tests/data/test_config.yaml | 2 ++
tests/test_utils.py | 6 +++++-
vllm/engine/arg_utils.py | 14 +-------------
vllm/utils.py | 35 +++++++++++++++++++++++++++--------
4 files changed, 35 insertions(+), 22 deletions(-)
diff --git a/tests/data/test_config.yaml b/tests/data/test_config.yaml
index 42f4f6f7bb992..5090e8f357bb8 100644
--- a/tests/data/test_config.yaml
+++ b/tests/data/test_config.yaml
@@ -1,3 +1,5 @@
port: 12312
served_model_name: mymodel
tensor_parallel_size: 2
+trust_remote_code: true
+multi_step_stream_outputs: false
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 0fed8e678fc76..a731b11eae81c 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -6,7 +6,7 @@
import pytest
-from vllm.utils import (FlexibleArgumentParser, deprecate_kwargs,
+from vllm.utils import (FlexibleArgumentParser, StoreBoolean, deprecate_kwargs,
get_open_port, merge_async_iterators, supports_kw)
from .utils import error_on_warning
@@ -141,6 +141,8 @@ def parser_with_config():
parser.add_argument('--config', type=str)
parser.add_argument('--port', type=int)
parser.add_argument('--tensor-parallel-size', type=int)
+ parser.add_argument('--trust-remote-code', action='store_true')
+ parser.add_argument('--multi-step-stream-outputs', action=StoreBoolean)
return parser
@@ -214,6 +216,8 @@ def test_config_args(parser_with_config):
args = parser_with_config.parse_args(
['serve', 'mymodel', '--config', './data/test_config.yaml'])
assert args.tensor_parallel_size == 2
+ assert args.trust_remote_code
+ assert not args.multi_step_stream_outputs
def test_config_file(parser_with_config):
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index c49f475b9ee61..38687809a31f6 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -19,7 +19,7 @@
from vllm.transformers_utils.config import (
maybe_register_config_serialize_by_value)
from vllm.transformers_utils.utils import check_gguf_file
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils import FlexibleArgumentParser, StoreBoolean
if TYPE_CHECKING:
from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup
@@ -1144,18 +1144,6 @@ def add_cli_args(parser: FlexibleArgumentParser,
return parser
-class StoreBoolean(argparse.Action):
-
- def __call__(self, parser, namespace, values, option_string=None):
- if values.lower() == "true":
- setattr(namespace, self.dest, True)
- elif values.lower() == "false":
- setattr(namespace, self.dest, False)
- else:
- raise ValueError(f"Invalid boolean value: {values}. "
- "Expected 'true' or 'false'.")
-
-
# These functions are used by sphinx to build the documentation
def _engine_args_parser():
return EngineArgs.add_cli_args(FlexibleArgumentParser())
diff --git a/vllm/utils.py b/vllm/utils.py
index 1f75de89d0cc2..d4f2c936ca9cc 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -1155,6 +1155,18 @@ def wrapper(*args: P.args, **kwargs: P.kwargs) -> None:
return wrapper
+class StoreBoolean(argparse.Action):
+
+ def __call__(self, parser, namespace, values, option_string=None):
+ if values.lower() == "true":
+ setattr(namespace, self.dest, True)
+ elif values.lower() == "false":
+ setattr(namespace, self.dest, False)
+ else:
+ raise ValueError(f"Invalid boolean value: {values}. "
+ "Expected 'true' or 'false'.")
+
+
class FlexibleArgumentParser(argparse.ArgumentParser):
"""ArgumentParser that allows both underscore and dash in names."""
@@ -1163,7 +1175,7 @@ def parse_args(self, args=None, namespace=None):
args = sys.argv[1:]
if '--config' in args:
- args = FlexibleArgumentParser._pull_args_from_config(args)
+ args = self._pull_args_from_config(args)
# Convert underscores to dashes and vice versa in argument names
processed_args = []
@@ -1181,8 +1193,7 @@ def parse_args(self, args=None, namespace=None):
return super().parse_args(processed_args, namespace)
- @staticmethod
- def _pull_args_from_config(args: List[str]) -> List[str]:
+ def _pull_args_from_config(self, args: List[str]) -> List[str]:
"""Method to pull arguments specified in the config file
into the command-line args variable.
@@ -1226,7 +1237,7 @@ def _pull_args_from_config(args: List[str]) -> List[str]:
file_path = args[index + 1]
- config_args = FlexibleArgumentParser._load_config_file(file_path)
+ config_args = self._load_config_file(file_path)
# 0th index is for {serve,chat,complete}
# followed by model_tag (only for serve)
@@ -1247,8 +1258,7 @@ def _pull_args_from_config(args: List[str]) -> List[str]:
return args
- @staticmethod
- def _load_config_file(file_path: str) -> List[str]:
+ def _load_config_file(self, file_path: str) -> List[str]:
"""Loads a yaml file and returns the key value pairs as a
flattened list with argparse like pattern
```yaml
@@ -1282,9 +1292,18 @@ def _load_config_file(file_path: str) -> List[str]:
Make sure path is correct", file_path)
raise ex
+ store_boolean_arguments = [
+ action.dest for action in self._actions
+ if isinstance(action, StoreBoolean)
+ ]
+
for key, value in config.items():
- processed_args.append('--' + key)
- processed_args.append(str(value))
+ if isinstance(value, bool) and key not in store_boolean_arguments:
+ if value:
+ processed_args.append('--' + key)
+ else:
+ processed_args.append('--' + key)
+ processed_args.append(str(value))
return processed_args
From 4e2d95e372ad5fbef7b27c66d527c37477c0c8bb Mon Sep 17 00:00:00 2001
From: wangshuai09 <391746016@qq.com>
Date: Mon, 28 Oct 2024 12:07:00 +0800
Subject: [PATCH 014/113] [Hardware][ROCM] using current_platform.is_rocm
(#9642)
Signed-off-by: wangshuai09 <391746016@qq.com>
---
.../test_basic_correctness.py | 4 +-
tests/compile/utils.py | 4 +-
tests/kernels/quant_utils.py | 17 +++--
tests/kernels/test_attention.py | 23 +++---
tests/kernels/test_attention_selector.py | 3 +-
tests/kernels/test_blocksparse_attention.py | 7 +-
tests/kernels/test_encoder_decoder_attn.py | 76 ++++++++++---------
tests/kernels/test_moe.py | 7 +-
tests/lora/test_gemma.py | 5 +-
tests/lora/test_quant_model.py | 4 +-
.../vision_language/test_paligemma.py | 9 ++-
.../vision_language/test_phi3v.py | 3 +-
.../e2e/test_integration_dist_tp2.py | 4 +-
tests/utils.py | 4 +-
vllm/_custom_ops.py | 8 +-
.../ops/blocksparse_attention/interface.py | 6 +-
vllm/attention/selector.py | 4 +-
vllm/config.py | 49 ++++++------
vllm/executor/ray_utils.py | 4 +-
vllm/model_executor/custom_op.py | 4 +-
.../compressed_tensors_moe.py | 5 +-
.../schemes/compressed_tensors_w8a8_fp8.py | 6 +-
.../layers/quantization/fbgemm_fp8.py | 3 +-
.../model_executor/layers/quantization/fp8.py | 10 +--
.../layers/quantization/utils/w8a8_utils.py | 6 +-
vllm/model_executor/models/exaone.py | 4 +-
vllm/model_executor/models/granite.py | 4 +-
vllm/model_executor/models/llama.py | 4 +-
vllm/model_executor/models/registry.py | 4 +-
vllm/model_executor/models/solar.py | 4 +-
vllm/utils.py | 6 +-
vllm/worker/model_runner.py | 9 ++-
32 files changed, 162 insertions(+), 148 deletions(-)
diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py
index 3c2ca1bddd906..79647589d5204 100644
--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@@ -11,7 +11,7 @@
import pytest
from vllm import LLM
-from vllm.utils import is_hip
+from vllm.platforms import current_platform
from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata
from ..models.utils import check_outputs_equal
@@ -51,7 +51,7 @@ def test_models(
enforce_eager: bool,
) -> None:
- if backend == "FLASHINFER" and is_hip():
+ if backend == "FLASHINFER" and current_platform.is_rocm():
pytest.skip("Flashinfer does not support ROCm/HIP.")
os.environ["VLLM_ATTENTION_BACKEND"] = backend
diff --git a/tests/compile/utils.py b/tests/compile/utils.py
index c69343b51ae02..64fc08e80de3b 100644
--- a/tests/compile/utils.py
+++ b/tests/compile/utils.py
@@ -5,7 +5,7 @@
from tests.quantization.utils import is_quant_method_supported
from vllm import LLM, SamplingParams
from vllm.compilation.levels import CompilationLevel
-from vllm.utils import is_hip
+from vllm.platforms import current_platform
TEST_MODELS = [
("facebook/opt-125m", {}),
@@ -55,7 +55,7 @@
"quantization": "marlin"
}))
-if not is_hip() and is_quant_method_supported("awq"):
+if not current_platform.is_rocm() and is_quant_method_supported("awq"):
TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", {
"quantization": "AWQ"
}))
diff --git a/tests/kernels/quant_utils.py b/tests/kernels/quant_utils.py
index 8f6a54ff5979c..f2358940fc7b8 100644
--- a/tests/kernels/quant_utils.py
+++ b/tests/kernels/quant_utils.py
@@ -2,12 +2,13 @@
import torch
-from vllm.utils import is_hip
+from vllm.platforms import current_platform
# Using the default value (240.0) from pytorch will cause accuracy
# issue on dynamic quantization models. Here use 224.0 for rocm.
ROCM_FP8_MAX = 224.0
-FP8_DTYPE = torch.float8_e4m3fnuz if is_hip() else torch.float8_e4m3fn
+FP8_DTYPE = torch.float8_e4m3fnuz if current_platform.is_rocm() \
+ else torch.float8_e4m3fn
def as_float32_tensor(x: Union[float, torch.tensor]) -> torch.tensor:
@@ -24,8 +25,10 @@ def ref_dynamic_per_token_quant(x: torch.tensor,
qtype_traits = torch.iinfo(quant_dtype) if quant_dtype == torch.int8 \
else torch.finfo(quant_dtype)
- qtype_traits_max = ROCM_FP8_MAX if is_hip() else qtype_traits.max
- qtype_traits_min = -ROCM_FP8_MAX if is_hip() else qtype_traits.min
+ qtype_traits_max = ROCM_FP8_MAX if current_platform.is_rocm() \
+ else qtype_traits.max
+ qtype_traits_min = -ROCM_FP8_MAX if current_platform.is_rocm() \
+ else qtype_traits.min
qtype_max = as_float32_tensor(qtype_traits_max)
s_1 = as_float32_tensor(1.0)
s_512 = as_float32_tensor(512.0)
@@ -66,8 +69,10 @@ def ref_dynamic_per_tensor_fp8_quant(x: torch.tensor) \
-> Tuple[torch.tensor, torch.tensor]:
fp8_traits = torch.finfo(FP8_DTYPE)
- fp8_traits_max = ROCM_FP8_MAX if is_hip() else fp8_traits.max
- fp8_traits_min = -ROCM_FP8_MAX if is_hip() else fp8_traits.min
+ fp8_traits_max = ROCM_FP8_MAX if current_platform.is_rocm() \
+ else fp8_traits.max
+ fp8_traits_min = -ROCM_FP8_MAX if current_platform.is_rocm() \
+ else fp8_traits.min
fp8_max = as_float32_tensor(fp8_traits_max)
one = as_float32_tensor(1.0)
diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py
index 52f1ecd176963..1604aa4d2d6e5 100644
--- a/tests/kernels/test_attention.py
+++ b/tests/kernels/test_attention.py
@@ -6,11 +6,12 @@
from tests.kernels.utils import opcheck
from vllm import _custom_ops as ops
-from vllm.utils import get_max_shared_memory_bytes, is_hip, seed_everything
+from vllm.platforms import current_platform
+from vllm.utils import get_max_shared_memory_bytes, seed_everything
from .allclose_default import get_default_atol, get_default_rtol
-if not is_hip():
+if not current_platform.is_rocm():
from xformers import ops as xops
from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask
@@ -23,8 +24,9 @@
NUM_BLOCKS = 4321 # Arbitrary values for testing
PARTITION_SIZE = 512
# flshattF and tritonflashattF supported: {torch.float16, torch.bfloat16}
-DTYPES = [torch.half, torch.bfloat16, torch.float
- ] if not is_hip() else [torch.half, torch.bfloat16]
+DTYPES = [
+ torch.half, torch.bfloat16, torch.float
+] if not current_platform.is_rocm() else [torch.half, torch.bfloat16]
NUM_GEN_SEQS = [7] # Arbitrary values for testing
NUM_PREFILL_SEQS = [3] # Arbitrary values for testing
NUM_HEADS = [(40, 40), (64, 8)] # Arbitrary values for testing
@@ -114,7 +116,8 @@ def ref_single_query_cached_kv_attention(
@pytest.mark.parametrize(
- "version", ["v1", "v2"] if not is_hip() else ["v1", "v2", "rocm"])
+ "version",
+ ["v1", "v2"] if not current_platform.is_rocm() else ["v1", "v2", "rocm"])
@pytest.mark.parametrize("num_seqs", NUM_GEN_SEQS)
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@@ -317,8 +320,8 @@ def test_paged_attention(
# NOTE(woosuk): Due to the kernel-level differences in the two
# implementations, there is a small numerical difference in the two
# outputs. Thus, we use a relaxed tolerance for the test.
- atol = get_default_atol(output) if is_hip() else 1e-3
- rtol = get_default_rtol(output) if is_hip() else 1e-5
+ atol = get_default_atol(output) if current_platform.is_rocm() else 1e-3
+ rtol = get_default_rtol(output) if current_platform.is_rocm() else 1e-5
# NOTE(zhaoyang): FP8 KV Cache will introduce quantization error,
# so we use a relaxed tolerance for the test.
@@ -368,7 +371,7 @@ def ref_multi_query_kv_attention(
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES)
-@pytest.mark.skipif(is_hip(),
+@pytest.mark.skipif(current_platform.is_rocm(),
reason="Xformers backend is not supported on ROCm.")
@torch.inference_mode()
def test_multi_query_kv_attention(
@@ -425,6 +428,6 @@ def test_multi_query_kv_attention(
scale,
dtype,
)
- atol = get_default_atol(output) if is_hip() else 1e-3
- rtol = get_default_rtol(output) if is_hip() else 1e-5
+ atol = get_default_atol(output) if current_platform.is_rocm() else 1e-3
+ rtol = get_default_rtol(output) if current_platform.is_rocm() else 1e-5
torch.testing.assert_close(output, ref_output, atol=atol, rtol=rtol)
diff --git a/tests/kernels/test_attention_selector.py b/tests/kernels/test_attention_selector.py
index df3e770e260e0..3fe9ca0b0450f 100644
--- a/tests/kernels/test_attention_selector.py
+++ b/tests/kernels/test_attention_selector.py
@@ -25,7 +25,8 @@ def test_env(name: str, device: str, monkeypatch):
False)
assert backend.name == "TORCH_SDPA"
elif device == "hip":
- with patch("vllm.attention.selector.is_hip", return_value=True):
+ with patch("vllm.attention.selector.current_platform.is_rocm",
+ return_value=True):
backend = which_attn_to_use(16, torch.float16, torch.float16, 16,
False)
assert backend.name == "ROCM_FLASH"
diff --git a/tests/kernels/test_blocksparse_attention.py b/tests/kernels/test_blocksparse_attention.py
index f3bd8f0524264..b65efb3abc230 100644
--- a/tests/kernels/test_blocksparse_attention.py
+++ b/tests/kernels/test_blocksparse_attention.py
@@ -7,7 +7,8 @@
from vllm import _custom_ops as ops
from vllm.attention.ops.blocksparse_attention.interface import (
LocalStridedBlockSparseAttn)
-from vllm.utils import get_max_shared_memory_bytes, is_hip, seed_everything
+from vllm.platforms import current_platform
+from vllm.utils import get_max_shared_memory_bytes, seed_everything
from .allclose_default import get_default_atol, get_default_rtol
@@ -316,8 +317,8 @@ def test_paged_attention(
# NOTE(woosuk): Due to the kernel-level differences in the two
# implementations, there is a small numerical difference in the two
# outputs. Thus, we use a relaxed tolerance for the test.
- atol = get_default_atol(output) if is_hip() else 1e-3
- rtol = get_default_rtol(output) if is_hip() else 1e-5
+ atol = get_default_atol(output) if current_platform.is_rocm() else 1e-3
+ rtol = get_default_rtol(output) if current_platform.is_rocm() else 1e-5
# NOTE(zhaoyang): FP8 KV Cache will introduce quantization error,
# so we use a relaxed tolerance for the test.
diff --git a/tests/kernels/test_encoder_decoder_attn.py b/tests/kernels/test_encoder_decoder_attn.py
index 6b979d0558c46..bc99c5559d388 100644
--- a/tests/kernels/test_encoder_decoder_attn.py
+++ b/tests/kernels/test_encoder_decoder_attn.py
@@ -18,7 +18,7 @@
from vllm.attention.backends.utils import STR_NOT_IMPL_ENC_DEC_ROCM_HIP
from vllm.attention.selector import (_Backend,
global_force_attn_backend_context_manager)
-from vllm.utils import is_hip
+from vllm.platforms import current_platform
# List of support backends for encoder/decoder models
LIST_ENC_DEC_SUPPORTED_BACKENDS = [_Backend.XFORMERS]
@@ -82,7 +82,7 @@ class TestResources(NamedTuple):
will leverage attn_backend for the purpose of
constructing backend-compatible attention
metadata instances
-
+
Attributes:
* scale: 1/sqrt(d) scale factor for attn
@@ -105,10 +105,10 @@ def _make_test_resources(test_pt: TestPoint, ) -> TestResources:
Build key components for performing encoder/decoder attention test.
Note that
- (1) The Attention instance constructed here, automatically selects
+ (1) The Attention instance constructed here, automatically selects
an attention backend class based on platform info & a set of canned
heuristics, so
- (2) The attention backend instance constructed here is thus *not
+ (2) The attention backend instance constructed here is thus *not
the same backend instance* used by attn, but rather it is
intended to be a *different instance* of the *same backend class*;
therefore,
@@ -156,7 +156,7 @@ def _encoder_attn_setup(
'''
Set up test vectors & data structures for encoder attention test.
- A triplet of synthetic query/key/value tensors are constructed.
+ A triplet of synthetic query/key/value tensors are constructed.
Given this is an encoder attention test, the key & value
sequences will have the same length as the corresponding queries.
@@ -169,14 +169,14 @@ def _encoder_attn_setup(
Arguments:
* test_pt: TestPoint data structure; this function relies on the
- following fields: batch_size, num_heads, head_size,
+ following fields: batch_size, num_heads, head_size,
block_size, max_q_seq_len
* test_rsrcs: TestResources data structure; this function relies on the
scale field
-
+
Returns:
-
+
* PhaseTestParameters data structure comprising (1) packed query/key/value
tensors, (2) the ideal output of attention computed using a naive
implementation, and (3) KVCache field set to None
@@ -265,7 +265,7 @@ def _decoder_attn_setup(
Arguments:
* test_pt: TestPoint data structure; this function relies on the
- following fields: batch_size, num_heads, head_size,
+ following fields: batch_size, num_heads, head_size,
block_size, max_q_seq_len
* test_rsrcs: TestResources data structure; this function relies on the
scale field
@@ -275,14 +275,14 @@ def _decoder_attn_setup(
* qkv: Unpacked (batch_size x padded_seq_len x num_heads x
head_size) query/key/value tensors
* Prefill-phase decoder self-attention PhaseTestParameters data structure,
- including (1) packed (number_of_tokens x num_heads x head_size)
+ including (1) packed (number_of_tokens x num_heads x head_size)
query/key/value tensors along with (2) ideal attention output
- computed using a naive implementation, and (3) memory-mapping data
+ computed using a naive implementation, and (3) memory-mapping data
structures appropriate for prefill phase.
- * Decode-phase decoder self-attention PhaseTestParameters data structure,
- including (1) packed (number_of_tokens x num_heads x head_size)
- query/key/value tensors along with (2) ideal attention output
- computed using a naive implementation, and (3) memory-mapping data
+ * Decode-phase decoder self-attention PhaseTestParameters data structure,
+ including (1) packed (number_of_tokens x num_heads x head_size)
+ query/key/value tensors along with (2) ideal attention output
+ computed using a naive implementation, and (3) memory-mapping data
structures appropriate for decode phase.
* max_block_idx: max physical address in decoder self-attention block-table
(intended to be used as the base address for the encoder/
@@ -436,12 +436,12 @@ def _enc_dec_cross_attn_setup_reuses_query(
This function also constructs the cross-attention KV cache memory mapping
(slot mapping and block table), ensuring that the block table starts at
- block_base_addr.
+ block_base_addr.
Arguments:
* decoder_qkv: pre-existing unpacked (batch_size x padded_seq_len x
- num_heads x head_size) decoder self-attention inputs;
+ num_heads x head_size) decoder self-attention inputs;
this function relies on the query and q_seq_lens
fields
* encoder_test_params: PhaseTestParameters data structure which was
@@ -452,7 +452,7 @@ def _enc_dec_cross_attn_setup_reuses_query(
self-attention; all fields
including KV cache required
* test_pt: TestPoint data structure; this function relies on the
- following fields: batch_size, num_heads, head_size,
+ following fields: batch_size, num_heads, head_size,
block_size, max_q_seq_len
* test_rsrcs: TestResources data structure; this function relies on the
scale field
@@ -460,16 +460,16 @@ def _enc_dec_cross_attn_setup_reuses_query(
Returns:
- * Prefill-phase encoder/decoder cross-attention PhaseTestParameters data
- structure, including (1) packed
+ * Prefill-phase encoder/decoder cross-attention PhaseTestParameters data
+ structure, including (1) packed
(number_of_tokens x num_heads x head_size) query/key/value tensors
- along with (2) ideal attention output computed using a
+ along with (2) ideal attention output computed using a
naive implementation, and (3) memory-mapping data structures appropriate
for prefill phase.
- * Decode-phase encoder/decoder cross-attention PhaseTestParameters data
+ * Decode-phase encoder/decoder cross-attention PhaseTestParameters data
structure, including (1) packed
(number_of_tokens x num_heads x head_size) query/key/value tensors
- along with (2) ideal attention output computed using a
+ along with (2) ideal attention output computed using a
naive implementation, and (3) memory-mapping data structures appropriate
for decode phase.
'''
@@ -596,7 +596,7 @@ def _run_encoder_attention_test(
'''
Run encoder attention.
- attn.forward() is passed attn_type=AttentionType.ENCODER in order
+ attn.forward() is passed attn_type=AttentionType.ENCODER in order
to configure the kernel invocation for encoder attention
Requires attn_metadata.num_decode_tokens == 0
@@ -607,7 +607,7 @@ def _run_encoder_attention_test(
* attn: Attention wrapper instance
* encoder_test_params: encoder PhaseTestParameters data structure;
this function relies on the packed
- (number_of_tokens x num_heads x head_size)
+ (number_of_tokens x num_heads x head_size)
query/key/value fields
* attn_metadata: attention metadata for encoder/decoder-self attention
@@ -646,7 +646,7 @@ def _run_decoder_self_attention_test(
and attn (Attention wrapper instance) fields
* decoder_test_params: decoder PhaseTestParameters data structure;
this function relies on the packed
- (number_of_tokens x num_heads x head_size)
+ (number_of_tokens x num_heads x head_size)
query/key/value fields
* attn_metadata: attention metadata for decoder-self attention
(contains KV cache memory-mapping)
@@ -694,11 +694,11 @@ def _run_encoder_decoder_cross_attention_test(
and attn (Attention wrapper instance) fields
* decoder_test_params: decoder PhaseTestParameters data structure;
this function relies on the packed
- (number_of_tokens x num_heads x head_size)
+ (number_of_tokens x num_heads x head_size)
query field
* cross_test_params: encoder/decoder PhaseTestParameters data structure;
this function relies on the packed
- (number_of_tokens x num_heads x head_size)
+ (number_of_tokens x num_heads x head_size)
key/value fields
* attn_metadata: attention metadata for encoder/decoder-self attention
@@ -726,7 +726,8 @@ def _run_encoder_decoder_cross_attention_test(
attn_type=attn_type)
-@pytest.mark.skipif(is_hip(), reason=STR_NOT_IMPL_ENC_DEC_ROCM_HIP)
+@pytest.mark.skipif(current_platform.is_rocm(),
+ reason=STR_NOT_IMPL_ENC_DEC_ROCM_HIP)
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS)
@@ -755,7 +756,8 @@ def test_encoder_only(
No KV cache is required for encoder-only attention.
Note on ROCm/HIP: currently encoder/decoder models are not supported on
- AMD GPUs, therefore this test simply is skipped if is_hip().
+ AMD GPUs, therefore this test is simply skipped if
+ current_platform.is_rocm().
This test globally forces an override of the usual backend
auto-selection process, forcing the specific backend-under-test
@@ -811,7 +813,8 @@ def test_encoder_only(
assert_actual_matches_ideal(enc_test_params, enc_pckd_act_out)
-@pytest.mark.skipif(is_hip(), reason=STR_NOT_IMPL_ENC_DEC_ROCM_HIP)
+@pytest.mark.skipif(current_platform.is_rocm(),
+ reason=STR_NOT_IMPL_ENC_DEC_ROCM_HIP)
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS)
@@ -837,14 +840,14 @@ def test_e2e_enc_dec_attn(
attributes for prefill-phase, and (2) an analogous attention metadata
structure but for decode-phase
* Test attention steps in the following order
-
+
* Encoder attention
* Prefill self-attention
* Prefill cross-attention
* Decode self-attention
* Decode cross-attention
- * Besides being reflective of realistic use-cases, this order would
- exacerbate any accidental overlap in the self-/cross-attention
+ * Besides being reflective of realistic use-cases, this order would
+ exacerbate any accidental overlap in the self-/cross-attention
block tables, which one hopes to avoid
@@ -864,10 +867,11 @@ def test_e2e_enc_dec_attn(
to be utilized.
Note on ROCm/HIP: currently encoder/decoder models are not supported on
- AMD GPUs, therefore this test simply is skipped if is_hip().
+ AMD GPUs, therefore this test is simply skipped if
+ current_platform.is_rocm().
Note on metadata: there is a single attention metadata structure shared by
- all prefill-phase attention operations (encoder, decoder, enc/dec cross),
+ all prefill-phase attention operations (encoder, decoder, enc/dec cross),
and a single one shared by all decode-phase attention operations
(decoder & enc/dec cross.) This is intended to reflect the behavior
of EncoderDecoderModelRunner, which constructs a single attention metadata
diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py
index c0053071258ea..4bfc089c82179 100644
--- a/tests/kernels/test_moe.py
+++ b/tests/kernels/test_moe.py
@@ -18,8 +18,9 @@
from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
marlin_quantize)
from vllm.model_executor.models.mixtral import MixtralMoE
+from vllm.platforms import current_platform
from vllm.scalar_type import scalar_types
-from vllm.utils import is_hip, seed_everything
+from vllm.utils import seed_everything
@pytest.mark.parametrize("m", [1024 * 128, 512, 222, 33, 1])
@@ -103,7 +104,7 @@ def test_mixtral_moe(dtype: torch.dtype):
@pytest.mark.parametrize("act_order", [True, False])
@pytest.mark.parametrize("num_bits", [4, 8])
@pytest.mark.parametrize("is_k_full", [True, False])
-@pytest.mark.skipif(is_hip(), reason="Skip for rocm")
+@pytest.mark.skipif(current_platform.is_rocm(), reason="Skip for rocm")
def test_fused_marlin_moe(
m: int,
n: int,
@@ -256,7 +257,7 @@ def test_fused_marlin_moe(
@pytest.mark.parametrize("act_order", [True, False])
@pytest.mark.parametrize("num_bits", [4, 8])
@pytest.mark.parametrize("is_k_full", [True, False])
-@pytest.mark.skipif(is_hip(), reason="Skip for rocm")
+@pytest.mark.skipif(current_platform.is_rocm(), reason="Skip for rocm")
def test_single_marlin_moe_multiply(
m: int,
n: int,
diff --git a/tests/lora/test_gemma.py b/tests/lora/test_gemma.py
index f7c1d4f041c12..15ec66b0f5502 100644
--- a/tests/lora/test_gemma.py
+++ b/tests/lora/test_gemma.py
@@ -4,7 +4,7 @@
import vllm
from vllm.lora.request import LoRARequest
-from vllm.utils import is_hip
+from vllm.platforms import current_platform
MODEL_PATH = "google/gemma-7b"
@@ -31,7 +31,8 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
return generated_texts
-@pytest.mark.xfail(is_hip(), reason="There can be output mismatch on ROCm")
+@pytest.mark.xfail(current_platform.is_rocm(),
+ reason="There can be output mismatch on ROCm")
def test_gemma_lora(gemma_lora_files):
llm = vllm.LLM(MODEL_PATH,
max_model_len=1024,
diff --git a/tests/lora/test_quant_model.py b/tests/lora/test_quant_model.py
index d004c65929418..5432fa4ad0d3a 100644
--- a/tests/lora/test_quant_model.py
+++ b/tests/lora/test_quant_model.py
@@ -8,7 +8,7 @@
import vllm
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.lora.request import LoRARequest
-from vllm.utils import is_hip
+from vllm.platforms import current_platform
@dataclass
@@ -19,7 +19,7 @@ class ModelWithQuantization:
MODELS: List[ModelWithQuantization]
#AWQ quantization is currently not supported in ROCm.
-if is_hip():
+if current_platform.is_rocm():
MODELS = [
ModelWithQuantization(
model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
diff --git a/tests/models/decoder_only/vision_language/test_paligemma.py b/tests/models/decoder_only/vision_language/test_paligemma.py
index a3ca0845e5ff8..69189ba2f25cb 100644
--- a/tests/models/decoder_only/vision_language/test_paligemma.py
+++ b/tests/models/decoder_only/vision_language/test_paligemma.py
@@ -6,8 +6,9 @@
BatchEncoding)
from vllm.multimodal.utils import rescale_image_size
+from vllm.platforms import current_platform
from vllm.sequence import SampleLogprobs
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, is_hip
+from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
from ...utils import check_logprobs_close
@@ -24,7 +25,7 @@
# ROCm Triton FA can run into compilation issues with these models due to,
# excessive use of shared memory. Use other backends in the meantime.
# FIXME (mattwong, gshtrasb, hongxiayan)
-if is_hip():
+if current_platform.is_rocm():
os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"
@@ -70,7 +71,7 @@ def run_test(
All the image fixtures for the test are from IMAGE_ASSETS.
For huggingface runner, we provide the PIL images as input.
- For vllm runner, we provide MultiModalDataDict objects
+ For vllm runner, we provide MultiModalDataDict objects
and corresponding MultiModalConfig as input.
Note, the text input is also adjusted to abide by vllm contract.
The text output is sanitized to be able to compare with hf.
@@ -151,7 +152,7 @@ def process(hf_inputs: BatchEncoding):
pytest.param(
"float",
marks=pytest.mark.skipif(
- is_hip(),
+ current_platform.is_rocm(),
reason=
"ROCm FA does not yet fully support 32-bit precision on PaliGemma")
), "half"
diff --git a/tests/models/decoder_only/vision_language/test_phi3v.py b/tests/models/decoder_only/vision_language/test_phi3v.py
index dfe10629f1c66..1840b4bb8574c 100644
--- a/tests/models/decoder_only/vision_language/test_phi3v.py
+++ b/tests/models/decoder_only/vision_language/test_phi3v.py
@@ -12,7 +12,6 @@
from vllm.multimodal.utils import rescale_image_size
from vllm.platforms import current_platform
from vllm.sequence import SampleLogprobs
-from vllm.utils import is_hip
from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
_ImageAssets)
@@ -56,7 +55,7 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
# ROCm Triton FA can run into shared memory issues with these models,
# use other backends in the meantime
# FIXME (mattwong, gshtrasb, hongxiayan)
-if is_hip():
+if current_platform.is_rocm():
os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"
diff --git a/tests/spec_decode/e2e/test_integration_dist_tp2.py b/tests/spec_decode/e2e/test_integration_dist_tp2.py
index b829d1a5be784..25562ca85adf4 100644
--- a/tests/spec_decode/e2e/test_integration_dist_tp2.py
+++ b/tests/spec_decode/e2e/test_integration_dist_tp2.py
@@ -5,7 +5,7 @@
import pytest
import torch
-from vllm.utils import is_hip
+from vllm.platforms import current_platform
from .conftest import run_equality_correctness_test_tp
@@ -51,7 +51,7 @@ def test_target_model_tp_gt_1(common_llm_kwargs, per_test_common_llm_kwargs,
batch_size: int, output_len: int, seed: int):
"""Verify greedy equality when tensor parallelism is used.
"""
- if is_hip():
+ if current_platform.is_rocm():
pytest.skip("hip is not well-supported yet")
run_equality_correctness_test_tp("JackFram/llama-68m",
common_llm_kwargs,
diff --git a/tests/utils.py b/tests/utils.py
index e983104e3cb0c..0c61891cfefec 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -26,7 +26,7 @@
from vllm.platforms import current_platform
from vllm.transformers_utils.tokenizer import get_tokenizer
from vllm.utils import (FlexibleArgumentParser, GB_bytes,
- cuda_device_count_stateless, get_open_port, is_hip)
+ cuda_device_count_stateless, get_open_port)
if current_platform.is_rocm():
from amdsmi import (amdsmi_get_gpu_vram_usage,
@@ -487,7 +487,7 @@ def wait_for_gpu_memory_to_clear(devices: List[int],
output: Dict[int, str] = {}
output_raw: Dict[int, float] = {}
for device in devices:
- if is_hip():
+ if current_platform.is_rocm():
dev_handle = amdsmi_get_processor_handles()[device]
mem_info = amdsmi_get_gpu_vram_usage(dev_handle)
gb_used = mem_info["vram_used"] / 2**10
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index f57414bd5197e..46a2fb8bc80a2 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -659,11 +659,11 @@ def scaled_fp8_quant(
Args:
input: The input tensor to be quantized to FP8
scale: Optional scaling factor for the FP8 quantization
- scale_ub: Optional upper bound for scaling factor in dynamic
+ scale_ub: Optional upper bound for scaling factor in dynamic
per token case
num_token_padding: If specified, pad the first dimension
of the output to at least this value.
- use_per_token_if_dynamic: Whether to do per_tensor or per_token
+ use_per_token_if_dynamic: Whether to do per_tensor or per_token
in the dynamic quantization case.
Returns:
@@ -674,8 +674,8 @@ def scaled_fp8_quant(
assert (input.ndim == 2)
shape: Union[Tuple[int, int], torch.Size] = input.shape
# For rocm, the output fp8 dtype is torch.float8_e4m3fnuz
- out_dtype: torch.dtype = torch.float8_e4m3fnuz if vllm.utils.is_hip() \
- else torch.float8_e4m3fn
+ out_dtype: torch.dtype = torch.float8_e4m3fnuz \
+ if current_platform.is_rocm() else torch.float8_e4m3fn
if num_token_padding:
shape = (max(num_token_padding, input.shape[0]), shape[1])
output = torch.empty(shape, device=input.device, dtype=out_dtype)
diff --git a/vllm/attention/ops/blocksparse_attention/interface.py b/vllm/attention/ops/blocksparse_attention/interface.py
index e4dc576d27932..a98eb431ac7fc 100644
--- a/vllm/attention/ops/blocksparse_attention/interface.py
+++ b/vllm/attention/ops/blocksparse_attention/interface.py
@@ -3,7 +3,6 @@
import torch
from vllm.platforms import current_platform
-from vllm.utils import is_hip
from .utils import (dense_to_crow_col, get_head_sliding_step,
get_sparse_attn_mask)
@@ -32,8 +31,9 @@ def __init__(
):
super().__init__()
if use_spda is None:
- use_spda = is_hip() or current_platform.is_cpu() or not \
- IS_COMPUTE_8_OR_ABOVE
+ use_spda = current_platform.is_rocm() or \
+ current_platform.is_cpu() or not \
+ IS_COMPUTE_8_OR_ABOVE
device = device or (torch.cuda.current_device()
if current_platform.is_cuda_alike() else "cpu")
device = torch.device(device)
diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py
index 10d4509b38279..376b3136f0fb8 100644
--- a/vllm/attention/selector.py
+++ b/vllm/attention/selector.py
@@ -10,7 +10,7 @@
from vllm.attention.backends.abstract import AttentionBackend
from vllm.logger import init_logger
from vllm.platforms import current_platform
-from vllm.utils import STR_BACKEND_ENV_VAR, is_hip
+from vllm.utils import STR_BACKEND_ENV_VAR
logger = init_logger(__name__)
@@ -208,7 +208,7 @@ def which_attn_to_use(
logger.info("Cannot use %s backend on TPU.", selected_backend)
return _Backend.PALLAS
- if is_hip():
+ if current_platform.is_rocm():
# AMD GPUs.
selected_backend = (_Backend.ROCM_FLASH if selected_backend
== _Backend.FLASH_ATTN else selected_backend)
diff --git a/vllm/config.py b/vllm/config.py
index a1fba98233b80..99a82c8f1b40b 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -17,7 +17,7 @@
get_hf_image_processor_config,
get_hf_text_config)
from vllm.utils import (GiB_bytes, cuda_device_count_stateless, get_cpu_memory,
- is_hip, print_warning_once)
+ print_warning_once)
if TYPE_CHECKING:
from ray.util.placement_group import PlacementGroup
@@ -43,7 +43,7 @@ class ModelConfig:
Args:
model: Name or path of the huggingface model to use.
- It is also used as the content for `model_name` tag in metrics
+ It is also used as the content for `model_name` tag in metrics
output when `served_model_name` is not specified.
task: The task to use the model for. Each vLLM instance only supports
one task, even if the same model can be used for multiple tasks.
@@ -99,15 +99,15 @@ class ModelConfig:
skip_tokenizer_init: If true, skip initialization of tokenizer and
detokenizer.
served_model_name: The model name used in metrics tag `model_name`,
- matches the model name exposed via the APIs. If multiple model
- names provided, the first name will be used. If not specified,
+ matches the model name exposed via the APIs. If multiple model
+ names provided, the first name will be used. If not specified,
the model name will be the same as `model`.
- limit_mm_per_prompt: Maximum number of data instances per modality
+ limit_mm_per_prompt: Maximum number of data instances per modality
per prompt. Only applicable for multimodal models.
- override_neuron_config: Initialize non default neuron config or
- override default neuron config that are specific to Neuron devices,
- this argument will be used to configure the neuron config that
- can not be gathered from the vllm arguments.
+ override_neuron_config: Initialize non default neuron config or
+ override default neuron config that are specific to Neuron devices,
+ this argument will be used to configure the neuron config that
+ can not be gathered from the vllm arguments.
config_format: The config format which shall be loaded.
Defaults to 'auto' which defaults to 'hf'.
mm_processor_kwargs: Arguments to be forwarded to the model's processor
@@ -350,7 +350,7 @@ def _verify_quantization(self) -> None:
raise ValueError(
f"Unknown quantization method: {self.quantization}. Must "
f"be one of {supported_quantization}.")
- if is_hip(
+ if current_platform.is_rocm(
) and self.quantization not in rocm_supported_quantization:
raise ValueError(
f"{self.quantization} quantization is currently not "
@@ -365,7 +365,7 @@ def _verify_quantization(self) -> None:
"%s quantization is not fully "
"optimized yet. The speed can be slower than "
"non-quantized models.", self.quantization)
- if (self.quantization == "awq" and is_hip()
+ if (self.quantization == "awq" and current_platform.is_rocm()
and not envs.VLLM_USE_TRITON_AWQ):
logger.warning(
"Using AWQ quantization with ROCm, but VLLM_USE_TRITON_AWQ"
@@ -385,7 +385,7 @@ def _verify_cuda_graph(self) -> None:
def _verify_bnb_config(self) -> None:
"""
- The current version of bitsandbytes (0.44.0) with 8-bit models does not
+ The current version of bitsandbytes (0.44.0) with 8-bit models does not
yet support CUDA graph.
"""
is_bitsandbytes = self.quantization == "bitsandbytes"
@@ -810,7 +810,7 @@ class LoadConfig:
fast weight loading.
"bitsandbytes" will load nf4 type weights.
ignore_patterns: The list of patterns to ignore when loading the model.
- Default to "original/**/*" to avoid repeated loading of llama's
+ Default to "original/**/*" to avoid repeated loading of llama's
checkpoints.
"""
@@ -843,7 +843,8 @@ def _verify_load_format(self) -> None:
self.load_format = LoadFormat(load_format)
rocm_not_supported_load_format: List[str] = []
- if is_hip() and load_format in rocm_not_supported_load_format:
+ if current_platform.is_rocm(
+ ) and load_format in rocm_not_supported_load_format:
rocm_supported_load_format = [
f for f in LoadFormat.__members__
if (f not in rocm_not_supported_load_format)
@@ -967,7 +968,7 @@ def _verify_args(self) -> None:
if self.use_ray:
from vllm.executor import ray_utils
ray_utils.assert_ray_available()
- if is_hip():
+ if current_platform.is_rocm():
self.disable_custom_all_reduce = True
logger.info(
"Disabled the custom all-reduce kernel because it is not "
@@ -996,7 +997,7 @@ class SchedulerConfig:
prompt latency) before scheduling next prompt.
enable_chunked_prefill: If True, prefill requests can be chunked based
on the remaining max_num_batched_tokens.
- preemption_mode: Whether to perform preemption by swapping or
+ preemption_mode: Whether to perform preemption by swapping or
recomputation. If not specified, we determine the mode as follows:
We use recomputation by default since it incurs lower overhead than
swapping. However, when the sequence group has multiple sequences
@@ -1215,7 +1216,7 @@ def maybe_create_spec_config(
typical_acceptance_sampler_posterior_threshold (Optional[float]):
A threshold value that sets a lower bound on the posterior
probability of a token in the target model for it to be
- accepted. This threshold is used only when we use the
+ accepted. This threshold is used only when we use the
TypicalAcceptanceSampler for token acceptance.
typical_acceptance_sampler_posterior_alpha (Optional[float]):
A scaling factor for the entropy-based threshold in the
@@ -1225,7 +1226,7 @@ def maybe_create_spec_config(
If set to False, token log probabilities are returned
according to the log probability settings in SamplingParams.
If not specified, it defaults to True.
-
+
Returns:
Optional["SpeculativeConfig"]: An instance of SpeculativeConfig if
the necessary conditions are met, else None.
@@ -1470,13 +1471,13 @@ def __init__(
typical_acceptance_sampler_posterior_threshold (Optional[float]):
A threshold value that sets a lower bound on the posterior
probability of a token in the target model for it to be
- accepted. This threshold is used only when we use the
+ accepted. This threshold is used only when we use the
TypicalAcceptanceSampler for token acceptance.
typical_acceptance_sampler_posterior_alpha (Optional[float]):
A scaling factor for the entropy-based threshold in the
TypicalAcceptanceSampler.
disable_logprobs: If set to True, token log probabilities will not
- be returned even if requested by sampling parameters. This
+ be returned even if requested by sampling parameters. This
reduces latency by skipping logprob calculation in proposal
sampling, target sampling, and after accepted tokens are
determined. If set to False, log probabilities will be
@@ -1843,10 +1844,10 @@ def get_min_sliding_window(
def get_served_model_name(model: str,
served_model_name: Optional[Union[str, List[str]]]):
"""
- If the input is a non-empty list, the first model_name in
- `served_model_name` is taken.
- If the input is a non-empty string, it is used directly.
- For cases where the input is either an empty string or an
+ If the input is a non-empty list, the first model_name in
+ `served_model_name` is taken.
+ If the input is a non-empty string, it is used directly.
+ For cases where the input is either an empty string or an
empty list, the fallback is to use `self.model`.
"""
if not served_model_name:
diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py
index 0af7b3386d895..aa546ebada473 100644
--- a/vllm/executor/ray_utils.py
+++ b/vllm/executor/ray_utils.py
@@ -10,7 +10,7 @@
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.sequence import ExecuteModelRequest, IntermediateTensors
-from vllm.utils import get_ip, is_hip
+from vllm.utils import get_ip
from vllm.worker.worker_base import WorkerWrapperBase
logger = init_logger(__name__)
@@ -231,7 +231,7 @@ def initialize_ray_cluster(
assert_ray_available()
# Connect to a ray cluster.
- if is_hip() or current_platform.is_xpu():
+ if current_platform.is_rocm() or current_platform.is_xpu():
ray.init(address=ray_address,
ignore_reinit_error=True,
num_gpus=parallel_config.world_size)
diff --git a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py
index 71eed6eb68d78..83910339f3c9f 100644
--- a/vllm/model_executor/custom_op.py
+++ b/vllm/model_executor/custom_op.py
@@ -7,7 +7,7 @@
from vllm.compilation.levels import CompilationLevel
from vllm.logger import init_logger
from vllm.platforms import current_platform
-from vllm.utils import is_hip, print_warning_once
+from vllm.utils import print_warning_once
logger = init_logger(__name__)
@@ -72,7 +72,7 @@ def dispatch_forward(self):
if not enabled:
return self.forward_native
- if is_hip():
+ if current_platform.is_rocm():
return self.forward_hip
elif current_platform.is_cpu():
return self.forward_cpu
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index c21aaa40ff2cc..be3d3985a74ad 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -14,7 +14,8 @@
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
all_close_1d, normalize_e4m3fn_to_e4m3fnuz, per_tensor_dequantize)
from vllm.model_executor.utils import set_weight_attrs
-from vllm.utils import is_hip, print_warning_once
+from vllm.platforms import current_platform
+from vllm.utils import print_warning_once
class GPTQMarlinState(Enum):
@@ -150,7 +151,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
layer.w2_input_scale.max(), requires_grad=False)
# If rocm, normalize the weights and scales to e4m3fnuz
- if is_hip():
+ if current_platform.is_rocm():
# Normalize the weights and scales
w13_weight, w13_weight_scale, w13_input_scale = \
normalize_e4m3fn_to_e4m3fnuz(
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
index 7270b302ef965..73cc8ce0d2a4b 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
@@ -12,7 +12,7 @@
from vllm.model_executor.parameter import (ChannelQuantScaleParameter,
ModelWeightParameter,
PerTensorScaleParameter)
-from vllm.utils import is_hip
+from vllm.platforms import current_platform
__all__ = ["CompressedTensorsW8A8Fp8"]
@@ -40,7 +40,7 @@ def process_weights_after_loading(self, layer) -> None:
logical_widths=layer.logical_widths,
)
- if is_hip():
+ if current_platform.is_rocm():
weight, max_w_scale, input_scale = normalize_e4m3fn_to_e4m3fnuz(
weight=weight,
weight_scale=max_w_scale,
@@ -56,7 +56,7 @@ def process_weights_after_loading(self, layer) -> None:
elif self.strategy == QuantizationStrategy.CHANNEL:
weight = layer.weight
- if is_hip():
+ if current_platform.is_rocm():
weight, weight_scale, input_scale = \
normalize_e4m3fn_to_e4m3fnuz(
weight=weight,
diff --git a/vllm/model_executor/layers/quantization/fbgemm_fp8.py b/vllm/model_executor/layers/quantization/fbgemm_fp8.py
index f26907176ad1a..825d01d1b3551 100644
--- a/vllm/model_executor/layers/quantization/fbgemm_fp8.py
+++ b/vllm/model_executor/layers/quantization/fbgemm_fp8.py
@@ -19,7 +19,6 @@
from vllm.model_executor.parameter import (ChannelQuantScaleParameter,
ModelWeightParameter)
from vllm.platforms import current_platform
-from vllm.utils import is_hip
logger = init_logger(__name__)
@@ -127,7 +126,7 @@ def process_weights_after_loading(self, layer: Module) -> None:
weight = layer.weight
- if is_hip():
+ if current_platform.is_rocm():
weight, weight_scale, input_scale = \
normalize_e4m3fn_to_e4m3fnuz(
weight=weight,
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index b5feb55db0e74..d34579b7099bb 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -26,7 +26,7 @@
PerTensorScaleParameter)
from vllm.model_executor.utils import set_weight_attrs
from vllm.platforms import current_platform
-from vllm.utils import is_hip, print_warning_once
+from vllm.utils import print_warning_once
ACTIVATION_SCHEMES = ["static", "dynamic"]
@@ -123,7 +123,7 @@ def __init__(self, quant_config: Fp8Config):
self.use_marlin = (not current_platform.has_device_capability(89)
or envs.VLLM_TEST_FORCE_FP8_MARLIN)
# Disable marlin for rocm
- if is_hip():
+ if current_platform.is_rocm():
self.use_marlin = False
def create_weights(
@@ -226,7 +226,7 @@ def process_weights_after_loading(self, layer: Module) -> None:
weight_scale = layer.weight_scale
# If rocm, use float8_e4m3fnuz.
- if is_hip():
+ if current_platform.is_rocm():
weight, weight_scale, input_scale = \
normalize_e4m3fn_to_e4m3fnuz(
weight=weight,
@@ -372,7 +372,7 @@ def process_weights_after_loading(self, layer: Module) -> None:
if not self.quant_config.is_checkpoint_fp8_serialized:
# If rocm, use float8_e4m3fnuz as dtype
fp8_dtype = torch.float8_e4m3fnuz \
- if is_hip() else torch.float8_e4m3fn
+ if current_platform.is_rocm() else torch.float8_e4m3fn
w13_weight = torch.empty_like(layer.w13_weight.data,
dtype=fp8_dtype)
w2_weight = torch.empty_like(layer.w2_weight.data, dtype=fp8_dtype)
@@ -420,7 +420,7 @@ def process_weights_after_loading(self, layer: Module) -> None:
layer.w2_input_scale = torch.nn.Parameter(
layer.w2_input_scale.max(), requires_grad=False)
# If rocm, normalize the weights and scales to e4m3fnuz
- if is_hip():
+ if current_platform.is_rocm():
# Normalize the weights and scales
w13_weight, w13_weight_scale, w13_input_scale = \
normalize_e4m3fn_to_e4m3fnuz(
diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
index 411af922149fd..1879d2855d93d 100644
--- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
@@ -4,16 +4,16 @@
from vllm import _custom_ops as ops
from vllm.platforms import current_platform
-from vllm.utils import is_hip
# Input scaling factors are no longer optional in _scaled_mm starting
# from pytorch 2.5. Allocating a dummy tensor to pass as input_scale
-TORCH_DEVICE_IDENTITY = torch.ones(1).cuda() if is_hip() else None
+TORCH_DEVICE_IDENTITY = torch.ones(1).cuda() \
+ if current_platform.is_rocm() else None
def cutlass_fp8_supported() -> bool:
# cutlass is not supported on Rocm
- if is_hip():
+ if current_platform.is_rocm():
return False
capability_tuple = current_platform.get_device_capability()
diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py
index 4126ceb7117d4..22f194c776b69 100644
--- a/vllm/model_executor/models/exaone.py
+++ b/vllm/model_executor/models/exaone.py
@@ -49,9 +49,9 @@
from vllm.model_executor.model_loader.weight_utils import (
default_weight_loader, kv_cache_scales_loader, maybe_remap_kv_scale_name)
from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.platforms import current_platform
from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.configs.exaone import ExaoneConfig
-from vllm.utils import is_hip
from .interfaces import SupportsLoRA, SupportsPP
from .utils import (PPMissingLayer, is_pp_missing_parameter,
@@ -595,7 +595,7 @@ def load_kv_cache_scales(self, quantization_param_path: str) -> None:
if not isinstance(self.transformer.h[layer_idx], nn.Identity):
layer_self_attn = self.transformer.h[layer_idx].attn
- if is_hip():
+ if current_platform.is_rocm():
# The scaling factor convention we are assuming is
# quantized_value * scaling_factor ~= true_value
# which is consistent with the practice of setting
diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py
index 5a397ed8ff6a0..c968817747754 100644
--- a/vllm/model_executor/models/granite.py
+++ b/vllm/model_executor/models/granite.py
@@ -49,8 +49,8 @@
from vllm.model_executor.model_loader.weight_utils import (
default_weight_loader, kv_cache_scales_loader, maybe_remap_kv_scale_name)
from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.platforms import current_platform
from vllm.sequence import IntermediateTensors
-from vllm.utils import is_hip
from .interfaces import SupportsLoRA, SupportsPP
from .utils import PPMissingLayer, is_pp_missing_parameter, make_layers
@@ -534,7 +534,7 @@ def load_kv_cache_scales(self, quantization_param_path: str) -> None:
if not isinstance(self.model.layers[layer_idx], nn.Identity):
layer_self_attn = self.model.layers[layer_idx].self_attn
- if is_hip():
+ if current_platform.is_rocm():
# The scaling factor convention we are assuming is
# quantized_value * scaling_factor ~= true_value
# which is consistent with the practice of setting
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index c346e3e808e3f..b0ca1fe006239 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -50,8 +50,8 @@
default_weight_loader, kv_cache_scales_loader, maybe_remap_kv_scale_name)
from vllm.model_executor.pooling_metadata import PoolingMetadata
from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.platforms import current_platform
from vllm.sequence import IntermediateTensors, PoolerOutput
-from vllm.utils import is_hip
from .interfaces import SupportsLoRA, SupportsPP
from .utils import (AutoWeightsLoader, PPMissingLayer, is_pp_missing_parameter,
@@ -423,7 +423,7 @@ def load_kv_cache_scales(self, quantization_param_path: str) -> None:
if not isinstance(self.layers[layer_idx], nn.Identity):
layer_self_attn = self.layers[layer_idx].self_attn
- if is_hip():
+ if current_platform.is_rocm():
# The scaling factor convention we are assuming is
# quantized_value * scaling_factor ~= true_value
# which is consistent with the practice of setting
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index f6713ab0898f0..595a9256f958e 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -12,7 +12,7 @@
import torch.nn as nn
from vllm.logger import init_logger
-from vllm.utils import is_hip
+from vllm.platforms import current_platform
from .interfaces import (has_inner_state, is_attention_free,
supports_multimodal, supports_pp)
@@ -247,7 +247,7 @@ def _try_load_model_cls(
model_arch: str,
model: _BaseRegisteredModel,
) -> Optional[Type[nn.Module]]:
- if is_hip():
+ if current_platform.is_rocm():
if model_arch in _ROCM_UNSUPPORTED_MODELS:
raise ValueError(f"Model architecture '{model_arch}' is not "
"supported by ROCm for now.")
diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py
index 5a3dd3c02b85b..e3e7ccb5cf179 100644
--- a/vllm/model_executor/models/solar.py
+++ b/vllm/model_executor/models/solar.py
@@ -49,8 +49,8 @@
from vllm.model_executor.model_loader.weight_utils import (
default_weight_loader, kv_cache_scales_loader, maybe_remap_kv_scale_name)
from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.platforms import current_platform
from vllm.sequence import IntermediateTensors
-from vllm.utils import is_hip
from .interfaces import SupportsLoRA, SupportsPP
from .utils import (PPMissingLayer, is_pp_missing_parameter,
@@ -558,7 +558,7 @@ def load_kv_cache_scales(self, quantization_param_path: str) -> None:
if not isinstance(self.model.layers[layer_idx], nn.Identity):
layer_self_attn = self.model.layers[layer_idx].self_attn
- if is_hip():
+ if current_platform.is_rocm():
# The scaling factor convention we are assuming is
# quantized_value * scaling_factor ~= true_value
# which is consistent with the practice of setting
diff --git a/vllm/utils.py b/vllm/utils.py
index d4f2c936ca9cc..c3f9a6bdd8b80 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -314,10 +314,6 @@ def reset(self):
self._index = 0
-def is_hip() -> bool:
- return torch.version.hip is not None
-
-
@lru_cache(maxsize=None)
def get_max_shared_memory_bytes(gpu: int = 0) -> int:
"""Returns the maximum shared memory per thread block in bytes."""
@@ -1098,7 +1094,7 @@ def _cuda_device_count_stateless(
if not torch.cuda._is_compiled():
return 0
- if is_hip():
+ if current_platform.is_rocm():
# ROCm uses amdsmi instead of nvml for stateless device count
# This requires a sufficiently modern version of Torch 2.4.0
raw_count = torch.cuda._device_count_amdsmi() if (hasattr(
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 4a287e3741d0f..233a9e664d845 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -41,6 +41,7 @@
from vllm.model_executor.models.utils import set_cpu_offload_max_bytes
from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs,
MultiModalInputs, MultiModalRegistry)
+from vllm.platforms import current_platform
from vllm.prompt_adapter.layers import PromptAdapterMapping
from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.prompt_adapter.worker_manager import (
@@ -49,7 +50,7 @@
from vllm.sequence import IntermediateTensors, SequenceGroupMetadata
from vllm.transformers_utils.config import uses_mrope
from vllm.utils import (DeviceMemoryProfiler, PyObjectCache, async_tensor_h2d,
- flatten_2d_lists, is_hip, is_pin_memory_available,
+ flatten_2d_lists, is_pin_memory_available,
supports_dynamo, weak_ref_tensor)
from vllm.worker.model_runner_base import (
ModelRunnerBase, ModelRunnerInputBase, ModelRunnerInputBuilderBase,
@@ -737,13 +738,13 @@ def _get_cuda_graph_pad_size(self,
family of functions.
Args:
- num_seqs (int): Number of sequences scheduled to run.
+ num_seqs (int): Number of sequences scheduled to run.
max_decode_seq_len (int): Greatest of all the decode sequence
lengths. Used only in checking the viability of using
CUDA graphs.
max_encoder_seq_len (int, optional): Greatest of all the encode
sequence lengths. Defaults to 0. Used only in checking the
- viability of using CUDA graphs.
+ viability of using CUDA graphs.
Returns:
int: Returns the determined number of padding sequences. If
CUDA graphs is not viable, returns -1.
@@ -1103,7 +1104,7 @@ def load_model(self) -> None:
self.prompt_adapter_manager.create_prompt_adapter_manager(
self.model))
- if self.kv_cache_dtype == "fp8" and is_hip():
+ if self.kv_cache_dtype == "fp8" and current_platform.is_rocm():
# Currently only ROCm accepts kv-cache scaling factors
# via quantization_param_path and this will be deprecated
# in the future.
From 32176fee733b76b295346870d717d44cb7102944 Mon Sep 17 00:00:00 2001
From: youkaichao
Date: Sun, 27 Oct 2024 21:58:04 -0700
Subject: [PATCH 015/113] [torch.compile] support moe models (#9632)
Signed-off-by: youkaichao
---
benchmarks/kernels/benchmark_moe.py | 33 +++---
tests/compile/test_basic_correctness.py | 4 +-
tests/kernels/test_awq_marlin.py | 21 ++--
tests/kernels/test_moe.py | 7 +-
.../layers/fused_moe/__init__.py | 28 ++++-
.../layers/fused_moe/fused_marlin_moe.py | 51 +++++++--
.../layers/fused_moe/fused_moe.py | 100 ++++++++++++++++--
vllm/model_executor/layers/fused_moe/layer.py | 29 +++--
.../layers/quantization/awq_marlin.py | 7 +-
.../compressed_tensors_moe.py | 7 +-
.../layers/quantization/gptq_marlin.py | 6 +-
vllm/model_executor/models/granitemoe.py | 2 +
12 files changed, 217 insertions(+), 78 deletions(-)
diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py
index c2ad98b7e2656..4f88e8e6eb1a6 100644
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -88,22 +88,23 @@ def prepare(i: int):
input_gating.copy_(gating_output[i])
def run():
- fused_moe(
- x,
- w1,
- w2,
- input_gating,
- topk,
- renormalize=True,
- inplace=True,
- override_config=config,
- use_fp8_w8a8=use_fp8_w8a8,
- use_int8_w8a16=use_int8_w8a16,
- w1_scale=w1_scale,
- w2_scale=w2_scale,
- a1_scale=a1_scale,
- a2_scale=a2_scale,
- )
+ from vllm.model_executor.layers.fused_moe import override_config
+ with override_config(config):
+ fused_moe(
+ x,
+ w1,
+ w2,
+ input_gating,
+ topk,
+ renormalize=True,
+ inplace=True,
+ use_fp8_w8a8=use_fp8_w8a8,
+ use_int8_w8a16=use_int8_w8a16,
+ w1_scale=w1_scale,
+ w2_scale=w2_scale,
+ a1_scale=a1_scale,
+ a2_scale=a2_scale,
+ )
# JIT compilation & warmup
run()
diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py
index 77c56d91d0a8b..6aa27b24b4a6e 100644
--- a/tests/compile/test_basic_correctness.py
+++ b/tests/compile/test_basic_correctness.py
@@ -13,11 +13,11 @@
@pytest.mark.parametrize(
"model, model_args, pp_size, tp_size, attn_backend, method, fullgraph",
[
- ("meta-llama/Llama-3.2-1B", [], 2, 2, "FLASH_ATTN", "generate", True),
+ ("meta-llama/Llama-3.2-1B", [], 2, 2, "FLASHINFER", "generate", True),
("nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples",
["--quantization", "compressed-tensors"
], 1, 1, "FLASH_ATTN", "generate", True),
- ("google/gemma-2-2b-it", [], 1, 2, "FLASHINFER", "generate", True),
+ ("ibm/PowerMoE-3b", [], 1, 2, "FLASH_ATTN", "generate", True),
# TODO: add multi-modality test for llava
("llava-hf/llava-1.5-7b-hf", [], 2, 1, "FLASHINFER", "generate", False)
])
diff --git a/tests/kernels/test_awq_marlin.py b/tests/kernels/test_awq_marlin.py
index 0f0a2b24563fd..59917dd2c58ad 100644
--- a/tests/kernels/test_awq_marlin.py
+++ b/tests/kernels/test_awq_marlin.py
@@ -5,11 +5,10 @@
import pytest
import torch
+import vllm.model_executor.layers.fused_moe # noqa
from tests.kernels.utils import (compute_max_diff, stack_and_dev, torch_moe,
torch_moe_single)
from vllm import _custom_ops as ops
-from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
- fused_marlin_moe, single_marlin_moe)
from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
awq_marlin_quantize)
@@ -81,7 +80,7 @@ def test_fused_marlin_moe_awq(
score = torch.randn((m, e), device="cuda", dtype=dtype)
topk_weights, topk_ids = fused_topk(a, score, topk, False)
- marlin_output = fused_marlin_moe(
+ marlin_output = torch.ops.vllm.fused_marlin_moe(
a,
qweight1,
qweight2,
@@ -150,14 +149,14 @@ def test_single_marlin_moe_multiply_awq(
score = torch.randn((m, e), device="cuda", dtype=dtype)
- marlin_output = single_marlin_moe(a,
- qweight,
- scales,
- score,
- topk,
- renormalize=False,
- w_zeros=zp,
- num_bits=num_bits)
+ marlin_output = torch.ops.vllm.single_marlin_moe(a,
+ qweight,
+ scales,
+ score,
+ topk,
+ renormalize=False,
+ w_zeros=zp,
+ num_bits=num_bits)
torch_output = torch_moe_single(a, w_ref.transpose(1, 2), score, topk)
diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py
index 4bfc089c82179..70906ab2187bc 100644
--- a/tests/kernels/test_moe.py
+++ b/tests/kernels/test_moe.py
@@ -7,12 +7,11 @@
from transformers import MixtralConfig
from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock
+import vllm.model_executor.layers.fused_moe # noqa
from tests.kernels.utils import (compute_max_diff, opcheck, stack_and_dev,
torch_moe, torch_moe_single)
from vllm import _custom_ops as ops
from vllm.model_executor.layers.fused_moe import fused_moe
-from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
- fused_marlin_moe, single_marlin_moe)
from vllm.model_executor.layers.fused_moe.fused_moe import (
fused_topk, moe_align_block_size)
from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
@@ -193,7 +192,7 @@ def test_fused_marlin_moe(
topk,
renormalize=False,
)
- marlin_output = fused_marlin_moe(
+ marlin_output = torch.ops.vllm.fused_marlin_moe(
a,
qweight1,
qweight2,
@@ -309,7 +308,7 @@ def test_single_marlin_moe_multiply(
sort_indices = stack_and_dev(sort_indices_l)
score = torch.randn((m, e), device="cuda", dtype=dtype)
- marlin_output = single_marlin_moe(
+ marlin_output = torch.ops.vllm.single_marlin_moe(
a,
qweight,
scales,
diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py
index e9b5703ca28be..c4223d12600ac 100644
--- a/vllm/model_executor/layers/fused_moe/__init__.py
+++ b/vllm/model_executor/layers/fused_moe/__init__.py
@@ -1,23 +1,43 @@
+from contextlib import contextmanager
+from typing import Any, Dict, Optional
+
from vllm.model_executor.layers.fused_moe.layer import (
FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported)
from vllm.triton_utils import HAS_TRITON
+_config: Optional[Dict[str, Any]] = None
+
+
+@contextmanager
+def override_config(config):
+ global _config
+ old_config = _config
+ _config = config
+ yield
+ _config = old_config
+
+
+def get_config() -> Optional[Dict[str, Any]]:
+ return _config
+
+
__all__ = [
"FusedMoE",
"FusedMoEMethodBase",
"FusedMoeWeightScaleSupported",
+ "override_config",
+ "get_config",
]
if HAS_TRITON:
- from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
- fused_marlin_moe, single_marlin_moe)
+ # import to register the custom ops
+ import vllm.model_executor.layers.fused_moe.fused_marlin_moe # noqa
+ import vllm.model_executor.layers.fused_moe.fused_moe # noqa
from vllm.model_executor.layers.fused_moe.fused_moe import (
fused_experts, fused_moe, fused_topk, get_config_file_name,
grouped_topk)
__all__ += [
- "fused_marlin_moe",
- "single_marlin_moe",
"fused_moe",
"fused_topk",
"fused_experts",
diff --git a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
index 5ae40a2af5a2b..93019d0d0abb6 100644
--- a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
@@ -1,6 +1,6 @@
"""Fused MoE utilities for GPTQ."""
import functools
-from typing import Any, Dict, Optional
+from typing import Optional
import torch
@@ -18,6 +18,7 @@ def get_scalar_type(num_bits: int, has_zp: bool):
return scalar_types.uint4b8 if num_bits == 4 else scalar_types.uint8b128
+@torch.library.custom_op("vllm::single_marlin_moe", mutates_args=[])
def single_marlin_moe(
hidden_states: torch.Tensor,
w: torch.Tensor,
@@ -28,7 +29,6 @@ def single_marlin_moe(
g_idx: Optional[torch.Tensor] = None,
sort_indices: Optional[torch.Tensor] = None,
w_zeros: Optional[torch.Tensor] = None,
- override_config: Optional[Dict[str, Any]] = None,
num_bits: int = 8,
is_k_full: bool = True,
) -> torch.Tensor:
@@ -49,8 +49,6 @@ def single_marlin_moe(
- topk (int): The number of top-k experts to select.
- renormalize (bool): If True, renormalize the top-k weights to sum to 1.
- w_zeros (Optional[torch.Tensor]): Optional zero points to be used for w.
- - override_config (Optional[Dict[str, Any]]): Optional override
- for the kernel configuration.
- num_bits (bool): The number of bits in expert weights quantization.
Returns:
@@ -79,7 +77,6 @@ def single_marlin_moe(
w.shape,
topk_ids.shape[1],
None,
- override_config=override_config,
is_marlin=True)
config = get_config_func(M)
@@ -122,6 +119,24 @@ def single_marlin_moe(
return torch.sum(intermediate_cache.view(*intermediate_cache.shape), dim=1)
+@single_marlin_moe.register_fake
+def _(
+ hidden_states: torch.Tensor,
+ w: torch.Tensor,
+ scales: torch.Tensor,
+ gating_output: torch.Tensor,
+ topk: int,
+ renormalize: bool,
+ g_idx: Optional[torch.Tensor] = None,
+ sort_indices: Optional[torch.Tensor] = None,
+ w_zeros: Optional[torch.Tensor] = None,
+ num_bits: int = 8,
+ is_k_full: bool = True,
+) -> torch.Tensor:
+ return torch.empty_like(hidden_states)
+
+
+@torch.library.custom_op("vllm::fused_marlin_moe", mutates_args=[])
def fused_marlin_moe(
hidden_states: torch.Tensor,
w1: torch.Tensor,
@@ -137,7 +152,6 @@ def fused_marlin_moe(
sort_indices2: Optional[torch.Tensor] = None,
w1_zeros: Optional[torch.Tensor] = None,
w2_zeros: Optional[torch.Tensor] = None,
- override_config: Optional[Dict[str, Any]] = None,
num_bits: int = 8,
is_k_full: bool = True,
) -> torch.Tensor:
@@ -161,8 +175,6 @@ def fused_marlin_moe(
permutation.
- topk_weights (torch.Tensor): Top-k weights.
- topk_ids (torch.Tensor): Indices of topk-k elements.
- - override_config (Optional[Dict[str, Any]]): Optional override
- for the kernel configuration.
- w1_zeros (Optional[torch.Tensor]): Optional zero points to be used for w1.
- w2_zeros (Optional[torch.Tensor]): Optional zero points to be used for w2.
- num_bits (bool): The number of bits in expert weights quantization.
@@ -209,7 +221,6 @@ def fused_marlin_moe(
w2.shape,
topk_ids.shape[1],
None,
- override_config=override_config,
is_marlin=True,
)
config = get_config_func(M)
@@ -311,3 +322,25 @@ def fused_marlin_moe(
return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape),
dim=1)
+
+
+@fused_marlin_moe.register_fake
+def _(
+ hidden_states: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_scale: torch.Tensor,
+ w2_scale: torch.Tensor,
+ gating_output: torch.Tensor,
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ g_idx1: Optional[torch.Tensor] = None,
+ g_idx2: Optional[torch.Tensor] = None,
+ sort_indices1: Optional[torch.Tensor] = None,
+ sort_indices2: Optional[torch.Tensor] = None,
+ w1_zeros: Optional[torch.Tensor] = None,
+ w2_zeros: Optional[torch.Tensor] = None,
+ num_bits: int = 8,
+ is_k_full: bool = True,
+) -> torch.Tensor:
+ return torch.empty_like(hidden_states)
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index 90a4209b5bce5..1cf5c2253ca0b 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -358,9 +358,10 @@ def try_get_optimal_moe_config(
top_k: int,
dtype: Optional[str],
M: int,
- override_config: Optional[Dict[str, Any]] = None,
is_marlin: bool = False,
):
+ from vllm.model_executor.layers.fused_moe import get_config
+ override_config = get_config()
if override_config:
config = override_config
else:
@@ -465,19 +466,109 @@ def get_config_dtype_str(dtype: torch.dtype,
return None
+@torch.library.custom_op("vllm::inplace_fused_experts",
+ mutates_args=["hidden_states"])
+def inplace_fused_experts(hidden_states: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ use_fp8_w8a8: bool = False,
+ use_int8_w8a16: bool = False,
+ w1_scale: Optional[torch.Tensor] = None,
+ w2_scale: Optional[torch.Tensor] = None,
+ a1_scale: Optional[torch.Tensor] = None,
+ a2_scale: Optional[torch.Tensor] = None) -> None:
+ fused_experts_impl(hidden_states, w1, w2, topk_weights, topk_ids, True,
+ use_fp8_w8a8, use_int8_w8a16, w1_scale, w2_scale,
+ a1_scale, a2_scale)
+
+
+@inplace_fused_experts.register_fake
+def _(hidden_states: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ use_fp8_w8a8: bool = False,
+ use_int8_w8a16: bool = False,
+ w1_scale: Optional[torch.Tensor] = None,
+ w2_scale: Optional[torch.Tensor] = None,
+ a1_scale: Optional[torch.Tensor] = None,
+ a2_scale: Optional[torch.Tensor] = None) -> None:
+ pass
+
+
+@torch.library.custom_op("vllm::outplace_fused_experts", mutates_args=[])
+def outplace_fused_experts(
+ hidden_states: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ use_fp8_w8a8: bool = False,
+ use_int8_w8a16: bool = False,
+ w1_scale: Optional[torch.Tensor] = None,
+ w2_scale: Optional[torch.Tensor] = None,
+ a1_scale: Optional[torch.Tensor] = None,
+ a2_scale: Optional[torch.Tensor] = None) -> torch.Tensor:
+ return fused_experts_impl(hidden_states, w1, w2, topk_weights, topk_ids,
+ False, use_fp8_w8a8, use_int8_w8a16, w1_scale,
+ w2_scale, a1_scale, a2_scale)
+
+
+@outplace_fused_experts.register_fake
+def _(hidden_states: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ use_fp8_w8a8: bool = False,
+ use_int8_w8a16: bool = False,
+ w1_scale: Optional[torch.Tensor] = None,
+ w2_scale: Optional[torch.Tensor] = None,
+ a1_scale: Optional[torch.Tensor] = None,
+ a2_scale: Optional[torch.Tensor] = None) -> torch.Tensor:
+ return torch.empty_like(hidden_states)
+
+
def fused_experts(hidden_states: torch.Tensor,
w1: torch.Tensor,
w2: torch.Tensor,
topk_weights: torch.Tensor,
topk_ids: torch.Tensor,
inplace: bool = False,
- override_config: Optional[Dict[str, Any]] = None,
use_fp8_w8a8: bool = False,
use_int8_w8a16: bool = False,
w1_scale: Optional[torch.Tensor] = None,
w2_scale: Optional[torch.Tensor] = None,
a1_scale: Optional[torch.Tensor] = None,
a2_scale: Optional[torch.Tensor] = None):
+ if inplace:
+ torch.ops.vllm.inplace_fused_experts(hidden_states, w1, w2,
+ topk_weights, topk_ids,
+ use_fp8_w8a8, use_int8_w8a16,
+ w1_scale, w2_scale, a1_scale,
+ a2_scale)
+ return hidden_states
+ else:
+ return torch.ops.vllm.outplace_fused_experts(
+ hidden_states, w1, w2, topk_weights, topk_ids, use_fp8_w8a8,
+ use_int8_w8a16, w1_scale, w2_scale, a1_scale, a2_scale)
+
+
+def fused_experts_impl(hidden_states: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ inplace: bool = False,
+ use_fp8_w8a8: bool = False,
+ use_int8_w8a16: bool = False,
+ w1_scale: Optional[torch.Tensor] = None,
+ w2_scale: Optional[torch.Tensor] = None,
+ a1_scale: Optional[torch.Tensor] = None,
+ a2_scale: Optional[torch.Tensor] = None):
# Check constraints.
assert hidden_states.shape[1] == w1.shape[2], "Hidden size mismatch"
assert topk_weights.shape == topk_ids.shape, "topk shape mismatch"
@@ -504,7 +595,6 @@ def fused_experts(hidden_states: torch.Tensor,
w2.shape,
topk_ids.shape[1],
config_dtype,
- override_config=override_config,
)
config = get_config_func(M)
@@ -602,7 +692,6 @@ def fused_moe(
topk: int,
renormalize: bool,
inplace: bool = False,
- override_config: Optional[Dict[str, Any]] = None,
use_grouped_topk: bool = False,
num_expert_group: Optional[int] = None,
topk_group: Optional[int] = None,
@@ -628,8 +717,6 @@ def fused_moe(
- renormalize (bool): If True, renormalize the top-k weights to sum to 1.
- inplace (bool): If True, perform the operation in-place.
Defaults to False.
- - override_config (Optional[Dict[str, Any]]): Optional override
- for the kernel configuration.
- num_expert_group: Optional[int]: additional parameter for grouped_topk
- topk_group: Optional[int]: additional parameter for grouped_topk
- use_grouped_topk: If True, use grouped_topk instead of fused_topk
@@ -667,7 +754,6 @@ def fused_moe(
topk_weights,
topk_ids,
inplace=inplace,
- override_config=override_config,
use_fp8_w8a8=use_fp8_w8a8,
use_int8_w8a16=use_int8_w8a16,
w1_scale=w1_scale,
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index 8dd36620e3fa0..5570771ac917b 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -12,7 +12,16 @@
from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig, QuantizeMethodBase)
from vllm.model_executor.utils import set_weight_attrs
-
+from vllm.platforms import current_platform
+
+if current_platform.is_cuda_alike():
+ from .fused_moe import fused_experts
+else:
+ fused_experts = None # type: ignore
+if current_platform.is_tpu():
+ from .moe_pallas import fused_moe as fused_moe_pallas
+else:
+ fused_moe_pallas = None # type: ignore
logger = init_logger(__name__)
@@ -96,9 +105,6 @@ def forward_cuda(
num_expert_group: Optional[int] = None,
custom_routing_function: Optional[Callable] = None
) -> torch.Tensor:
- from vllm.model_executor.layers.fused_moe.fused_moe import (
- fused_experts)
-
topk_weights, topk_ids = FusedMoE.select_experts(
hidden_states=x,
router_logits=router_logits,
@@ -132,17 +138,18 @@ def forward_tpu(
num_expert_group: Optional[int] = None,
custom_routing_function: Optional[Callable] = None
) -> torch.Tensor:
- from vllm.model_executor.layers.fused_moe.moe_pallas import fused_moe
assert not use_grouped_topk
assert num_expert_group is None
assert topk_group is None
assert custom_routing_function is None
- return fused_moe(hidden_states=x,
- w1=layer.w13_weight,
- w2=layer.w2_weight,
- topk=top_k,
- gating_output=router_logits,
- renormalize=renormalize)
+ return fused_moe_pallas(hidden_states=x,
+ w1=layer.w13_weight,
+ w2=layer.w2_weight,
+ topk=top_k,
+ gating_output=router_logits,
+ renormalize=renormalize)
+
+ forward_native = forward_cuda
class FusedMoE(torch.nn.Module):
diff --git a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py
index b3d93b285769c..95ec12daeeeb5 100644
--- a/vllm/model_executor/layers/quantization/awq_marlin.py
+++ b/vllm/model_executor/layers/quantization/awq_marlin.py
@@ -3,6 +3,7 @@
import torch
from torch.nn import Parameter
+import vllm.model_executor.layers.fused_moe # noqa
from vllm import _custom_ops as ops
from vllm.logger import init_logger
from vllm.model_executor.layers.fused_moe.layer import (
@@ -435,10 +436,6 @@ def apply(
topk_group: Optional[int] = None,
custom_routing_function: Optional[Callable] = None,
) -> torch.Tensor:
-
- from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
- fused_marlin_moe)
-
topk_weights, topk_ids = FusedMoE.select_experts(
hidden_states=x,
router_logits=router_logits,
@@ -449,7 +446,7 @@ def apply(
num_expert_group=num_expert_group,
custom_routing_function=custom_routing_function)
- return fused_marlin_moe(
+ return torch.ops.vllm.fused_marlin_moe(
x,
layer.w13_qweight,
layer.w2_qweight,
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index be3d3985a74ad..dad04017d3212 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -6,6 +6,7 @@
from compressed_tensors import CompressionFormat
from compressed_tensors.quantization import QuantizationStrategy
+import vllm.model_executor.layers.fused_moe # noqa
from vllm import _custom_ops as ops
from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEMethodBase,
FusedMoeWeightScaleSupported)
@@ -481,10 +482,6 @@ def apply(
topk_group: Optional[int] = None,
custom_routing_function: Optional[Callable] = None,
) -> torch.Tensor:
-
- from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
- fused_marlin_moe)
-
topk_weights, topk_ids = FusedMoE.select_experts(
hidden_states=x,
router_logits=router_logits,
@@ -495,7 +492,7 @@ def apply(
num_expert_group=num_expert_group,
custom_routing_function=custom_routing_function)
- return fused_marlin_moe(
+ return torch.ops.vllm.fused_marlin_moe(
x,
layer.w13_weight_packed,
layer.w2_weight_packed,
diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py
index e77191796bd7e..b97dd108d6785 100644
--- a/vllm/model_executor/layers/quantization/gptq_marlin.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -2,6 +2,7 @@
import torch
+import vllm.model_executor.layers.fused_moe # noqa
from vllm import _custom_ops as ops
from vllm.logger import init_logger
from vllm.model_executor.layers.fused_moe.layer import (
@@ -536,9 +537,6 @@ def apply(
topk_group: Optional[int] = None,
custom_routing_function: Optional[Callable] = None,
) -> torch.Tensor:
- from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
- fused_marlin_moe)
-
# The input must currently be float16
orig_dtype = x.dtype
x = x.half()
@@ -553,7 +551,7 @@ def apply(
num_expert_group=num_expert_group,
custom_routing_function=None)
- return fused_marlin_moe(
+ return torch.ops.vllm.fused_marlin_moe(
x,
layer.w13_qweight,
layer.w2_qweight,
diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py
index fd0d4c89a28fe..5307bb21adb96 100644
--- a/vllm/model_executor/models/granitemoe.py
+++ b/vllm/model_executor/models/granitemoe.py
@@ -28,6 +28,7 @@
from transformers.models.granitemoe import GraniteMoeConfig
from vllm.attention import Attention, AttentionMetadata
+from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, LoRAConfig
from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
from vllm.model_executor.layers.fused_moe import FusedMoE
@@ -244,6 +245,7 @@ def forward(
return hidden_states
+@support_torch_compile
class GraniteMoeModel(nn.Module):
def __init__(
From feb92fbe4ab6803527df48658a87ebd00b99969f Mon Sep 17 00:00:00 2001
From: Robert Shaw
<114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
Date: Mon, 28 Oct 2024 02:59:37 -0400
Subject: [PATCH 016/113] Fix beam search eos (#9627)
---
vllm/engine/protocol.py | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py
index 5c504e0f0217d..b00dd136d4a47 100644
--- a/vllm/engine/protocol.py
+++ b/vllm/engine/protocol.py
@@ -140,7 +140,12 @@ async def beam_search(
best_beams = sorted_completed[:beam_width]
for beam in best_beams:
- beam.text = tokenizer.decode(beam.tokens[tokenized_length:])
+ if (beam.tokens[-1] == tokenizer.eos_token_id and not ignore_eos):
+ # Skip the eos token in the text.
+ tokens = beam.tokens[tokenized_length:-1]
+ else:
+ tokens = beam.tokens[tokenized_length:]
+ beam.text = tokenizer.decode(tokens)
beam_search_output = RequestOutput(
request_id=request_id,
From 2adb4409e0359039135b5aa6501994da12aa5a26 Mon Sep 17 00:00:00 2001
From: Yan Ma
Date: Mon, 28 Oct 2024 15:13:03 +0800
Subject: [PATCH 017/113] [Bugfix] Fix ray instance detect issue (#9439)
---
vllm/executor/ray_utils.py | 13 ++++++++++---
1 file changed, 10 insertions(+), 3 deletions(-)
diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py
index aa546ebada473..993d279890820 100644
--- a/vllm/executor/ray_utils.py
+++ b/vllm/executor/ray_utils.py
@@ -232,9 +232,16 @@ def initialize_ray_cluster(
# Connect to a ray cluster.
if current_platform.is_rocm() or current_platform.is_xpu():
- ray.init(address=ray_address,
- ignore_reinit_error=True,
- num_gpus=parallel_config.world_size)
+ # Try to connect existing ray instance and create a new one if not found
+ try:
+ ray.init("auto")
+ except ConnectionError:
+ logger.warning(
+ "No existing RAY instance detected. "
+ "A new instance will be launched with current node resources.")
+ ray.init(address=ray_address,
+ ignore_reinit_error=True,
+ num_gpus=parallel_config.world_size)
else:
ray.init(address=ray_address, ignore_reinit_error=True)
From 8b0e4f2ad7b5a3ddd6d61acbe8ceb50b4ea3c309 Mon Sep 17 00:00:00 2001
From: Russell Bryant
Date: Mon, 28 Oct 2024 12:38:09 -0400
Subject: [PATCH 018/113] [CI/Build] Adopt Mergify for auto-labeling PRs
(#9259)
Signed-off-by: Russell Bryant
---
.github/mergify.yml | 57 +++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 57 insertions(+)
create mode 100644 .github/mergify.yml
diff --git a/.github/mergify.yml b/.github/mergify.yml
new file mode 100644
index 0000000000000..2a3dee7c662d1
--- /dev/null
+++ b/.github/mergify.yml
@@ -0,0 +1,57 @@
+pull_request_rules:
+- name: label-documentation
+ description: Automatically apply documentation label
+ conditions:
+ - or:
+ - files~=^[^/]+\.md$
+ - files~=^docs/
+ actions:
+ label:
+ add:
+ - documentation
+
+- name: label-ci-build
+ description: Automatically apply ci/build label
+ conditions:
+ - files~=^\.github/
+ - files~=\.buildkite/
+ - files~=^cmake/
+ - files=CMakeLists.txt
+ - files~=^Dockerfile
+ - files~=^requirements.*\.txt
+ - files=setup.py
+ actions:
+ label:
+ add:
+ - ci/build
+
+- name: label-frontend
+ description: Automatically apply frontend label
+ conditions:
+ - files~=^vllm/entrypoints/
+ actions:
+ label:
+ add:
+ - frontend
+
+- name: ping author on conflicts and add 'needs-rebase' label
+ conditions:
+ - conflict
+ - -closed
+ actions:
+ label:
+ add:
+ - needs-rebase
+ comment:
+ message: |
+ This pull request has merge conflicts that must be resolved before it can be
+ merged. @{{author}} please rebase it. https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork
+
+- name: remove 'needs-rebase' label when conflict is resolved
+ conditions:
+ - -conflict
+ - -closed
+ actions:
+ label:
+ remove:
+ - needs-rebase
From 5f8d8075f957d5376b2f1cc451e35a2a757e95a5 Mon Sep 17 00:00:00 2001
From: litianjian <45817262+litianjian@users.noreply.github.com>
Date: Tue, 29 Oct 2024 02:04:10 +0800
Subject: [PATCH 019/113] [Model][VLM] Add multi-video support for
LLaVA-Onevision (#8905)
Co-authored-by: litianjian
Co-authored-by: DarkLight1337
---
.../vision_language/test_llava_onevision.py | 173 +++++-------------
vllm/model_executor/models/clip.py | 4 +-
vllm/model_executor/models/llava_onevision.py | 94 +++++++---
vllm/model_executor/models/siglip.py | 4 +-
vllm/multimodal/video.py | 10 +-
5 files changed, 123 insertions(+), 162 deletions(-)
diff --git a/tests/models/decoder_only/vision_language/test_llava_onevision.py b/tests/models/decoder_only/vision_language/test_llava_onevision.py
index 367f25f446279..1616fd299b9aa 100644
--- a/tests/models/decoder_only/vision_language/test_llava_onevision.py
+++ b/tests/models/decoder_only/vision_language/test_llava_onevision.py
@@ -1,4 +1,4 @@
-from typing import List, Optional, Tuple, Type, overload
+from typing import List, Optional, Tuple, Type
import pytest
from transformers import (AutoConfig, AutoModelForVision2Seq, AutoTokenizer,
@@ -9,9 +9,8 @@
from vllm.sequence import SampleLogprobs
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
-from ....conftest import (VIDEO_ASSETS, HfRunner, PromptImageInput, VllmRunner,
- _VideoAssets)
-from ....utils import large_gpu_test
+from ....conftest import (VIDEO_ASSETS, HfRunner, PromptImageInput,
+ PromptVideoInput, VllmRunner)
from ...utils import check_logprobs_close
# Video test
@@ -20,7 +19,7 @@
"<|im_start|>user\n
+---
+
+**vLLM x Snowflake Meetup (Wednesday, November 13th, 5:30-8PM PT) at Snowflake HQ, San Mateo**
+
+We are excited to announce the last in-person vLLM meetup of the year!
+Join the vLLM developers and engineers from Snowflake AI Research to chat about the latest LLM inference optimizations and your 2025 vLLM wishlist!
+Register [here](https://lu.ma/h0qvrajz) and be a part of the event!
+
+---
+
*Latest News* 🔥
-- [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there!
+- [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there!
- [2024/10] Ray Summit 2024 held a special track for vLLM! Please find the opening talk slides from the vLLM team [here](https://docs.google.com/presentation/d/1B_KQxpHBTRa_mDF-tR6i8rWdOU5QoTZNcEg2MKZxEHM/edit?usp=sharing). Learn more from the [talks](https://raysummit.anyscale.com/flow/anyscale/raysummit2024/landing/page/sessioncatalog?tab.day=20241001&search.sessiontracks=1719251906298001uzJ2) from other vLLM contributors and users!
- [2024/09] We hosted [the sixth vLLM meetup](https://lu.ma/87q3nvnh) with NVIDIA! Please find the meetup slides [here](https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing).
- [2024/07] We hosted [the fifth vLLM meetup](https://lu.ma/lp0gyjqr) with AWS! Please find the meetup slides [here](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing).
@@ -42,7 +52,7 @@ vLLM is fast with:
- Speculative decoding
- Chunked prefill
-**Performance benchmark**: We include a performance benchmark at the end of [our blog post](https://blog.vllm.ai/2024/09/05/perf-update.html). It compares the performance of vLLM against other LLM serving engines ([TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [SGLang](https://github.com/sgl-project/sglang) and [LMDeploy](https://github.com/InternLM/lmdeploy)). The implementation is under [nightly-benchmarks folder](.buildkite/nightly-benchmarks/) and you can [reproduce](https://github.com/vllm-project/vllm/issues/8176) this benchmark using our one-click runnable script.
+**Performance benchmark**: We include a performance benchmark at the end of [our blog post](https://blog.vllm.ai/2024/09/05/perf-update.html). It compares the performance of vLLM against other LLM serving engines ([TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [SGLang](https://github.com/sgl-project/sglang) and [LMDeploy](https://github.com/InternLM/lmdeploy)). The implementation is under [nightly-benchmarks folder](.buildkite/nightly-benchmarks/) and you can [reproduce](https://github.com/vllm-project/vllm/issues/8176) this benchmark using our one-click runnable script.
vLLM is flexible and easy to use with:
From bc73e9821cb4f90a88c04e7d550f132d8911266b Mon Sep 17 00:00:00 2001
From: Michael Goin
Date: Tue, 29 Oct 2024 19:02:59 -0400
Subject: [PATCH 041/113] [Bugfix] Fix prefix strings for quantized VLMs
(#9772)
---
vllm/model_executor/model_loader/loader.py | 11 +++-
vllm/model_executor/models/blip2.py | 5 +-
vllm/model_executor/models/gemma.py | 58 +++++++++++++------
vllm/model_executor/models/internlm2.py | 56 ++++++++++++------
vllm/model_executor/models/internlm2_ve.py | 16 +++--
vllm/model_executor/models/internvl.py | 5 +-
vllm/model_executor/models/llama.py | 7 ++-
vllm/model_executor/models/llava.py | 20 +++++--
vllm/model_executor/models/llava_next.py | 10 +++-
.../model_executor/models/llava_next_video.py | 10 +++-
vllm/model_executor/models/llava_onevision.py | 10 +++-
vllm/model_executor/models/minicpmv.py | 34 ++++++++---
vllm/model_executor/models/opt.py | 34 ++++++++---
vllm/model_executor/models/paligemma.py | 7 ++-
vllm/model_executor/models/phi3v.py | 19 ++++--
vllm/model_executor/models/pixtral.py | 5 +-
vllm/model_executor/models/qwen2.py | 50 +++++++++++-----
vllm/model_executor/models/qwen2_vl.py | 8 ++-
vllm/model_executor/models/ultravox.py | 5 +-
vllm/model_executor/models/utils.py | 15 +++++
20 files changed, 288 insertions(+), 97 deletions(-)
diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index 3cfee13b9fa6e..3ae8a51859f70 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -147,15 +147,20 @@ def _get_model_initialization_kwargs(
return extra_kwargs
-def build_model(model_class: Type[nn.Module], hf_config: PretrainedConfig,
+def build_model(model_class: Type[nn.Module],
+ hf_config: PretrainedConfig,
cache_config: Optional[CacheConfig],
- quant_config: Optional[QuantizationConfig], *,
+ quant_config: Optional[QuantizationConfig],
+ *,
lora_config: Optional[LoRAConfig],
multimodal_config: Optional[MultiModalConfig],
- scheduler_config: Optional[SchedulerConfig]) -> nn.Module:
+ scheduler_config: Optional[SchedulerConfig],
+ prefix: Optional[str] = None) -> nn.Module:
extra_kwargs = _get_model_initialization_kwargs(model_class, lora_config,
multimodal_config,
scheduler_config)
+ if prefix:
+ extra_kwargs["prefix"] = prefix
return model_class(config=hf_config,
cache_config=cache_config,
diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py
index cd2013e91514d..c3b3cc8a4ddb6 100644
--- a/vllm/model_executor/models/blip2.py
+++ b/vllm/model_executor/models/blip2.py
@@ -507,7 +507,10 @@ def __init__(self,
)
self.language_model = init_vllm_registered_model(
- config.text_config, cache_config, quant_config)
+ config.text_config,
+ cache_config,
+ quant_config,
+ prefix="language_model")
self.make_empty_intermediate_tensors = (
self.language_model.make_empty_intermediate_tensors)
diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py
index 436bd45d53f35..57b2b43c82f89 100644
--- a/vllm/model_executor/models/gemma.py
+++ b/vllm/model_executor/models/gemma.py
@@ -43,7 +43,8 @@
from .interfaces import SupportsLoRA, SupportsPP
from .utils import (is_pp_missing_parameter,
- make_empty_intermediate_tensors_factory, make_layers)
+ make_empty_intermediate_tensors_factory, make_layers,
+ maybe_prefix)
logger = init_logger(__name__)
@@ -83,16 +84,23 @@ def __init__(
hidden_act: Optional[str] = None,
hidden_activation: Optional[str] = None,
quant_config: Optional[QuantizationConfig] = None,
+ prefix: str = "",
) -> None:
super().__init__()
self.gate_up_proj = MergedColumnParallelLinear(
- hidden_size, [intermediate_size] * 2,
+ hidden_size,
+ [intermediate_size] * 2,
bias=False,
- quant_config=quant_config)
- self.down_proj = RowParallelLinear(intermediate_size,
- hidden_size,
- bias=False,
- quant_config=quant_config)
+ quant_config=quant_config,
+ prefix=f"{prefix}.gate_up_proj",
+ )
+ self.down_proj = RowParallelLinear(
+ intermediate_size,
+ hidden_size,
+ bias=False,
+ quant_config=quant_config,
+ prefix=f"{prefix}.down_proj",
+ )
self.act_fn = _get_gemma_act_fn(hidden_act, hidden_activation)
def forward(self, x):
@@ -104,15 +112,18 @@ def forward(self, x):
class GemmaAttention(nn.Module):
- def __init__(self,
- hidden_size: int,
- num_heads: int,
- num_kv_heads: int,
- head_dim: int,
- max_position_embeddings: int = 8192,
- rope_theta: float = 10000,
- cache_config: Optional[CacheConfig] = None,
- quant_config: Optional[QuantizationConfig] = None) -> None:
+ def __init__(
+ self,
+ hidden_size: int,
+ num_heads: int,
+ num_kv_heads: int,
+ head_dim: int,
+ max_position_embeddings: int = 8192,
+ rope_theta: float = 10000,
+ cache_config: Optional[CacheConfig] = None,
+ quant_config: Optional[QuantizationConfig] = None,
+ prefix: str = "",
+ ) -> None:
super().__init__()
self.hidden_size = hidden_size
tp_size = get_tensor_model_parallel_world_size()
@@ -142,12 +153,14 @@ def __init__(self,
self.total_num_kv_heads,
bias=False,
quant_config=quant_config,
+ prefix=f"{prefix}.qkv_proj",
)
self.o_proj = RowParallelLinear(
self.total_num_heads * self.head_dim,
hidden_size,
bias=False,
quant_config=quant_config,
+ prefix=f"{prefix}.o_proj",
)
self.rotary_emb = get_rope(
@@ -186,6 +199,7 @@ def __init__(
config: GemmaConfig,
cache_config: Optional[CacheConfig] = None,
quant_config: Optional[QuantizationConfig] = None,
+ prefix: str = "",
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
@@ -198,6 +212,7 @@ def __init__(
rope_theta=config.rope_theta,
cache_config=cache_config,
quant_config=quant_config,
+ prefix=f"{prefix}.self_attn",
)
self.mlp = GemmaMLP(
hidden_size=self.hidden_size,
@@ -205,6 +220,7 @@ def __init__(
hidden_act=config.hidden_act,
hidden_activation=getattr(config, "hidden_activation", None),
quant_config=quant_config,
+ prefix=f"{prefix}.mlp",
)
self.input_layernorm = GemmaRMSNorm(config.hidden_size,
eps=config.rms_norm_eps)
@@ -259,8 +275,8 @@ def __init__(
)
self.start_layer, self.end_layer, self.layers = make_layers(
config.num_hidden_layers,
- lambda prefix: GemmaDecoderLayer(config, cache_config, quant_config
- ),
+ lambda prefix: GemmaDecoderLayer(
+ config, cache_config, quant_config, prefix=prefix),
prefix=f"{prefix}.layers")
self.norm = GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
@@ -366,6 +382,7 @@ def __init__(
cache_config: Optional[CacheConfig] = None,
quant_config: Optional[QuantizationConfig] = None,
lora_config: Optional[LoRAConfig] = None,
+ prefix: str = "",
) -> None:
super().__init__()
@@ -375,7 +392,10 @@ def __init__(
self.lora_config = lora_config
self.quant_config = quant_config
- self.model = GemmaModel(config, cache_config, quant_config)
+ self.model = GemmaModel(config,
+ cache_config,
+ quant_config,
+ prefix=maybe_prefix(prefix, "model"))
self.logits_processor = LogitsProcessor(config.vocab_size)
self.sampler = Sampler()
self.make_empty_intermediate_tensors = (
diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py
index 9a77e48626ca5..313d98b649b48 100644
--- a/vllm/model_executor/models/internlm2.py
+++ b/vllm/model_executor/models/internlm2.py
@@ -30,7 +30,8 @@
from .interfaces import SupportsPP
from .utils import (is_pp_missing_parameter,
- make_empty_intermediate_tensors_factory, make_layers)
+ make_empty_intermediate_tensors_factory, make_layers,
+ maybe_prefix)
class InternLM2MLP(nn.Module):
@@ -41,16 +42,23 @@ def __init__(
intermediate_size: int,
hidden_act: str,
quant_config: Optional[QuantizationConfig] = None,
+ prefix: str = "",
) -> None:
super().__init__()
self.gate_up_proj = MergedColumnParallelLinear(
- hidden_size, [intermediate_size] * 2,
+ hidden_size,
+ [intermediate_size] * 2,
+ bias=False,
+ quant_config=quant_config,
+ prefix=f"{prefix}.gate_up_proj",
+ )
+ self.w2 = RowParallelLinear(
+ intermediate_size,
+ hidden_size,
bias=False,
- quant_config=quant_config)
- self.w2 = RowParallelLinear(intermediate_size,
- hidden_size,
- bias=False,
- quant_config=quant_config)
+ quant_config=quant_config,
+ prefix=f"{prefix}.w2",
+ )
if hidden_act != "silu":
raise ValueError(f"Unsupported activation: {hidden_act}. "
"Only silu is supported for now.")
@@ -75,6 +83,7 @@ def __init__(
max_position_embeddings: int = 8192,
cache_config: Optional[CacheConfig] = None,
quant_config: Optional[QuantizationConfig] = None,
+ prefix: str = "",
) -> None:
super().__init__()
self.hidden_size = hidden_size
@@ -108,12 +117,14 @@ def __init__(
self.total_num_kv_heads,
bias=False,
quant_config=quant_config,
+ prefix=f"{prefix}.wqkv",
)
self.wo = RowParallelLinear(
self.total_num_heads * self.head_dim,
hidden_size,
bias=False,
quant_config=quant_config,
+ prefix=f"{prefix}.wo",
)
self.rotary_emb = get_rope(
@@ -123,12 +134,15 @@ def __init__(
base=rope_theta,
rope_scaling=rope_scaling,
)
- self.attn = Attention(self.num_heads,
- self.head_dim,
- self.scaling,
- num_kv_heads=self.num_kv_heads,
- cache_config=cache_config,
- quant_config=quant_config)
+ self.attn = Attention(
+ self.num_heads,
+ self.head_dim,
+ self.scaling,
+ num_kv_heads=self.num_kv_heads,
+ cache_config=cache_config,
+ quant_config=quant_config,
+ prefix=f"{prefix}.attn",
+ )
def split_qkv(self, qkv: torch.Tensor):
seq_len = qkv.shape[0]
@@ -176,6 +190,7 @@ def __init__(
config: PretrainedConfig,
cache_config: Optional[CacheConfig] = None,
quant_config: Optional[QuantizationConfig] = None,
+ prefix: str = "",
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
@@ -192,12 +207,14 @@ def __init__(
max_position_embeddings=max_position_embeddings,
cache_config=cache_config,
quant_config=quant_config,
+ prefix=f"{prefix}.attention",
)
self.feed_forward = InternLM2MLP(
hidden_size=self.hidden_size,
intermediate_size=config.intermediate_size,
hidden_act=config.hidden_act,
quant_config=quant_config,
+ prefix=f"{prefix}.feed_forward",
)
self.attention_norm = RMSNorm(config.hidden_size,
eps=config.rms_norm_eps)
@@ -251,8 +268,8 @@ def __init__(
)
self.start_layer, self.end_layer, self.layers = make_layers(
config.num_hidden_layers,
- lambda prefix: InternLMDecoderLayer(config, cache_config,
- quant_config),
+ lambda prefix: InternLMDecoderLayer(
+ config, cache_config, quant_config, prefix=prefix),
prefix=f"{prefix}.layers")
self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.make_empty_intermediate_tensors = (
@@ -306,14 +323,19 @@ def __init__(
config: PretrainedConfig,
cache_config: Optional[CacheConfig] = None,
quant_config: Optional[QuantizationConfig] = None,
+ prefix: str = "",
) -> None:
super().__init__()
self.config = config
self.quant_config = quant_config
- self.model = InternLM2Model(config, cache_config, quant_config)
+ self.model = InternLM2Model(config,
+ cache_config,
+ quant_config,
+ prefix=maybe_prefix(prefix, "model"))
self.output = ParallelLMHead(config.vocab_size,
config.hidden_size,
- quant_config=quant_config)
+ quant_config=quant_config,
+ prefix=maybe_prefix(prefix, "output"))
if self.config.tie_word_embeddings:
self.output.weight = self.model.tok_embeddings.weight
self.logits_processor = LogitsProcessor(config.vocab_size)
diff --git a/vllm/model_executor/models/internlm2_ve.py b/vllm/model_executor/models/internlm2_ve.py
index 6effd70b75da3..edd867e4b6457 100644
--- a/vllm/model_executor/models/internlm2_ve.py
+++ b/vllm/model_executor/models/internlm2_ve.py
@@ -15,7 +15,7 @@
InternLM2MLP, InternLM2Model)
from vllm.sequence import IntermediateTensors
-from .utils import make_layers
+from .utils import make_layers, maybe_prefix
class InternLM2VEDecoderLayer(nn.Module):
@@ -25,6 +25,7 @@ def __init__(
config: PretrainedConfig,
cache_config: Optional[CacheConfig] = None,
quant_config: Optional[QuantizationConfig] = None,
+ prefix: str = "",
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
@@ -41,18 +42,21 @@ def __init__(
max_position_embeddings=max_position_embeddings,
cache_config=cache_config,
quant_config=quant_config,
+ prefix=f"{prefix}.attention",
)
self.feed_forward = InternLM2MLP(
hidden_size=self.hidden_size,
intermediate_size=config.intermediate_size,
hidden_act=config.hidden_act,
quant_config=quant_config,
+ prefix=f"{prefix}.feed_forward",
)
self.feed_forward_ve = InternLM2MLP(
hidden_size=self.hidden_size,
intermediate_size=config.intermediate_size,
hidden_act=config.hidden_act,
quant_config=quant_config,
+ prefix=f"{prefix}.feed_forward_ve",
)
self.attention_norm = RMSNorm(config.hidden_size,
eps=config.rms_norm_eps)
@@ -111,8 +115,8 @@ def __init__(
super().__init__(config, cache_config, quant_config)
self.start_layer, self.end_layer, self.layers = make_layers(
config.num_hidden_layers,
- lambda prefix: InternLM2VEDecoderLayer(config, cache_config,
- quant_config),
+ lambda prefix: InternLM2VEDecoderLayer(
+ config, cache_config, quant_config, prefix=prefix),
prefix=f"{prefix}.layers")
def forward(
@@ -161,6 +165,10 @@ def __init__(
config: PretrainedConfig,
cache_config: Optional[CacheConfig] = None,
quant_config: Optional[QuantizationConfig] = None,
+ prefix: str = "",
) -> None:
super().__init__(config, cache_config, quant_config)
- self.model = InternLM2VEModel(config, cache_config, quant_config)
+ self.model = InternLM2VEModel(config,
+ cache_config,
+ quant_config,
+ prefix=maybe_prefix(prefix, "model"))
diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py
index 3ae37d9fe5d85..1c1fde5b30983 100644
--- a/vllm/model_executor/models/internvl.py
+++ b/vllm/model_executor/models/internvl.py
@@ -439,7 +439,10 @@ def __init__(self,
)
self.language_model = init_vllm_registered_model(
- config.text_config, cache_config, quant_config)
+ config.text_config,
+ cache_config,
+ quant_config,
+ prefix="language_model")
self.mlp1 = self._init_mlp1(config)
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index b0ca1fe006239..98c53bdaae811 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -55,7 +55,8 @@
from .interfaces import SupportsLoRA, SupportsPP
from .utils import (AutoWeightsLoader, PPMissingLayer, is_pp_missing_parameter,
- make_empty_intermediate_tensors_factory, make_layers)
+ make_empty_intermediate_tensors_factory, make_layers,
+ maybe_prefix)
class LlamaMLP(nn.Module):
@@ -500,6 +501,7 @@ def __init__(
cache_config: Optional[CacheConfig] = None,
quant_config: Optional[QuantizationConfig] = None,
lora_config: Optional[LoRAConfig] = None,
+ prefix: str = "",
) -> None:
super().__init__()
@@ -510,7 +512,7 @@ def __init__(
cache_config,
quant_config,
lora_config=lora_config,
- prefix="model")
+ prefix=maybe_prefix(prefix, "model"))
if get_pp_group().is_last_rank:
self.unpadded_vocab_size = config.vocab_size
if lora_config:
@@ -526,6 +528,7 @@ def __init__(
if not lora_config else
lora_config.lora_vocab_padding_size),
quant_config=quant_config,
+ prefix=maybe_prefix(prefix, "lm_head"),
)
if config.tie_word_embeddings:
self.lm_head = self.lm_head.tie_weights(
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index b005d83c17f90..eda99c029881f 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -210,6 +210,7 @@ def init_vision_tower_for_llava(
quant_config: Optional[QuantizationConfig],
*,
require_post_norm: Optional[bool] = None,
+ prefix: str = "",
):
vision_config = hf_config.vision_config
@@ -224,23 +225,26 @@ def init_vision_tower_for_llava(
if isinstance(vision_config, CLIPVisionConfig):
return CLIPVisionModel(
vision_config,
- quant_config,
+ quant_config=quant_config,
num_hidden_layers_override=num_hidden_layers,
require_post_norm=require_post_norm,
+ prefix=prefix,
)
elif isinstance(vision_config, SiglipVisionConfig):
return SiglipVisionModel(
vision_config,
- quant_config,
+ quant_config=quant_config,
num_hidden_layers_override=num_hidden_layers,
require_post_norm=require_post_norm,
+ prefix=prefix,
)
elif isinstance(vision_config, PixtralVisionConfig):
return PixtralHFVisionModel(
vision_config,
- quant_config,
+ quant_config=quant_config,
num_hidden_layers_override=num_hidden_layers,
require_post_norm=require_post_norm,
+ prefix=prefix,
)
msg = f"Unsupported vision config: {type(vision_config)}"
@@ -274,14 +278,20 @@ def __init__(self,
# TODO: Optionally initializes this for supporting embeddings.
self.vision_tower = init_vision_tower_for_llava(
- config, quant_config, require_post_norm=False)
+ config,
+ quant_config,
+ require_post_norm=False,
+ prefix="vision_tower")
self.multi_modal_projector = LlavaMultiModalProjector(
vision_hidden_size=config.vision_config.hidden_size,
text_hidden_size=config.text_config.hidden_size,
projector_hidden_act=config.projector_hidden_act)
self.language_model = init_vllm_registered_model(
- config.text_config, cache_config, quant_config)
+ config.text_config,
+ cache_config,
+ quant_config,
+ prefix="language_model")
self.make_empty_intermediate_tensors = (
self.language_model.make_empty_intermediate_tensors)
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index 2a582deeaa2c9..f85129b206919 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -293,7 +293,10 @@ def __init__(self,
# TODO: Optionally initializes this for supporting embeddings.
self.vision_tower = init_vision_tower_for_llava(
- config, quant_config, require_post_norm=False)
+ config,
+ quant_config,
+ require_post_norm=False,
+ prefix="vision_tower")
self.image_newline = nn.Parameter(
torch.empty(config.text_config.hidden_size))
self.multi_modal_projector = LlavaMultiModalProjector(
@@ -302,7 +305,10 @@ def __init__(self,
projector_hidden_act=config.projector_hidden_act)
self.language_model = init_vllm_registered_model(
- config.text_config, cache_config, quant_config)
+ config.text_config,
+ cache_config,
+ quant_config,
+ prefix="language_model")
# The same model class supports both language generation and embedding
# because the architecture name is the same
diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py
index 43eec43d56643..b8051d5fc6ae2 100644
--- a/vllm/model_executor/models/llava_next_video.py
+++ b/vllm/model_executor/models/llava_next_video.py
@@ -257,14 +257,20 @@ def __init__(self,
# Initialize the vision tower only up to the required feature layer
self.vision_tower = init_vision_tower_for_llava(
- config, quant_config, require_post_norm=False)
+ config,
+ quant_config,
+ require_post_norm=False,
+ prefix="vision_tower")
self.vision_resampler = LlavaNextVideoPooler(config)
self.multi_modal_projector = LlavaNextMultiModalProjector(
vision_hidden_size=config.vision_config.hidden_size,
text_hidden_size=config.text_config.hidden_size,
projector_hidden_act=config.projector_hidden_act)
self.language_model = init_vllm_registered_model(
- config.text_config, cache_config, quant_config)
+ config.text_config,
+ cache_config,
+ quant_config,
+ prefix="language_model")
self.make_empty_intermediate_tensors = (
self.language_model.model.make_empty_intermediate_tensors)
diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py
index 9606b126141df..a0cf208a65f36 100644
--- a/vllm/model_executor/models/llava_onevision.py
+++ b/vllm/model_executor/models/llava_onevision.py
@@ -415,10 +415,16 @@ def __init__(self,
# Initialize the vision tower only up to the required feature layer
self.vision_tower = init_vision_tower_for_llava(
- config, quant_config, require_post_norm=False)
+ config,
+ quant_config,
+ require_post_norm=False,
+ prefix="vision_tower")
self.multi_modal_projector = LlavaOnevisionMultiModalProjector(config)
self.language_model = init_vllm_registered_model(
- config.text_config, cache_config, quant_config)
+ config.text_config,
+ cache_config,
+ quant_config,
+ prefix="language_model")
self.image_newline = nn.Parameter(
torch.empty(config.text_config.hidden_size))
diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py
index 2ec51dc4647f5..a270282d87bc8 100644
--- a/vllm/model_executor/models/minicpmv.py
+++ b/vllm/model_executor/models/minicpmv.py
@@ -394,8 +394,11 @@ def __init__(
self.multimodal_config = multimodal_config
self.version = get_version_by_config(self.config)
- self.llm = self.init_llm(config, cache_config, quant_config)
- self.vpm = self.init_vision_module(config, quant_config)
+ self.llm = self.init_llm(config,
+ cache_config,
+ quant_config,
+ prefix="llm")
+ self.vpm = self.init_vision_module(config, quant_config, prefix="vpm")
param_dtype = torch.get_default_dtype()
self.vpm.to(dtype=param_dtype)
self.vision_dim = (self.vpm.embed_dim if self.version == (2, 0) else
@@ -403,9 +406,11 @@ def __init__(
self.embed_dim = self.config.hidden_size
self.resampler = self.init_resampler(self.embed_dim, self.vision_dim)
self.resampler.to(device="cuda", dtype=param_dtype)
+ # TODO: why is there _KEYS_TO_MODIFY_MAPPING? lm_head should be in llm
self.lm_head = ParallelLMHead(config.vocab_size,
config.hidden_size,
- quant_config=quant_config)
+ quant_config=quant_config,
+ prefix="llm.lm_head")
self.logits_processor = LogitsProcessor(config.vocab_size)
self.sampler = Sampler()
@@ -644,6 +649,7 @@ def init_llm(
config: PretrainedConfig,
cache_config: Optional[CacheConfig] = None,
quant_config: Optional[QuantizationConfig] = None,
+ prefix: str = "",
) -> nn.Module:
raise NotImplementedError
@@ -651,6 +657,7 @@ def init_vision_module(
self,
config: PretrainedConfig,
quant_config: Optional[QuantizationConfig],
+ prefix: str = "",
) -> nn.Module:
raise NotImplementedError
@@ -690,17 +697,20 @@ def init_llm(
config: PretrainedConfig,
cache_config: Optional[CacheConfig] = None,
quant_config: Optional[QuantizationConfig] = None,
+ prefix: str = "",
) -> nn.Module:
return LLMWrapper(MiniCPMModel(config,
cache_config=cache_config,
- quant_config=quant_config),
+ quant_config=quant_config,
+ prefix=prefix),
name="model")
def init_vision_module(
self,
config: PretrainedConfig,
quant_config: Optional[QuantizationConfig],
+ prefix: str = "",
) -> nn.Module:
# TODO :refactor this vision model
try:
@@ -819,19 +829,23 @@ def init_llm(
config: PretrainedConfig,
cache_config: Optional[CacheConfig] = None,
quant_config: Optional[QuantizationConfig] = None,
+ prefix: str = "",
) -> nn.Module:
return LLMWrapper(LlamaModel(config,
cache_config=cache_config,
- quant_config=quant_config),
+ quant_config=quant_config,
+ prefix=prefix),
name="model")
def init_vision_module(
self,
config: PretrainedConfig,
quant_config: Optional[QuantizationConfig],
+ prefix: str = "",
) -> nn.Module:
model = Idefics2VisionTransformer(config.vision_config,
- quant_config=quant_config)
+ quant_config=quant_config,
+ prefix=prefix)
if self.config.drop_vision_last_layer:
model.encoder.layers = model.encoder.layers[:-1]
return model
@@ -935,20 +949,24 @@ def init_llm(
config: PretrainedConfig,
cache_config: Optional[CacheConfig] = None,
quant_config: Optional[QuantizationConfig] = None,
+ prefix: str = "",
) -> nn.Module:
return LLMWrapper(Qwen2Model(config,
cache_config=cache_config,
- quant_config=quant_config),
+ quant_config=quant_config,
+ prefix=prefix),
name="model")
def init_vision_module(
self,
config: PretrainedConfig,
quant_config: Optional[QuantizationConfig],
+ prefix: str = "",
) -> nn.Module:
model = Idefics2VisionTransformer(config.vision_config,
- quant_config=quant_config)
+ quant_config=quant_config,
+ prefix=prefix)
if self.config.drop_vision_last_layer:
model.encoder.layers = model.encoder.layers[:-1]
return model
diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py
index 37c3fa919124e..10cca8b56268a 100644
--- a/vllm/model_executor/models/opt.py
+++ b/vllm/model_executor/models/opt.py
@@ -43,7 +43,8 @@
from .interfaces import SupportsPP
from .utils import (is_pp_missing_parameter,
- make_empty_intermediate_tensors_factory, make_layers)
+ make_empty_intermediate_tensors_factory, make_layers,
+ maybe_prefix)
class OPTLearnedPositionalEmbedding(nn.Embedding):
@@ -68,6 +69,7 @@ def __init__(
bias: bool = True,
cache_config: Optional[CacheConfig] = None,
quant_config: Optional[QuantizationConfig] = None,
+ prefix: str = "",
) -> None:
super().__init__()
self.embed_dim = embed_dim
@@ -85,18 +87,21 @@ def __init__(
total_num_heads,
bias=bias,
quant_config=quant_config,
+ prefix=f"{prefix}.qkv_proj",
)
self.out_proj = RowParallelLinear(
embed_dim,
embed_dim,
bias=bias,
quant_config=quant_config,
+ prefix=f"{prefix}.out_proj",
)
self.attn = Attention(self.num_heads,
self.head_dim,
scale=self.scaling,
cache_config=cache_config,
- quant_config=quant_config)
+ quant_config=quant_config,
+ prefix=f"{prefix}.attn")
def forward(
self,
@@ -118,6 +123,7 @@ def __init__(
config: OPTConfig,
cache_config: Optional[CacheConfig] = None,
quant_config: Optional[QuantizationConfig] = None,
+ prefix: str = "",
):
super().__init__()
self.config = config
@@ -128,6 +134,7 @@ def __init__(
bias=config.enable_bias,
cache_config=cache_config,
quant_config=quant_config,
+ prefix=f"{prefix}.self_attn",
)
self.do_layer_norm_before = config.do_layer_norm_before
@@ -139,6 +146,7 @@ def __init__(
config.ffn_dim,
bias=config.enable_bias,
quant_config=quant_config,
+ prefix=f"{prefix}.fc1",
)
self.activation_fn = get_act_fn(config.activation_function,
quant_config, config.ffn_dim)
@@ -147,6 +155,7 @@ def __init__(
self.embed_dim,
bias=config.enable_bias,
quant_config=quant_config,
+ prefix=f"{prefix}.fc2",
)
self.final_layer_norm = nn.LayerNorm(
self.embed_dim,
@@ -214,7 +223,8 @@ def __init__(
self.project_out = ReplicatedLinear(config.hidden_size,
config.word_embed_proj_dim,
bias=False,
- quant_config=quant_config)
+ quant_config=quant_config,
+ prefix=f"{prefix}.project_out")
else:
self.project_out = None
@@ -222,7 +232,8 @@ def __init__(
self.project_in = ReplicatedLinear(config.word_embed_proj_dim,
config.hidden_size,
bias=False,
- quant_config=quant_config)
+ quant_config=quant_config,
+ prefix=f"{prefix}.project_in")
else:
self.project_in = None
@@ -239,7 +250,8 @@ def __init__(
self.start_layer, self.end_layer, self.layers = make_layers(
config.num_hidden_layers,
- lambda prefix: OPTDecoderLayer(config, cache_config, quant_config),
+ lambda prefix: OPTDecoderLayer(
+ config, cache_config, quant_config, prefix=prefix),
prefix=f"{prefix}.layers")
def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
@@ -288,9 +300,13 @@ def __init__(
config: OPTConfig,
cache_config: Optional[CacheConfig] = None,
quant_config: Optional[QuantizationConfig] = None,
+ prefix: str = "",
):
super().__init__()
- self.decoder = OPTDecoder(config, cache_config, quant_config)
+ self.decoder = OPTDecoder(config,
+ cache_config,
+ quant_config,
+ prefix=f"{prefix}.decoder")
self.make_empty_intermediate_tensors = (
make_empty_intermediate_tensors_factory(["hidden_states"],
config.hidden_size))
@@ -335,11 +351,15 @@ def __init__(
config: OPTConfig,
cache_config: Optional[CacheConfig] = None,
quant_config: Optional[QuantizationConfig] = None,
+ prefix: str = "",
):
super().__init__()
self.config = config
self.quant_config = quant_config
- self.model = OPTModel(config, cache_config, quant_config)
+ self.model = OPTModel(config,
+ cache_config,
+ quant_config,
+ prefix=maybe_prefix(prefix, "model"))
if self.config.tie_word_embeddings:
self.lm_head = self.model.decoder.embed_tokens
else:
diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py
index 7a62a098a4525..8e29c6079b994 100644
--- a/vllm/model_executor/models/paligemma.py
+++ b/vllm/model_executor/models/paligemma.py
@@ -143,14 +143,17 @@ def __init__(self,
self.multimodal_config = multimodal_config
self.vision_tower = SiglipVisionModel(config.vision_config,
- quant_config)
+ quant_config,
+ prefix="vision_tower")
self.multi_modal_projector = PaliGemmaMultiModalProjector(
vision_hidden_size=config.vision_config.hidden_size,
projection_dim=config.vision_config.projection_dim)
self.quant_config = quant_config
self.language_model = GemmaForCausalLM(config.text_config,
- cache_config, quant_config)
+ cache_config,
+ quant_config,
+ prefix="language_model")
logit_scale = getattr(config, "logit_scale", 1.0)
self.language_model.logits_processor.scale *= logit_scale
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index 855a9b17585a4..0962d3d3847c9 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -71,7 +71,8 @@
def _init_img_processor(hf_config: PretrainedConfig,
- quant_config: Optional[QuantizationConfig]):
+ quant_config: Optional[QuantizationConfig],
+ prefix: str = "") -> CLIPVisionModel:
clip_config = CLIP_VIT_LARGE_PATCH14_336_CONFIG
layer_idx = hf_config.img_processor.get('layer_idx', -2)
@@ -86,6 +87,7 @@ def _init_img_processor(hf_config: PretrainedConfig,
clip_config,
quant_config,
num_hidden_layers_override=num_hidden_layers,
+ prefix=prefix,
)
return img_processor
@@ -152,15 +154,18 @@ def get_img_features(self,
class Phi3HDImageEmbedding(Phi3ImageEmbeddingBase):
"""Phi3 Image embedding with HD transform."""
- def __init__(self, config: PretrainedConfig,
- quant_config: Optional[QuantizationConfig]) -> None:
+ def __init__(self,
+ config: PretrainedConfig,
+ quant_config: Optional[QuantizationConfig],
+ prefix: str = "") -> None:
super().__init__()
# n_embed or hidden_size
hidden_size = config.n_embd if hasattr(
config, 'n_embd') else config.hidden_size
- self.img_processor = _init_img_processor(config, quant_config)
+ self.img_processor = _init_img_processor(
+ config, quant_config, prefix=f"{prefix}.img_processor")
image_dim_out = config.img_processor['image_dim_out']
self.num_img_tokens = config.img_processor['num_img_tokens']
@@ -537,11 +542,15 @@ def __init__(self,
config.hidden_size,
org_num_embeddings=config.vocab_size,
quant_config=quant_config,
+ prefix="model.embed_tokens",
)
# TODO: Optionally initializes this for supporting input embeddings.
- self.vision_embed_tokens = Phi3HDImageEmbedding(config, quant_config)
+ self.vision_embed_tokens = Phi3HDImageEmbedding(
+ config, quant_config, prefix="model.vision_embed_tokens")
+ # The prefix is empty intentionally because default prefix of
+ # LlamaForCausalLM is "model"
self.language_model = LlamaForCausalLM(config, cache_config,
quant_config)
diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py
index a9dbb3823743a..6b53bf5660096 100644
--- a/vllm/model_executor/models/pixtral.py
+++ b/vllm/model_executor/models/pixtral.py
@@ -164,7 +164,10 @@ def __init__(self,
# init MistralForCausalLM
self.language_model = init_vllm_registered_model(
- config.text_config, cache_config, quant_config)
+ config.text_config,
+ cache_config,
+ quant_config,
+ prefix="language_model")
self.vision_encoder = VisionTransformer(self.vision_args)
self.vision_language_adapter = VisionLanguageAdapter(
diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py
index 23eb1482ffef1..db1029345a8ac 100644
--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
@@ -49,7 +49,8 @@
from .interfaces import SupportsLoRA, SupportsPP
from .utils import (AutoWeightsLoader, PPMissingLayer, is_pp_missing_parameter,
- make_empty_intermediate_tensors_factory, make_layers)
+ make_empty_intermediate_tensors_factory, make_layers,
+ maybe_prefix)
class Qwen2MLP(nn.Module):
@@ -60,16 +61,23 @@ def __init__(
intermediate_size: int,
hidden_act: str,
quant_config: Optional[QuantizationConfig] = None,
+ prefix: str = "",
) -> None:
super().__init__()
self.gate_up_proj = MergedColumnParallelLinear(
- hidden_size, [intermediate_size] * 2,
+ hidden_size,
+ [intermediate_size] * 2,
+ bias=False,
+ quant_config=quant_config,
+ prefix=f"{prefix}.gate_up_proj",
+ )
+ self.down_proj = RowParallelLinear(
+ intermediate_size,
+ hidden_size,
bias=False,
- quant_config=quant_config)
- self.down_proj = RowParallelLinear(intermediate_size,
- hidden_size,
- bias=False,
- quant_config=quant_config)
+ quant_config=quant_config,
+ prefix=f"{prefix}.down_proj",
+ )
if hidden_act != "silu":
raise ValueError(f"Unsupported activation: {hidden_act}. "
"Only silu is supported for now.")
@@ -92,7 +100,8 @@ def __init__(self,
rope_theta: float = 10000,
cache_config: Optional[CacheConfig] = None,
quant_config: Optional[QuantizationConfig] = None,
- rope_scaling: Optional[Tuple] = None) -> None:
+ rope_scaling: Optional[Tuple] = None,
+ prefix: str = "") -> None:
super().__init__()
self.hidden_size = hidden_size
tp_size = get_tensor_model_parallel_world_size()
@@ -122,12 +131,14 @@ def __init__(self,
self.total_num_kv_heads,
bias=True,
quant_config=quant_config,
+ prefix=f"{prefix}.qkv_proj",
)
self.o_proj = RowParallelLinear(
self.total_num_heads * self.head_dim,
hidden_size,
bias=False,
quant_config=quant_config,
+ prefix=f"{prefix}.o_proj",
)
self.rotary_emb = get_rope(
@@ -142,7 +153,8 @@ def __init__(self,
self.scaling,
num_kv_heads=self.num_kv_heads,
cache_config=cache_config,
- quant_config=quant_config)
+ quant_config=quant_config,
+ prefix=f"{prefix}.attn")
def forward(
self,
@@ -166,6 +178,7 @@ def __init__(
config: Qwen2Config,
cache_config: Optional[CacheConfig] = None,
quant_config: Optional[QuantizationConfig] = None,
+ prefix: str = "",
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
@@ -180,12 +193,15 @@ def __init__(
rope_theta=rope_theta,
cache_config=cache_config,
quant_config=quant_config,
- rope_scaling=rope_scaling)
+ rope_scaling=rope_scaling,
+ prefix=f"{prefix}.self_attn",
+ )
self.mlp = Qwen2MLP(
hidden_size=self.hidden_size,
intermediate_size=config.intermediate_size,
hidden_act=config.hidden_act,
quant_config=quant_config,
+ prefix=f"{prefix}.mlp",
)
self.input_layernorm = RMSNorm(config.hidden_size,
eps=config.rms_norm_eps)
@@ -241,6 +257,7 @@ def __init__(
config.vocab_size,
config.hidden_size,
quant_config=quant_config,
+ prefix=f"{prefix}.embed_tokens",
)
else:
self.embed_tokens = PPMissingLayer()
@@ -249,7 +266,8 @@ def __init__(
config.num_hidden_layers,
lambda prefix: Qwen2DecoderLayer(config=config,
cache_config=cache_config,
- quant_config=quant_config),
+ quant_config=quant_config,
+ prefix=prefix),
prefix=f"{prefix}.layers",
)
@@ -393,6 +411,7 @@ def __init__(
cache_config: Optional[CacheConfig] = None,
quant_config: Optional[QuantizationConfig] = None,
lora_config: Optional[LoRAConfig] = None,
+ prefix: str = "",
) -> None:
# TODO (@robertgshaw2): see if this can be moved out
if (cache_config.sliding_window is not None
@@ -412,14 +431,19 @@ def __init__(
self.lora_config = lora_config
self.quant_config = quant_config
- self.model = Qwen2Model(config, cache_config, quant_config)
+ self.model = Qwen2Model(config,
+ cache_config,
+ quant_config,
+ prefix=maybe_prefix(prefix, "model"))
if config.tie_word_embeddings:
self.lm_head = self.model.embed_tokens
else:
self.lm_head = ParallelLMHead(config.vocab_size,
config.hidden_size,
- quant_config=quant_config)
+ quant_config=quant_config,
+ prefix=maybe_prefix(
+ prefix, "lm_head"))
self.logits_processor = LogitsProcessor(config.vocab_size)
self.sampler = Sampler()
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index 4e60fe70b25f1..633d66b4af31a 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -938,7 +938,10 @@ def __init__(self,
quant_config=None,
)
- self.model = Qwen2Model(config, cache_config, quant_config)
+ self.model = Qwen2Model(config,
+ cache_config,
+ quant_config,
+ prefix="model")
if get_pp_group().is_last_rank:
if config.tie_word_embeddings:
@@ -946,7 +949,8 @@ def __init__(self,
else:
self.lm_head = ParallelLMHead(config.vocab_size,
config.hidden_size,
- quant_config=quant_config)
+ quant_config=quant_config,
+ prefix="lm_head")
else:
self.lm_head = PPMissingLayer()
diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py
index 5f33b872beecb..f08e4aa355086 100644
--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -357,7 +357,10 @@ def __init__(self,
))
self.multi_modal_projector = UltravoxProjector(config)
self.language_model = init_vllm_registered_model(
- config.text_config, cache_config, quant_config)
+ config.text_config,
+ cache_config,
+ quant_config,
+ prefix="language_model")
if config.text_model_id is not None:
self.secondary_weights.append(
DefaultModelLoader.Source(model_or_path=config.text_model_id,
diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py
index 6995f5805c5e1..0aecb5d151a45 100644
--- a/vllm/model_executor/models/utils.py
+++ b/vllm/model_executor/models/utils.py
@@ -242,6 +242,7 @@ def init_vllm_registered_model(
lora_config: Optional[LoRAConfig] = None,
multimodal_config: Optional[MultiModalConfig] = None,
scheduler_config: Optional[SchedulerConfig] = None,
+ prefix: str = "",
) -> nn.Module:
"""
Helper function to initialize an inner model registered to vLLM,
@@ -257,6 +258,7 @@ def init_vllm_registered_model(
lora_config=lora_config,
multimodal_config=multimodal_config,
scheduler_config=scheduler_config,
+ prefix=prefix,
)
@@ -610,3 +612,16 @@ def get_vit_attn_backend() -> _Backend:
else:
selected_backend = _Backend.XFORMERS
return selected_backend
+
+
+def maybe_prefix(prefix: str, name: str) -> str:
+ """Add a prefix to a name if the prefix is non-empty.
+
+ Args:
+ prefix: The prefix to add. If empty, no prefix will be added.
+ name: The name to potentially prefix.
+
+ Returns:
+ The string "prefix.name" if prefix was non-empty, otherwise just "name".
+ """
+ return name if not prefix else f"{prefix}.{name}"
From 1ab6f6b4ad5c4aac6ee72e51b7f6712098f9ccff Mon Sep 17 00:00:00 2001
From: youkaichao
Date: Tue, 29 Oct 2024 17:06:24 -0700
Subject: [PATCH 042/113] [core][distributed] fix custom allreduce in pytorch
2.5 (#9815)
Signed-off-by: youkaichao
---
.../device_communicators/custom_all_reduce.py | 14 +++++++++++++-
1 file changed, 13 insertions(+), 1 deletion(-)
diff --git a/vllm/distributed/device_communicators/custom_all_reduce.py b/vllm/distributed/device_communicators/custom_all_reduce.py
index 7de5b05a0b053..c3632aee6d11a 100644
--- a/vllm/distributed/device_communicators/custom_all_reduce.py
+++ b/vllm/distributed/device_communicators/custom_all_reduce.py
@@ -191,8 +191,20 @@ def capture(self):
def _get_ipc_meta(self, inp: torch.Tensor):
data = inp.untyped_storage()._share_cuda_()
+ handle = data[1]
+ # https://github.com/pytorch/pytorch/pull/130890 changes
+ # the binary format of the ipc handle
+ # it starts from pytorch 2.5
+ if len(handle) > 64:
+ assert len(handle) == 66
+ # only support SHAREABLE_HANDLE_VERSION = 1
+ assert int(handle[0]) == 1
+ # only support SHAREABLE_CUDA_MALLOC = 'c'
+ assert handle[1] == ord("c")
+ handle = handle[2:]
+ # TODO: support expandable segment
shard_data = (
- data[1], # ipc handle to base ptr
+ handle, # ipc handle to base ptr
data[3], # offset of base ptr
)
return self._gather_ipc_meta(shard_data)
From 64cb1cdc3f3a6c0ca976d68b19d454122c720e6d Mon Sep 17 00:00:00 2001
From: Lily Liu
Date: Tue, 29 Oct 2024 17:28:43 -0700
Subject: [PATCH 043/113] Update README.md (#9819)
---
README.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/README.md b/README.md
index 8c8d6eb291cea..b75bfc5c699a7 100644
--- a/README.md
+++ b/README.md
@@ -15,7 +15,7 @@ Easy, fast, and cheap LLM serving for everyone
---
-**vLLM x Snowfkale Meetup (Wednesday, November 13th, 5:30-8PM PT) at Snowfkale HQ, San Mateo**
+**vLLM x Snowflake Meetup (Wednesday, November 13th, 5:30-8PM PT) at Snowflake HQ, San Mateo**
We are excited to announce the last in-person vLLM meetup of the year!
Join the vLLM developers and engineers from Snowflake AI Research to chat about the latest LLM inference optimizations and your 2025 vLLM wishlist!
From 226688bd6114749633132b9ed074c59d50904830 Mon Sep 17 00:00:00 2001
From: Michael Goin
Date: Tue, 29 Oct 2024 22:49:44 -0400
Subject: [PATCH 044/113] [Bugfix][VLM] Make apply_fp8_linear work with >2D
input (#9812)
---
.../layers/quantization/utils/w8a8_utils.py | 33 +++++++++++--------
1 file changed, 20 insertions(+), 13 deletions(-)
diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
index 1879d2855d93d..445117ac99a34 100644
--- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
@@ -96,21 +96,26 @@ def apply_fp8_linear(
# If dynamic, layer.input_scale is None and x_scale computed from x.
# If static, layer.input_scale is scalar and x_scale is input_scale.
+ # View input as 2D matrix for fp8 methods
+ input_2d = input.view(-1, input.shape[-1])
+ output_shape = [*input.shape[:-1], weight.shape[1]]
+
# cutlass_scaled_mm supports per tensor/channel W and per tensor/token A
if cutlass_fp8_supported:
qinput, x_scale = ops.scaled_fp8_quant(
- input,
+ input_2d,
input_scale,
scale_ub=input_scale_ub,
use_per_token_if_dynamic=use_per_token_if_dynamic)
# Fused GEMM_DQ
- return ops.cutlass_scaled_mm(qinput,
- weight,
- out_dtype=input.dtype,
- scale_a=x_scale,
- scale_b=weight_scale,
- bias=bias)
+ output = ops.cutlass_scaled_mm(qinput,
+ weight,
+ out_dtype=input.dtype,
+ scale_a=x_scale,
+ scale_b=weight_scale,
+ bias=bias)
+ return output.view(*output_shape)
# torch.scaled_mm supports per tensor weights + activations only
# so fallback to naive if per channel or per token
@@ -119,7 +124,7 @@ def apply_fp8_linear(
# for matrices with batch dimension > 16.
# This could change in the future.
qinput, x_scale = ops.scaled_fp8_quant(
- input,
+ input_2d,
input_scale,
num_token_padding=17,
use_per_token_if_dynamic=use_per_token_if_dynamic)
@@ -138,8 +143,10 @@ def apply_fp8_linear(
# A fix for discrepancy in scaled_mm which returns tuple
# for torch < 2.5 and a single value in torch >= 2.5
if type(output) is tuple and len(output) == 2:
- return torch.narrow(output[0], 0, 0, input.shape[0])
- return torch.narrow(output, 0, 0, input.shape[0])
+ output = output[0]
+
+ return torch.narrow(output, 0, 0,
+ input_2d.shape[0]).view(*output_shape)
else:
# Fallback for channelwise case, where we use unfused DQ
@@ -176,15 +183,15 @@ def apply_fp8_linear(
if type(output) is tuple and len(output) == 2:
output = output[0]
# Unpad (undo num_token_padding)
- output = torch.narrow(output, 0, 0, input.shape[0])
- x_scale = torch.narrow(x_scale, 0, 0, input.shape[0])
+ output = torch.narrow(output, 0, 0, input_2d.shape[0])
+ x_scale = torch.narrow(x_scale, 0, 0, input_2d.shape[0])
# DQ
# C = sw * sx * (X * W) + bias
output = output * x_scale * weight_scale.t()
if bias is not None:
output = output + bias
- return output.to(dtype=input.dtype)
+ return output.to(dtype=input.dtype).view(*output_shape)
def apply_int8_linear(
From 62fac4b9aab3c05124d83fcd71db5732774b17d8 Mon Sep 17 00:00:00 2001
From: "Kevin H. Luu"
Date: Tue, 29 Oct 2024 17:34:55 -1000
Subject: [PATCH 045/113] [ci/build] Pin CI dependencies version with
pip-compile (#9810)
Signed-off-by: kevin
---
Dockerfile.rocm | 2 +
requirements-build.txt | 18 +-
requirements-test.in | 37 +++
requirements-test.txt | 593 ++++++++++++++++++++++++++++++++++++++---
4 files changed, 608 insertions(+), 42 deletions(-)
create mode 100644 requirements-test.in
diff --git a/Dockerfile.rocm b/Dockerfile.rocm
index d35889f053e27..562117a313020 100644
--- a/Dockerfile.rocm
+++ b/Dockerfile.rocm
@@ -121,6 +121,8 @@ ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
+RUN python3 -m pip install --upgrade pip
+
# Package upgrades for useful functionality or to avoid dependency issues
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install --upgrade numba scipy huggingface-hub[cli] pytest-shard
diff --git a/requirements-build.txt b/requirements-build.txt
index ea2b688bb3108..7b16d9778c1a6 100644
--- a/requirements-build.txt
+++ b/requirements-build.txt
@@ -1,9 +1,9 @@
-# Should be mirrored in pyproject.toml
-cmake>=3.26
-ninja
-packaging
-setuptools>=61
-setuptools-scm>=8
-torch==2.5.0
-wheel
-jinja2
+# Should be mirrored in pyproject.toml
+cmake>=3.26
+ninja
+packaging
+setuptools>=61
+setuptools-scm>=8
+torch==2.5.0
+wheel
+jinja2
diff --git a/requirements-test.in b/requirements-test.in
new file mode 100644
index 0000000000000..3881f2566b556
--- /dev/null
+++ b/requirements-test.in
@@ -0,0 +1,37 @@
+# testing
+pytest
+tensorizer>=2.9.0
+pytest-forked
+pytest-asyncio
+pytest-rerunfailures
+pytest-shard
+
+# testing utils
+awscli
+einops # required for MPT, qwen-vl and Mamba
+httpx
+librosa # required for audio tests
+opencv-python # required for video tests
+peft
+requests
+ray[adag]==2.35
+sentence-transformers # required for embedding
+soundfile # required for audio test
+timm # required for internvl test
+torch==2.5.0
+transformers_stream_generator # required for qwen-vl test
+matplotlib # required for qwen-vl test
+datamodel_code_generator # required for minicpm3 test
+lm-eval[api]==0.4.4 # required for model evaluation test
+
+# TODO: Add this after fully implementing llava(mantis)
+# git+https://github.com/TIGER-AI-Lab/Mantis.git # required for llava(mantis) test
+
+# Benchmarking
+aiohttp
+
+# quantization
+bitsandbytes>=0.44.0
+buildkite-test-collector==0.1.8
+
+numpy < 2.0.0
diff --git a/requirements-test.txt b/requirements-test.txt
index 9787fa2a4a486..c474c2ec34b22 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -1,34 +1,561 @@
-# testing
-pytest
-tensorizer>=2.9.0
-pytest-forked
-pytest-asyncio
-pytest-rerunfailures
-pytest-shard
-
-# testing utils
-awscli
-einops # required for MPT, qwen-vl and Mamba
-httpx
-librosa # required for audio tests
-opencv-python # required for video tests
-peft
-requests
-ray[adag]==2.35
-sentence-transformers # required for embedding
-soundfile # required for audio test
-timm # required for internvl test
-transformers_stream_generator # required for qwen-vl test
-matplotlib # required for qwen-vl test
-datamodel_code_generator # required for minicpm3 test
-lm-eval[api]==0.4.4 # required for model evaluation test
-
-# TODO: Add this after fully implementing llava(mantis)
-# git+https://github.com/TIGER-AI-Lab/Mantis.git # required for llava(mantis) test
-
-# Benchmarking
-aiohttp
-
-# quantization
-bitsandbytes>=0.44.0
+#
+# This file is autogenerated by pip-compile with Python 3.12
+# by the following command:
+#
+# pip-compile --output-file=requirements-test.txt requirements-test.in
+#
+absl-py==2.1.0
+ # via rouge-score
+accelerate==1.0.1
+ # via
+ # lm-eval
+ # peft
+aiohappyeyeballs==2.4.3
+ # via aiohttp
+aiohttp==3.10.10
+ # via
+ # -r requirements-test.in
+ # datasets
+ # fsspec
+ # lm-eval
+aiosignal==1.3.1
+ # via
+ # aiohttp
+ # ray
+annotated-types==0.7.0
+ # via pydantic
+anyio==4.6.2.post1
+ # via httpx
+argcomplete==3.5.1
+ # via datamodel-code-generator
+attrs==24.2.0
+ # via
+ # aiohttp
+ # jsonlines
+ # jsonschema
+ # referencing
+audioread==3.0.1
+ # via librosa
+awscli==1.35.16
+ # via -r requirements-test.in
+bitsandbytes==0.44.1
+ # via -r requirements-test.in
+black==24.10.0
+ # via datamodel-code-generator
+boto3==1.35.50
+ # via tensorizer
+botocore==1.35.50
+ # via
+ # awscli
+ # boto3
+ # s3transfer
buildkite-test-collector==0.1.8
+ # via -r requirements-test.in
+certifi==2024.8.30
+ # via
+ # httpcore
+ # httpx
+ # requests
+cffi==1.17.1
+ # via soundfile
+chardet==5.2.0
+ # via mbstrdecoder
+charset-normalizer==3.4.0
+ # via requests
+click==8.1.7
+ # via
+ # black
+ # nltk
+ # ray
+colorama==0.4.6
+ # via
+ # awscli
+ # sacrebleu
+ # tqdm-multiprocess
+contourpy==1.3.0
+ # via matplotlib
+cupy-cuda12x==13.3.0
+ # via ray
+cycler==0.12.1
+ # via matplotlib
+datamodel-code-generator==0.26.2
+ # via -r requirements-test.in
+dataproperty==1.0.1
+ # via
+ # pytablewriter
+ # tabledata
+datasets==3.0.2
+ # via
+ # evaluate
+ # lm-eval
+decorator==5.1.1
+ # via librosa
+dill==0.3.8
+ # via
+ # datasets
+ # evaluate
+ # lm-eval
+ # multiprocess
+dnspython==2.7.0
+ # via email-validator
+docutils==0.16
+ # via awscli
+einops==0.8.0
+ # via -r requirements-test.in
+email-validator==2.2.0
+ # via pydantic
+evaluate==0.4.3
+ # via lm-eval
+fastrlock==0.8.2
+ # via cupy-cuda12x
+filelock==3.16.1
+ # via
+ # datasets
+ # huggingface-hub
+ # ray
+ # torch
+ # transformers
+ # triton
+fonttools==4.54.1
+ # via matplotlib
+frozenlist==1.5.0
+ # via
+ # aiohttp
+ # aiosignal
+ # ray
+fsspec[http]==2024.9.0
+ # via
+ # datasets
+ # evaluate
+ # huggingface-hub
+ # torch
+genson==1.3.0
+ # via datamodel-code-generator
+h11==0.14.0
+ # via httpcore
+hiredis==3.0.0
+ # via tensorizer
+httpcore==1.0.6
+ # via httpx
+httpx==0.27.2
+ # via -r requirements-test.in
+huggingface-hub==0.26.2
+ # via
+ # accelerate
+ # datasets
+ # evaluate
+ # peft
+ # sentence-transformers
+ # timm
+ # tokenizers
+ # transformers
+idna==3.10
+ # via
+ # anyio
+ # email-validator
+ # httpx
+ # requests
+ # yarl
+inflect==5.6.2
+ # via datamodel-code-generator
+iniconfig==2.0.0
+ # via pytest
+isort==5.13.2
+ # via datamodel-code-generator
+jinja2==3.1.4
+ # via
+ # datamodel-code-generator
+ # torch
+jmespath==1.0.1
+ # via
+ # boto3
+ # botocore
+joblib==1.4.2
+ # via
+ # librosa
+ # nltk
+ # scikit-learn
+jsonlines==4.0.0
+ # via lm-eval
+jsonschema==4.23.0
+ # via ray
+jsonschema-specifications==2024.10.1
+ # via jsonschema
+kiwisolver==1.4.7
+ # via matplotlib
+lazy-loader==0.4
+ # via librosa
+libnacl==2.1.0
+ # via tensorizer
+librosa==0.10.2.post1
+ # via -r requirements-test.in
+llvmlite==0.43.0
+ # via numba
+lm-eval[api]==0.4.4
+ # via -r requirements-test.in
+lxml==5.3.0
+ # via sacrebleu
+markupsafe==3.0.2
+ # via jinja2
+matplotlib==3.9.2
+ # via -r requirements-test.in
+mbstrdecoder==1.1.3
+ # via
+ # dataproperty
+ # pytablewriter
+ # typepy
+more-itertools==10.5.0
+ # via lm-eval
+mpmath==1.3.0
+ # via sympy
+msgpack==1.1.0
+ # via
+ # librosa
+ # ray
+multidict==6.1.0
+ # via
+ # aiohttp
+ # yarl
+multiprocess==0.70.16
+ # via
+ # datasets
+ # evaluate
+mypy-extensions==1.0.0
+ # via black
+networkx==3.2.1
+ # via torch
+nltk==3.9.1
+ # via rouge-score
+numba==0.60.0
+ # via librosa
+numexpr==2.10.1
+ # via lm-eval
+numpy==1.26.4
+ # via
+ # -r requirements-test.in
+ # accelerate
+ # bitsandbytes
+ # contourpy
+ # cupy-cuda12x
+ # datasets
+ # evaluate
+ # librosa
+ # matplotlib
+ # numba
+ # numexpr
+ # opencv-python
+ # pandas
+ # peft
+ # rouge-score
+ # sacrebleu
+ # scikit-learn
+ # scipy
+ # soxr
+ # tensorizer
+ # torchvision
+ # transformers
+nvidia-cublas-cu12==12.4.5.8
+ # via
+ # nvidia-cudnn-cu12
+ # nvidia-cusolver-cu12
+ # torch
+nvidia-cuda-cupti-cu12==12.4.127
+ # via torch
+nvidia-cuda-nvrtc-cu12==12.4.127
+ # via torch
+nvidia-cuda-runtime-cu12==12.4.127
+ # via torch
+nvidia-cudnn-cu12==9.1.0.70
+ # via torch
+nvidia-cufft-cu12==11.2.1.3
+ # via torch
+nvidia-curand-cu12==10.3.5.147
+ # via torch
+nvidia-cusolver-cu12==11.6.1.9
+ # via torch
+nvidia-cusparse-cu12==12.3.1.170
+ # via
+ # nvidia-cusolver-cu12
+ # torch
+nvidia-nccl-cu12==2.21.5
+ # via torch
+nvidia-nvjitlink-cu12==12.4.127
+ # via
+ # nvidia-cusolver-cu12
+ # nvidia-cusparse-cu12
+ # torch
+nvidia-nvtx-cu12==12.4.127
+ # via torch
+opencv-python==4.10.0.84
+ # via -r requirements-test.in
+packaging==24.1
+ # via
+ # accelerate
+ # black
+ # datamodel-code-generator
+ # datasets
+ # evaluate
+ # huggingface-hub
+ # lazy-loader
+ # matplotlib
+ # peft
+ # pooch
+ # pytest
+ # pytest-rerunfailures
+ # ray
+ # transformers
+ # typepy
+pandas==2.2.3
+ # via
+ # datasets
+ # evaluate
+pathspec==0.12.1
+ # via black
+pathvalidate==3.2.1
+ # via pytablewriter
+peft==0.13.2
+ # via
+ # -r requirements-test.in
+ # lm-eval
+pillow==11.0.0
+ # via
+ # matplotlib
+ # sentence-transformers
+ # torchvision
+platformdirs==4.3.6
+ # via
+ # black
+ # pooch
+pluggy==1.5.0
+ # via pytest
+pooch==1.8.2
+ # via librosa
+portalocker==2.10.1
+ # via sacrebleu
+propcache==0.2.0
+ # via yarl
+protobuf==5.28.3
+ # via
+ # ray
+ # tensorizer
+psutil==6.1.0
+ # via
+ # accelerate
+ # peft
+ # tensorizer
+py==1.11.0
+ # via pytest-forked
+pyarrow==18.0.0
+ # via datasets
+pyasn1==0.6.1
+ # via rsa
+pybind11==2.13.6
+ # via lm-eval
+pycparser==2.22
+ # via cffi
+pydantic[email]==2.9.2
+ # via datamodel-code-generator
+pydantic-core==2.23.4
+ # via pydantic
+pyparsing==3.2.0
+ # via matplotlib
+pytablewriter==1.2.0
+ # via lm-eval
+pytest==8.3.3
+ # via
+ # -r requirements-test.in
+ # buildkite-test-collector
+ # pytest-asyncio
+ # pytest-forked
+ # pytest-rerunfailures
+ # pytest-shard
+pytest-asyncio==0.24.0
+ # via -r requirements-test.in
+pytest-forked==1.6.0
+ # via -r requirements-test.in
+pytest-rerunfailures==14.0
+ # via -r requirements-test.in
+pytest-shard==0.1.2
+ # via -r requirements-test.in
+python-dateutil==2.9.0.post0
+ # via
+ # botocore
+ # matplotlib
+ # pandas
+ # typepy
+pytz==2024.2
+ # via
+ # pandas
+ # typepy
+pyyaml==6.0.2
+ # via
+ # accelerate
+ # awscli
+ # datamodel-code-generator
+ # datasets
+ # huggingface-hub
+ # peft
+ # ray
+ # timm
+ # transformers
+ray[adag]==2.35.0
+ # via -r requirements-test.in
+redis==5.2.0
+ # via tensorizer
+referencing==0.35.1
+ # via
+ # jsonschema
+ # jsonschema-specifications
+regex==2024.9.11
+ # via
+ # nltk
+ # sacrebleu
+ # tiktoken
+ # transformers
+requests==2.32.3
+ # via
+ # -r requirements-test.in
+ # buildkite-test-collector
+ # datasets
+ # evaluate
+ # huggingface-hub
+ # lm-eval
+ # pooch
+ # ray
+ # tiktoken
+ # transformers
+rouge-score==0.1.2
+ # via lm-eval
+rpds-py==0.20.0
+ # via
+ # jsonschema
+ # referencing
+rsa==4.7.2
+ # via awscli
+s3transfer==0.10.3
+ # via
+ # awscli
+ # boto3
+sacrebleu==2.4.3
+ # via lm-eval
+safetensors==0.4.5
+ # via
+ # accelerate
+ # peft
+ # timm
+ # transformers
+scikit-learn==1.5.2
+ # via
+ # librosa
+ # lm-eval
+ # sentence-transformers
+scipy==1.13.1
+ # via
+ # librosa
+ # scikit-learn
+ # sentence-transformers
+sentence-transformers==3.2.1
+ # via -r requirements-test.in
+six==1.16.0
+ # via
+ # python-dateutil
+ # rouge-score
+sniffio==1.3.1
+ # via
+ # anyio
+ # httpx
+soundfile==0.12.1
+ # via
+ # -r requirements-test.in
+ # librosa
+soxr==0.5.0.post1
+ # via librosa
+sqlitedict==2.1.0
+ # via lm-eval
+sympy==1.13.1
+ # via torch
+tabledata==1.3.3
+ # via pytablewriter
+tabulate==0.9.0
+ # via sacrebleu
+tcolorpy==0.1.6
+ # via pytablewriter
+tenacity==9.0.0
+ # via lm-eval
+tensorizer==2.9.0
+ # via -r requirements-test.in
+threadpoolctl==3.5.0
+ # via scikit-learn
+tiktoken==0.8.0
+ # via lm-eval
+timm==1.0.11
+ # via -r requirements-test.in
+tokenizers==0.20.1
+ # via transformers
+torch==2.5.0
+ # via
+ # -r requirements-test.in
+ # accelerate
+ # bitsandbytes
+ # lm-eval
+ # peft
+ # sentence-transformers
+ # tensorizer
+ # timm
+ # torchvision
+torchvision==0.20.0
+ # via timm
+tqdm==4.66.6
+ # via
+ # datasets
+ # evaluate
+ # huggingface-hub
+ # lm-eval
+ # nltk
+ # peft
+ # sentence-transformers
+ # tqdm-multiprocess
+ # transformers
+tqdm-multiprocess==0.0.11
+ # via lm-eval
+transformers==4.45.2
+ # via
+ # lm-eval
+ # peft
+ # sentence-transformers
+ # transformers-stream-generator
+transformers-stream-generator==0.0.5
+ # via -r requirements-test.in
+triton==3.1.0
+ # via torch
+typepy[datetime]==1.3.2
+ # via
+ # dataproperty
+ # pytablewriter
+ # tabledata
+typing-extensions==4.12.2
+ # via
+ # huggingface-hub
+ # librosa
+ # pydantic
+ # pydantic-core
+ # torch
+tzdata==2024.2
+ # via pandas
+urllib3==1.26.20
+ # via
+ # botocore
+ # requests
+word2number==1.1
+ # via lm-eval
+xxhash==3.5.0
+ # via
+ # datasets
+ # evaluate
+yarl==1.17.0
+ # via aiohttp
+zstandard==0.23.0
+ # via lm-eval
+
+# The following packages are considered to be unsafe in a requirements file:
+# setuptools
From 04a3ae0acae3d522299ec90b5730f876daa845e6 Mon Sep 17 00:00:00 2001
From: Yan Ma
Date: Wed, 30 Oct 2024 12:34:45 +0800
Subject: [PATCH 046/113] [Bugfix] Fix multi nodes TP+PP for XPU (#8884)
Signed-off-by: YiSheng5
Signed-off-by: yan ma
Co-authored-by: YiSheng5
---
.../getting_started/xpu-installation.rst | 18 +++++++++++++++
requirements-xpu.txt | 2 +-
vllm/distributed/parallel_state.py | 22 +++++++++++++++++++
vllm/executor/xpu_executor.py | 12 +++++++++-
vllm/platforms/__init__.py | 3 +++
vllm/platforms/xpu.py | 4 ++++
vllm/worker/xpu_worker.py | 13 ++++-------
7 files changed, 63 insertions(+), 11 deletions(-)
diff --git a/docs/source/getting_started/xpu-installation.rst b/docs/source/getting_started/xpu-installation.rst
index 151ebb5f1811f..b1868acbc84b0 100644
--- a/docs/source/getting_started/xpu-installation.rst
+++ b/docs/source/getting_started/xpu-installation.rst
@@ -60,3 +60,21 @@ Build from source
- FP16 is the default data type in the current XPU backend. The BF16 data
type will be supported in the future.
+
+Distributed inference and serving
+---------------------------------
+
+The XPU platform supports tensor-parallel inference/serving and also supports pipeline parallelism as a beta feature for online serving. We require Ray as the distributed runtime backend. For example, a reference execution looks like the following:
+
+.. code-block:: console
+
+ $ python -m vllm.entrypoints.openai.api_server \
+ $ --model=facebook/opt-13b \
+ $ --dtype=bfloat16 \
+ $ --device=xpu \
+ $ --max_model_len=1024 \
+ $ --distributed-executor-backend=ray \
+ $ --pipeline-parallel-size=2 \
+ $ -tp=8
+
+By default, a Ray instance will be launched automatically if no existing one is detected in the system, with ``num-gpus`` equal to ``parallel_config.world_size``. We recommend properly starting a Ray cluster before execution, referring to the helper `script `_.
diff --git a/requirements-xpu.txt b/requirements-xpu.txt
index ce83a178c618f..eb76a33dab5c2 100644
--- a/requirements-xpu.txt
+++ b/requirements-xpu.txt
@@ -13,4 +13,4 @@ torch == 2.3.1+cxx11.abi
intel-extension-for-pytorch == 2.3.110+xpu
oneccl_bind_pt == 2.3.100+xpu
-triton-xpu == 3.0.0b2
+triton-xpu == 3.0.0b1
diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index ec39856b6f67c..b04bbc478534c 100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -431,6 +431,28 @@ def gather(self,
if dim < 0:
# Convert negative dim to positive.
dim += input_.dim()
+ # For xpu path, gather doesn't work properly together with ray
+ # cluster so we use all_gather instead for now.
+ if current_platform.is_xpu():
+ input_size = input_.size()
+ # Allocate output tensor.
+ output_tensor = torch.empty((world_size, ) + input_size,
+ dtype=input_.dtype,
+ device=input_.device)
+ # All-gather.
+ torch.distributed.all_gather_into_tensor(output_tensor,
+ input_,
+ group=self.device_group)
+ if self.rank_in_group == dst:
+ # Reshape
+ output_tensor = output_tensor.movedim(0, dim)
+ output_tensor = output_tensor.reshape(input_size[:dim] +
+ (world_size *
+ input_size[dim], ) +
+ input_size[dim + 1:])
+ else:
+ output_tensor = None
+ return output_tensor
# Allocate output tensor.
if self.rank_in_group == dst:
gather_list = [torch.empty_like(input_) for _ in range(world_size)]
diff --git a/vllm/executor/xpu_executor.py b/vllm/executor/xpu_executor.py
index bada56068507a..5f78993ddc4b4 100644
--- a/vllm/executor/xpu_executor.py
+++ b/vllm/executor/xpu_executor.py
@@ -44,7 +44,7 @@ def __init__(
self.cache_config = cache_config
self.load_config = load_config
self.lora_config = lora_config
- self.parallel_config = parallel_config
+ self.parallel_config = _verify_and_get_parallel_config(parallel_config)
self.scheduler_config = scheduler_config
self.device_config = device_config
self.prompt_adapter_config = prompt_adapter_config
@@ -94,3 +94,13 @@ def _verify_and_get_model_config(config: ModelConfig) -> ModelConfig:
"mode.")
config.enforce_eager = True
return config
+
+
+def _verify_and_get_parallel_config(config: ParallelConfig) -> ParallelConfig:
+ if (config.distributed_executor_backend is not None
+ and config.distributed_executor_backend != "ray"):
+ logger.warning(
+ "%s is not supported on XPU, fallback to ray distributed executor "
+ "backend.", config.distributed_executor_backend)
+ config.distributed_executor_backend = "ray"
+ return config
diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py
index 7e9f8b1297b80..524150920b854 100644
--- a/vllm/platforms/__init__.py
+++ b/vllm/platforms/__init__.py
@@ -45,6 +45,9 @@
is_xpu = False
try:
+ # IPEX is installed if the machine has XPUs.
+ import intel_extension_for_pytorch # noqa: F401
+ import oneccl_bindings_for_pytorch # noqa: F401
import torch
if hasattr(torch, 'xpu') and torch.xpu.is_available():
is_xpu = True
diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py
index d00e0dca84fff..106e8eddf458f 100644
--- a/vllm/platforms/xpu.py
+++ b/vllm/platforms/xpu.py
@@ -20,3 +20,7 @@ def get_device_name(device_id: int = 0) -> str:
def get_device_total_memory(cls, device_id: int = 0) -> int:
device_props = torch.xpu.get_device_properties(device_id)
return device_props.total_memory
+
+ @staticmethod
+ def inference_mode():
+ return torch.no_grad()
diff --git a/vllm/worker/xpu_worker.py b/vllm/worker/xpu_worker.py
index 917866f2d985b..c1d836bb0d318 100644
--- a/vllm/worker/xpu_worker.py
+++ b/vllm/worker/xpu_worker.py
@@ -14,7 +14,6 @@
SpeculativeConfig)
from vllm.distributed import (ensure_model_parallel_initialized,
init_distributed_environment)
-from vllm.distributed.parallel_state import get_pp_group
from vllm.logger import init_logger
from vllm.model_executor import set_random_seed
from vllm.platforms import current_platform
@@ -183,11 +182,10 @@ def init_worker_distributed_environment(self) -> None:
# use sockets as default Level zero IPC exchange backend. By
# default oneccl will use `drmfd` as mechanism which need extra
# dependency (libdrm and drm headers) on your system.
- ENV_CCL_ZE_IPC_EXCHANGE = os.getenv("CCL_ZE_IPC_EXCHANGE",
- "sockets")
+ ENV_CCL_ATL_TRANSPORT = os.getenv("CCL_ATL_TRANSPORT", "ofi")
ENV_LOCAL_WORLD_SIZE = os.getenv("LOCAL_WORLD_SIZE",
str(parallel_config.world_size))
- os.environ['CCL_ZE_IPC_EXCHANGE'] = ENV_CCL_ZE_IPC_EXCHANGE
+ os.environ["CCL_ATL_TRANSPORT"] = ENV_CCL_ATL_TRANSPORT
os.environ["LOCAL_WORLD_SIZE"] = ENV_LOCAL_WORLD_SIZE
os.environ["LOCAL_RANK"] = str(self.local_rank)
init_distributed_environment(
@@ -200,8 +198,5 @@ def init_worker_distributed_environment(self) -> None:
ensure_model_parallel_initialized(
parallel_config.tensor_parallel_size,
parallel_config.pipeline_parallel_size)
-
- if parallel_config.pipeline_parallel_size > 1:
- # torch-ccl xpu need a collective API warm up
- # before calling send/recv API
- get_pp_group().all_reduce(torch.zeros(1).xpu())
+ # global all_reduce needed for overall oneccl warm up
+ torch.distributed.all_reduce(torch.zeros(1).xpu())
From 7b0365efef35bb03aa94e0085199d20750409363 Mon Sep 17 00:00:00 2001
From: Russell Bryant
Date: Wed, 30 Oct 2024 01:22:23 -0400
Subject: [PATCH 047/113] [Doc] Add the DCO to CONTRIBUTING.md (#9803)
Signed-off-by: Russell Bryant
Co-authored-by: Michael Goin
Co-authored-by: Cyrus Leung
---
CONTRIBUTING.md | 12 +++++++++++-
DCO | 34 ++++++++++++++++++++++++++++++++++
2 files changed, 45 insertions(+), 1 deletion(-)
create mode 100644 DCO
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 5f79356bd32f7..b39fd75b5fb70 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -11,12 +11,14 @@ We also believe in the power of community support; thus, answering queries, offe
Finally, one of the most impactful ways to support us is by raising awareness about vLLM. Talk about it in your blog posts and highlight how it's driving your incredible projects. Express your support on social media if you're using vLLM, or simply offer your appreciation by starring our repository!
+## License
+
+See [LICENSE](LICENSE).
## Developing
Depending on the kind of development you'd like to do (e.g. Python, CUDA), you can choose to build vLLM with or without compilation. Check out the [building from source](https://docs.vllm.ai/en/latest/getting_started/installation.html#build-from-source) documentation for details.
-
## Testing
```bash
@@ -33,6 +35,14 @@ pytest tests/
## Contribution Guidelines
+### DCO and Signed-off-by
+
+When contributing changes to this project, you must agree to the [DCO](DCO).
+Commits must include a `Signed-off-by:` header which certifies agreement with
+the terms of the [DCO](DCO).
+
+Using `-s` with `git commit` will automatically add this header.
+
### Issues
If you encounter a bug or have a feature request, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible.
diff --git a/DCO b/DCO
new file mode 100644
index 0000000000000..49b8cb0549267
--- /dev/null
+++ b/DCO
@@ -0,0 +1,34 @@
+Developer Certificate of Origin
+Version 1.1
+
+Copyright (C) 2004, 2006 The Linux Foundation and its contributors.
+
+Everyone is permitted to copy and distribute verbatim copies of this
+license document, but changing it is not allowed.
+
+
+Developer's Certificate of Origin 1.1
+
+By making a contribution to this project, I certify that:
+
+(a) The contribution was created in whole or in part by me and I
+ have the right to submit it under the open source license
+ indicated in the file; or
+
+(b) The contribution is based upon previous work that, to the best
+ of my knowledge, is covered under an appropriate open source
+ license and I have the right under that license to submit that
+ work with modifications, whether created in whole or in part
+ by me, under the same open source license (unless I am
+ permitted to submit under a different license), as indicated
+ in the file; or
+
+(c) The contribution was provided directly to me by some other
+ person who certified (a), (b) or (c) and I have not modified
+ it.
+
+(d) I understand and agree that this project and the contribution
+ are public and that a record of the contribution (including all
+ personal information I submit with it, including my sign-off) is
+ maintained indefinitely and may be redistributed consistent with
+ this project or the open source license(s) involved.
From ff5ed6e1bcbd112a26f8eb43b6bfdbc5ec73726e Mon Sep 17 00:00:00 2001
From: youkaichao
Date: Tue, 29 Oct 2024 23:03:49 -0700
Subject: [PATCH 048/113] [torch.compile] rework compile control with piecewise
cudagraph (#9715)
Signed-off-by: youkaichao
---
.buildkite/test-pipeline.yaml | 3 +
tests/compile/piecewise/__init__.py | 0
.../piecewise_compilation_config.json | 4 +
tests/compile/piecewise/test_simple.py | 96 +++++
tests/compile/piecewise/test_toy_llama.py | 334 +++++++++++++++
tests/compile/test_full_graph.py | 2 +-
tests/compile/utils.py | 18 +-
vllm/compilation/backends.py | 384 ++++++++++++++----
vllm/compilation/config.py | 154 +++++++
vllm/compilation/counter.py | 30 ++
vllm/compilation/decorators.py | 10 +-
vllm/compilation/levels.py | 3 +-
vllm/envs.py | 5 +
vllm/model_executor/custom_op.py | 4 +-
vllm/platforms/tpu.py | 2 +-
vllm/plugins/__init__.py | 15 +-
vllm/utils.py | 25 ++
17 files changed, 983 insertions(+), 106 deletions(-)
create mode 100644 tests/compile/piecewise/__init__.py
create mode 100644 tests/compile/piecewise/piecewise_compilation_config.json
create mode 100644 tests/compile/piecewise/test_simple.py
create mode 100644 tests/compile/piecewise/test_toy_llama.py
create mode 100644 vllm/compilation/config.py
create mode 100644 vllm/compilation/counter.py
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 8c98aa36ac0ff..ed847a7e3696b 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -229,6 +229,9 @@ steps:
- tests/compile
commands:
- pytest -v -s compile/test_basic_correctness.py
+ # these tests need to be separated, cannot combine
+ - pytest -v -s compile/piecewise/test_simple.py
+ - pytest -v -s compile/piecewise/test_toy_llama.py
- label: "PyTorch Fullgraph Test" # 18min
source_file_dependencies:
diff --git a/tests/compile/piecewise/__init__.py b/tests/compile/piecewise/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/tests/compile/piecewise/piecewise_compilation_config.json b/tests/compile/piecewise/piecewise_compilation_config.json
new file mode 100644
index 0000000000000..03d077b76f627
--- /dev/null
+++ b/tests/compile/piecewise/piecewise_compilation_config.json
@@ -0,0 +1,4 @@
+{
+ "use_cudagraph": true,
+ "non_cudagraph_ops": ["silly.attention"]
+}
\ No newline at end of file
diff --git a/tests/compile/piecewise/test_simple.py b/tests/compile/piecewise/test_simple.py
new file mode 100644
index 0000000000000..a34d33efba1d8
--- /dev/null
+++ b/tests/compile/piecewise/test_simple.py
@@ -0,0 +1,96 @@
+"""
+Test the piecewise compilation with a simple model so that we
+can exactly calculate the expected output and side effects.
+"""
+import os
+
+import torch
+from torch import nn
+
+from vllm.compilation.compile_context import set_compile_context
+from vllm.compilation.counter import compilation_counter
+from vllm.compilation.decorators import support_torch_compile
+from vllm.compilation.levels import CompilationLevel
+
+os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(CompilationLevel.PIECEWISE)
+
+global_counter = 0
+
+
+@torch.library.custom_op("silly::attention", mutates_args=["out"])
+def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
+ out: torch.Tensor) -> None:
+ global global_counter
+ global_counter += 1
+ print(f"{global_counter=}")
+ out.copy_(q)
+ out[0] += 1
+
+
+@silly_attention.register_fake
+def _(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
+ out: torch.Tensor) -> None:
+ return
+
+
+@support_torch_compile
+class SillyModel(nn.Module):
+
+ def __init__(self) -> None:
+ super().__init__()
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ """
+ Overall effect:
+ x += 1
+ x[0] += 2
+ global_counter += 2
+ """
+ x = x + 1
+ x = x + 2
+ out = torch.empty_like(x)
+ torch.ops.silly.attention(x, x, x, out)
+ x = out
+ x = x - 2
+ x = x - 1
+ out = torch.empty_like(x)
+ torch.ops.silly.attention(x, x, x, out)
+ x = out
+ x = x + 1
+ return x
+
+
+def test_simple_piecewise_compile():
+
+ model = SillyModel()
+
+ directory = os.path.dirname(__file__)
+ config = os.path.join(directory, "piecewise_compilation_config.json")
+ os.environ["VLLM_TORCH_COMPILE_CONFIG"] = config
+
+ input_buffer = torch.randn(100).cuda()
+
+ with compilation_counter.expect(
+ num_graphs_seen=1, # one graph for the model
+ num_piecewise_graphs_seen=5, # 2 * num_layers + 1
+ num_piecewise_capturable_graphs_seen=3, # 1 + num_layers
+ num_inductor_compilations=3, # num_piecewise_capturable_graphs_seen
+ num_cudagraph_caputured=
+ 6, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
+ ):
+
+ with set_compile_context([1, 2]):
+ model(input_buffer)
+
+ model(input_buffer[:2])
+ model(input_buffer[:1])
+
+ input_buffer[:2].zero_()
+ global global_counter
+ global_counter = 0
+ output = model(input_buffer[:2])
+ assert global_counter == 2
+ assert torch.allclose(output.cpu(), torch.tensor([3., 1.]))
+
+ # clean up to avoid side effects for other tests
+ del os.environ["VLLM_TORCH_COMPILE_CONFIG"]
diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py
new file mode 100644
index 0000000000000..db6a983d70feb
--- /dev/null
+++ b/tests/compile/piecewise/test_toy_llama.py
@@ -0,0 +1,334 @@
+"""
+Test the piecewise compilation with a simple model, comparing the output
+with and without the piecewise compilation.
+"""
+import os
+from dataclasses import dataclass
+from typing import Optional, Tuple
+
+import torch
+from torch import nn
+
+from vllm.compilation.compile_context import set_compile_context
+from vllm.compilation.config import CompilationConfig
+from vllm.compilation.counter import compilation_counter
+from vllm.compilation.decorators import support_torch_compile
+from vllm.compilation.levels import CompilationLevel
+from vllm.plugins import set_compilation_config
+
+
+@torch.library.custom_op("silly::attention", mutates_args=["out"])
+def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
+ out: torch.Tensor) -> None:
+ out.copy_(q)
+ out += k
+ out += v
+
+
+@silly_attention.register_fake
+def _(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
+ out: torch.Tensor) -> None:
+ return
+
+
+@dataclass
+class LlamaConfig:
+ hidden_size: int = 128
+ mlp_size: int = 256
+ vocab_size: int = 128
+ num_layers: int = 2
+
+
+class LlamaMLP(nn.Module):
+
+ def __init__(self, config: LlamaConfig) -> None:
+ super().__init__()
+ self.gate_up_projection = nn.Linear(
+ in_features=config.hidden_size,
+ out_features=config.mlp_size * 2,
+ bias=False,
+ )
+ self.down_projection = nn.Linear(
+ in_features=config.mlp_size,
+ out_features=config.hidden_size,
+ bias=False,
+ )
+
+ self.gate_up_projection.weight.data.fill_(0.0)
+ self.down_projection.weight.data.fill_(0.0)
+
+ def forward(self, x):
+ x = self.gate_up_projection(x)
+ x = x[:, :x.size(1) // 2] * torch.nn.functional.relu(
+ x[:, x.size(1) // 2:])
+ x = self.down_projection(x)
+ return x
+
+
+class LlamaAttention(nn.Module):
+
+ def __init__(self, config: LlamaConfig) -> None:
+ super().__init__()
+ self.qkv_projection = nn.Linear(
+ in_features=config.hidden_size,
+ out_features=config.hidden_size * 3,
+ )
+
+ self.output_projection = nn.Linear(
+ in_features=config.hidden_size,
+ out_features=config.hidden_size,
+ )
+
+ self.qkv_projection.weight.data.fill_(0.0)
+ self.output_projection.weight.data.fill_(0.0)
+
+ def forward(
+ self,
+ positions: torch.Tensor,
+ hidden_states: torch.Tensor,
+ ) -> torch.Tensor:
+ qkv = self.qkv_projection(hidden_states)
+ hidden_size = qkv.size(-1) // 3
+ q, k, v = qkv.split([hidden_size, hidden_size, hidden_size], dim=-1)
+
+ q = q + positions.unsqueeze(1)
+ k = k + positions.unsqueeze(1)
+
+ attn_output = torch.empty_like(q)
+ torch.ops.silly.attention(q, k, v, attn_output)
+
+ output = self.output_projection(attn_output)
+ return output
+
+
+class LlamaDecoderLayer(nn.Module):
+
+ def __init__(self, config: LlamaConfig) -> None:
+ super().__init__()
+ self.self_attention = LlamaAttention(config)
+ self.mlp = LlamaMLP(config)
+
+ def forward(
+ self,
+ positions: torch.Tensor,
+ hidden_states: torch.Tensor,
+ residual: Optional[torch.Tensor],
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
+ if residual is None:
+ residual = hidden_states
+ hidden_states = hidden_states / 2
+ else:
+ hidden_states = hidden_states + residual
+ residual = hidden_states
+ hidden_states = hidden_states / 2
+
+ hidden_states = self.self_attention(positions=positions,
+ hidden_states=hidden_states)
+
+ hidden_states = hidden_states + residual
+ residual = hidden_states
+ hidden_states = hidden_states / 2
+ hidden_states = self.mlp(hidden_states)
+
+ return hidden_states, residual
+
+
+class LlamaModel(nn.Module):
+
+ def __init__(self, config: LlamaConfig) -> None:
+ super().__init__()
+ self.embedding_tokens = nn.Embedding(
+ num_embeddings=config.vocab_size,
+ embedding_dim=config.hidden_size,
+ )
+ self.layers = nn.ModuleList(
+ [LlamaDecoderLayer(config) for _ in range(config.num_layers)])
+
+ self.embedding_tokens.weight.data.fill_(0.0)
+
+ def forward(
+ self,
+ input_ids: Optional[torch.Tensor],
+ positions: torch.Tensor,
+ ) -> torch.Tensor:
+ hidden_states = self.embedding_tokens(input_ids)
+ residual = None
+ for layer in self.layers:
+ hidden_states, residual = layer(positions, hidden_states, residual)
+ return hidden_states
+
+
+@torch.inference_mode
+def run_model(llama_config,
+ use_compile: bool,
+ split_attn: bool = False) -> torch.Tensor:
+
+ if use_compile:
+ os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(
+ CompilationLevel.PIECEWISE)
+
+ if split_attn:
+ set_compilation_config(
+ CompilationConfig(
+ use_cudagraph=True,
+ non_cudagraph_ops=["silly.attention"],
+ ))
+ else:
+ set_compilation_config(CompilationConfig(use_cudagraph=True, ))
+ else:
+ os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(
+ CompilationLevel.NO_COMPILATION)
+ set_compilation_config(None)
+
+ cls = LlamaModel
+ if use_compile:
+ cls = support_torch_compile(LlamaModel)
+ model = cls(llama_config).eval().cuda()
+
+ B = 16 # max batch size
+ input_ids = torch.randint(0, llama_config.vocab_size, (B, )).cuda()
+ positions = torch.arange(B).cuda()
+
+ with set_compile_context([1, 2]):
+ model(input_ids, positions)
+ model(input_ids[:2], positions[:2])
+ model(input_ids[:1], positions[:1])
+
+ input_ids[:2].zero_()
+ output = model(input_ids[:2], positions[:2])
+
+ # manual cleanup
+ del os.environ["VLLM_TORCH_COMPILE_LEVEL"]
+ set_compilation_config(None)
+
+ return output.cpu()
+
+
+def test_toy_llama():
+ # compare output with and without piecewise compilation
+
+ llama_config = LlamaConfig(hidden_size=128,
+ mlp_size=256,
+ vocab_size=128,
+ num_layers=2)
+
+ outputs = []
+ with compilation_counter.expect(
+ num_graphs_seen=0,
+ num_piecewise_graphs_seen=0,
+ num_piecewise_capturable_graphs_seen=0,
+ num_inductor_compilations=0,
+ num_cudagraph_caputured=0,
+ ):
+ outputs.append(run_model(llama_config, use_compile=False))
+ with compilation_counter.expect(
+ num_graphs_seen=1, # one graph for the model
+ num_piecewise_graphs_seen=1,
+ num_piecewise_capturable_graphs_seen=1,
+ num_inductor_compilations=1, # num_piecewise_capturable_graphs_seen
+ num_cudagraph_caputured=
+ 2, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
+ ):
+ outputs.append(run_model(llama_config, use_compile=True))
+
+ with compilation_counter.expect(
+ num_graphs_seen=1, # one graph for the model
+ num_piecewise_graphs_seen=2 * llama_config.num_layers +
+ 1, # 2 * num_layers + 1
+ num_piecewise_capturable_graphs_seen=1 +
+ llama_config.num_layers, # 1 + num_layers
+ num_inductor_compilations=1 +
+ llama_config.num_layers, # num_piecewise_capturable_graphs_seen
+ num_cudagraph_caputured=2 *
+ (1 + llama_config.num_layers
+ ), # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
+ ):
+ outputs.append(
+ run_model(llama_config, use_compile=True, split_attn=True))
+
+ for i in range(1, len(outputs)):
+ assert torch.allclose(outputs[0], outputs[i])
+
+
+@torch.inference_mode
+def benchmark():
+ os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(CompilationLevel.PIECEWISE)
+ from triton.testing import do_bench
+ cls = support_torch_compile(LlamaModel)
+
+ # similar to llama 3.1-8B
+ llama_config = LlamaConfig(hidden_size=4096,
+ mlp_size=14336,
+ vocab_size=128 * 1024,
+ num_layers=32)
+
+ # a tiny model to measure the overhead
+ # of piecewise cudagraph
+ llama_config = LlamaConfig(hidden_size=40,
+ mlp_size=80,
+ vocab_size=128,
+ num_layers=2)
+
+ cudagraph_sizes = [1, 2, 4] + [i * 8 for i in range(1, 33)]
+
+ eager_time = {}
+ full_cudagraph_time = {}
+ piecewise_cudagraph_time = {}
+
+ pool = torch.cuda.graph_pool_handle()
+
+ for piecewise in [False, True]:
+ if piecewise:
+ set_compilation_config(
+ CompilationConfig(
+ use_cudagraph=True,
+ non_cudagraph_ops=["silly.attention"],
+ ))
+ else:
+ set_compilation_config(None)
+
+ model = cls(llama_config).eval().cuda().to(torch.bfloat16)
+
+ B = 256 # max batch size
+ input_ids = torch.randint(0, llama_config.vocab_size, (B, )).cuda()
+ positions = torch.arange(B).cuda().to(torch.bfloat16)
+
+ graphs = {}
+
+ with set_compile_context(cudagraph_sizes):
+ model(input_ids, positions)
+ for b in cudagraph_sizes[::-1]:
+ if not piecewise:
+ graph = torch.cuda.CUDAGraph()
+ with torch.cuda.graph(graph, pool=pool):
+ output = model(input_ids[:b], positions[:b])
+ graphs[b] = (graph, output)
+ else:
+ output = model(input_ids[:b], positions[:b])
+ graphs[b] = (model, output)
+ for b in cudagraph_sizes:
+ if piecewise:
+ # noqa is for `Function definition does not bind loop variable`
+ # it will be problematic if we save the created lambda function
+ # and use it later, because it will look up the name `b` in the
+ # enclosing scope, and the value of `b` will always be 256.
+ # it is fine here, because we only use the lambda function once.
+ runtime = do_bench(lambda: graphs[b][0] # noqa
+ (input_ids[:b], positions[:b])) # noqa
+ piecewise_cudagraph_time[b] = runtime
+ else:
+ runtime = do_bench(lambda: graphs[b][0].replay()) # noqa
+ eager_runtime = do_bench(
+ lambda: model(input_ids[:b], positions[:b])) # noqa
+ full_cudagraph_time[b] = runtime
+ eager_time[b] = eager_runtime
+
+ # print in tabular format
+ print("batch size\teager mode\tfull cudagraph\tpiecewise cudagraph")
+ for b in cudagraph_sizes:
+ print((f"{b}\t{eager_time[b]:.3f}\t{full_cudagraph_time[b]:.3f}"
+ f"\t{piecewise_cudagraph_time[b]:.3f}"))
+
+
+if __name__ == "__main__":
+ benchmark()
diff --git a/tests/compile/test_full_graph.py b/tests/compile/test_full_graph.py
index f28f9145bb442..f00334934cb46 100644
--- a/tests/compile/test_full_graph.py
+++ b/tests/compile/test_full_graph.py
@@ -9,7 +9,7 @@
@pytest.mark.parametrize("model_info", TEST_MODELS)
@pytest.mark.parametrize(
"optimization_level",
- [CompilationLevel.DYNAMO_ONCE, CompilationLevel.INDUCTOR])
+ [CompilationLevel.DYNAMO_ONCE, CompilationLevel.PIECEWISE])
@fork_new_process_for_each_test
def test_full_graph(model_info, optimization_level):
model = model_info[0]
diff --git a/tests/compile/utils.py b/tests/compile/utils.py
index 64fc08e80de3b..95cad19126df6 100644
--- a/tests/compile/utils.py
+++ b/tests/compile/utils.py
@@ -9,17 +9,19 @@
TEST_MODELS = [
("facebook/opt-125m", {}),
- ("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", {
- "dtype": torch.float16,
- "quantization": "compressed-tensors"
- }),
+ # TODO: add fake implementation for compressed-tensors
+ # ("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", {
+ # "dtype": torch.float16,
+ # "quantization": "compressed-tensors"
+ # }),
("neuralmagic/Meta-Llama-3-8B-Instruct-FP8", {
"dtype": torch.float16,
"quantization": "fp8"
}),
- ("nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples", {
- "quantization": "compressed-tensors"
- }),
+ # TODO: add fake implementation for compressed-tensors
+ # ("nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples", {
+ # "quantization": "compressed-tensors"
+ # }),
("meta-llama/Meta-Llama-3-8B", {}),
]
@@ -73,7 +75,7 @@ def check_full_graph_support(model,
# much memory.
quantization = model_kwargs.get("quantization")
if ((quantization == "fp8" or model == "meta-llama/Meta-Llama-3-8B")
- and optimization_level >= CompilationLevel.INDUCTOR):
+ and optimization_level >= CompilationLevel.PIECEWISE):
return
prompts = [
diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
index 6d9832e2c39c0..10cf49e19eccc 100644
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -1,13 +1,16 @@
import copy
+import dataclasses
import operator
-from typing import Callable, Dict, List, Optional, Tuple, Union
+from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
import torch
import torch.fx as fx
from vllm.logger import init_logger
+from vllm.utils import weak_ref_tensors
-from .compile_context import get_compile_context
+from .config import CompilationConfig
+from .counter import compilation_counter
from .levels import CompilationLevel
logger = init_logger(__name__)
@@ -157,113 +160,326 @@ def fix_functionalization(graph: fx.Graph):
# print(graph.python_code(root_module="self", verbose=True).src, file=f)
-def wrap_inductor(graph, example_inputs, additional_inductor_config):
+def wrap_inductor(graph,
+ example_inputs,
+ additional_inductor_config,
+ do_logging=False,
+ runtime_shape: Optional[int] = None,
+ use_inductor: bool = True):
+ if not use_inductor:
+ return graph
+
+ compilation_counter.num_inductor_compilations += 1
+
+ if do_logging:
+ if runtime_shape is None:
+ logger.info("Compiling a graph for general shape")
+ else:
+ logger.info("Compiling a graph for shape %s", runtime_shape)
+
from torch._inductor import config
current_config = config.shallow_copy_dict()
from torch._inductor.compile_fx import compile_fx
if additional_inductor_config is not None:
current_config.update(additional_inductor_config)
- if current_config['post_grad_custom_post_pass'] is not None:
- logger.warning(
- "post_grad_custom_post_pass is already set in the config. "
- "Overwriting it with the fix_functionalization")
- current_config['post_grad_custom_post_pass'] = fix_functionalization
+
+ # inductor can inplace modify the graph, so we need to copy it
+ # see https://github.com/pytorch/pytorch/issues/138980
+ graph = copy.deepcopy(graph)
return compile_fx(graph, example_inputs, config_patches=current_config)
-def vllm_backend(
+@dataclasses.dataclass
+class SplitItem:
+ submod_name: str
+ is_splitting_graph: bool
+ graph: fx.GraphModule
+
+
+def split_graph(graph: fx.GraphModule,
+ ops: List[str]) -> Tuple[fx.GraphModule, List[SplitItem]]:
+ # split graph by ops
+ subgraph_id = 0
+ node_to_subgraph_id = {}
+ split_op_graphs = []
+ for node in graph.graph.nodes:
+ if node.op in ("output", "placeholder"):
+ continue
+ if node.op == 'call_function' and str(node.target) in ops:
+ subgraph_id += 1
+ node_to_subgraph_id[node] = subgraph_id
+ split_op_graphs.append(subgraph_id)
+ subgraph_id += 1
+ else:
+ node_to_subgraph_id[node] = subgraph_id
+
+ # `keep_original_order` is important!
+ # otherwise pytorch might reorder the nodes and
+ # the semantics of the graph will change when we
+ # have mutations in the graph
+ split_gm = torch.fx.passes.split_module.split_module(
graph,
- example_inputs,
- additional_inductor_config: Optional[Dict] = None) -> Callable:
-
- context = get_compile_context()
- context = copy.deepcopy(context) if context is not None else []
- sizes_to_specialize: List[int] = context
+ None,
+ lambda node: node_to_subgraph_id[node],
+ keep_original_order=True)
- # flags for all the seen shapes, whether we need to specialize
- runtime_shapes_to_compile_flags: Dict[Tuple[int, ...], bool] = {}
+ outputs = []
- # if we need to specialize, the compiled graph for that shape
- runtime_shapes_to_compiled_graph: Dict[Tuple[int, ...], Callable] = {}
+ # sort the names to make sure the order is deterministic
+ names = [name for (name, module) in split_gm.named_modules()]
+ names.sort()
- # this is the first compilation, we will compile a graph with
- # dynamic shape, as the caller will mark first dimension as dynamic
- logger.info("Compiling a graph for general shapes")
- graph_for_symbolic_shape = wrap_inductor(graph, example_inputs,
- additional_inductor_config)
+ for name in names:
+ if "." in name or name == "":
+ # recursive child module or the root module
+ continue
- # TODO: Dynamo does not pass all dynamic shapes.
- # Need to investigate why. It works now because all the dynamic
- # shapes have the same value, and either of them can be used.
- sym_shape_indices = [
- i for i, x in enumerate(example_inputs) if isinstance(x, torch.SymInt)
- ]
+ module = getattr(split_gm, name)
- first_run = True
+ graph_id = int(name.replace("submod_", ""))
+ outputs.append(SplitItem(name, graph_id in split_op_graphs, module))
- # this is the function we return to Dynamo to run finally
- def compiled_graph_wrapper(*args):
+ return split_gm, outputs
- runtime_shapes: Tuple[int,
- ...] = tuple(args[i] for i in sym_shape_indices)
- nonlocal first_run
- nonlocal runtime_shapes_to_compile_flags
- nonlocal runtime_shapes_to_compiled_graph
+class VllmBackend:
+ """The compilation backend for `torch.compile` with VLLM.
+ It is used for compilation level of `CompilationLevel.PIECEWISE`,
+ where we customize the compilation.
- if first_run:
- # the first compilation is for profiling, we directly run it
- first_run = False
- return graph_for_symbolic_shape(*args)
-
- if runtime_shapes not in runtime_shapes_to_compile_flags:
- # we haven't seen this shape before
- # query if we need to specialize for this shape
- # we only specialize for the first dimension.
- # TODO: investigate if any model needs to specialize
- # beyond the first dimension
- runtime_shapes_to_compile_flags[runtime_shapes] = runtime_shapes[
- 0] in sizes_to_specialize
-
- if not runtime_shapes_to_compile_flags[runtime_shapes]:
- # we don't need to specialize for this shape
- return graph_for_symbolic_shape(*args)
+ The major work of this backend is to split the graph into
+ piecewise graphs, and pass them to the piecewise backend.
+ """
- if runtime_shapes not in runtime_shapes_to_compiled_graph:
- # we need to specialize for this shape, and we haven't compiled
- # compile the graph for this shape
- logger.info("Compiling a graph for shapes %s", runtime_shapes)
- runtime_shapes_to_compiled_graph[runtime_shapes] = wrap_inductor(
- graph, args, additional_inductor_config)
+ compilation_configs: CompilationConfig
+ graph_pool: Any
+ _called: bool = False
+ # the graph we compiled
+ graph: fx.GraphModule
+ # the stitching graph module for all the piecewise graphs
+ split_gm: fx.GraphModule
+ piecewise_graphs: List[SplitItem]
+ returned_callable: Callable
+
+ def __init__(self, ):
+ # every instance of VllmBackend has its own graph pool
+ self.graph_pool = torch.cuda.graph_pool_handle()
+
+ # `torch.compile` is JIT compiled, so we don't need to
+ # do anything here
+
+ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable:
+
+ compilation_counter.num_graphs_seen += 1
+
+ # we control the compilation process, each instance can only be
+ # called once
+ assert not self._called, "VllmBackend can only be called once"
+
+ self.graph = graph
+ # config is read now, because only here can
+ # we get the sizes to capture for cudagraph
+ # from compilation context
+ self.compilation_configs = CompilationConfig.select_and_init_config()
+
+ self.split_gm, self.piecewise_graphs = split_graph(
+ graph, self.compilation_configs.non_cudagraph_ops)
+
+ returned_callable: Callable # type: ignore
+
+ if len(self.piecewise_graphs) == 0:
+ compilation_counter.num_piecewise_graphs_seen += 1
+ compilation_counter.num_piecewise_capturable_graphs_seen += 1
+ returned_callable = PiecewiseBackend(graph,
+ self.compilation_configs,
+ self.graph_pool,
+ is_first_graph=True)
+ else:
+ from torch._dynamo.utils import lazy_format_graph_code
+ logger.debug(
+ "%s", lazy_format_graph_code("stiching module", self.split_gm))
+
+ is_first_graph = True
+
+ for item in self.piecewise_graphs:
+ compilation_counter.num_piecewise_graphs_seen += 1
+ compilation_counter.num_piecewise_capturable_graphs_seen += not item.is_splitting_graph # noqa
+ if not item.is_splitting_graph:
+ # cannot setattr to a module, so we need to set
+ # the attribute in the __dict__
+ self.split_gm.__dict__[
+ item.submod_name] = PiecewiseBackend(
+ item.graph, self.compilation_configs,
+ self.graph_pool, is_first_graph)
+ is_first_graph = False
+ returned_callable = self.split_gm
+
+ self.returned_callable = returned_callable
+ # trigger the first compilation
+ # code borrowed from https://github.com/pytorch/pytorch/blob/4e3e08b71171fa34172b2362ff668553fac75f27/torch/_dynamo/backends/distributed.py#L206 # noqa
+ # to turn the inputs into fake tensors
+ import torch._guards
+ from torch._guards import detect_fake_mode
+ fake_mode = detect_fake_mode(example_inputs)
+ fake_args = []
+ for arg in example_inputs:
+ if isinstance(arg, torch.Tensor) and not isinstance(
+ arg, torch._subclasses.FakeTensor):
+ fake_args.append(
+ torch._dynamo.utils.to_fake_tensor(arg, fake_mode))
+ else:
+ fake_args.append(arg)
+ self.returned_callable(*fake_args)
+
+ self._called = True
+
+ return self.returned_callable
+
+
+@dataclasses.dataclass
+class ConcreteSizeEntry:
+ runtime_shape: int
+ need_to_compile: bool # the size is in compile_sizes
+ use_cudagraph: bool # the size is in capture_sizes
+
+ compiled: bool = False
+ runnable: Callable = None # type: ignore
+ num_finished_warmup: int = 0
+ cudagraph: Optional[torch.cuda.CUDAGraph] = None
+ output: Optional[Any] = None
+
+
+class PiecewiseBackend:
+
+ def __init__(self,
+ graph: fx.GraphModule,
+ compilation_configs: CompilationConfig,
+ graph_pool: Any,
+ is_first_graph: bool = False):
+ """
+ The backend for piecewise compilation.
+ It mainly handles the compilation and cudagraph capturing.
+
+ We will compile `self.graph` once for the general shape,
+ and then compile for different shapes specified in
+ `compilation_configs.compile_sizes`.
+
+ Independently, we will capture cudagraph for different shapes.
+
+ If a shape needs both compilation and cudagraph, we will
+ compile it first, and then capture cudagraph.
+ """
+ self.graph = graph
+ self.compilation_configs = compilation_configs
+ self.graph_pool = graph_pool
+ self.is_first_graph = is_first_graph
+
+ self.compile_sizes: Set[int] = set(
+ self.compilation_configs.compile_sizes)
+ self.capture_sizes: Set[int] = set(
+ self.compilation_configs.capture_sizes
+ ) if self.compilation_configs.use_cudagraph else set()
+
+ self.compile_finished = False
+ self.first_run_finished = False
+
+ self.compiled_graph_for_general_shape: Callable = None # type: ignore
+
+ self.sym_shape_indices: List[int] = []
+
+ # the entries for different shapes that we need to either
+ # compile or capture cudagraph
+ self.concrete_size_entries: Dict[int, ConcreteSizeEntry] = {}
+ for shape in self.compile_sizes.union(self.capture_sizes):
+ self.concrete_size_entries[shape] = ConcreteSizeEntry(
+ runtime_shape=shape,
+ need_to_compile=shape in self.compile_sizes,
+ use_cudagraph=shape in self.capture_sizes,
+ )
+
+ def __call__(self, *args) -> Any:
+
+ if not self.compile_finished:
+ self.compile_finished = True
+
+ # this is the first compilation, we will compile a graph with
+ # dynamic shape, as the caller will mark first dimension as dynamic
+
+ self.sym_shape_indices = [
+ i for i, x in enumerate(args) if isinstance(x, torch.SymInt)
+ ]
+
+ self.compiled_graph_for_general_shape = wrap_inductor(
+ self.graph,
+ args,
+ self.compilation_configs.inductor_compile_config,
+ runtime_shape=None,
+ do_logging=self.is_first_graph,
+ use_inductor=self.compilation_configs.use_inductor)
+
+ return self.graph(*args)
+
+ if not self.first_run_finished:
+ self.first_run_finished = True
+ return self.compiled_graph_for_general_shape(*args)
+
+ runtime_shape = args[self.sym_shape_indices[0]]
+ if runtime_shape not in self.concrete_size_entries:
+ # we don't need to do anything for this shape
+ return self.compiled_graph_for_general_shape(*args)
+
+ entry = self.concrete_size_entries[runtime_shape]
- return runtime_shapes_to_compiled_graph[runtime_shapes](*args)
+ if entry.runnable is None:
+ entry.runnable = self.compiled_graph_for_general_shape
- return compiled_graph_wrapper
+ if entry.need_to_compile and not entry.compiled:
+ entry.compiled = True
+ # args are real arguments
+ entry.runnable = wrap_inductor(
+ self.graph,
+ args,
+ self.compilation_configs.inductor_compile_config,
+ runtime_shape=runtime_shape,
+ do_logging=self.is_first_graph,
+ use_inductor=self.compilation_configs.use_inductor)
+
+ if not entry.use_cudagraph:
+ return entry.runnable(*args)
+
+ if entry.cudagraph is None:
+ if entry.num_finished_warmup < self.compilation_configs.cudagraph_num_of_warmups: # noqa
+ entry.num_finished_warmup += 1
+ if self.is_first_graph:
+ logger.debug(
+ "Warming up %s/%s for shape %s",
+ entry.num_finished_warmup,
+ self.compilation_configs.cudagraph_num_of_warmups,
+ runtime_shape)
+ return entry.runnable(*args)
+
+ if self.is_first_graph:
+ logger.info("Capturing a cudagraph for shape %s",
+ runtime_shape)
+
+ cudagraph = torch.cuda.CUDAGraph()
+ with torch.cuda.graph(cudagraph, pool=self.graph_pool):
+ entry.output = weak_ref_tensors(entry.runnable(*args))
+
+ compilation_counter.num_cudagraph_caputured += 1
+
+ entry.cudagraph = cudagraph
+ return entry.output
+
+ entry.cudagraph.replay()
+ return entry.output
def select_default_backend(level: int) -> Union[str, Callable]:
if level in [CompilationLevel.DYNAMO_AS_IS, CompilationLevel.DYNAMO_ONCE]:
backend_str = "eager"
return backend_str
- assert level in [
- CompilationLevel.INDUCTOR, CompilationLevel.INDUCTOR_MAX_AUTOTUNE
- ], f"Invalid level {level}"
-
- from vllm.compilation.backends import vllm_backend
- from vllm.plugins import get_inductor_additional_configs
- additional_configs = get_inductor_additional_configs()
-
- if level == CompilationLevel.INDUCTOR_MAX_AUTOTUNE:
- if "max_autotune" in additional_configs and not additional_configs[
- "max_autotune"]:
- logger.warning(
- "max_autotune is disabled, but is overridden by level %s",
- CompilationLevel.INDUCTOR_MAX_AUTOTUNE)
- additional_configs['max_autotune'] = True
-
- from functools import partial
- backend = partial(vllm_backend,
- additional_inductor_config=additional_configs)
-
- return backend
+ assert level == CompilationLevel.PIECEWISE
+
+ return VllmBackend()
diff --git a/vllm/compilation/config.py b/vllm/compilation/config.py
new file mode 100644
index 0000000000000..514f2b93ef64f
--- /dev/null
+++ b/vllm/compilation/config.py
@@ -0,0 +1,154 @@
+import copy
+from typing import Any, Dict, List, Optional
+
+from pydantic import BaseModel, Field, PrivateAttr
+
+import vllm.envs as envs
+from vllm.logger import init_logger
+
+from .compile_context import get_compile_context
+
+logger = init_logger(__name__)
+
+
+class CompilationConfig(BaseModel):
+ """
+ Configuration for compilation.
+ It has two parts:
+ - CudaGraph capture:
+ - use_cudagraph: whether to use cudagraph inside compilation.
+ - False: cudagraph inside compilation is not used.
+ - True: cudagraph inside compilation is used. It requires
+ that all input buffers have fixed addresses.
+ Note that this is orthogonal to the cudagraph capture out
+ side of compilation.
+ TODO: move outside cudagraph logic into compilation.
+ torch.compile will handle cudagraph capture logic in the future.
+ - cudagraph_capture_sizes: sizes to capture cudagraph.
+ - None: capture sizes are inferred from compilation context.
+ - List[int]: capture sizes are specified.
+ - cudagraph_num_of_warmups: number of warmup runs for cudagraph.
+ It means the first several runs will be treated as warmup runs.
+ Only after that, the execution will be recorded, and the recorded
+ cudagraph will be used for subsequent runs.
+ - Inductor compilation:
+ - use_inductor: whether to use inductor compilation.
+ - False: inductor compilation is not used. graph runs in eager.
+ - True: inductor compilation is used. one graph for symbolic shape
+ is compiled. In addition, compile for different sizes specified
+ in inductor_compile_sizes, using configurations
+ in inductor_compile_config.
+ - inductor_compile_sizes: sizes to compile for inductor.
+ - inductor_specialize_for_cudagraph_no_more_than: an optional integer
+ to specialize inductor for cudagraph sizes no more than the
+ specified size. It is useful when we want to specialize inductor
+ with a subset of cudagraph sizes.
+ - inductor_compile_config: additional configurations for inductor.
+ - None: use default configurations.
+ - inductor_passes: additional passes for inductor. It is a dictionary
+ from pass name to pass function qualified name. We use function
+ name because the config uses json format. If we pass the config
+ from Python, functions can also be passed directly via Python object
+ constructor, e.g. `CompilationConfig(inductor_passes={"a": func})`
+
+ Why we have different sizes for cudagraph and inductor:
+ - cudagraph: a cudagraph captured for a specific size can only be used
+ for the same size. We need to capture all the sizes we want to use.
+ - inductor: a graph compiled by inductor for a general shape can be used
+ for different sizes. Inductor can also compile for specific sizes,
+ where it can have more information to optimize the graph with fully
+ static shapes. However, we find the general shape compilation is
+ sufficient for most cases. It might be beneficial to compile for
+ certain small batchsizes, where inductor is good at optimizing.
+ """
+ use_inductor: bool = True
+ inductor_specialize_for_cudagraph_no_more_than: Optional[int] = None
+ inductor_compile_sizes: Optional[List[int]] = Field(default_factory=dict)
+ inductor_compile_config: Dict = Field(default_factory=dict)
+ inductor_passes: Dict[str, str] = Field(default_factory=dict)
+
+ use_cudagraph: bool = False
+ non_cudagraph_ops: List[str] = Field(default_factory=list)
+ cudagraph_num_of_warmups: int = 0
+ cudagraph_capture_sizes: Optional[List[int]] = None
+
+ # not configurable, computed after init
+ compile_sizes: List[int] = PrivateAttr
+ capture_sizes: List[int] = PrivateAttr
+
+ def model_post_init(self, __context: Any) -> None:
+ for k, v in self.inductor_passes.items():
+ if not isinstance(v, str):
+ assert callable(v), (
+ f"pass {k} should be a function or a qualified name")
+ self.inductor_passes[k] = v
+ continue
+
+ # resolve function from qualified name
+ names = v.split(".")
+ module = ".".join(names[:-1])
+ func_name = names[-1]
+ func = __import__(module).__dict__[func_name]
+ self.inductor_compile_config[k] = func
+
+ from vllm.compilation.backends import fix_functionalization
+ from vllm.utils import combine_fx_passes
+ if "post_grad_custom_post_pass" in self.inductor_compile_config:
+ self.inductor_compile_config[
+ "post_grad_custom_post_pass"] = combine_fx_passes(
+ fix_functionalization,
+ self.inductor_compile_config["post_grad_custom_post_pass"],
+ )
+ else:
+ self.inductor_compile_config[
+ "post_grad_custom_post_pass"] = fix_functionalization
+
+ def init_during_runtime(self):
+ """To complete the initialization of config,
+ we need to know the compile context, which is only available
+ during the first run of the model.
+ """
+ context = get_compile_context()
+ context = copy.deepcopy(context) if context is not None else []
+ sizes_to_specialize: List[int] = context
+ if self.cudagraph_capture_sizes is None:
+ self.capture_sizes = sizes_to_specialize
+ else:
+ self.capture_sizes = self.cudagraph_capture_sizes
+ logger.info(("cudagraph sizes specified by model runner"
+ " %s is overridden by config %s"),
+ sizes_to_specialize, self.cudagraph_capture_sizes)
+ if self.inductor_specialize_for_cudagraph_no_more_than is not None:
+ assert self.inductor_compile_sizes is None, (
+ "inductor_compile_sizes should be None when "
+ "inductor_specialize_for_cudagraph_no_more_than is not None")
+ self.compile_sizes = [
+ x for x in self.capture_sizes
+ if x <= self.inductor_specialize_for_cudagraph_no_more_than
+ ]
+ else:
+ assert self.inductor_compile_sizes is not None, (
+ "inductor_compile_sizes should not be None when "
+ "inductor_specialize_for_cudagraph_no_more_than is None")
+ self.compile_sizes = self.inductor_compile_sizes
+
+ @staticmethod
+ def select_and_init_config() -> "CompilationConfig":
+ """The order of selecting config is:
+ 1. Use the config specified in environment variable.
+ 2. Use the config specified in plugins.
+ 3. Use the default config.
+ """
+ config_path = envs.VLLM_TORCH_COMPILE_CONFIG
+ if config_path is not None:
+ with open(config_path) as json_file:
+ config = CompilationConfig.model_validate_json(
+ json_file.read())
+ else:
+ from vllm.plugins import get_compilation_config
+ predefined_config = get_compilation_config()
+ config = predefined_config if predefined_config is not None else (
+ CompilationConfig())
+
+ config.init_during_runtime()
+ return config
diff --git a/vllm/compilation/counter.py b/vllm/compilation/counter.py
new file mode 100644
index 0000000000000..100a49aba74ac
--- /dev/null
+++ b/vllm/compilation/counter.py
@@ -0,0 +1,30 @@
+import copy
+import dataclasses
+from contextlib import contextmanager
+
+
+@dataclasses.dataclass
+class CompilationCounter:
+ num_graphs_seen: int = 0
+ # including the splitting ops
+ num_piecewise_graphs_seen: int = 0
+ # not including the splitting ops
+ num_piecewise_capturable_graphs_seen: int = 0
+ num_inductor_compilations: int = 0
+ num_cudagraph_caputured: int = 0
+
+ def clone(self) -> "CompilationCounter":
+ return copy.deepcopy(self)
+
+ @contextmanager
+ def expect(self, **kwargs):
+ old = self.clone()
+ yield
+ for k, v in kwargs.items():
+ assert getattr(self, k) - getattr(old, k) == v, (
+ f"{k} not as expected, before it is {getattr(old, k)}"
+ f", after it is {getattr(self, k)}, "
+ f"expected diff is {v}")
+
+
+compilation_counter = CompilationCounter()
diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py
index 0449f9354d0a2..3053e57e0b63b 100644
--- a/vllm/compilation/decorators.py
+++ b/vllm/compilation/decorators.py
@@ -121,7 +121,10 @@ def _support_torch_compile(cls: type,
# take care of method resolution order
# make sure super().__init__ is called on the base class
# other than TorchCompileWrapperWithCustomDispatcher
- cls.__bases__ = cls.__bases__ + (TorchCompileWrapperWithCustomDispatcher, )
+ if TorchCompileWrapperWithCustomDispatcher not in cls.__bases__:
+ # support decorating multiple times
+ cls.__bases__ = cls.__bases__ + (
+ TorchCompileWrapperWithCustomDispatcher, )
old_init = cls.__init__ # type: ignore
@@ -160,6 +163,11 @@ def __call__(self, *args, **kwargs):
# compiled function and let torch.compile handle the dispatching,
# with the overhead of guard evaluation and recompilation.
if len(self.compiled_codes) < 1 or not self.use_custom_dispatcher:
+ # it seems Dynamo reuse the compilation across instances,
+ # while we need to make sure the compiled code is not reused.
+ # we need to control all the compilation of the model.
+ torch._dynamo.eval_frame.remove_from_cache(
+ self.original_code_object)
return self.compiled_callable(*args, **kwargs)
# usually, capturing the model once is enough, and then we can
diff --git a/vllm/compilation/levels.py b/vllm/compilation/levels.py
index 162bf5ae64997..19a3a2b526870 100644
--- a/vllm/compilation/levels.py
+++ b/vllm/compilation/levels.py
@@ -5,5 +5,4 @@ class CompilationLevel:
NO_COMPILATION = 0
DYNAMO_AS_IS = 1
DYNAMO_ONCE = 2
- INDUCTOR = 3
- INDUCTOR_MAX_AUTOTUNE = 4
+ PIECEWISE = 3
diff --git a/vllm/envs.py b/vllm/envs.py
index ae6825f280073..b4a263d1e086e 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -209,6 +209,11 @@ def get_default_config_root():
os.environ.get("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1") != "0"),
"VLLM_TORCH_COMPILE_LEVEL":
lambda: int(os.environ.get("VLLM_TORCH_COMPILE_LEVEL", "0")),
+
+ # Path to the config file for torch compile
+ "VLLM_TORCH_COMPILE_CONFIG":
+ lambda: os.environ.get("VLLM_TORCH_COMPILE_CONFIG", None),
+
# Fine-grained control over which custom ops to enable/disable.
# Use 'all' to enable all, 'none' to disable all.
# Also specify a list of custom op names to enable (prefixed with a '+'),
diff --git a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py
index 83910339f3c9f..764f4e9c99df8 100644
--- a/vllm/model_executor/custom_op.py
+++ b/vllm/model_executor/custom_op.py
@@ -100,7 +100,7 @@ def enabled(cls) -> bool:
return (CustomOp.default_on() or enabled) and not disabled
- # On by default if VLLM_TORCH_COMPILE_LEVEL < CompilationLevel.INDUCTOR
+ # On by default if VLLM_TORCH_COMPILE_LEVEL < CompilationLevel.PIECEWISE
# Specifying 'all' or 'none' in VLLM_CUSTOM_OPS takes precedence.
@staticmethod
@lru_cache()
@@ -108,7 +108,7 @@ def default_on() -> bool:
count_none = envs.VLLM_CUSTOM_OPS.count("none")
count_all = envs.VLLM_CUSTOM_OPS.count("all")
assert count_none + count_all <= 1, "Can only specify 'none' or 'all'"
- return envs.VLLM_TORCH_COMPILE_LEVEL < CompilationLevel.INDUCTOR and \
+ return envs.VLLM_TORCH_COMPILE_LEVEL < CompilationLevel.PIECEWISE and \
not count_none > 0 or count_all > 0
# Dictionary of all custom ops (classes, indexed by registered name).
diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py
index 8ba973b28263f..8d0ce47df4040 100644
--- a/vllm/platforms/tpu.py
+++ b/vllm/platforms/tpu.py
@@ -11,7 +11,7 @@
if "VLLM_TORCH_COMPILE_LEVEL" not in os.environ:
os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(CompilationLevel.DYNAMO_ONCE)
-assert envs.VLLM_TORCH_COMPILE_LEVEL < CompilationLevel.INDUCTOR,\
+assert envs.VLLM_TORCH_COMPILE_LEVEL < CompilationLevel.PIECEWISE,\
"TPU does not support Inductor."
set_torch_compile_backend("openxla")
diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py
index 211fedbc6e2ec..4338cbc37f6c1 100644
--- a/vllm/plugins/__init__.py
+++ b/vllm/plugins/__init__.py
@@ -1,7 +1,8 @@
import logging
-from typing import Callable, Dict, Optional, Union
+from typing import Callable, Optional, Union
import vllm.envs as envs
+from vllm.compilation.config import CompilationConfig
logger = logging.getLogger(__name__)
@@ -44,13 +45,13 @@ def get_torch_compile_backend() -> Optional[Union[Callable, str]]:
return _torch_compile_backend
-_inductor_additional_configs: Dict = {}
+_compilation_config: Optional[CompilationConfig] = None
-def set_inductor_additional_configs(configs: Dict):
- global _inductor_additional_configs
- _inductor_additional_configs = configs
+def set_compilation_config(config: Optional[CompilationConfig]):
+ global _compilation_config
+ _compilation_config = config
-def get_inductor_additional_configs() -> Dict:
- return _inductor_additional_configs
+def get_compilation_config() -> Optional[CompilationConfig]:
+ return _compilation_config
diff --git a/vllm/utils.py b/vllm/utils.py
index fea318ebcdf41..90c4b84757810 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -1479,6 +1479,15 @@ def __len__(self):
return len(self._factory)
+def combine_fx_passes(passes: List[Callable]) -> Callable:
+
+ def combined_fx(graph) -> None:
+ for fx in passes:
+ fx(graph)
+
+ return combined_fx
+
+
def weak_ref_tensor(tensor: torch.Tensor) -> torch.Tensor:
"""
Create a weak reference to a tensor.
@@ -1486,3 +1495,19 @@ def weak_ref_tensor(tensor: torch.Tensor) -> torch.Tensor:
but will not keep the original tensor alive.
"""
return torch.ops._C.weak_ref_tensor(tensor)
+
+
+def weak_ref_tensors(
+ tensors: Union[torch.Tensor, List[torch.Tensor], Tuple[torch.Tensor]]
+) -> Union[torch.Tensor, List[torch.Tensor], Tuple[torch.Tensor]]:
+ """
+ Convenience function to create weak references to tensors,
+ for single tensor, list of tensors or tuple of tensors.
+ """
+ if isinstance(tensors, torch.Tensor):
+ return weak_ref_tensor(tensors)
+ if isinstance(tensors, list):
+ return [weak_ref_tensor(t) for t in tensors]
+ if isinstance(tensors, tuple):
+ return tuple(weak_ref_tensor(t) for t in tensors)
+ raise ValueError("Invalid type for tensors")
From 6aa6020f9bd4c1e414c10f7bd3a7c2555f1950b2 Mon Sep 17 00:00:00 2001
From: Jee Jee Li
Date: Wed, 30 Oct 2024 14:05:43 +0800
Subject: [PATCH 049/113] [Misc] Specify minimum pynvml version (#9827)
Signed-off-by: Jee Jee Li
---
requirements-cuda.txt | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/requirements-cuda.txt b/requirements-cuda.txt
index 92fa303d687a2..282ab11838bf4 100644
--- a/requirements-cuda.txt
+++ b/requirements-cuda.txt
@@ -3,7 +3,7 @@
# Dependencies for NVIDIA GPUs
ray >= 2.9
-nvidia-ml-py # for pynvml package
+nvidia-ml-py >= 12.560.30 # for pynvml package
torch == 2.5.0
# These must be updated alongside torch
torchvision == 0.20 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
From 211fe91aa88730c04df439298d8103a587302493 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Wed, 30 Oct 2024 02:41:38 -0700
Subject: [PATCH 050/113] [TPU] Correctly profile peak memory usage & Upgrade
PyTorch XLA (#9438)
---
Dockerfile.tpu | 2 +-
docs/source/getting_started/tpu-installation.rst | 4 ++--
vllm/worker/tpu_worker.py | 15 ++++++++-------
3 files changed, 11 insertions(+), 10 deletions(-)
diff --git a/Dockerfile.tpu b/Dockerfile.tpu
index bdfab3f61910f..dd8f9ad4714a9 100644
--- a/Dockerfile.tpu
+++ b/Dockerfile.tpu
@@ -1,4 +1,4 @@
-ARG NIGHTLY_DATE="20240828"
+ARG NIGHTLY_DATE="20241017"
ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_$NIGHTLY_DATE"
FROM $BASE_IMAGE
diff --git a/docs/source/getting_started/tpu-installation.rst b/docs/source/getting_started/tpu-installation.rst
index 217028839e347..edba209986f6a 100644
--- a/docs/source/getting_started/tpu-installation.rst
+++ b/docs/source/getting_started/tpu-installation.rst
@@ -56,8 +56,8 @@ First, install the dependencies:
$ pip uninstall torch torch-xla -y
$ # Install PyTorch and PyTorch XLA.
- $ export DATE="20240828"
- $ export TORCH_VERSION="2.5.0"
+ $ export DATE="20241017"
+ $ export TORCH_VERSION="2.6.0"
$ pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-${TORCH_VERSION}.dev${DATE}-cp310-cp310-linux_x86_64.whl
$ pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-${TORCH_VERSION}.dev${DATE}-cp310-cp310-linux_x86_64.whl
diff --git a/vllm/worker/tpu_worker.py b/vllm/worker/tpu_worker.py
index fe819b9f4b3a8..de6f7ab0072fd 100644
--- a/vllm/worker/tpu_worker.py
+++ b/vllm/worker/tpu_worker.py
@@ -133,18 +133,19 @@ def determine_num_available_blocks(self) -> Tuple[int, int]:
# Synchronize before measuring the memory usage.
xm.wait_device_ops()
- dtype_btyes = get_dtype_size(self.cache_dtype)
- block_size = self.cache_config.block_size
- block_size_bytes = (dtype_btyes * block_size * num_layers * 2 *
- head_size * num_kv_heads)
-
- # Calculate the TPU KV cache size based on profiling.
+ # Get the maximum amount of memory used by the model weights and
+ # intermediate activations.
m = xm.get_memory_info(self.device)
total_memory_size = m["bytes_limit"]
+ profiled = m["peak_bytes_used"] # Weights + intermediate activations.
+
+ # Calculate the TPU KV cache size based on profiling.
usable_memory_size = int(total_memory_size *
self.cache_config.gpu_memory_utilization)
- profiled = m["bytes_used"] # Weights + intermediate activations.
tpu_kv_cache_bytes = max(usable_memory_size - profiled, 0)
+ dtype_btyes = get_dtype_size(self.cache_dtype)
+ block_size_bytes = (dtype_btyes * self.cache_config.block_size *
+ num_layers * 2 * head_size * num_kv_heads)
num_tpu_blocks = tpu_kv_cache_bytes // block_size_bytes
num_tpu_blocks = (num_tpu_blocks // 8) * 8 # Round down to 8.
From cc98f1e0798cf2b5ea5bc5d0c565af2f884bf6e8 Mon Sep 17 00:00:00 2001
From: Alex Brooks
Date: Wed, 30 Oct 2024 10:32:17 -0600
Subject: [PATCH 051/113] [CI/Build] VLM Test Consolidation (#9372)
Signed-off-by: Alex-Brooks
---
.buildkite/test-pipeline.yaml | 7 +-
tests/conftest.py | 6 +-
tests/engine/test_short_mm_context.py | 29 +
.../audio_language/test_ultravox.py | 2 +-
.../models/decoder_only/language/test_qwen.py | 34 +
.../mm_processor_kwargs/__init__.py | 0
.../mm_processor_kwargs/test_llava_next.py | 68 ++
.../mm_processor_kwargs/test_phi3v.py | 181 ++++++
.../mm_processor_kwargs/test_qwen.py | 144 +++++
.../test_qwen2_vl.py | 4 +-
.../vision_language/test_blip2.py | 101 ---
.../vision_language/test_broadcast.py | 46 --
.../vision_language/test_chameleon.py | 130 ----
.../decoder_only/vision_language/test_fuyu.py | 139 ----
.../decoder_only/vision_language/test_glm4.py | 133 ----
.../vision_language/test_internvl.py | 290 +--------
.../vision_language/test_llava.py | 313 ---------
.../test_llava_image_embeds.py | 158 -----
.../vision_language/test_llava_next.py | 347 ----------
.../vision_language/test_llava_next_video.py | 226 -------
.../vision_language/test_llava_onevision.py | 272 --------
.../vision_language/test_minicpmv.py | 199 ------
.../vision_language/test_models.py | 594 ++++++++++++++++++
.../vision_language/test_paligemma.py | 174 -----
.../vision_language/test_phi3v.py | 185 +-----
.../decoder_only/vision_language/test_qwen.py | 374 -----------
.../vision_language/vlm_utils/__init__.py | 0
.../vision_language/vlm_utils/builders.py | 235 +++++++
.../vlm_utils/case_filtering.py | 157 +++++
.../vision_language/vlm_utils/core.py | 141 +++++
.../vlm_utils/custom_inputs.py | 102 +++
.../vision_language/vlm_utils/model_utils.py | 338 ++++++++++
.../vision_language/vlm_utils/runners.py | 130 ++++
.../vision_language/vlm_utils/types.py | 187 ++++++
.../vision_language/test_llava_next.py | 2 +
.../vision_language/test_mllama.py | 2 +-
tests/utils.py | 24 +-
vllm/utils.py | 3 +-
38 files changed, 2381 insertions(+), 3096 deletions(-)
create mode 100644 tests/engine/test_short_mm_context.py
create mode 100644 tests/models/decoder_only/language/test_qwen.py
create mode 100644 tests/models/decoder_only/vision_language/mm_processor_kwargs/__init__.py
create mode 100644 tests/models/decoder_only/vision_language/mm_processor_kwargs/test_llava_next.py
create mode 100644 tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py
create mode 100644 tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py
rename tests/models/decoder_only/vision_language/{ => mm_processor_kwargs}/test_qwen2_vl.py (98%)
delete mode 100644 tests/models/decoder_only/vision_language/test_blip2.py
delete mode 100644 tests/models/decoder_only/vision_language/test_broadcast.py
delete mode 100644 tests/models/decoder_only/vision_language/test_chameleon.py
delete mode 100644 tests/models/decoder_only/vision_language/test_fuyu.py
delete mode 100644 tests/models/decoder_only/vision_language/test_glm4.py
delete mode 100644 tests/models/decoder_only/vision_language/test_llava.py
delete mode 100644 tests/models/decoder_only/vision_language/test_llava_image_embeds.py
delete mode 100644 tests/models/decoder_only/vision_language/test_llava_next.py
delete mode 100644 tests/models/decoder_only/vision_language/test_llava_next_video.py
delete mode 100644 tests/models/decoder_only/vision_language/test_llava_onevision.py
delete mode 100644 tests/models/decoder_only/vision_language/test_minicpmv.py
create mode 100644 tests/models/decoder_only/vision_language/test_models.py
delete mode 100644 tests/models/decoder_only/vision_language/test_paligemma.py
delete mode 100644 tests/models/decoder_only/vision_language/test_qwen.py
create mode 100644 tests/models/decoder_only/vision_language/vlm_utils/__init__.py
create mode 100644 tests/models/decoder_only/vision_language/vlm_utils/builders.py
create mode 100644 tests/models/decoder_only/vision_language/vlm_utils/case_filtering.py
create mode 100644 tests/models/decoder_only/vision_language/vlm_utils/core.py
create mode 100644 tests/models/decoder_only/vision_language/vlm_utils/custom_inputs.py
create mode 100644 tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
create mode 100644 tests/models/decoder_only/vision_language/vlm_utils/runners.py
create mode 100644 tests/models/decoder_only/vision_language/vlm_utils/types.py
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index ed847a7e3696b..32eed1a771718 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -338,7 +338,10 @@ steps:
- tests/models/decoder_only/vision_language
commands:
- pytest -v -s models/decoder_only/audio_language
- - pytest -v -s models/decoder_only/vision_language
+ # HACK - run phi3v tests separately to sidestep this transformers bug
+ # https://github.com/huggingface/transformers/issues/34307
+ - pytest -v -s models/decoder_only/vision_language/test_phi3v.py
+ - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language
- label: Other Models Test # 6min
#mirror_hardwares: [amd]
@@ -413,7 +416,7 @@ steps:
# Avoid importing model tests that cause CUDA reinitialization error
- pytest models/encoder_decoder/language/test_bart.py -v -s -m distributed_2_gpus
- pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
- - pytest models/decoder_only/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
+ - pytest models/decoder_only/vision_language/test_models.py -v -s -m distributed_2_gpus
- pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
- pip install -e ./plugins/vllm_add_dummy_model
- pytest -v -s distributed/test_distributed_oot.py
diff --git a/tests/conftest.py b/tests/conftest.py
index 2fce2d772c6ed..bdc6ffb148602 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -259,8 +259,7 @@ def __init__(
is_sentence_transformer: bool = False,
skip_tokenizer_init: bool = False,
auto_cls: Type[_BaseAutoModelClass] = AutoModelForCausalLM,
- postprocess_inputs: Callable[[BatchEncoding],
- BatchEncoding] = identity,
+ postprocess_inputs: Callable[..., BatchEncoding] = identity,
) -> None:
torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]
@@ -303,6 +302,7 @@ def __init__(
if skip_tokenizer_init:
self.tokenizer = self.processor.tokenizer
+ self.dtype = dtype
self.postprocess_inputs = postprocess_inputs
def get_inputs(
@@ -337,7 +337,7 @@ def get_inputs(
processor_kwargs["sampling_rate"] = sr
inputs = self.processor(**processor_kwargs)
- inputs = self.postprocess_inputs(inputs)
+ inputs = self.postprocess_inputs(inputs, dtype=self.dtype)
all_inputs.append(inputs)
diff --git a/tests/engine/test_short_mm_context.py b/tests/engine/test_short_mm_context.py
new file mode 100644
index 0000000000000..a6ba7a131c506
--- /dev/null
+++ b/tests/engine/test_short_mm_context.py
@@ -0,0 +1,29 @@
+import pytest
+
+from ..conftest import IMAGE_ASSETS
+
+HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
+ "stop_sign":
+ "USER: \nWhat's the content of the image?\nASSISTANT:",
+ "cherry_blossom":
+ "USER: \nWhat is the season?\nASSISTANT:",
+})
+
+models = ["llava-hf/llava-1.5-7b-hf"]
+
+
+@pytest.mark.parametrize("model", models)
+def test_context_length_too_short(vllm_runner, image_assets, model):
+ images = [asset.pil_image for asset in image_assets]
+
+ with pytest.raises(ValueError, match="too long to fit into the model"):
+ vllm_model = vllm_runner(
+ model,
+ max_model_len=128, # LLaVA has a feature size of 576
+ enforce_eager=True,
+ )
+
+ with vllm_model:
+ vllm_model.generate_greedy([HF_IMAGE_PROMPTS[0]],
+ max_tokens=1,
+ images=[images[0]])
diff --git a/tests/models/decoder_only/audio_language/test_ultravox.py b/tests/models/decoder_only/audio_language/test_ultravox.py
index bfffd34d1142c..ad6c2d854d1f0 100644
--- a/tests/models/decoder_only/audio_language/test_ultravox.py
+++ b/tests/models/decoder_only/audio_language/test_ultravox.py
@@ -92,7 +92,7 @@ def run_test(
for vllm_prompt, _, audio in prompts_and_audios
]
- def process(hf_inputs: BatchEncoding):
+ def process(hf_inputs: BatchEncoding, **kwargs):
hf_inputs["audio_values"] = hf_inputs["audio_values"] \
.to(torch_dtype) # type: ignore
return hf_inputs
diff --git a/tests/models/decoder_only/language/test_qwen.py b/tests/models/decoder_only/language/test_qwen.py
new file mode 100644
index 0000000000000..128fe65afbb84
--- /dev/null
+++ b/tests/models/decoder_only/language/test_qwen.py
@@ -0,0 +1,34 @@
+"""Ensure that a text-only Qwen model can be run without throwing an error.
+We explicitly test this because Qwen is implemented as a multimodal and
+supports a visual encoder for models like Qwen-VL.
+"""
+from typing import List, Type
+
+import pytest
+
+from ....conftest import VllmRunner
+
+models = [
+ "Qwen/Qwen-7B-Chat" # Has no visual encoder
+]
+
+
+@pytest.mark.parametrize("model", models)
+@pytest.mark.parametrize("dtype", ["bfloat16"])
+@pytest.mark.parametrize("max_tokens", [32])
+@pytest.mark.parametrize("num_logprobs", [5])
+def test_text_only_qwen_model_can_be_loaded_and_run(
+ vllm_runner: Type[VllmRunner],
+ example_prompts: List[str],
+ model: str,
+ *,
+ dtype: str,
+ max_tokens: int,
+ num_logprobs: int,
+):
+ with vllm_runner(model, dtype=dtype) as vllm_model:
+ vllm_model.generate_greedy_logprobs(
+ example_prompts,
+ max_tokens,
+ num_logprobs=num_logprobs,
+ )
diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/__init__.py b/tests/models/decoder_only/vision_language/mm_processor_kwargs/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_llava_next.py b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_llava_next.py
new file mode 100644
index 0000000000000..c2d3fda6994f6
--- /dev/null
+++ b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_llava_next.py
@@ -0,0 +1,68 @@
+import pytest
+
+from vllm.inputs import InputContext
+
+from ....utils import build_model_context
+
+
+@pytest.fixture()
+def get_max_llava_next_image_tokens():
+ from vllm.model_executor.models.llava_next import (
+ get_max_llava_next_image_tokens)
+ return get_max_llava_next_image_tokens
+
+
+@pytest.fixture()
+def dummy_data_for_llava_next():
+ from vllm.model_executor.models.llava_next import dummy_data_for_llava_next
+ return dummy_data_for_llava_next
+
+
+@pytest.mark.parametrize("gridpoints,expected_max_tokens", [
+ ([[336, 336]], 1176),
+ ([[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]], 2928),
+])
+def test_get_max_llava_next_image_tokens(gridpoints, expected_max_tokens,
+ get_max_llava_next_image_tokens):
+ ctx = build_model_context(model_name="llava-hf/llava-v1.6-mistral-7b-hf")
+
+ # Update the config image_grid_pinpoints
+ # and calculate the resulting max tokens
+ ctx.model_config.hf_config.image_grid_pinpoints = gridpoints
+
+ actual_max_tokens = get_max_llava_next_image_tokens(
+ InputContext(ctx.model_config))
+
+ assert expected_max_tokens == actual_max_tokens
+
+
+@pytest.mark.parametrize(
+ "gridpoints,expected_size",
+ [
+ # One point; it has to be the largest
+ ([[336, 336]], (336, 336)),
+ # Default for most llava next models; the 2x2 tile is the largest
+ ([[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]],
+ (672, 672)),
+ # If two rectangular gridpoints are the same, the more vertical
+ # one has the higher feature count due to newline features
+ ([[336, 672], [672, 336]], (672, 336))
+ ])
+def test_dummy_data_for_llava_next_feature_size(dummy_data_for_llava_next,
+ gridpoints, expected_size):
+ ctx = build_model_context(model_name="llava-hf/llava-v1.6-mistral-7b-hf")
+
+ # Update the config image_grid_pinpoints
+ ctx.model_config.hf_config.image_grid_pinpoints = gridpoints
+ seq_len = 5000 # bigger than the max feature size for any image
+
+ seq_data, mm_data = dummy_data_for_llava_next(
+ ctx,
+ seq_len=seq_len,
+ mm_counts={"image": 1},
+ )
+
+ # The dummy data dims should match the gridpoint with the biggest feat size
+ assert mm_data["image"].height == expected_size[0]
+ assert mm_data["image"].width == expected_size[1]
+ assert len(seq_data.get_token_ids()) >= seq_len
diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py
new file mode 100644
index 0000000000000..d6a7b34fdde9f
--- /dev/null
+++ b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py
@@ -0,0 +1,181 @@
+"""Tests for phi3v's multimodal preprocessing kwargs."""
+from typing import Optional
+
+import pytest
+import torch
+from transformers import AutoImageProcessor, AutoTokenizer
+
+from vllm.inputs import InputContext, token_inputs
+from vllm.model_executor.models.phi3v import _IMAGE_TOKEN_ID
+from vllm.multimodal import MultiModalRegistry
+
+from .....conftest import _ImageAssets
+from ....utils import build_model_context
+
+models = ["microsoft/Phi-3.5-vision-instruct"]
+
+
+# Wrap lazy imports to avoid initializing CUDA during test collection
+@pytest.fixture()
+def input_processor_for_phi3v():
+ from vllm.model_executor.models.phi3v import input_processor_for_phi3v
+ return input_processor_for_phi3v
+
+
+@pytest.fixture()
+def dummy_data_for_phi3v():
+ from vllm.model_executor.models.phi3v import dummy_data_for_phi3v
+ return dummy_data_for_phi3v
+
+
+@pytest.fixture()
+def get_max_phi3v_image_tokens():
+ from vllm.model_executor.models.phi3v import get_max_phi3v_image_tokens
+ return get_max_phi3v_image_tokens
+
+
+@pytest.mark.parametrize("model", models)
+@pytest.mark.parametrize("num_crops", [4, 16, None])
+def test_input_mapper_override(model: str, image_assets: _ImageAssets,
+ num_crops: Optional[int]):
+ """Ensure that the [default] input mapper handles num_crops properly."""
+ # We pass the processor kwargs here since for this model, we fall back to
+ # the default mapper; this will fall back to the HF mapper and forward
+ # mm_processor_kwargs to it.
+ mm_processor_kwargs = {
+ "num_crops": num_crops
+ } if num_crops is not None else {}
+ ctx = build_model_context(
+ model_name=model,
+ tokenizer_name=model,
+ trust_remote_code=True,
+ mm_processor_kwargs=mm_processor_kwargs,
+ )
+
+ hf_processor = AutoImageProcessor.from_pretrained(model,
+ trust_remote_code=True,
+ **mm_processor_kwargs)
+
+ mm_registry = MultiModalRegistry()
+ mm_registry.init_mm_limits_per_prompt(ctx.model_config)
+
+ image = image_assets[0].pil_image
+ hf_result = hf_processor.preprocess(
+ image,
+ return_tensors="pt",
+ )
+
+ vllm_result = mm_registry.map_input(
+ ctx.model_config,
+ {"image": image},
+ )
+
+ assert torch.all(hf_result["image_sizes"] == vllm_result["image_sizes"])
+ assert torch.all(
+ hf_result["num_img_tokens"] == vllm_result["num_img_tokens"])
+
+ # For pixel values, the second axis should be num_crops + 1, where the
+ # extra entry is the rescaled original image. The default value in vLLM
+ # falls back to the HF config, so we compare to the processor's num_crops
+ assert torch.all(hf_result["pixel_values"] == vllm_result["pixel_values"])
+ assert vllm_result["pixel_values"].shape[1] == hf_processor.num_crops + 1
+
+
+@pytest.mark.parametrize("model", models)
+@pytest.mark.parametrize("num_crops,expected_max_tokens", [
+ (4, 781),
+ (16, 2653),
+])
+def test_max_tokens_override(get_max_phi3v_image_tokens, model: str,
+ num_crops: int, expected_max_tokens: int):
+ """Ensure get_max_phi3v_image_tokens handles num_crops properly."""
+ # NOTE: mm_processor_kwargs on the context in this test is unused, since
+ # this is testing the max tokens func directly. In practice, the processor
+ # kwargs are wrapped in a closure when calling the max tokens func. We
+ # explicitly do NOT set mm_processor_kwargs in the model context here to
+ # ensure that the max image tokens implementation is not referencing a mix
+ # of the kwargs to the function and the original mm_processor_kwargs in
+ # case values are somehow updated and end up in a bad state.
+ ctx = build_model_context(
+ model_name=model,
+ tokenizer_name=model,
+ trust_remote_code=True,
+ mm_processor_kwargs=None,
+ )
+
+ actual_max_tokens = get_max_phi3v_image_tokens(
+ InputContext(ctx.model_config),
+ num_crops=num_crops,
+ )
+
+ assert expected_max_tokens == actual_max_tokens
+
+
+@pytest.mark.parametrize("model", models)
+@pytest.mark.parametrize("num_crops,toks_per_img,num_imgs", [
+ (4, 781, 1),
+ (4, 781, 2),
+ (16, 2653, 1),
+ (16, 2653, 2),
+])
+def test_dummy_data_override(dummy_data_for_phi3v, model: str, num_crops: int,
+ toks_per_img: int, num_imgs: int):
+ """Ensure dummy_data_for_phi3v handles num_crops properly."""
+ # Same as the previous test - don't initialize mm_processor_kwargs
+ # in this test and assume that the kwargs will be correctly expanded by
+ # the partial when calling the dummy data func.
+ ctx = build_model_context(
+ model_name=model,
+ tokenizer_name=model,
+ trust_remote_code=True,
+ mm_processor_kwargs=None,
+ )
+
+ sequence_data, _, = dummy_data_for_phi3v(
+ ctx=ctx,
+ seq_len=8192, # Should be bigger than num_imgs * toks_per_img
+ mm_counts={"image": num_imgs},
+ num_crops=num_crops,
+ )
+ # Ensure we have the right number of placeholders per num_crops size
+ img_tok_count = sequence_data.get_token_ids().count(_IMAGE_TOKEN_ID)
+ assert img_tok_count == toks_per_img * num_imgs
+
+
+@pytest.mark.parametrize("model", models)
+@pytest.mark.parametrize("num_crops,expected_toks_per_img,num_imgs", [
+ (4, 757, 1),
+ (4, 757, 2),
+ (16, 1921, 1),
+ (16, 1921, 2),
+])
+def test_input_processor_override(input_processor_for_phi3v,
+ image_assets: _ImageAssets, model: str,
+ num_crops: int, expected_toks_per_img: int,
+ num_imgs: int):
+ """Ensure input_processor_for_phi3v handles num_crops properly."""
+ # Same as the previous test - don't initialize mm_processor_kwargs
+ # in this test and assume that the kwargs will be correctly expanded by
+ # the partial when calling the custom input processor.
+ ctx = build_model_context(
+ model_name=model,
+ tokenizer_name=model,
+ trust_remote_code=True,
+ )
+ tokenizer = AutoTokenizer.from_pretrained(model)
+ # Build the image str / prompt based on the number of images we pass
+ img_str = "".join([f"<|image_{idx}|>\n" for idx in range(1, num_imgs + 1)])
+ prompt = f"<|user|>\n{img_str}<|end|>\n<|assistant|>\n"
+ images = [image_assets[0].pil_image] * num_imgs
+
+ inputs = token_inputs(prompt_token_ids=tokenizer.encode(prompt),
+ prompt=prompt,
+ multi_modal_data={"image": images})
+
+ processed_inputs = input_processor_for_phi3v(ctx,
+ inputs,
+ num_crops=num_crops)
+
+ # Ensure we have the right number of placeholders per num_crops size
+ img_tok_count = processed_inputs["prompt_token_ids"].count(_IMAGE_TOKEN_ID)
+ assert img_tok_count == expected_toks_per_img * num_imgs
diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py
new file mode 100644
index 0000000000000..a01651b171d60
--- /dev/null
+++ b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py
@@ -0,0 +1,144 @@
+"""Tests for Qwen's multimodal preprocessing kwargs."""
+from typing import Dict, List, Union
+
+import pytest
+import torch
+from PIL.Image import Image
+
+from vllm.inputs import InputContext, token_inputs
+from vllm.multimodal.base import MultiModalInputs
+from vllm.multimodal.utils import cached_get_tokenizer
+
+from .....conftest import IMAGE_ASSETS
+from ....utils import build_model_context
+
+### Multimodal preprocessing tests
+SAMPLE_IMAGE = IMAGE_ASSETS[0].pil_image
+# These values are specific to Qwen-VL/Chat; they could also be read from the
+# model config, but are hardcoded here to keep the parametrization/fixtures
+# easy to read.
+IMG_START_ID = 151857
+IMG_END_ID = 151858
+IMG_PAD_ID = 151859
+TOKS_PER_IMG = 256
+VIS_ENC_DIM = 4096
+IMG_SIZE = 448
+
+
+@pytest.fixture()
+def input_mapper_for_qwen():
+ # Lazy import to avoid initializing CUDA during test collection
+ from vllm.model_executor.models.qwen import input_mapper_for_qwen
+ return input_mapper_for_qwen
+
+
+@pytest.fixture()
+def input_processor_for_qwen():
+ # Lazy import to avoid initializing CUDA during test collection
+ from vllm.model_executor.models.qwen import input_processor_for_qwen
+ return input_processor_for_qwen
+
+
+@pytest.fixture()
+def qwen_vl_context() -> InputContext:
+ """Get an InputContext for Qwen-VL."""
+ return build_model_context(model_name="Qwen/Qwen-VL",
+ trust_remote_code=True)
+
+
+# Happy path tests for single/multi-image scenarios for the multimodal
+# input processor and mapper, respectively
+@pytest.mark.parametrize("num_images", [1, 2])
+def test_input_processor_valid_mm_data(input_processor_for_qwen,
+ qwen_vl_context: InputContext,
+ num_images: int):
+ """Happy cases for image inputs to Qwen's multimodal input processor."""
+ prompt = "".join(
+ [f"Picture {num}: \n" for num in range(1, num_images + 1)])
+ inputs = token_inputs(
+ prompt=prompt,
+ # When processing multimodal data for a multimodal model, the qwen
+ # input processor will overwrite the provided prompt_token_ids with
+ # the image prompts
+ prompt_token_ids=[],
+ multi_modal_data={"image": torch.rand(num_images, TOKS_PER_IMG, 4096)},
+ )
+ proc_inputs = input_processor_for_qwen(qwen_vl_context, inputs)
+ assert isinstance(proc_inputs, dict)
+
+ # Each image should have one start / stop and a fixed context of 256
+ proc_tokens = proc_inputs["prompt_token_ids"]
+ assert proc_tokens.count(IMG_START_ID) == num_images
+ assert proc_tokens.count(IMG_END_ID) == num_images
+ assert proc_tokens.count(IMG_PAD_ID) == num_images * TOKS_PER_IMG
+
+
+@pytest.mark.parametrize(
+ "img_data,expected_shape",
+ [
+ # single / multi-image
+ (SAMPLE_IMAGE, (1, 3, IMG_SIZE, IMG_SIZE)),
+ (2 * [SAMPLE_IMAGE], (2, 3, IMG_SIZE, IMG_SIZE)),
+ # single / multi-image embeddings
+ (torch.rand(
+ (TOKS_PER_IMG, VIS_ENC_DIM)), (1, TOKS_PER_IMG, VIS_ENC_DIM)),
+ (torch.rand(
+ (1, TOKS_PER_IMG, VIS_ENC_DIM)), (1, TOKS_PER_IMG, VIS_ENC_DIM)),
+ (torch.rand(
+ (2, TOKS_PER_IMG, VIS_ENC_DIM)), (2, TOKS_PER_IMG, VIS_ENC_DIM)),
+ ])
+def test_input_mapper_valid_mm_data(input_mapper_for_qwen,
+ qwen_vl_context: InputContext,
+ img_data: Union[torch.Tensor, List[Image],
+ Image],
+ expected_shape: List[int]):
+ """Happy cases for image inputs to Qwen's multimodal input mapper."""
+ mapped_img_data = input_mapper_for_qwen(qwen_vl_context, img_data)
+ # Ensure that we get the appropriately shaped pixel_values
+ # for images and image embeddings, respectively.
+ assert isinstance(mapped_img_data, MultiModalInputs)
+ assert "pixel_values" in mapped_img_data
+ assert mapped_img_data["pixel_values"].shape == expected_shape
+
+
+# Sad path tests for the multimodal input processor and mapper, respectively
+@pytest.mark.parametrize("mm_data", [
+ {
+ "image": torch.rand((5))
+ },
+ {
+ "image": torch.rand((5, 5, 5, 5, 5))
+ },
+])
+def test_input_processor_invalid_mm_data(input_processor_for_qwen,
+ qwen_vl_context: InputContext,
+ mm_data: Dict[str, torch.Tensor]):
+ """Test sad cases validated in Qwen's multimodal input processor."""
+ tokenizer = cached_get_tokenizer(qwen_vl_context.model_config.tokenizer,
+ trust_remote_code=True)
+ prompt = "Picture 1: \n"
+ prompt_token_ids = tokenizer.encode(prompt)
+ inputs = token_inputs(prompt=prompt,
+ prompt_token_ids=prompt_token_ids,
+ multi_modal_data=mm_data)
+ # Should fail since we have too many or too few dimensions for embeddings
+ with pytest.raises(ValueError):
+ input_processor_for_qwen(qwen_vl_context, inputs)
+
+
+@pytest.mark.parametrize(
+ "img_data",
+ [
+ # Wrong context length
+ torch.rand((1, TOKS_PER_IMG + 10, VIS_ENC_DIM)),
+ # Wrong visual encoder output size
+ torch.rand((1, TOKS_PER_IMG, VIS_ENC_DIM + 10)),
+ ])
+def test_input_mapper_invalid_mm_data(
+ input_mapper_for_qwen,
+ qwen_vl_context: InputContext,
+ img_data: Union[torch.Tensor, List[Image], Image],
+):
+ """Sad cases validated in Qwen VL's multimodal input mapper."""
+ with pytest.raises(ValueError):
+ input_mapper_for_qwen(qwen_vl_context, img_data)
diff --git a/tests/models/decoder_only/vision_language/test_qwen2_vl.py b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py
similarity index 98%
rename from tests/models/decoder_only/vision_language/test_qwen2_vl.py
rename to tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py
index d3de5fb26d4b8..5c90e7f7a267c 100644
--- a/tests/models/decoder_only/vision_language/test_qwen2_vl.py
+++ b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py
@@ -8,8 +8,8 @@
from vllm.inputs import InputContext, token_inputs
from vllm.multimodal import MultiModalRegistry
-from ....conftest import _ImageAssets
-from ...utils import build_model_context
+from .....conftest import _ImageAssets
+from ....utils import build_model_context
MODEL = "Qwen/Qwen2-VL-2B-Instruct"
MIN_PIXELS = "min_pixels"
diff --git a/tests/models/decoder_only/vision_language/test_blip2.py b/tests/models/decoder_only/vision_language/test_blip2.py
deleted file mode 100644
index e1e32b96d89ac..0000000000000
--- a/tests/models/decoder_only/vision_language/test_blip2.py
+++ /dev/null
@@ -1,101 +0,0 @@
-from typing import List, Optional, Tuple
-
-import pytest
-from transformers import AutoModelForVision2Seq, AutoTokenizer
-
-from vllm.multimodal.utils import rescale_image_size
-from vllm.sequence import SampleLogprobs
-
-from ....conftest import IMAGE_ASSETS
-from ...utils import check_logprobs_close
-
-HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
- "stop_sign":
- "Question: What's the content of the image? Answer:",
- "cherry_blossom":
- "Question: What is the season? Answer:",
-})
-
-
-def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
- Optional[SampleLogprobs]],
- model: str):
- """Sanitize vllm output to be comparable with hf output."""
- _, output_str, out_logprobs = vllm_output
-
- hf_output_str = output_str + "\n"
-
- tokenizer = AutoTokenizer.from_pretrained(model)
- hf_output_ids = tokenizer.encode(hf_output_str)
- assert hf_output_ids[0] == tokenizer.bos_token_id
- hf_output_ids = hf_output_ids[1:]
-
- return hf_output_ids, hf_output_str, out_logprobs
-
-
-@pytest.mark.parametrize("model", ["Salesforce/blip2-opt-2.7b"])
-@pytest.mark.parametrize(
- "size_factors",
- [
- # No image
- [],
- # Single-scale
- [1.0],
- # Single-scale, batched
- [1.0, 1.0, 1.0],
- # Multi-scale
- [0.25, 0.5, 1.0],
- ],
-)
-@pytest.mark.parametrize("dtype", ["half"])
-@pytest.mark.parametrize("max_tokens", [128])
-@pytest.mark.parametrize("num_logprobs", [5])
-def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
- dtype: str, max_tokens: int, num_logprobs: int) -> None:
- """Inference result should be the same between hf and vllm.
-
- All the image fixtures for the test are from IMAGE_ASSETS.
- For huggingface runner, we provide the PIL images as input.
- For vllm runner, we provide MultiModalData objects and corresponding
- MultiModalConfig as input.
- Note, the text input is also adjusted to abide by vllm contract.
- The text output is sanitized to be able to compare with hf.
- """
- images = [asset.pil_image for asset in image_assets]
-
- inputs_per_image = [(
- [prompt for _ in size_factors],
- [rescale_image_size(image, factor) for factor in size_factors],
- ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
-
- # max_model_len should be greater than image_feature_size
- with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model:
- vllm_outputs_per_image = [
- vllm_model.generate_greedy_logprobs(prompts,
- max_tokens,
- num_logprobs=num_logprobs,
- images=images)
- for prompts, images in inputs_per_image
- ]
-
- with hf_runner(model, dtype=dtype,
- auto_cls=AutoModelForVision2Seq) as hf_model:
- hf_outputs_per_image = [
- hf_model.generate_greedy_logprobs_limit(prompts,
- max_tokens,
- num_logprobs=num_logprobs,
- images=images)
- for prompts, images in inputs_per_image
- ]
-
- for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
- vllm_outputs_per_image):
- check_logprobs_close(
- outputs_0_lst=hf_outputs,
- outputs_1_lst=[
- vllm_to_hf_output(vllm_output, model)
- for vllm_output in vllm_outputs
- ],
- name_0="hf",
- name_1="vllm",
- )
diff --git a/tests/models/decoder_only/vision_language/test_broadcast.py b/tests/models/decoder_only/vision_language/test_broadcast.py
deleted file mode 100644
index 38c4a95de16f4..0000000000000
--- a/tests/models/decoder_only/vision_language/test_broadcast.py
+++ /dev/null
@@ -1,46 +0,0 @@
-import pytest
-import transformers
-
-from ....utils import multi_gpu_test
-
-
-@multi_gpu_test(num_gpus=2)
-@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
-@pytest.mark.parametrize("model", [
- "llava-hf/llava-1.5-7b-hf",
- "llava-hf/llava-v1.6-mistral-7b-hf",
- "facebook/chameleon-7b",
-])
-def test_models(hf_runner, vllm_runner, image_assets,
- distributed_executor_backend, model) -> None:
-
- dtype = "half"
- max_tokens = 5
- num_logprobs = 5
- tensor_parallel_size = 2
-
- if model.startswith("llava-hf/llava-1.5"):
- from .test_llava import models, run_test
- elif model.startswith("llava-hf/llava-v1.6"):
- from .test_llava_next import models, run_test # type: ignore[no-redef]
- elif model.startswith("facebook/chameleon"):
- if transformers.__version__.startswith("4.46"):
- pytest.skip("Model broken in HF, "
- "see huggingface/transformers#34379")
- from .test_chameleon import models, run_test # type: ignore[no-redef]
- else:
- raise NotImplementedError(f"Unsupported model: {model}")
-
- run_test(
- hf_runner,
- vllm_runner,
- image_assets,
- model=models[0],
- # So that LLaVA-NeXT processor may return nested list
- size_factors=[0.25, 0.5, 1.0],
- dtype=dtype,
- max_tokens=max_tokens,
- num_logprobs=num_logprobs,
- tensor_parallel_size=tensor_parallel_size,
- distributed_executor_backend=distributed_executor_backend,
- )
diff --git a/tests/models/decoder_only/vision_language/test_chameleon.py b/tests/models/decoder_only/vision_language/test_chameleon.py
deleted file mode 100644
index 4bd678b9f21c4..0000000000000
--- a/tests/models/decoder_only/vision_language/test_chameleon.py
+++ /dev/null
@@ -1,130 +0,0 @@
-from typing import List, Optional, Type
-
-import pytest
-import transformers
-from transformers import AutoModelForVision2Seq, BatchEncoding
-
-from vllm.multimodal.utils import rescale_image_size
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
-
-from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
-from ...utils import check_outputs_equal
-
-HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
- "stop_sign":
- "USER: \nWhat's the content of the image?\nASSISTANT:",
- "cherry_blossom":
- "USER: \nWhat is the season?\nASSISTANT:",
-})
-
-models = ["facebook/chameleon-7b"]
-
-
-def run_test(
- hf_runner: Type[HfRunner],
- vllm_runner: Type[VllmRunner],
- image_assets: _ImageAssets,
- model: str,
- *,
- size_factors: List[float],
- dtype: str,
- max_tokens: int,
- num_logprobs: int,
- tensor_parallel_size: int,
- distributed_executor_backend: Optional[str] = None,
-):
- """Inference result should be the same between hf and vllm.
-
- All the image fixtures for the test are from IMAGE_ASSETS.
- For huggingface runner, we provide the PIL images as input.
- For vllm runner, we provide MultiModalDataDict objects
- and corresponding vision language config as input.
- Note, the text input is also adjusted to abide by vllm contract.
- The text output is sanitized to be able to compare with hf.
- """
- torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]
- images = [asset.pil_image for asset in image_assets]
-
- inputs_per_image = [(
- [prompt for _ in size_factors],
- [rescale_image_size(image, factor) for factor in size_factors],
- ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
-
- with vllm_runner(model,
- max_model_len=4096,
- dtype=dtype,
- tensor_parallel_size=tensor_parallel_size,
- distributed_executor_backend=distributed_executor_backend,
- enforce_eager=True) as vllm_model:
-
- vllm_outputs_per_image = [
- vllm_model.generate_greedy_logprobs(prompts,
- max_tokens,
- num_logprobs=num_logprobs,
- images=images)
- for prompts, images in inputs_per_image
- ]
-
- def process(hf_inputs: BatchEncoding):
- hf_inputs["pixel_values"] = hf_inputs["pixel_values"] \
- .to(torch_dtype) # type: ignore
- return hf_inputs
-
- with hf_runner(model,
- dtype=dtype,
- postprocess_inputs=process,
- auto_cls=AutoModelForVision2Seq) as hf_model:
- hf_outputs_per_image = [
- hf_model.generate_greedy_logprobs_limit(prompts,
- max_tokens,
- num_logprobs=num_logprobs,
- images=images)
- for prompts, images in inputs_per_image
- ]
-
- for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
- vllm_outputs_per_image):
- # HF Logprobs include image tokens, unlike vLLM, so we don't directly
- # compare them
- check_outputs_equal(
- outputs_0_lst=[outputs[:2] for outputs in hf_outputs],
- outputs_1_lst=[outputs[:2] for outputs in vllm_outputs],
- name_0="hf",
- name_1="vllm",
- )
-
-
-@pytest.mark.skipif(
- transformers.__version__.startswith("4.46.0"),
- reason="Model broken in HF, see huggingface/transformers#34379",
-)
-@pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize(
- "size_factors",
- [
- # No image
- [],
- # Single-scale
- [1.0],
- # Single-scale, batched
- [1.0, 1.0, 1.0],
- # Multi-scale
- [0.25, 0.5, 1.0],
- ],
-)
-@pytest.mark.parametrize("dtype", ["bfloat16"])
-@pytest.mark.parametrize("max_tokens", [8])
-@pytest.mark.parametrize("num_logprobs", [5])
-def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
- dtype, max_tokens, num_logprobs) -> None:
- run_test(
- hf_runner,
- vllm_runner,
- image_assets,
- model,
- size_factors=size_factors,
- dtype=dtype,
- max_tokens=max_tokens,
- num_logprobs=num_logprobs,
- tensor_parallel_size=1,
- )
diff --git a/tests/models/decoder_only/vision_language/test_fuyu.py b/tests/models/decoder_only/vision_language/test_fuyu.py
deleted file mode 100644
index 1affcd10ee72d..0000000000000
--- a/tests/models/decoder_only/vision_language/test_fuyu.py
+++ /dev/null
@@ -1,139 +0,0 @@
-from typing import List, Optional, Tuple, Type
-
-import pytest
-
-from vllm.multimodal.utils import rescale_image_size
-from vllm.platforms import current_platform
-from vllm.sequence import SampleLogprobs
-
-from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
-from ...utils import check_logprobs_close
-
-HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
- "stop_sign":
- "What's the content of the image?\n",
- "cherry_blossom":
- "What is the season?\n",
-})
-
-models = ["adept/fuyu-8b"]
-
-
-def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
- Optional[SampleLogprobs]]):
- """Sanitize vllm output to be comparable with hf output."""
- output_ids, output_str, out_logprobs = vllm_output
-
- hf_output_str = output_str.lstrip() + "|ENDOFTEXT|"
-
- return output_ids, hf_output_str, out_logprobs
-
-
-def run_test(
- hf_runner: Type[HfRunner],
- vllm_runner: Type[VllmRunner],
- image_assets: _ImageAssets,
- model: str,
- *,
- size_factors: List[float],
- dtype: str,
- max_tokens: int,
- num_logprobs: int,
- tensor_parallel_size: int,
- distributed_executor_backend: Optional[str] = None,
-):
- """Inference result should be the same between hf and vllm.
-
- All the image fixtures for the test are from IMAGE_ASSETS.
- For huggingface runner, we provide the PIL images as input.
- For vllm runner, we provide MultiModalDataDict objects
- and corresponding MultiModalConfig as input.
- Note, the text input is also adjusted to abide by vllm contract.
- The text output is sanitized to be able to compare with hf.
- """
- images = [asset.pil_image for asset in image_assets]
-
- inputs_per_image = [(
- [prompt for _ in size_factors],
- [rescale_image_size(image, factor) for factor in size_factors],
- ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
-
- # NOTE: take care of the order. run vLLM first, and then run HF.
- # vLLM needs a fresh new process without cuda initialization.
- # if we run HF first, the cuda initialization will be done and it
- # will hurt multiprocessing backend with fork method (the default method).
-
- # max_model_len should be greater than image_feature_size
- with vllm_runner(model,
- max_model_len=2048,
- max_num_seqs=2,
- dtype=dtype,
- tensor_parallel_size=tensor_parallel_size,
- distributed_executor_backend=distributed_executor_backend,
- enforce_eager=True) as vllm_model:
- vllm_outputs_per_image = [
- vllm_model.generate_greedy_logprobs(prompts,
- max_tokens,
- num_logprobs=num_logprobs,
- images=images)
- for prompts, images in inputs_per_image
- ]
-
- with hf_runner(model, dtype=dtype) as hf_model:
- eos_token_id = hf_model.processor.tokenizer.eos_token_id
- hf_outputs_per_image = [
- hf_model.generate_greedy_logprobs_limit(prompts,
- max_tokens,
- num_logprobs=num_logprobs,
- images=images,
- eos_token_id=eos_token_id)
- for prompts, images in inputs_per_image
- ]
-
- for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
- vllm_outputs_per_image):
- check_logprobs_close(
- outputs_0_lst=hf_outputs,
- outputs_1_lst=[
- vllm_to_hf_output(vllm_output) for vllm_output in vllm_outputs
- ],
- name_0="hf",
- name_1="vllm",
- )
-
-
-target_dtype = "half"
-if current_platform.is_cpu():
- target_dtype = "bfloat16"
-
-
-@pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize(
- "size_factors",
- [
- # No image
- [],
- # Single-scale
- [0.25],
- # Single-scale, batched
- [0.25, 0.25, 0.25],
- # Multi-scale
- [0.25, 0.2, 0.15],
- ],
-)
-@pytest.mark.parametrize("dtype", [target_dtype])
-@pytest.mark.parametrize("max_tokens", [128])
-@pytest.mark.parametrize("num_logprobs", [10])
-def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
- dtype: str, max_tokens: int, num_logprobs: int) -> None:
- run_test(
- hf_runner,
- vllm_runner,
- image_assets,
- model,
- size_factors=size_factors,
- dtype=dtype,
- max_tokens=max_tokens,
- num_logprobs=num_logprobs,
- tensor_parallel_size=1,
- )
diff --git a/tests/models/decoder_only/vision_language/test_glm4.py b/tests/models/decoder_only/vision_language/test_glm4.py
deleted file mode 100644
index 47922a57f680b..0000000000000
--- a/tests/models/decoder_only/vision_language/test_glm4.py
+++ /dev/null
@@ -1,133 +0,0 @@
-from typing import List, Optional, Tuple, Type
-
-import pytest
-
-from vllm.multimodal.utils import rescale_image_size
-from vllm.transformers_utils.tokenizer import patch_padding_side
-
-from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
-from ....utils import large_gpu_test
-from ...utils import check_logprobs_close
-
-HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
- "stop_sign":
- "What's the content of the image?",
- "cherry_blossom":
- "What is the season?",
-})
-
-models = ["THUDM/glm-4v-9b"]
-target_dtype = "bfloat16"
-
-
-def run_test(
- hf_runner: Type[HfRunner],
- vllm_runner: Type[VllmRunner],
- inputs: List[Tuple[List[str], PromptImageInput]],
- model: str,
- *,
- dtype: str,
- max_tokens: int,
- num_logprobs: int,
- mm_limit: int,
- tensor_parallel_size: int,
- distributed_executor_backend: Optional[str] = None,
-):
- # max_model_len should be greater than image_feature_size
- with vllm_runner(model,
- max_model_len=2048,
- max_num_seqs=2,
- dtype=dtype,
- limit_mm_per_prompt={"image": mm_limit},
- tensor_parallel_size=tensor_parallel_size,
- distributed_executor_backend=distributed_executor_backend,
- enforce_eager=True) as vllm_model:
- stop_token_ids = [151329, 151336, 151338]
- vllm_outputs_per_image = [
- vllm_model.generate_greedy_logprobs(prompts,
- max_tokens,
- num_logprobs=num_logprobs,
- images=images,
- stop_token_ids=stop_token_ids)
- for prompts, images in inputs
- ]
-
- with hf_runner(model, dtype=dtype) as hf_model:
- hf_processor = hf_model.processor
- patch_padding_side(hf_processor)
-
- def processor(*args, text="", images=None, **kwargs):
- if images is None:
- return hf_processor(*args, **kwargs)
-
- return hf_processor.apply_chat_template(
- [{
- "role": "user",
- "image": images,
- "content": text
- }],
- add_generation_prompt=True,
- tokenize=True,
- return_dict=True,
- **kwargs,
- )
-
- hf_model.processor = processor
- hf_model.model.get_output_embeddings = lambda: \
- hf_model.model.transformer.output_layer
- hf_outputs_per_image = [
- hf_model.generate_greedy_logprobs_limit(
- prompts,
- max_tokens,
- num_logprobs=num_logprobs,
- images=images,
- ) for prompts, images in inputs
- ]
-
- for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
- vllm_outputs_per_image):
- check_logprobs_close(
- outputs_0_lst=hf_outputs,
- outputs_1_lst=vllm_outputs,
- name_0="hf",
- name_1="vllm",
- )
-
-
-@large_gpu_test(min_gb=48)
-@pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize(
- "size_factors",
- [
- # No image
- [],
- # Single-scale
- [1.0],
- # Single-scale, batched
- [1.0, 1.0, 1.0],
- # Multi-scale
- [0.25, 0.5, 1.0],
- ],
-)
-@pytest.mark.parametrize("dtype", [target_dtype])
-@pytest.mark.parametrize("max_tokens", [128])
-@pytest.mark.parametrize("num_logprobs", [5])
-def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
- dtype: str, max_tokens: int, num_logprobs: int) -> None:
- images = [asset.pil_image for asset in image_assets]
-
- inputs_per_image = [(
- [prompt for _ in size_factors],
- [rescale_image_size(image, factor) for factor in size_factors],
- ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
- run_test(
- hf_runner,
- vllm_runner,
- inputs_per_image,
- model,
- dtype=dtype,
- max_tokens=max_tokens,
- num_logprobs=num_logprobs,
- mm_limit=1,
- tensor_parallel_size=1,
- )
diff --git a/tests/models/decoder_only/vision_language/test_internvl.py b/tests/models/decoder_only/vision_language/test_internvl.py
index fc842ec4a6171..2fd1ac4bb08f7 100644
--- a/tests/models/decoder_only/vision_language/test_internvl.py
+++ b/tests/models/decoder_only/vision_language/test_internvl.py
@@ -1,15 +1,11 @@
-import types
-from typing import List, Optional, Tuple, Type, Union
+from typing import List, Optional, Tuple, Type
import pytest
import torch
-from PIL.Image import Image
-from transformers import AutoConfig
from vllm.multimodal.utils import rescale_image_size
-from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
- _ImageAssets)
+from ....conftest import IMAGE_ASSETS, VllmRunner, _ImageAssets
from ...utils import check_logprobs_close
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
@@ -18,171 +14,6 @@
"cherry_blossom":
"<|im_start|>User\n\nWhat is the season?<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
})
-HF_MULTIIMAGE_IMAGE_PROMPT = "<|im_start|>User\nImage-1: \nImage-2: \nDescribe the two images in short.<|im_end|>\n<|im_start|>Assistant\n" # noqa: E501
-
-models = [
- "OpenGVLab/InternVL2-1B",
- "OpenGVLab/InternVL2-2B",
- # NOTE: Mono-InternVL-2B doesn't work with fp16,
- # it will result NaN during inference.
- # See: https://huggingface.co/OpenGVLab/Mono-InternVL-2B/discussions/9
- "OpenGVLab/Mono-InternVL-2B",
- # Broken due to outdated implementation of Phi-3
- # See: https://huggingface.co/OpenGVLab/InternVL2-4B/discussions/3
- # "OpenGVLab/InternVL2-4B",
-]
-target_dtype = "bfloat16"
-
-
-# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/modeling_internvl_chat.py
-def generate(
- self,
- pixel_values: torch.FloatTensor,
- input_ids: torch.FloatTensor,
- attention_mask: Optional[torch.LongTensor] = None,
- **generate_kwargs,
-) -> torch.LongTensor:
- """Generate method for InternVL2 model without fixed use_cache."""
- assert self.img_context_token_id is not None
- vit_embeds = self.extract_feature(pixel_values)
- input_embeds = self.language_model.get_input_embeddings()(input_ids)
- B, N, C = input_embeds.shape
- input_embeds = input_embeds.reshape(B * N, C)
-
- input_ids = input_ids.reshape(B * N)
- selected = (input_ids == self.img_context_token_id)
- assert selected.sum() != 0
- input_embeds[selected] = vit_embeds.reshape(-1, C).to(input_embeds.device)
-
- input_embeds = input_embeds.reshape(B, N, C)
-
- forward_kwargs = dict(
- inputs_embeds=input_embeds,
- attention_mask=attention_mask,
- )
- if getattr(self, "use_visual_token_mask", False):
- visual_token_mask = selected.reshape(B, N, 1).to(input_embeds.dtype)
- forward_kwargs["visual_token_mask"] = visual_token_mask
- outputs = self.language_model.generate(
- **forward_kwargs,
- **generate_kwargs,
- )
-
- return outputs
-
-
-def run_test(
- hf_runner: Type[HfRunner],
- vllm_runner: Type[VllmRunner],
- inputs: List[Tuple[List[str], PromptImageInput]],
- model: str,
- *,
- dtype: str,
- max_tokens: int,
- num_logprobs: int,
- mm_limit: int,
- tensor_parallel_size: int,
- distributed_executor_backend: Optional[str] = None,
-):
- """Inference result should be the same between hf and vllm.
-
- All the image fixtures for the test are from IMAGE_ASSETS.
- For huggingface runner, we provide the PIL images as input.
- For vllm runner, we provide MultiModalDataDict objects
- and corresponding MultiModalConfig as input.
- Note, the text input is also adjusted to abide by vllm contract.
- The text output is sanitized to be able to compare with hf.
- """
-
- # NOTE: take care of the order. run vLLM first, and then run HF.
- # vLLM needs a fresh new process without cuda initialization.
- # if we run HF first, the cuda initialization will be done and it
- # will hurt multiprocessing backend with fork method (the default method).
-
- class InternVLProcessor:
- """A simple processor for InternVL2 which misses a processor."""
-
- def __init__(self, hf_runner: HfRunner):
- self.num_image_token = hf_runner.model.num_image_token
- self.tokenizer = hf_runner.tokenizer
- self.dtype = hf_runner.model.dtype
-
- self.config = AutoConfig.from_pretrained(hf_runner.model_name,
- trust_remote_code=True)
- self.vision_config = self.config.vision_config
- self.use_thumbnail = self.config.use_thumbnail
- self.min_num = self.config.min_dynamic_patch
- self.max_num = self.config.max_dynamic_patch
- self.image_size = self.vision_config.image_size
-
- def __call__(self, text: str, images: Union[Image, List[Image]],
- **kwargs):
- from vllm.model_executor.models.internvl import (
- IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values)
- images = [images] if isinstance(images, Image) else images
- pixel_values = [
- image_to_pixel_values(image, self.image_size, self.min_num,
- self.max_num,
- self.use_thumbnail).to(self.dtype)
- for image in images
- ]
- num_patches_list = [
- pixel_value.shape[0] for pixel_value in pixel_values
- ]
- pixel_values = torch.cat(pixel_values, dim=0)
- for num_patches in num_patches_list:
- context_tokens = IMG_CONTEXT * self.num_image_token \
- * num_patches
- image_tokens = IMG_START + context_tokens + IMG_END
- text = text.replace('', image_tokens, 1)
- prompt = self.tokenizer(text, return_tensors="pt")
- prompt.update({"pixel_values": pixel_values})
- return prompt
-
- # max_model_len should be greater than image_feature_size
- with vllm_runner(model,
- max_model_len=4096,
- dtype=dtype,
- limit_mm_per_prompt={"image": mm_limit},
- tensor_parallel_size=tensor_parallel_size,
- distributed_executor_backend=distributed_executor_backend,
- enforce_eager=True) as vllm_model:
- vllm_outputs_per_image = [
- vllm_model.generate_greedy_logprobs(prompts,
- max_tokens,
- num_logprobs=num_logprobs,
- images=images)
- for prompts, images in inputs
- ]
-
- with hf_runner(model, dtype=dtype) as hf_model:
- img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids(
- "")
- hf_model.model.img_context_token_id = img_context_token_id
- hf_model.processor = InternVLProcessor(hf_model)
- hf_model.model.get_output_embeddings = lambda: \
- hf_model.model.language_model.get_output_embeddings()
- hf_model.model.generate = types.MethodType(generate, hf_model.model)
- eos_token_id = hf_model.tokenizer.eos_token_id
- hf_outputs_per_image = [
- hf_model.generate_greedy_logprobs_limit(prompts,
- max_tokens,
- num_logprobs=num_logprobs,
- images=hf_images,
- eos_token_id=eos_token_id)
- for prompts, hf_images in inputs
- ]
-
- for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
- vllm_outputs_per_image):
- # TODO: Check whether using original CLIPVisionModel can improve
- # consistency against HF
- check_logprobs_close(
- outputs_0_lst=hf_outputs,
- outputs_1_lst=vllm_outputs,
- name_0="hf",
- name_1="vllm",
- )
def run_awq_test(
@@ -253,123 +84,6 @@ def run_awq_test(
)
-@pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize(
- "size_factors",
- [
- # No image
- [],
- # Single-scale
- [1.0],
- # Single-scale, batched
- [1.0, 1.0, 1.0],
- # Multi-scale
- [0.25, 0.5, 1.0],
- ],
-)
-@pytest.mark.parametrize("dtype", [target_dtype])
-@pytest.mark.parametrize("max_tokens", [128])
-@pytest.mark.parametrize("num_logprobs", [5])
-@torch.inference_mode()
-def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
- dtype: str, max_tokens: int, num_logprobs: int) -> None:
- images = [asset.pil_image for asset in image_assets]
-
- inputs_per_image = [(
- [prompt for _ in size_factors],
- [rescale_image_size(image, factor) for factor in size_factors],
- ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
-
- run_test(
- hf_runner,
- vllm_runner,
- inputs_per_image,
- model,
- dtype=dtype,
- max_tokens=max_tokens,
- num_logprobs=num_logprobs,
- mm_limit=1,
- tensor_parallel_size=1,
- )
-
-
-@pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize(
- "size_factors",
- [
- # No image
- [],
- # Single-scale
- [1.0],
- # Single-scale, batched
- [1.0, 1.0, 1.0],
- # Multi-scale
- [0.5, 0.75, 1.0],
- ],
-)
-@pytest.mark.parametrize("dtype", [target_dtype])
-@pytest.mark.parametrize("max_tokens", [128])
-@pytest.mark.parametrize("num_logprobs", [5])
-@torch.inference_mode()
-def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
- size_factors, dtype: str, max_tokens: int,
- num_logprobs: int) -> None:
- images = [asset.pil_image for asset in image_assets]
-
- inputs_per_case = [
- ([HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors],
- [[rescale_image_size(image, factor) for image in images]
- for factor in size_factors])
- ]
-
- run_test(
- hf_runner,
- vllm_runner,
- inputs_per_case,
- model,
- dtype=dtype,
- max_tokens=max_tokens,
- num_logprobs=num_logprobs,
- mm_limit=2,
- tensor_parallel_size=1,
- )
-
-
-@pytest.mark.parametrize("model", ["OpenGVLab/InternVL2-2B"])
-@pytest.mark.parametrize("size_factors", [[0.5, 1.0]])
-@pytest.mark.parametrize("dtype", [target_dtype])
-@pytest.mark.parametrize("max_tokens", [128])
-@pytest.mark.parametrize("num_logprobs", [5])
-@torch.inference_mode()
-def test_different_num_patches(hf_runner, vllm_runner, image_assets, model,
- size_factors, dtype: str, max_tokens: int,
- num_logprobs: int) -> None:
- images = [asset.pil_image.resize((896, 896)) for asset in image_assets]
-
- inputs_batching = [(
- [prompt for _ in size_factors],
- [rescale_image_size(image, factor) for factor in size_factors],
- ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
-
- inputs_multi_images = [
- ([HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors],
- [[rescale_image_size(image, factor) for image in images]
- for factor in size_factors])
- ]
- for inputs in [inputs_batching, inputs_multi_images]:
- run_test(
- hf_runner,
- vllm_runner,
- inputs,
- model,
- dtype=dtype,
- max_tokens=max_tokens,
- num_logprobs=num_logprobs,
- mm_limit=2,
- tensor_parallel_size=1,
- )
-
-
@pytest.mark.parametrize(
"models", [("OpenGVLab/InternVL2-2B", "OpenGVLab/InternVL2-2B-AWQ")])
@pytest.mark.parametrize(
diff --git a/tests/models/decoder_only/vision_language/test_llava.py b/tests/models/decoder_only/vision_language/test_llava.py
deleted file mode 100644
index fd28a9367b4b2..0000000000000
--- a/tests/models/decoder_only/vision_language/test_llava.py
+++ /dev/null
@@ -1,313 +0,0 @@
-from typing import List, Optional, Tuple, Type, overload
-
-import pytest
-from transformers import (AutoConfig, AutoModelForVision2Seq, AutoTokenizer,
- BatchEncoding)
-
-from vllm.multimodal.utils import rescale_image_size
-from vllm.sequence import SampleLogprobs
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
-
-from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
- _ImageAssets)
-from ...utils import check_logprobs_close
-
-_LIMIT_IMAGE_PER_PROMPT = 4
-
-HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
- "stop_sign":
- "USER: \nWhat's the content of the image?\nASSISTANT:",
- "cherry_blossom":
- "USER: \nWhat is the season?\nASSISTANT:",
-})
-
-models = [
- "llava-hf/llava-1.5-7b-hf",
- # TODO: Get this model to produce meaningful output in vLLM
- # "TIGER-Lab/Mantis-8B-siglip-llama3",
-]
-
-
-def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
- Optional[SampleLogprobs]],
- model: str):
- """Sanitize vllm output to be comparable with hf output."""
- output_ids, output_str, out_logprobs = vllm_output
-
- config = AutoConfig.from_pretrained(model)
- image_token_id = config.image_token_index
-
- tokenizer = AutoTokenizer.from_pretrained(model)
- eos_token_id = tokenizer.eos_token_id
-
- hf_output_ids = [
- token_id for idx, token_id in enumerate(output_ids)
- if token_id != image_token_id or output_ids[idx - 1] != image_token_id
- ]
-
- assert output_str[0] == " "
- hf_output_str = output_str[1:]
- if hf_output_ids[-1] == eos_token_id:
- hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)
-
- return hf_output_ids, hf_output_str, out_logprobs
-
-
-@overload
-def run_test(
- hf_runner: Type[HfRunner],
- vllm_runner: Type[VllmRunner],
- image_assets: _ImageAssets,
- model: str,
- *,
- size_factors: List[float],
- dtype: str,
- max_tokens: int,
- num_logprobs: int,
- tensor_parallel_size: int,
- distributed_executor_backend: Optional[str] = None,
-):
- ...
-
-
-@overload
-def run_test(
- hf_runner: Type[HfRunner],
- vllm_runner: Type[VllmRunner],
- image_assets: _ImageAssets,
- model: str,
- *,
- sizes: List[Tuple[int, int]],
- dtype: str,
- max_tokens: int,
- num_logprobs: int,
- tensor_parallel_size: int,
- distributed_executor_backend: Optional[str] = None,
-):
- ...
-
-
-def run_test(
- hf_runner: Type[HfRunner],
- vllm_runner: Type[VllmRunner],
- image_assets: _ImageAssets,
- model: str,
- *,
- size_factors: Optional[List[float]] = None,
- sizes: Optional[List[Tuple[int, int]]] = None,
- dtype: str,
- max_tokens: int,
- num_logprobs: int,
- tensor_parallel_size: int,
- distributed_executor_backend: Optional[str] = None,
-):
- images = [asset.pil_image for asset in image_assets]
-
- if size_factors is not None:
- inputs_per_image = [(
- [prompt for _ in size_factors],
- [rescale_image_size(image, factor) for factor in size_factors],
- ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
- elif sizes is not None:
- inputs_per_image = [(
- [prompt for _ in sizes],
- [image.resize(size) for size in sizes],
- ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
- else:
- raise ValueError("You must provide either `size_factors` or `sizes`")
-
- _run_test(hf_runner,
- vllm_runner,
- inputs_per_image,
- model,
- dtype=dtype,
- max_tokens=max_tokens,
- num_logprobs=num_logprobs,
- tensor_parallel_size=tensor_parallel_size,
- distributed_executor_backend=distributed_executor_backend)
-
-
-def _run_test(
- hf_runner: Type[HfRunner],
- vllm_runner: Type[VllmRunner],
- inputs: List[Tuple[List[str], PromptImageInput]],
- model: str,
- *,
- dtype: str,
- max_tokens: int,
- num_logprobs: int,
- tensor_parallel_size: int,
- distributed_executor_backend: Optional[str] = None,
-):
- """Inference result should be the same between hf and vllm.
-
- All the image fixtures for the test are from IMAGE_ASSETS.
- For huggingface runner, we provide the PIL images as input.
- For vllm runner, we provide MultiModalDataDict objects
- and corresponding MultiModalConfig as input.
- Note, the text input is also adjusted to abide by vllm contract.
- The text output is sanitized to be able to compare with hf.
- """
- # NOTE: For local use; this isn't tested in CI yet (see TODO above)
- if model.startswith("TIGER-Lab/Mantis"):
- from mantis.models.mllava import MLlavaProcessor
-
- torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]
- mantis_processor = MLlavaProcessor.from_pretrained(
- model, torch_dtype=torch_dtype)
- assert isinstance(mantis_processor, MLlavaProcessor)
- else:
- mantis_processor = None
-
- # NOTE: take care of the order. run vLLM first, and then run HF.
- # vLLM needs a fresh new process without cuda initialization.
- # if we run HF first, the cuda initialization will be done and it
- # will hurt multiprocessing backend with fork method (the default method).
-
- # max_model_len should be greater than image_feature_size
- with vllm_runner(model,
- dtype=dtype,
- max_model_len=4096,
- tensor_parallel_size=tensor_parallel_size,
- distributed_executor_backend=distributed_executor_backend,
- enforce_eager=True,
- limit_mm_per_prompt={"image": _LIMIT_IMAGE_PER_PROMPT
- }) as vllm_model:
- vllm_outputs_per_image = [
- vllm_model.generate_greedy_logprobs(prompts,
- max_tokens,
- num_logprobs=num_logprobs,
- images=images)
- for prompts, images in inputs
- ]
-
- if mantis_processor is not None:
-
- def process(hf_inputs: BatchEncoding):
- hf_inputs["pixel_values"] = hf_inputs["pixel_values"] \
- .to(torch_dtype) # type: ignore
- return hf_inputs
- else:
-
- def process(hf_inputs: BatchEncoding):
- return hf_inputs
-
- with hf_runner(model,
- dtype=dtype,
- postprocess_inputs=process,
- auto_cls=AutoModelForVision2Seq) as hf_model:
- hf_outputs_per_image = [
- hf_model.generate_greedy_logprobs_limit(prompts,
- max_tokens,
- num_logprobs=num_logprobs,
- images=images)
- for prompts, images in inputs
- ]
-
- for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
- vllm_outputs_per_image):
- # TODO: Check whether using original CLIPVisionModel can improve
- # consistency against HF
- check_logprobs_close(
- outputs_0_lst=hf_outputs,
- outputs_1_lst=[
- vllm_to_hf_output(vllm_output, model)
- for vllm_output in vllm_outputs
- ],
- name_0="hf",
- name_1="vllm",
- )
-
-
-@pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize(
- "size_factors",
- [
- # No image
- [],
- # Single-scale
- [1.0],
- # Single-scale, batched
- [1.0, 1.0, 1.0],
- # Multi-scale
- [0.25, 0.5, 1.0],
- ],
-)
-@pytest.mark.parametrize("dtype", ["half"])
-@pytest.mark.parametrize("max_tokens", [128])
-@pytest.mark.parametrize("num_logprobs", [5])
-def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
- dtype, max_tokens, num_logprobs) -> None:
- run_test(
- hf_runner,
- vllm_runner,
- image_assets,
- model,
- size_factors=size_factors,
- dtype=dtype,
- max_tokens=max_tokens,
- num_logprobs=num_logprobs,
- tensor_parallel_size=1,
- )
-
-
-@pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize("dtype", ["half"])
-@pytest.mark.parametrize("max_tokens", [128])
-@pytest.mark.parametrize("num_logprobs", [5])
-def test_models_multiple_image_inputs(hf_runner, vllm_runner, image_assets,
- model, dtype, max_tokens,
- num_logprobs) -> None:
- stop_sign = image_assets[0].pil_image
- cherry_blossom = image_assets[1].pil_image
-
- inputs = [(
- [
- "USER: \nDescribe 2 images.\nASSISTANT:",
- "USER: \nDescribe 2 images.\nASSISTANT:",
- "USER: \nDescribe 4 images.\nASSISTANT:", # noqa: E501
- "USER: \nWhat is the season?\nASSISTANT:",
- ],
- [
- [stop_sign, cherry_blossom],
- # Images with different sizes and aspect-ratios
- [
- rescale_image_size(stop_sign, 0.1),
- stop_sign,
- ],
- [
- stop_sign,
- rescale_image_size(stop_sign, 0.25),
- cherry_blossom.resize((183, 488)),
- cherry_blossom.resize((488, 183))
- ],
- cherry_blossom,
- ])]
-
- _run_test(
- hf_runner,
- vllm_runner,
- inputs,
- model,
- dtype=dtype,
- max_tokens=max_tokens,
- num_logprobs=num_logprobs,
- tensor_parallel_size=1,
- )
-
-
-@pytest.mark.parametrize("model", models)
-def test_context_length_too_short(vllm_runner, image_assets, model):
- images = [asset.pil_image for asset in image_assets]
-
- with pytest.raises(ValueError, match="too long to fit into the model"):
- vllm_model = vllm_runner(
- model,
- max_model_len=128, # LLaVA has a feature size of 576
- enforce_eager=True,
- )
-
- with vllm_model:
- vllm_model.generate_greedy([HF_IMAGE_PROMPTS[0]],
- max_tokens=1,
- images=[images[0]])
diff --git a/tests/models/decoder_only/vision_language/test_llava_image_embeds.py b/tests/models/decoder_only/vision_language/test_llava_image_embeds.py
deleted file mode 100644
index 66414032509ed..0000000000000
--- a/tests/models/decoder_only/vision_language/test_llava_image_embeds.py
+++ /dev/null
@@ -1,158 +0,0 @@
-from typing import List, Optional, Tuple, Type
-
-import pytest
-from transformers import AutoConfig, AutoModelForVision2Seq, AutoTokenizer
-
-from vllm.sequence import SampleLogprobs
-
-from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
-from ...utils import check_logprobs_close
-
-HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
- "stop_sign":
- "USER: \nWhat's the content of the image?\nASSISTANT:",
- "cherry_blossom":
- "USER: \nWhat is the season?\nASSISTANT:",
-})
-
-models = [
- "llava-hf/llava-1.5-7b-hf",
-]
-
-
-def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
- Optional[SampleLogprobs]],
- model: str):
- """Sanitize vllm output to be comparable with hf output."""
- output_ids, output_str, out_logprobs = vllm_output
-
- config = AutoConfig.from_pretrained(model)
- image_token_id = config.image_token_index
-
- tokenizer = AutoTokenizer.from_pretrained(model)
- eos_token_id = tokenizer.eos_token_id
-
- hf_output_ids = [
- token_id for idx, token_id in enumerate(output_ids)
- if token_id != image_token_id or output_ids[idx - 1] != image_token_id
- ]
-
- assert output_str[0] == " "
- hf_output_str = output_str[1:]
- if hf_output_ids[-1] == eos_token_id:
- hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)
-
- return hf_output_ids, hf_output_str, out_logprobs
-
-
-def run_test(
- hf_runner: Type[HfRunner],
- vllm_runner: Type[VllmRunner],
- image_assets: _ImageAssets,
- model: str,
- *,
- size_factors: List[float],
- dtype: str,
- max_tokens: int,
- num_logprobs: int,
- tensor_parallel_size: int,
- distributed_executor_backend: Optional[str] = None,
-):
- """Inference result should be the same between hf and vllm.
-
- All the image fixtures for the test are from IMAGE_ASSETS.
- For huggingface runner, we provide the PIL images as input.
- For vllm runner, we provide MultiModalDataDict objects
- and corresponding vision language config as input.
- Note, the text input is also adjusted to abide by vllm contract.
- The text output is sanitized to be able to compare with hf.
- """
-
- # vLLM to load from image embeddings
- vllm_images = [asset.image_embeds for asset in image_assets]
-
- # transformers to load from PIL images
- hf_images = [asset.pil_image for asset in image_assets]
-
- vllm_inputs_per_image = [(
- [prompt for _ in size_factors],
- [image for _ in size_factors],
- ) for image, prompt in zip(vllm_images, HF_IMAGE_PROMPTS)]
-
- hf_inputs_per_image = [(
- [prompt for _ in size_factors],
- [image for _ in size_factors],
- ) for image, prompt in zip(hf_images, HF_IMAGE_PROMPTS)]
-
- # NOTE: take care of the order. run vLLM first, and then run HF.
- # vLLM needs a fresh new process without cuda initialization.
- # if we run HF first, the cuda initialization will be done and it
- # will hurt multiprocessing backend with fork method (the default method).
-
- # max_model_len should be greater than image_feature_size
- with vllm_runner(model,
- dtype=dtype,
- tensor_parallel_size=tensor_parallel_size,
- distributed_executor_backend=distributed_executor_backend,
- enforce_eager=True) as vllm_model:
- vllm_outputs_per_image = [
- vllm_model.generate_greedy_logprobs(prompts,
- max_tokens,
- num_logprobs=num_logprobs,
- images=images)
- for prompts, images in vllm_inputs_per_image
- ]
-
- with hf_runner(model, dtype=dtype,
- auto_cls=AutoModelForVision2Seq) as hf_model:
- hf_outputs_per_image = [
- hf_model.generate_greedy_logprobs_limit(prompts,
- max_tokens,
- num_logprobs=num_logprobs,
- images=images)
- for prompts, images in hf_inputs_per_image
- ]
-
- for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
- vllm_outputs_per_image):
- # TODO: Check whether using original CLIPVisionModel can improve
- # consistency against HF
- check_logprobs_close(
- outputs_0_lst=hf_outputs,
- outputs_1_lst=[
- vllm_to_hf_output(vllm_output, model)
- for vllm_output in vllm_outputs
- ],
- name_0="hf",
- name_1="vllm",
- )
-
-
-@pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize(
- "size_factors",
- [
- # No image
- [],
- # Single-scale
- [1.0],
- # Single-scale, batched
- [1.0, 1.0, 1.0],
- ],
-)
-@pytest.mark.parametrize("dtype", ["half"])
-@pytest.mark.parametrize("max_tokens", [128])
-@pytest.mark.parametrize("num_logprobs", [5])
-def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
- dtype: str, max_tokens: int, num_logprobs: int) -> None:
- run_test(
- hf_runner,
- vllm_runner,
- image_assets,
- model,
- size_factors=size_factors,
- dtype=dtype,
- max_tokens=max_tokens,
- num_logprobs=num_logprobs,
- tensor_parallel_size=1,
- )
diff --git a/tests/models/decoder_only/vision_language/test_llava_next.py b/tests/models/decoder_only/vision_language/test_llava_next.py
deleted file mode 100644
index aa9b297c5dd4e..0000000000000
--- a/tests/models/decoder_only/vision_language/test_llava_next.py
+++ /dev/null
@@ -1,347 +0,0 @@
-from typing import List, Optional, Tuple, Type, overload
-
-import pytest
-from transformers import AutoConfig, AutoModelForVision2Seq, AutoTokenizer
-
-from vllm.inputs import InputContext
-from vllm.multimodal.utils import rescale_image_size
-from vllm.sequence import SampleLogprobs
-
-from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
- _ImageAssets)
-from ...utils import build_model_context, check_logprobs_close
-
-_LIMIT_IMAGE_PER_PROMPT = 4
-
-HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
- "stop_sign":
- "[INST] \nWhat's the content of the image? [/INST]",
- "cherry_blossom":
- "[INST] \nWhat is the season? [/INST]",
-})
-
-models = ["llava-hf/llava-v1.6-mistral-7b-hf"]
-
-
-@pytest.fixture()
-def get_max_llava_next_image_tokens():
- from vllm.model_executor.models.llava_next import (
- get_max_llava_next_image_tokens)
- return get_max_llava_next_image_tokens
-
-
-@pytest.fixture()
-def dummy_data_for_llava_next():
- from vllm.model_executor.models.llava_next import dummy_data_for_llava_next
- return dummy_data_for_llava_next
-
-
-def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
- Optional[SampleLogprobs]],
- model: str):
- """Sanitize vllm output to be comparable with hf output."""
- output_ids, output_str, out_logprobs = vllm_output
-
- config = AutoConfig.from_pretrained(model)
- image_token_id = config.image_token_index
-
- tokenizer = AutoTokenizer.from_pretrained(model)
- eos_token_id = tokenizer.eos_token_id
-
- hf_output_ids = [
- token_id for idx, token_id in enumerate(output_ids)
- if token_id != image_token_id or output_ids[idx - 1] != image_token_id
- ]
-
- assert output_str[0] == " "
- hf_output_str = output_str[1:]
- if hf_output_ids[-1] == eos_token_id:
- hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)
-
- return hf_output_ids, hf_output_str, out_logprobs
-
-
-@overload
-def run_test(
- hf_runner: Type[HfRunner],
- vllm_runner: Type[VllmRunner],
- image_assets: _ImageAssets,
- model: str,
- *,
- size_factors: List[float],
- dtype: str,
- max_tokens: int,
- num_logprobs: int,
- tensor_parallel_size: int,
- distributed_executor_backend: Optional[str] = None,
-):
- ...
-
-
-@overload
-def run_test(
- hf_runner: Type[HfRunner],
- vllm_runner: Type[VllmRunner],
- image_assets: _ImageAssets,
- model: str,
- *,
- sizes: List[Tuple[int, int]],
- dtype: str,
- max_tokens: int,
- num_logprobs: int,
- tensor_parallel_size: int,
- distributed_executor_backend: Optional[str] = None,
-):
- ...
-
-
-def run_test(
- hf_runner: Type[HfRunner],
- vllm_runner: Type[VllmRunner],
- image_assets: _ImageAssets,
- model: str,
- *,
- size_factors: Optional[List[float]] = None,
- sizes: Optional[List[Tuple[int, int]]] = None,
- dtype: str,
- max_tokens: int,
- num_logprobs: int,
- tensor_parallel_size: int,
- distributed_executor_backend: Optional[str] = None,
-):
- images = [asset.pil_image for asset in image_assets]
-
- if size_factors is not None:
- inputs_per_image = [(
- [prompt for _ in size_factors],
- [rescale_image_size(image, factor) for factor in size_factors],
- ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
- elif sizes is not None:
- inputs_per_image = [(
- [prompt for _ in sizes],
- [image.resize(size) for size in sizes],
- ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
- else:
- raise ValueError("You must provide either `size_factors` or `sizes`")
-
- _run_test(hf_runner,
- vllm_runner,
- inputs_per_image,
- model,
- dtype=dtype,
- max_tokens=max_tokens,
- num_logprobs=num_logprobs,
- tensor_parallel_size=tensor_parallel_size,
- distributed_executor_backend=distributed_executor_backend)
-
-
-def _run_test(
- hf_runner: Type[HfRunner],
- vllm_runner: Type[VllmRunner],
- inputs: List[Tuple[List[str], PromptImageInput]],
- model: str,
- dtype: str,
- max_tokens: int,
- num_logprobs: int,
- tensor_parallel_size: int,
- distributed_executor_backend: Optional[str] = None,
-):
- # max_model_len should be greater than image_feature_size
- with vllm_runner(model,
- dtype=dtype,
- max_model_len=10240,
- tensor_parallel_size=tensor_parallel_size,
- distributed_executor_backend=distributed_executor_backend,
- enforce_eager=True,
- limit_mm_per_prompt={"image": _LIMIT_IMAGE_PER_PROMPT
- }) as vllm_model:
- vllm_outputs_per_image = [
- vllm_model.generate_greedy_logprobs(prompts,
- max_tokens,
- num_logprobs=num_logprobs,
- images=images)
- for prompts, images in inputs
- ]
-
- with hf_runner(model, dtype=dtype,
- auto_cls=AutoModelForVision2Seq) as hf_model:
- hf_outputs_per_image = [
- hf_model.generate_greedy_logprobs_limit(prompts,
- max_tokens,
- num_logprobs=num_logprobs,
- images=images)
- for prompts, images in inputs
- ]
-
- for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
- vllm_outputs_per_image):
- # TODO: Check whether using original CLIPVisionModel can improve
- # consistency against HF
- check_logprobs_close(
- outputs_0_lst=hf_outputs,
- outputs_1_lst=[
- vllm_to_hf_output(vllm_output, model)
- for vllm_output in vllm_outputs
- ],
- name_0="hf",
- name_1="vllm",
- )
-
-
-@pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize(
- "size_factors",
- [
- # No image
- [],
- # Single-scale
- [1.0],
- # Single-scale, batched
- [1.0, 1.0, 1.0],
- # Multi-scale
- [0.25, 0.5, 1.0],
- ],
-)
-@pytest.mark.parametrize("dtype", ["half"])
-@pytest.mark.parametrize("max_tokens", [128])
-@pytest.mark.parametrize("num_logprobs", [5])
-def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
- dtype, max_tokens, num_logprobs) -> None:
- """Inference result should be the same between hf and vllm.
-
- All the image fixtures for the test are from IMAGE_ASSETS.
- For huggingface runner, we provide the PIL images as input.
- For vllm runner, we provide MultiModalDataDict objects
- and corresponding MultiModalConfig as input.
- Note, the text input is also adjusted to abide by vllm contract.
- The text output is sanitized to be able to compare with hf.
- """
- run_test(
- hf_runner,
- vllm_runner,
- image_assets,
- model,
- size_factors=size_factors,
- dtype=dtype,
- max_tokens=max_tokens,
- num_logprobs=num_logprobs,
- tensor_parallel_size=1,
- )
-
-
-@pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize(
- "sizes",
- [[(1669, 2560), (2560, 1669), (183, 488), (488, 183)]],
-)
-@pytest.mark.parametrize("dtype", ["half"])
-@pytest.mark.parametrize("max_tokens", [128])
-@pytest.mark.parametrize("num_logprobs", [5])
-def test_models_fixed_sizes(hf_runner, vllm_runner, image_assets, model, sizes,
- dtype, max_tokens, num_logprobs) -> None:
- run_test(
- hf_runner,
- vllm_runner,
- image_assets,
- model,
- sizes=sizes,
- dtype=dtype,
- max_tokens=max_tokens,
- num_logprobs=num_logprobs,
- tensor_parallel_size=1,
- )
-
-
-@pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize("dtype", ["half"])
-@pytest.mark.parametrize("max_tokens", [128])
-@pytest.mark.parametrize("num_logprobs", [5])
-def test_models_multiple_image_inputs(hf_runner, vllm_runner, image_assets,
- model, dtype, max_tokens,
- num_logprobs) -> None:
- stop_sign = image_assets[0].pil_image
- cherry_blossom = image_assets[1].pil_image
-
- inputs = [(
- [
- "[INST] \nDescribe 2 images. [/INST]",
- "[INST] \nDescribe 2 images. [/INST]",
- "[INST] \nDescribe 4 images. [/INST]",
- "[INST] \nWhat is the season? [/INST]"
- ],
- [
- [stop_sign, cherry_blossom],
- # Images with different sizes and aspect-ratios
- [
- rescale_image_size(stop_sign, 0.1),
- stop_sign,
- ],
- [
- stop_sign,
- rescale_image_size(stop_sign, 0.25),
- cherry_blossom.resize((183, 488)),
- cherry_blossom.resize((488, 183))
- ],
- cherry_blossom,
- ])]
-
- _run_test(
- hf_runner,
- vllm_runner,
- inputs,
- model,
- dtype=dtype,
- max_tokens=max_tokens,
- num_logprobs=num_logprobs,
- tensor_parallel_size=1,
- )
-
-
-@pytest.mark.parametrize("gridpoints,expected_max_tokens", [
- ([[336, 336]], 1176),
- ([[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]], 2928),
-])
-def test_get_max_llava_next_image_tokens(gridpoints, expected_max_tokens,
- get_max_llava_next_image_tokens):
- ctx = build_model_context(model_name="llava-hf/llava-v1.6-mistral-7b-hf")
-
- # Update the config image_grid_pinpoints
- # and calculate the resulting max tokens
- ctx.model_config.hf_config.image_grid_pinpoints = gridpoints
-
- actual_max_tokens = get_max_llava_next_image_tokens(
- InputContext(ctx.model_config))
-
- assert expected_max_tokens == actual_max_tokens
-
-
-@pytest.mark.parametrize(
- "gridpoints,expected_size",
- [
- # One point; it has to be the largest
- ([[336, 336]], (336, 336)),
- # Default for most llava next models; the 2x2 tile is the largest
- ([[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]],
- (672, 672)),
- # If two rectangular gridpoints are the same, the more vertical
- # one has the higher feature count due to newline features
- ([[336, 672], [672, 336]], (672, 336))
- ])
-def test_dummy_data_for_llava_next_feature_size(dummy_data_for_llava_next,
- gridpoints, expected_size):
- ctx = build_model_context(model_name="llava-hf/llava-v1.6-mistral-7b-hf")
-
- # Update the config image_grid_pinpoints
- ctx.model_config.hf_config.image_grid_pinpoints = gridpoints
- seq_len = 5000 # bigger than the max feature size for any image
-
- seq_data, mm_data = dummy_data_for_llava_next(
- ctx,
- seq_len=seq_len,
- mm_counts={"image": 1},
- )
-
- # The dummy data dims should match the gridpoint with the biggest feat size
- assert mm_data["image"].height == expected_size[0]
- assert mm_data["image"].width == expected_size[1]
- assert len(seq_data.get_token_ids()) >= seq_len
diff --git a/tests/models/decoder_only/vision_language/test_llava_next_video.py b/tests/models/decoder_only/vision_language/test_llava_next_video.py
deleted file mode 100644
index 7b7b23c783e2a..0000000000000
--- a/tests/models/decoder_only/vision_language/test_llava_next_video.py
+++ /dev/null
@@ -1,226 +0,0 @@
-from typing import List, Optional, Tuple, Type, overload
-
-import pytest
-from transformers import AutoConfig, AutoModelForVision2Seq, AutoTokenizer
-
-from vllm.multimodal.utils import (rescale_video_size, resize_video,
- sample_frames_from_video)
-from vllm.sequence import SampleLogprobs
-
-from ....conftest import VIDEO_ASSETS, HfRunner, VllmRunner, _VideoAssets
-from ...utils import check_logprobs_close
-
-_PREFACE = (
- "A chat between a curious human and an artificial intelligence assistant. "
- "The assistant gives helpful, detailed, and polite answers to the human's "
- "questions.")
-
-HF_VIDEO_PROMPTS = VIDEO_ASSETS.prompts({
- "sample_demo_1":
- f"{_PREFACE}USER: \nWhy is this video funny? ASSISTANT:"
-})
-
-models = ["llava-hf/LLaVA-NeXT-Video-7B-hf"]
-
-
-def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
- Optional[SampleLogprobs]],
- model: str):
- """Sanitize vllm output to be comparable with hf output."""
- output_ids, output_str, out_logprobs = vllm_output
-
- config = AutoConfig.from_pretrained(model)
- video_token_id = config.video_token_index
-
- tokenizer = AutoTokenizer.from_pretrained(model)
- eos_token_id = tokenizer.eos_token_id
-
- hf_output_ids = [
- token_id for idx, token_id in enumerate(output_ids)
- if token_id != video_token_id or output_ids[idx - 1] != video_token_id
- ]
-
- assert output_str[0] == " "
- hf_output_str = output_str[1:]
- if hf_output_ids[-1] == eos_token_id:
- hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)
-
- return hf_output_ids, hf_output_str, out_logprobs
-
-
-@overload
-def run_test(
- hf_runner: Type[HfRunner],
- vllm_runner: Type[VllmRunner],
- video_assets: _VideoAssets,
- model: str,
- *,
- size_factors: List[float],
- dtype: str,
- max_tokens: int,
- num_logprobs: int,
- num_frames: int,
- tensor_parallel_size: int,
- distributed_executor_backend: Optional[str] = None,
-):
- ...
-
-
-@overload
-def run_test(
- hf_runner: Type[HfRunner],
- vllm_runner: Type[VllmRunner],
- video_assets: _VideoAssets,
- model: str,
- *,
- sizes: List[Tuple[int, int]],
- dtype: str,
- max_tokens: int,
- num_logprobs: int,
- num_frames: int,
- tensor_parallel_size: int,
- distributed_executor_backend: Optional[str] = None,
-):
- ...
-
-
-def run_test(
- hf_runner: Type[HfRunner],
- vllm_runner: Type[VllmRunner],
- video_assets: _VideoAssets,
- model: str,
- *,
- size_factors: Optional[List[float]] = None,
- sizes: Optional[List[Tuple[int, int]]] = None,
- dtype: str,
- max_tokens: int,
- num_logprobs: int,
- num_frames: int,
- tensor_parallel_size: int,
- distributed_executor_backend: Optional[str] = None,
-):
- videos = [
- sample_frames_from_video(asset.np_ndarrays, num_frames)
- for asset in video_assets
- ]
-
- if size_factors is not None:
- inputs_per_video = [(
- [prompt for _ in size_factors],
- [rescale_video_size(video, factor) for factor in size_factors],
- ) for video, prompt in zip(videos, HF_VIDEO_PROMPTS)]
- elif sizes is not None:
- inputs_per_video = [(
- [prompt for _ in sizes],
- [resize_video(video, size) for size in sizes],
- ) for video, prompt in zip(videos, HF_VIDEO_PROMPTS)]
- else:
- raise ValueError("You must provide either `size_factors` or `sizes`")
-
- # max_model_len should be greater than image_feature_size
- with vllm_runner(model,
- dtype=dtype,
- max_model_len=4096,
- tensor_parallel_size=tensor_parallel_size,
- distributed_executor_backend=distributed_executor_backend,
- enforce_eager=True) as vllm_model:
- vllm_outputs_per_video = [
- vllm_model.generate_greedy_logprobs(prompts,
- max_tokens,
- num_logprobs=num_logprobs,
- videos=videos)
- for prompts, videos in inputs_per_video
- ]
-
- with hf_runner(model, dtype=dtype,
- auto_cls=AutoModelForVision2Seq) as hf_model:
- hf_outputs_per_video = [
- hf_model.generate_greedy_logprobs_limit(prompts,
- max_tokens,
- num_logprobs=num_logprobs,
- videos=videos)
- for prompts, videos in inputs_per_video
- ]
-
- for hf_outputs, vllm_outputs in zip(hf_outputs_per_video,
- vllm_outputs_per_video):
- # TODO: Check whether using original CLIPVisionModel can improve
- # consistency against HF
- check_logprobs_close(
- outputs_0_lst=hf_outputs,
- outputs_1_lst=[
- vllm_to_hf_output(vllm_output, model)
- for vllm_output in vllm_outputs
- ],
- name_0="hf",
- name_1="vllm",
- )
-
-
-@pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize(
- "size_factors",
- [
- # No video
- [],
- # Single-scale
- [1.0],
- # Single-scale, batched
- [1.0, 1.0, 1.0],
- # Multi-scale
- [0.25, 0.5, 1.0],
- ],
-)
-@pytest.mark.parametrize("dtype", ["half"])
-@pytest.mark.parametrize("max_tokens", [128])
-@pytest.mark.parametrize("num_logprobs", [5])
-@pytest.mark.parametrize("num_frames", [16])
-def test_models(hf_runner, vllm_runner, video_assets, model, size_factors,
- dtype, max_tokens, num_logprobs, num_frames) -> None:
- """Inference result should be the same between hf and vllm.
-
- All the image fixtures for the test is under tests/videos.
- For huggingface runner, we provide the np.ndarray as input.
- For vllm runner, we provide MultiModalDataDict objects
- and corresponding MultiModalConfig as input.
- Note, the text input is also adjusted to abide by vllm contract.
- The text output is sanitized to be able to compare with hf.
- """
- run_test(
- hf_runner,
- vllm_runner,
- video_assets,
- model,
- size_factors=size_factors,
- dtype=dtype,
- max_tokens=max_tokens,
- num_logprobs=num_logprobs,
- num_frames=num_frames,
- tensor_parallel_size=1,
- )
-
-
-@pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize(
- "sizes",
- [[(1669, 2560), (2560, 1669), (183, 488), (488, 183)]],
-)
-@pytest.mark.parametrize("dtype", ["half"])
-@pytest.mark.parametrize("max_tokens", [128])
-@pytest.mark.parametrize("num_logprobs", [5])
-@pytest.mark.parametrize("num_frames", [16])
-def test_models_fixed_sizes(hf_runner, vllm_runner, video_assets, model, sizes,
- dtype, max_tokens, num_logprobs,
- num_frames) -> None:
- run_test(
- hf_runner,
- vllm_runner,
- video_assets,
- model,
- sizes=sizes,
- dtype=dtype,
- max_tokens=max_tokens,
- num_logprobs=num_logprobs,
- num_frames=num_frames,
- tensor_parallel_size=1,
- )
diff --git a/tests/models/decoder_only/vision_language/test_llava_onevision.py b/tests/models/decoder_only/vision_language/test_llava_onevision.py
deleted file mode 100644
index 1616fd299b9aa..0000000000000
--- a/tests/models/decoder_only/vision_language/test_llava_onevision.py
+++ /dev/null
@@ -1,272 +0,0 @@
-from typing import List, Optional, Tuple, Type
-
-import pytest
-from transformers import (AutoConfig, AutoModelForVision2Seq, AutoTokenizer,
- BatchEncoding)
-
-from vllm.multimodal.utils import (rescale_image_size, rescale_video_size,
- resize_video, sample_frames_from_video)
-from vllm.sequence import SampleLogprobs
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
-
-from ....conftest import (VIDEO_ASSETS, HfRunner, PromptImageInput,
- PromptVideoInput, VllmRunner)
-from ...utils import check_logprobs_close
-
-# Video test
-HF_VIDEO_PROMPTS = VIDEO_ASSETS.prompts({
- "sample_demo_1":
- "<|im_start|>user\n\nwhy is this video funny?<|im_end|>\n<|im_start|>assistant\n" # noqa: E501
-})
-
-models = ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"]
-
-
-def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
- Optional[SampleLogprobs]],
- model: str):
- """Sanitize vllm output to be comparable with hf output."""
- output_ids, output_str, out_logprobs = vllm_output
-
- config = AutoConfig.from_pretrained(model)
- video_token_id = config.video_token_index
-
- tokenizer = AutoTokenizer.from_pretrained(model)
- eos_token_id = tokenizer.eos_token_id
-
- hf_output_ids = [
- token_id for idx, token_id in enumerate(output_ids)
- if token_id != video_token_id or output_ids[idx - 1] != video_token_id
- ]
-
- hf_output_str = output_str
- if hf_output_ids[-1] == eos_token_id:
- hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)
-
- return hf_output_ids, hf_output_str, out_logprobs
-
-
-# Video test
-_LIMIT_VIDEO_PER_PROMPT = 4
-
-
-def run_video_test(
- hf_runner: Type[HfRunner],
- vllm_runner: Type[VllmRunner],
- inputs: List[Tuple[List[str], PromptVideoInput]],
- model: str,
- *,
- dtype: str,
- max_tokens: int,
- num_logprobs: int,
- num_frames: int,
- tensor_parallel_size: int,
- distributed_executor_backend: Optional[str] = None,
-):
- torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]
- with vllm_runner(model,
- dtype=dtype,
- max_model_len=16384,
- tensor_parallel_size=tensor_parallel_size,
- distributed_executor_backend=distributed_executor_backend,
- enforce_eager=True,
- limit_mm_per_prompt={"video": _LIMIT_VIDEO_PER_PROMPT
- }) as vllm_model:
- vllm_outputs_per_input = [
- vllm_model.generate_greedy_logprobs(prompts,
- max_tokens,
- num_logprobs=num_logprobs,
- videos=videos)
- for prompts, videos in inputs
- ]
-
- def process(hf_inputs: BatchEncoding):
- hf_inputs["pixel_values_videos"] = hf_inputs["pixel_values_videos"] \
- .to(torch_dtype) # type: ignore
- return hf_inputs
-
- with hf_runner(model,
- dtype=dtype,
- postprocess_inputs=process,
- auto_cls=AutoModelForVision2Seq) as hf_model:
- hf_outputs_per_input = [
- hf_model.generate_greedy_logprobs_limit(prompts,
- max_tokens,
- num_logprobs=num_logprobs,
- videos=videos)
- for prompts, videos in inputs
- ]
-
- for hf_outputs, vllm_outputs in zip(hf_outputs_per_input,
- vllm_outputs_per_input):
- # TODO: Check whether using original CLIPVisionModel can improve
- # consistency against HF
- check_logprobs_close(
- outputs_0_lst=hf_outputs,
- outputs_1_lst=[
- vllm_to_hf_output(vllm_output, model)
- for vllm_output in vllm_outputs
- ],
- name_0="hf",
- name_1="vllm",
- )
-
-
-@pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize("dtype", ["half"])
-@pytest.mark.parametrize("max_tokens", [128])
-@pytest.mark.parametrize("num_logprobs", [5])
-@pytest.mark.parametrize("num_frames", [16])
-def test_models_multiple_video_inputs(hf_runner, vllm_runner, video_assets,
- model, dtype, max_tokens, num_logprobs,
- num_frames) -> None:
- video = sample_frames_from_video(video_assets[0].np_ndarrays, num_frames)
- inputs = [(
- [
- "<|im_start|>user \nDescribe 2 videos. \
- <|im_end|><|im_start|>assistant\n",
- "<|im_start|>user \nDescribe 2 videos. \
- <|im_end|><|im_start|>assistant\n",
- "<|im_start|>user \nDescribe 4 videos. \
- <|im_end|><|im_start|>assistant\n",
- "<|im_start|>user \nwhy is this video funny? \
- <|im_end|><|im_start|>assistant\n",
- ],
- [
- [video, video],
- # Images with different sizes and aspect-ratios
- [
- rescale_video_size(video, 0.1),
- video,
- ],
- [
- video,
- rescale_video_size(video, 0.25),
- resize_video(video, (183, 488)),
- resize_video(video, (488, 183))
- ],
- video,
- ])]
- run_video_test(
- hf_runner,
- vllm_runner,
- inputs,
- model,
- dtype=dtype,
- max_tokens=max_tokens,
- num_logprobs=num_logprobs,
- tensor_parallel_size=1,
- num_frames=num_frames,
- )
-
-
-# Image test
-_LIMIT_IMAGE_PER_PROMPT = 4
-
-
-def run_image_test(
- hf_runner: Type[HfRunner],
- vllm_runner: Type[VllmRunner],
- inputs: List[Tuple[List[str], PromptImageInput]],
- model: str,
- dtype: str,
- max_tokens: int,
- num_logprobs: int,
- tensor_parallel_size: int,
- distributed_executor_backend: Optional[str] = None,
-):
- torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]
-
- # max_model_len should be greater than image_feature_size
- with vllm_runner(model,
- dtype=dtype,
- max_model_len=16384,
- max_num_seqs=2,
- tensor_parallel_size=tensor_parallel_size,
- distributed_executor_backend=distributed_executor_backend,
- enforce_eager=True,
- limit_mm_per_prompt={"image": _LIMIT_IMAGE_PER_PROMPT
- }) as vllm_model:
- vllm_outputs_per_image = [
- vllm_model.generate_greedy_logprobs(prompts,
- max_tokens,
- num_logprobs=num_logprobs,
- images=images)
- for prompts, images in inputs
- ]
-
- def process(hf_inputs: BatchEncoding):
- hf_inputs["pixel_values"] = hf_inputs["pixel_values"] \
- .to(torch_dtype) # type: ignore
- return hf_inputs
-
- with hf_runner(model,
- dtype=dtype,
- postprocess_inputs=process,
- auto_cls=AutoModelForVision2Seq) as hf_model:
- hf_outputs_per_image = [
- hf_model.generate_greedy_logprobs_limit(prompts,
- max_tokens,
- num_logprobs=num_logprobs,
- images=images)
- for prompts, images in inputs
- ]
-
- for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
- vllm_outputs_per_image):
- # TODO: Check whether using original CLIPVisionModel can improve
- # consistency against HF
- check_logprobs_close(
- outputs_0_lst=hf_outputs,
- outputs_1_lst=[
- vllm_to_hf_output(vllm_output, model)
- for vllm_output in vllm_outputs
- ],
- name_0="hf",
- name_1="vllm",
- )
-
-
-@pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize("dtype", ["half"])
-@pytest.mark.parametrize("max_tokens", [128])
-@pytest.mark.parametrize("num_logprobs", [5])
-def test_models_multiple_image_inputs(hf_runner, vllm_runner, image_assets,
- model, dtype, max_tokens,
- num_logprobs) -> None:
- stop_sign = image_assets[0].pil_image
- cherry_blossom = image_assets[1].pil_image
-
- inputs = [(
- [
- "<|im_start|>user\n\nDescribe 2 images.<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
- "<|im_start|>user\n\nDescribe 2 images.<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
- "<|im_start|>user\n\nDescribe 4 images.<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
- "<|im_start|>user\n\nWhat is the season?<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
- ],
- [
- [stop_sign, cherry_blossom],
- # Images with different sizes and aspect-ratios
- [
- rescale_image_size(stop_sign, 0.1),
- stop_sign,
- ],
- [
- stop_sign,
- rescale_image_size(stop_sign, 0.25),
- cherry_blossom.resize((183, 488)),
- cherry_blossom.resize((488, 183))
- ],
- cherry_blossom,
- ])]
-
- run_image_test(
- hf_runner,
- vllm_runner,
- inputs,
- model,
- dtype=dtype,
- max_tokens=max_tokens,
- num_logprobs=num_logprobs,
- tensor_parallel_size=1,
- )
diff --git a/tests/models/decoder_only/vision_language/test_minicpmv.py b/tests/models/decoder_only/vision_language/test_minicpmv.py
deleted file mode 100644
index d3a0561f65797..0000000000000
--- a/tests/models/decoder_only/vision_language/test_minicpmv.py
+++ /dev/null
@@ -1,199 +0,0 @@
-from typing import List, Optional, Tuple, Type, Union
-
-import pytest
-import torch
-import torch.types
-from PIL import Image
-from transformers import BatchEncoding
-
-from vllm.multimodal.utils import rescale_image_size
-from vllm.sequence import SampleLogprobs
-
-from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner
-from ...utils import check_logprobs_close
-
-# The image token is placed before "user" on purpose so that the test can pass
-HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
- "stop_sign":
- "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" \
- "(./)\nWhat's the content of the image?<|eot_id|>" \
- "<|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501
- "cherry_blossom":
- "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" \
- "(./)\nWhat is the season?<|eot_id|>" \
- "<|start_header_id|>assistant<|end_header_id|>\n\n",
-})
-HF_MULTIIMAGE_IMAGE_PROMPT = \
- "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" \
- "(./)\n(./)\n" \
- "Describe these images.<|eot_id|>" \
- "<|start_header_id|>assistant<|end_header_id|>\n\n"
-
-models = ["openbmb/MiniCPM-Llama3-V-2_5"]
-
-
-def _wrap_inputs(hf_inputs: BatchEncoding):
- return {"model_inputs": hf_inputs}
-
-
-def trunc_hf_output(hf_output: Tuple[List[int], str,
- Optional[SampleLogprobs]]):
- output_ids, output_str, out_logprobs = hf_output
- if output_str.endswith("<|eot_id|>"):
- output_str = output_str.split("<|eot_id|>")[0]
- return output_ids, output_str, out_logprobs
-
-
-target_dtype = "half"
-
-
-def run_test(
- hf_runner: Type[HfRunner],
- vllm_runner: Type[VllmRunner],
- inputs: List[Tuple[List[str], Union[List[Image.Image],
- List[List[Image.Image]]]]],
- model: str,
- *,
- dtype: str,
- max_tokens: int,
- num_logprobs: int,
- mm_limit: int,
- tensor_parallel_size: int,
- distributed_executor_backend: Optional[str] = None,
-):
- """Inference result should be the same between hf and vllm.
-
- All the image fixtures for the test are from IMAGE_ASSETS.
- For huggingface runner, we provide the PIL images as input.
- For vllm runner, we provide MultiModalDataDict objects
- and corresponding MultiModalConfig as input.
- Note, the text input is also adjusted to abide by vllm contract.
- The text output is sanitized to be able to compare with hf.
- """
-
- # NOTE: take care of the order. run vLLM first, and then run HF.
- # vLLM needs a fresh new process without cuda initialization.
- # if we run HF first, the cuda initialization will be done and it
- # will hurt multiprocessing backend with fork method (the default method).
-
- # max_model_len should be greater than image_feature_size
- with vllm_runner(model,
- max_model_len=4096,
- max_num_seqs=2,
- dtype=dtype,
- limit_mm_per_prompt={"image": mm_limit},
- tensor_parallel_size=tensor_parallel_size,
- distributed_executor_backend=distributed_executor_backend,
- enforce_eager=True) as vllm_model:
- tokenizer = vllm_model.model.get_tokenizer()
- stop_token_ids = [tokenizer.eos_id, tokenizer.eot_id]
- vllm_outputs_per_image = [
- vllm_model.generate_greedy_logprobs(prompts,
- max_tokens,
- num_logprobs=num_logprobs,
- images=images,
- stop_token_ids=stop_token_ids)
- for prompts, images in inputs
- ]
-
- hf_model = hf_runner(model, dtype=dtype, postprocess_inputs=_wrap_inputs)
- with hf_model, torch.no_grad():
- hf_outputs_per_image = [
- hf_model.generate_greedy_logprobs_limit(prompts,
- max_tokens,
- num_logprobs=num_logprobs,
- images=images,
- tokenizer=tokenizer)
- for prompts, images in inputs
- ]
-
- for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
- vllm_outputs_per_image):
- check_logprobs_close(
- outputs_0_lst=[
- trunc_hf_output(hf_output) for hf_output in hf_outputs
- ],
- outputs_1_lst=vllm_outputs,
- name_0="hf",
- name_1="vllm",
- )
-
-
-@pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize(
- "size_factors",
- [
- # No image
- [],
- # Single-scale
- [1.0],
- # Single-scale, batched
- [1.0, 1.0, 1.0],
- # Multi-scale
- [0.25, 0.5, 1.0],
- ],
-)
-@pytest.mark.parametrize("dtype", [target_dtype])
-@pytest.mark.parametrize("max_tokens", [128])
-@pytest.mark.parametrize("num_logprobs", [5])
-def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
- dtype: str, max_tokens: int, num_logprobs: int) -> None:
- images = [asset.pil_image for asset in image_assets]
-
- inputs_per_image = [(
- [prompt for _ in size_factors],
- [rescale_image_size(image, factor) for factor in size_factors],
- ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
-
- run_test(
- hf_runner,
- vllm_runner,
- inputs_per_image,
- model,
- dtype=dtype,
- max_tokens=max_tokens,
- num_logprobs=num_logprobs,
- mm_limit=1,
- tensor_parallel_size=1,
- )
-
-
-@pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize(
- "size_factors",
- [
- # No image
- [],
- # Single-scale
- [1.0],
- # Single-scale, batched
- [1.0, 1.0, 1.0],
- # Multi-scale
- [0.25, 0.5, 1.0],
- ],
-)
-@pytest.mark.parametrize("dtype", [target_dtype])
-@pytest.mark.parametrize("max_tokens", [128])
-@pytest.mark.parametrize("num_logprobs", [5])
-def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
- size_factors, dtype: str, max_tokens: int,
- num_logprobs: int) -> None:
- images = [asset.pil_image for asset in image_assets]
-
- inputs_per_case = [
- ([HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors],
- [[rescale_image_size(image, factor) for image in images]
- for factor in size_factors])
- ]
-
- run_test(
- hf_runner,
- vllm_runner,
- inputs_per_case,
- model,
- dtype=dtype,
- max_tokens=max_tokens,
- num_logprobs=num_logprobs,
- mm_limit=2,
- tensor_parallel_size=1,
- )
diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py
new file mode 100644
index 0000000000000..9370527e3cd57
--- /dev/null
+++ b/tests/models/decoder_only/vision_language/test_models.py
@@ -0,0 +1,594 @@
+"""Common tests for testing .generate() functionality for single / multiple
+image, embedding, and video support for different VLMs in vLLM.
+"""
+import os
+from pathlib import PosixPath
+from typing import Type
+
+import pytest
+import transformers
+from transformers import AutoModelForVision2Seq
+
+from vllm.platforms import current_platform
+from vllm.utils import cuda_device_count_stateless, identity
+
+from ....conftest import (IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets,
+ _VideoAssets)
+from ....utils import fork_new_process_for_each_test, large_gpu_mark
+from ...utils import check_outputs_equal
+from .vlm_utils import custom_inputs, model_utils, runners
+from .vlm_utils.case_filtering import get_parametrized_options
+from .vlm_utils.types import (CustomTestOptions, ExpandableVLMTestArgs,
+ VLMTestInfo, VLMTestType)
+
+# This hack is needed for phi3v & paligemma models
+# ROCm Triton FA can run into shared memory issues with these models,
+# use other backends in the meantime
+# FIXME (mattwong, gshtrasb, hongxiayan)
+if current_platform.is_rocm():
+ os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"
+
+# yapf: disable
+COMMON_BROADCAST_SETTINGS = {
+ "test_type": VLMTestType.IMAGE,
+ "dtype": "half",
+ "max_tokens": 5,
+ "tensor_parallel_size": 2,
+ "image_size_factors": [(.25, 0.5, 1.0)],
+ "distributed_executor_backend": (
+ "ray",
+ "mp",
+ )
+}
+
+### Test configuration for specific models
+# NOTE: The convention of the test settings below is to lead each test key
+# with the name of the model arch used in the test, using underscores in place
+# of hyphens; this makes it more convenient to filter tests for a specific kind
+# of model. For example:
+#
+# To run all test types for a specific key:
+# use the k flag to substring match with a leading square bracket; if the
+# model arch happens to be a substring of another one, you can add a
+# trailing hyphen. E.g.,
+# - pytest $TEST_FILE -k "[llava-"
+# prevents matching on "[llava_next-" & will match just the enabled cases
+# for llava, i.e., single image, image embedding, and custom input tests.
+#
+# To run a test for a Test Info for just one of multiple models:
+# use the k flag to substring match the model name, e.g.,
+# - pytest $TEST_FILE -k OpenGVLab/InternVL2-1B
+# prevents matching on OpenGVLab/InternVL2-2B.
+#
+# You can also combine substrings to match more granularly.
+# ex 1:
+# pytest $TEST_FILE -k "test_single_image and OpenGVLab/InternVL2-1B"
+# will run only test_single_image* for OpenGVLab/InternVL2-1B; this would
+# match both wrappers for single image tests, since it also matches
+# test_single_image_heavy (which forks if we have a distributed backend)
+# ex 2:
+# pytest $TEST_FILE -k "[llava- or [intern_vl-"
+# will run all of the tests for only llava & internvl.
+#
+# NOTE you can add --collect-only to any of the above commands to see
+# which cases would be selected and deselected by pytest. In general,
+# this is a good idea for checking your command first, since tests are slow.
+
+VLM_TEST_SETTINGS = {
+ "blip2": VLMTestInfo(
+ models=["Salesforce/blip2-opt-2.7b"],
+ test_type=VLMTestType.IMAGE,
+ prompt_formatter=lambda img_prompt: f"Question: {img_prompt} Answer:",
+ img_idx_to_prompt=lambda idx: "",
+ auto_cls=AutoModelForVision2Seq,
+ vllm_output_post_proc=model_utils.blip2_vllm_to_hf_output,
+ ),
+ "chameleon": VLMTestInfo(
+ models=["facebook/chameleon-7b"],
+ test_type=VLMTestType.IMAGE,
+ prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
+ max_model_len=4096,
+ auto_cls=AutoModelForVision2Seq,
+ postprocess_inputs=model_utils.get_key_type_post_processor(
+ "pixel_values"
+ ),
+ # For chameleon, we only compare the sequences
+ vllm_output_post_proc = lambda vllm_output, model: vllm_output[:2],
+ hf_output_post_proc = lambda hf_output, model: hf_output[:2],
+ comparator=check_outputs_equal,
+ max_tokens=8,
+ dtype="bfloat16",
+ marks=[
+ pytest.mark.skipif(
+ transformers.__version__.startswith("4.46"),
+ reason="Model broken in HF, see huggingface/transformers#34379"
+ )
+ ]
+ ),
+ "fuyu": VLMTestInfo(
+ models=["adept/fuyu-8b"],
+ test_type=VLMTestType.IMAGE,
+ prompt_formatter=lambda img_prompt: f"{img_prompt}\n",
+ img_idx_to_prompt=lambda idx: "",
+ max_model_len=2048,
+ max_num_seqs=2,
+ use_tokenizer_eos=True,
+ vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output,
+ num_logprobs=10,
+ dtype="bfloat16" if current_platform.is_cpu() else "half",
+ image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
+ ),
+ "glm4": VLMTestInfo(
+ models=["THUDM/glm-4v-9b"],
+ test_type=VLMTestType.IMAGE,
+ prompt_formatter=identity,
+ img_idx_to_prompt=lambda idx: "",
+ max_model_len=2048,
+ max_num_seqs=2,
+ dtype="bfloat16",
+ get_stop_token_ids=lambda tok: [151329, 151336, 151338],
+ marks=[large_gpu_mark(min_gb=48)],
+ patch_hf_runner=model_utils.glm_patch_hf_runner,
+ ),
+ "intern_vl": VLMTestInfo(
+ models=[
+ "OpenGVLab/InternVL2-1B",
+ "OpenGVLab/InternVL2-2B",
+ "OpenGVLab/Mono-InternVL-2B",
+ ],
+ test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
+ prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
+ single_image_prompts=IMAGE_ASSETS.prompts({
+ "stop_sign": "\nWhat's the content in the center of the image?", # noqa: E501
+ "cherry_blossom": "\nWhat is the season?",
+ }),
+ multi_image_prompt="Image-1: \nImage-2: \nDescribe the two images in short.", # noqa: E501
+ max_model_len=4096,
+ # NOTE: Mono-InternVL-2B doesn't work with fp16,
+ # it will result in NaN during inference.
+ # See: https://huggingface.co/OpenGVLab/Mono-InternVL-2B/discussions/9
+ dtype="bfloat16",
+ use_tokenizer_eos=True,
+ patch_hf_runner=model_utils.internvl_patch_hf_runner,
+ ),
+ "llava": VLMTestInfo(
+ models=["llava-hf/llava-1.5-7b-hf"],
+ test_type=(
+ VLMTestType.EMBEDDING,
+ VLMTestType.IMAGE,
+ VLMTestType.CUSTOM_INPUTS
+ ),
+ prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
+ convert_assets_to_embeddings=model_utils.get_llava_embeddings,
+ max_model_len=4096,
+ auto_cls=AutoModelForVision2Seq,
+ vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
+ custom_test_opts=[CustomTestOptions(
+ inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
+ formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:"
+ ),
+ limit_mm_per_prompt={"image": 4},
+ )],
+ ),
+ "llava_next": VLMTestInfo(
+ models=["llava-hf/llava-v1.6-mistral-7b-hf"],
+ test_type=(VLMTestType.IMAGE, VLMTestType.CUSTOM_INPUTS),
+ prompt_formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]",
+ max_model_len=10240,
+ auto_cls=AutoModelForVision2Seq,
+ vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
+ custom_test_opts=[CustomTestOptions(
+ inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
+ formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]"
+ ),
+ limit_mm_per_prompt={"image": 4},
+ )],
+ # Llava-next tests fixed sizes & the default size factors
+ image_sizes=[((1669, 2560), (2560, 1669), (183, 488), (488, 183))],
+ ),
+ "llava_one_vision": VLMTestInfo(
+ models=["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"],
+ test_type=VLMTestType.CUSTOM_INPUTS,
+ prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
+ dtype="half",
+ num_video_frames=16,
+ max_model_len=16384,
+ postprocess_inputs=model_utils.get_key_type_post_processor(
+ "pixel_values_videos"
+ ),
+ auto_cls=AutoModelForVision2Seq,
+ vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
+ # Llava-one-vision tests fixed sizes & the default size factors
+ image_sizes=[((1669, 2560), (2560, 1669), (183, 488), (488, 183))],
+ runner_mm_key="videos",
+ custom_test_opts=[CustomTestOptions(
+ inputs=custom_inputs.multi_video_multi_aspect_ratio_inputs(
+ formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
+ ),
+ limit_mm_per_prompt={"video": 4},
+ )],
+ ),
+ # FIXME: LLaVA-NeXT-Video tests currently fail and are skipped via `marks`
+ "llava_next_video": VLMTestInfo(
+ models=["llava-hf/LLaVA-NeXT-Video-7B-hf"],
+ test_type=VLMTestType.VIDEO,
+ prompt_formatter=lambda vid_prompt: f"USER: {vid_prompt} ASSISTANT:",
+ num_video_frames=16,
+ max_model_len=4096,
+ auto_cls=AutoModelForVision2Seq,
+ vllm_output_post_proc=model_utils.llava_video_vllm_to_hf_output,
+ image_sizes=[((1669, 2560), (2560, 1669), (183, 488), (488, 183))],
+ runner_mm_key="videos",
+ marks=[
+ pytest.mark.skip(reason="LLava next video tests currently fail.")
+ ],
+ ),
+ "minicpmv": VLMTestInfo(
+ models=["openbmb/MiniCPM-Llama3-V-2_5"],
+ test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
+ prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501
+ img_idx_to_prompt=lambda idx: "(./)\n",
+ max_model_len=4096,
+ max_num_seqs=2,
+ get_stop_token_ids=lambda tok: [tok.eos_id, tok.eot_id],
+ postprocess_inputs=model_utils.wrap_inputs_post_processor,
+ hf_output_post_proc=model_utils.minicmpv_trunc_hf_output,
+ ),
+ "paligemma": VLMTestInfo(
+ models=["google/paligemma-3b-mix-224"],
+ test_type=VLMTestType.IMAGE,
+ prompt_formatter=identity,
+ img_idx_to_prompt = lambda idx: "",
+ # Paligemma uses its own sample prompts because the default one fails
+ single_image_prompts=IMAGE_ASSETS.prompts({
+ "stop_sign": "caption es",
+ "cherry_blossom": "What is in the picture?",
+ }),
+ auto_cls=AutoModelForVision2Seq,
+ postprocess_inputs=model_utils.get_key_type_post_processor(
+ "pixel_values"
+ ),
+ vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output,
+ dtype="half" if current_platform.is_rocm() else ("half", "float"),
+ ),
+ # Tests for phi3v currently live in another file because of a bug in
+ # transformers. Once this issue is fixed, we can enable them here instead.
+ # https://github.com/huggingface/transformers/issues/34307
+ # "phi3v": VLMTestInfo(
+ # models=["microsoft/Phi-3.5-vision-instruct"],
+ # test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
+ # prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|end|>\n<|assistant|>\n", # noqa: E501
+ # img_idx_to_prompt=lambda idx: f"<|image_{idx}|>\n",
+ # max_model_len=4096,
+ # max_num_seqs=2,
+ # task="generate",
+ # # use eager mode for hf runner since phi3v didn't work with flash_attn
+ # model_kwargs={"_attn_implementation": "eager"},
+ # use_tokenizer_eos=True,
+ # vllm_output_post_proc=model_utils.phi3v_vllm_to_hf_output,
+ # num_logprobs=10,
+ # ),
+ "qwen": VLMTestInfo(
+ models=["Qwen/Qwen-VL"],
+ test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
+ prompt_formatter=identity,
+ img_idx_to_prompt=lambda idx: f"Picture {idx}: \n",
+ max_model_len=1024,
+ max_num_seqs=2,
+ vllm_output_post_proc=model_utils.qwen_vllm_to_hf_output,
+ prompt_path_encoder=model_utils.qwen_prompt_path_encoder,
+ ),
+ ### Tensor parallel / multi-gpu broadcast tests
+ "broadcast-chameleon": VLMTestInfo(
+ models=["facebook/chameleon-7b"],
+ prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
+ max_model_len=4096,
+ auto_cls=AutoModelForVision2Seq,
+ postprocess_inputs=model_utils.get_key_type_post_processor(
+ "pixel_values"
+ ),
+ vllm_output_post_proc = lambda vllm_output, model: vllm_output[:2],
+ hf_output_post_proc = lambda hf_output, model: hf_output[:2],
+ comparator=check_outputs_equal,
+ marks=[
+ pytest.mark.distributed_2_gpus,
+ pytest.mark.skipif(
+ cuda_device_count_stateless() < 2,
+ reason="Need at least 2 GPUs to run the test.",
+ ),
+ pytest.mark.skipif(
+ transformers.__version__.startswith("4.46"),
+ reason="Model broken in HF, see huggingface/transformers#34379"
+ )
+ ],
+ **COMMON_BROADCAST_SETTINGS # type: ignore
+ ),
+ "broadcast-llava": VLMTestInfo(
+ models=["llava-hf/llava-1.5-7b-hf"],
+ prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
+ max_model_len=4096,
+ auto_cls=AutoModelForVision2Seq,
+ vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
+ marks=[
+ pytest.mark.distributed_2_gpus,
+ pytest.mark.skipif(
+ cuda_device_count_stateless() < 2,
+ reason="Need at least 2 GPUs to run the test.",
+ )
+ ],
+ **COMMON_BROADCAST_SETTINGS # type: ignore
+ ),
+ "broadcast-llava_next": VLMTestInfo(
+ models=["llava-hf/llava-v1.6-mistral-7b-hf"],
+ prompt_formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]",
+ max_model_len=10240,
+ auto_cls=AutoModelForVision2Seq,
+ vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
+ marks=[
+ pytest.mark.distributed_2_gpus,
+ pytest.mark.skipif(
+ cuda_device_count_stateless() < 2,
+ reason="Need at least 2 GPUs to run the test.",
+ )
+ ],
+ **COMMON_BROADCAST_SETTINGS # type: ignore
+ ),
+ ### Custom input edge-cases for specific models
+ "intern_vl-diff-patches": VLMTestInfo(
+ models=["OpenGVLab/InternVL2-2B"],
+ prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
+ test_type=VLMTestType.CUSTOM_INPUTS,
+ max_model_len=4096,
+ dtype="bfloat16" if current_platform.is_cpu() else "half",
+ use_tokenizer_eos=True,
+ patch_hf_runner=model_utils.internvl_patch_hf_runner,
+ custom_test_opts=[
+ CustomTestOptions(
+ inputs=inp,
+ limit_mm_per_prompt={"image": 2},
+ ) for inp in custom_inputs.different_patch_input_cases_internvl()
+ ],
+ ),
+ "llava_one_vision-multiple-images": VLMTestInfo(
+ models=["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"],
+ test_type=VLMTestType.CUSTOM_INPUTS,
+ max_model_len=16384,
+ max_num_seqs=2,
+ dtype="half",
+ postprocess_inputs=model_utils.get_key_type_post_processor(
+ "pixel_values"
+ ),
+ auto_cls=AutoModelForVision2Seq,
+ vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
+ custom_test_opts=[CustomTestOptions(
+ inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
+ formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
+ ),
+ limit_mm_per_prompt={"image": 4},
+ )],
+ ),
+}
+# yapf: enable
+
+
+### Test wrappers
+# Wrappers around the core test running func for:
+# - single image
+# - multi-image
+# - image embeddings
+# - video
+# - custom inputs
+@pytest.mark.parametrize(
+    "model_type,test_case",
+    get_parametrized_options(
+        VLM_TEST_SETTINGS,
+        test_type=VLMTestType.IMAGE,
+        fork_new_process_for_each_test=False,
+    ),
+)
+def test_single_image_models(
+    tmp_path: PosixPath,
+    model_type: str,
+    test_case: ExpandableVLMTestArgs,
+    hf_runner: Type[HfRunner],
+    vllm_runner: Type[VllmRunner],
+    image_assets: _ImageAssets,
+):
+    """Single-image wrapper (parametrized with fork-per-test disabled).
+
+    Looks up the ``VLMTestInfo`` stored under ``model_type`` and forwards it,
+    together with the pytest-supplied arguments, to
+    ``runners.run_single_image_test``.
+    """
+    runners.run_single_image_test(
+        tmp_path=tmp_path,
+        model_test_info=VLM_TEST_SETTINGS[model_type],
+        test_case=test_case,
+        hf_runner=hf_runner,
+        vllm_runner=vllm_runner,
+        image_assets=image_assets,
+    )
+
+
+@pytest.mark.parametrize(
+    "model_type,test_case",
+    get_parametrized_options(
+        VLM_TEST_SETTINGS,
+        test_type=VLMTestType.MULTI_IMAGE,
+        fork_new_process_for_each_test=False,
+    ),
+)
+def test_multi_image_models(
+    tmp_path: PosixPath,
+    model_type: str,
+    test_case: ExpandableVLMTestArgs,
+    hf_runner: Type[HfRunner],
+    vllm_runner: Type[VllmRunner],
+    image_assets: _ImageAssets,
+):
+    """Multi-image wrapper (parametrized with fork-per-test disabled).
+
+    Looks up the ``VLMTestInfo`` stored under ``model_type`` and forwards it,
+    together with the pytest-supplied arguments, to
+    ``runners.run_multi_image_test``.
+    """
+    runners.run_multi_image_test(
+        tmp_path=tmp_path,
+        model_test_info=VLM_TEST_SETTINGS[model_type],
+        test_case=test_case,
+        hf_runner=hf_runner,
+        vllm_runner=vllm_runner,
+        image_assets=image_assets,
+    )
+
+
+@pytest.mark.parametrize(
+    "model_type,test_case",
+    get_parametrized_options(
+        VLM_TEST_SETTINGS,
+        test_type=VLMTestType.EMBEDDING,
+        fork_new_process_for_each_test=False,
+    ),
+)
+def test_image_embedding_models(
+    model_type: str,
+    test_case: ExpandableVLMTestArgs,
+    hf_runner: Type[HfRunner],
+    vllm_runner: Type[VllmRunner],
+    image_assets: _ImageAssets,
+):
+    """Image-embedding wrapper (parametrized with fork-per-test disabled).
+
+    Looks up the ``VLMTestInfo`` stored under ``model_type`` and forwards it,
+    together with the pytest-supplied arguments, to
+    ``runners.run_embedding_test``.
+    """
+    runners.run_embedding_test(
+        model_test_info=VLM_TEST_SETTINGS[model_type],
+        test_case=test_case,
+        hf_runner=hf_runner,
+        vllm_runner=vllm_runner,
+        image_assets=image_assets,
+    )
+
+
+@pytest.mark.parametrize(
+    "model_type,test_case",
+    get_parametrized_options(
+        VLM_TEST_SETTINGS,
+        test_type=VLMTestType.VIDEO,
+        fork_new_process_for_each_test=False,
+    ),
+)
+def test_video_models(
+    model_type: str,
+    test_case: ExpandableVLMTestArgs,
+    hf_runner: Type[HfRunner],
+    vllm_runner: Type[VllmRunner],
+    video_assets: _VideoAssets,
+):
+    """Video wrapper (parametrized with fork-per-test disabled).
+
+    Looks up the ``VLMTestInfo`` stored under ``model_type`` and forwards it,
+    together with the pytest-supplied arguments, to
+    ``runners.run_video_test``.
+    """
+    runners.run_video_test(
+        model_test_info=VLM_TEST_SETTINGS[model_type],
+        test_case=test_case,
+        hf_runner=hf_runner,
+        vllm_runner=vllm_runner,
+        video_assets=video_assets,
+    )
+
+
+@pytest.mark.parametrize(
+    "model_type,test_case",
+    get_parametrized_options(
+        VLM_TEST_SETTINGS,
+        test_type=VLMTestType.CUSTOM_INPUTS,
+        fork_new_process_for_each_test=False,
+    ),
+)
+def test_custom_inputs_models(model_type: str,
+                              test_case: ExpandableVLMTestArgs,
+                              hf_runner: Type[HfRunner],
+                              vllm_runner: Type[VllmRunner]):
+    """Custom-inputs wrapper (parametrized with fork-per-test disabled).
+
+    Looks up the ``VLMTestInfo`` stored under ``model_type`` and forwards it,
+    together with the pytest-supplied arguments, to
+    ``runners.run_custom_inputs_test``.
+    """
+    runners.run_custom_inputs_test(
+        model_test_info=VLM_TEST_SETTINGS[model_type],
+        test_case=test_case,
+        hf_runner=hf_runner,
+        vllm_runner=vllm_runner,
+    )
+
+
+#### Same wrappers, filtered to settings that fork a new process per test
+@pytest.mark.parametrize(
+    "model_type,test_case",
+    get_parametrized_options(
+        VLM_TEST_SETTINGS,
+        test_type=VLMTestType.IMAGE,
+        fork_new_process_for_each_test=True,
+    ),
+)
+@fork_new_process_for_each_test
+def test_single_image_models_heavy(
+    tmp_path: PosixPath,
+    model_type: str,
+    test_case: ExpandableVLMTestArgs,
+    hf_runner: Type[HfRunner],
+    vllm_runner: Type[VllmRunner],
+    image_assets: _ImageAssets,
+):
+    """Single-image wrapper for settings parametrized with fork-per-test on.
+
+    Decorated with ``fork_new_process_for_each_test``; otherwise it mirrors
+    the non-heavy wrapper: fetch the ``VLMTestInfo`` for ``model_type`` and
+    delegate to ``runners.run_single_image_test``.
+    """
+    runners.run_single_image_test(
+        tmp_path=tmp_path,
+        model_test_info=VLM_TEST_SETTINGS[model_type],
+        test_case=test_case,
+        hf_runner=hf_runner,
+        vllm_runner=vllm_runner,
+        image_assets=image_assets,
+    )
+
+
+@pytest.mark.parametrize(
+    "model_type,test_case",
+    get_parametrized_options(
+        VLM_TEST_SETTINGS,
+        test_type=VLMTestType.MULTI_IMAGE,
+        fork_new_process_for_each_test=True,
+    ),
+)
+@fork_new_process_for_each_test
+def test_multi_image_models_heavy(
+    tmp_path: PosixPath,
+    model_type: str,
+    test_case: ExpandableVLMTestArgs,
+    hf_runner: Type[HfRunner],
+    vllm_runner: Type[VllmRunner],
+    image_assets: _ImageAssets,
+):
+    """Multi-image wrapper for settings parametrized with fork-per-test on.
+
+    Decorated with ``fork_new_process_for_each_test``; otherwise it mirrors
+    the non-heavy wrapper: fetch the ``VLMTestInfo`` for ``model_type`` and
+    delegate to ``runners.run_multi_image_test``.
+    """
+    runners.run_multi_image_test(
+        tmp_path=tmp_path,
+        model_test_info=VLM_TEST_SETTINGS[model_type],
+        test_case=test_case,
+        hf_runner=hf_runner,
+        vllm_runner=vllm_runner,
+        image_assets=image_assets,
+    )
+
+
+@pytest.mark.parametrize(
+    "model_type,test_case",
+    get_parametrized_options(
+        VLM_TEST_SETTINGS,
+        test_type=VLMTestType.EMBEDDING,
+        fork_new_process_for_each_test=True,
+    ),
+)
+@fork_new_process_for_each_test
+def test_image_embedding_models_heavy(
+    model_type: str,
+    test_case: ExpandableVLMTestArgs,
+    hf_runner: Type[HfRunner],
+    vllm_runner: Type[VllmRunner],
+    image_assets: _ImageAssets,
+):
+    """Image-embedding wrapper for settings parametrized with fork-per-test on.
+
+    Decorated with ``fork_new_process_for_each_test``; otherwise it mirrors
+    the non-heavy wrapper: fetch the ``VLMTestInfo`` for ``model_type`` and
+    delegate to ``runners.run_embedding_test``.
+    """
+    runners.run_embedding_test(
+        model_test_info=VLM_TEST_SETTINGS[model_type],
+        test_case=test_case,
+        hf_runner=hf_runner,
+        vllm_runner=vllm_runner,
+        image_assets=image_assets,
+    )
+
+
+@pytest.mark.parametrize("model_type,test_case",
+                         get_parametrized_options(
+                             VLM_TEST_SETTINGS,
+                             test_type=VLMTestType.VIDEO,
+                             fork_new_process_for_each_test=True,
+                         ))
+# The parametrization above selects the fork_new_process_for_each_test=True
+# settings, so the wrapper must carry the decorator like the other *_heavy
+# wrappers; without it these cases run inside the shared pytest process.
+@fork_new_process_for_each_test
+def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs,
+                            hf_runner: Type[HfRunner],
+                            vllm_runner: Type[VllmRunner],
+                            video_assets: _VideoAssets):
+    """Video wrapper for settings parametrized with fork-per-test enabled.
+
+    Fetches the ``VLMTestInfo`` stored under ``model_type`` and delegates to
+    ``runners.run_video_test`` with the pytest-supplied arguments.
+    """
+    model_test_info = VLM_TEST_SETTINGS[model_type]
+    runners.run_video_test(
+        model_test_info=model_test_info,
+        test_case=test_case,
+        hf_runner=hf_runner,
+        vllm_runner=vllm_runner,
+        video_assets=video_assets,
+    )
+
+
+@pytest.mark.parametrize(
+    "model_type,test_case",
+    get_parametrized_options(
+        VLM_TEST_SETTINGS,
+        test_type=VLMTestType.CUSTOM_INPUTS,
+        fork_new_process_for_each_test=True,
+    ),
+)
+@fork_new_process_for_each_test
+def test_custom_inputs_models_heavy(model_type: str,
+                                    test_case: ExpandableVLMTestArgs,
+                                    hf_runner: Type[HfRunner],
+                                    vllm_runner: Type[VllmRunner]):
+    """Custom-inputs wrapper for settings parametrized with fork-per-test on.
+
+    Decorated with ``fork_new_process_for_each_test``; otherwise it mirrors
+    the non-heavy wrapper: fetch the ``VLMTestInfo`` for ``model_type`` and
+    delegate to ``runners.run_custom_inputs_test``.
+    """
+    runners.run_custom_inputs_test(
+        model_test_info=VLM_TEST_SETTINGS[model_type],
+        test_case=test_case,
+        hf_runner=hf_runner,
+        vllm_runner=vllm_runner,
+    )
diff --git a/tests/models/decoder_only/vision_language/test_paligemma.py b/tests/models/decoder_only/vision_language/test_paligemma.py
deleted file mode 100644
index 69189ba2f25cb..0000000000000
--- a/tests/models/decoder_only/vision_language/test_paligemma.py
+++ /dev/null
@@ -1,174 +0,0 @@
-import os
-from typing import List, Optional, Tuple, Type
-
-import pytest
-from transformers import (AutoConfig, AutoModelForVision2Seq, AutoTokenizer,
- BatchEncoding)
-
-from vllm.multimodal.utils import rescale_image_size
-from vllm.platforms import current_platform
-from vllm.sequence import SampleLogprobs
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
-
-from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
-from ...utils import check_logprobs_close
-
-HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
- "stop_sign":
- "caption es",
- "cherry_blossom":
- "What is in the picture?",
-})
-
-models = ["google/paligemma-3b-mix-224"]
-
-# ROCm Triton FA can run into compilation issues with these models due to,
-# excessive use of shared memory. Use other backends in the meantime.
-# FIXME (mattwong, gshtrasb, hongxiayan)
-if current_platform.is_rocm():
- os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"
-
-
-def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
- Optional[SampleLogprobs]],
- model: str):
- """Sanitize vllm output to be comparable with hf output."""
- output_ids, output_str, out_logprobs = vllm_output
-
- config = AutoConfig.from_pretrained(model)
- image_token_id = config.image_token_index
-
- tokenizer = AutoTokenizer.from_pretrained(model)
- eos_token_id = tokenizer.eos_token_id
-
- hf_output_ids = [
- token_id for idx, token_id in enumerate(output_ids)
- if token_id != image_token_id or output_ids[idx - 1] != image_token_id
- ]
-
- hf_output_str = output_str
-
- if hf_output_ids[-1] == eos_token_id:
- hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)
-
- return hf_output_ids, hf_output_str, out_logprobs
-
-
-def run_test(
- hf_runner: Type[HfRunner],
- vllm_runner: Type[VllmRunner],
- image_assets: _ImageAssets,
- model: str,
- *,
- size_factors: List[float],
- dtype: str,
- max_tokens: int,
- num_logprobs: int,
- tensor_parallel_size: int,
- distributed_executor_backend: Optional[str] = None,
-):
- """Inference result should be the same between hf and vllm.
-
- All the image fixtures for the test are from IMAGE_ASSETS.
- For huggingface runner, we provide the PIL images as input.
- For vllm runner, we provide MultiModalDataDict objects
- and corresponding MultiModalConfig as input.
- Note, the text input is also adjusted to abide by vllm contract.
- The text output is sanitized to be able to compare with hf.
- """
- torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]
- images = [asset.pil_image for asset in image_assets]
-
- inputs_per_image = [(
- [prompt for _ in size_factors],
- [rescale_image_size(image, factor) for factor in size_factors],
- ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
-
- # NOTE: take care of the order. run vLLM first, and then run HF.
- # vLLM needs a fresh new process without cuda initialization.
- # if we run HF first, the cuda initialization will be done and it
- # will hurt multiprocessing backend with fork method (the default method).
-
- # max_model_len should be greater than image_feature_size
- with vllm_runner(model,
- dtype=dtype,
- tensor_parallel_size=tensor_parallel_size,
- distributed_executor_backend=distributed_executor_backend,
- enforce_eager=True) as vllm_model:
- vllm_outputs_per_image = [
- vllm_model.generate_greedy_logprobs(prompts,
- max_tokens,
- num_logprobs=num_logprobs,
- images=images)
- for prompts, images in inputs_per_image
- ]
-
- def process(hf_inputs: BatchEncoding):
- hf_inputs["pixel_values"] = hf_inputs["pixel_values"] \
- .to(torch_dtype) # type: ignore
- return hf_inputs
-
- with hf_runner(model,
- dtype=dtype,
- postprocess_inputs=process,
- auto_cls=AutoModelForVision2Seq) as hf_model:
- hf_outputs_per_image = [
- hf_model.generate_greedy_logprobs_limit(prompts,
- max_tokens,
- num_logprobs=num_logprobs,
- images=images)
- for prompts, images in inputs_per_image
- ]
-
- for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
- vllm_outputs_per_image):
-
- check_logprobs_close(
- outputs_0_lst=hf_outputs,
- outputs_1_lst=[
- vllm_to_hf_output(vllm_output, model)
- for vllm_output in vllm_outputs
- ],
- name_0="hf",
- name_1="vllm",
- )
-
-
-@pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize(
- "size_factors",
- [
- # No image
- [],
- # Single-scale
- [1.0],
- # Single-scale, batched
- [1.0, 1.0, 1.0],
- # Multi-scale
- [0.25, 0.5, 1.0],
- ],
-)
-@pytest.mark.parametrize("dtype", [
- pytest.param(
- "float",
- marks=pytest.mark.skipif(
- current_platform.is_rocm(),
- reason=
- "ROCm FA does not yet fully support 32-bit precision on PaliGemma")
- ), "half"
-])
-@pytest.mark.parametrize("max_tokens", [128])
-@pytest.mark.parametrize("num_logprobs", [5])
-def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
- dtype: str, max_tokens: int, num_logprobs: int) -> None:
- run_test(
- hf_runner,
- vllm_runner,
- image_assets,
- model,
- size_factors=size_factors,
- dtype=dtype,
- max_tokens=max_tokens,
- num_logprobs=num_logprobs,
- tensor_parallel_size=1,
- )
diff --git a/tests/models/decoder_only/vision_language/test_phi3v.py b/tests/models/decoder_only/vision_language/test_phi3v.py
index 1840b4bb8574c..b9c20ddb2d746 100644
--- a/tests/models/decoder_only/vision_language/test_phi3v.py
+++ b/tests/models/decoder_only/vision_language/test_phi3v.py
@@ -3,19 +3,14 @@
from typing import List, Optional, Tuple, Type
import pytest
-import torch
-from transformers import AutoImageProcessor, AutoTokenizer
+from transformers import AutoTokenizer
-from vllm.inputs import InputContext, token_inputs
-from vllm.model_executor.models.phi3v import _IMAGE_TOKEN_ID
-from vllm.multimodal import MultiModalRegistry
from vllm.multimodal.utils import rescale_image_size
from vllm.platforms import current_platform
from vllm.sequence import SampleLogprobs
-from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
- _ImageAssets)
-from ...utils import build_model_context, check_logprobs_close
+from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
+from ...utils import check_logprobs_close
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"stop_sign":
@@ -81,12 +76,15 @@ def run_test(
Note, the text input is also adjusted to abide by vllm contract.
The text output is sanitized to be able to compare with hf.
"""
+ # HACK - this is an attempted workaround for the following bug
+ # https://github.com/huggingface/transformers/issues/34307
+ from transformers import AutoImageProcessor # noqa: F401
+ from transformers import AutoProcessor # noqa: F401
# NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method).
-
# max_model_len should be greater than image_feature_size
with vllm_runner(model,
task="generate",
@@ -236,172 +234,3 @@ def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
mm_limit=2,
tensor_parallel_size=1,
)
-
-
-### Fast tests for correctness in processor_kwarg override handling
-
-
-# Wrap lazy imports to avoid initializing CUDA during test collection
-@pytest.fixture()
-def input_processor_for_phi3v():
- from vllm.model_executor.models.phi3v import input_processor_for_phi3v
- return input_processor_for_phi3v
-
-
-@pytest.fixture()
-def dummy_data_for_phi3v():
- from vllm.model_executor.models.phi3v import dummy_data_for_phi3v
- return dummy_data_for_phi3v
-
-
-@pytest.fixture()
-def get_max_phi3v_image_tokens():
- from vllm.model_executor.models.phi3v import get_max_phi3v_image_tokens
- return get_max_phi3v_image_tokens
-
-
-@pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize("num_crops", [4, 16, None])
-def test_input_mapper_override(model: str, image_assets: _ImageAssets,
- num_crops: Optional[int]):
- """Ensure that the [default] input mapper handles num_crops properly."""
- # We pass the processor kwargs here since for this model, we fall back to
- # the default mapper; this will fall back to the HF mapper and forward
- # mm_processor_kwargs to it.
- mm_processor_kwargs = {
- "num_crops": num_crops
- } if num_crops is not None else {}
- ctx = build_model_context(
- model_name=model,
- tokenizer_name=model,
- trust_remote_code=True,
- mm_processor_kwargs=mm_processor_kwargs,
- )
-
- hf_processor = AutoImageProcessor.from_pretrained(model,
- trust_remote_code=True,
- **mm_processor_kwargs)
-
- mm_registry = MultiModalRegistry()
- mm_registry.init_mm_limits_per_prompt(ctx.model_config)
-
- image = image_assets[0].pil_image
- hf_result = hf_processor.preprocess(
- image,
- return_tensors="pt",
- )
-
- vllm_result = mm_registry.map_input(
- ctx.model_config,
- {"image": image},
- )
-
- assert torch.all(hf_result["image_sizes"] == vllm_result["image_sizes"])
- assert torch.all(
- hf_result["num_img_tokens"] == vllm_result["num_img_tokens"])
-
- # For pixel values, the second axis should be the num_crops + 1
- # for the rescaled original image. The default value in VLLM falls
- # back to the HF config, which is why we compare to the processor num_crops
- assert torch.all(hf_result["pixel_values"] == vllm_result["pixel_values"])
- assert vllm_result["pixel_values"].shape[1] == hf_processor.num_crops + 1
-
-
-@pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize("num_crops,expected_max_tokens", [
- (4, 781),
- (16, 2653),
-])
-def test_max_tokens_override(get_max_phi3v_image_tokens, model: str,
- num_crops: int, expected_max_tokens: int):
- """Ensure get_max_phi3v_image_tokens handles num_crops properly."""
- # NOTE: mm_processor_kwargs on the context in this test is unused, since
- # this is testing the mapper directly. In practice, the processor kwargs
- # are wrapped in a closure when calling the max tokens func. We explicitly
- # do NOT use the mm_processor_kwargs in the model context here to ensure
- # that the max image tokens implementation is referencing a mix of the
- # kwargs to the function and the original mm_processor_kwargs in case
- # values are somehow updated and end up in a bad state.
- ctx = build_model_context(
- model_name=model,
- tokenizer_name=model,
- trust_remote_code=True,
- mm_processor_kwargs=None,
- )
-
- actual_max_tokens = get_max_phi3v_image_tokens(
- InputContext(ctx.model_config),
- num_crops=num_crops,
- )
-
- assert expected_max_tokens == actual_max_tokens
-
-
-@pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize("num_crops,toks_per_img,num_imgs", [
- (4, 781, 1),
- (4, 781, 2),
- (16, 2653, 1),
- (16, 2653, 2),
-])
-def test_dummy_data_override(dummy_data_for_phi3v, model: str, num_crops: int,
- toks_per_img: int, num_imgs: int):
- """Ensure dummy_data_for_phi3v handles num_crops properly."""
- # Same as the previous test - don't initialize mm_processor_kwargs
- # in this test and assume that the kwargs will be correctly expanded by
- # the partial when calling the dummy data func.
- ctx = build_model_context(
- model_name=model,
- tokenizer_name=model,
- trust_remote_code=True,
- mm_processor_kwargs=None,
- )
-
- sequence_data, _, = dummy_data_for_phi3v(
- ctx=ctx,
- seq_len=8192, # Should be bigger than num_imgs * toks_per_img
- mm_counts={"image": num_imgs},
- num_crops=num_crops,
- )
- # Ensure we have the right number of placeholders per num_crops size
- img_tok_count = sequence_data.get_token_ids().count(_IMAGE_TOKEN_ID)
- assert img_tok_count == toks_per_img * num_imgs
-
-
-@pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize("num_crops,expected_toks_per_img,num_imgs", [
- (4, 757, 1),
- (4, 757, 2),
- (16, 1921, 1),
- (16, 1921, 2),
-])
-def test_input_processor_override(input_processor_for_phi3v,
- image_assets: _ImageAssets, model: str,
- num_crops: int, expected_toks_per_img: int,
- num_imgs: int):
- """Ensure input_processor_for_phi3v handles num_crops properly."""
- # Same as the previous test - don't initialize mm_processor_kwargs
- # in this test and assume that the kwargs will be correctly expanded by
- # the partial when calling the custom input processor.
- ctx = build_model_context(
- model_name=model,
- tokenizer_name=model,
- trust_remote_code=True,
- )
- tokenizer = AutoTokenizer.from_pretrained(model)
- # Build the image str / prompt based on the number of images we pass
- img_str = "".join([f"<|image_{idx}|>\n" for idx in range(1, num_imgs + 1)])
- prompt = f"<|user|>\n{img_str}<|end|>\n<|assistant|>\n"
- images = [image_assets[0].pil_image] * num_imgs
-
- inputs = token_inputs(prompt_token_ids=tokenizer.encode(prompt),
- prompt=prompt,
- multi_modal_data={"image": images})
-
- processed_inputs = input_processor_for_phi3v(ctx,
- inputs,
- num_crops=num_crops)
-
- # Ensure we have the right number of placeholders per num_crops size
- img_tok_count = processed_inputs["prompt_token_ids"].count(_IMAGE_TOKEN_ID)
- assert img_tok_count == expected_toks_per_img * num_imgs
diff --git a/tests/models/decoder_only/vision_language/test_qwen.py b/tests/models/decoder_only/vision_language/test_qwen.py
deleted file mode 100644
index db5ab485f872d..0000000000000
--- a/tests/models/decoder_only/vision_language/test_qwen.py
+++ /dev/null
@@ -1,374 +0,0 @@
-import pathlib
-from typing import Dict, List, Optional, Tuple, Type, Union
-
-import pytest
-import torch
-from PIL.Image import Image
-
-from vllm.inputs import InputContext, token_inputs
-from vllm.multimodal.base import MultiModalInputs
-from vllm.multimodal.utils import cached_get_tokenizer, rescale_image_size
-
-from ....conftest import (IMAGE_ASSETS, HfRunner, ImageAsset, PromptImageInput,
- VllmRunner, _ImageAssets)
-from ...utils import build_model_context, check_logprobs_close
-
-text_only_models = [
- "Qwen/Qwen-7B-Chat" # Has no visual component
-]
-
-multimodal_models = ["Qwen/Qwen-VL"]
-
-HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
- "stop_sign":
- "Picture 1: \nWhat's the content of the image?: ",
- "cherry_blossom":
- "Picture 1: \nWhat is the season?: ",
-})
-
-HF_MULTIIMAGE_IMAGE_PROMPT = "Picture 1: \nPicture 2: \nCan you compare these images?\n" # noqa: E501
-HF_MULTIIMAGE_IMAGE_PROMPT = "Picture 1: \nPicture 2: \nDescribe the two images in detail.\n" # noqa: E501
-### Multimodal preprocessing tests
-SAMPLE_IMAGE = IMAGE_ASSETS[0].pil_image
-# These values are specific to Qwen-VL/Chat; we can get these from the model
-# config also, but they are hardcoded here to keep the parameterize/fixtures
-# easy to read.
-IMG_START_ID = 151857
-IMG_END_ID = 151858
-IMG_PAD_ID = 151859
-TOKS_PER_IMG = 256
-VIS_ENC_DIM = 4096
-IMG_SIZE = 448
-
-
-@pytest.fixture()
-def input_mapper_for_qwen():
- # Lazy import to avoid initializing CUDA during test collection
- from vllm.model_executor.models.qwen import input_mapper_for_qwen
- return input_mapper_for_qwen
-
-
-@pytest.fixture()
-def input_processor_for_qwen():
- # Lazy import to avoid initializing CUDA during test collection
- from vllm.model_executor.models.qwen import input_processor_for_qwen
- return input_processor_for_qwen
-
-
-@pytest.fixture()
-def qwen_vl_context() -> InputContext:
- """Get an InputContext for Qwen-VL."""
- return build_model_context(model_name="Qwen/Qwen-VL",
- trust_remote_code=True)
-
-
-# Happy path tests for single/multi-image scenarios for the multimodal
-# input processor and mapper, respectively
-@pytest.mark.parametrize("num_images", [1, 2])
-def test_input_processor_valid_mm_data(input_processor_for_qwen,
- qwen_vl_context: InputContext,
- num_images: int):
- """Happy cases for image inputs to Qwen's multimodal input processor."""
- prompt = "".join(
- [f"Picture {num}: \n" for num in range(1, num_images + 1)])
- inputs = token_inputs(
- prompt=prompt,
- # When processing multimodal data for a multimodal model, the qwen
- # input processor will overwrite the provided prompt_token_ids with
- # the image prompts
- prompt_token_ids=[],
- multi_modal_data={"image": torch.rand(num_images, TOKS_PER_IMG, 4096)},
- )
- proc_inputs = input_processor_for_qwen(qwen_vl_context, inputs)
- assert isinstance(proc_inputs, dict)
-
- # Each image should have one start / stop and a fixed context of 256
- proc_tokens = proc_inputs["prompt_token_ids"]
- assert proc_tokens.count(IMG_START_ID) == num_images
- assert proc_tokens.count(IMG_END_ID) == num_images
- assert proc_tokens.count(IMG_PAD_ID) == num_images * TOKS_PER_IMG
-
-
-@pytest.mark.parametrize(
- "img_data,expected_shape",
- [
- # single / multi-image
- (SAMPLE_IMAGE, (1, 3, IMG_SIZE, IMG_SIZE)),
- (2 * [SAMPLE_IMAGE], (2, 3, IMG_SIZE, IMG_SIZE)),
- # single / multi-image embeddings
- (torch.rand(
- (TOKS_PER_IMG, VIS_ENC_DIM)), (1, TOKS_PER_IMG, VIS_ENC_DIM)),
- (torch.rand(
- (1, TOKS_PER_IMG, VIS_ENC_DIM)), (1, TOKS_PER_IMG, VIS_ENC_DIM)),
- (torch.rand(
- (2, TOKS_PER_IMG, VIS_ENC_DIM)), (2, TOKS_PER_IMG, VIS_ENC_DIM)),
- ])
-def test_input_mapper_valid_mm_data(input_mapper_for_qwen,
- qwen_vl_context: InputContext,
- img_data: Union[torch.Tensor, List[Image],
- Image],
- expected_shape: List[int]):
- """Happy cases for image inputs to Qwen's multimodal input mapper."""
- mapped_img_data = input_mapper_for_qwen(qwen_vl_context, img_data)
- # Ensure that we get the appropriately shaped pixel_values
- # for images and image embeddings, respectively.
- assert isinstance(mapped_img_data, MultiModalInputs)
- assert "pixel_values" in mapped_img_data
- assert mapped_img_data["pixel_values"].shape == expected_shape
-
-
-# Sad path tests for the multimodal input processor and mapper, respectively
-@pytest.mark.parametrize("mm_data", [
- {
- "image": torch.rand((5))
- },
- {
- "image": torch.rand((5, 5, 5, 5, 5))
- },
-])
-def test_input_processor_invalid_mm_data(input_processor_for_qwen,
- qwen_vl_context: InputContext,
- mm_data: Dict[str, torch.Tensor]):
- """Test sad cases validated in Qwen's multimodal input processor."""
- tokenizer = cached_get_tokenizer(qwen_vl_context.model_config.tokenizer,
- trust_remote_code=True)
- prompt = "Picture 1: \n"
- prompt_token_ids = tokenizer.encode(prompt)
- inputs = token_inputs(prompt=prompt,
- prompt_token_ids=prompt_token_ids,
- multi_modal_data=mm_data)
- # Should fail since we have too many or too few dimensions for embeddings
- with pytest.raises(ValueError):
- input_processor_for_qwen(qwen_vl_context, inputs)
-
-
-@pytest.mark.parametrize(
- "img_data",
- [
- # Wrong context length
- torch.rand((1, TOKS_PER_IMG + 10, VIS_ENC_DIM)),
- # Wrong visual encoder output size
- torch.rand((1, TOKS_PER_IMG, VIS_ENC_DIM + 10)),
- ])
-def test_input_mapper_invalid_mm_data(
- input_mapper_for_qwen,
- qwen_vl_context: InputContext,
- img_data: Union[torch.Tensor, List[Image], Image],
-):
- """Sad cases validated in Qwen VL's multimodal input mapper."""
- with pytest.raises(ValueError):
- input_mapper_for_qwen(qwen_vl_context, img_data)
-
-
-### End-to-end generation tests
-def get_prompt_with_path(tmp_path: pathlib.PosixPath, prompt: str,
- assets: Union[_ImageAssets, List[ImageAsset]]) -> str:
- """Given a temporary dir path, export one or more image assets into the
- tempdir & replace its contents with the local path to the string so that
- the HF version of Qwen-VL can resolve the path and load the image ni its
- forward() call.
-
- Args:
- tmp_path: Tempdir for test under consideration.
- prompt: Prompt with image placeholders.
- assets: List of image assets whose len equals the num placeholders.
- """
- # Ensure that the number of placeholders matches the number of assets;
- # If this is not true, the test is probably written incorrectly.
- assert prompt.count("") == len(assets)
-
- # Replace the placeholders with local paths to the exported assets
- for asset in assets:
- image_tmp_path = tmp_path / f"{asset.name}.jpg"
- asset.pil_image.save(image_tmp_path)
- prompt = prompt.replace(
- "",
- f"{image_tmp_path}",
- 1,
- )
- return prompt
-
-
-def run_test(
- hf_runner: Type[HfRunner],
- vllm_runner: Type[VllmRunner],
- inputs: List[Tuple[List[str], PromptImageInput]],
- model: str,
- *,
- dtype: str,
- max_tokens: int,
- num_logprobs: int,
- mm_limit: int,
- tensor_parallel_size: int,
- distributed_executor_backend: Optional[str] = None,
-):
- """Inference result should be the same between hf and vllm.
-
- All the image fixtures for the test is under tests/images.
- For huggingface runner, we provide the PIL images as input.
- For vllm runner, we provide MultiModalDataDict objects
- and corresponding MultiModalConfig as input.
- Note, the text input is also adjusted to abide by vllm contract.
- The text output is sanitized to be able to compare with hf.
- """
-
- # NOTE: take care of the order. run vLLM first, and then run HF.
- # vLLM needs a fresh new process without cuda initialization.
- # if we run HF first, the cuda initialization will be done and it
- # will hurt multiprocessing backend with fork method (the default method).
-
- # max_model_len should be greater than image_feature_size
- # Qwen encodes each image into a fixed content size of 256
- with vllm_runner(model,
- max_model_len=1024,
- max_num_seqs=2,
- dtype=dtype,
- limit_mm_per_prompt={"image": mm_limit},
- tensor_parallel_size=tensor_parallel_size,
- distributed_executor_backend=distributed_executor_backend,
- enforce_eager=True) as vllm_model:
- vllm_outputs_per_image = [
- vllm_model.generate_greedy_logprobs(prompts,
- max_tokens,
- num_logprobs=num_logprobs,
- images=images)
- for prompts, images in inputs
- ]
-
- with hf_runner(model, dtype=dtype) as hf_model:
- hf_outputs_per_image = [
- hf_model.generate_greedy_logprobs_limit(prompts,
- max_tokens,
- num_logprobs=num_logprobs,
- images=images)
- for prompts, images in inputs
- ]
-
- for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
- vllm_outputs_per_image):
-
- check_logprobs_close(
- outputs_0_lst=hf_outputs,
- outputs_1_lst=vllm_outputs,
- name_0="hf",
- name_1="vllm",
- )
-
-
-@pytest.mark.parametrize("model", multimodal_models)
-@pytest.mark.parametrize(
- "size_factors",
- [
- # No image
- [],
- # Single-scale
- [1.0],
- # Single-scale, batched
- [1.0, 1.0, 1.0],
- # Multi-scale
- [0.25, 0.5, 1.0],
- ],
-)
-@pytest.mark.parametrize("dtype", ["bfloat16"])
-@pytest.mark.parametrize("max_tokens", [8])
-@pytest.mark.parametrize("num_logprobs", [5])
-def test_multimodal_models_single_image(tmp_path: pathlib.PosixPath,
- hf_runner: Type[HfRunner],
- vllm_runner: Type[VllmRunner],
- image_assets: _ImageAssets, model: str,
- size_factors: List[float], dtype: str,
- max_tokens: int,
- num_logprobs: int) -> None:
- """Tests multimodal models with single image prompts."""
- images = [asset.pil_image for asset in image_assets]
-
- prompts = [
- get_prompt_with_path(tmp_path, prompt, [asset])
- for prompt, asset in zip(HF_IMAGE_PROMPTS, image_assets)
- ]
-
- inputs = [(
- [prompt for _ in size_factors],
- [rescale_image_size(image, factor) for factor in size_factors],
- ) for image, prompt in zip(images, prompts)]
-
- run_test(
- hf_runner,
- vllm_runner,
- inputs,
- model,
- dtype=dtype,
- max_tokens=max_tokens,
- num_logprobs=num_logprobs,
- mm_limit=1,
- tensor_parallel_size=1,
- )
-
-
-@pytest.mark.parametrize("model", multimodal_models)
-@pytest.mark.parametrize(
- "size_factors",
- [
- # No image
- [],
- # Single-scale
- [1.0],
- # Single-scale, batched
- [1.0, 1.0, 1.0],
- # Multi-scale
- [0.25, 0.5, 1.0],
- ],
-)
-@pytest.mark.parametrize("dtype", ["bfloat16"])
-@pytest.mark.parametrize("max_tokens", [128])
-@pytest.mark.parametrize("num_logprobs", [5])
-def test_multimodal_models_multi_image(tmp_path: pathlib.PosixPath,
- hf_runner: Type[HfRunner],
- vllm_runner: Type[VllmRunner],
- image_assets: _ImageAssets, model: str,
- size_factors: List[float], dtype: str,
- max_tokens: int,
- num_logprobs: int) -> None:
- """Tests multimodal models with multi-image prompts."""
- images = [asset.pil_image for asset in image_assets]
- # Put all of the images into one prompt.
- prompt = get_prompt_with_path(tmp_path, HF_MULTIIMAGE_IMAGE_PROMPT,
- image_assets)
- inputs = [([prompt for _ in size_factors],
- [[rescale_image_size(image, factor) for image in images]
- for factor in size_factors])]
-
- run_test(
- hf_runner,
- vllm_runner,
- inputs,
- model,
- dtype=dtype,
- max_tokens=max_tokens,
- num_logprobs=num_logprobs,
- mm_limit=2,
- tensor_parallel_size=1,
- )
-
-
-# Ensure that a text-only Qwen model can still be loaded and
-# used for inference in VLLM without throwing.
-@pytest.mark.parametrize("model", text_only_models)
-@pytest.mark.parametrize("dtype", ["bfloat16"])
-@pytest.mark.parametrize("max_tokens", [32])
-@pytest.mark.parametrize("num_logprobs", [5])
-def test_text_only_qwen_model_can_be_loaded_and_run(
- vllm_runner: Type[VllmRunner],
- example_prompts: List[str],
- model: str,
- *,
- dtype: str,
- max_tokens: int,
- num_logprobs: int,
-):
- with vllm_runner(model, dtype=dtype) as vllm_model:
- vllm_model.generate_greedy_logprobs(
- example_prompts,
- max_tokens,
- num_logprobs=num_logprobs,
- )
diff --git a/tests/models/decoder_only/vision_language/vlm_utils/__init__.py b/tests/models/decoder_only/vision_language/vlm_utils/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/tests/models/decoder_only/vision_language/vlm_utils/builders.py b/tests/models/decoder_only/vision_language/vlm_utils/builders.py
new file mode 100644
index 0000000000000..66668296139f5
--- /dev/null
+++ b/tests/models/decoder_only/vision_language/vlm_utils/builders.py
@@ -0,0 +1,235 @@
+"""Helpers for building inputs that can be leveraged for different test types.
+"""
+from pathlib import PosixPath
+from typing import Callable, Iterable, List, Optional, Tuple, Union
+
+import torch
+
+from vllm.multimodal.utils import (rescale_image_size, rescale_video_size,
+ resize_video, sample_frames_from_video)
+
+from .....conftest import _ImageAssets, _VideoAssets
+from .types import (SINGLE_IMAGE_BASE_PROMPTS, TEST_IMG_PLACEHOLDER,
+ TEST_VIDEO_PLACEHOLDER, VIDEO_BASE_PROMPT,
+ ImageSizeWrapper, SizeType, VLMTestInfo)
+
+
+def replace_test_placeholder(prompt: str, img_idx_to_prompt: Callable[[int],
+ str],
+ test_placeholder: str) -> str:
+ """Given a prompt, replaces each test placeholder with the
+ model-specific tag.
+ """
+ prompt_segments = prompt.split(test_placeholder)
+ img_prompt = prompt_segments[0]
+ for placeholder_idx, next_seg in enumerate(prompt_segments[1:], start=1):
+ img_prompt += img_idx_to_prompt(placeholder_idx)
+ img_prompt += next_seg
+ return img_prompt
+
+
+def get_model_prompts(base_prompts: Iterable[str],
+ img_idx_to_prompt: Optional[Callable[[int], str]],
+ video_idx_to_prompt: Optional[Callable[[int], str]],
+ prompt_formatter: Callable[[str], str]) -> List[str]:
+ """Given a model-agnostic base prompt and test configuration for a model(s)
+ to be tested, update the media placeholders and apply the prompt formatting
+ to get the test prompt string for this model.
+
+ Example for phi3v, given the base_prompt: "What is the season?"
+ 1. Replace img placeholder(s)
+ -> "<|image_1|>\nWhat is the season?"
+ 2. Apply prompt formatter:
+ -> <|user|>\n<|image_1|>\nWhat is the season?<|end|>\n<|assistant|>\n
+ """
+ assert isinstance(base_prompts, (list, tuple))
+ model_prompts = []
+ for base_prompt in base_prompts:
+ # Replace the multimodal placeholders in the base prompt with
+ # the correct ones for the model that we are testing
+ if img_idx_to_prompt:
+ base_prompt = replace_test_placeholder(base_prompt,
+ img_idx_to_prompt,
+ TEST_IMG_PLACEHOLDER)
+
+ if video_idx_to_prompt:
+ base_prompt = replace_test_placeholder(base_prompt,
+ video_idx_to_prompt,
+ TEST_VIDEO_PLACEHOLDER)
+
+ # Apply the prompt formatter to wrap the base prompt with
+ # the correct media placeholders to get the model test prompt
+ model_prompt = prompt_formatter(base_prompt)
+ model_prompts.append(model_prompt)
+ return model_prompts
+
+
+def build_single_image_inputs_from_test_info(
+ test_info: VLMTestInfo,
+ image_assets: _ImageAssets,
+ size_wrapper: ImageSizeWrapper,
+ tmp_path: Optional[PosixPath] = None):
+ if test_info.prompt_formatter is None:
+ raise ValueError(
+ "Prompt formatter must be set to build single image inputs")
+
+ model_prompts = get_model_prompts(test_info.single_image_prompts,
+ test_info.img_idx_to_prompt,
+ test_info.video_idx_to_prompt,
+ test_info.prompt_formatter)
+
+ # For models that require a local path / URL encoded in the image; export
+ # assets and encode into tmp_path for this test. This should be avoided
+ # where possible (currently needed for Qwen-VL).
+ if test_info.prompt_path_encoder is not None:
+ if tmp_path is None:
+ raise ValueError("Prompt path encoder requires setting local path")
+ model_prompts = [
+ test_info.prompt_path_encoder(tmp_path, prompt, [asset])
+ for prompt, asset in zip(model_prompts, image_assets)
+ ]
+
+ images = [asset.pil_image for asset in image_assets]
+ assert len(images) == len(model_prompts)
+ return build_single_image_inputs(images, model_prompts, size_wrapper)
+
+
+def build_single_image_inputs(images, model_prompts,
+ size_wrapper: ImageSizeWrapper):
+ # For every image / prompt pair, get a pair containing two lists of
+ # length size_factors, where the first contains duplicates of the model
+ # prompt [str], and the second contains copies of the image after being
+ # scaled by one of the size factors.
+ #
+ # NOTE: rescaling preserves the image aspect ratio.
+ return [(
+ [prompt for _ in size_wrapper.data],
+ [
+ apply_image_size_scaling(image, size, size_wrapper.type)
+ for size in size_wrapper.data
+ ],
+ ) for image, prompt in zip(images, model_prompts)]
+
+
+def build_multi_image_inputs_from_test_info(
+ test_info: VLMTestInfo,
+ image_assets: _ImageAssets,
+ size_wrapper: ImageSizeWrapper,
+ tmp_path: Optional[PosixPath] = None):
+ if test_info.prompt_formatter is None:
+ raise ValueError(
+ "Prompt formatter must be set to build multi image inputs")
+
+ model_prompts = get_model_prompts([test_info.multi_image_prompt],
+ test_info.img_idx_to_prompt,
+ test_info.video_idx_to_prompt,
+ test_info.prompt_formatter)
+
+ if test_info.prompt_path_encoder is not None:
+ if tmp_path is None:
+ raise ValueError("Prompt path encoder requires setting local path")
+ model_prompts = [
+ test_info.prompt_path_encoder(tmp_path, model_prompt, image_assets)
+ for model_prompt in model_prompts
+ ]
+
+ images = [asset.pil_image for asset in image_assets]
+
+ # Currently, we only have one multi-image list & one multi-image prompt
+ return build_multi_image_inputs(
+ image_lists=[images],
+ model_prompts=model_prompts,
+ size_wrapper=size_wrapper,
+ )
+
+
+def build_multi_image_inputs(image_lists, model_prompts,
+ size_wrapper: ImageSizeWrapper):
+ return [(
+ [prompt for _ in size_wrapper.data],
+ [[
+ apply_image_size_scaling(image, size, size_wrapper.type)
+ for image in images
+ ] for size in size_wrapper.data],
+ ) for images, prompt in zip(image_lists, model_prompts)]
+
+
+def build_embedding_inputs_from_test_info(
+ test_info: VLMTestInfo,
+ image_assets: _ImageAssets,
+ size_wrapper: ImageSizeWrapper,
+):
+ # These conditions will always be true if invoked through filtering,
+ # but we still check them in case this is ever called directly
+ if test_info.prompt_formatter is None:
+ raise ValueError(
+ "Prompt formatter must be set to build image embedding inputs")
+ if size_wrapper.type != SizeType.SIZE_FACTOR or not \
+ all(factor == 1.0 for factor in size_wrapper.data):
+ raise ValueError("Embedding tests require constant (1.0) size factors")
+ if test_info.convert_assets_to_embeddings is None:
+ raise ValueError("No conversion func for getting embeddings found")
+
+ model_prompts = get_model_prompts(
+ SINGLE_IMAGE_BASE_PROMPTS,
+ test_info.img_idx_to_prompt,
+ test_info.video_idx_to_prompt,
+ test_info.prompt_formatter,
+ )
+
+ images = [asset.pil_image for asset in image_assets]
+ embeds = test_info.convert_assets_to_embeddings(image_assets)
+ assert len(images) == len(model_prompts)
+
+ inputs = build_single_image_inputs(images, model_prompts, size_wrapper)
+ vllm_embeddings = build_single_image_inputs(embeds, model_prompts,
+ size_wrapper)
+ return inputs, vllm_embeddings
+
+
+def build_video_inputs_from_test_info(
+ test_info: VLMTestInfo,
+ video_assets: _VideoAssets,
+ size_wrapper: ImageSizeWrapper,
+ num_frames: int,
+):
+ if test_info.prompt_formatter is None:
+ raise ValueError("Prompt formatter must be set to build video inputs")
+ model_prompts = get_model_prompts(
+ [VIDEO_BASE_PROMPT],
+ test_info.img_idx_to_prompt,
+ test_info.video_idx_to_prompt,
+ test_info.prompt_formatter,
+ )
+
+ sampled_vids = [
+ sample_frames_from_video(asset.np_ndarrays, num_frames)
+ for asset in video_assets
+ ]
+
+ video_scaler = (resize_video if size_wrapper.type == SizeType.FIXED_SIZE
+ else rescale_video_size)
+
+ return [(
+ [prompt for _ in size_wrapper.data],
+ [video_scaler(video, size) for size in size_wrapper.data],
+ ) for video, prompt in zip(sampled_vids, model_prompts)]
+
+
+def apply_image_size_scaling(image, size: Union[float, Tuple[int, int]],
+                             size_type: SizeType):
+    """Applies one size spec to one image: either a size factor (handled by
+    rescale_image_size) or a fixed size (handed to image.resize)."""
+    # Special case for embeddings; if it's a tensor, it's only valid if we
+    # are considering size factors at constant scale, in which case the
+    # tensor is returned as-is (it is not copied)
+    if isinstance(image, torch.Tensor):
+        assert size_type == SizeType.SIZE_FACTOR and size == 1
+        return image
+    if size_type == SizeType.SIZE_FACTOR:
+        # size is a single image size factor
+        return rescale_image_size(image, size)
+    elif size_type == SizeType.FIXED_SIZE:
+        # size is a single fixed size, given directly to image.resize
+        return image.resize(size)
+    raise ValueError("ImageSizeWrapper type must be FIXED_SIZE or SIZE_FACTOR")
diff --git a/tests/models/decoder_only/vision_language/vlm_utils/case_filtering.py b/tests/models/decoder_only/vision_language/vlm_utils/case_filtering.py
new file mode 100644
index 0000000000000..9bb7134160659
--- /dev/null
+++ b/tests/models/decoder_only/vision_language/vlm_utils/case_filtering.py
@@ -0,0 +1,157 @@
+"""Utils for determining which subset of model tests belong to a specific
+modality, getting all combinations (similar to pytest's parametrization),
+handling multimodal placeholder substitution, and so on.
+"""
+import itertools
+from collections import OrderedDict
+from typing import Dict, Iterable, Tuple
+
+import pytest
+
+from .types import (EMBEDDING_SIZE_FACTORS, ExpandableVLMTestArgs,
+ ImageSizeWrapper, SizeType, VLMTestInfo, VLMTestType)
+
+
+def get_filtered_test_settings(test_settings: Dict[str, VLMTestInfo],
+                               test_type: VLMTestType,
+                               fork_per_test: bool) -> Dict[str, VLMTestInfo]:
+    """Given the dict of potential test settings to run, return a subdict
+    of the tests that have the current test type enabled and for which
+    (distributed_executor_backend is not None) equals fork_per_test.
+    """
+
+    def matches_test_type(test_info: VLMTestInfo, test_type: VLMTestType):
+        return test_info.test_type == test_type or (
+            isinstance(test_info.test_type, Iterable)
+            and test_type in test_info.test_type)
+
+    matching_tests = {}
+    for test_name, test_info in test_settings.items():
+        # Check if the test has the right type & validate it if it does
+        if matches_test_type(test_info, test_type):
+            # Embedding tests need to have a conversion func in their test info
+            if matches_test_type(test_info, VLMTestType.EMBEDDING):
+                assert test_info.convert_assets_to_embeddings is not None
+            # Custom test inputs need to explicitly define the mm limit/inputs
+            if matches_test_type(test_info, VLMTestType.CUSTOM_INPUTS):
+                assert (test_info.custom_test_opts is not None
+                        and isinstance(test_info.custom_test_opts, Iterable))
+            # For all types besides custom inputs, we need a prompt formatter
+            else:
+                assert test_info.prompt_formatter is not None
+
+            # Everything looks okay; keep if this has the correct proc handling
+            if (test_info.distributed_executor_backend
+                    is not None) == fork_per_test:
+                matching_tests[test_name] = test_info
+
+    return matching_tests
+
+
+def get_parametrized_options(test_settings: Dict[str, VLMTestInfo],
+                             test_type: VLMTestType,
+                             fork_new_process_for_each_test: bool):
+    """Converts all of our VLMTestInfo into an expanded list of parameters.
+    This is similar to nesting pytest parametrize calls, but done directly
+    through an itertools product so that each test can set things like
+    size factors etc, while still running in isolated test cases.
+    """
+    matching_tests = get_filtered_test_settings(
+        test_settings, test_type, fork_new_process_for_each_test)
+
+    # Ensure that something is wrapped as an iterable if it's not already
+    ensure_wrapped = lambda e: e if isinstance(e, (list, tuple)) else (e, )
+
+    def get_model_type_cases(model_type: str, test_info: VLMTestInfo):
+        # This is essentially the same as nesting a bunch of mark.parametrize
+        # decorators, but we do it programmatically to allow overrides on
+        # a per-model basis, while still being able to execute each of these
+        # as individual test cases in pytest.
+        iter_kwargs = OrderedDict([
+            ("model", ensure_wrapped(test_info.models)),
+            ("max_tokens", ensure_wrapped(test_info.max_tokens)),
+            ("num_logprobs", ensure_wrapped(test_info.num_logprobs)),
+            ("dtype", ensure_wrapped(test_info.dtype)),
+            ("distributed_executor_backend",
+             ensure_wrapped(test_info.distributed_executor_backend)),
+        ])
+
+        # num_frames is video only
+        if test_type == VLMTestType.VIDEO:
+            iter_kwargs["num_video_frames"] = ensure_wrapped(
+                test_info.num_video_frames)
+
+        # No sizes for custom inputs (inputs are given directly); otherwise an
+        # empty size tuple would silently expand to zero cases, so reject it
+        if test_type != VLMTestType.CUSTOM_INPUTS:
+            wrapped_sizes = get_wrapped_test_sizes(test_info, test_type)
+            if not wrapped_sizes:
+                raise ValueError(
+                    f"Sizes must be set for test type {test_type}")
+            iter_kwargs["size_wrapper"] = wrapped_sizes
+
+        # Otherwise expand the custom test options instead
+        else:
+            if test_info.custom_test_opts is None:
+                raise ValueError("Test has type CUSTOM_INPUTS, but none given")
+            iter_kwargs["custom_test_opts"] = test_info.custom_test_opts
+
+        # yapf: disable
+        # Wrap all model cases in a pytest parameter & pass marks through
+        return [
+            pytest.param(
+                model_type,
+                ExpandableVLMTestArgs(
+                    **dict(zip(iter_kwargs.keys(), case))
+                ),
+                marks=test_info.marks if test_info.marks is not None else []
+            ) for case in itertools.product(*iter_kwargs.values())
+        ]
+        # yapf: enable
+
+    # Get a list per model type, where each entry contains a list of all of
+    # that model type's cases, then flatten them into the top level so that
+    # we can consume them in one mark.parametrize call.
+    cases_by_model_type = [
+        get_model_type_cases(model_type, test_info)
+        for model_type, test_info in matching_tests.items()
+    ]
+    return list(itertools.chain.from_iterable(cases_by_model_type))
+
+
+def get_wrapped_test_sizes(
+ test_info: VLMTestInfo,
+ test_type: VLMTestType) -> Tuple[ImageSizeWrapper, ...]:
+ """Given a test info which may have size factors or fixed sizes, wrap them
+ and combine them into an iterable, each of which will be used in parameter
+ expansion.
+
+ Args:
+ test_info: Test configuration to be expanded.
+ test_type: The type of test being filtered for.
+ """
+ # If it is an embedding test, we always use the EMBEDDING_SIZE_FACTORS
+ if test_type == VLMTestType.EMBEDDING:
+ return tuple([
+ ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=factor)
+ for factor in EMBEDDING_SIZE_FACTORS
+ ])
+ # Custom inputs have preprocessed inputs
+ elif test_type == VLMTestType.CUSTOM_INPUTS:
+ return tuple()
+
+ size_factors = test_info.image_size_factors \
+ if test_info.image_size_factors else []
+ fixed_sizes = test_info.image_sizes \
+ if test_info.image_sizes else []
+
+ wrapped_factors = [
+ ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=factor)
+ for factor in size_factors
+ ]
+
+ wrapped_sizes = [
+ ImageSizeWrapper(type=SizeType.FIXED_SIZE, data=size)
+ for size in fixed_sizes
+ ]
+
+ return tuple(wrapped_factors + wrapped_sizes)
diff --git a/tests/models/decoder_only/vision_language/vlm_utils/core.py b/tests/models/decoder_only/vision_language/vlm_utils/core.py
new file mode 100644
index 0000000000000..7e8c6dabb15af
--- /dev/null
+++ b/tests/models/decoder_only/vision_language/vlm_utils/core.py
@@ -0,0 +1,141 @@
+"""Core test implementation to be shared across modalities."""
+from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union
+
+import torch
+from PIL.Image import Image
+from transformers import AutoTokenizer, BatchEncoding
+from transformers.models.auto.auto_factory import _BaseAutoModelClass
+
+from .....conftest import HfRunner, VllmRunner
+from .types import RunnerOutput
+
+
+def run_test(
+ *,
+ hf_runner: Type[HfRunner],
+ vllm_runner: Type[VllmRunner],
+ inputs: List[Tuple[List[str], List[Union[List[Image], Image]]]],
+ model: str,
+ dtype: str,
+ max_tokens: int,
+ num_logprobs: int,
+ enforce_eager: bool,
+ max_model_len: int,
+ max_num_seqs: int,
+ hf_output_post_proc: Optional[Callable[[RunnerOutput, str], Any]],
+ vllm_output_post_proc: Optional[Callable[[RunnerOutput, str], Any]],
+ auto_cls: Type[_BaseAutoModelClass],
+ use_tokenizer_eos: bool,
+ postprocess_inputs: Callable[[BatchEncoding], BatchEncoding],
+ comparator: Callable[..., None],
+ get_stop_token_ids: Optional[Callable[[AutoTokenizer], List[int]]],
+ limit_mm_per_prompt: Dict[str, int],
+ model_kwargs: Optional[Dict[str, Any]],
+ patch_hf_runner: Optional[Callable[[HfRunner], HfRunner]],
+ task: str = "auto",
+ runner_mm_key: str = "images",
+ distributed_executor_backend: Optional[str] = None,
+ tensor_parallel_size: int = 1,
+ vllm_embeddings: Optional[torch.Tensor] = None,
+):
+    """Modality agnostic test executor for comparing HF/vLLM outputs."""
+ # In the case of embeddings, vLLM takes separate input tensors
+ vllm_inputs = vllm_embeddings if vllm_embeddings is not None else inputs
+ tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True)
+
+ vllm_outputs_per_mm = []
+ hf_outputs_per_mm = []
+
+ # NOTE: take care of the order. run vLLM first, and then run HF.
+ # vLLM needs a fresh new process without cuda initialization.
+ # if we run HF first, the cuda initialization will be done and it
+ # will hurt multiprocessing backend with fork method (the default method).
+ vllm_kwargs = {}
+ if get_stop_token_ids is not None:
+ vllm_kwargs["stop_token_ids"] = get_stop_token_ids(tokenizer)
+
+ with vllm_runner(model,
+ max_model_len=max_model_len,
+ max_num_seqs=max_num_seqs,
+ dtype=dtype,
+ limit_mm_per_prompt=limit_mm_per_prompt,
+ tensor_parallel_size=tensor_parallel_size,
+ distributed_executor_backend=distributed_executor_backend,
+ enforce_eager=enforce_eager,
+ task=task) as vllm_model:
+ for prompts, media in vllm_inputs:
+ vllm_kwargs[runner_mm_key] = media
+ vllm_output = vllm_model.generate_greedy_logprobs(
+ prompts, max_tokens, num_logprobs=num_logprobs, **vllm_kwargs)
+ vllm_outputs_per_mm.append(vllm_output)
+
+ hf_model = hf_runner(model,
+ dtype=dtype,
+ auto_cls=auto_cls,
+ postprocess_inputs=postprocess_inputs,
+ model_kwargs=model_kwargs)
+
+ # Some models need to patch things like the model processor, e.g., internvl
+ if patch_hf_runner is not None:
+ hf_model = patch_hf_runner(hf_model)
+
+ # Some models need to explicitly pass the eos_token_id off the tokenizer or
+ # processor for a good comparison; currently assume processor/tokenizer
+ # agree on the EOS, and pull it off the tokenizer if requested.
+ hf_kwargs = {}
+ if use_tokenizer_eos:
+ hf_kwargs["eos_token_id"] = tokenizer.eos_token_id
+
+ with hf_model, torch.no_grad():
+ for prompts, media in inputs:
+ hf_kwargs[runner_mm_key] = media
+ hf_output = hf_model.generate_greedy_logprobs_limit(
+ prompts,
+ max_tokens,
+ num_logprobs=num_logprobs,
+ tokenizer=tokenizer,
+ **hf_kwargs)
+ hf_outputs_per_mm.append(hf_output)
+
+ # Apply output processing / sanitation to the vLLM and HF runner results
+ hf_outputs_per_mm, vllm_outputs_per_mm = process_runner_outputs(
+ model,
+ first_runner_outputs=hf_outputs_per_mm,
+ second_runner_outputs=vllm_outputs_per_mm,
+ first_runner_processor=hf_output_post_proc,
+ second_runner_processor=vllm_output_post_proc,
+ )
+
+ for hf_outputs, vllm_outputs in zip(hf_outputs_per_mm,
+ vllm_outputs_per_mm):
+ # This is usually check_logprobs_close, but it's passed through to
+ # allow things like check_outputs_equal where needed
+ comparator(
+ outputs_0_lst=hf_outputs,
+ outputs_1_lst=vllm_outputs,
+ name_0="hf",
+ name_1="vllm",
+ )
+
+
+def process_runner_outputs(
+ model,
+ first_runner_outputs,
+ second_runner_outputs,
+ first_runner_processor=None,
+ second_runner_processor=None,
+):
+ """Applies the runner processor(s) to the runner outputs, if any."""
+ if first_runner_processor is not None:
+ first_runner_outputs = process_outputs(first_runner_processor, model,
+ first_runner_outputs)
+ if second_runner_processor is not None:
+ second_runner_outputs = process_outputs(second_runner_processor, model,
+ second_runner_outputs)
+ return first_runner_outputs, second_runner_outputs
+
+
+def process_outputs(output_processor, model, outputs_per_image):
+ """Applies a model specific post-processor function to a runner's output"""
+ return [[output_processor(res, model) for res in outputs]
+ for outputs in outputs_per_image]
diff --git a/tests/models/decoder_only/vision_language/vlm_utils/custom_inputs.py b/tests/models/decoder_only/vision_language/vlm_utils/custom_inputs.py
new file mode 100644
index 0000000000000..e698d8d3f6f56
--- /dev/null
+++ b/tests/models/decoder_only/vision_language/vlm_utils/custom_inputs.py
@@ -0,0 +1,102 @@
+"""Custom input builders for edge-cases in different models."""
+from typing import Callable
+
+from vllm.multimodal.utils import (rescale_image_size, rescale_video_size,
+ resize_video, sample_frames_from_video)
+
+from .....conftest import IMAGE_ASSETS, VIDEO_ASSETS
+from .builders import build_multi_image_inputs, build_single_image_inputs
+from .types import ImageSizeWrapper, SizeType
+
+
+def multi_image_multi_aspect_ratio_inputs(formatter: Callable[[str], str]):
+    """Builds inputs for multi-image (varied sizes/aspect ratio) testing.
+
+    Args:
+        formatter: model-specific prompt formatter.
+    """
+    stop_sign = IMAGE_ASSETS[0].pil_image
+    cherry_blossom = IMAGE_ASSETS[1].pil_image
+
+    # Base prompts: one <image> tag per image in the matching media entry
+    img_prompts = [
+        "<image><image>\nDescribe 2 images.",
+        "<image><image>\nDescribe 2 images.",
+        "<image><image><image><image>\nDescribe 4 images.",
+        "<image>\nWhat is the season?",
+    ]
+    formatted_prompts = [formatter(prompt) for prompt in img_prompts]
+
+    return [(
+        formatted_prompts,
+        [
+            [stop_sign, cherry_blossom],
+            # Images with different sizes and aspect-ratios
+            [
+                rescale_image_size(stop_sign, 0.1),
+                stop_sign,
+            ],
+            [
+                stop_sign,
+                rescale_image_size(stop_sign, 0.25),
+                cherry_blossom.resize((183, 488)),
+                cherry_blossom.resize((488, 183))
+            ],
+            cherry_blossom,
+        ])]
+
+
+def multi_video_multi_aspect_ratio_inputs(formatter: Callable[[str], str],
+                                          num_frames: int = 16):
+    """Builds inputs for multi-video (varied sizes/aspect ratio) testing.
+    Args:
+        formatter: model-specific prompt formatter.
+        num_frames: frame count handed to sample_frames_from_video.
+    """
+    video = sample_frames_from_video(VIDEO_ASSETS[0].np_ndarrays, num_frames)
+    # Base prompts: one <video> tag per video in the matching media entry
+    video_prompts = [
+        "<video><video>\nDescribe 2 videos.",
+        "<video><video>\nDescribe 2 videos.",
+        "<video><video><video><video>\nDescribe 4 videos.",
+        "<video>\nWhy is this video funny?",
+    ]
+    formatted_prompts = [formatter(prompt) for prompt in video_prompts]
+
+    return [(
+        formatted_prompts,
+        [
+            [video, video],
+            # Videos with different sizes and aspect-ratios
+            [
+                rescale_video_size(video, 0.1),
+                video,
+            ],
+            [
+                video,
+                rescale_video_size(video, 0.25),
+                resize_video(video, (183, 488)),
+                resize_video(video, (488, 183))
+            ],
+            video,
+        ])]
+
+
+def different_patch_input_cases_internvl():
+    images = [asset.pil_image.resize((896, 896)) for asset in IMAGE_ASSETS]
+    formatter = lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n"  # noqa: E501
+    single_img_prompts = [
+        "<image>\nWhat's the content in the center of the image?",
+        "<image>\nWhat is the season?",
+    ]
+    multi_img_prompts = [
+        "Image-1: <image>\nImage-2: <image>\nDescribe the two images in detail.\n",  # noqa: E501
+    ]
+    formatted_sprompts = [formatter(prompt) for prompt in single_img_prompts]
+    formatted_mprompts = [formatter(prompt) for prompt in multi_img_prompts]
+
+    wrapped_sf = ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=[0.5, 1.0])
+    return [
+        build_single_image_inputs(images, formatted_sprompts, wrapped_sf),
+        build_multi_image_inputs([images], formatted_mprompts, wrapped_sf),
+    ]
diff --git a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
new file mode 100644
index 0000000000000..6856e8df81a13
--- /dev/null
+++ b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
@@ -0,0 +1,338 @@
+"""Common utility functions relating to different models that are useful
+for manipulating the input / output of HF & vLLM test runners, which are
+typically specific to a small subset of models.
+"""
+import re
+import types
+from pathlib import PosixPath
+from typing import Callable, List, Optional, Tuple, Union
+
+import torch
+from PIL.Image import Image
+from transformers import AutoConfig, AutoTokenizer, BatchEncoding
+
+from vllm.sequence import SampleLogprobs
+from vllm.transformers_utils.tokenizer import patch_padding_side
+from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
+
+from .....conftest import HfRunner, ImageAsset, _ImageAssets
+from .types import RunnerOutput
+
+
+####### vLLM output processors functions
+def blip2_vllm_to_hf_output(vllm_output: RunnerOutput,
+ model: str) -> RunnerOutput:
+ """Sanitize vllm output [blip2 models] to be comparable with hf output."""
+ _, output_str, out_logprobs = vllm_output
+
+ hf_output_str = output_str + "\n"
+
+ tokenizer = AutoTokenizer.from_pretrained(model)
+ hf_output_ids = tokenizer.encode(hf_output_str)
+ assert hf_output_ids[0] == tokenizer.bos_token_id
+ hf_output_ids = hf_output_ids[1:]
+
+ return hf_output_ids, hf_output_str, out_logprobs
+
+
+def fuyu_vllm_to_hf_output(vllm_output: RunnerOutput,
+ model: str) -> RunnerOutput:
+ """Sanitize vllm output [fuyu models] to be comparable with hf output."""
+ output_ids, output_str, out_logprobs = vllm_output
+
+ hf_output_str = output_str.lstrip() + "|ENDOFTEXT|"
+
+ return output_ids, hf_output_str, out_logprobs
+
+
+def qwen_vllm_to_hf_output(
+ vllm_output: RunnerOutput,
+ model: str) -> Tuple[List[int], str, Optional[SampleLogprobs]]:
+ """Sanitize vllm output [qwen models] to be comparable with hf output."""
+ output_ids, output_str, out_logprobs = vllm_output
+
+ hf_output_str = output_str + "<|endoftext|>"
+
+ return output_ids, hf_output_str, out_logprobs
+
+
+def llava_image_vllm_to_hf_output(vllm_output: RunnerOutput,
+ model: str) -> RunnerOutput:
+ config = AutoConfig.from_pretrained(model)
+ mm_token_id = config.image_token_index
+ return _llava_vllm_to_hf_output(vllm_output, model, mm_token_id)
+
+
+def llava_video_vllm_to_hf_output(
+ vllm_output: RunnerOutput,
+ model: str) -> Tuple[List[int], str, Optional[SampleLogprobs]]:
+ config = AutoConfig.from_pretrained(model)
+ mm_token_id = config.video_token_index
+ return _llava_vllm_to_hf_output(vllm_output, model, mm_token_id)
+
+
+def _llava_vllm_to_hf_output(vllm_output: RunnerOutput, model: str,
+                             mm_token_id: int) -> RunnerOutput:
+    """Sanitize vllm output [Llava models] to be comparable with hf output."""
+    output_ids, output_str, out_logprobs = vllm_output
+
+    tokenizer = AutoTokenizer.from_pretrained(model)
+    eos_token_id = tokenizer.eos_token_id
+    # Collapse runs of the mm token; the first id has no predecessor to check
+    hf_output_ids = [
+        token_id for prev_id, token_id in zip([None, *output_ids], output_ids)
+        if token_id != mm_token_id or prev_id != mm_token_id
+    ]
+
+    assert output_str[0] == " "
+    hf_output_str = output_str[1:]
+    if hf_output_ids[-1] == eos_token_id:
+        hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)
+
+    return hf_output_ids, hf_output_str, out_logprobs
+
+
+def llava_onevision_vllm_to_hf_output(vllm_output: RunnerOutput,
+                                      model: str) -> RunnerOutput:
+    """Sanitize vllm output [llava-onevision] to compare with hf output."""
+    output_ids, output_str, out_logprobs = vllm_output
+
+    config = AutoConfig.from_pretrained(model)
+    video_token_id = config.video_token_index
+
+    tokenizer = AutoTokenizer.from_pretrained(model)
+    eos_token_id = tokenizer.eos_token_id
+    # Collapse runs of the video token; the first id has no predecessor
+    hf_output_ids = [
+        token_id for prev_id, token_id in zip([None, *output_ids], output_ids)
+        if token_id != video_token_id or prev_id != video_token_id
+    ]
+
+    hf_output_str = output_str
+    if hf_output_ids[-1] == eos_token_id:
+        hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)
+
+    return hf_output_ids, hf_output_str, out_logprobs
+
+
+def phi3v_vllm_to_hf_output(vllm_output: RunnerOutput,
+ model: str) -> RunnerOutput:
+ """Sanitize vllm output [phi3v] to be comparable with hf output."""
+ _, output_str, out_logprobs = vllm_output
+
+ output_str_without_image = re.sub(r"(<\|image_\d+\|>)+", "", output_str)
+ assert output_str_without_image[0] == " "
+ output_str_without_image = output_str_without_image[1:]
+
+ hf_output_str = output_str_without_image + "<|end|><|endoftext|>"
+
+ tokenizer = AutoTokenizer.from_pretrained(model)
+ hf_output_ids = tokenizer.encode(output_str_without_image)
+ assert hf_output_ids[0] == 1
+ hf_output_ids = hf_output_ids[1:]
+
+ return hf_output_ids, hf_output_str, out_logprobs
+
+
+def paligemma_vllm_to_hf_output(vllm_output: RunnerOutput,
+                                model: str) -> RunnerOutput:
+    """Sanitize vllm output to be comparable with hf output."""
+    output_ids, output_str, out_logprobs = vllm_output
+
+    config = AutoConfig.from_pretrained(model)
+    image_token_id = config.image_token_index
+
+    tokenizer = AutoTokenizer.from_pretrained(model)
+    eos_token_id = tokenizer.eos_token_id
+    # Collapse runs of the image token; the first id has no predecessor
+    hf_output_ids = [
+        token_id for prev_id, token_id in zip([None, *output_ids], output_ids)
+        if token_id != image_token_id or prev_id != image_token_id
+    ]
+
+    hf_output_str = output_str
+
+    if hf_output_ids[-1] == eos_token_id:
+        hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)
+
+    return hf_output_ids, hf_output_str, out_logprobs
+
+
+####### Post-processors for HF outputs
+def minicmpv_trunc_hf_output(hf_output: RunnerOutput,
+ model: str) -> RunnerOutput:
+ output_ids, output_str, out_logprobs = hf_output
+ if output_str.endswith("<|eot_id|>"):
+ output_str = output_str.split("<|eot_id|>")[0]
+ return output_ids, output_str, out_logprobs
+
+
+####### Functions for converting image assets to embeddings
+def get_llava_embeddings(image_assets: _ImageAssets):
+ return [asset.image_embeds for asset in image_assets]
+
+
+####### postprocessors to run on HF BatchEncoding
+def get_key_type_post_processor(
+ hf_inp_key: str) -> Callable[[BatchEncoding, str], BatchEncoding]:
+ """Gets a handle to a post processor which converts a given key into a
+ target data type."""
+
+ def process(hf_inputs: BatchEncoding, dtype: str):
+ torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]
+ hf_inputs[hf_inp_key] = hf_inputs[hf_inp_key].to(torch_dtype)
+ return hf_inputs
+
+ return process
+
+
+def wrap_inputs_post_processor(hf_inputs: BatchEncoding, dtype: str):
+ return {"model_inputs": hf_inputs}
+
+
+####### Prompt path encoders for models that need images on disk
+def qwen_prompt_path_encoder(
+        tmp_path: PosixPath, prompt: str, assets: Union[List[ImageAsset],
+                                                        _ImageAssets]) -> str:
+    """Given a temporary dir path, export one or more image assets into the
+    tempdir & fill each empty <img></img> placeholder in the prompt with the
+    local path of an exported image, so that the HF version of Qwen-VL can
+    resolve the path and load the image in its forward() call.
+
+    Args:
+        tmp_path: Tempdir for test under consideration.
+        prompt: Prompt with image placeholders.
+        assets: List of image assets whose len equals the num placeholders.
+    """
+    # Ensure that the number of placeholders matches the number of assets;
+    # If this is not true, the test is probably written incorrectly.
+    assert prompt.count("<img></img>") == len(assets)
+
+    # Replace the placeholders with local paths to the exported assets
+    for asset in assets:
+        image_tmp_path = tmp_path / f"{asset.name}.jpg"
+        asset.pil_image.save(image_tmp_path)
+        prompt = prompt.replace(
+            "<img></img>",
+            f"<img>{image_tmp_path}</img>",
+            1,
+        )
+    return prompt
+
+
+####### Model-specific HuggingFace runner patchers
+def glm_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
+ """Patches and returns an instance of the HfRunner to use for GLM4."""
+ hf_processor = hf_model.processor
+ patch_padding_side(hf_processor)
+
+ def processor(*args, text="", images=None, **kwargs):
+ if images is None:
+ return hf_processor(*args, **kwargs)
+
+ return hf_processor.apply_chat_template(
+ [{
+ "role": "user",
+ "image": images,
+ "content": text
+ }],
+ add_generation_prompt=True,
+ tokenize=True,
+ return_dict=True,
+ **kwargs,
+ )
+
+ hf_model.processor = processor
+ hf_model.model.get_output_embeddings = lambda: \
+ hf_model.model.transformer.output_layer
+ return hf_model
+
+
+def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
+    """Patches and returns an instance of the HfRunner to use for InternVL."""
+
+    class InternVLProcessor:
+        """A simple stand-in processor for InternVL2, which lacks one."""
+
+        def __init__(self, hf_runner: HfRunner):
+            self.num_image_token = hf_runner.model.num_image_token
+            self.tokenizer = hf_runner.tokenizer
+            self.dtype = hf_runner.model.dtype
+
+            self.config = AutoConfig.from_pretrained(hf_runner.model_name,
+                                                     trust_remote_code=True)
+            self.vision_config = self.config.vision_config
+            self.use_thumbnail = self.config.use_thumbnail
+            self.min_num = self.config.min_dynamic_patch
+            self.max_num = self.config.max_dynamic_patch
+            self.image_size = self.vision_config.image_size
+
+        def __call__(self, text: str, images: Union[Image, List[Image]],
+                     **kwargs):
+            from vllm.model_executor.models.internvl import (
+                IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values)
+            images = [images] if isinstance(images, Image) else images
+            pixel_values = [
+                image_to_pixel_values(image, self.image_size, self.min_num,
+                                      self.max_num,
+                                      self.use_thumbnail).to(self.dtype)
+                for image in images
+            ]
+            num_patches_list = [
+                pixel_value.shape[0] for pixel_value in pixel_values
+            ]
+            pixel_values = torch.cat(pixel_values, dim=0)
+            for num_patches in num_patches_list:
+                context_tokens = IMG_CONTEXT * self.num_image_token \
+                    * num_patches
+                image_tokens = IMG_START + context_tokens + IMG_END
+                text = text.replace('<image>', image_tokens, 1)
+            prompt = self.tokenizer(text, return_tensors="pt")
+            prompt.update({"pixel_values": pixel_values})
+            return prompt
+
+    img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids(
+        "<IMG_CONTEXT>")
+    hf_model.model.img_context_token_id = img_context_token_id
+    hf_model.processor = InternVLProcessor(hf_model)
+    hf_model.model.get_output_embeddings = lambda: \
+        hf_model.model.language_model.get_output_embeddings()
+    hf_model.model.generate = types.MethodType(_internvl_generate,
+                                               hf_model.model)
+    return hf_model
+
+
+def _internvl_generate(
+ self,
+ pixel_values: torch.FloatTensor,
+ input_ids: torch.FloatTensor,
+ attention_mask: Optional[torch.LongTensor] = None,
+ **generate_kwargs,
+) -> torch.LongTensor:
+ """Generate method for InternVL2 model without fixed use_cache."""
+ assert self.img_context_token_id is not None
+ vit_embeds = self.extract_feature(pixel_values)
+ input_embeds = self.language_model.get_input_embeddings()(input_ids)
+ B, N, C = input_embeds.shape
+ input_embeds = input_embeds.reshape(B * N, C)
+
+ input_ids = input_ids.reshape(B * N)
+ selected = (input_ids == self.img_context_token_id)
+ assert selected.sum() != 0
+ input_embeds[selected] = vit_embeds.reshape(-1, C).to(input_embeds.device)
+
+ input_embeds = input_embeds.reshape(B, N, C)
+
+ forward_kwargs = dict(
+ inputs_embeds=input_embeds,
+ attention_mask=attention_mask,
+ )
+ if getattr(self, "use_visual_token_mask", False):
+ visual_token_mask = selected.reshape(B, N, 1).to(input_embeds.dtype)
+ forward_kwargs["visual_token_mask"] = visual_token_mask
+ outputs = self.language_model.generate(
+ **forward_kwargs,
+ **generate_kwargs,
+ )
+
+ return outputs
diff --git a/tests/models/decoder_only/vision_language/vlm_utils/runners.py b/tests/models/decoder_only/vision_language/vlm_utils/runners.py
new file mode 100644
index 0000000000000..5a3f9e820dad0
--- /dev/null
+++ b/tests/models/decoder_only/vision_language/vlm_utils/runners.py
@@ -0,0 +1,130 @@
+"""Entrypoints for wrapping the core run_test implementation for specific test
+types / modalities.
+"""
+from pathlib import PosixPath
+from typing import Type
+
+from .....conftest import HfRunner, VllmRunner, _ImageAssets, _VideoAssets
+from . import builders, core
+from .types import ExpandableVLMTestArgs, VLMTestInfo
+
+
+####### Entrypoints for running different test types
+def run_single_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
+ test_case: ExpandableVLMTestArgs,
+ hf_runner: Type[HfRunner],
+ vllm_runner: Type[VllmRunner],
+ image_assets: _ImageAssets):
+ assert test_case.size_wrapper is not None
+ inputs = builders.build_single_image_inputs_from_test_info(
+ model_test_info, image_assets, test_case.size_wrapper, tmp_path)
+
+ core.run_test(
+ hf_runner=hf_runner,
+ vllm_runner=vllm_runner,
+ inputs=inputs,
+ model=test_case.model,
+ dtype=test_case.dtype,
+ max_tokens=test_case.max_tokens,
+ num_logprobs=test_case.num_logprobs,
+ limit_mm_per_prompt={"image": 1},
+ distributed_executor_backend=test_case.distributed_executor_backend,
+ **model_test_info.get_non_parametrized_runner_kwargs())
+
+
+def run_multi_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
+ test_case: ExpandableVLMTestArgs,
+ hf_runner: Type[HfRunner],
+ vllm_runner: Type[VllmRunner],
+ image_assets: _ImageAssets):
+ assert test_case.size_wrapper is not None
+ inputs = builders.build_multi_image_inputs_from_test_info(
+ model_test_info, image_assets, test_case.size_wrapper, tmp_path)
+
+ core.run_test(
+ hf_runner=hf_runner,
+ vllm_runner=vllm_runner,
+ inputs=inputs,
+ model=test_case.model,
+ dtype=test_case.dtype,
+ max_tokens=test_case.max_tokens,
+ num_logprobs=test_case.num_logprobs,
+ limit_mm_per_prompt={"image": len(image_assets)},
+ distributed_executor_backend=test_case.distributed_executor_backend,
+ **model_test_info.get_non_parametrized_runner_kwargs())
+
+
+def run_embedding_test(*, model_test_info: VLMTestInfo,
+ test_case: ExpandableVLMTestArgs,
+ hf_runner: Type[HfRunner],
+ vllm_runner: Type[VllmRunner],
+ image_assets: _ImageAssets):
+ assert test_case.size_wrapper is not None
+ inputs, vllm_embeddings = builders.build_embedding_inputs_from_test_info(
+ model_test_info, image_assets, test_case.size_wrapper)
+
+ core.run_test(
+ hf_runner=hf_runner,
+ vllm_runner=vllm_runner,
+ inputs=inputs,
+ model=test_case.model,
+ dtype=test_case.dtype,
+ max_tokens=test_case.max_tokens,
+ num_logprobs=test_case.num_logprobs,
+ limit_mm_per_prompt={"image": 1},
+ vllm_embeddings=vllm_embeddings,
+ distributed_executor_backend=test_case.distributed_executor_backend,
+ **model_test_info.get_non_parametrized_runner_kwargs())
+
+
+def run_video_test(
+ *,
+ model_test_info: VLMTestInfo,
+ test_case: ExpandableVLMTestArgs,
+ hf_runner: Type[HfRunner],
+ vllm_runner: Type[VllmRunner],
+ video_assets: _VideoAssets,
+):
+ assert test_case.size_wrapper is not None
+ assert test_case.num_video_frames is not None
+ inputs = builders.build_video_inputs_from_test_info(
+ model_test_info, video_assets, test_case.size_wrapper,
+ test_case.num_video_frames)
+
+ core.run_test(
+ hf_runner=hf_runner,
+ vllm_runner=vllm_runner,
+ inputs=inputs,
+ model=test_case.model,
+ dtype=test_case.dtype,
+ max_tokens=test_case.max_tokens,
+ num_logprobs=test_case.num_logprobs,
+ limit_mm_per_prompt={"video": len(video_assets)},
+ distributed_executor_backend=test_case.distributed_executor_backend,
+ **model_test_info.get_non_parametrized_runner_kwargs())
+
+
+def run_custom_inputs_test(*, model_test_info: VLMTestInfo,
+ test_case: ExpandableVLMTestArgs,
+ hf_runner: Type[HfRunner],
+ vllm_runner: Type[VllmRunner]):
+ # Custom test cases can provide inputs directly, but they need to
+ # explicitly provide a CustomTestOptions, which wraps the inputs and
+ # the limit_mm_per_prompt
+ assert test_case.custom_test_opts is not None
+
+ inputs = test_case.custom_test_opts.inputs
+ limit_mm_per_prompt = test_case.custom_test_opts.limit_mm_per_prompt
+ assert inputs is not None and limit_mm_per_prompt is not None
+
+ core.run_test(
+ hf_runner=hf_runner,
+ vllm_runner=vllm_runner,
+ inputs=inputs,
+ model=test_case.model,
+ dtype=test_case.dtype,
+ max_tokens=test_case.max_tokens,
+ num_logprobs=test_case.num_logprobs,
+ limit_mm_per_prompt=limit_mm_per_prompt,
+ distributed_executor_backend=test_case.distributed_executor_backend,
+ **model_test_info.get_non_parametrized_runner_kwargs())
diff --git a/tests/models/decoder_only/vision_language/vlm_utils/types.py b/tests/models/decoder_only/vision_language/vlm_utils/types.py
new file mode 100644
index 0000000000000..4d18d53af30fa
--- /dev/null
+++ b/tests/models/decoder_only/vision_language/vlm_utils/types.py
@@ -0,0 +1,187 @@
+"""Types for writing multimodal model tests."""
+from enum import Enum
+from pathlib import PosixPath
+from typing import (Any, Callable, Dict, Iterable, List, NamedTuple, Optional,
+ Tuple, Type, Union)
+
+import torch
+from PIL.Image import Image
+from pytest import MarkDecorator
+from transformers import AutoModelForCausalLM, AutoTokenizer, BatchEncoding
+from transformers.models.auto.auto_factory import _BaseAutoModelClass
+
+from vllm.sequence import SampleLogprobs
+from vllm.utils import identity
+
+from .....conftest import IMAGE_ASSETS, HfRunner, ImageAsset, _ImageAssets
+from ....utils import check_logprobs_close
+
+# meta image tag; will be replaced by the appropriate tag for the model
+TEST_IMG_PLACEHOLDER = "<vlm_image>"
+TEST_VIDEO_PLACEHOLDER = "<vlm_video>"
+
+# yapf: disable
+SINGLE_IMAGE_BASE_PROMPTS = IMAGE_ASSETS.prompts({
+ "stop_sign": f"{TEST_IMG_PLACEHOLDER}What's the content of the image?",
+ "cherry_blossom": f"{TEST_IMG_PLACEHOLDER}What is the season?",
+})
+
+MULTI_IMAGE_BASE_PROMPT = f"Image-1: {TEST_IMG_PLACEHOLDER}Image-2: {TEST_IMG_PLACEHOLDER}Describe the two images in detail.\n" # noqa: E501
+VIDEO_BASE_PROMPT = f"{TEST_VIDEO_PLACEHOLDER}Why is this video funny?"
+
+
+IMAGE_SIZE_FACTORS = [(), (1.0, ), (1.0, 1.0, 1.0), (0.25, 0.5, 1.0)]
+EMBEDDING_SIZE_FACTORS = [(), (1.0, ), (1.0, 1.0, 1.0)]
+RunnerOutput = Tuple[List[int], str, Optional[SampleLogprobs]]
+# yapf: enable
+
+
+class VLMTestType(Enum):
+ IMAGE = 1
+ MULTI_IMAGE = 2
+ EMBEDDING = 3
+ VIDEO = 4
+ CUSTOM_INPUTS = 5
+
+
+class SizeType(Enum):
+ SIZE_FACTOR = 1
+ FIXED_SIZE = 2
+
+
+class CustomTestOptions(NamedTuple):
+ inputs: List[Tuple[List[str], List[Union[List[Image], Image]]]]
+ limit_mm_per_prompt: Dict[str, int]
+
+
+class ImageSizeWrapper(NamedTuple):
+ type: SizeType
+ # A size factor is a wrapper of 0+ floats,
+ # while a fixed size contains an iterable of integer pairs
+ data: Union[Iterable[float], Iterable[Tuple[int, int]]]
+
+
+class VLMTestInfo(NamedTuple):
+ """Holds the configuration for 1+ tests for one model architecture."""
+
+ models: Union[List[str]]
+ test_type: Union[VLMTestType, Iterable[VLMTestType]]
+
+ # Should be None only if this is a CUSTOM_INPUTS test
+ prompt_formatter: Optional[Callable[[str], str]] = None
+ img_idx_to_prompt: Callable[[int], str] = lambda idx: "<image>\n"
+ video_idx_to_prompt: Callable[[int], str] = lambda idx: "<video>\n"
+
+ # Most models work on the single / multi-image prompts above, but in some
+ # cases the log prob check fails, e.g., for paligemma. We allow passing
+ # an override for the single image prompts / multi-image prompt for this
+ # reason.
+ single_image_prompts: Iterable[str] = SINGLE_IMAGE_BASE_PROMPTS
+ multi_image_prompt: str = MULTI_IMAGE_BASE_PROMPT
+
+ # Function for converting ImageAssets to image embeddings;
+ # We need to define this explicitly for embedding tests
+ convert_assets_to_embeddings: Optional[Callable[[_ImageAssets],
+ torch.Tensor]] = None
+
+ # Exposed options for vLLM runner; we change these in several tests,
+ # but the defaults are derived from VllmRunner & the engine defaults
+ # These settings are chosen to avoid OOMs when running in the CI
+ enforce_eager: bool = True
+ max_model_len: int = 1024
+ max_num_seqs: int = 256
+ task: str = "auto"
+ tensor_parallel_size: int = 1
+
+ # Optional callable which gets a list of token IDs from the model tokenizer
+ get_stop_token_ids: Optional[Callable[[AutoTokenizer], List[int]]] = None
+
+ # Exposed options for HF runner
+ model_kwargs: Optional[Dict[str, Any]] = None
+ # Indicates we should explicitly pass the EOS from the tokenizer
+ use_tokenizer_eos: bool = False
+ auto_cls: Type[_BaseAutoModelClass] = AutoModelForCausalLM
+ # Callable to pass to the HF runner to run on inputs; for now, we also pass
+ # the data type to input post processing, because almost all of the uses of
+ # postprocess_inputs are to fix the data types of BatchEncoding values.
+ postprocess_inputs: Callable[[BatchEncoding, str],
+ BatchEncoding] = identity
+ patch_hf_runner: Optional[Callable[[HfRunner], HfRunner]] = None
+
+ # Post processors that, if defined, will run on the outputs of the
+ # vLLM and HF runner, respectively (useful for sanitization, etc).
+ vllm_output_post_proc: Optional[Callable[[RunnerOutput, str], Any]] = None
+ hf_output_post_proc: Optional[Callable[[RunnerOutput, str], Any]] = None
+
+ # Consumes the output of the callables above and checks if they're equal
+ comparator: Callable[..., None] = check_logprobs_close
+
+ # Default expandable params per test; these defaults can be overridden in
+ # instances of this object; the complete set of test cases for the model
+ # is all combinations of .models + all fields below
+ max_tokens: Union[int, Tuple[int]] = 128
+ num_logprobs: Union[int, Tuple[int]] = 5
+ dtype: Union[str, Iterable[str]] = "half"
+ distributed_executor_backend: Optional[Union[str, Iterable[str]]] = None
+ # Only expanded in video tests
+ num_video_frames: Union[int, Tuple[int]] = 16
+
+ # Fixed image sizes / image size factors; most tests use image_size_factors
+ # The values provided for these two fields will be stacked and expanded
+ # such that each model will consider each image size factor / image size
+ # once per test (much like concatenating and wrapping in one parametrize
+ # call)
+ image_size_factors: Iterable[Iterable[float]] = IMAGE_SIZE_FACTORS
+ image_sizes: Optional[Iterable[Iterable[Tuple[int, int]]]] = None
+
+ # Hack for updating a prompt to take a local path into account; currently
+ # only used for Qwen-VL, which requires encoding the image path / url into
+ # the prompt for HF runner
+ prompt_path_encoder: Optional[
+ Callable[[PosixPath, str, Union[List[ImageAsset], _ImageAssets]],
+ str]] = None # noqa: E501
+
+ # kwarg to pass multimodal data in as to vllm/hf runner instances
+ runner_mm_key: str = "images"
+
+ # Allows configuring a test to run with custom inputs
+ custom_test_opts: Optional[List[CustomTestOptions]] = None
+
+ marks: Optional[List[MarkDecorator]] = None
+
+ def get_non_parametrized_runner_kwargs(self):
+ """Returns a dictionary of expandable kwargs for items that are used
+ in all test types, which are NOT used when creating the parametrized
+ test cases.
+ """
+ return {
+ "enforce_eager": self.enforce_eager,
+ "max_model_len": self.max_model_len,
+ "max_num_seqs": self.max_num_seqs,
+ "task": self.task,
+ "hf_output_post_proc": self.hf_output_post_proc,
+ "vllm_output_post_proc": self.vllm_output_post_proc,
+ "auto_cls": self.auto_cls,
+ "use_tokenizer_eos": self.use_tokenizer_eos,
+ "postprocess_inputs": self.postprocess_inputs,
+ "comparator": self.comparator,
+ "get_stop_token_ids": self.get_stop_token_ids,
+ "model_kwargs": self.model_kwargs,
+ "patch_hf_runner": self.patch_hf_runner,
+ "runner_mm_key": self.runner_mm_key,
+ }
+
+
+class ExpandableVLMTestArgs(NamedTuple):
+ """The expanded kwargs which correspond to a single test case."""
+ model: str
+ max_tokens: int
+ num_logprobs: int
+ dtype: str
+ distributed_executor_backend: Optional[str]
+ # Sizes are used for everything except for custom input tests
+ size_wrapper: Optional[ImageSizeWrapper] = None
+ # Video only
+ num_video_frames: Optional[int] = None
+ # Custom inputs only
+ custom_test_opts: Optional[CustomTestOptions] = None
diff --git a/tests/models/embedding/vision_language/test_llava_next.py b/tests/models/embedding/vision_language/test_llava_next.py
index 52aef8c34d6f3..a8d0ac4fc160d 100644
--- a/tests/models/embedding/vision_language/test_llava_next.py
+++ b/tests/models/embedding/vision_language/test_llava_next.py
@@ -85,6 +85,8 @@ def _run_test(
)
+# FIXME
+@pytest.mark.skip(reason="LLava next embedding tests currently fail")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
def test_models_text(
diff --git a/tests/models/encoder_decoder/vision_language/test_mllama.py b/tests/models/encoder_decoder/vision_language/test_mllama.py
index 52f74ec885946..7f82347841cdb 100644
--- a/tests/models/encoder_decoder/vision_language/test_mllama.py
+++ b/tests/models/encoder_decoder/vision_language/test_mllama.py
@@ -192,7 +192,7 @@ def _run_test(
for prompts, images in inputs
]
- def process(hf_inputs: BatchEncoding):
+ def process(hf_inputs: BatchEncoding, **kwargs):
return hf_inputs
with hf_runner(model,
diff --git a/tests/utils.py b/tests/utils.py
index 0c61891cfefec..f6f588df48810 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -561,12 +561,11 @@ def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> None:
return wrapper
-def large_gpu_test(*, min_gb: int):
- """
- Decorate a test to be skipped if no GPU is available or it does not have
- sufficient memory.
-
- Currently, the CI machine uses L4 GPU which has 24 GB VRAM.
+def large_gpu_mark(min_gb: int) -> pytest.MarkDecorator:
+ """Gets a pytest skipif mark, which triggers if the device doesn't meet
+ a minimum memory requirement in GB; can be leveraged via
+ @large_gpu_test to skip tests in environments without enough resources, or
+ called when filtering tests to run directly.
"""
try:
if current_platform.is_cpu():
@@ -578,14 +577,23 @@ def large_gpu_test(*, min_gb: int):
f"An error occurred when finding the available memory: {e}",
stacklevel=2,
)
-
memory_gb = 0
- test_skipif = pytest.mark.skipif(
+ return pytest.mark.skipif(
memory_gb < min_gb,
reason=f"Need at least {memory_gb}GB GPU memory to run the test.",
)
+
+def large_gpu_test(*, min_gb: int):
+ """
+ Decorate a test to be skipped if no GPU is available or it does not have
+ sufficient memory.
+
+ Currently, the CI machine uses L4 GPU which has 24 GB VRAM.
+ """
+ test_skipif = large_gpu_mark(min_gb)
+
def wrapper(f: Callable[_P, None]) -> Callable[_P, None]:
return test_skipif(f)
diff --git a/vllm/utils.py b/vllm/utils.py
index 90c4b84757810..03cdbe6a0dc7b 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -977,7 +977,8 @@ def enable_trace_function_call_for_thread() -> None:
# `functools` helpers
-def identity(value: T) -> T:
+def identity(value: T, **kwargs) -> T:
+ """Returns the first provided value."""
return value
From 81f09cfd80a5a2e1572ee79facd60bb823923367 Mon Sep 17 00:00:00 2001
From: Went-Liang
Date: Thu, 31 Oct 2024 00:33:42 +0800
Subject: [PATCH 052/113] [Model] Support math-shepherd-mistral-7b-prm model
(#9697)
Signed-off-by: Went-Liang
---
vllm/config.py | 115 +++++++++++++++------
vllm/engine/arg_utils.py | 64 ++++++++++++
vllm/engine/llm_engine.py | 4 +-
vllm/entrypoints/llm.py | 10 ++
vllm/model_executor/layers/pooler.py | 62 ++++++++++-
vllm/model_executor/model_loader/loader.py | 15 ++-
vllm/model_executor/models/bert.py | 9 +-
vllm/model_executor/models/gemma2.py | 10 +-
vllm/model_executor/models/llama.py | 23 ++++-
vllm/model_executor/models/llava_next.py | 12 ++-
vllm/model_executor/models/phi3v.py | 13 ++-
vllm/model_executor/models/qwen2_cls.py | 11 +-
vllm/model_executor/models/qwen2_rm.py | 10 +-
vllm/model_executor/models/registry.py | 16 +++
14 files changed, 312 insertions(+), 62 deletions(-)
diff --git a/vllm/config.py b/vllm/config.py
index 3814e41aeb92d..e9559c40dbdfb 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -112,38 +112,58 @@ class ModelConfig:
Defaults to 'auto' which defaults to 'hf'.
mm_processor_kwargs: Arguments to be forwarded to the model's processor
for multi-modal data, e.g., image processor.
+ pooling_type: Used to configure the pooling method in the embedding
+ model.
+ pooling_norm: Used to determine whether to normalize the pooled
+ data in the embedding model.
+ pooling_softmax: Used to determine whether to softmax the pooled
+ data in the embedding model.
+ pooling_step_tag_id: When pooling_step_tag_id is set (not None), only
+ the scores at the positions of the pooling_step_tag_id token in
+ the prompt are returned. Otherwise, the scores for all tokens
+ are returned.
+ pooling_returned_token_ids: pooling_returned_token_ids represents a
+ list of indices for the vocabulary dimensions to be extracted,
+ such as the token IDs of good_token and bad_token in the
+ math-shepherd-mistral-7b-prm model.
"""
- def __init__(self,
- model: str,
- task: Union[TaskOption, _Task],
- tokenizer: str,
- tokenizer_mode: str,
- trust_remote_code: bool,
- dtype: Union[str, torch.dtype],
- seed: int,
- revision: Optional[str] = None,
- code_revision: Optional[str] = None,
- rope_scaling: Optional[dict] = None,
- rope_theta: Optional[float] = None,
- tokenizer_revision: Optional[str] = None,
- max_model_len: Optional[int] = None,
- spec_target_max_model_len: Optional[int] = None,
- quantization: Optional[str] = None,
- quantization_param_path: Optional[str] = None,
- enforce_eager: Optional[bool] = None,
- max_context_len_to_capture: Optional[int] = None,
- max_seq_len_to_capture: Optional[int] = None,
- max_logprobs: int = 20,
- disable_sliding_window: bool = False,
- skip_tokenizer_init: bool = False,
- served_model_name: Optional[Union[str, List[str]]] = None,
- limit_mm_per_prompt: Optional[Mapping[str, int]] = None,
- use_async_output_proc: bool = True,
- override_neuron_config: Optional[Dict[str, Any]] = None,
- config_format: ConfigFormat = ConfigFormat.AUTO,
- chat_template_text_format: str = "string",
- mm_processor_kwargs: Optional[Dict[str, Any]] = None) -> None:
+ def __init__(
+ self,
+ model: str,
+ task: Union[TaskOption, _Task],
+ tokenizer: str,
+ tokenizer_mode: str,
+ trust_remote_code: bool,
+ dtype: Union[str, torch.dtype],
+ seed: int,
+ revision: Optional[str] = None,
+ code_revision: Optional[str] = None,
+ rope_scaling: Optional[dict] = None,
+ rope_theta: Optional[float] = None,
+ tokenizer_revision: Optional[str] = None,
+ max_model_len: Optional[int] = None,
+ spec_target_max_model_len: Optional[int] = None,
+ quantization: Optional[str] = None,
+ quantization_param_path: Optional[str] = None,
+ enforce_eager: Optional[bool] = None,
+ max_context_len_to_capture: Optional[int] = None,
+ max_seq_len_to_capture: Optional[int] = None,
+ max_logprobs: int = 20,
+ disable_sliding_window: bool = False,
+ skip_tokenizer_init: bool = False,
+ served_model_name: Optional[Union[str, List[str]]] = None,
+ limit_mm_per_prompt: Optional[Mapping[str, int]] = None,
+ use_async_output_proc: bool = True,
+ override_neuron_config: Optional[Dict[str, Any]] = None,
+ config_format: ConfigFormat = ConfigFormat.AUTO,
+ chat_template_text_format: str = "string",
+ mm_processor_kwargs: Optional[Dict[str, Any]] = None,
+ pooling_type: Optional[str] = None,
+ pooling_norm: Optional[bool] = None,
+ pooling_softmax: Optional[bool] = None,
+ pooling_step_tag_id: Optional[int] = None,
+ pooling_returned_token_ids: Optional[List[int]] = None) -> None:
self.model = model
self.tokenizer = tokenizer
self.tokenizer_mode = tokenizer_mode
@@ -224,6 +244,13 @@ def __init__(self,
supported_tasks, task = self._resolve_task(task, self.hf_config)
self.supported_tasks = supported_tasks
self.task: Final = task
+ self.pooler_config = self._init_pooler_config(
+ pooling_type,
+ pooling_norm,
+ pooling_softmax,
+ pooling_step_tag_id,
+ pooling_returned_token_ids,
+ )
self._verify_quantization()
self._verify_cuda_graph()
@@ -242,6 +269,23 @@ def _init_multimodal_config(
return None
+ def _init_pooler_config(
+ self,
+ pooling_type: Optional[str] = None,
+ pooling_norm: Optional[bool] = None,
+ pooling_softmax: Optional[bool] = None,
+ pooling_step_tag_id: Optional[int] = None,
+ pooling_returned_token_ids: Optional[List[int]] = None
+ ) -> Optional["PoolerConfig"]:
+ if self.task == "embedding":
+ return PoolerConfig(
+ pooling_type=pooling_type,
+ pooling_norm=pooling_norm,
+ pooling_softmax=pooling_softmax,
+ pooling_step_tag_id=pooling_step_tag_id,
+ pooling_returned_token_ids=pooling_returned_token_ids)
+ return None
+
def _init_attention_free(self) -> bool:
architectures = getattr(self.hf_config, "architectures", [])
return ModelRegistry.is_attention_free_model(architectures)
@@ -1647,6 +1691,17 @@ class MultiModalConfig:
# TODO: Add configs to init vision tower or not.
+@dataclass
+class PoolerConfig:
+ """Controls the behavior of pooler in embedding model"""
+
+ pooling_type: Optional[str] = None
+ pooling_norm: Optional[bool] = None
+ pooling_softmax: Optional[bool] = None
+ pooling_step_tag_id: Optional[int] = None
+ pooling_returned_token_ids: Optional[List[int]] = None
+
+
_STR_DTYPE_TO_TORCH_DTYPE = {
"half": torch.float16,
"float16": torch.float16,
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 38687809a31f6..de886c98e51bd 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -184,6 +184,13 @@ class EngineArgs:
mm_processor_kwargs: Optional[Dict[str, Any]] = None
scheduling_policy: Literal["fcfs", "priority"] = "fcfs"
+ # Pooling configuration.
+ pooling_type: Optional[str] = None
+ pooling_norm: Optional[bool] = None
+ pooling_softmax: Optional[bool] = None
+ pooling_step_tag_id: Optional[int] = None
+ pooling_returned_token_ids: Optional[List[int]] = None
+
def __post_init__(self):
if not self.tokenizer:
self.tokenizer = self.model
@@ -850,6 +857,58 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
'priority (lower value means earlier handling) and time of '
'arrival deciding any ties).')
+ parser.add_argument(
+ '--pooling-type',
+ choices=['LAST', 'ALL', 'CLS', 'STEP'],
+ default=None,
+ help='Used to configure the pooling method in the embedding model.'
+ )
+
+ parser.add_argument('--pooling-norm',
+ default=None,
+ action='store_true',
+ help="Used to determine whether to normalize "
+ "the pooled data in the embedding model.")
+
+ parser.add_argument('--no-pooling-norm',
+ default=None,
+ action='store_false',
+ dest='pooling_norm',
+ help="Used to determine whether to normalize "
+ "the pooled data in the embedding model.")
+
+ parser.add_argument('--pooling-softmax',
+ default=None,
+ action='store_true',
+ help="Used to determine whether to softmax "
+ "the pooled data in the embedding model.")
+
+ parser.add_argument('--no-pooling-softmax',
+ default=None,
+ action='store_false',
+ dest='pooling_softmax',
+ help="Used to determine whether to softmax "
+ "the pooled data in the embedding model.")
+
+ parser.add_argument(
+ '--pooling-step-tag-id',
+ type=int,
+ default=None,
+ help="When pooling-step-tag-id is not -1, it indicates "
+ "that the score corresponding to the step-tag-ids in the "
+ "generated sentence should be returned. Otherwise, it "
+ "returns the scores for all tokens.")
+
+ parser.add_argument(
+ '--pooling-returned-token-ids',
+ nargs='+',
+ type=int,
+ default=None,
+ help="pooling-returned-token-ids represents a list of "
+ "indices for the vocabulary dimensions to be extracted, "
+ "such as the token IDs of good_token and bad_token in "
+ "the math-shepherd-mistral-7b-prm model.")
+
return parser
@classmethod
@@ -891,6 +950,11 @@ def create_model_config(self) -> ModelConfig:
override_neuron_config=self.override_neuron_config,
config_format=self.config_format,
mm_processor_kwargs=self.mm_processor_kwargs,
+ pooling_type=self.pooling_type,
+ pooling_norm=self.pooling_norm,
+ pooling_softmax=self.pooling_softmax,
+ pooling_step_tag_id=self.pooling_step_tag_id,
+ pooling_returned_token_ids=self.pooling_returned_token_ids,
)
def create_load_config(self) -> LoadConfig:
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index fde768ed5165e..3fd34fadee1ca 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -257,7 +257,8 @@ def __init__(
"num_scheduler_steps=%d, chunked_prefill_enabled=%s "
"multi_step_stream_outputs=%s, enable_prefix_caching=%s, "
"use_async_output_proc=%s, use_cached_outputs=%s, "
- "chat_template_text_format=%s, mm_processor_kwargs=%s)",
+ "chat_template_text_format=%s, mm_processor_kwargs=%s, "
+ "pooler_config=%r)",
VLLM_VERSION,
model_config.model,
speculative_config,
@@ -294,6 +295,7 @@ def __init__(
use_cached_outputs,
model_config.chat_template_text_format,
model_config.mm_processor_kwargs,
+ model_config.pooler_config,
)
# TODO(woosuk): Print more configs in debug mode.
self.model_config = model_config
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index db97fe0a0285b..083b67c2f8e7d 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -159,6 +159,11 @@ def __init__(
mm_processor_kwargs: Optional[Dict[str, Any]] = None,
# After positional args are removed, move this right below `model`
task: TaskOption = "auto",
+ pooling_type: Optional[str] = None,
+ pooling_norm: Optional[bool] = None,
+ pooling_softmax: Optional[bool] = None,
+ pooling_step_tag_id: Optional[int] = None,
+ pooling_returned_token_ids: Optional[List[int]] = None,
**kwargs,
) -> None:
'''
@@ -193,6 +198,11 @@ def __init__(
disable_custom_all_reduce=disable_custom_all_reduce,
disable_async_output_proc=disable_async_output_proc,
mm_processor_kwargs=mm_processor_kwargs,
+ pooling_type=pooling_type,
+ pooling_norm=pooling_norm,
+ pooling_softmax=pooling_softmax,
+ pooling_step_tag_id=pooling_step_tag_id,
+ pooling_returned_token_ids=pooling_returned_token_ids,
**kwargs,
)
self.llm_engine = LLMEngine.from_engine_args(
diff --git a/vllm/model_executor/layers/pooler.py b/vllm/model_executor/layers/pooler.py
index 0a1df9cb699ae..1c9772b41cbef 100644
--- a/vllm/model_executor/layers/pooler.py
+++ b/vllm/model_executor/layers/pooler.py
@@ -1,8 +1,10 @@
from enum import IntEnum
+from typing import List, Optional
import torch
import torch.nn as nn
+from vllm.config import PoolerConfig
from vllm.model_executor.pooling_metadata import (PoolingMetadata,
PoolingTensors)
from vllm.sequence import EmbeddingSequenceGroupOutput, PoolerOutput
@@ -13,6 +15,7 @@ class PoolingType(IntEnum):
LAST = 0
ALL = 1
CLS = 2
+ STEP = 3
class Pooler(nn.Module):
@@ -28,15 +31,47 @@ class Pooler(nn.Module):
normalize: Whether to normalize the pooled data.
"""
- def __init__(self,
- pooling_type: PoolingType,
- normalize: bool,
- softmax: bool = False):
+ def __init__(
+ self,
+ pooling_type: PoolingType,
+ normalize: bool,
+ softmax: bool,
+ step_tag_id: Optional[int] = None,
+ returned_token_ids: Optional[List[int]] = None,
+ ):
super().__init__()
self.pooling_type = pooling_type
self.normalize = normalize
self.softmax = softmax
+ self.step_tag_id = step_tag_id
+ self.returned_token_ids = returned_token_ids
+
+ @classmethod
+ def from_config_with_defaults(
+ cls,
+ pooler_config: PoolerConfig,
+ pooling_type: PoolingType,
+ normalize: bool,
+ softmax: bool,
+ step_tag_id: Optional[int] = None,
+ returned_token_ids: Optional[List[int]] = None,
+ ) -> Optional["Pooler"]:
+ if pooler_config is None:
+ return None
+ return cls(
+ pooling_type=PoolingType[pooler_config.pooling_type]
+ if pooler_config.pooling_type is not None else pooling_type,
+ normalize=pooler_config.pooling_norm
+ if pooler_config.pooling_norm is not None else normalize,
+ softmax=pooler_config.pooling_softmax
+ if pooler_config.pooling_softmax is not None else softmax,
+ step_tag_id=pooler_config.pooling_step_tag_id
+ if pooler_config.pooling_step_tag_id is not None else step_tag_id,
+ returned_token_ids=pooler_config.pooling_returned_token_ids
+ if pooler_config.pooling_returned_token_ids is not None else
+ returned_token_ids,
+ )
def forward(
self,
@@ -62,6 +97,25 @@ def forward(
for prompt_len in prompt_lens:
pooled_data.append(hidden_states[offset:offset + prompt_len])
offset += prompt_len
+ elif self.pooling_type == PoolingType.STEP:
+ if self.returned_token_ids is not None and len(
+ self.returned_token_ids) > 0:
+ logits = hidden_states[:,
+ self.returned_token_ids].softmax(dim=-1)
+ else:
+ logits = hidden_states.softmax(dim=-1)
+ offset = 0
+ pooled_data = []
+ for prompt_len, seq_data_i in zip(
+ prompt_lens, pooling_metadata.seq_data.values()):
+ if self.step_tag_id is None:
+ pooled_data.append(logits[offset:offset + prompt_len])
+ else:
+ step_idxs = torch.tensor(
+ seq_data_i.prompt_token_ids) == self.step_tag_id
+ pooled_data.append(logits[offset:offset +
+ prompt_len][step_idxs])
+ offset += prompt_len
else:
raise ValueError(f"Invalid pooling type: {self.pooling_type}")
diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index 3ae8a51859f70..79703bb7ded7a 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -23,7 +23,7 @@
from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoadFormat,
LoRAConfig, ModelConfig, MultiModalConfig,
- ParallelConfig, SchedulerConfig)
+ ParallelConfig, PoolerConfig, SchedulerConfig)
from vllm.distributed import (get_tensor_model_parallel_rank,
get_tensor_model_parallel_world_size)
from vllm.envs import VLLM_USE_MODELSCOPE
@@ -122,7 +122,8 @@ def _get_model_initialization_kwargs(
model_class: Type[nn.Module],
lora_config: Optional[LoRAConfig],
multimodal_config: Optional[MultiModalConfig],
- scheduler_config: Optional[SchedulerConfig] = None) -> Dict[str, Any]:
+ scheduler_config: Optional[SchedulerConfig] = None,
+ pooler_config: Optional[PoolerConfig] = None) -> Dict[str, Any]:
"""Get extra kwargs for model initialization."""
extra_kwargs: Dict[str, Any] = {}
@@ -143,7 +144,8 @@ def _get_model_initialization_kwargs(
if has_inner_state(model_class) and scheduler_config:
extra_kwargs["scheduler_config"] = scheduler_config
-
+ if pooler_config:
+ extra_kwargs["pooler_config"] = pooler_config
return extra_kwargs
@@ -155,10 +157,12 @@ def build_model(model_class: Type[nn.Module],
lora_config: Optional[LoRAConfig],
multimodal_config: Optional[MultiModalConfig],
scheduler_config: Optional[SchedulerConfig],
- prefix: Optional[str] = None) -> nn.Module:
+ prefix: Optional[str] = None,
+ pooler_config: Optional[PoolerConfig] = None) -> nn.Module:
extra_kwargs = _get_model_initialization_kwargs(model_class, lora_config,
multimodal_config,
- scheduler_config)
+ scheduler_config,
+ pooler_config)
if prefix:
extra_kwargs["prefix"] = prefix
@@ -185,6 +189,7 @@ def _initialize_model(
lora_config=lora_config,
multimodal_config=model_config.multimodal_config,
scheduler_config=scheduler_config,
+ pooler_config=model_config.pooler_config,
)
diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py
index 4c0a0e303e655..bfed2929d57d2 100644
--- a/vllm/model_executor/models/bert.py
+++ b/vllm/model_executor/models/bert.py
@@ -6,7 +6,7 @@
from vllm.attention import Attention, AttentionMetadata, AttentionType
from vllm.attention.backends.xformers import XFormersImpl
-from vllm.config import CacheConfig
+from vllm.config import CacheConfig, PoolerConfig
from vllm.distributed import get_tensor_model_parallel_world_size
from vllm.model_executor.layers.activation import get_act_fn
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
@@ -387,10 +387,15 @@ def __init__(
config: BertConfig,
cache_config: Optional[CacheConfig] = None,
quant_config: Optional[QuantizationConfig] = None,
+ pooler_config: Optional[PoolerConfig] = None,
) -> None:
super().__init__()
self.model = BertModel(config, cache_config, quant_config)
- self._pooler = Pooler(pooling_type=PoolingType.CLS, normalize=True)
+ self._pooler = Pooler.from_config_with_defaults(
+ pooler_config,
+ pooling_type=PoolingType.CLS,
+ normalize=True,
+ softmax=False)
def forward(
self,
diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py
index d79248f93f5ae..693f32160a289 100644
--- a/vllm/model_executor/models/gemma2.py
+++ b/vllm/model_executor/models/gemma2.py
@@ -22,7 +22,7 @@
from vllm.attention import Attention, AttentionMetadata
from vllm.compilation.decorators import support_torch_compile
-from vllm.config import CacheConfig, LoRAConfig
+from vllm.config import CacheConfig, LoRAConfig, PoolerConfig
from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
from vllm.logger import init_logger
from vllm.model_executor.layers.activation import GeluAndMul
@@ -473,13 +473,17 @@ class Gemma2EmbeddingModel(nn.Module, SupportsPP):
def __init__(
self,
+ pooler_config: Optional[PoolerConfig] = None,
**kwargs,
) -> None:
super().__init__()
self.model = Gemma2Model(**kwargs)
- self._pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)
-
+ self._pooler = Pooler.from_config_with_defaults(
+ pooler_config,
+ pooling_type=PoolingType.LAST,
+ normalize=True,
+ softmax=False)
self.make_empty_intermediate_tensors = (
self.model.make_empty_intermediate_tensors)
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index 98c53bdaae811..8a9e5203972be 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -29,7 +29,7 @@
from vllm.attention import Attention, AttentionMetadata
from vllm.compilation.decorators import support_torch_compile
-from vllm.config import CacheConfig, LoRAConfig
+from vllm.config import CacheConfig, LoRAConfig, PoolerConfig
from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank,
get_tensor_model_parallel_world_size)
from vllm.model_executor.layers.activation import SiluAndMul
@@ -502,6 +502,7 @@ def __init__(
quant_config: Optional[QuantizationConfig] = None,
lora_config: Optional[LoRAConfig] = None,
prefix: str = "",
+ pooler_config: Optional[PoolerConfig] = None,
) -> None:
super().__init__()
@@ -543,6 +544,11 @@ def __init__(
self.lm_head = PPMissingLayer()
self.make_empty_intermediate_tensors = (
self.model.make_empty_intermediate_tensors)
+ self._pooler = Pooler.from_config_with_defaults(
+ pooler_config,
+ pooling_type=PoolingType.STEP,
+ normalize=False,
+ softmax=False)
def forward(
self,
@@ -565,6 +571,14 @@ def compute_logits(
sampling_metadata)
return logits
+ def pooler(
+ self,
+ hidden_states: torch.Tensor,
+ pooling_metadata: PoolingMetadata,
+ ) -> Optional[PoolerOutput]:
+ logits = self.compute_logits(hidden_states, None)
+ return self._pooler(logits, pooling_metadata)
+
def sample(self, logits: torch.Tensor,
sampling_metadata: SamplingMetadata) -> Optional[SamplerOutput]:
next_tokens = self.sampler(logits, sampling_metadata)
@@ -630,12 +644,17 @@ class LlamaEmbeddingModel(nn.Module, SupportsPP):
def __init__(
self,
+ pooler_config: Optional[PoolerConfig] = None,
**kwargs,
) -> None:
super().__init__()
self.model = LlamaModel(**kwargs)
- self._pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)
+ self._pooler = Pooler.from_config_with_defaults(
+ pooler_config,
+ pooling_type=PoolingType.LAST,
+ normalize=True,
+ softmax=False)
self.make_empty_intermediate_tensors = (
self.model.make_empty_intermediate_tensors)
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index f85129b206919..e8540d85ff565 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -11,7 +11,7 @@
from typing_extensions import NotRequired
from vllm.attention import AttentionMetadata
-from vllm.config import CacheConfig, MultiModalConfig
+from vllm.config import CacheConfig, MultiModalConfig, PoolerConfig
from vllm.inputs import INPUT_REGISTRY, DecoderOnlyInputs, InputContext
from vllm.model_executor.layers.pooler import Pooler, PoolingType
from vllm.model_executor.layers.quantization import QuantizationConfig
@@ -285,7 +285,8 @@ def __init__(self,
config: LlavaNextConfig,
multimodal_config: MultiModalConfig,
cache_config: Optional[CacheConfig] = None,
- quant_config: Optional[QuantizationConfig] = None) -> None:
+ quant_config: Optional[QuantizationConfig] = None,
+ pooler_config: Optional[PoolerConfig] = None) -> None:
super().__init__()
self.config = config
@@ -312,8 +313,11 @@ def __init__(self,
# The same model class supports both language generation and embedding
# because the architecture name is the same
- self._pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)
-
+ self._pooler = Pooler.from_config_with_defaults(
+ pooler_config,
+ pooling_type=PoolingType.LAST,
+ normalize=True,
+ softmax=False)
self.make_empty_intermediate_tensors = (
self.language_model.make_empty_intermediate_tensors)
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index 0962d3d3847c9..0fc4556831fd7 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -26,7 +26,8 @@
from transformers import CLIPVisionConfig, PretrainedConfig
from vllm.attention import AttentionMetadata
-from vllm.config import CacheConfig, ModelConfig, MultiModalConfig
+from vllm.config import (CacheConfig, ModelConfig, MultiModalConfig,
+ PoolerConfig)
from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext,
token_inputs)
from vllm.logger import init_logger
@@ -530,7 +531,8 @@ def __init__(self,
config: PretrainedConfig,
multimodal_config: MultiModalConfig,
cache_config: Optional[CacheConfig] = None,
- quant_config: Optional[QuantizationConfig] = None) -> None:
+ quant_config: Optional[QuantizationConfig] = None,
+ pooler_config: Optional[PoolerConfig] = None) -> None:
super().__init__()
self.config = config
@@ -556,8 +558,11 @@ def __init__(self,
# The same model class supports both language generation and embedding
# because the architecture name is the same
- self._pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)
-
+ self._pooler = Pooler.from_config_with_defaults(
+ pooler_config,
+ pooling_type=PoolingType.LAST,
+ normalize=True,
+ softmax=False)
self.make_empty_intermediate_tensors = (
self.language_model.make_empty_intermediate_tensors)
diff --git a/vllm/model_executor/models/qwen2_cls.py b/vllm/model_executor/models/qwen2_cls.py
index e10c6dbbb6472..2d6f3e90f761c 100644
--- a/vllm/model_executor/models/qwen2_cls.py
+++ b/vllm/model_executor/models/qwen2_cls.py
@@ -12,7 +12,7 @@
from transformers import Qwen2Config
from vllm.attention import AttentionMetadata
-from vllm.config import CacheConfig, LoRAConfig
+from vllm.config import CacheConfig, LoRAConfig, PoolerConfig
from vllm.model_executor.layers.linear import RowParallelLinear
from vllm.model_executor.layers.pooler import Pooler, PoolingType
from vllm.model_executor.layers.quantization.base_config import (
@@ -53,6 +53,7 @@ def __init__(
cache_config: Optional[CacheConfig] = None,
quant_config: Optional[QuantizationConfig] = None,
lora_config: Optional[LoRAConfig] = None,
+ pooler_config: Optional[PoolerConfig] = None,
) -> None:
# TODO (@robertgshaw2): see if this can be moved out
if (cache_config.sliding_window is not None
@@ -77,9 +78,11 @@ def __init__(
self.score = RowParallelLinear(config.hidden_size,
config.num_labels,
quant_config=quant_config)
- self._pooler = Pooler(pooling_type=PoolingType.LAST,
- normalize=False,
- softmax=True)
+ self._pooler = Pooler.from_config_with_defaults(
+ pooler_config,
+ pooling_type=PoolingType.LAST,
+ normalize=False,
+ softmax=True)
def forward(
self,
diff --git a/vllm/model_executor/models/qwen2_rm.py b/vllm/model_executor/models/qwen2_rm.py
index ee0eeb9db3808..901b1daaa14a4 100644
--- a/vllm/model_executor/models/qwen2_rm.py
+++ b/vllm/model_executor/models/qwen2_rm.py
@@ -11,7 +11,7 @@
from transformers import Qwen2Config
from vllm.attention import AttentionMetadata
-from vllm.config import CacheConfig, LoRAConfig
+from vllm.config import CacheConfig, LoRAConfig, PoolerConfig
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
RowParallelLinear)
from vllm.model_executor.layers.pooler import Pooler, PoolingType
@@ -64,6 +64,7 @@ def __init__(
cache_config: Optional[CacheConfig] = None,
quant_config: Optional[QuantizationConfig] = None,
lora_config: Optional[LoRAConfig] = None,
+ pooler_config: Optional[PoolerConfig] = None,
) -> None:
# TODO (@robertgshaw2): see if this can be moved out
if (cache_config.sliding_window is not None
@@ -93,8 +94,11 @@ def __init__(
RowParallelLinear(config.hidden_size, 1,
quant_config=quant_config),
)
- self._pooler = Pooler(pooling_type=PoolingType.ALL, normalize=False)
-
+ self._pooler = Pooler.from_config_with_defaults(
+ pooler_config,
+ pooling_type=PoolingType.ALL,
+ normalize=False,
+ softmax=False)
self.make_empty_intermediate_tensors = (
self.model.make_empty_intermediate_tensors)
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index 30dfff31f7e48..f50ceaccb1bbe 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -100,11 +100,27 @@
"Qwen2ForRewardModel": ("qwen2_rm", "Qwen2ForRewardModel"),
"Qwen2ForSequenceClassification": (
"qwen2_cls", "Qwen2ForSequenceClassification"),
+ "LlamaForCausalLM": ("llama", "LlamaForCausalLM"),
+ "Phi3ForCausalLM": ("phi3", "Phi3ForCausalLM"),
+ "DeciLMForCausalLM": ("decilm", "DeciLMForCausalLM"),
# [Multimodal]
"LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"), # noqa: E501
"Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"),
}
+def add_embedding_models(base_models, embedding_models):
+ with_pooler_method_models = {}
+ embedding_models_name = embedding_models.keys()
+ for name, (path, arch) in base_models.items():
+ if arch in embedding_models_name:
+ with_pooler_method_models[name] = (path, arch)
+ return with_pooler_method_models
+
+_EMBEDDING_MODELS = {
+ **add_embedding_models(_TEXT_GENERATION_MODELS, _EMBEDDING_MODELS),
+ **_EMBEDDING_MODELS,
+}
+
_MULTIMODAL_MODELS = {
# [Decoder-only]
"Blip2ForConditionalGeneration": ("blip2", "Blip2ForConditionalGeneration"),
From 9ff4511e43bb95efefd4e28048ca257e408277fb Mon Sep 17 00:00:00 2001
From: Elfie Guo <164945471+elfiegg@users.noreply.github.com>
Date: Wed, 30 Oct 2024 09:33:53 -0700
Subject: [PATCH 053/113] [Misc] Add chunked-prefill support on FlashInfer.
(#9781)
---
.../basic_correctness/test_chunked_prefill.py | 12 +++
vllm/attention/backends/flashinfer.py | 88 +++++++++++++------
2 files changed, 72 insertions(+), 28 deletions(-)
diff --git a/tests/basic_correctness/test_chunked_prefill.py b/tests/basic_correctness/test_chunked_prefill.py
index 51aec8c873d12..cc5bc2aca27c9 100644
--- a/tests/basic_correctness/test_chunked_prefill.py
+++ b/tests/basic_correctness/test_chunked_prefill.py
@@ -11,6 +11,8 @@
import pytest
+from tests.kernels.utils import override_backend_env_variable
+
from ..models.utils import check_logprobs_close, check_outputs_equal
from ..utils import multi_gpu_test
@@ -28,6 +30,7 @@
# NOTE: Increasing this in this suite will fail CI because we currently cannot
# reset distributed env properly. Use a value > 1 just when you test.
@pytest.mark.parametrize("tensor_parallel_size", [1])
+@pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"])
def test_models(
hf_runner,
vllm_runner,
@@ -38,11 +41,15 @@ def test_models(
chunked_prefill_token_size: int,
enforce_eager: bool,
tensor_parallel_size: int,
+ attention_backend: str,
+ monkeypatch,
) -> None:
"""
Checks exact match decode between huggingface model and vllm runner with
chunked prefill.
"""
+ override_backend_env_variable(monkeypatch, attention_backend)
+
max_num_seqs = chunked_prefill_token_size
max_num_batched_tokens = chunked_prefill_token_size
@@ -71,13 +78,18 @@ def test_models(
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"])
def test_models_distributed(
hf_runner,
vllm_runner,
example_prompts,
model: str,
distributed_executor_backend: str,
+ attention_backend: str,
+ monkeypatch,
) -> None:
+ override_backend_env_variable(monkeypatch, attention_backend)
+
if (model == "meta-llama/Llama-2-7b-hf"
and distributed_executor_backend == "ray"):
# test ray adag
diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py
index e43fb134a6a5a..5ea101ae0432f 100644
--- a/vllm/attention/backends/flashinfer.py
+++ b/vllm/attention/backends/flashinfer.py
@@ -268,6 +268,11 @@ class FlashInferMetadata(AttentionMetadata):
# Maximum sequence length among prefill batch. 0 if there are decoding
# requests only.
max_prefill_seq_len: int
+ # Number of query tokens for each request in the batch.
+ # Currently, we require that all requests have the same number of query
+ # tokens during the decoding phase. When speculative decoding is enabled,
+ # decode_query_len might be greater than 1. In all other cases, it is 1.
+ decode_query_len: Optional[int] = 1
use_cuda_graph: bool = True
@@ -335,6 +340,7 @@ def begin_forward(self):
assert self.paged_kv_last_page_len is not None
assert self.block_table_bound is not None
assert self.seq_lens_tensor is not None
+ self.query_start_loc = self.query_start_loc[:self.num_prefills + 1]
batch_size = self.query_start_loc.shape[0] - 1
assert batch_size >= 0
# We will use flash attention for profiling to
@@ -349,11 +355,13 @@ def begin_forward(self):
self.paged_kv_indices = self.paged_kv_indices.to(self.device)
self.prefill_wrapper.end_forward()
self.prefill_wrapper.begin_forward(
- self.query_start_loc, self.paged_kv_indptr,
- self.paged_kv_indices, self.paged_kv_last_page_len,
+ self.query_start_loc,
+ self.paged_kv_indptr[:self.num_prefills + 1],
+ self.paged_kv_indices,
+ self.paged_kv_last_page_len[:self.num_prefills],
self.num_qo_heads, self.num_kv_heads, self.head_dim,
self.page_size)
- else:
+ if self.num_decode_tokens > 0:
assert self.paged_kv_indices is not None
assert self.paged_kv_indptr is not None
assert self.paged_kv_last_page_len is not None
@@ -370,9 +378,9 @@ def begin_forward(self):
assert self.decode_wrapper is not None
self.decode_wrapper.end_forward()
self.decode_wrapper.begin_forward(
- self.paged_kv_indptr,
+ self.paged_kv_indptr[self.num_prefills:],
self.paged_kv_indices,
- self.paged_kv_last_page_len,
+ self.paged_kv_last_page_len[self.num_prefills:],
self.num_qo_heads,
self.num_kv_heads,
self.head_dim,
@@ -397,21 +405,14 @@ def asdict_zerocopy(self,
@property
def prefill_metadata(self) -> Optional["FlashInferMetadata"]:
- # Currently chunked prefill is not supported
- if self.num_decode_tokens == 0:
- assert self.num_prefills > 0
- return self
-
- return None
+ if self.num_prefills == 0:
+ return None
+ return self
@property
def decode_metadata(self) -> Optional["FlashInferMetadata"]:
- # Currently chunked prefill is not supported
- if self.num_prefills > 0:
- assert self.num_decode_tokens == 0, (
- "Chunked prefill is not supported with flashinfer yet.")
+ if self.num_decode_tokens == 0:
return None
-
return self
def advance_step(self,
@@ -599,11 +600,12 @@ def build(self, seq_lens: List[int], query_lens: List[int],
max_prefill_seq_len = max(self.prefill_seq_lens, default=0)
num_decode_tokens = self.num_decode_tokens
+ decode_query_len = max(query_lens[self.num_prefills:], default=1)
if use_captured_graph:
self.slot_mapping.extend([PAD_SLOT_ID] * cuda_graph_pad_size)
self.block_tables.extend([] * cuda_graph_pad_size)
- num_decode_tokens = batch_size
+ num_decode_tokens = batch_size - self.num_prefill_tokens
# The shape of graph_block_tables is
# [max batch size, max context len // block size].
@@ -689,6 +691,7 @@ def build(self, seq_lens: List[int], query_lens: List[int],
self.runner.kv_cache_dtype, self.runner.model_config.dtype)
return FlashInferMetadata(
+ decode_query_len=decode_query_len,
num_prefills=self.num_prefills,
slot_mapping=slot_mapping_tensor,
num_prefill_tokens=self.num_prefill_tokens,
@@ -811,12 +814,6 @@ def unified_flash_infer(
key = key.view(-1, num_kv_heads, head_size)
value = value.view(-1, num_kv_heads, head_size)
- if attn_metadata.num_prefill_tokens > 0:
- assert attn_metadata.num_decode_tokens == 0, (
- "Chunked prefill is not supported with flashinfer yet.")
- if attn_metadata.num_decode_tokens > 0:
- assert attn_metadata.num_prefill_tokens == 0, (
- "Chunked prefill is not supported with flashinfer yet.")
if kv_cache.numel() > 0:
# Use the same reshape and cache kernel as flash attention.
ops.reshape_and_cache_flash(
@@ -836,14 +833,33 @@ def unified_flash_infer(
kv_cache_dtype)
kv_cache = kv_cache.view(torch_dtype)
+ num_prefill_tokens = attn_metadata.num_prefill_tokens
+ num_decode_tokens = attn_metadata.num_decode_tokens
+ assert key.shape[0] == num_prefill_tokens + num_decode_tokens, \
+ f"key : {key.shape} : #prefill tokens {num_prefill_tokens} : #decode tokens {num_decode_tokens}" # noqa
+ assert value.shape[0] == num_prefill_tokens + num_decode_tokens, \
+ f"value : {value.shape} : #prefill toks {num_prefill_tokens} : #decode toks {num_decode_tokens}" # noqa
query = query.contiguous() # Flashinfer requires query to be contiguous
+ # Query for decode. KV is not needed because it is already cached.
+ # QKV for prefill.
+ decode_query = query[num_prefill_tokens:]
+ query = query[:num_prefill_tokens]
+
+ key = key[:num_prefill_tokens]
+ value = value[:num_prefill_tokens]
+
+ assert query.shape[0] == num_prefill_tokens
+ assert decode_query.shape[0] == num_decode_tokens
+
+ prefill_output: Optional[torch.Tensor] = None
+ decode_output: Optional[torch.Tensor] = None
if prefill_meta := attn_metadata.prefill_metadata:
# We will use flash attention for prefill
# when kv_cache is not provided.
# This happens when vllm runs the profiling to
# determine the number of blocks.
if kv_cache.numel() == 0:
- output = flash_attn_varlen_func(
+ prefill_output = flash_attn_varlen_func(
q=query,
k=key,
v=value,
@@ -859,18 +875,34 @@ def unified_flash_infer(
else:
assert prefill_meta is not None
assert prefill_meta.prefill_wrapper is not None
- output = prefill_meta.prefill_wrapper.forward(
+ prefill_output = prefill_meta.prefill_wrapper.forward(
query, kv_cache, logits_soft_cap=logits_soft_cap, causal=True)
- else:
+ if decode_meta := attn_metadata.decode_metadata:
assert attn_metadata.decode_metadata is not None
assert attn_metadata.decode_metadata.decode_wrapper is not None
- output = attn_metadata.decode_metadata.decode_wrapper.forward(
- query,
+ decode_output = attn_metadata.decode_metadata.decode_wrapper.forward(
+ decode_query,
kv_cache,
sm_scale=softmax_scale,
logits_soft_cap=logits_soft_cap,
k_scale=k_scale,
v_scale=v_scale)
+
+ if prefill_output is None and decode_output is not None:
+ # Decode only batch.
+ output, num_tokens = decode_output, num_decode_tokens
+ elif decode_output is None and prefill_output is not None:
+ # Prefill only batch.
+ output, num_tokens = prefill_output, num_prefill_tokens
+ else:
+ # Chunked prefill batch does not work with speculative decoding in
+ # FlashInfer backend, so the query length for decode should be 1.
+ assert prefill_output is not None
+ assert decode_output is not None
+ assert decode_meta is not None
+ assert decode_meta.decode_query_len == 1
+ decode_output = decode_output.squeeze(1)
+ output = torch.cat([prefill_output, decode_output], dim=0)
return output.view(num_tokens, hidden_size)
From 3b3f1e743631667795469946a33d8352fcc74efd Mon Sep 17 00:00:00 2001
From: Joe Runde
Date: Wed, 30 Oct 2024 10:34:07 -0600
Subject: [PATCH 054/113] [Bugfix][core] replace heartbeat with pid check
(#9818)
Signed-off-by: Joe Runde
---
tests/mq_llm_engine/test_error_handling.py | 27 +++++++++-
tests/mq_llm_engine/utils.py | 2 +-
vllm/engine/multiprocessing/client.py | 29 +++++++----
vllm/engine/multiprocessing/engine.py | 59 ++++------------------
vllm/entrypoints/openai/api_server.py | 7 ++-
5 files changed, 62 insertions(+), 62 deletions(-)
diff --git a/tests/mq_llm_engine/test_error_handling.py b/tests/mq_llm_engine/test_error_handling.py
index 205ab00aa6b17..83bc4e7cf847e 100644
--- a/tests/mq_llm_engine/test_error_handling.py
+++ b/tests/mq_llm_engine/test_error_handling.py
@@ -21,7 +21,7 @@
from vllm.utils import FlexibleArgumentParser
MODEL = "google/gemma-1.1-2b-it"
-ENGINE_ARGS = AsyncEngineArgs(model=MODEL)
+ENGINE_ARGS = AsyncEngineArgs(model=MODEL, enforce_eager=True)
RAISED_ERROR = KeyError
RAISED_VALUE = "foo"
@@ -266,3 +266,28 @@ async def test_mp_cuda_init():
async with build_async_engine_client(args):
pass
+
+
+@pytest.mark.asyncio
+async def test_engine_process_death(tmp_socket):
+ with RemoteMQLLMEngine(engine_args=ENGINE_ARGS,
+ ipc_path=tmp_socket) as engine:
+
+ client = await engine.make_client()
+ assert client.is_running
+
+ # kill the engine process
+ engine.proc.kill()
+
+ # Generate call should fail
+ with pytest.raises(MQEngineDeadError):
+ async for _ in client.generate(prompt="Hello my name is",
+ sampling_params=SamplingParams(),
+ request_id=uuid.uuid4()):
+ pass
+
+ # And the health check should show the engine is dead
+ with pytest.raises(RuntimeError, match="Engine process .* died"):
+ await client.check_health()
+
+ client.close()
diff --git a/tests/mq_llm_engine/utils.py b/tests/mq_llm_engine/utils.py
index 3ffa126070ca0..f717c1355431c 100644
--- a/tests/mq_llm_engine/utils.py
+++ b/tests/mq_llm_engine/utils.py
@@ -68,7 +68,7 @@ def __exit__(self, exc_type, exc_value, traceback):
async def make_client(self) -> MQLLMEngineClient:
engine_config = self.engine_args.create_engine_config()
- client = MQLLMEngineClient(self.ipc_path, engine_config)
+ client = MQLLMEngineClient(self.ipc_path, engine_config, self.proc.pid)
while True:
try:
await client.setup()
diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py
index 9e5a6b21f4c18..6e6630b3ff55f 100644
--- a/vllm/engine/multiprocessing/client.py
+++ b/vllm/engine/multiprocessing/client.py
@@ -6,6 +6,7 @@
Optional, Union, cast, overload)
import cloudpickle
+import psutil
import zmq
import zmq.asyncio
from zmq import Frame # type: ignore[attr-defined]
@@ -77,7 +78,8 @@ class MQLLMEngineClient(EngineClient):
every N seconds, confirming the engine is healthy
"""
- def __init__(self, ipc_path: str, engine_config: EngineConfig):
+ def __init__(self, ipc_path: str, engine_config: EngineConfig,
+ engine_pid: int):
self.context = zmq.asyncio.Context()
self._errored_with: Optional[BaseException] = None
@@ -115,6 +117,7 @@ def __init__(self, ipc_path: str, engine_config: EngineConfig):
# Loop to check health of the LLMEngine periodically.
# Started after the MQLLMEngine is ready.
self.health_loop: Optional[asyncio.Task] = None
+ self._engine_process = psutil.Process(engine_pid)
@staticmethod
def is_unsupported_config(engine_args: AsyncEngineArgs):
@@ -131,21 +134,22 @@ def get_data_socket(self) -> Iterator[Socket]:
socket.close(linger=0)
async def run_heartbeat_loop(self, timeout: int):
- """Background loop that continually listens to the RPCServer for
- heartbeats.
+ """Background loop that continually checks to ensure the engine process
+ is still alive.
"""
try:
while True:
- if await self.heartbeat_socket.poll(timeout=timeout) == 0:
- # No heartbeat was received. Set error and exit the loop
+ # Check if the engine process is running:
+ if not self._engine_process.is_running() or (
+ self._engine_process.status() == psutil.STATUS_ZOMBIE):
+ # NB: is_running() returns True for zombies
self._set_errored(
- TimeoutError("No heartbeat received "
- "from MQLLMEngine"))
- logger.debug("Shutting down MQLLMEngineClient check "
- "health loop due to timeout")
+ RuntimeError(
+ f"Engine process (pid {self._engine_process.pid}) "
+ "died."))
break
- else:
+ if await self.heartbeat_socket.poll(timeout=timeout):
# Heartbeat received- check the message
await self._check_success(
error_message="Heartbeat failed.",
@@ -156,6 +160,11 @@ async def run_heartbeat_loop(self, timeout: int):
except asyncio.CancelledError:
logger.debug("Shutting down MQLLMEngineClient check health loop.")
+ except psutil.NoSuchProcess:
+ self._set_errored(
+ RuntimeError(
+ f"Engine process (pid {self._engine_process.pid}) died."))
+
except Exception as e:
self._set_errored(e)
diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py
index f67acdf660759..0a7f430eca488 100644
--- a/vllm/engine/multiprocessing/engine.py
+++ b/vllm/engine/multiprocessing/engine.py
@@ -1,7 +1,5 @@
import pickle
import signal
-import threading
-import time
from contextlib import contextmanager
from typing import Iterator, List, Optional, Union
@@ -21,7 +19,7 @@
RPCStartupRequest, RPCStartupResponse,
RPCUProfileRequest)
# yapf: enable
-from vllm.envs import VLLM_RPC_TIMEOUT, VLLM_USE_V1
+from vllm.envs import VLLM_USE_V1
from vllm.executor.gpu_executor import GPUExecutor
from vllm.logger import init_logger
from vllm.outputs import RequestOutput
@@ -108,20 +106,6 @@ def __init__(self,
# Error state.
self._errored_with: Optional[BaseException] = None
- # Heartbeat thread
- self.heartbeat_thread = threading.Thread(target=self._heartbeat_loop,
- daemon=True)
- self._heartbeat_stop_event = threading.Event()
- # The heartbeat needs to be faster than what the client will wait for
- # The VLLM_RPC_TIMEOUT duration is in ms, and we need one in seconds
- self.heartbeat_interval_seconds = VLLM_RPC_TIMEOUT / 5000.0
-
- self._last_alive_time = time.time()
- # The heartbeats can tolerate a long period of the engine chugging
- # away at a generation request.
- # The VLLM_RPC_TIMEOUT duration is in ms, and we need one in seconds
- self.last_alive_threshold = VLLM_RPC_TIMEOUT * 3.0 / 1000.0
-
@property
def dead_error(self) -> BaseException:
if self._errored_with is not None:
@@ -157,8 +141,6 @@ def start(self):
try:
logger.debug("Starting Startup Loop.")
self.run_startup_loop()
- logger.debug("Starting heartbeat thread")
- self.heartbeat_thread.start()
logger.debug("Starting Engine Loop.")
self.run_engine_loop()
except Exception as e:
@@ -172,7 +154,6 @@ def start(self):
def cleanup(self):
"""Cleanup zeromq state on shutdown."""
# Closes all sockets and destroys context.
- self._heartbeat_stop_event.set()
self.ctx.destroy(linger=0)
del self.engine
@@ -211,11 +192,12 @@ def run_engine_loop(self):
"""Core busy loop of the LLMEngine."""
while True:
- self._alive()
if not self.engine.has_unfinished_requests():
# Poll until there is work to do.
while self.input_socket.poll(timeout=POLLING_TIMEOUT_MS) == 0:
- self._alive()
+ # When there's no work, check on engine health and send
+ # health status back to client
+ self._health_check()
self.engine.do_log_stats()
logger.debug("Waiting for new requests in engine loop.")
@@ -314,32 +296,16 @@ def _handle_abort_request(self, request: RPCAbortRequest):
if self.log_requests:
logger.info("Aborted request %s.", request.request_id)
- def _heartbeat_loop(self):
- while not self._heartbeat_stop_event.wait(
- timeout=self.heartbeat_interval_seconds):
- # Loops until the stop event is set
- self._heartbeat()
-
- logger.debug("Exiting MQLLMEngine heartbeat thread")
-
- def _heartbeat(self):
+ def _health_check(self):
# Send unhealthy if engine has already errored
if self._errored_with is not None:
self._send_unhealthy(self._errored_with)
-
- # Check for life of the main loop
- elif time.time() - self._last_alive_time > self.last_alive_threshold:
- self._send_unhealthy(RuntimeError("Engine loop has died"))
-
- else:
- # Otherwise- check health of the engine
- # self.engine.check_health() raises on unhealthy
- try:
- self.engine.check_health()
- self._send_healthy()
- except Exception as e:
- self._set_errored(e)
- self._send_unhealthy(e)
+ try:
+ self.engine.check_health()
+ self._send_healthy()
+ except Exception as e:
+ self._set_errored(e)
+ self._send_unhealthy(e)
def _send_outputs(self, outputs: REQUEST_OUTPUTS_T):
"""Send List of RequestOutput to RPCClient."""
@@ -369,9 +335,6 @@ def _set_errored(self, e: BaseException):
if self._errored_with is None:
self._errored_with = e
- def _alive(self):
- self._last_alive_time = time.time()
-
def start_profile(self) -> None:
if type(self.engine.model_executor) is GPUExecutor:
self.engine.model_executor.start_profile()
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index afa370a1cb40b..0e0ec311023eb 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -176,13 +176,16 @@ async def build_async_engine_client_from_engine_args(
UsageContext.OPENAI_API_SERVER,
ipc_path))
engine_process.start()
- logger.info("Started engine process with PID %d", engine_process.pid)
+ engine_pid = engine_process.pid
+ assert engine_pid is not None, "Engine process failed to start"
+ logger.info("Started engine process with PID %d", engine_pid)
# Build RPCClient, which conforms to EngineClient Protocol.
# NOTE: Actually, this is not true yet. We still need to support
# embedding models via RPC (see TODO above)
engine_config = engine_args.create_engine_config()
- mp_engine_client = MQLLMEngineClient(ipc_path, engine_config)
+ mp_engine_client = MQLLMEngineClient(ipc_path, engine_config,
+ engine_pid)
try:
while True:
From 33d257735f35da437262f381cc9cb5a02f3d6b6b Mon Sep 17 00:00:00 2001
From: Joe Runde
Date: Wed, 30 Oct 2024 11:28:29 -0600
Subject: [PATCH 055/113] [Doc] link bug for multistep guided decoding (#9843)
Signed-off-by: Joe Runde
---
docs/source/serving/compatibility_matrix.rst | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/docs/source/serving/compatibility_matrix.rst b/docs/source/serving/compatibility_matrix.rst
index cac0605ca132b..20a81f4cad1d1 100644
--- a/docs/source/serving/compatibility_matrix.rst
+++ b/docs/source/serving/compatibility_matrix.rst
@@ -283,7 +283,7 @@ Feature x Feature
- ✅
- ✅
- ✅
- - ✗
+ - `✗ `__
- ?
- ✅
- ✅
From c787f2d81ddc25a3505a2075238f1f54233ff76b Mon Sep 17 00:00:00 2001
From: Harsha vardhan manoj Bikki <39381063+hbikki@users.noreply.github.com>
Date: Wed, 30 Oct 2024 12:22:02 -0700
Subject: [PATCH 056/113] [Neuron] Update Dockerfile.neuron to fix build
failure (#9822)
---
Dockerfile.neuron | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/Dockerfile.neuron b/Dockerfile.neuron
index 3d9d8e7da487c..0d0d8df94578c 100644
--- a/Dockerfile.neuron
+++ b/Dockerfile.neuron
@@ -36,6 +36,6 @@ RUN python3 -m pip install -U \
ENV VLLM_TARGET_DEVICE neuron
RUN --mount=type=bind,source=.git,target=.git \
- pip install --no-build-isolation -v -e . \
+ pip install --no-build-isolation -v -e .
CMD ["/bin/bash"]
From c2cd1a21420e5cac847808bd3113b4c1100633c1 Mon Sep 17 00:00:00 2001
From: youkaichao
Date: Wed, 30 Oct 2024 13:36:51 -0700
Subject: [PATCH 057/113] [doc] update pp support (#9853)
Signed-off-by: youkaichao
---
docs/source/serving/distributed_serving.rst | 5 +----
1 file changed, 1 insertion(+), 4 deletions(-)
diff --git a/docs/source/serving/distributed_serving.rst b/docs/source/serving/distributed_serving.rst
index fcb2646df50d3..4d57206e53a05 100644
--- a/docs/source/serving/distributed_serving.rst
+++ b/docs/source/serving/distributed_serving.rst
@@ -22,7 +22,7 @@ After adding enough GPUs and nodes to hold the model, you can run vLLM first, wh
Details for Distributed Inference and Serving
----------------------------------------------
-vLLM supports distributed tensor-parallel inference and serving. Currently, we support `Megatron-LM's tensor parallel algorithm <https://arxiv.org/pdf/1909.08053.pdf>`_. We also support pipeline parallel as a beta feature for online serving. We manage the distributed runtime with either `Ray <https://github.com/ray-project/ray>`_ or python native multiprocessing. Multiprocessing can be used when deploying on a single node, multi-node inferencing currently requires Ray.
+vLLM supports distributed tensor-parallel and pipeline-parallel inference and serving. Currently, we support `Megatron-LM's tensor parallel algorithm <https://arxiv.org/pdf/1909.08053.pdf>`_. We manage the distributed runtime with either `Ray <https://github.com/ray-project/ray>`_ or python native multiprocessing. Multiprocessing can be used when deploying on a single node, multi-node inferencing currently requires Ray.
Multiprocessing will be used by default when not running in a Ray placement group and if there are sufficient GPUs available on the same node for the configured :code:`tensor_parallel_size`, otherwise Ray will be used. This default can be overridden via the :code:`LLM` class :code:`distributed-executor-backend` argument or :code:`--distributed-executor-backend` API server argument. Set it to :code:`mp` for multiprocessing or :code:`ray` for Ray. It's not required for Ray to be installed for the multiprocessing case.
@@ -49,9 +49,6 @@ You can also additionally specify :code:`--pipeline-parallel-size` to enable pip
$ --tensor-parallel-size 4 \
$ --pipeline-parallel-size 2
-.. note::
- Pipeline parallel is a beta feature. It is only supported for online serving as well as LLaMa, GPT2, Mixtral, Qwen, Qwen2, and Nemotron style models.
-
Multi-Node Inference and Serving
--------------------------------
From 00d91c8a2cf3ebaf0f3ea69312f6e3882ed9f372 Mon Sep 17 00:00:00 2001
From: Yongzao <532741407@qq.com>
Date: Thu, 31 Oct 2024 05:52:05 +0800
Subject: [PATCH 058/113] [CI/Build] Simplify exception trace in api server
tests (#9787)
Signed-off-by: youkaichao
Co-authored-by: youkaichao
---
tests/utils.py | 10 +++++++---
1 file changed, 7 insertions(+), 3 deletions(-)
diff --git a/tests/utils.py b/tests/utils.py
index f6f588df48810..e8aad9cb3268f 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -133,15 +133,19 @@ def _wait_for_server(self, *, url: str, timeout: float):
try:
if requests.get(url).status_code == 200:
break
- except Exception as err:
+ except Exception:
+ # this exception can only be raised by requests.get,
+ # which means the server is not ready yet.
+ # the stack trace is not useful, so we suppress it
+ # by using `raise from None`.
result = self.proc.poll()
if result is not None and result != 0:
- raise RuntimeError("Server exited unexpectedly.") from err
+ raise RuntimeError("Server exited unexpectedly.") from None
time.sleep(0.5)
if time.time() - start > timeout:
raise RuntimeError(
- "Server failed to start in time.") from err
+ "Server failed to start in time.") from None
@property
def url_root(self) -> str:
From 64384bbcdfe6bdf4b50ff82bda90e728160325f5 Mon Sep 17 00:00:00 2001
From: youkaichao
Date: Wed, 30 Oct 2024 16:34:22 -0700
Subject: [PATCH 059/113] [torch.compile] upgrade tests (#9858)
Signed-off-by: youkaichao
---
tests/compile/test_basic_correctness.py | 26 +++++++++++++------------
1 file changed, 14 insertions(+), 12 deletions(-)
diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py
index 6aa27b24b4a6e..2f92ff73845f5 100644
--- a/tests/compile/test_basic_correctness.py
+++ b/tests/compile/test_basic_correctness.py
@@ -30,18 +30,20 @@ def test_compile_correctness(model, model_args, pp_size, tp_size, attn_backend,
pytest.skip("Not correct CUDA devices for the test.")
import os
os.environ["VLLM_ATTENTION_BACKEND"] = attn_backend
- if not fullgraph:
- os.environ["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "0"
- all_args = [["--enforce-eager"] + model_args + ["--max_model_len", "1024"]
- + ["-pp", str(pp_size)] + ["-tp", str(tp_size)]] * 3
+ all_args = [["--enforce-eager"] + model_args + ["-pp", str(pp_size)] +
+ ["-tp", str(tp_size)]] * 3
# don't test VLLM_TORCH_COMPILE_LEVEL == 3 case
# inductor will change the output, so we cannot compare them.
- all_envs: List[Optional[Dict[str, str]]] = [{
- "VLLM_TORCH_COMPILE_LEVEL":
- str(level)
- } for level in [
- CompilationLevel.NO_COMPILATION,
- CompilationLevel.DYNAMO_AS_IS,
- CompilationLevel.DYNAMO_ONCE,
- ]]
+ all_envs: List[Optional[Dict[str, str]]] = []
+ for level in [
+ CompilationLevel.NO_COMPILATION,
+ CompilationLevel.DYNAMO_AS_IS,
+ CompilationLevel.DYNAMO_ONCE,
+ ]:
+ all_envs.append({"VLLM_TORCH_COMPILE_LEVEL": str(level)})
+ if level != CompilationLevel.DYNAMO_ONCE and not fullgraph:
+ # "DYNAMO_ONCE" will always use fullgraph
+ all_envs[-1][
+ "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "0" # type: ignore
+
compare_all_settings(model, all_args, all_envs, method=method)
From abbfb6134dc73359cba015dbd1ad30fafd25a891 Mon Sep 17 00:00:00 2001
From: Guillaume Calmettes
Date: Thu, 31 Oct 2024 02:15:56 +0100
Subject: [PATCH 060/113] [Misc][OpenAI] deprecate max_tokens in favor of new
max_completion_tokens field for chat completion endpoint (#9837)
---
benchmarks/backend_request_func.py | 2 +-
docs/source/serving/run_on_sky.rst | 6 +-
examples/offline_inference_openai.md | 8 +-
examples/openai_api_client_for_multimodal.py | 12 +--
examples/openai_example_batch.jsonl | 4 +-
requirements-common.txt | 2 +-
tests/entrypoints/openai/test_audio.py | 32 +++---
tests/entrypoints/openai/test_chat.py | 103 ++++++++++---------
tests/entrypoints/openai/test_vision.py | 38 +++----
tests/tool_use/test_chat_completions.py | 8 +-
tests/tool_use/test_parallel_tool_calls.py | 8 +-
tests/tool_use/test_tool_calls.py | 8 +-
vllm/entrypoints/openai/protocol.py | 13 ++-
vllm/entrypoints/openai/serving_engine.py | 14 ++-
14 files changed, 140 insertions(+), 118 deletions(-)
diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py
index 4813fde27f0bc..0a903877f000d 100644
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -324,7 +324,7 @@ async def async_request_openai_chat_completions(
},
],
"temperature": 0.0,
- "max_tokens": request_func_input.output_len,
+ "max_completion_tokens": request_func_input.output_len,
"stream": True,
"ignore_eos": request_func_input.ignore_eos,
}
diff --git a/docs/source/serving/run_on_sky.rst b/docs/source/serving/run_on_sky.rst
index 674b14a879bc3..227e6fd2a7818 100644
--- a/docs/source/serving/run_on_sky.rst
+++ b/docs/source/serving/run_on_sky.rst
@@ -109,7 +109,7 @@ SkyPilot can scale up the service to multiple service replicas with built-in aut
messages:
- role: user
content: Hello! What is your name?
- max_tokens: 1
+ max_completion_tokens: 1
.. raw:: html
@@ -129,7 +129,7 @@ SkyPilot can scale up the service to multiple service replicas with built-in aut
messages:
- role: user
content: Hello! What is your name?
- max_tokens: 1
+ max_completion_tokens: 1
resources:
accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model.
@@ -255,7 +255,7 @@ This will scale the service up to when the QPS exceeds 2 for each replica.
messages:
- role: user
content: Hello! What is your name?
- max_tokens: 1
+ max_completion_tokens: 1
resources:
accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model.
diff --git a/examples/offline_inference_openai.md b/examples/offline_inference_openai.md
index ea34374edd3f9..4c64197975534 100644
--- a/examples/offline_inference_openai.md
+++ b/examples/offline_inference_openai.md
@@ -35,8 +35,8 @@
```
$ cat openai_example_batch.jsonl
-{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
-{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
+{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
+{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
```
### Step 2: Run the batch
@@ -94,8 +94,8 @@ To follow along with this example, you can download the example batch, or create
```
$ cat openai_example_batch.jsonl
-{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
-{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
+{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
+{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
```
Now upload your batch file to your S3 bucket.
diff --git a/examples/openai_api_client_for_multimodal.py b/examples/openai_api_client_for_multimodal.py
index beb83e494ed0b..0ec4f71dddf93 100644
--- a/examples/openai_api_client_for_multimodal.py
+++ b/examples/openai_api_client_for_multimodal.py
@@ -53,7 +53,7 @@ def run_text_only() -> None:
"content": "What's the capital of France?"
}],
model=model,
- max_tokens=64,
+ max_completion_tokens=64,
)
result = chat_completion.choices[0].message.content
@@ -83,7 +83,7 @@ def run_single_image() -> None:
],
}],
model=model,
- max_tokens=64,
+ max_completion_tokens=64,
)
result = chat_completion_from_url.choices[0].message.content
@@ -109,7 +109,7 @@ def run_single_image() -> None:
],
}],
model=model,
- max_tokens=64,
+ max_completion_tokens=64,
)
result = chat_completion_from_base64.choices[0].message.content
@@ -144,7 +144,7 @@ def run_multi_image() -> None:
],
}],
model=model,
- max_tokens=64,
+ max_completion_tokens=64,
)
result = chat_completion_from_url.choices[0].message.content
@@ -175,7 +175,7 @@ def run_audio() -> None:
],
}],
model=model,
- max_tokens=64,
+ max_completion_tokens=64,
)
result = chat_completion_from_url.choices[0].message.content
@@ -201,7 +201,7 @@ def run_audio() -> None:
],
}],
model=model,
- max_tokens=64,
+ max_completion_tokens=64,
)
result = chat_completion_from_base64.choices[0].message.content
diff --git a/examples/openai_example_batch.jsonl b/examples/openai_example_batch.jsonl
index 5aa7e185c180a..54ac8c813ddb7 100644
--- a/examples/openai_example_batch.jsonl
+++ b/examples/openai_example_batch.jsonl
@@ -1,2 +1,2 @@
-{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
-{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
+{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
+{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
diff --git a/requirements-common.txt b/requirements-common.txt
index d72cc44762720..ef5ed8b645158 100644
--- a/requirements-common.txt
+++ b/requirements-common.txt
@@ -10,7 +10,7 @@ protobuf # Required by LlamaTokenizer.
fastapi >= 0.107.0, < 0.113.0; python_version < '3.9'
fastapi >= 0.107.0, != 0.113.*, != 0.114.0; python_version >= '3.9'
aiohttp
-openai >= 1.40.0 # Ensure modern openai package (ensure types module present)
+openai >= 1.45.0 # Ensure modern openai package (ensure types module present and max_completion_tokens field support)
uvicorn[standard]
pydantic >= 2.9 # Required for fastapi >= 0.113.0
pillow # Required for image processing
diff --git a/tests/entrypoints/openai/test_audio.py b/tests/entrypoints/openai/test_audio.py
index df8a140283fbb..a74109e2f5120 100644
--- a/tests/entrypoints/openai/test_audio.py
+++ b/tests/entrypoints/openai/test_audio.py
@@ -68,11 +68,12 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI,
}]
# test single completion
- chat_completion = await client.chat.completions.create(model=model_name,
- messages=messages,
- max_tokens=10,
- logprobs=True,
- top_logprobs=5)
+ chat_completion = await client.chat.completions.create(
+ model=model_name,
+ messages=messages,
+ max_completion_tokens=10,
+ logprobs=True,
+ top_logprobs=5)
assert len(chat_completion.choices) == 1
choice = chat_completion.choices[0]
@@ -91,7 +92,7 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI,
chat_completion = await client.chat.completions.create(
model=model_name,
messages=messages,
- max_tokens=10,
+ max_completion_tokens=10,
)
message = chat_completion.choices[0].message
assert message.content is not None and len(message.content) >= 0
@@ -123,11 +124,12 @@ async def test_single_chat_session_audio_base64encoded(
}]
# test single completion
- chat_completion = await client.chat.completions.create(model=model_name,
- messages=messages,
- max_tokens=10,
- logprobs=True,
- top_logprobs=5)
+ chat_completion = await client.chat.completions.create(
+ model=model_name,
+ messages=messages,
+ max_completion_tokens=10,
+ logprobs=True,
+ top_logprobs=5)
assert len(chat_completion.choices) == 1
choice = chat_completion.choices[0]
@@ -146,7 +148,7 @@ async def test_single_chat_session_audio_base64encoded(
chat_completion = await client.chat.completions.create(
model=model_name,
messages=messages,
- max_tokens=10,
+ max_completion_tokens=10,
)
message = chat_completion.choices[0].message
assert message.content is not None and len(message.content) >= 0
@@ -178,7 +180,7 @@ async def test_chat_streaming_audio(client: openai.AsyncOpenAI,
chat_completion = await client.chat.completions.create(
model=model_name,
messages=messages,
- max_tokens=10,
+ max_completion_tokens=10,
temperature=0.0,
)
output = chat_completion.choices[0].message.content
@@ -188,7 +190,7 @@ async def test_chat_streaming_audio(client: openai.AsyncOpenAI,
stream = await client.chat.completions.create(
model=model_name,
messages=messages,
- max_tokens=10,
+ max_completion_tokens=10,
temperature=0.0,
stream=True,
)
@@ -242,7 +244,7 @@ async def test_multi_audio_input(client: openai.AsyncOpenAI, model_name: str,
await client.chat.completions.create(
model=model_name,
messages=messages,
- max_tokens=10,
+ max_completion_tokens=10,
temperature=0.0,
)
diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py
index d1aebbd70d256..8d13f64dce01c 100644
--- a/tests/entrypoints/openai/test_chat.py
+++ b/tests/entrypoints/openai/test_chat.py
@@ -65,11 +65,12 @@ async def test_no_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
"content": "what is 1+1?"
}]
- chat_completion = await client.chat.completions.create(model=model_name,
- messages=messages,
- max_tokens=5,
- temperature=0.0,
- logprobs=False)
+ chat_completion = await client.chat.completions.create(
+ model=model_name,
+ messages=messages,
+ max_completion_tokens=5,
+ temperature=0.0,
+ logprobs=False)
choice = chat_completion.choices[0]
assert choice.logprobs is None
@@ -90,12 +91,13 @@ async def test_zero_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
"content": "what is 1+1?"
}]
- chat_completion = await client.chat.completions.create(model=model_name,
- messages=messages,
- max_tokens=5,
- temperature=0.0,
- logprobs=True,
- top_logprobs=0)
+ chat_completion = await client.chat.completions.create(
+ model=model_name,
+ messages=messages,
+ max_completion_tokens=5,
+ temperature=0.0,
+ logprobs=True,
+ top_logprobs=0)
choice = chat_completion.choices[0]
assert choice.logprobs is not None
@@ -117,12 +119,13 @@ async def test_some_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
"content": "what is 1+1?"
}]
- chat_completion = await client.chat.completions.create(model=model_name,
- messages=messages,
- max_tokens=5,
- temperature=0.0,
- logprobs=True,
- top_logprobs=5)
+ chat_completion = await client.chat.completions.create(
+ model=model_name,
+ messages=messages,
+ max_completion_tokens=5,
+ temperature=0.0,
+ logprobs=True,
+ top_logprobs=5)
choice = chat_completion.choices[0]
assert choice.logprobs is not None
@@ -149,7 +152,7 @@ async def test_too_many_chat_logprobs(client: openai.AsyncOpenAI,
with pytest.raises((openai.BadRequestError, openai.APIError)):
stream = await client.chat.completions.create(model=model_name,
messages=messages,
- max_tokens=10,
+ max_completion_tokens=10,
logprobs=True,
top_logprobs=21,
stream=True)
@@ -159,16 +162,17 @@ async def test_too_many_chat_logprobs(client: openai.AsyncOpenAI,
with pytest.raises(openai.BadRequestError):
await client.chat.completions.create(model=model_name,
messages=messages,
- max_tokens=10,
+ max_completion_tokens=10,
logprobs=True,
top_logprobs=30,
stream=False)
# the server should still work afterwards
- chat_completion = await client.chat.completions.create(model=model_name,
- messages=messages,
- max_tokens=10,
- stream=False)
+ chat_completion = await client.chat.completions.create(
+ model=model_name,
+ messages=messages,
+ max_completion_tokens=10,
+ stream=False)
message = chat_completion.choices[0].message
assert message.content is not None and len(message.content) >= 0
@@ -271,11 +275,12 @@ async def test_single_chat_session(client: openai.AsyncOpenAI,
}]
# test single completion
- chat_completion = await client.chat.completions.create(model=model_name,
- messages=messages,
- max_tokens=10,
- logprobs=True,
- top_logprobs=5)
+ chat_completion = await client.chat.completions.create(
+ model=model_name,
+ messages=messages,
+ max_completion_tokens=10,
+ logprobs=True,
+ top_logprobs=5)
assert chat_completion.id is not None
assert len(chat_completion.choices) == 1
@@ -294,7 +299,7 @@ async def test_single_chat_session(client: openai.AsyncOpenAI,
chat_completion = await client.chat.completions.create(
model=model_name,
messages=messages,
- max_tokens=10,
+ max_completion_tokens=10,
)
message = chat_completion.choices[0].message
assert message.content is not None and len(message.content) >= 0
@@ -319,7 +324,7 @@ async def test_chat_streaming(client: openai.AsyncOpenAI, model_name: str):
chat_completion = await client.chat.completions.create(
model=model_name,
messages=messages,
- max_tokens=10,
+ max_completion_tokens=10,
temperature=0.0,
)
output = chat_completion.choices[0].message.content
@@ -329,7 +334,7 @@ async def test_chat_streaming(client: openai.AsyncOpenAI, model_name: str):
stream = await client.chat.completions.create(
model=model_name,
messages=messages,
- max_tokens=10,
+ max_completion_tokens=10,
temperature=0.0,
stream=True,
)
@@ -369,7 +374,7 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
stream = await client.chat.completions.create(
model=model_name,
messages=messages,
- max_tokens=10,
+ max_completion_tokens=10,
temperature=0.0,
stream=True,
stream_options={"include_usage": False})
@@ -380,7 +385,7 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
# "continuous_usage_stats": False}}
stream = await client.chat.completions.create(model=model_name,
messages=messages,
- max_tokens=10,
+ max_completion_tokens=10,
temperature=0.0,
stream=True,
stream_options={
@@ -409,7 +414,7 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
await client.chat.completions.create(
model=model_name,
messages=messages,
- max_tokens=10,
+ max_completion_tokens=10,
temperature=0.0,
stream=False,
stream_options={"include_usage": None})
@@ -419,7 +424,7 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
await client.chat.completions.create(
model=model_name,
messages=messages,
- max_tokens=10,
+ max_completion_tokens=10,
temperature=0.0,
stream=False,
stream_options={"include_usage": True})
@@ -429,7 +434,7 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
stream = await client.chat.completions.create(
model=model_name,
messages=messages,
- max_tokens=10,
+ max_completion_tokens=10,
extra_body=dict(min_tokens=10),
temperature=0.0,
stream=True,
@@ -476,7 +481,7 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI,
chat_completion = await client.chat.completions.create(
model=MODEL_NAME,
messages=messages,
- max_tokens=10,
+ max_completion_tokens=10,
extra_body=dict(guided_choice=sample_guided_choice,
guided_decoding_backend=guided_decoding_backend))
choice1 = chat_completion.choices[0].message.content
@@ -490,7 +495,7 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI,
chat_completion = await client.chat.completions.create(
model=MODEL_NAME,
messages=messages,
- max_tokens=10,
+ max_completion_tokens=10,
extra_body=dict(guided_choice=sample_guided_choice,
guided_decoding_backend=guided_decoding_backend))
choice2 = chat_completion.choices[0].message.content
@@ -517,7 +522,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI,
chat_completion = await client.chat.completions.create(
model=MODEL_NAME,
messages=messages,
- max_tokens=1000,
+ max_completion_tokens=1000,
extra_body=dict(guided_json=sample_json_schema,
guided_decoding_backend=guided_decoding_backend))
message = chat_completion.choices[0].message
@@ -535,7 +540,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI,
chat_completion = await client.chat.completions.create(
model=MODEL_NAME,
messages=messages,
- max_tokens=1000,
+ max_completion_tokens=1000,
extra_body=dict(guided_json=sample_json_schema,
guided_decoding_backend=guided_decoding_backend))
message = chat_completion.choices[0].message
@@ -563,7 +568,7 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI,
chat_completion = await client.chat.completions.create(
model=MODEL_NAME,
messages=messages,
- max_tokens=20,
+ max_completion_tokens=20,
extra_body=dict(guided_regex=sample_regex,
guided_decoding_backend=guided_decoding_backend))
ip1 = chat_completion.choices[0].message.content
@@ -575,7 +580,7 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI,
chat_completion = await client.chat.completions.create(
model=MODEL_NAME,
messages=messages,
- max_tokens=20,
+ max_completion_tokens=20,
extra_body=dict(guided_regex=sample_regex,
guided_decoding_backend=guided_decoding_backend))
ip2 = chat_completion.choices[0].message.content
@@ -623,7 +628,7 @@ async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
chat_completion = await client.chat.completions.create(
model=MODEL_NAME,
messages=messages,
- max_tokens=10,
+ max_completion_tokens=10,
logprobs=True,
top_logprobs=5,
extra_body=dict(guided_choice=sample_guided_choice,
@@ -660,7 +665,7 @@ async def test_named_tool_use(client: openai.AsyncOpenAI,
chat_completion = await client.chat.completions.create(
model=MODEL_NAME,
messages=messages,
- max_tokens=1000,
+ max_completion_tokens=1000,
tools=[{
"type": "function",
"function": {
@@ -694,7 +699,7 @@ async def test_named_tool_use(client: openai.AsyncOpenAI,
stream = await client.chat.completions.create(
model=MODEL_NAME,
messages=messages,
- max_tokens=1000,
+ max_completion_tokens=1000,
tools=[{
"type": "function",
"function": {
@@ -750,7 +755,7 @@ async def test_required_tool_use_not_yet_supported(
await client.chat.completions.create(
model=MODEL_NAME,
messages=messages,
- max_tokens=1000,
+ max_completion_tokens=1000,
tools=[{
"type": "function",
"function": {
@@ -765,7 +770,7 @@ async def test_required_tool_use_not_yet_supported(
await client.chat.completions.create(
model=MODEL_NAME,
messages=messages,
- max_tokens=1000,
+ max_completion_tokens=1000,
tools=[{
"type": "function",
"function": {
@@ -796,7 +801,7 @@ async def test_inconsistent_tool_choice_and_tools(client: openai.AsyncOpenAI,
with pytest.raises(openai.BadRequestError):
await client.chat.completions.create(model=MODEL_NAME,
messages=messages,
- max_tokens=1000,
+ max_completion_tokens=1000,
tool_choice={
"type": "function",
"function": {
@@ -809,7 +814,7 @@ async def test_inconsistent_tool_choice_and_tools(client: openai.AsyncOpenAI,
await client.chat.completions.create(
model=MODEL_NAME,
messages=messages,
- max_tokens=1000,
+ max_completion_tokens=1000,
tools=[{
"type": "function",
"function": {
diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py
index 68804d6833c73..157d873a75b4d 100644
--- a/tests/entrypoints/openai/test_vision.py
+++ b/tests/entrypoints/openai/test_vision.py
@@ -78,11 +78,12 @@ async def test_single_chat_session_image(client: openai.AsyncOpenAI,
}]
# test single completion
- chat_completion = await client.chat.completions.create(model=model_name,
- messages=messages,
- max_tokens=10,
- logprobs=True,
- top_logprobs=5)
+ chat_completion = await client.chat.completions.create(
+ model=model_name,
+ messages=messages,
+ max_completion_tokens=10,
+ logprobs=True,
+ top_logprobs=5)
assert len(chat_completion.choices) == 1
choice = chat_completion.choices[0]
@@ -101,7 +102,7 @@ async def test_single_chat_session_image(client: openai.AsyncOpenAI,
chat_completion = await client.chat.completions.create(
model=model_name,
messages=messages,
- max_tokens=10,
+ max_completion_tokens=10,
)
message = chat_completion.choices[0].message
assert message.content is not None and len(message.content) >= 0
@@ -134,7 +135,7 @@ async def test_single_chat_session_image_beamsearch(client: openai.AsyncOpenAI,
model=model_name,
messages=messages,
n=2,
- max_tokens=10,
+ max_completion_tokens=10,
logprobs=True,
top_logprobs=5,
extra_body=dict(use_beam_search=True))
@@ -169,11 +170,12 @@ async def test_single_chat_session_image_base64encoded(
}]
# test single completion
- chat_completion = await client.chat.completions.create(model=model_name,
- messages=messages,
- max_tokens=10,
- logprobs=True,
- top_logprobs=5)
+ chat_completion = await client.chat.completions.create(
+ model=model_name,
+ messages=messages,
+ max_completion_tokens=10,
+ logprobs=True,
+ top_logprobs=5)
assert len(chat_completion.choices) == 1
choice = chat_completion.choices[0]
@@ -192,7 +194,7 @@ async def test_single_chat_session_image_base64encoded(
chat_completion = await client.chat.completions.create(
model=model_name,
messages=messages,
- max_tokens=10,
+ max_completion_tokens=10,
)
message = chat_completion.choices[0].message
assert message.content is not None and len(message.content) >= 0
@@ -226,7 +228,7 @@ async def test_single_chat_session_image_base64encoded_beamsearch(
model=model_name,
messages=messages,
n=2,
- max_tokens=10,
+ max_completion_tokens=10,
extra_body=dict(use_beam_search=True))
assert len(chat_completion.choices) == 2
assert chat_completion.choices[
@@ -259,7 +261,7 @@ async def test_chat_streaming_image(client: openai.AsyncOpenAI,
chat_completion = await client.chat.completions.create(
model=model_name,
messages=messages,
- max_tokens=10,
+ max_completion_tokens=10,
temperature=0.0,
)
output = chat_completion.choices[0].message.content
@@ -269,7 +271,7 @@ async def test_chat_streaming_image(client: openai.AsyncOpenAI,
stream = await client.chat.completions.create(
model=model_name,
messages=messages,
- max_tokens=10,
+ max_completion_tokens=10,
temperature=0.0,
stream=True,
)
@@ -320,7 +322,7 @@ async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str,
await client.chat.completions.create(
model=model_name,
messages=messages,
- max_tokens=10,
+ max_completion_tokens=10,
temperature=0.0,
)
@@ -337,7 +339,7 @@ async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str,
chat_completion = await client.chat.completions.create(
model=model_name,
messages=messages,
- max_tokens=10,
+ max_completion_tokens=10,
temperature=0.0,
)
message = chat_completion.choices[0].message
diff --git a/tests/tool_use/test_chat_completions.py b/tests/tool_use/test_chat_completions.py
index 8e7cb9f5d3d90..75bbfbb766931 100644
--- a/tests/tool_use/test_chat_completions.py
+++ b/tests/tool_use/test_chat_completions.py
@@ -18,7 +18,7 @@ async def test_chat_completion_without_tools(client: openai.AsyncOpenAI,
chat_completion = await client.chat.completions.create(
messages=ensure_system_prompt(MESSAGES_WITHOUT_TOOLS, server_config),
temperature=0,
- max_tokens=150,
+ max_completion_tokens=150,
model=model_name,
logprobs=False)
choice = chat_completion.choices[0]
@@ -38,7 +38,7 @@ async def test_chat_completion_without_tools(client: openai.AsyncOpenAI,
stream = await client.chat.completions.create(
messages=ensure_system_prompt(MESSAGES_WITHOUT_TOOLS, server_config),
temperature=0,
- max_tokens=150,
+ max_completion_tokens=150,
model=model_name,
logprobs=False,
stream=True,
@@ -86,7 +86,7 @@ async def test_chat_completion_with_tools(client: openai.AsyncOpenAI,
chat_completion = await client.chat.completions.create(
messages=ensure_system_prompt(MESSAGES_WITHOUT_TOOLS, server_config),
temperature=0,
- max_tokens=150,
+ max_completion_tokens=150,
model=model_name,
tools=[WEATHER_TOOL],
logprobs=False)
@@ -107,7 +107,7 @@ async def test_chat_completion_with_tools(client: openai.AsyncOpenAI,
stream = await client.chat.completions.create(
messages=ensure_system_prompt(MESSAGES_WITHOUT_TOOLS, server_config),
temperature=0,
- max_tokens=150,
+ max_completion_tokens=150,
model=model_name,
logprobs=False,
tools=[WEATHER_TOOL],
diff --git a/tests/tool_use/test_parallel_tool_calls.py b/tests/tool_use/test_parallel_tool_calls.py
index cff3c8a556ca4..c294cb04919fa 100644
--- a/tests/tool_use/test_parallel_tool_calls.py
+++ b/tests/tool_use/test_parallel_tool_calls.py
@@ -26,7 +26,7 @@ async def test_parallel_tool_calls(client: openai.AsyncOpenAI,
chat_completion = await client.chat.completions.create(
messages=MESSAGES_ASKING_FOR_PARALLEL_TOOLS,
temperature=0,
- max_tokens=200,
+ max_completion_tokens=200,
model=model_name,
tools=[WEATHER_TOOL, SEARCH_TOOL],
logprobs=False)
@@ -63,7 +63,7 @@ async def test_parallel_tool_calls(client: openai.AsyncOpenAI,
model=model_name,
messages=MESSAGES_ASKING_FOR_PARALLEL_TOOLS,
temperature=0,
- max_tokens=200,
+ max_completion_tokens=200,
tools=[WEATHER_TOOL, SEARCH_TOOL],
logprobs=False,
stream=True)
@@ -154,7 +154,7 @@ async def test_parallel_tool_calls_with_results(client: openai.AsyncOpenAI,
chat_completion = await client.chat.completions.create(
messages=MESSAGES_WITH_PARALLEL_TOOL_RESPONSE,
temperature=0,
- max_tokens=200,
+ max_completion_tokens=200,
model=model_name,
tools=[WEATHER_TOOL, SEARCH_TOOL],
logprobs=False)
@@ -172,7 +172,7 @@ async def test_parallel_tool_calls_with_results(client: openai.AsyncOpenAI,
stream = await client.chat.completions.create(
messages=MESSAGES_WITH_PARALLEL_TOOL_RESPONSE,
temperature=0,
- max_tokens=200,
+ max_completion_tokens=200,
model=model_name,
tools=[WEATHER_TOOL, SEARCH_TOOL],
logprobs=False,
diff --git a/tests/tool_use/test_tool_calls.py b/tests/tool_use/test_tool_calls.py
index 9e6d715f44fcf..fe8cb496c9741 100644
--- a/tests/tool_use/test_tool_calls.py
+++ b/tests/tool_use/test_tool_calls.py
@@ -17,7 +17,7 @@ async def test_tool_call_and_choice(client: openai.AsyncOpenAI):
chat_completion = await client.chat.completions.create(
messages=MESSAGES_ASKING_FOR_TOOLS,
temperature=0,
- max_tokens=100,
+ max_completion_tokens=100,
model=model_name,
tools=[WEATHER_TOOL, SEARCH_TOOL],
logprobs=False)
@@ -61,7 +61,7 @@ async def test_tool_call_and_choice(client: openai.AsyncOpenAI):
model=model_name,
messages=MESSAGES_ASKING_FOR_TOOLS,
temperature=0,
- max_tokens=100,
+ max_completion_tokens=100,
tools=[WEATHER_TOOL, SEARCH_TOOL],
logprobs=False,
stream=True)
@@ -142,7 +142,7 @@ async def test_tool_call_with_results(client: openai.AsyncOpenAI):
chat_completion = await client.chat.completions.create(
messages=MESSAGES_WITH_TOOL_RESPONSE,
temperature=0,
- max_tokens=100,
+ max_completion_tokens=100,
model=model_name,
tools=[WEATHER_TOOL, SEARCH_TOOL],
logprobs=False)
@@ -159,7 +159,7 @@ async def test_tool_call_with_results(client: openai.AsyncOpenAI):
stream = await client.chat.completions.create(
messages=MESSAGES_WITH_TOOL_RESPONSE,
temperature=0,
- max_tokens=100,
+ max_completion_tokens=100,
model=model_name,
tools=[WEATHER_TOOL, SEARCH_TOOL],
logprobs=False,
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index 7f270a81a7692..60fc5ac8d11d2 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -159,7 +159,12 @@ class ChatCompletionRequest(OpenAIBaseModel):
logit_bias: Optional[Dict[str, float]] = None
logprobs: Optional[bool] = False
top_logprobs: Optional[int] = 0
- max_tokens: Optional[int] = None
+ # TODO(#9845): remove max_tokens when field is removed from OpenAI API
+ max_tokens: Optional[int] = Field(
+ default=None,
+ deprecated=
+ 'max_tokens is deprecated in favor of the max_completion_tokens field')
+ max_completion_tokens: Optional[int] = None
n: Optional[int] = 1
presence_penalty: Optional[float] = 0.0
response_format: Optional[ResponseFormat] = None
@@ -295,7 +300,8 @@ class ChatCompletionRequest(OpenAIBaseModel):
def to_beam_search_params(self,
default_max_tokens: int) -> BeamSearchParams:
- max_tokens = self.max_tokens
+ # TODO(#9845): remove max_tokens when field is removed from OpenAI API
+ max_tokens = self.max_completion_tokens or self.max_tokens
if max_tokens is None:
max_tokens = default_max_tokens
@@ -311,7 +317,8 @@ def to_beam_search_params(self,
include_stop_str_in_output=self.include_stop_str_in_output)
def to_sampling_params(self, default_max_tokens: int) -> SamplingParams:
- max_tokens = self.max_tokens
+ # TODO(#9845): remove max_tokens when field is removed from OpenAI API
+ max_tokens = self.max_completion_tokens or self.max_tokens
if max_tokens is None:
max_tokens = default_max_tokens
diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py
index e6d2ab93d3363..22a01b3dc4cc0 100644
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -263,20 +263,26 @@ def _validate_input(
return TextTokensPrompt(prompt=input_text,
prompt_token_ids=input_ids)
- if request.max_tokens is None:
+ # chat completion endpoint supports max_completion_tokens
+ if isinstance(request, ChatCompletionRequest):
+ # TODO(#9845): remove max_tokens when field dropped from OpenAI API
+ max_tokens = request.max_completion_tokens or request.max_tokens
+ else:
+ max_tokens = request.max_tokens
+ if max_tokens is None:
if token_num >= self.max_model_len:
raise ValueError(
f"This model's maximum context length is "
f"{self.max_model_len} tokens. However, you requested "
f"{token_num} tokens in the messages, "
f"Please reduce the length of the messages.")
- elif token_num + request.max_tokens > self.max_model_len:
+ elif token_num + max_tokens > self.max_model_len:
raise ValueError(
f"This model's maximum context length is "
f"{self.max_model_len} tokens. However, you requested "
- f"{request.max_tokens + token_num} tokens "
+ f"{max_tokens + token_num} tokens "
f"({token_num} in the messages, "
- f"{request.max_tokens} in the completion). "
+ f"{max_tokens} in the completion). "
f"Please reduce the length of the messages or completion.")
return TextTokensPrompt(prompt=input_text, prompt_token_ids=input_ids)
From 890ca3607208a10514e65cfdf182bdd4125baef6 Mon Sep 17 00:00:00 2001
From: "Kevin H. Luu"
Date: Wed, 30 Oct 2024 15:44:51 -1000
Subject: [PATCH 061/113] Revert "[Bugfix] Use host argument to bind to
interface (#9798)" (#9852)
---
vllm/entrypoints/openai/api_server.py | 2 +-
vllm/entrypoints/openai/cli_args.py | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 0e0ec311023eb..46c92e10b360c 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -544,7 +544,7 @@ async def run_server(args, **uvicorn_kwargs) -> None:
# This avoids race conditions with ray.
# see https://github.com/vllm-project/vllm/issues/8204
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
- sock.bind((args.host, args.port))
+ sock.bind(("", args.port))
def signal_handler(*_) -> None:
# Interrupt server on sigterm while initializing
diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py
index f4dd9df9587ce..a089985ac9758 100644
--- a/vllm/entrypoints/openai/cli_args.py
+++ b/vllm/entrypoints/openai/cli_args.py
@@ -77,7 +77,7 @@ def __call__(
def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
parser.add_argument("--host",
type=nullable_str,
- default="0.0.0.0",
+ default=None,
help="host name")
parser.add_argument("--port", type=int, default=8000, help="port number")
parser.add_argument(
From d087bf863e0d228c8b5aaae6535de15c5817eb7b Mon Sep 17 00:00:00 2001
From: Michael Goin
Date: Thu, 31 Oct 2024 01:41:20 -0400
Subject: [PATCH 062/113] [Model] Support quantization of
Qwen2VisionTransformer (#9817)
Signed-off-by: mgoin
---
vllm/model_executor/models/qwen2_vl.py | 58 ++++++++++++++++----------
1 file changed, 35 insertions(+), 23 deletions(-)
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index 633d66b4af31a..1e12c2332b65e 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -126,15 +126,18 @@ def __init__(
hidden_features: int = None,
act_layer: Type[nn.Module] = QuickGELU,
quant_config: Optional[QuantizationConfig] = None,
+ prefix: str = "",
):
super().__init__()
self.fc1 = ColumnParallelLinear(in_features,
hidden_features,
- quant_config=quant_config)
+ quant_config=quant_config,
+ prefix=f"{prefix}.fc1")
self.act = act_layer()
self.fc2 = RowParallelLinear(hidden_features,
in_features,
- quant_config=quant_config)
+ quant_config=quant_config,
+ prefix=f"{prefix}.fc2")
def forward(self, x: torch.Tensor) -> torch.Tensor:
x_parallel, _ = self.fc1(x)
@@ -196,6 +199,7 @@ def __init__(
num_heads: Optional[int] = None,
projection_size: Optional[int] = None,
quant_config: Optional[QuantizationConfig] = None,
+ prefix: str = "",
) -> None:
super().__init__()
# Per attention head and per partition values.
@@ -207,10 +211,12 @@ def __init__(
self.qkv = ColumnParallelLinear(input_size=embed_dim,
output_size=3 * projection_size,
- quant_config=quant_config)
+ quant_config=quant_config,
+ prefix=f"{prefix}.qkv")
self.proj = RowParallelLinear(input_size=projection_size,
output_size=embed_dim,
- quant_config=quant_config)
+ quant_config=quant_config,
+ prefix=f"{prefix}.proj")
# Detect attention implementation.
self.attn_backend: _Backend = get_vit_attn_backend()
@@ -310,6 +316,7 @@ def __init__(
act_layer: Type[nn.Module] = QuickGELU,
norm_layer: Type[nn.Module] = None,
quant_config: Optional[QuantizationConfig] = None,
+ prefix: str = "",
) -> None:
super().__init__()
if norm_layer is None:
@@ -321,11 +328,13 @@ def __init__(
self.attn = Qwen2VisionAttention(embed_dim=dim,
num_heads=num_heads,
projection_size=dim,
- quant_config=quant_config)
+ quant_config=quant_config,
+ prefix=f"{prefix}.attn")
self.mlp = Qwen2VisionMLP(dim,
mlp_hidden_dim,
act_layer=act_layer,
- quant_config=quant_config)
+ quant_config=quant_config,
+ prefix=f"{prefix}.mlp")
def forward(self, x: torch.Tensor, cu_seqlens: torch.Tensor,
rotary_pos_emb: torch.Tensor) -> torch.Tensor:
@@ -374,6 +383,7 @@ def __init__(
norm_layer: Type[nn.Module] = None,
spatial_merge_size: int = 2,
quant_config: Optional[QuantizationConfig] = None,
+ prefix: str = "",
) -> None:
super().__init__()
self.hidden_size = context_dim * (spatial_merge_size**2)
@@ -384,12 +394,14 @@ def __init__(
ColumnParallelLinear(self.hidden_size,
self.hidden_size,
bias=True,
- quant_config=quant_config),
+ quant_config=quant_config,
+ prefix=f"{prefix}.mlp.0"),
nn.GELU(),
RowParallelLinear(self.hidden_size,
d_model,
bias=True,
- quant_config=quant_config),
+ quant_config=quant_config,
+ prefix=f"{prefix}.mlp.2"),
])
def forward(self, x: torch.Tensor) -> torch.Tensor:
@@ -440,6 +452,7 @@ def __init__(
vision_config: Qwen2VLVisionConfig,
norm_eps: float = 1e-6,
quant_config: Optional[QuantizationConfig] = None,
+ prefix: str = "",
) -> None:
super().__init__()
@@ -467,28 +480,29 @@ def __init__(
self.rotary_pos_emb = Qwen2VisionRotaryEmbedding(head_dim // 2)
self.blocks = nn.ModuleList([
- Qwen2VisionBlock(
- dim=embed_dim,
- num_heads=num_heads,
- mlp_ratio=mlp_ratio,
- norm_layer=norm_layer,
- quant_config=quant_config,
- ) for _ in range(depth)
+ Qwen2VisionBlock(dim=embed_dim,
+ num_heads=num_heads,
+ mlp_ratio=mlp_ratio,
+ norm_layer=norm_layer,
+ quant_config=quant_config,
+ prefix=f"{prefix}.blocks.{layer_idx}")
+ for layer_idx in range(depth)
])
self.merger = Qwen2VisionPatchMerger(
d_model=hidden_size,
context_dim=embed_dim,
norm_layer=norm_layer,
quant_config=quant_config,
+ prefix=f"{prefix}.merger",
)
@property
def dtype(self) -> torch.dtype:
- return self.blocks[0].mlp.fc2.weight.dtype
+ return self.patch_embed.proj.weight.dtype
@property
def device(self) -> torch.device:
- return self.blocks[0].mlp.fc2.weight.device
+ return self.patch_embed.proj.weight.device
def rot_pos_emb(self, grid_thw: torch.Tensor) -> torch.Tensor:
pos_ids = []
@@ -932,10 +946,8 @@ def __init__(self,
self.visual = Qwen2VisionTransformer(
config.vision_config,
norm_eps=getattr(config, "rms_norm_eps", 1e-6),
-
- # NOTE: Qwen2-VL vision encoder does not support any
- # quantization method now.
- quant_config=None,
+ quant_config=quant_config,
+ prefix="visual",
)
self.model = Qwen2Model(config,
@@ -1175,7 +1187,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
weight_loader(param, loaded_weight, shard_id)
break
else:
- if "visual" in name and "qkv.weight" in name:
+ if "visual" in name and name.endswith("qkv.weight"):
visual_num_heads = self.config.vision_config.num_heads
visual_embed_dim = self.config.vision_config.embed_dim
head_size = visual_embed_dim // visual_num_heads
@@ -1184,7 +1196,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
visual_embed_dim)
loaded_weight = loaded_weight.transpose(0, 1)
loaded_weight = loaded_weight.reshape(-1, visual_embed_dim)
- elif "visual" in name and "qkv.bias" in name:
+ elif "visual" in name and name.endswith("qkv.bias"):
visual_num_heads = self.config.vision_config.num_heads
visual_embed_dim = self.config.vision_config.embed_dim
head_size = visual_embed_dim // visual_num_heads
From 3ea2dc2ec49d1ddd7875045e2397ae76a8f50b38 Mon Sep 17 00:00:00 2001
From: Roger Wang <136131678+ywang96@users.noreply.github.com>
Date: Thu, 31 Oct 2024 00:22:07 -0700
Subject: [PATCH 063/113] [Misc] Remove deprecated arg for cuda graph capture
(#9864)
Signed-off-by: Roger Wang
---
vllm/config.py | 7 -------
vllm/engine/arg_utils.py | 10 ----------
vllm/entrypoints/llm.py | 5 -----
vllm/worker/model_runner.py | 2 +-
4 files changed, 1 insertion(+), 23 deletions(-)
diff --git a/vllm/config.py b/vllm/config.py
index e9559c40dbdfb..c2a8c956b374a 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -84,9 +84,6 @@ class ModelConfig:
disable CUDA graph and always execute the model in eager mode.
If False, we will use CUDA graph and eager execution in hybrid.
If None, the user did not specify, so default to False.
- max_context_len_to_capture: Maximum context len covered by CUDA graphs.
- When a sequence has context length larger than this, we fall back
- to eager mode (DEPRECATED. Use max_seq_len_to_capture instead).
max_seq_len_to_capture: Maximum sequence len covered by CUDA graphs.
When a sequence has context length larger than this, we fall back
to eager mode. Additionally for encoder-decoder models, if the
@@ -147,7 +144,6 @@ def __init__(
quantization: Optional[str] = None,
quantization_param_path: Optional[str] = None,
enforce_eager: Optional[bool] = None,
- max_context_len_to_capture: Optional[int] = None,
max_seq_len_to_capture: Optional[int] = None,
max_logprobs: int = 20,
disable_sliding_window: bool = False,
@@ -181,9 +177,6 @@ def __init__(
self.quantization = quantization
self.quantization_param_path = quantization_param_path
self.enforce_eager = enforce_eager
- if max_context_len_to_capture is not None:
- raise ValueError("`max_context_len_to_capture` is deprecated. "
- "Use `max_seq_len_to_capture` instead.")
self.max_seq_len_to_capture = max_seq_len_to_capture
self.max_logprobs = max_logprobs
self.disable_sliding_window = disable_sliding_window
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index de886c98e51bd..b1f0f8b9df925 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -126,7 +126,6 @@ class EngineArgs:
tokenizer_revision: Optional[str] = None
quantization: Optional[str] = None
enforce_eager: Optional[bool] = None
- max_context_len_to_capture: Optional[int] = None
max_seq_len_to_capture: int = 8192
disable_custom_all_reduce: bool = False
tokenizer_pool_size: int = 0
@@ -504,14 +503,6 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
help='Always use eager-mode PyTorch. If False, '
'will use eager mode and CUDA graph in hybrid '
'for maximal performance and flexibility.')
- parser.add_argument('--max-context-len-to-capture',
- type=int,
- default=EngineArgs.max_context_len_to_capture,
- help='Maximum context length covered by CUDA '
- 'graphs. When a sequence has context length '
- 'larger than this, we fall back to eager mode. '
- '(DEPRECATED. Use --max-seq-len-to-capture instead'
- ')')
parser.add_argument('--max-seq-len-to-capture',
type=int,
default=EngineArgs.max_seq_len_to_capture,
@@ -939,7 +930,6 @@ def create_model_config(self) -> ModelConfig:
quantization=self.quantization,
quantization_param_path=self.quantization_param_path,
enforce_eager=self.enforce_eager,
- max_context_len_to_capture=self.max_context_len_to_capture,
max_seq_len_to_capture=self.max_seq_len_to_capture,
max_logprobs=self.max_logprobs,
disable_sliding_window=self.disable_sliding_window,
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 083b67c2f8e7d..3d62cb3598477 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -93,9 +93,6 @@ class LLM:
enforce_eager: Whether to enforce eager execution. If True, we will
disable CUDA graph and always execute the model in eager mode.
If False, we will use CUDA graph and eager execution in hybrid.
- max_context_len_to_capture: Maximum context len covered by CUDA graphs.
- When a sequence has context length larger than this, we fall back
- to eager mode (DEPRECATED. Use `max_seq_len_to_capture` instead).
max_seq_len_to_capture: Maximum sequence len covered by CUDA graphs.
When a sequence has context length larger than this, we fall back
to eager mode. Additionally for encoder-decoder models, if the
@@ -152,7 +149,6 @@ def __init__(
swap_space: float = 4,
cpu_offload_gb: float = 0,
enforce_eager: Optional[bool] = None,
- max_context_len_to_capture: Optional[int] = None,
max_seq_len_to_capture: int = 8192,
disable_custom_all_reduce: bool = False,
disable_async_output_proc: bool = False,
@@ -193,7 +189,6 @@ def __init__(
swap_space=swap_space,
cpu_offload_gb=cpu_offload_gb,
enforce_eager=enforce_eager,
- max_context_len_to_capture=max_context_len_to_capture,
max_seq_len_to_capture=max_seq_len_to_capture,
disable_custom_all_reduce=disable_custom_all_reduce,
disable_async_output_proc=disable_async_output_proc,
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 233a9e664d845..891637dafbb14 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -995,7 +995,7 @@ def __init__(
# Python can be expensive. To optimize this, we cache the block table
# in numpy and only copy the actual input content at every iteration.
# The shape of the cached block table will be
- # (max batch size to capture, max context len to capture / block size).
+ # (max batch size to capture, max seq len to capture / block size).
self.graph_block_tables = np.zeros(
(self.max_batchsize_to_capture, self.get_max_block_per_batch()),
dtype=np.int32)
From 5608e611c2116cc17c6808b2ae1ecb4a3e263493 Mon Sep 17 00:00:00 2001
From: Jee Jee Li
Date: Thu, 31 Oct 2024 16:54:18 +0800
Subject: [PATCH 064/113] [Doc] Update Qwen documentation (#9869)
---
docs/source/models/supported_models.rst | 7 +++++--
vllm/model_executor/models/qwen.py | 2 +-
2 files changed, 6 insertions(+), 3 deletions(-)
diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst
index ff893b613f150..3279e7a108232 100644
--- a/docs/source/models/supported_models.rst
+++ b/docs/source/models/supported_models.rst
@@ -277,7 +277,7 @@ Text Generation
* - :code:`QWenLMHeadModel`
- Qwen
- :code:`Qwen/Qwen-7B`, :code:`Qwen/Qwen-7B-Chat`, etc.
- -
+ - ✅︎
- ✅︎
* - :code:`Qwen2ForCausalLM`
- Qwen2
@@ -516,7 +516,7 @@ Text Generation
- Qwen-VL
- T + I\ :sup:`E+`
- :code:`Qwen/Qwen-VL`, :code:`Qwen/Qwen-VL-Chat`, etc.
- -
+ - ✅︎
- ✅︎
* - :code:`Qwen2AudioForConditionalGeneration`
- Qwen2-Audio
@@ -540,6 +540,9 @@ Text Generation
| :sup:`E` Pre-computed embeddings can be inputted for this modality.
| :sup:`+` Multiple items can be inputted per text prompt for this modality.
+.. note::
+ vLLM currently only supports adding LoRA to the language backbone of multimodal models.
+
.. note::
For :code:`openbmb/MiniCPM-V-2`, the official repo doesn't work yet, so we need to use a fork (:code:`HwwwH/MiniCPM-V-2`) for now.
For more details, please see: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630
diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py
index 0a1b40927e9f9..998016ea28c26 100644
--- a/vllm/model_executor/models/qwen.py
+++ b/vllm/model_executor/models/qwen.py
@@ -1048,7 +1048,7 @@ def get_mm_mapping(self) -> MultiModelKeys:
@MULTIMODAL_REGISTRY.register_max_image_tokens(MAX_QWEN_IMG_TOKENS)
@INPUT_REGISTRY.register_dummy_data(dummy_data_for_qwen)
@INPUT_REGISTRY.register_input_processor(input_processor_for_qwen)
-class QWenLMHeadModel(QWenBaseModel):
+class QWenLMHeadModel(QWenBaseModel, SupportsLoRA):
"""
QWenLMHeadModel is not only applicable to LLM but also to VL, which is not
conducive to the current integration logic of LoRA in vLLM. Therefore, it
From 16b8f7a86f5a93d2b0dc4bd20709a47d34918b8f Mon Sep 17 00:00:00 2001
From: Alex Brooks
Date: Thu, 31 Oct 2024 10:10:52 -0600
Subject: [PATCH 065/113] [CI/Build] Add Model Tests for Qwen2-VL (#9846)
Signed-off-by: Alex-Brooks
Co-authored-by: Cyrus Leung
Co-authored-by: DarkLight1337
---
.buildkite/test-pipeline.yaml | 17 ++-
examples/offline_inference_vision_language.py | 3 +-
.../audio_language/test_ultravox.py | 2 +
.../mm_processor_kwargs/test_qwen2_vl.py | 2 +-
.../vision_language/test_models.py | 101 +++++++++++-------
.../vision_language/vlm_utils/model_utils.py | 11 ++
.../vision_language/vlm_utils/runners.py | 11 +-
.../vision_language/vlm_utils/types.py | 6 +-
.../vision_language/test_llava_next.py | 5 +-
9 files changed, 106 insertions(+), 52 deletions(-)
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 32eed1a771718..9444dc43ea97e 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -9,6 +9,7 @@
# label(str): the name of the test. emoji allowed.
# fast_check(bool): whether to run this on each commit on fastcheck pipeline.
# fast_check_only(bool): run this test on fastcheck pipeline only
+# nightly(bool): run this test in nightly pipeline only
# optional(bool): never run this test by default (i.e. need to unblock manually)
# command(str): the single command to run for tests. incompatible with commands.
# commands(list): the list of commands to run for test. incompatbile with command.
@@ -330,18 +331,28 @@ steps:
commands:
- pytest -v -s models/decoder_only/language --ignore=models/decoder_only/language/test_models.py --ignore=models/decoder_only/language/test_big_models.py
-- label: Decoder-only Multi-Modal Models Test # 1h31min
+- label: Decoder-only Multi-Modal Models Test (Standard)
#mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
- tests/models/decoder_only/audio_language
- tests/models/decoder_only/vision_language
commands:
- - pytest -v -s models/decoder_only/audio_language
+ - pytest -v -s models/decoder_only/audio_language -m core_model
+ - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m core_model
+
+- label: Decoder-only Multi-Modal Models Test (Extended)
+ nightly: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/decoder_only/audio_language
+ - tests/models/decoder_only/vision_language
+ commands:
+ - pytest -v -s models/decoder_only/audio_language -m 'not core_model'
# HACK - run phi3v tests separately to sidestep this transformers bug
# https://github.com/huggingface/transformers/issues/34307
- pytest -v -s models/decoder_only/vision_language/test_phi3v.py
- - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language
+ - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model'
- label: Other Models Test # 6min
#mirror_hardwares: [amd]
diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py
index 83d2548a506e4..60cdb186331fe 100644
--- a/examples/offline_inference_vision_language.py
+++ b/examples/offline_inference_vision_language.py
@@ -262,10 +262,9 @@ def run_qwen2_vl(question: str, modality: str):
model_name = "Qwen/Qwen2-VL-7B-Instruct"
- # Tested on L40
llm = LLM(
model=model_name,
- max_model_len=8192,
+ max_model_len=4096,
max_num_seqs=5,
# Note - mm_processor_kwargs can also be passed to generate/chat calls
mm_processor_kwargs={
diff --git a/tests/models/decoder_only/audio_language/test_ultravox.py b/tests/models/decoder_only/audio_language/test_ultravox.py
index ad6c2d854d1f0..b9089e75ffab8 100644
--- a/tests/models/decoder_only/audio_language/test_ultravox.py
+++ b/tests/models/decoder_only/audio_language/test_ultravox.py
@@ -158,6 +158,7 @@ def run_multi_audio_test(
assert all(tokens for tokens, *_ in vllm_outputs)
+@pytest.mark.core_model
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
@@ -178,6 +179,7 @@ def test_models(hf_runner, vllm_runner, audio, dtype: str, max_tokens: int,
)
+@pytest.mark.core_model
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py
index 5c90e7f7a267c..c23fbedf0c6ae 100644
--- a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py
+++ b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py
@@ -17,7 +17,7 @@
# Fixtures lazy import to avoid initializing CUDA during test collection
-# NOTE: Qwen2vl supports multiple input modalities, so it registers multiple
+# NOTE: Qwen2VL supports multiple input modalities, so it registers multiple
# input mappers.
@pytest.fixture()
def image_input_mapper_for_qwen2_vl():
diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py
index 9370527e3cd57..d738647c91b66 100644
--- a/tests/models/decoder_only/vision_language/test_models.py
+++ b/tests/models/decoder_only/vision_language/test_models.py
@@ -75,6 +75,63 @@
# this is a good idea for checking your command first, since tests are slow.
VLM_TEST_SETTINGS = {
+ #### Core tests to always run in the CI
+ "llava": VLMTestInfo(
+ models=["llava-hf/llava-1.5-7b-hf"],
+ test_type=(
+ VLMTestType.EMBEDDING,
+ VLMTestType.IMAGE,
+ VLMTestType.CUSTOM_INPUTS
+ ),
+ prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
+ convert_assets_to_embeddings=model_utils.get_llava_embeddings,
+ max_model_len=4096,
+ auto_cls=AutoModelForVision2Seq,
+ vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
+ custom_test_opts=[CustomTestOptions(
+ inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
+ formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:"
+ ),
+ limit_mm_per_prompt={"image": 4},
+ )],
+ marks=[pytest.mark.core_model],
+ ),
+ "paligemma": VLMTestInfo(
+ models=["google/paligemma-3b-mix-224"],
+ test_type=VLMTestType.IMAGE,
+ prompt_formatter=identity,
+ img_idx_to_prompt = lambda idx: "",
+ # Paligemma uses its own sample prompts because the default one fails
+ single_image_prompts=IMAGE_ASSETS.prompts({
+ "stop_sign": "caption es",
+ "cherry_blossom": "What is in the picture?",
+ }),
+ auto_cls=AutoModelForVision2Seq,
+ postprocess_inputs=model_utils.get_key_type_post_processor(
+ "pixel_values"
+ ),
+ vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output,
+ dtype="half" if current_platform.is_rocm() else ("half", "float"),
+ marks=[pytest.mark.core_model],
+ ),
+ "qwen2_vl": VLMTestInfo(
+ models=["Qwen/Qwen2-VL-2B-Instruct"],
+ test_type=(
+ VLMTestType.IMAGE,
+ VLMTestType.MULTI_IMAGE,
+ VLMTestType.VIDEO
+ ),
+ prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
+ img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
+ video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501
+ max_model_len=4096,
+ max_num_seqs=2,
+ auto_cls=AutoModelForVision2Seq,
+ vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
+ marks=[pytest.mark.core_model],
+ image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
+ ),
+ #### Extended model tests
"blip2": VLMTestInfo(
models=["Salesforce/blip2-opt-2.7b"],
test_type=VLMTestType.IMAGE,
@@ -151,25 +208,6 @@
use_tokenizer_eos=True,
patch_hf_runner=model_utils.internvl_patch_hf_runner,
),
- "llava": VLMTestInfo(
- models=["llava-hf/llava-1.5-7b-hf"],
- test_type=(
- VLMTestType.EMBEDDING,
- VLMTestType.IMAGE,
- VLMTestType.CUSTOM_INPUTS
- ),
- prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
- convert_assets_to_embeddings=model_utils.get_llava_embeddings,
- max_model_len=4096,
- auto_cls=AutoModelForVision2Seq,
- vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
- custom_test_opts=[CustomTestOptions(
- inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
- formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:"
- ),
- limit_mm_per_prompt={"image": 4},
- )],
- ),
"llava_next": VLMTestInfo(
models=["llava-hf/llava-v1.6-mistral-7b-hf"],
test_type=(VLMTestType.IMAGE, VLMTestType.CUSTOM_INPUTS),
@@ -200,12 +238,12 @@
vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
# Llava-one-vision tests fixed sizes & the default size factors
image_sizes=[((1669, 2560), (2560, 1669), (183, 488), (488, 183))],
- runner_mm_key="videos",
custom_test_opts=[CustomTestOptions(
inputs=custom_inputs.multi_video_multi_aspect_ratio_inputs(
formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
),
limit_mm_per_prompt={"video": 4},
+ runner_mm_key="videos",
)],
),
# FIXME
@@ -218,9 +256,11 @@
auto_cls=AutoModelForVision2Seq,
vllm_output_post_proc=model_utils.llava_video_vllm_to_hf_output,
image_sizes=[((1669, 2560), (2560, 1669), (183, 488), (488, 183))],
- runner_mm_key="videos",
marks=[
- pytest.mark.skip(reason="LLava next video tests currently fail.")
+ pytest.mark.skipif(
+ transformers.__version__.startswith("4.46"),
+ reason="Model broken with changes in transformers 4.46"
+ )
],
),
"minicpmv": VLMTestInfo(
@@ -234,23 +274,6 @@
postprocess_inputs=model_utils.wrap_inputs_post_processor,
hf_output_post_proc=model_utils.minicmpv_trunc_hf_output,
),
- "paligemma": VLMTestInfo(
- models=["google/paligemma-3b-mix-224"],
- test_type=VLMTestType.IMAGE,
- prompt_formatter=identity,
- img_idx_to_prompt = lambda idx: "",
- # Paligemma uses its own sample prompts because the default one fails
- single_image_prompts=IMAGE_ASSETS.prompts({
- "stop_sign": "caption es",
- "cherry_blossom": "What is in the picture?",
- }),
- auto_cls=AutoModelForVision2Seq,
- postprocess_inputs=model_utils.get_key_type_post_processor(
- "pixel_values"
- ),
- vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output,
- dtype="half" if current_platform.is_rocm() else ("half", "float"),
- ),
# Tests for phi3v currently live in another file because of a bug in
# transformers. Once this issue is fixed, we can enable them here instead.
# https://github.com/huggingface/transformers/issues/34307
diff --git a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
index 6856e8df81a13..e925934db0e7c 100644
--- a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
@@ -56,6 +56,17 @@ def qwen_vllm_to_hf_output(
return output_ids, hf_output_str, out_logprobs
+def qwen2_vllm_to_hf_output(
+ vllm_output: RunnerOutput,
+ model: str) -> Tuple[List[int], str, Optional[SampleLogprobs]]:
+ """Sanitize vllm output [qwen2 models] to be comparable with hf output."""
+ output_ids, output_str, out_logprobs = vllm_output
+
+ hf_output_str = output_str + "<|im_end|>"
+
+ return output_ids, hf_output_str, out_logprobs
+
+
def llava_image_vllm_to_hf_output(vllm_output: RunnerOutput,
model: str) -> RunnerOutput:
config = AutoConfig.from_pretrained(model)
diff --git a/tests/models/decoder_only/vision_language/vlm_utils/runners.py b/tests/models/decoder_only/vision_language/vlm_utils/runners.py
index 5a3f9e820dad0..2d3b39fe3594e 100644
--- a/tests/models/decoder_only/vision_language/vlm_utils/runners.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/runners.py
@@ -29,6 +29,7 @@ def run_single_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
num_logprobs=test_case.num_logprobs,
limit_mm_per_prompt={"image": 1},
distributed_executor_backend=test_case.distributed_executor_backend,
+ runner_mm_key="images",
**model_test_info.get_non_parametrized_runner_kwargs())
@@ -51,6 +52,7 @@ def run_multi_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
num_logprobs=test_case.num_logprobs,
limit_mm_per_prompt={"image": len(image_assets)},
distributed_executor_backend=test_case.distributed_executor_backend,
+ runner_mm_key="images",
**model_test_info.get_non_parametrized_runner_kwargs())
@@ -74,6 +76,7 @@ def run_embedding_test(*, model_test_info: VLMTestInfo,
limit_mm_per_prompt={"image": 1},
vllm_embeddings=vllm_embeddings,
distributed_executor_backend=test_case.distributed_executor_backend,
+ runner_mm_key="images",
**model_test_info.get_non_parametrized_runner_kwargs())
@@ -101,6 +104,7 @@ def run_video_test(
num_logprobs=test_case.num_logprobs,
limit_mm_per_prompt={"video": len(video_assets)},
distributed_executor_backend=test_case.distributed_executor_backend,
+ runner_mm_key="videos",
**model_test_info.get_non_parametrized_runner_kwargs())
@@ -115,7 +119,11 @@ def run_custom_inputs_test(*, model_test_info: VLMTestInfo,
inputs = test_case.custom_test_opts.inputs
limit_mm_per_prompt = test_case.custom_test_opts.limit_mm_per_prompt
- assert inputs is not None and limit_mm_per_prompt is not None
+ runner_mm_key = test_case.custom_test_opts.runner_mm_key
+ # Inputs, limit_mm_per_prompt, and runner_mm_key should all be set
+ assert inputs is not None
+ assert limit_mm_per_prompt is not None
+ assert runner_mm_key is not None
core.run_test(
hf_runner=hf_runner,
@@ -127,4 +135,5 @@ def run_custom_inputs_test(*, model_test_info: VLMTestInfo,
num_logprobs=test_case.num_logprobs,
limit_mm_per_prompt=limit_mm_per_prompt,
distributed_executor_backend=test_case.distributed_executor_backend,
+ runner_mm_key=runner_mm_key,
**model_test_info.get_non_parametrized_runner_kwargs())
diff --git a/tests/models/decoder_only/vision_language/vlm_utils/types.py b/tests/models/decoder_only/vision_language/vlm_utils/types.py
index 4d18d53af30fa..fd18c7c8346f0 100644
--- a/tests/models/decoder_only/vision_language/vlm_utils/types.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/types.py
@@ -52,6 +52,8 @@ class SizeType(Enum):
class CustomTestOptions(NamedTuple):
inputs: List[Tuple[List[str], List[Union[List[Image], Image]]]]
limit_mm_per_prompt: Dict[str, int]
+ # Name of the kwarg used to pass multimodal data to vllm/hf runner instances.
+ runner_mm_key: str = "images"
class ImageSizeWrapper(NamedTuple):
@@ -141,9 +143,6 @@ class VLMTestInfo(NamedTuple):
Callable[[PosixPath, str, Union[List[ImageAsset], _ImageAssets]],
str]] = None # noqa: E501
- # kwarg to pass multimodal data in as to vllm/hf runner instances
- runner_mm_key: str = "images"
-
# Allows configuring a test to run with custom inputs
custom_test_opts: Optional[List[CustomTestOptions]] = None
@@ -168,7 +167,6 @@ def get_non_parametrized_runner_kwargs(self):
"get_stop_token_ids": self.get_stop_token_ids,
"model_kwargs": self.model_kwargs,
"patch_hf_runner": self.patch_hf_runner,
- "runner_mm_key": self.runner_mm_key,
}
diff --git a/tests/models/embedding/vision_language/test_llava_next.py b/tests/models/embedding/vision_language/test_llava_next.py
index a8d0ac4fc160d..9fab5898a06ba 100644
--- a/tests/models/embedding/vision_language/test_llava_next.py
+++ b/tests/models/embedding/vision_language/test_llava_next.py
@@ -2,6 +2,7 @@
import pytest
import torch.nn.functional as F
+import transformers
from transformers import AutoModelForVision2Seq
from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
@@ -85,8 +86,8 @@ def _run_test(
)
-# FIXME
-@pytest.mark.skip(reason="LLava next embedding tests currently fail")
+@pytest.mark.skipif(transformers.__version__.startswith("4.46"),
+ reason="Model broken with changes in transformers 4.46")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
def test_models_text(
From 77f7ef29088fef854421239e7c41df6b11bc4b5b Mon Sep 17 00:00:00 2001
From: Alexei-V-Ivanov-AMD
<156011006+Alexei-V-Ivanov-AMD@users.noreply.github.com>
Date: Thu, 31 Oct 2024 12:02:58 -0500
Subject: [PATCH 066/113] [CI/Build] Adding a forced docker system prune to
clean up space (#9849)
---
.buildkite/run-amd-test.sh | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh
index df201cdc7c554..329cc42558da6 100755
--- a/.buildkite/run-amd-test.sh
+++ b/.buildkite/run-amd-test.sh
@@ -31,8 +31,8 @@ cleanup_docker() {
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
# Remove dangling images (those that are not tagged and not used by any container)
docker image prune -f
- # Remove unused volumes
- docker volume prune -f
+ # Remove unused volumes / force the system prune for old images as well.
+ docker volume prune -f && docker system prune --force --filter "until=72h" --all
echo "Docker images and volumes cleanup completed."
else
echo "Disk usage is below $threshold%. No cleanup needed."
From 55650c83a0c386526ed04912a0c60eccca202f3e Mon Sep 17 00:00:00 2001
From: sasha0552
Date: Thu, 31 Oct 2024 18:46:36 +0000
Subject: [PATCH 067/113] [Bugfix] Fix `illegal memory access` error with
chunked prefill, prefix caching, block manager v2 and xformers enabled
together (#9532)
Signed-off-by: sasha0552
---
tests/prefix_caching/test_prefix_caching.py | 28 +++++++++++++++++++++
vllm/attention/backends/utils.py | 9 ++++---
2 files changed, 34 insertions(+), 3 deletions(-)
diff --git a/tests/prefix_caching/test_prefix_caching.py b/tests/prefix_caching/test_prefix_caching.py
index 366b030eaa399..fd6564bbfe630 100644
--- a/tests/prefix_caching/test_prefix_caching.py
+++ b/tests/prefix_caching/test_prefix_caching.py
@@ -5,6 +5,7 @@
import pytest
from tests.kernels.utils import override_backend_env_variable
+from vllm import SamplingParams, TokensPrompt
from ..models.utils import check_outputs_equal
@@ -12,6 +13,14 @@
"facebook/opt-125m",
]
+UNSTABLE_PROMPT_SEQUENCE = [
+ ([0] * 588) + ([1] * 1332) + ([2] * 30) + ([3] * 1),
+ ([0] * 588) + ([1] * 1332) + ([4] * 3) + ([5] * 50),
+ ([0] * 588) + ([1] * 1332) + ([2] * 30) + ([6] * 95),
+ ([0] * 588) + ([1] * 1332) + ([4] * 3) + ([7] * 174),
+ ([0] * 588) + ([8] * 1539),
+]
+
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("backend", ["FLASH_ATTN", "FLASHINFER", "XFORMERS"])
@@ -57,3 +66,22 @@ def test_mixed_requests(
name_0="hf",
name_1="vllm",
)
+
+
+@pytest.mark.parametrize("backend", ["FLASH_ATTN", "FLASHINFER", "XFORMERS"])
+def test_unstable_prompt_sequence(
+ vllm_runner,
+ backend: str,
+ monkeypatch,
+) -> None:
+ override_backend_env_variable(monkeypatch, backend)
+
+ with vllm_runner(
+ "Qwen/Qwen2.5-0.5B-Instruct",
+ enable_chunked_prefill=True,
+ enable_prefix_caching=True,
+ max_model_len=4096,
+ ) as vllm_model:
+ for prompt in UNSTABLE_PROMPT_SEQUENCE:
+ vllm_model.generate(TokensPrompt(prompt_token_ids=prompt),
+ SamplingParams(max_tokens=1))
diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py
index d1a44f3e8bfa6..32fccd0dfb496 100644
--- a/vllm/attention/backends/utils.py
+++ b/vllm/attention/backends/utils.py
@@ -138,7 +138,6 @@ def _add_seq_group(
chunked_prefill_enabled: bool):
is_prompt = inter_data.is_prompt
block_tables = inter_data.block_tables
- computed_block_nums = inter_data.computed_block_nums
for (seq_id, token_len, seq_len, curr_seq_len, query_len, context_len,
curr_sliding_window_block) in zip(
@@ -164,10 +163,14 @@ def _add_seq_group(
# NOTE: This only works for oooooooxxx style attention.
block_table = []
if inter_data.prefix_cache_hit:
- block_table = computed_block_nums
+ block_table = block_tables[seq_id]
elif ((chunked_prefill_enabled or not is_prompt)
and block_tables is not None):
- block_table = block_tables[seq_id][-curr_sliding_window_block:]
+ if curr_sliding_window_block == 0:
+ block_table = block_tables[seq_id]
+ else:
+ block_table = block_tables[seq_id][
+ -curr_sliding_window_block:]
self.block_tables.append(block_table)
# Compute slot mapping.
From 9fb12f7848d427b6c1c29052271030a5e96bd74a Mon Sep 17 00:00:00 2001
From: Mor Zusman
Date: Thu, 31 Oct 2024 22:06:25 +0200
Subject: [PATCH 068/113] [BugFix][Kernel] Fix Illegal memory access in
causal_conv1d in H100 (#9838)
Signed-off-by: mzusman
---
csrc/mamba/causal_conv1d/causal_conv1d.cu | 34 +++++++++++++++++++++--
tests/kernels/test_causal_conv1d.py | 7 +++--
tests/kernels/test_mamba_ssm.py | 6 ++--
3 files changed, 40 insertions(+), 7 deletions(-)
diff --git a/csrc/mamba/causal_conv1d/causal_conv1d.cu b/csrc/mamba/causal_conv1d/causal_conv1d.cu
index 3a464c5f327ad..498d069c05f0d 100644
--- a/csrc/mamba/causal_conv1d/causal_conv1d.cu
+++ b/csrc/mamba/causal_conv1d/causal_conv1d.cu
@@ -418,6 +418,31 @@ void causal_conv1d_fwd_kernel(ConvParamsBase params) {
typename Ktraits::BlockStoreT(smem_store).Store(out, out_vals_store, seqlen - chunk * kChunkSize);
}
out += kChunkSize;
+
+ int final_state_position = ((seqlen - (kWidth - 1)) - (n_chunks - 1) * kChunkSize);
+ // in case the final state is split between the last "smem_exchange" and
+ // the one before it (chunk = n_chunks - 1 and chunk = n_chunks - 2),
+ // (which occurs when `final_state_position` is a negative index)
+ // we load the correct data from smem_exchange from both chunks, the last chunk iteration and the one before it
+ if (final_state_position < 0 && seqlen > kWidth){
+ input_t vals_load[kNElts] = {0};
+ if ((chunk == n_chunks - 2) && (tidx == kNThreads - 1)){
+ // chunk = n_chunks - 2, a segment of the final state sits in the last index
+ reinterpret_cast(vals_load)[0] = smem_exchange[kNThreads - 1];
+ #pragma unroll
+ for (int w = 0; w < -final_state_position; ++w){
+ conv_states[w] = vals_load[kNElts + final_state_position + w];
+ }
+ }
+ if ((chunk == n_chunks - 1) && tidx == 0){
+ // chunk = n_chunks - 1, the second segment of the final state sits in the first positions
+ reinterpret_cast(vals_load)[0] = smem_exchange[0];
+ for (int w = -final_state_position; w < kWidth - 1; ++w){
+ conv_states[w] = vals_load[w + final_state_position];
+ }
+ return;
+ }
+ }
}
// Final state is stored in the smem_exchange last token slot,
// in case seqlen < kWidth, we would need to take the final state from the
@@ -446,9 +471,14 @@ void causal_conv1d_fwd_kernel(ConvParamsBase params) {
}
else {
// in case the final state is in between the threads data
- reinterpret_cast(x_vals_load)[1] = smem_exchange[last_thread + 1];
- reinterpret_cast(x_vals_load)[0] = smem_exchange[last_thread];
const int offset = ((seqlen - (kWidth - 1)) % (kNElts));
+ if ((offset + kWidth - 2) >= kNElts && (last_thread + 1 < kNThreads)){
+ // In case last_thread == kNThreads - 1, accessing last_thread + 1 will result in an
+ // illegal access error on H100.
+ // Therefore, we access last_thread + 1, only if the final state data sits there
+ reinterpret_cast(x_vals_load)[1] = smem_exchange[last_thread + 1];
+ }
+ reinterpret_cast(x_vals_load)[0] = smem_exchange[last_thread];
#pragma unroll
for (int w = 0; w < kWidth - 1; ++w){
conv_states[w] = x_vals_load[offset + w ];
diff --git a/tests/kernels/test_causal_conv1d.py b/tests/kernels/test_causal_conv1d.py
index 96bfe06d74ae5..f9b11018288be 100644
--- a/tests/kernels/test_causal_conv1d.py
+++ b/tests/kernels/test_causal_conv1d.py
@@ -151,7 +151,7 @@ def causal_conv1d_opcheck_fn(x: torch.Tensor,
@pytest.mark.parametrize("has_bias", [True])
@pytest.mark.parametrize("width", [4])
@pytest.mark.parametrize(
- 'seqlen', [1, 8, 16, 32, 64, 128, 256, 512, 784, 1024, 2048, 4096])
+ 'seqlen', [1, 8, 16, 32, 64, 128, 256, 512, 784, 1024, 1025, 2048, 4096])
@pytest.mark.parametrize('dim', [64])
@pytest.mark.parametrize('batch', [1])
def test_causal_conv1d(batch, dim, seqlen, width, has_bias, silu_activation,
@@ -420,7 +420,10 @@ def test_causal_conv1d_varlen(with_padding, dim, seqlen, width, has_bias,
unpadded_out = out[:, :out_ref_tensor.shape[-1]]
assert torch.allclose(unpadded_out, out_ref_tensor, rtol=rtol, atol=atol)
- assert torch.allclose(final_states, final_states_ref, rtol=rtol, atol=atol)
+ assert torch.allclose(final_states[state_indices],
+ final_states_ref[state_indices],
+ rtol=rtol,
+ atol=atol)
causal_conv1d_opcheck_fn(x.squeeze(0), weight, bias, cumsum.cuda(),
padded_state_indices, has_initial_states,
diff --git a/tests/kernels/test_mamba_ssm.py b/tests/kernels/test_mamba_ssm.py
index bf7ff3b5c59b8..ad05a97685351 100644
--- a/tests/kernels/test_mamba_ssm.py
+++ b/tests/kernels/test_mamba_ssm.py
@@ -555,7 +555,7 @@ def test_selective_state_update_with_batch_indices(with_padding, dim, dstate,
device = "cuda"
rtol, atol = (3e-4, 1e-3) if itype == torch.float32 else (5e-3, 1e-2)
if itype == torch.bfloat16:
- rtol, atol = 7e-2, 7e-2
+ rtol, atol = 1e-1, 1e-1
if torch.version.hip:
atol *= 2
# set seed
@@ -610,8 +610,8 @@ def test_selective_state_update_with_batch_indices(with_padding, dim, dstate,
dt_bias=dt_bias,
dt_softplus=True)
- print("Output diff max", (out - out_ref[0]).max())
- print("Output diff mean", (out - out_ref[0]).mean())
+ print("Output diff max", (out[:batch_size] - out_ref).max())
+ print("Output diff mean", (out[:batch_size] - out_ref).mean())
print("Output state diff max", (state[state_indices, :] - state_ref).max())
print("Output state diff mean",
(state[state_indices, :] - state_ref).mean())
From b63c64d95b01cc955a56bba37d055ad36aa81abd Mon Sep 17 00:00:00 2001
From: "Kevin H. Luu"
Date: Thu, 31 Oct 2024 12:55:38 -1000
Subject: [PATCH 069/113] [ci/build] Configure dependabot to update pip
dependencies (#9811)
Signed-off-by: kevin
---
.github/dependabot.yml | 16 ++++++++++++++++
1 file changed, 16 insertions(+)
diff --git a/.github/dependabot.yml b/.github/dependabot.yml
index 6fddca0d6e4b9..a21acd9671eeb 100644
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -5,3 +5,19 @@ updates:
directory: "/"
schedule:
interval: "weekly"
+ - package-ecosystem: "pip"
+ directory: "/"
+ schedule:
+ interval: "weekly"
+ labels: ["dependencies"]
+ open-pull-requests-limit: 5
+ reviewers: ["khluu", "simon-mo"]
+ allow:
+ - dependency-type: "all"
+ groups:
+ patch-update:
+ applies-to: version-updates
+ update-types: ["patch"]
+ minor-update:
+ applies-to: version-updates
+ update-types: ["minor"]
From 031a7995f38d3c73b0790280cc0fa1fe25d33bff Mon Sep 17 00:00:00 2001
From: Joe Runde
Date: Thu, 31 Oct 2024 19:09:46 -0600
Subject: [PATCH 070/113] [Bugfix][Frontend] Reject guided decoding in
multistep mode (#9892)
Signed-off-by: Joe Runde
---
docs/source/serving/compatibility_matrix.rst | 2 +-
.../openai/test_prompt_validation.py | 20 +++++++++++++++++++
vllm/engine/llm_engine.py | 7 +++++++
vllm/sampling_params.py | 4 ++--
4 files changed, 30 insertions(+), 3 deletions(-)
diff --git a/docs/source/serving/compatibility_matrix.rst b/docs/source/serving/compatibility_matrix.rst
index 20a81f4cad1d1..cab19e4ec5b6c 100644
--- a/docs/source/serving/compatibility_matrix.rst
+++ b/docs/source/serving/compatibility_matrix.rst
@@ -283,7 +283,7 @@ Feature x Feature
- ✅
- ✅
- ✅
- - `✗ `__
+ - `✗ `__
- ?
- ✅
- ✅
diff --git a/tests/entrypoints/openai/test_prompt_validation.py b/tests/entrypoints/openai/test_prompt_validation.py
index 58075f7023821..1ae64ef492d5b 100644
--- a/tests/entrypoints/openai/test_prompt_validation.py
+++ b/tests/entrypoints/openai/test_prompt_validation.py
@@ -35,3 +35,23 @@ async def test_out_of_vocab_token_ids():
prompt=[999999],
max_tokens=5,
temperature=0.0)
+
+
+@pytest.mark.asyncio
+async def test_reject_multistep_with_guided_decoding():
+ model_name = "gpt2"
+ server_args = ["--enforce-eager", "--num-scheduler-steps", "8"]
+ with RemoteOpenAIServer(model_name, server_args) as remote_server:
+ client = remote_server.get_async_client()
+
+ with pytest.raises(openai.BadRequestError,
+ match=re.compile(
+ '.*Guided decoding .* multi-step decoding.*')):
+ await client.completions.create(
+ model=model_name,
+ prompt="Hello",
+ max_tokens=5,
+ temperature=0.0,
+ extra_body={"response_format": {
+ "type": "json_object"
+ }})
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 3fd34fadee1ca..edef1f30a9e91 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -829,6 +829,13 @@ def add_request(
raise ValueError(f"Got priority {priority} but "
"Priority scheduling is not enabled.")
+ if isinstance(params, SamplingParams) \
+ and (params.guided_decoding or params.logits_processors) \
+ and self.scheduler_config.num_scheduler_steps > 1:
+ raise ValueError(
+ "Guided decoding and logits processors are not supported "
+ "in multi-step decoding")
+
if arrival_time is None:
arrival_time = time.time()
diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py
index 5e191c6e715e0..5c6df5aaf5446 100644
--- a/vllm/sampling_params.py
+++ b/vllm/sampling_params.py
@@ -485,8 +485,8 @@ def __repr__(self) -> str:
f"skip_special_tokens={self.skip_special_tokens}, "
"spaces_between_special_tokens="
f"{self.spaces_between_special_tokens}, "
- f"truncate_prompt_tokens={self.truncate_prompt_tokens}), "
- f"guided_decoding={self.guided_decoding}")
+ f"truncate_prompt_tokens={self.truncate_prompt_tokens}, "
+ f"guided_decoding={self.guided_decoding})")
class BeamSearchParams(
From 96e0c9cbbd65ad0b8ad20611b90bcc86a8559aae Mon Sep 17 00:00:00 2001
From: youkaichao
Date: Thu, 31 Oct 2024 21:56:09 -0700
Subject: [PATCH 071/113] [torch.compile] directly register custom op (#9896)
Signed-off-by: youkaichao
---
tests/compile/piecewise/test_simple.py | 20 ++++--
tests/compile/piecewise/test_toy_llama.py | 20 ++++--
vllm/attention/backends/flash_attn.py | 16 +++--
vllm/attention/backends/flashinfer.py | 17 +++--
vllm/distributed/parallel_state.py | 34 +++++++---
.../layers/fused_moe/fused_marlin_moe.py | 25 +++++--
.../layers/fused_moe/fused_moe.py | 68 +++++++++++--------
vllm/utils.py | 45 ++++++++++++
vllm/v1/attention/backends/flash_attn.py | 14 ++--
9 files changed, 192 insertions(+), 67 deletions(-)
diff --git a/tests/compile/piecewise/test_simple.py b/tests/compile/piecewise/test_simple.py
index a34d33efba1d8..d151d62516b07 100644
--- a/tests/compile/piecewise/test_simple.py
+++ b/tests/compile/piecewise/test_simple.py
@@ -6,18 +6,22 @@
import torch
from torch import nn
+from torch.library import Library
from vllm.compilation.compile_context import set_compile_context
from vllm.compilation.counter import compilation_counter
from vllm.compilation.decorators import support_torch_compile
from vllm.compilation.levels import CompilationLevel
+from vllm.utils import direct_register_custom_op
os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(CompilationLevel.PIECEWISE)
global_counter = 0
+# create a library to hold the custom op
+silly_lib = Library("silly", "FRAGMENT") # noqa
+
-@torch.library.custom_op("silly::attention", mutates_args=["out"])
def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
out: torch.Tensor) -> None:
global global_counter
@@ -27,12 +31,20 @@ def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
out[0] += 1
-@silly_attention.register_fake
-def _(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
- out: torch.Tensor) -> None:
+def silly_attention_fake(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
+ out: torch.Tensor) -> None:
return
+direct_register_custom_op(
+ op_name="attention",
+ op_func=silly_attention,
+ mutates_args=["out"],
+ fake_impl=silly_attention_fake,
+ target_lib=silly_lib,
+)
+
+
@support_torch_compile
class SillyModel(nn.Module):
diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py
index db6a983d70feb..e3e5a7d0fc5a5 100644
--- a/tests/compile/piecewise/test_toy_llama.py
+++ b/tests/compile/piecewise/test_toy_llama.py
@@ -8,6 +8,7 @@
import torch
from torch import nn
+from torch.library import Library
from vllm.compilation.compile_context import set_compile_context
from vllm.compilation.config import CompilationConfig
@@ -15,9 +16,12 @@
from vllm.compilation.decorators import support_torch_compile
from vllm.compilation.levels import CompilationLevel
from vllm.plugins import set_compilation_config
+from vllm.utils import direct_register_custom_op
+
+# create a library to hold the custom op
+silly_lib = Library("silly", "FRAGMENT") # noqa
-@torch.library.custom_op("silly::attention", mutates_args=["out"])
def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
out: torch.Tensor) -> None:
out.copy_(q)
@@ -25,12 +29,20 @@ def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
out += v
-@silly_attention.register_fake
-def _(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
- out: torch.Tensor) -> None:
+def silly_attention_fake(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
+ out: torch.Tensor) -> None:
return
+direct_register_custom_op(
+ op_name="attention",
+ op_func=silly_attention,
+ mutates_args=["out"],
+ fake_impl=silly_attention_fake,
+ target_lib=silly_lib,
+)
+
+
@dataclass
class LlamaConfig:
hidden_size: int = 128
diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py
index ffa05e80623ac..c294fcf7f08fe 100644
--- a/vllm/attention/backends/flash_attn.py
+++ b/vllm/attention/backends/flash_attn.py
@@ -14,7 +14,8 @@
compute_slot_mapping_start_idx,
is_block_tables_empty)
from vllm.forward_context import get_forward_context
-from vllm.utils import async_tensor_h2d, make_tensor_with_pad
+from vllm.utils import (async_tensor_h2d, direct_register_custom_op,
+ make_tensor_with_pad)
if TYPE_CHECKING:
from vllm.worker.model_runner import (ModelInputForGPUBuilder,
@@ -595,8 +596,6 @@ def forward(
return output
-@torch.library.custom_op("vllm::unified_flash_attention",
- mutates_args=["kv_cache"])
def unified_flash_attention(
query: torch.Tensor,
key: torch.Tensor,
@@ -755,8 +754,7 @@ def unified_flash_attention(
return output.view(num_tokens, hidden_size)
-@unified_flash_attention.register_fake
-def _(
+def unified_flash_attention_fake(
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
@@ -773,3 +771,11 @@ def _(
logits_soft_cap: Optional[float] = None,
) -> torch.Tensor:
return torch.empty_like(query)
+
+
+direct_register_custom_op(
+ op_name="unified_flash_attention",
+ op_func=unified_flash_attention,
+ mutates_args=["kv_cache"],
+ fake_impl=unified_flash_attention_fake,
+)
diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py
index 5ea101ae0432f..234c87d5c4edb 100644
--- a/vllm/attention/backends/flashinfer.py
+++ b/vllm/attention/backends/flashinfer.py
@@ -28,8 +28,8 @@
is_block_tables_empty)
from vllm.attention.ops.paged_attn import PagedAttention
from vllm.forward_context import get_forward_context
-from vllm.utils import (async_tensor_h2d, get_kv_cache_torch_dtype,
- make_tensor_with_pad)
+from vllm.utils import (async_tensor_h2d, direct_register_custom_op,
+ get_kv_cache_torch_dtype, make_tensor_with_pad)
if TYPE_CHECKING:
from vllm.worker.model_runner import (ModelInputForGPUBuilder,
@@ -785,8 +785,6 @@ def forward(
)
-@torch.library.custom_op("vllm::unified_flash_infer",
- mutates_args=["kv_cache"])
def unified_flash_infer(
query: torch.Tensor,
key: torch.Tensor,
@@ -906,8 +904,7 @@ def unified_flash_infer(
return output.view(num_tokens, hidden_size)
-@unified_flash_infer.register_fake
-def _(
+def unified_flash_infer_fake(
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
@@ -924,3 +921,11 @@ def _(
logits_soft_cap: Optional[float] = None,
) -> torch.Tensor:
return torch.empty_like(query).contiguous()
+
+
+direct_register_custom_op(
+ op_name="unified_flash_infer",
+ op_func=unified_flash_infer,
+ mutates_args=["kv_cache"],
+ fake_impl=unified_flash_infer_fake,
+)
diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index b04bbc478534c..94ba41a016f6d 100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -37,7 +37,7 @@
import vllm.envs as envs
from vllm.logger import init_logger
from vllm.platforms import current_platform
-from vllm.utils import supports_custom_op
+from vllm.utils import direct_register_custom_op, supports_custom_op
@dataclass
@@ -99,8 +99,6 @@ def _register_group(group: "GroupCoordinator") -> None:
if supports_custom_op():
- @torch.library.custom_op("vllm::inplace_all_reduce",
- mutates_args=["tensor"])
def inplace_all_reduce(tensor: torch.Tensor, group_name: str) -> None:
assert group_name in _groups, f"Group {group_name} is not found."
group = _groups[group_name]()
@@ -108,11 +106,16 @@ def inplace_all_reduce(tensor: torch.Tensor, group_name: str) -> None:
raise ValueError(f"Group {group_name} is destroyed.")
group._all_reduce_in_place(tensor)
- @inplace_all_reduce.register_fake
- def _(tensor: torch.Tensor, group_name: str) -> None:
+ def inplace_all_reduce_fake(tensor: torch.Tensor, group_name: str) -> None:
return
- @torch.library.custom_op("vllm::outplace_all_reduce", mutates_args=[])
+ direct_register_custom_op(
+ op_name="inplace_all_reduce",
+ op_func=inplace_all_reduce,
+ mutates_args=["tensor"],
+ fake_impl=inplace_all_reduce_fake,
+ )
+
def outplace_all_reduce(tensor: torch.Tensor,
group_name: str) -> torch.Tensor:
assert group_name in _groups, f"Group {group_name} is not found."
@@ -121,10 +124,17 @@ def outplace_all_reduce(tensor: torch.Tensor,
raise ValueError(f"Group {group_name} is destroyed.")
return group._all_reduce_out_place(tensor)
- @outplace_all_reduce.register_fake
- def _(tensor: torch.Tensor, group_name: str) -> torch.Tensor:
+ def outplace_all_reduce_fake(tensor: torch.Tensor,
+ group_name: str) -> torch.Tensor:
return torch.empty_like(tensor)
+ direct_register_custom_op(
+ op_name="outplace_all_reduce",
+ op_func=outplace_all_reduce,
+ mutates_args=[],
+ fake_impl=outplace_all_reduce_fake,
+ )
+
class GroupCoordinator:
"""
@@ -338,6 +348,11 @@ def all_reduce(self, input_: torch.Tensor) -> torch.Tensor:
if self.world_size == 1:
return input_
+ if input_.is_cpu:
+ import intel_extension_for_pytorch as ipex
+ ipex.distributed.all_reduce(input_, group=self.device_group)
+ return input_
+
if not supports_custom_op():
self._all_reduce_in_place(input_)
return input_
@@ -369,9 +384,6 @@ def _all_reduce_in_place(self, input_: torch.Tensor) -> None:
pynccl_comm = self.pynccl_comm
if (pynccl_comm is not None and not pynccl_comm.disabled):
pynccl_comm.all_reduce(input_)
- elif input_.is_cpu:
- import intel_extension_for_pytorch as ipex
- ipex.distributed.all_reduce(input_, group=self.device_group)
else:
torch.distributed.all_reduce(input_, group=self.device_group)
diff --git a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
index 93019d0d0abb6..4741d69de11ac 100644
--- a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
@@ -8,6 +8,7 @@
from vllm.model_executor.layers.fused_moe.fused_moe import (
fused_topk, moe_align_block_size, try_get_optimal_moe_config)
from vllm.scalar_type import scalar_types
+from vllm.utils import direct_register_custom_op
def get_scalar_type(num_bits: int, has_zp: bool):
@@ -18,7 +19,6 @@ def get_scalar_type(num_bits: int, has_zp: bool):
return scalar_types.uint4b8 if num_bits == 4 else scalar_types.uint8b128
-@torch.library.custom_op("vllm::single_marlin_moe", mutates_args=[])
def single_marlin_moe(
hidden_states: torch.Tensor,
w: torch.Tensor,
@@ -119,8 +119,7 @@ def single_marlin_moe(
return torch.sum(intermediate_cache.view(*intermediate_cache.shape), dim=1)
-@single_marlin_moe.register_fake
-def _(
+def single_marlin_moe_fake(
hidden_states: torch.Tensor,
w: torch.Tensor,
scales: torch.Tensor,
@@ -136,7 +135,14 @@ def _(
return torch.empty_like(hidden_states)
-@torch.library.custom_op("vllm::fused_marlin_moe", mutates_args=[])
+direct_register_custom_op(
+ op_name="single_marlin_moe",
+ op_func=single_marlin_moe,
+ mutates_args=[],
+ fake_impl=single_marlin_moe_fake,
+)
+
+
def fused_marlin_moe(
hidden_states: torch.Tensor,
w1: torch.Tensor,
@@ -324,8 +330,7 @@ def fused_marlin_moe(
dim=1)
-@fused_marlin_moe.register_fake
-def _(
+def fused_marlin_moe_fake(
hidden_states: torch.Tensor,
w1: torch.Tensor,
w2: torch.Tensor,
@@ -344,3 +349,11 @@ def _(
is_k_full: bool = True,
) -> torch.Tensor:
return torch.empty_like(hidden_states)
+
+
+direct_register_custom_op(
+ op_name="fused_marlin_moe",
+ op_func=fused_marlin_moe,
+ mutates_args=[],
+ fake_impl=fused_marlin_moe_fake,
+)
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index 1cf5c2253ca0b..340da32263c1c 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -12,6 +12,7 @@
from vllm import _custom_ops as ops
from vllm.logger import init_logger
from vllm.platforms import current_platform
+from vllm.utils import direct_register_custom_op
logger = init_logger(__name__)
@@ -466,8 +467,6 @@ def get_config_dtype_str(dtype: torch.dtype,
return None
-@torch.library.custom_op("vllm::inplace_fused_experts",
- mutates_args=["hidden_states"])
def inplace_fused_experts(hidden_states: torch.Tensor,
w1: torch.Tensor,
w2: torch.Tensor,
@@ -484,22 +483,29 @@ def inplace_fused_experts(hidden_states: torch.Tensor,
a1_scale, a2_scale)
-@inplace_fused_experts.register_fake
-def _(hidden_states: torch.Tensor,
- w1: torch.Tensor,
- w2: torch.Tensor,
- topk_weights: torch.Tensor,
- topk_ids: torch.Tensor,
- use_fp8_w8a8: bool = False,
- use_int8_w8a16: bool = False,
- w1_scale: Optional[torch.Tensor] = None,
- w2_scale: Optional[torch.Tensor] = None,
- a1_scale: Optional[torch.Tensor] = None,
- a2_scale: Optional[torch.Tensor] = None) -> None:
+def inplace_fused_experts_fake(
+ hidden_states: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ use_fp8_w8a8: bool = False,
+ use_int8_w8a16: bool = False,
+ w1_scale: Optional[torch.Tensor] = None,
+ w2_scale: Optional[torch.Tensor] = None,
+ a1_scale: Optional[torch.Tensor] = None,
+ a2_scale: Optional[torch.Tensor] = None) -> None:
pass
-@torch.library.custom_op("vllm::outplace_fused_experts", mutates_args=[])
+direct_register_custom_op(
+ op_name="inplace_fused_experts",
+ op_func=inplace_fused_experts,
+ mutates_args=["hidden_states"],
+ fake_impl=inplace_fused_experts_fake,
+)
+
+
def outplace_fused_experts(
hidden_states: torch.Tensor,
w1: torch.Tensor,
@@ -517,21 +523,29 @@ def outplace_fused_experts(
w2_scale, a1_scale, a2_scale)
-@outplace_fused_experts.register_fake
-def _(hidden_states: torch.Tensor,
- w1: torch.Tensor,
- w2: torch.Tensor,
- topk_weights: torch.Tensor,
- topk_ids: torch.Tensor,
- use_fp8_w8a8: bool = False,
- use_int8_w8a16: bool = False,
- w1_scale: Optional[torch.Tensor] = None,
- w2_scale: Optional[torch.Tensor] = None,
- a1_scale: Optional[torch.Tensor] = None,
- a2_scale: Optional[torch.Tensor] = None) -> torch.Tensor:
+def outplace_fused_experts_fake(
+ hidden_states: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ use_fp8_w8a8: bool = False,
+ use_int8_w8a16: bool = False,
+ w1_scale: Optional[torch.Tensor] = None,
+ w2_scale: Optional[torch.Tensor] = None,
+ a1_scale: Optional[torch.Tensor] = None,
+ a2_scale: Optional[torch.Tensor] = None) -> torch.Tensor:
return torch.empty_like(hidden_states)
+direct_register_custom_op(
+ op_name="outplace_fused_experts",
+ op_func=outplace_fused_experts,
+ mutates_args=[],
+ fake_impl=outplace_fused_experts_fake,
+)
+
+
def fused_experts(hidden_states: torch.Tensor,
w1: torch.Tensor,
w2: torch.Tensor,
diff --git a/vllm/utils.py b/vllm/utils.py
index 03cdbe6a0dc7b..5488719cc99b0 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -32,6 +32,7 @@
import torch.types
import yaml
from packaging.version import Version
+from torch.library import Library
from typing_extensions import ParamSpec, TypeIs, assert_never
import vllm.envs as envs
@@ -1512,3 +1513,47 @@ def weak_ref_tensors(
if isinstance(tensors, tuple):
return tuple(weak_ref_tensor(t) for t in tensors)
raise ValueError("Invalid type for tensors")
+
+
+def is_in_doc_build() -> bool:
+ try:
+ from sphinx.ext.autodoc.mock import _MockModule
+ return isinstance(torch, _MockModule)
+ except ModuleNotFoundError:
+ return False
+
+
+# create a library to hold the custom op
+vllm_lib = Library("vllm", "FRAGMENT") # noqa
+
+
+def direct_register_custom_op(
+ op_name: str,
+ op_func: Callable,
+ mutates_args: List[str],
+ fake_impl: Optional[Callable] = None,
+ target_lib: Optional[Library] = None,
+):
+ """
+ `torch.library.custom_op` can have significant overhead because it
+ needs to consider complicated dispatching logic. This function
+ directly registers a custom op and dispatches it to the CUDA backend.
+ See https://gist.github.com/youkaichao/ecbea9ec9fc79a45d2adce1784d7a9a5
+ for more details.
+
+ By default, the custom op is registered to the vLLM library. If you
+ want to register it to a different library, you can pass the library
+ object to the `target_lib` argument.
+
+ IMPORTANT: the lifetime of the operator is tied to the lifetime of the
+ library object. If you want to bind the operator to a different library,
+ make sure the library object is alive when the operator is used.
+ """
+ if is_in_doc_build():
+ return
+ schema_str = torch.library.infer_schema(op_func, mutates_args=mutates_args)
+ my_lib = target_lib or vllm_lib
+ my_lib.define(op_name + schema_str)
+ my_lib.impl(op_name, op_func, "CUDA")
+ if fake_impl is not None:
+ my_lib._register_fake(op_name, fake_impl)
diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py
index ec07464e6a12a..b2af89ebf854a 100644
--- a/vllm/v1/attention/backends/flash_attn.py
+++ b/vllm/v1/attention/backends/flash_attn.py
@@ -7,6 +7,7 @@
from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
AttentionMetadata, AttentionType)
from vllm.forward_context import get_forward_context
+from vllm.utils import direct_register_custom_op
from vllm.vllm_flash_attn import flash_attn_varlen_func
@@ -152,8 +153,6 @@ def forward(
return output
-@torch.library.custom_op("vllm::unified_flash_attention",
- mutates_args=["kv_cache"])
def unified_flash_attention(
query: torch.Tensor,
key: torch.Tensor,
@@ -217,8 +216,7 @@ def unified_flash_attention(
return output.view(num_tokens, hidden_size)
-@unified_flash_attention.register_fake
-def _(
+def unified_flash_attention_fake(
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
@@ -235,3 +233,11 @@ def _(
logits_soft_cap: Optional[float] = None,
) -> torch.Tensor:
return torch.empty_like(query)
+
+
+direct_register_custom_op(
+ op_name="unified_flash_attention",
+ op_func=unified_flash_attention,
+ mutates_args=["kv_cache"],
+ fake_impl=unified_flash_attention_fake,
+)
From 37a4947dcd68c602d0911920e2c1a9168dea1ecb Mon Sep 17 00:00:00 2001
From: Michael Goin
Date: Fri, 1 Nov 2024 01:12:44 -0400
Subject: [PATCH 072/113] [Bugfix] Fix layer skip logic with bitsandbytes
(#9887)
Signed-off-by: mgoin
---
vllm/model_executor/layers/quantization/bitsandbytes.py | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py
index 7a039a78f09b8..718967a065192 100644
--- a/vllm/model_executor/layers/quantization/bitsandbytes.py
+++ b/vllm/model_executor/layers/quantization/bitsandbytes.py
@@ -119,7 +119,12 @@ def get_scaled_act_names(self) -> List[str]:
def is_layer_skipped_bnb(prefix: str, llm_int8_skip_modules: List[str]):
- return any(module_name in prefix for module_name in llm_int8_skip_modules)
+ # Split the prefix into its dot-separated components
+ components = prefix.split('.')
+
+ # Check if any of the skip modules exactly matches any component
+ return any(module_name in components
+ for module_name in llm_int8_skip_modules)
class BitsAndBytesLinearMethod(LinearMethodBase):
From 566cd277979bc1a46b7e99657112416af9874a58 Mon Sep 17 00:00:00 2001
From: youkaichao
Date: Thu, 31 Oct 2024 22:20:17 -0700
Subject: [PATCH 073/113] [torch.compile] rework test plans (#9866)
Signed-off-by: youkaichao
---
tests/compile/test_basic_correctness.py | 113 +++++++++++++++++----
tests/utils.py | 124 +++++++++++++++++++++++-
vllm/model_executor/models/llava.py | 10 +-
vllm/model_executor/models/phi3v.py | 10 +-
4 files changed, 226 insertions(+), 31 deletions(-)
diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py
index 2f92ff73845f5..833589ba5dc9f 100644
--- a/tests/compile/test_basic_correctness.py
+++ b/tests/compile/test_basic_correctness.py
@@ -1,3 +1,4 @@
+import dataclasses
from typing import Dict, List, Optional
import pytest
@@ -8,33 +9,109 @@
from ..utils import compare_all_settings
+@dataclasses.dataclass
+class TestSetting:
+ model: str
+ model_args: List[str]
+ pp_size: int
+ tp_size: int
+ attn_backend: str
+ method: str
+ fullgraph: bool
+
+
+# representative settings for testing
+test_settings = [
+ # basic llama model
+ TestSetting(
+ model="meta-llama/Llama-3.2-1B",
+ model_args=[],
+ pp_size=2,
+ tp_size=2,
+ attn_backend="FLASHINFER",
+ method="generate",
+ fullgraph=True,
+ ),
+ # llama model with quantization
+ TestSetting(
+ model="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
+ model_args=["--quantization", "gptq"],
+ pp_size=1,
+ tp_size=1,
+ attn_backend="FLASH_ATTN",
+ method="generate",
+ fullgraph=True,
+ ),
+ # MoE model
+ TestSetting(
+ model="ibm/PowerMoE-3b",
+ model_args=[],
+ pp_size=1,
+ tp_size=2,
+ attn_backend="FLASH_ATTN",
+ method="generate",
+ fullgraph=True,
+ ),
+ # embedding model
+ TestSetting(
+ model="BAAI/bge-multilingual-gemma2",
+ model_args=["--task", "embedding"],
+ pp_size=1,
+ tp_size=1,
+ attn_backend="FLASHINFER",
+ method="encode",
+ fullgraph=True,
+ ),
+ # vision language model
+ TestSetting(
+ model="microsoft/Phi-3.5-vision-instruct",
+ model_args=["--trust-remote-code", "--max-model-len", "2048"],
+ pp_size=2,
+ tp_size=1,
+ attn_backend="FLASH_ATTN",
+ method="generate_with_image",
+ fullgraph=False,
+ ),
+]
+
+
# we cannot afford testing the full Catesian product
# of all models and all levels
-@pytest.mark.parametrize(
- "model, model_args, pp_size, tp_size, attn_backend, method, fullgraph",
- [
- ("meta-llama/Llama-3.2-1B", [], 2, 2, "FLASHINFER", "generate", True),
- ("nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples",
- ["--quantization", "compressed-tensors"
- ], 1, 1, "FLASH_ATTN", "generate", True),
- ("ibm/PowerMoE-3b", [], 1, 2, "FLASH_ATTN", "generate", True),
- # TODO: add multi-modality test for llava
- ("llava-hf/llava-1.5-7b-hf", [], 2, 1, "FLASHINFER", "generate", False)
- ])
-def test_compile_correctness(model, model_args, pp_size, tp_size, attn_backend,
- method, fullgraph):
+@pytest.mark.parametrize("test_setting", test_settings)
+def test_compile_correctness(test_setting: TestSetting):
# this test is run under multiple suits, with different GPUs.
# make sure we only run the test with correct CUDA devices.
# don't use "<", as it will duplicate the tests.
+ model = test_setting.model
+ model_args = test_setting.model_args
+ pp_size = test_setting.pp_size
+ tp_size = test_setting.tp_size
+ attn_backend = test_setting.attn_backend
+ method = test_setting.method
+ fullgraph = test_setting.fullgraph
if cuda_device_count_stateless() != pp_size * tp_size:
pytest.skip("Not correct CUDA devices for the test.")
import os
os.environ["VLLM_ATTENTION_BACKEND"] = attn_backend
- all_args = [["--enforce-eager"] + model_args + ["-pp", str(pp_size)] +
- ["-tp", str(tp_size)]] * 3
- # don't test VLLM_TORCH_COMPILE_LEVEL == 3 case
- # inductor will change the output, so we cannot compare them.
+ final_args = ["--enforce-eager"] + model_args + ["-pp", str(pp_size)] + \
+ ["-tp", str(tp_size)]
+
all_envs: List[Optional[Dict[str, str]]] = []
+
+ for level in [
+ CompilationLevel.NO_COMPILATION,
+ CompilationLevel.PIECEWISE,
+ ]:
+ all_envs.append({"VLLM_TORCH_COMPILE_LEVEL": str(level)})
+
+ # inductor will change the output, so we only compare if the output
+ # is close, not exactly the same.
+ compare_all_settings(
+ model, [final_args] * 2,
+ all_envs,
+ method=method if method != "generate" else "generate_close")
+ all_envs.clear()
+
for level in [
CompilationLevel.NO_COMPILATION,
CompilationLevel.DYNAMO_AS_IS,
@@ -46,4 +123,4 @@ def test_compile_correctness(model, model_args, pp_size, tp_size, attn_backend,
all_envs[-1][
"VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "0" # type: ignore
- compare_all_settings(model, all_args, all_envs, method=method)
+ compare_all_settings(model, [final_args] * 3, all_envs, method=method)
diff --git a/tests/utils.py b/tests/utils.py
index e8aad9cb3268f..16e21f68c7c96 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -1,4 +1,5 @@
import asyncio
+import copy
import functools
import os
import signal
@@ -8,13 +9,14 @@
import warnings
from contextlib import contextmanager
from pathlib import Path
-from typing import Any, Callable, Dict, List, Literal, Optional, Type, Union
+from typing import Any, Callable, Dict, List, Optional, Type, Union
import openai
import pytest
import requests
+import torch
from openai.types.completion import Completion
-from typing_extensions import ParamSpec, assert_never
+from typing_extensions import ParamSpec
import vllm.envs as envs
from tests.models.utils import TextTextLogprobs
@@ -272,6 +274,31 @@ def _test_completion(
return results
+def _test_completion_close(
+ client: openai.OpenAI,
+ model: str,
+ prompt: str,
+):
+ results = []
+
+ # test with text prompt
+ completion = client.completions.create(model=model,
+ prompt=prompt,
+ max_tokens=1,
+ logprobs=5,
+ temperature=0.0)
+
+ logporbs = completion.choices[0].logprobs.top_logprobs[0]
+ logporbs = {k: round(v, 2) for k, v in logporbs.items()}
+
+ results.append({
+ "test": "completion_close",
+ "logprobs": logporbs,
+ })
+
+ return results
+
+
def _test_embeddings(
client: openai.OpenAI,
model: str,
@@ -295,13 +322,81 @@ def _test_embeddings(
return results
+def _test_image_text(
+ client: openai.OpenAI,
+ model_name: str,
+ image_url: str,
+):
+ results = []
+
+ # test pure text input
+ messages = [{
+ "role":
+ "user",
+ "content": [
+ {
+ "type": "text",
+ "text": "How do you feel today?"
+ },
+ ],
+ }]
+
+ chat_completion = client.chat.completions.create(model=model_name,
+ messages=messages,
+ temperature=0.0,
+ max_tokens=1,
+ logprobs=True,
+ top_logprobs=5)
+ top_logprobs = chat_completion.choices[0].logprobs.content[0].top_logprobs
+
+ for x in top_logprobs:
+ x.logprob = round(x.logprob, 2)
+
+ results.append({
+ "test": "pure_text",
+ "logprobs": top_logprobs,
+ })
+
+ messages = [{
+ "role":
+ "user",
+ "content": [
+ {
+ "type": "image_url",
+ "image_url": {
+ "url": image_url
+ }
+ },
+ {
+ "type": "text",
+ "text": "What's in this image?"
+ },
+ ],
+ }]
+
+ chat_completion = client.chat.completions.create(model=model_name,
+ messages=messages,
+ temperature=0.0,
+ max_tokens=1,
+ logprobs=True,
+ top_logprobs=5)
+ top_logprobs = chat_completion.choices[0].logprobs.content[0].top_logprobs
+
+ results.append({
+ "test": "text_image",
+ "logprobs": top_logprobs,
+ })
+
+ return results
+
+
def compare_two_settings(model: str,
arg1: List[str],
arg2: List[str],
env1: Optional[Dict[str, str]] = None,
env2: Optional[Dict[str, str]] = None,
*,
- method: Literal["generate", "encode"] = "generate",
+ method: str = "generate",
max_wait_seconds: Optional[float] = None) -> None:
"""
Launch API server with two different sets of arguments/environments
@@ -328,7 +423,7 @@ def compare_all_settings(model: str,
all_args: List[List[str]],
all_envs: List[Optional[Dict[str, str]]],
*,
- method: Literal["generate", "encode"] = "generate",
+ method: str = "generate",
max_wait_seconds: Optional[float] = None) -> None:
"""
Launch API server with several different sets of arguments/environments
@@ -397,10 +492,17 @@ def compare_all_settings(model: str,
if method == "generate":
results += _test_completion(client, model, prompt, token_ids)
+ elif method == "generate_close":
+ results += _test_completion_close(client, model, prompt)
+ elif method == "generate_with_image":
+ results += _test_image_text(
+ client, model,
+ "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png"
+ )
elif method == "encode":
results += _test_embeddings(client, model, prompt)
else:
- assert_never(method)
+ raise ValueError(f"Unknown method: {method}")
if i > 0:
# if any setting fails, raise an error early
@@ -410,6 +512,18 @@ def compare_all_settings(model: str,
compare_envs = all_envs[i]
for ref_result, compare_result in zip(ref_results,
compare_results):
+ ref_result = copy.deepcopy(ref_result)
+ compare_result = copy.deepcopy(compare_result)
+ if "embedding" in ref_result and method == "encode":
+ ref_embedding = torch.tensor(ref_result["embedding"])
+ compare_embedding = torch.tensor(
+ compare_result["embedding"])
+ mse = ((ref_embedding - compare_embedding)**2).mean()
+ assert mse < 1e-6, (
+ f"Embedding for {model=} are not the same.\n"
+ f"mse={mse}\n")
+ del ref_result["embedding"]
+ del compare_result["embedding"]
assert ref_result == compare_result, (
f"Results for {model=} are not the same.\n"
f"{ref_args=} {ref_envs=}\n"
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index eda99c029881f..27055e7ced865 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -493,13 +493,9 @@ def forward(
:class:`LlavaImageInputs`
"""
if intermediate_tensors is not None:
- input_ids = None
inputs_embeds = None
else:
- # always pass the input via `inputs_embeds`
- # to make sure the computation graph is consistent
image_input = self._parse_and_validate_image_input(**kwargs)
-
if image_input is not None:
vision_embeddings = self._process_image_input(image_input)
inputs_embeds = self.language_model.model.get_input_embeddings(
@@ -511,7 +507,11 @@ def forward(
else:
inputs_embeds = self.language_model.model.get_input_embeddings(
input_ids)
- input_ids = None
+
+ # always pass the input via `inputs_embeds`
+ # to make sure the computation graph is consistent
+ # for `torch.compile` integration
+ input_ids = None
hidden_states = self.language_model.model(input_ids,
positions,
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index 0fc4556831fd7..4928e447d5b9e 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -679,7 +679,6 @@ def forward(self,
intermediate_tensors: Optional[IntermediateTensors] = None,
**kwargs: object):
if intermediate_tensors is not None:
- input_ids = None
inputs_embeds = None
else:
image_input = self._parse_and_validate_image_input(**kwargs)
@@ -690,9 +689,14 @@ def forward(self,
inputs_embeds = merge_multimodal_embeddings(
input_ids, inputs_embeds, vision_embeddings,
self.image_token_id)
- input_ids = None
else:
- inputs_embeds = None
+ inputs_embeds = self.language_model.model.embed_tokens(
+ input_ids)
+
+ # always pass the input via `inputs_embeds`
+ # to make sure the computation graph is consistent
+ # for `torch.compile` integration
+ input_ids = None
hidden_states = self.language_model.model(input_ids,
positions,
From 93a76dd21dcec8977f1ffd0e21faa88fb515b9e4 Mon Sep 17 00:00:00 2001
From: Michael Goin
Date: Fri, 1 Nov 2024 01:31:56 -0400
Subject: [PATCH 074/113] [Model] Support bitsandbytes for MiniCPMV (#9891)
Signed-off-by: mgoin
---
vllm/model_executor/models/minicpmv.py | 43 ++++++++++++++++++++++++++
1 file changed, 43 insertions(+)
diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py
index a270282d87bc8..4917c33136069 100644
--- a/vllm/model_executor/models/minicpmv.py
+++ b/vllm/model_executor/models/minicpmv.py
@@ -810,6 +810,28 @@ class MiniCPMV2_5(MiniCPMVBaseModel, SupportsLoRA):
# resampler
"kv_proj",
]
+
+ # BitandBytes specific attributes
+ default_bitsandbytes_target_modules = [
+ ".gate_proj.",
+ ".down_proj.",
+ ".up_proj.",
+ ".q_proj.",
+ ".k_proj.",
+ ".v_proj.",
+ ".o_proj.",
+ ]
+ # in TP, these weights are partitioned along the column dimension (dim=-1)
+ column_parallel_weights_modules = [".down_proj.", ".o_proj."]
+ bitsandbytes_stacked_params_mapping = {
+ # shard_name, weight_name, index
+ "q_proj": ("qkv_proj", 0),
+ "k_proj": ("qkv_proj", 1),
+ "v_proj": ("qkv_proj", 2),
+ "gate_proj": ("gate_up_proj", 0),
+ "up_proj": ("gate_up_proj", 1),
+ }
+
embedding_modules = {}
embedding_padding_modules = []
@@ -931,6 +953,27 @@ class MiniCPMV2_6(MiniCPMVBaseModel, SupportsLoRA):
"kv_proj",
]
+ # BitandBytes specific attributes
+ default_bitsandbytes_target_modules = [
+ ".gate_proj.",
+ ".down_proj.",
+ ".up_proj.",
+ ".q_proj.",
+ ".k_proj.",
+ ".v_proj.",
+ ".o_proj.",
+ ]
+ # in TP, these weights are partitioned along the column dimension (dim=-1)
+ column_parallel_weights_modules = [".down_proj.", ".o_proj."]
+ bitsandbytes_stacked_params_mapping = {
+ # shard_name, weight_name, index
+ "q_proj": ("qkv_proj", 0),
+ "k_proj": ("qkv_proj", 1),
+ "v_proj": ("qkv_proj", 2),
+ "gate_proj": ("gate_up_proj", 0),
+ "up_proj": ("gate_up_proj", 1),
+ }
+
embedding_modules = {}
embedding_padding_modules = []
From 2b5bf20988edaab21621b78a9eb589edc93f2763 Mon Sep 17 00:00:00 2001
From: Yongzao <532741407@qq.com>
Date: Fri, 1 Nov 2024 15:25:47 +0800
Subject: [PATCH 075/113] [torch.compile] Adding torch compile annotations to
some models (#9876)
Signed-off-by: youkaichao
Co-authored-by: youkaichao
---
docs/source/models/supported_models.rst | 2 +-
tests/distributed/test_pipeline_parallel.py | 2 +-
vllm/model_executor/models/falcon.py | 2 ++
vllm/model_executor/models/phi.py | 2 ++
vllm/model_executor/models/qwen.py | 2 ++
vllm/model_executor/models/qwen2.py | 2 ++
vllm/model_executor/models/qwen2_moe.py | 2 ++
7 files changed, 12 insertions(+), 2 deletions(-)
diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst
index 3279e7a108232..e493cebf1e9f4 100644
--- a/docs/source/models/supported_models.rst
+++ b/docs/source/models/supported_models.rst
@@ -281,7 +281,7 @@ Text Generation
- ✅︎
* - :code:`Qwen2ForCausalLM`
- Qwen2
- - :code:`Qwen/Qwen2-beta-7B`, :code:`Qwen/Qwen2-beta-7B-Chat`, etc.
+ - :code:`Qwen/Qwen2-7B-Instruct`, :code:`Qwen/Qwen2-7B`, etc.
- ✅︎
- ✅︎
* - :code:`Qwen2MoeForCausalLM`
diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py
index ed6360f9d6148..1489a60891761 100644
--- a/tests/distributed/test_pipeline_parallel.py
+++ b/tests/distributed/test_pipeline_parallel.py
@@ -166,7 +166,7 @@ def iter_params(self, model_name: str):
"microsoft/Phi-3.5-MoE-instruct": PPTestSettings.fast(trust_remote_code=True), # noqa: E501
"adept/persimmon-8b-chat": PPTestSettings.fast(),
"Qwen/Qwen-7B-Chat": PPTestSettings.fast(trust_remote_code=True),
- "Qwen/Qwen2-beta-7B-Chat": PPTestSettings.fast(),
+ "Qwen/Qwen2-7B-Instruct": PPTestSettings.fast(),
"Qwen/Qwen1.5-MoE-A2.7B-Chat": PPTestSettings.fast(),
"stabilityai/stablelm-3b-4e1t": PPTestSettings.fast(),
"bigcode/starcoder2-3b": PPTestSettings.fast(),
diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py
index 467a33505ee12..36c85e37783ab 100644
--- a/vllm/model_executor/models/falcon.py
+++ b/vllm/model_executor/models/falcon.py
@@ -27,6 +27,7 @@
from transformers import FalconConfig as HF_FalconConfig
from vllm.attention import Attention, AttentionMetadata
+from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig
from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank,
get_tensor_model_parallel_world_size,
@@ -329,6 +330,7 @@ def forward(
return output
+@support_torch_compile
class FalconModel(nn.Module):
def __init__(
diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py
index ec20cb249ba9b..497eae4e8905b 100644
--- a/vllm/model_executor/models/phi.py
+++ b/vllm/model_executor/models/phi.py
@@ -42,6 +42,7 @@
from transformers import PhiConfig
from vllm.attention import Attention, AttentionMetadata
+from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, LoRAConfig
from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
from vllm.model_executor.layers.activation import get_act_fn
@@ -193,6 +194,7 @@ def forward(
return hidden_states
+@support_torch_compile
class PhiModel(nn.Module):
def __init__(self,
diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py
index 998016ea28c26..61665768eacf5 100644
--- a/vllm/model_executor/models/qwen.py
+++ b/vllm/model_executor/models/qwen.py
@@ -20,6 +20,7 @@
from transformers import PretrainedConfig
from vllm.attention import Attention, AttentionMetadata
+from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, LoRAConfig, MultiModalConfig
from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext,
@@ -549,6 +550,7 @@ def forward(
return hidden_states, residual
+@support_torch_compile
class QWenModel(nn.Module):
def __init__(
diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py
index db1029345a8ac..db7556b3b5f4b 100644
--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
@@ -29,6 +29,7 @@
from transformers import Qwen2Config
from vllm.attention import Attention, AttentionMetadata
+from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, LoRAConfig
from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
from vllm.model_executor.layers.activation import SiluAndMul
@@ -237,6 +238,7 @@ def forward(
return hidden_states, residual
+@support_torch_compile
class Qwen2Model(nn.Module):
def __init__(
diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py
index d4475b7ca27af..dac85e35d369d 100644
--- a/vllm/model_executor/models/qwen2_moe.py
+++ b/vllm/model_executor/models/qwen2_moe.py
@@ -30,6 +30,7 @@
from transformers import PretrainedConfig
from vllm.attention import Attention, AttentionMetadata
+from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig
from vllm.distributed import (get_pp_group,
get_tensor_model_parallel_world_size,
@@ -312,6 +313,7 @@ def forward(
return hidden_states, residual
+@support_torch_compile
class Qwen2MoeModel(nn.Module):
def __init__(
From d3aa2a8b2f93f50ed40fe7d8617701a2294a13e4 Mon Sep 17 00:00:00 2001
From: Cyrus Leung
Date: Fri, 1 Nov 2024 15:34:49 +0800
Subject: [PATCH 076/113] [Doc] Update multi-input support (#9906)
---
docs/source/models/supported_models.rst | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst
index e493cebf1e9f4..80714a90df5c2 100644
--- a/docs/source/models/supported_models.rst
+++ b/docs/source/models/supported_models.rst
@@ -466,7 +466,7 @@ Text Generation
- ✅︎
* - :code:`LlavaOnevisionForConditionalGeneration`
- LLaVA-Onevision
- - T + I\ :sup:`+` + V
+ - T + I\ :sup:`+` + V\ :sup:`+`
- :code:`llava-hf/llava-onevision-qwen2-7b-ov-hf`, :code:`llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc.
-
- ✅︎
@@ -478,7 +478,7 @@ Text Generation
- ✅︎
* - :code:`MllamaForConditionalGeneration`
- Llama 3.2
- - T + I
+ - T + I\ :sup:`+`
- :code:`meta-llama/Llama-3.2-90B-Vision-Instruct`, :code:`meta-llama/Llama-3.2-11B-Vision`, etc.
-
-
From 06386a64dd706cf3fdab82510124ca2c2f9eee9d Mon Sep 17 00:00:00 2001
From: Cyrus Leung
Date: Fri, 1 Nov 2024 16:13:35 +0800
Subject: [PATCH 077/113] [Frontend] Chat-based Embeddings API (#9759)
---
docs/requirements-docs.txt | 2 +
docs/source/conf.py | 2 +-
docs/source/dev/pooling_params.rst | 5 +
docs/source/getting_started/quickstart.rst | 8 +-
docs/source/index.rst | 1 +
docs/source/models/vlm.rst | 54 ++++-
.../serving/openai_compatible_server.md | 55 ++++-
tests/entrypoints/openai/test_basic.py | 13 +-
tests/entrypoints/openai/test_embedding.py | 137 +++++++----
tests/entrypoints/openai/test_metrics.py | 14 +-
tests/entrypoints/openai/test_tokenization.py | 32 +--
.../openai/test_vision_embedding.py | 94 ++++++++
vllm/entrypoints/openai/api_server.py | 96 +++++---
vllm/entrypoints/openai/protocol.py | 87 ++++++-
vllm/entrypoints/openai/run_batch.py | 34 ++-
vllm/entrypoints/openai/serving_chat.py | 222 +++++++-----------
vllm/entrypoints/openai/serving_completion.py | 75 +++---
vllm/entrypoints/openai/serving_embedding.py | 87 ++++---
vllm/entrypoints/openai/serving_engine.py | 159 ++++++++++++-
.../openai/serving_tokenization.py | 87 +++----
vllm/pooling_params.py | 4 +-
21 files changed, 853 insertions(+), 415 deletions(-)
create mode 100644 docs/source/dev/pooling_params.rst
create mode 100644 tests/entrypoints/openai/test_vision_embedding.py
diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt
index d58f226136918..e3e35844405ac 100644
--- a/docs/requirements-docs.txt
+++ b/docs/requirements-docs.txt
@@ -13,5 +13,7 @@ torch
py-cpuinfo
transformers
mistral_common >= 1.3.4
+aiohttp
+starlette
openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
partial-json-parser # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
\ No newline at end of file
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 8435129e752e1..c7b638473a931 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -96,7 +96,6 @@ def setup(app):
# Mock out external dependencies here, otherwise the autodoc pages may be blank.
autodoc_mock_imports = [
- "aiohttp",
"compressed_tensors",
"cpuinfo",
"cv2",
@@ -143,6 +142,7 @@ def add_line(self, line: str, source: str, *lineno: int) -> None:
"python": ("https://docs.python.org/3", None),
"typing_extensions":
("https://typing-extensions.readthedocs.io/en/latest", None),
+ "aiohttp": ("https://docs.aiohttp.org/en/stable", None),
"pillow": ("https://pillow.readthedocs.io/en/stable", None),
"numpy": ("https://numpy.org/doc/stable", None),
"torch": ("https://pytorch.org/docs/stable", None),
diff --git a/docs/source/dev/pooling_params.rst b/docs/source/dev/pooling_params.rst
new file mode 100644
index 0000000000000..334e0287aff09
--- /dev/null
+++ b/docs/source/dev/pooling_params.rst
@@ -0,0 +1,5 @@
+Pooling Parameters
+==================
+
+.. autoclass:: vllm.PoolingParams
+ :members:
diff --git a/docs/source/getting_started/quickstart.rst b/docs/source/getting_started/quickstart.rst
index f0e6cddf09ef7..00b762ccc2ccb 100644
--- a/docs/source/getting_started/quickstart.rst
+++ b/docs/source/getting_started/quickstart.rst
@@ -138,10 +138,10 @@ Since this server is compatible with OpenAI API, you can use it as a drop-in rep
A more detailed client example can be found `here `__.
-OpenAI Chat API with vLLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~
+OpenAI Chat Completions API with vLLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-vLLM is designed to also support the OpenAI Chat API. The chat interface is a more dynamic, interactive way to communicate with the model, allowing back-and-forth exchanges that can be stored in the chat history. This is useful for tasks that require context or more detailed explanations.
+vLLM is designed to also support the OpenAI Chat Completions API. The chat interface is a more dynamic, interactive way to communicate with the model, allowing back-and-forth exchanges that can be stored in the chat history. This is useful for tasks that require context or more detailed explanations.
You can use the `create chat completion `_ endpoint to interact with the model:
@@ -157,7 +157,7 @@ You can use the `create chat completion `_ API,
+ Since OpenAI Vision API is based on `Chat Completions API `_,
a chat template is **required** to launch the API server.
Although Phi-3.5-Vision comes with a chat template, for other models you may have to provide one if the model's tokenizer does not come with it.
@@ -243,6 +243,10 @@ To consume the server, you can use the OpenAI client like in the example below:
A full code example can be found in `examples/openai_api_client_for_multimodal.py `_.
+.. tip::
+ There is no need to place image placeholders in the text content of the API request - they are already represented by the image content.
+ In fact, you can place image placeholders in the middle of the text by interleaving text and image content.
+
.. note::
By default, the timeout for fetching images through http url is ``5`` seconds. You can override this by setting the environment variable:
@@ -251,5 +255,49 @@ A full code example can be found in `examples/openai_api_client_for_multimodal.p
$ export VLLM_IMAGE_FETCH_TIMEOUT=
-.. note::
- There is no need to format the prompt in the API request since it will be handled by the server.
+Chat Embeddings API
+^^^^^^^^^^^^^^^^^^^
+
+vLLM's Chat Embeddings API is a superset of OpenAI's `Embeddings API `_,
+where a list of ``messages`` can be passed instead of batched ``input``. This enables multi-modal inputs to be passed to embedding models.
+
+.. tip::
+ The schema of ``messages`` is exactly the same as in Chat Completions API.
+
+In this example, we will serve the ``TIGER-Lab/VLM2Vec-Full`` model.
+
+.. code-block:: bash
+
+ vllm serve TIGER-Lab/VLM2Vec-Full --task embedding \
+ --trust-remote-code --max-model-len 4096
+
+.. important::
+
+ Since VLM2Vec has the same model architecture as Phi-3.5-Vision, we have to explicitly pass ``--task embedding``
+ to run this model in embedding mode instead of text generation mode.
+
+Since this schema is not defined by the OpenAI client, we post a request to the server using the lower-level ``requests`` library:
+
+.. code-block:: python
+
+ import requests
+
+ image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
+
+ response = requests.post(
+ "http://localhost:8000/v1/embeddings",
+ json={
+ "model": "TIGER-Lab/VLM2Vec-Full",
+ "messages": [{
+ "role": "user",
+ "content": [
+ {"type": "image_url", "image_url": {"url": image_url}},
+ {"type": "text", "text": "Represent the given image."},
+ ],
+ }],
+ "encoding_format": "float",
+ },
+ )
+ response.raise_for_status()
+ response_json = response.json()
+ print("Embedding output:", response_json["data"][0]["embedding"])
diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md
index a1f93a9a28578..0b5f75caf2475 100644
--- a/docs/source/serving/openai_compatible_server.md
+++ b/docs/source/serving/openai_compatible_server.md
@@ -26,13 +26,26 @@ print(completion.choices[0].message)
```
## API Reference
-Please see the [OpenAI API Reference](https://platform.openai.com/docs/api-reference) for more information on the API. We support all parameters except:
-- Chat: `tools`, and `tool_choice`.
-- Completions: `suffix`.
-vLLM also provides experimental support for OpenAI Vision API compatible inference. See more details in [Using VLMs](../models/vlm.rst).
+We currently support the following OpenAI APIs:
+
+- [Completions API](https://platform.openai.com/docs/api-reference/completions)
+ - *Note: `suffix` parameter is not supported.*
+- [Chat Completions API](https://platform.openai.com/docs/api-reference/chat)
+ - [Vision](https://platform.openai.com/docs/guides/vision)-related parameters are supported; see [Using VLMs](../models/vlm.rst).
+ - *Note: `image_url.detail` parameter is not supported.*
+ - We also support `audio_url` content type for audio files.
+ - Refer to [vllm.entrypoints.chat_utils](https://github.com/vllm-project/vllm/tree/main/vllm/entrypoints/chat_utils.py) for the exact schema.
+ - *TODO: Support `input_audio` content type as defined [here](https://github.com/openai/openai-python/blob/v1.52.2/src/openai/types/chat/chat_completion_content_part_input_audio_param.py).*
+ - *Note: `parallel_tool_calls` and `user` parameters are ignored.*
+- [Embeddings API](https://platform.openai.com/docs/api-reference/embeddings)
+ - Instead of `input`, you can pass in a list of `messages` (same schema as Chat Completions API),
+ which will be treated as a single prompt to the model according to its chat template.
+ - This enables multi-modal inputs to be passed to embedding models, see [Using VLMs](../models/vlm.rst).
+ - *Note: You should run `vllm serve` with `--task embedding` to ensure that the model is being run in embedding mode.*
## Extra Parameters
+
vLLM supports a set of parameters that are not part of the OpenAI API.
In order to use them, you can pass them as extra parameters in the OpenAI client.
Or directly merge them into the JSON payload if you are using HTTP call directly.
@@ -49,7 +62,26 @@ completion = client.chat.completions.create(
)
```
-### Extra Parameters for Chat API
+### Extra Parameters for Completions API
+
+The following [sampling parameters (click through to see documentation)](../dev/sampling_params.rst) are supported.
+
+```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
+:language: python
+:start-after: begin-completion-sampling-params
+:end-before: end-completion-sampling-params
+```
+
+The following extra parameters are supported:
+
+```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
+:language: python
+:start-after: begin-completion-extra-params
+:end-before: end-completion-extra-params
+```
+
+### Extra Parameters for Chat Completions API
+
The following [sampling parameters (click through to see documentation)](../dev/sampling_params.rst) are supported.
```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
@@ -66,21 +98,22 @@ The following extra parameters are supported:
:end-before: end-chat-completion-extra-params
```
-### Extra Parameters for Completions API
-The following [sampling parameters (click through to see documentation)](../dev/sampling_params.rst) are supported.
+### Extra Parameters for Embeddings API
+
+The following [pooling parameters (click through to see documentation)](../dev/pooling_params.rst) are supported.
```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
:language: python
-:start-after: begin-completion-sampling-params
-:end-before: end-completion-sampling-params
+:start-after: begin-embedding-pooling-params
+:end-before: end-embedding-pooling-params
```
The following extra parameters are supported:
```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
:language: python
-:start-after: begin-completion-extra-params
-:end-before: end-completion-extra-params
+:start-after: begin-embedding-extra-params
+:end-before: end-embedding-extra-params
```
## Chat Template
diff --git a/tests/entrypoints/openai/test_basic.py b/tests/entrypoints/openai/test_basic.py
index d3aea533b6db9..4616f363cc04a 100644
--- a/tests/entrypoints/openai/test_basic.py
+++ b/tests/entrypoints/openai/test_basic.py
@@ -1,7 +1,6 @@
from http import HTTPStatus
from typing import List
-import openai
import pytest
import pytest_asyncio
import requests
@@ -83,10 +82,8 @@ async def client(server):
indirect=True,
)
@pytest.mark.asyncio
-async def test_show_version(client: openai.AsyncOpenAI):
- base_url = str(client.base_url)[:-3].strip("/")
-
- response = requests.get(base_url + "/version")
+async def test_show_version(server: RemoteOpenAIServer):
+ response = requests.get(server.url_for("version"))
response.raise_for_status()
assert response.json() == {"version": VLLM_VERSION}
@@ -102,9 +99,7 @@ async def test_show_version(client: openai.AsyncOpenAI):
indirect=True,
)
@pytest.mark.asyncio
-async def test_check_health(client: openai.AsyncOpenAI):
- base_url = str(client.base_url)[:-3].strip("/")
-
- response = requests.get(base_url + "/health")
+async def test_check_health(server: RemoteOpenAIServer):
+ response = requests.get(server.url_for("health"))
assert response.status_code == HTTPStatus.OK
diff --git a/tests/entrypoints/openai/test_embedding.py b/tests/entrypoints/openai/test_embedding.py
index f119c6c1201c9..9f2b77dde2a7f 100644
--- a/tests/entrypoints/openai/test_embedding.py
+++ b/tests/entrypoints/openai/test_embedding.py
@@ -4,14 +4,18 @@
import openai
import pytest
import pytest_asyncio
+import requests
+
+from vllm.transformers_utils.tokenizer import get_tokenizer
from ...utils import RemoteOpenAIServer
-EMBEDDING_MODEL_NAME = "intfloat/e5-mistral-7b-instruct"
+MODEL_NAME = "intfloat/e5-mistral-7b-instruct"
+DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}""" # noqa: E501
@pytest.fixture(scope="module")
-def embedding_server():
+def server():
args = [
# use half precision for speed and memory savings in CI environment
"--dtype",
@@ -19,31 +23,29 @@ def embedding_server():
"--enforce-eager",
"--max-model-len",
"8192",
+ "--chat-template",
+ DUMMY_CHAT_TEMPLATE,
]
- with RemoteOpenAIServer(EMBEDDING_MODEL_NAME, args) as remote_server:
+ with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
@pytest_asyncio.fixture
-async def embedding_client(embedding_server):
- async with embedding_server.get_async_client() as async_client:
+async def client(server):
+ async with server.get_async_client() as async_client:
yield async_client
@pytest.mark.asyncio
-@pytest.mark.parametrize(
- "model_name",
- [EMBEDDING_MODEL_NAME],
-)
-async def test_single_embedding(embedding_client: openai.AsyncOpenAI,
- model_name: str):
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_single_embedding(client: openai.AsyncOpenAI, model_name: str):
input_texts = [
"The chef prepared a delicious meal.",
]
# test single embedding
- embeddings = await embedding_client.embeddings.create(
+ embeddings = await client.embeddings.create(
model=model_name,
input=input_texts,
encoding_format="float",
@@ -57,7 +59,7 @@ async def test_single_embedding(embedding_client: openai.AsyncOpenAI,
# test using token IDs
input_tokens = [1, 1, 1, 1, 1]
- embeddings = await embedding_client.embeddings.create(
+ embeddings = await client.embeddings.create(
model=model_name,
input=input_tokens,
encoding_format="float",
@@ -71,18 +73,14 @@ async def test_single_embedding(embedding_client: openai.AsyncOpenAI,
@pytest.mark.asyncio
-@pytest.mark.parametrize(
- "model_name",
- [EMBEDDING_MODEL_NAME],
-)
-async def test_batch_embedding(embedding_client: openai.AsyncOpenAI,
- model_name: str):
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_batch_embedding(client: openai.AsyncOpenAI, model_name: str):
# test List[str]
input_texts = [
"The cat sat on the mat.", "A feline was resting on a rug.",
"Stars twinkle brightly in the night sky."
]
- embeddings = await embedding_client.embeddings.create(
+ embeddings = await client.embeddings.create(
model=model_name,
input=input_texts,
encoding_format="float",
@@ -90,11 +88,14 @@ async def test_batch_embedding(embedding_client: openai.AsyncOpenAI,
assert embeddings.id is not None
assert len(embeddings.data) == 3
assert len(embeddings.data[0].embedding) == 4096
+ assert embeddings.usage.completion_tokens == 0
+ assert embeddings.usage.prompt_tokens == 32
+ assert embeddings.usage.total_tokens == 32
# test List[List[int]]
input_tokens = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24],
[25, 32, 64, 77]]
- embeddings = await embedding_client.embeddings.create(
+ embeddings = await client.embeddings.create(
model=model_name,
input=input_tokens,
encoding_format="float",
@@ -108,22 +109,70 @@ async def test_batch_embedding(embedding_client: openai.AsyncOpenAI,
@pytest.mark.asyncio
-@pytest.mark.parametrize(
- "model_name",
- [EMBEDDING_MODEL_NAME],
-)
-async def test_batch_base64_embedding(embedding_client: openai.AsyncOpenAI,
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_conversation_embedding(server: RemoteOpenAIServer,
+ client: openai.AsyncOpenAI,
+ model_name: str):
+ messages = [{
+ "role": "user",
+ "content": "The cat sat on the mat.",
+ }, {
+ "role": "assistant",
+ "content": "A feline was resting on a rug.",
+ }, {
+ "role": "user",
+ "content": "Stars twinkle brightly in the night sky.",
+ }]
+
+ chat_response = requests.post(server.url_for("v1/embeddings"),
+ json={
+ "model": model_name,
+ "messages": messages,
+ "encoding_format": "float",
+ })
+ chat_response.raise_for_status()
+ chat_embeddings = chat_response.json()
+
+ tokenizer = get_tokenizer(tokenizer_name=model_name, tokenizer_mode="fast")
+ prompt = tokenizer.apply_chat_template(
+ messages,
+ chat_template=DUMMY_CHAT_TEMPLATE,
+ add_generation_prompt=True,
+ continue_final_message=False,
+ tokenize=False,
+ )
+ completion_response = await client.embeddings.create(
+ model=model_name,
+ input=prompt,
+ encoding_format="float",
+ # To be consistent with chat
+ extra_body={"add_special_tokens": False},
+ )
+ completion_embeddings = completion_response.model_dump(mode="json")
+
+ assert chat_embeddings.pop("id") is not None
+ assert completion_embeddings.pop("id") is not None
+ assert chat_embeddings.pop("created") <= completion_embeddings.pop(
+ "created")
+ assert chat_embeddings == completion_embeddings
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_batch_base64_embedding(client: openai.AsyncOpenAI,
model_name: str):
input_texts = [
"Hello my name is",
"The best thing about vLLM is that it supports many different models"
]
- responses_float = await embedding_client.embeddings.create(
- input=input_texts, model=model_name, encoding_format="float")
+ responses_float = await client.embeddings.create(input=input_texts,
+ model=model_name,
+ encoding_format="float")
- responses_base64 = await embedding_client.embeddings.create(
- input=input_texts, model=model_name, encoding_format="base64")
+ responses_base64 = await client.embeddings.create(input=input_texts,
+ model=model_name,
+ encoding_format="base64")
decoded_responses_base64_data = []
for data in responses_base64.data:
@@ -137,8 +186,8 @@ async def test_batch_base64_embedding(embedding_client: openai.AsyncOpenAI,
1]
# Default response is float32 decoded from base64 by OpenAI Client
- responses_default = await embedding_client.embeddings.create(
- input=input_texts, model=model_name)
+ responses_default = await client.embeddings.create(input=input_texts,
+ model=model_name)
assert responses_float.data[0].embedding == responses_default.data[
0].embedding
@@ -147,18 +196,15 @@ async def test_batch_base64_embedding(embedding_client: openai.AsyncOpenAI,
@pytest.mark.asyncio
-@pytest.mark.parametrize(
- "model_name",
- [EMBEDDING_MODEL_NAME],
-)
-async def test_single_embedding_truncation(
- embedding_client: openai.AsyncOpenAI, model_name: str):
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_single_embedding_truncation(client: openai.AsyncOpenAI,
+ model_name: str):
input_texts = [
"Como o Brasil pode fomentar o desenvolvimento de modelos de IA?",
]
# test single embedding
- embeddings = await embedding_client.embeddings.create(
+ embeddings = await client.embeddings.create(
model=model_name,
input=input_texts,
extra_body={"truncate_prompt_tokens": 10})
@@ -173,7 +219,7 @@ async def test_single_embedding_truncation(
1, 24428, 289, 18341, 26165, 285, 19323, 283, 289, 26789, 3871, 28728,
9901, 340, 2229, 385, 340, 315, 28741, 28804, 2
]
- embeddings = await embedding_client.embeddings.create(
+ embeddings = await client.embeddings.create(
model=model_name,
input=input_tokens,
extra_body={"truncate_prompt_tokens": 10})
@@ -187,18 +233,15 @@ async def test_single_embedding_truncation(
@pytest.mark.asyncio
-@pytest.mark.parametrize(
- "model_name",
- [EMBEDDING_MODEL_NAME],
-)
-async def test_single_embedding_truncation_invalid(
- embedding_client: openai.AsyncOpenAI, model_name: str):
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_single_embedding_truncation_invalid(client: openai.AsyncOpenAI,
+ model_name: str):
input_texts = [
"Como o Brasil pode fomentar o desenvolvimento de modelos de IA?",
]
with pytest.raises(openai.BadRequestError):
- embeddings = await embedding_client.embeddings.create(
+ embeddings = await client.embeddings.create(
model=model_name,
input=input_texts,
extra_body={"truncate_prompt_tokens": 8193})
diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py
index 6cb74eb78cbf0..b3f1fea91d13e 100644
--- a/tests/entrypoints/openai/test_metrics.py
+++ b/tests/entrypoints/openai/test_metrics.py
@@ -79,9 +79,8 @@ async def client(server):
@pytest.mark.asyncio
-async def test_metrics_counts(client: openai.AsyncOpenAI):
- base_url = str(client.base_url)[:-3].strip("/")
-
+async def test_metrics_counts(server: RemoteOpenAIServer,
+ client: openai.AsyncClient):
for _ in range(_NUM_REQUESTS):
# sending a request triggers the metrics to be logged.
await client.completions.create(
@@ -89,7 +88,7 @@ async def test_metrics_counts(client: openai.AsyncOpenAI):
prompt=_TOKENIZED_PROMPT,
max_tokens=_NUM_GENERATION_TOKENS_PER_REQUEST)
- response = requests.get(base_url + "/metrics")
+ response = requests.get(server.url_for("metrics"))
print(response.text)
assert response.status_code == HTTPStatus.OK
@@ -170,16 +169,15 @@ async def test_metrics_counts(client: openai.AsyncOpenAI):
@pytest.mark.asyncio
-async def test_metrics_exist(client: openai.AsyncOpenAI):
- base_url = str(client.base_url)[:-3].strip("/")
-
+async def test_metrics_exist(server: RemoteOpenAIServer,
+ client: openai.AsyncClient):
# sending a request triggers the metrics to be logged.
await client.completions.create(model=MODEL_NAME,
prompt="Hello, my name is",
max_tokens=5,
temperature=0.0)
- response = requests.get(base_url + "/metrics")
+ response = requests.get(server.url_for("metrics"))
assert response.status_code == HTTPStatus.OK
for metric in EXPECTED_METRICS:
diff --git a/tests/entrypoints/openai/test_tokenization.py b/tests/entrypoints/openai/test_tokenization.py
index 859a676a9c777..b1956a8cbc9dc 100644
--- a/tests/entrypoints/openai/test_tokenization.py
+++ b/tests/entrypoints/openai/test_tokenization.py
@@ -1,4 +1,3 @@
-import openai # use the official client for correctness check
import pytest
import pytest_asyncio
import requests
@@ -55,9 +54,11 @@ async def client(server):
[(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
indirect=["tokenizer_name"],
)
-async def test_tokenize_completions(client: openai.AsyncOpenAI,
- model_name: str, tokenizer_name: str):
- base_url = str(client.base_url)[:-3].strip("/")
+async def test_tokenize_completions(
+ server: RemoteOpenAIServer,
+ model_name: str,
+ tokenizer_name: str,
+):
tokenizer = get_tokenizer(tokenizer_name=tokenizer_name,
tokenizer_mode="fast")
@@ -65,7 +66,7 @@ async def test_tokenize_completions(client: openai.AsyncOpenAI,
prompt = "vllm1 This is a test prompt."
tokens = tokenizer.encode(prompt, add_special_tokens=add_special)
- response = requests.post(base_url + "/tokenize",
+ response = requests.post(server.url_for("tokenize"),
json={
"add_special_tokens": add_special,
"model": model_name,
@@ -86,9 +87,11 @@ async def test_tokenize_completions(client: openai.AsyncOpenAI,
[(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
indirect=["tokenizer_name"],
)
-async def test_tokenize_chat(client: openai.AsyncOpenAI, model_name: str,
- tokenizer_name: str):
- base_url = str(client.base_url)[:-3].strip("/")
+async def test_tokenize_chat(
+ server: RemoteOpenAIServer,
+ model_name: str,
+ tokenizer_name: str,
+):
tokenizer = get_tokenizer(tokenizer_name=tokenizer_name,
tokenizer_mode="fast")
@@ -121,7 +124,7 @@ async def test_tokenize_chat(client: openai.AsyncOpenAI, model_name: str,
tokens = tokenizer.encode(prompt,
add_special_tokens=add_special)
- response = requests.post(base_url + "/tokenize",
+ response = requests.post(server.url_for("tokenize"),
json={
"add_generation_prompt":
add_generation,
@@ -146,17 +149,18 @@ async def test_tokenize_chat(client: openai.AsyncOpenAI, model_name: str,
[(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
indirect=["tokenizer_name"],
)
-async def test_detokenize(client: openai.AsyncOpenAI, model_name: str,
- tokenizer_name: str):
- base_url = str(client.base_url)[:-3].strip("/")
+async def test_detokenize(
+ server: RemoteOpenAIServer,
+ model_name: str,
+ tokenizer_name: str,
+):
tokenizer = get_tokenizer(tokenizer_name=tokenizer_name,
tokenizer_mode="fast")
prompt = "This is a test prompt. vllm1"
tokens = tokenizer.encode(prompt, add_special_tokens=False)
- print(f"CALLING {base_url} FOR {model_name}")
- response = requests.post(base_url + "/detokenize",
+ response = requests.post(server.url_for("detokenize"),
json={
"model": model_name,
"tokens": tokens
diff --git a/tests/entrypoints/openai/test_vision_embedding.py b/tests/entrypoints/openai/test_vision_embedding.py
new file mode 100644
index 0000000000000..73a69da32e434
--- /dev/null
+++ b/tests/entrypoints/openai/test_vision_embedding.py
@@ -0,0 +1,94 @@
+from typing import Dict
+
+import pytest
+import pytest_asyncio
+import requests
+
+from vllm.multimodal.utils import encode_image_base64, fetch_image
+
+from ...utils import RemoteOpenAIServer
+
+MODEL_NAME = "TIGER-Lab/VLM2Vec-Full"
+MAXIMUM_IMAGES = 2
+
+# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
+TEST_IMAGE_URLS = [
+ "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
+ "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png",
+ "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png",
+ "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
+]
+
+
+@pytest.fixture(scope="module")
+def server():
+ args = [
+ "--task",
+ "embedding",
+ "--dtype",
+ "bfloat16",
+ "--max-model-len",
+ "2048",
+ "--max-num-seqs",
+ "5",
+ "--enforce-eager",
+ "--trust-remote-code",
+ "--limit-mm-per-prompt",
+ f"image={MAXIMUM_IMAGES}",
+ ]
+
+ with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+ yield remote_server
+
+
+@pytest_asyncio.fixture
+async def client(server):
+ async with server.get_async_client() as async_client:
+ yield async_client
+
+
+@pytest.fixture(scope="session")
+def base64_encoded_image() -> Dict[str, str]:
+ return {
+ image_url: encode_image_base64(fetch_image(image_url))
+ for image_url in TEST_IMAGE_URLS
+ }
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
+async def test_image_embedding(server: RemoteOpenAIServer, model_name: str,
+ image_url: str):
+ messages = [{
+ "role":
+ "user",
+ "content": [
+ {
+ "type": "image_url",
+ "image_url": {
+ "url": image_url
+ }
+ },
+ {
+ "type": "text",
+ "text": "Represent the given image."
+ },
+ ],
+ }]
+
+ response = requests.post(server.url_for("v1/embeddings"),
+ json={
+ "model": model_name,
+ "messages": messages,
+ "encoding_format": "float"
+ })
+ response.raise_for_status()
+
+ embeddings = response.json()
+ assert embeddings["id"] is not None
+ assert len(embeddings["data"]) == 1
+ assert len(embeddings["data"][0]["embedding"]) == 3072
+ assert embeddings["usage"]["completion_tokens"] == 0
+ assert embeddings["usage"]["prompt_tokens"] == 771
+ assert embeddings["usage"]["total_tokens"] == 771
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 46c92e10b360c..95fd56d916050 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -11,7 +11,7 @@
from contextlib import asynccontextmanager
from functools import partial
from http import HTTPStatus
-from typing import AsyncIterator, Set
+from typing import AsyncIterator, Optional, Set
import uvloop
from fastapi import APIRouter, FastAPI, Request
@@ -51,7 +51,7 @@
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding
-from vllm.entrypoints.openai.serving_engine import BaseModelPath
+from vllm.entrypoints.openai.serving_engine import BaseModelPath, OpenAIServing
from vllm.entrypoints.openai.serving_tokenization import (
OpenAIServingTokenization)
from vllm.entrypoints.openai.tool_parsers import ToolParserManager
@@ -248,20 +248,25 @@ def mount_metrics(app: FastAPI):
app.routes.append(metrics_route)
-def chat(request: Request) -> OpenAIServingChat:
+def base(request: Request) -> OpenAIServing:
+ # Reuse the existing instance
+ return tokenization(request)
+
+
+def chat(request: Request) -> Optional[OpenAIServingChat]:
return request.app.state.openai_serving_chat
-def completion(request: Request) -> OpenAIServingCompletion:
+def completion(request: Request) -> Optional[OpenAIServingCompletion]:
return request.app.state.openai_serving_completion
-def tokenization(request: Request) -> OpenAIServingTokenization:
- return request.app.state.openai_serving_tokenization
+def embedding(request: Request) -> Optional[OpenAIServingEmbedding]:
+ return request.app.state.openai_serving_embedding
-def embedding(request: Request) -> OpenAIServingEmbedding:
- return request.app.state.openai_serving_embedding
+def tokenization(request: Request) -> OpenAIServingTokenization:
+ return request.app.state.openai_serving_tokenization
def engine_client(request: Request) -> EngineClient:
@@ -277,7 +282,9 @@ async def health(raw_request: Request) -> Response:
@router.post("/tokenize")
async def tokenize(request: TokenizeRequest, raw_request: Request):
- generator = await tokenization(raw_request).create_tokenize(request)
+ handler = tokenization(raw_request)
+
+ generator = await handler.create_tokenize(request)
if isinstance(generator, ErrorResponse):
return JSONResponse(content=generator.model_dump(),
status_code=generator.code)
@@ -289,7 +296,9 @@ async def tokenize(request: TokenizeRequest, raw_request: Request):
@router.post("/detokenize")
async def detokenize(request: DetokenizeRequest, raw_request: Request):
- generator = await tokenization(raw_request).create_detokenize(request)
+ handler = tokenization(raw_request)
+
+ generator = await handler.create_detokenize(request)
if isinstance(generator, ErrorResponse):
return JSONResponse(content=generator.model_dump(),
status_code=generator.code)
@@ -301,7 +310,9 @@ async def detokenize(request: DetokenizeRequest, raw_request: Request):
@router.get("/v1/models")
async def show_available_models(raw_request: Request):
- models = await completion(raw_request).show_available_models()
+ handler = base(raw_request)
+
+ models = await handler.show_available_models()
return JSONResponse(content=models.model_dump())
@@ -314,9 +325,12 @@ async def show_version():
@router.post("/v1/chat/completions")
async def create_chat_completion(request: ChatCompletionRequest,
raw_request: Request):
+ handler = chat(raw_request)
+ if handler is None:
+ return base(raw_request).create_error_response(
+ message="The model does not support Chat Completions API")
- generator = await chat(raw_request).create_chat_completion(
- request, raw_request)
+ generator = await handler.create_chat_completion(request, raw_request)
if isinstance(generator, ErrorResponse):
return JSONResponse(content=generator.model_dump(),
@@ -330,8 +344,12 @@ async def create_chat_completion(request: ChatCompletionRequest,
@router.post("/v1/completions")
async def create_completion(request: CompletionRequest, raw_request: Request):
- generator = await completion(raw_request).create_completion(
- request, raw_request)
+ handler = completion(raw_request)
+ if handler is None:
+ return base(raw_request).create_error_response(
+ message="The model does not support Completions API")
+
+ generator = await handler.create_completion(request, raw_request)
if isinstance(generator, ErrorResponse):
return JSONResponse(content=generator.model_dump(),
status_code=generator.code)
@@ -343,8 +361,12 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
@router.post("/v1/embeddings")
async def create_embedding(request: EmbeddingRequest, raw_request: Request):
- generator = await embedding(raw_request).create_embedding(
- request, raw_request)
+ handler = embedding(raw_request)
+ if handler is None:
+ return base(raw_request).create_error_response(
+ message="The model does not support Embeddings API")
+
+ generator = await handler.create_embedding(request, raw_request)
if isinstance(generator, ErrorResponse):
return JSONResponse(content=generator.model_dump(),
status_code=generator.code)
@@ -382,30 +404,26 @@ async def stop_profile(raw_request: Request):
@router.post("/v1/load_lora_adapter")
async def load_lora_adapter(request: LoadLoraAdapterRequest,
raw_request: Request):
- response = await chat(raw_request).load_lora_adapter(request)
- if isinstance(response, ErrorResponse):
- return JSONResponse(content=response.model_dump(),
- status_code=response.code)
-
- response = await completion(raw_request).load_lora_adapter(request)
- if isinstance(response, ErrorResponse):
- return JSONResponse(content=response.model_dump(),
- status_code=response.code)
+ for route in [chat, completion, embedding]:
+ handler = route(raw_request)
+ if handler is not None:
+ response = await handler.load_lora_adapter(request)
+ if isinstance(response, ErrorResponse):
+ return JSONResponse(content=response.model_dump(),
+ status_code=response.code)
return Response(status_code=200, content=response)
@router.post("/v1/unload_lora_adapter")
async def unload_lora_adapter(request: UnloadLoraAdapterRequest,
raw_request: Request):
- response = await chat(raw_request).unload_lora_adapter(request)
- if isinstance(response, ErrorResponse):
- return JSONResponse(content=response.model_dump(),
- status_code=response.code)
-
- response = await completion(raw_request).unload_lora_adapter(request)
- if isinstance(response, ErrorResponse):
- return JSONResponse(content=response.model_dump(),
- status_code=response.code)
+ for route in [chat, completion, embedding]:
+ handler = route(raw_request)
+ if handler is not None:
+ response = await handler.unload_lora_adapter(request)
+ if isinstance(response, ErrorResponse):
+ return JSONResponse(content=response.model_dump(),
+ status_code=response.code)
return Response(status_code=200, content=response)
@@ -501,7 +519,8 @@ def init_app_state(
chat_template=args.chat_template,
return_tokens_as_token_ids=args.return_tokens_as_token_ids,
enable_auto_tools=args.enable_auto_tool_choice,
- tool_parser=args.tool_call_parser)
+ tool_parser=args.tool_call_parser,
+ ) if model_config.task == "generate" else None
state.openai_serving_completion = OpenAIServingCompletion(
engine_client,
model_config,
@@ -510,13 +529,14 @@ def init_app_state(
prompt_adapters=args.prompt_adapters,
request_logger=request_logger,
return_tokens_as_token_ids=args.return_tokens_as_token_ids,
- )
+ ) if model_config.task == "generate" else None
state.openai_serving_embedding = OpenAIServingEmbedding(
engine_client,
model_config,
base_model_paths,
request_logger=request_logger,
- )
+ chat_template=args.chat_template,
+ ) if model_config.task == "embedding" else None
state.openai_serving_tokenization = OpenAIServingTokenization(
engine_client,
model_config,
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index 60fc5ac8d11d2..1335e51bd152c 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -708,7 +708,7 @@ def validate_stream_options(cls, data):
return data
-class EmbeddingRequest(OpenAIBaseModel):
+class EmbeddingCompletionRequest(OpenAIBaseModel):
# Ordered by official OpenAI API documentation
# https://platform.openai.com/docs/api-reference/embeddings
model: str
@@ -720,10 +720,15 @@ class EmbeddingRequest(OpenAIBaseModel):
# doc: begin-embedding-pooling-params
additional_data: Optional[Any] = None
-
# doc: end-embedding-pooling-params
# doc: begin-embedding-extra-params
+ add_special_tokens: bool = Field(
+ default=True,
+ description=(
+ "If true (the default), special tokens (e.g. BOS) will be added to "
+ "the prompt."),
+ )
priority: int = Field(
default=0,
description=(
@@ -737,6 +742,82 @@ def to_pooling_params(self):
return PoolingParams(additional_data=self.additional_data)
+class EmbeddingChatRequest(OpenAIBaseModel):
+ model: str
+ messages: List[ChatCompletionMessageParam]
+
+ encoding_format: Literal["float", "base64"] = "float"
+ dimensions: Optional[int] = None
+ user: Optional[str] = None
+ truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None
+
+ # doc: begin-chat-embedding-pooling-params
+ additional_data: Optional[Any] = None
+ # doc: end-chat-embedding-pooling-params
+
+ # doc: begin-chat-embedding-extra-params
+ add_generation_prompt: bool = Field(
+ default=True,
+ description=
+ ("If true, the generation prompt will be added to the chat template. "
+ "This is a parameter used by chat template in tokenizer config of the "
+ "model."),
+ )
+ continue_final_message: bool = Field(
+ default=False,
+ description=
+ ("If this is set, the chat will be formatted so that the final "
+ "message in the chat is open-ended, without any EOS tokens. The "
+ "model will continue this message rather than starting a new one. "
+ "This allows you to \"prefill\" part of the model's response for it. "
+ "Cannot be used at the same time as `add_generation_prompt`."),
+ )
+ add_special_tokens: bool = Field(
+ default=False,
+ description=(
+ "If true, special tokens (e.g. BOS) will be added to the prompt "
+ "on top of what is added by the chat template. "
+ "For most models, the chat template takes care of adding the "
+ "special tokens so this should be set to false (as is the "
+ "default)."),
+ )
+ chat_template: Optional[str] = Field(
+ default=None,
+ description=(
+ "A Jinja template to use for this conversion. "
+ "As of transformers v4.44, default chat template is no longer "
+ "allowed, so you must provide a chat template if the tokenizer "
+ "does not define one."),
+ )
+ chat_template_kwargs: Optional[Dict[str, Any]] = Field(
+ default=None,
+ description=("Additional kwargs to pass to the template renderer. "
+ "Will be accessible by the chat template."),
+ )
+ priority: int = Field(
+ default=0,
+ description=(
+ "The priority of the request (lower means earlier handling; "
+ "default: 0). Any priority other than 0 will raise an error "
+ "if the served model does not use priority scheduling."))
+ # doc: end-chat-embedding-extra-params
+
+ @model_validator(mode="before")
+ @classmethod
+ def check_generation_prompt(cls, data):
+ if data.get("continue_final_message") and data.get(
+ "add_generation_prompt"):
+ raise ValueError("Cannot set both `continue_final_message` and "
+ "`add_generation_prompt` to True.")
+ return data
+
+ def to_pooling_params(self):
+ return PoolingParams(additional_data=self.additional_data)
+
+
+EmbeddingRequest = Union[EmbeddingCompletionRequest, EmbeddingChatRequest]
+
+
class CompletionLogProbs(OpenAIBaseModel):
text_offset: List[int] = Field(default_factory=list)
token_logprobs: List[Optional[float]] = Field(default_factory=list)
@@ -799,7 +880,7 @@ class EmbeddingResponseData(OpenAIBaseModel):
class EmbeddingResponse(OpenAIBaseModel):
- id: str = Field(default_factory=lambda: f"cmpl-{random_uuid()}")
+ id: str = Field(default_factory=lambda: f"embd-{random_uuid()}")
object: str = "list"
created: int = Field(default_factory=lambda: int(time.time()))
model: str
diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py
index f5249a0c447b3..a64467a311523 100644
--- a/vllm/entrypoints/openai/run_batch.py
+++ b/vllm/entrypoints/openai/run_batch.py
@@ -217,13 +217,14 @@ async def main(args):
prompt_adapters=None,
request_logger=request_logger,
chat_template=None,
- )
+ ) if model_config.task == "generate" else None
openai_serving_embedding = OpenAIServingEmbedding(
engine,
model_config,
base_model_paths,
request_logger=request_logger,
- )
+ chat_template=None,
+ ) if model_config.task == "embedding" else None
tracker = BatchProgressTracker()
logger.info("Reading batch from %s...", args.input_file)
@@ -240,14 +241,31 @@ async def main(args):
# Determine the type of request and run it.
if request.url == "/v1/chat/completions":
- response_futures.append(
- run_request(openai_serving_chat.create_chat_completion,
- request, tracker))
+ handler_fn = (None if openai_serving_chat is None else
+ openai_serving_chat.create_chat_completion)
+ if handler_fn is None:
+ response_futures.append(
+ make_async_error_request_output(
+ request,
+ error_msg=
+ "The model does not support Chat Completions API",
+ ))
+ continue
+
+ response_futures.append(run_request(handler_fn, request, tracker))
tracker.submitted()
elif request.url == "/v1/embeddings":
- response_futures.append(
- run_request(openai_serving_embedding.create_embedding, request,
- tracker))
+ handler_fn = (None if openai_serving_embedding is None else
+ openai_serving_embedding.create_embedding)
+ if handler_fn is None:
+ response_futures.append(
+ make_async_error_request_output(
+ request,
+ error_msg="The model does not support Embeddings API",
+ ))
+ continue
+
+ response_futures.append(run_request(handler_fn, request, tracker))
tracker.submitted()
else:
response_futures.append(
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 1f951d15a7a32..9551b4f2091dd 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -10,11 +10,7 @@
from vllm.config import ModelConfig
from vllm.engine.protocol import EngineClient
-from vllm.entrypoints.chat_utils import (ConversationMessage,
- apply_hf_chat_template,
- apply_mistral_chat_template,
- load_chat_template,
- parse_chat_messages_futures)
+from vllm.entrypoints.chat_utils import ConversationMessage, load_chat_template
from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.openai.protocol import (
ChatCompletionLogProb, ChatCompletionLogProbs,
@@ -27,16 +23,12 @@
from vllm.entrypoints.openai.serving_engine import (BaseModelPath,
LoRAModulePath,
OpenAIServing,
- PromptAdapterPath,
- TextTokensPrompt)
+ PromptAdapterPath)
from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager
-from vllm.inputs import TokensPrompt
from vllm.logger import init_logger
from vllm.outputs import CompletionOutput, RequestOutput
from vllm.sampling_params import BeamSearchParams, SamplingParams
from vllm.sequence import Logprob
-from vllm.tracing import (contains_trace_headers, extract_trace_headers,
- log_tracing_disabled_warning)
from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
from vllm.utils import iterate_with_cancellation
@@ -94,12 +86,12 @@ async def create_chat_completion(
raw_request: Optional[Request] = None,
) -> Union[AsyncGenerator[str, None], ChatCompletionResponse,
ErrorResponse]:
- """Completion API similar to OpenAI's API.
+ """
+ Chat Completion API similar to OpenAI's API.
See https://platform.openai.com/docs/api-reference/chat/create
for the API specification. This API mimics the OpenAI
- ChatCompletion API.
-
+ Chat Completion API.
"""
error_check_ret = await self._check_model(request)
if error_check_ret is not None:
@@ -118,143 +110,106 @@ async def create_chat_completion(
prompt_adapter_request,
) = self._maybe_get_adapters(request)
- model_config = self.model_config
tokenizer = await self.engine_client.get_tokenizer(lora_request)
-
- conversation, mm_data_future = parse_chat_messages_futures(
- request.messages, model_config, tokenizer)
+ tool_parser = self.tool_parser
+
+ # validation for OpenAI tools
+ # tool_choice = "required" is not supported
+ if request.tool_choice == "required":
+ return self.create_error_response(
+ "tool_choice = \"required\" is not supported!")
+
+ if (request.tool_choice == "auto" and
+ not (self.enable_auto_tools and tool_parser is not None)
+ and not isinstance(tokenizer, MistralTokenizer)):
+ # for hf tokenizers, "auto" tools requires
+ # --enable-auto-tool-choice and --tool-call-parser
+ return self.create_error_response(
+ "\"auto\" tool choice requires "
+ "--enable-auto-tool-choice and --tool-call-parser to be set"
+ )
tool_dicts = None if request.tools is None else [
tool.model_dump() for tool in request.tools
]
- prompt: Union[str, List[int]]
- is_mistral_tokenizer = isinstance(tokenizer, MistralTokenizer)
- if is_mistral_tokenizer:
- prompt = apply_mistral_chat_template(
- tokenizer,
- messages=request.messages,
- chat_template=request.chat_template or self.chat_template,
- add_generation_prompt=request.add_generation_prompt,
- continue_final_message=request.continue_final_message,
- tools=tool_dicts,
- documents=request.documents,
- **(request.chat_template_kwargs or {}),
- )
- else:
- prompt = apply_hf_chat_template(
- tokenizer,
- conversation=conversation,
- chat_template=request.chat_template or self.chat_template,
- add_generation_prompt=request.add_generation_prompt,
- continue_final_message=request.continue_final_message,
- tools=tool_dicts,
- documents=request.documents,
- **(request.chat_template_kwargs or {}),
- )
- except Exception as e:
- logger.exception("Error in applying chat template from request")
- return self.create_error_response(str(e))
-
- try:
- mm_data = await mm_data_future
- except Exception as e:
- logger.exception("Error in loading multi-modal data")
+ (
+ conversation,
+ request_prompts,
+ engine_prompts,
+ ) = await self._preprocess_chat(
+ request,
+ tokenizer,
+ request.messages,
+ chat_template=request.chat_template or self.chat_template,
+ add_generation_prompt=request.add_generation_prompt,
+ continue_final_message=request.continue_final_message,
+ tool_dicts=tool_dicts,
+ documents=request.documents,
+ chat_template_kwargs=request.chat_template_kwargs,
+ tool_parser=tool_parser,
+ truncate_prompt_tokens=request.truncate_prompt_tokens,
+ add_special_tokens=request.add_special_tokens,
+ )
+ except ValueError as e:
+ logger.exception("Error in preprocessing prompt inputs")
return self.create_error_response(str(e))
- # validation for OpenAI tools
- # tool_choice = "required" is not supported
- if request.tool_choice == "required":
- return self.create_error_response(
- "tool_choice = \"required\" is not supported!")
-
- if not is_mistral_tokenizer and request.tool_choice == "auto" and not (
- self.enable_auto_tools and self.tool_parser is not None):
- # for hf tokenizers, "auto" tools requires
- # --enable-auto-tool-choice and --tool-call-parser
- return self.create_error_response(
- "\"auto\" tool choice requires "
- "--enable-auto-tool-choice and --tool-call-parser to be set")
-
- request_id = f"chat-{request.request_id}"
+ request_id = f"chatcmpl-{request.request_id}"
request_metadata = RequestResponseMetadata(request_id=request_id)
if raw_request:
raw_request.state.request_metadata = request_metadata
+ # Schedule the request and get the result generator.
+ generators: List[AsyncGenerator[RequestOutput, None]] = []
try:
- if self.enable_auto_tools and self.tool_parser:
- request = self.tool_parser(tokenizer).adjust_request(
- request=request)
-
- if isinstance(prompt, str):
- prompt_inputs = self._tokenize_prompt_input(
- request,
- tokenizer,
- prompt,
- truncate_prompt_tokens=request.truncate_prompt_tokens,
- add_special_tokens=request.add_special_tokens,
- )
- else:
- assert isinstance(prompt, list) and isinstance(
- prompt[0], int
- ), "Prompt has to be either a string or a list of token ids"
- prompt_inputs = TextTokensPrompt(
- prompt=tokenizer.decode(prompt), prompt_token_ids=prompt)
-
- assert prompt_inputs is not None
-
- sampling_params: Union[SamplingParams, BeamSearchParams]
- default_max_tokens = self.max_model_len - len(
- prompt_inputs["prompt_token_ids"])
- if request.use_beam_search:
- sampling_params = request.to_beam_search_params(
- default_max_tokens)
- else:
- sampling_params = request.to_sampling_params(
- default_max_tokens)
-
- self._log_inputs(request_id,
- prompt_inputs,
- params=sampling_params,
- lora_request=lora_request,
- prompt_adapter_request=prompt_adapter_request)
-
- engine_inputs = TokensPrompt(
- prompt_token_ids=prompt_inputs["prompt_token_ids"])
- if mm_data is not None:
- engine_inputs["multi_modal_data"] = mm_data
-
- is_tracing_enabled = (await
- self.engine_client.is_tracing_enabled())
- trace_headers = None
- if is_tracing_enabled and raw_request:
- trace_headers = extract_trace_headers(raw_request.headers)
- if (not is_tracing_enabled and raw_request
- and contains_trace_headers(raw_request.headers)):
- log_tracing_disabled_warning()
-
- if isinstance(sampling_params, BeamSearchParams):
- result_generator = self.engine_client.beam_search(
- prompt=engine_inputs,
- model_config=self.model_config,
- request_id=request_id,
- params=sampling_params,
- )
- else:
- result_generator = self.engine_client.generate(
- engine_inputs,
- sampling_params,
- request_id,
- lora_request=lora_request,
- trace_headers=trace_headers,
- prompt_adapter_request=prompt_adapter_request,
- priority=request.priority,
- )
+ for i, engine_prompt in enumerate(engine_prompts):
+ sampling_params: Union[SamplingParams, BeamSearchParams]
+ default_max_tokens = self.max_model_len - len(
+ engine_prompt["prompt_token_ids"])
+ if request.use_beam_search:
+ sampling_params = request.to_beam_search_params(
+ default_max_tokens)
+ else:
+ sampling_params = request.to_sampling_params(
+ default_max_tokens)
+
+ self._log_inputs(request_id,
+ request_prompts[i],
+ params=sampling_params,
+ lora_request=lora_request,
+ prompt_adapter_request=prompt_adapter_request)
+
+ trace_headers = (None if raw_request is None else await
+ self._get_trace_headers(raw_request.headers))
+
+ if isinstance(sampling_params, BeamSearchParams):
+ generator = self.engine_client.beam_search(
+ prompt=engine_prompt,
+ model_config=self.model_config,
+ request_id=request_id,
+ params=sampling_params,
+ )
+ else:
+ generator = self.engine_client.generate(
+ engine_prompt,
+ sampling_params,
+ request_id,
+ lora_request=lora_request,
+ trace_headers=trace_headers,
+ prompt_adapter_request=prompt_adapter_request,
+ priority=request.priority,
+ )
+
+ generators.append(generator)
except ValueError as e:
# TODO: Use a vllm-specific Validation Error
return self.create_error_response(str(e))
+ assert len(generators) == 1
+ result_generator, = generators
+
if raw_request:
result_generator = iterate_with_cancellation(
result_generator, raw_request.is_disconnected)
@@ -626,6 +581,9 @@ async def chat_completion_full_generator(
final_res = res
except asyncio.CancelledError:
return self.create_error_response("Client disconnected")
+ except ValueError as e:
+ # TODO: Use a vllm-specific Validation Error
+ return self.create_error_response(str(e))
assert final_res is not None
diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py
index da521a6012530..570232be38379 100644
--- a/vllm/entrypoints/openai/serving_completion.py
+++ b/vllm/entrypoints/openai/serving_completion.py
@@ -1,7 +1,6 @@
import asyncio
import time
-from typing import (AsyncGenerator, AsyncIterator, Callable, Dict, List,
- Optional)
+from typing import AsyncGenerator, AsyncIterator, Dict, List, Optional
from typing import Sequence as GenericSequence
from typing import Tuple, Union, cast
@@ -30,18 +29,11 @@
from vllm.outputs import RequestOutput
from vllm.sampling_params import BeamSearchParams, SamplingParams
from vllm.sequence import Logprob
-from vllm.tracing import (contains_trace_headers, extract_trace_headers,
- log_tracing_disabled_warning)
from vllm.transformers_utils.tokenizer import AnyTokenizer
from vllm.utils import merge_async_iterators, random_uuid
logger = init_logger(__name__)
-TypeTokenIDs = List[int]
-TypeTopLogProbs = List[Optional[Dict[int, float]]]
-TypeCreateLogProbsFn = Callable[
- [TypeTokenIDs, TypeTopLogProbs, Optional[int], int], CompletionLogProbs]
-
class OpenAIServingCompletion(OpenAIServing):
@@ -101,8 +93,6 @@ async def create_completion(
if raw_request:
raw_request.state.request_metadata = request_metadata
- # Schedule the request and get the result generator.
- generators: List[AsyncGenerator[RequestOutput, None]] = []
try:
(
lora_request,
@@ -111,19 +101,24 @@ async def create_completion(
tokenizer = await self.engine_client.get_tokenizer(lora_request)
- prompts = list(
- self._tokenize_prompt_input_or_inputs(
- request,
- tokenizer,
- request.prompt,
- truncate_prompt_tokens=request.truncate_prompt_tokens,
- add_special_tokens=request.add_special_tokens,
- ))
+ request_prompts, engine_prompts = self._preprocess_completion(
+ request,
+ tokenizer,
+ request.prompt,
+ truncate_prompt_tokens=request.truncate_prompt_tokens,
+ add_special_tokens=request.add_special_tokens,
+ )
+ except ValueError as e:
+ logger.exception("Error in preprocessing prompt inputs")
+ return self.create_error_response(str(e))
- for i, prompt_inputs in enumerate(prompts):
+ # Schedule the request and get the result generator.
+ generators: List[AsyncGenerator[RequestOutput, None]] = []
+ try:
+ for i, engine_prompt in enumerate(engine_prompts):
sampling_params: Union[SamplingParams, BeamSearchParams]
default_max_tokens = self.max_model_len - len(
- prompt_inputs["prompt_token_ids"])
+ engine_prompt["prompt_token_ids"])
if request.use_beam_search:
sampling_params = request.to_beam_search_params(
default_max_tokens)
@@ -134,36 +129,24 @@ async def create_completion(
request_id_item = f"{request_id}-{i}"
self._log_inputs(request_id_item,
- prompt_inputs,
+ request_prompts[i],
params=sampling_params,
lora_request=lora_request,
prompt_adapter_request=prompt_adapter_request)
- is_tracing_enabled = (await
- self.engine_client.is_tracing_enabled())
- trace_headers = None
- if is_tracing_enabled:
- trace_headers = extract_trace_headers(raw_request.headers)
- if not is_tracing_enabled and contains_trace_headers(
- raw_request.headers):
- log_tracing_disabled_warning()
+ trace_headers = (await
+ self._get_trace_headers(raw_request.headers))
if isinstance(sampling_params, BeamSearchParams):
generator = self.engine_client.beam_search(
- prompt={
- "prompt_token_ids":
- prompt_inputs["prompt_token_ids"]
- },
+ prompt=engine_prompt,
model_config=self.model_config,
request_id=request_id,
params=sampling_params,
)
else:
generator = self.engine_client.generate(
- {
- "prompt_token_ids":
- prompt_inputs["prompt_token_ids"]
- },
+ engine_prompt,
sampling_params,
request_id_item,
lora_request=lora_request,
@@ -180,6 +163,8 @@ async def create_completion(
result_generator = merge_async_iterators(
*generators, is_cancelled=raw_request.is_disconnected)
+ num_prompts = len(engine_prompts)
+
# Similar to the OpenAI API, when n != best_of, we do not stream the
# results. In addition, we do not stream the results when use
# beam search.
@@ -195,16 +180,22 @@ async def create_completion(
request_id,
created_time,
model_name,
- num_prompts=len(prompts),
+ num_prompts=num_prompts,
tokenizer=tokenizer,
request_metadata=request_metadata)
# Non-streaming response
- final_res_batch: List[Optional[RequestOutput]] = [None] * len(prompts)
+ final_res_batch: List[Optional[RequestOutput]] = [None] * num_prompts
try:
async for i, res in result_generator:
final_res_batch[i] = res
+ except asyncio.CancelledError:
+ return self.create_error_response("Client disconnected")
+ except ValueError as e:
+ # TODO: Use a vllm-specific Validation Error
+ return self.create_error_response(str(e))
+ try:
for i, final_res in enumerate(final_res_batch):
assert final_res is not None
@@ -212,7 +203,7 @@ async def create_completion(
# We did not pass it into vLLM engine to avoid being redundant
# with the inputs token IDs
if final_res.prompt is None:
- final_res.prompt = prompts[i]["prompt"]
+ final_res.prompt = request_prompts[i]["prompt"]
final_res_batch_checked = cast(List[RequestOutput],
final_res_batch)
@@ -226,8 +217,6 @@ async def create_completion(
tokenizer,
request_metadata,
)
- except asyncio.CancelledError:
- return self.create_error_response("Client disconnected")
except ValueError as e:
# TODO: Use a vllm-specific Validation Error
return self.create_error_response(str(e))
diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py
index 6c46aae2838f6..917856cd2b2dd 100644
--- a/vllm/entrypoints/openai/serving_embedding.py
+++ b/vllm/entrypoints/openai/serving_embedding.py
@@ -9,8 +9,10 @@
from vllm.config import ModelConfig
from vllm.engine.protocol import EngineClient
+from vllm.entrypoints.chat_utils import load_chat_template
from vllm.entrypoints.logger import RequestLogger
-from vllm.entrypoints.openai.protocol import (EmbeddingRequest,
+from vllm.entrypoints.openai.protocol import (EmbeddingChatRequest,
+ EmbeddingRequest,
EmbeddingResponse,
EmbeddingResponseData,
ErrorResponse, UsageInfo)
@@ -21,8 +23,6 @@
logger = init_logger(__name__)
-TypeTokenIDs = List[int]
-
def _get_embedding(
output: EmbeddingOutput,
@@ -76,6 +76,7 @@ def __init__(
base_model_paths: List[BaseModelPath],
*,
request_logger: Optional[RequestLogger],
+ chat_template: Optional[str],
):
super().__init__(engine_client=engine_client,
model_config=model_config,
@@ -83,21 +84,20 @@ def __init__(
lora_modules=None,
prompt_adapters=None,
request_logger=request_logger)
- self._enabled = self._check_embedding_mode(
- model_config.task == "embedding")
+
+ self.chat_template = load_chat_template(chat_template)
async def create_embedding(
self,
request: EmbeddingRequest,
raw_request: Optional[Request] = None,
) -> Union[EmbeddingResponse, ErrorResponse]:
- """Completion API similar to OpenAI's API.
+ """
+ Embedding API similar to OpenAI's API.
See https://platform.openai.com/docs/api-reference/embeddings/create
for the API specification. This API mimics the OpenAI Embedding API.
"""
- if not self._enabled:
- return self.create_error_response("Embedding API disabled")
error_check_ret = await self._check_model(request)
if error_check_ret is not None:
return error_check_ret
@@ -122,8 +122,6 @@ async def create_embedding(
"greater than max_model_len."
" Please, select a smaller truncation size.")
- # Schedule the request and get the result generator.
- generators: List[AsyncGenerator[EmbeddingRequestOutput, None]] = []
try:
(
lora_request,
@@ -132,32 +130,60 @@ async def create_embedding(
tokenizer = await self.engine_client.get_tokenizer(lora_request)
- pooling_params = request.to_pooling_params()
+ if prompt_adapter_request is not None:
+ raise NotImplementedError("Prompt adapter is not supported "
+ "for embedding models")
+
+ if isinstance(request, EmbeddingChatRequest):
+ (
+ _,
+ request_prompts,
+ engine_prompts,
+ ) = await self._preprocess_chat(
+ request,
+ tokenizer,
+ request.messages,
+ chat_template=request.chat_template or self.chat_template,
+ add_generation_prompt=request.add_generation_prompt,
+ continue_final_message=request.continue_final_message,
+ truncate_prompt_tokens=truncate_prompt_tokens,
+ add_special_tokens=request.add_special_tokens,
+ )
+ else:
+ request_prompts, engine_prompts = self._preprocess_completion(
+ request,
+ tokenizer,
+ request.input,
+ truncate_prompt_tokens=truncate_prompt_tokens,
+ add_special_tokens=request.add_special_tokens,
+ )
+ except ValueError as e:
+ logger.exception("Error in preprocessing prompt inputs")
+ return self.create_error_response(str(e))
- prompts = list(
- self._tokenize_prompt_input_or_inputs(request, tokenizer,
- request.input,
- truncate_prompt_tokens))
+ # Schedule the request and get the result generator.
+ generators: List[AsyncGenerator[EmbeddingRequestOutput, None]] = []
+ try:
+ pooling_params = request.to_pooling_params()
- for i, prompt_inputs in enumerate(prompts):
+ for i, engine_prompt in enumerate(engine_prompts):
request_id_item = f"{request_id}-{i}"
self._log_inputs(request_id_item,
- prompt_inputs,
+ request_prompts[i],
params=pooling_params,
lora_request=lora_request,
prompt_adapter_request=prompt_adapter_request)
- if prompt_adapter_request is not None:
- raise NotImplementedError(
- "Prompt adapter is not supported "
- "for embedding models")
+ trace_headers = (None if raw_request is None else await
+ self._get_trace_headers(raw_request.headers))
generator = self.engine_client.encode(
- {"prompt_token_ids": prompt_inputs["prompt_token_ids"]},
+ engine_prompt,
pooling_params,
request_id_item,
lora_request=lora_request,
+ trace_headers=trace_headers,
priority=request.priority,
)
@@ -171,13 +197,18 @@ async def create_embedding(
is_cancelled=raw_request.is_disconnected if raw_request else None,
)
+ num_prompts = len(engine_prompts)
+
# Non-streaming response
final_res_batch: List[Optional[EmbeddingRequestOutput]]
- final_res_batch = [None] * len(prompts)
+ final_res_batch = [None] * num_prompts
try:
async for i, res in result_generator:
final_res_batch[i] = res
+ except asyncio.CancelledError:
+ return self.create_error_response("Client disconnected")
+ try:
for final_res in final_res_batch:
assert final_res is not None
@@ -187,18 +218,8 @@ async def create_embedding(
response = request_output_to_embedding_response(
final_res_batch_checked, request_id, created_time, model_name,
encoding_format)
- except asyncio.CancelledError:
- return self.create_error_response("Client disconnected")
except ValueError as e:
# TODO: Use a vllm-specific Validation Error
return self.create_error_response(str(e))
return response
-
- def _check_embedding_mode(self, embedding_mode: bool) -> bool:
- if not embedding_mode:
- logger.warning(
- "embedding_mode is False. Embedding API will not work.")
- else:
- logger.info("Activating the server engine with embedding enabled.")
- return embedding_mode
diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py
index 22a01b3dc4cc0..e7aeac8f8c018 100644
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -2,28 +2,38 @@
import pathlib
from dataclasses import dataclass
from http import HTTPStatus
-from typing import Iterable, Iterator, List, Optional, Tuple, TypedDict, Union
+from typing import (Any, Callable, Dict, Iterable, Iterator, List, Mapping,
+ Optional, Sequence, Tuple, TypedDict, Union)
from pydantic import Field
+from starlette.datastructures import Headers
from typing_extensions import Annotated
from vllm.config import ModelConfig
from vllm.engine.protocol import EngineClient
+from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam,
+ ConversationMessage,
+ apply_hf_chat_template,
+ apply_mistral_chat_template,
+ parse_chat_messages_futures)
from vllm.entrypoints.logger import RequestLogger
# yapf conflicts with isort for this block
# yapf: disable
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
CompletionRequest,
DetokenizeRequest,
- EmbeddingRequest, ErrorResponse,
+ EmbeddingChatRequest,
+ EmbeddingCompletionRequest,
+ ErrorResponse,
LoadLoraAdapterRequest,
ModelCard, ModelList,
ModelPermission,
TokenizeChatRequest,
TokenizeCompletionRequest,
- TokenizeRequest,
UnloadLoraAdapterRequest)
+from vllm.entrypoints.openai.tool_parsers import ToolParser
# yapf: enable
+from vllm.inputs import TokensPrompt
from vllm.inputs.parse import parse_and_batch_prompt
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
@@ -31,8 +41,10 @@
from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.sampling_params import BeamSearchParams, SamplingParams
from vllm.sequence import Logprob
-from vllm.transformers_utils.tokenizer import AnyTokenizer
-from vllm.utils import AtomicCounter
+from vllm.tracing import (contains_trace_headers, extract_trace_headers,
+ log_tracing_disabled_warning)
+from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
+from vllm.utils import AtomicCounter, is_list_of
logger = init_logger(__name__)
@@ -56,8 +68,14 @@ class LoRAModulePath:
base_model_name: Optional[str] = None
-AnyRequest = Union[ChatCompletionRequest, CompletionRequest, DetokenizeRequest,
- EmbeddingRequest, TokenizeRequest]
+CompletionLikeRequest = Union[CompletionRequest, DetokenizeRequest,
+ EmbeddingCompletionRequest,
+ TokenizeCompletionRequest]
+
+ChatLikeRequest = Union[ChatCompletionRequest, EmbeddingChatRequest,
+ TokenizeChatRequest]
+
+AnyRequest = Union[CompletionLikeRequest, ChatLikeRequest]
class TextTokensPrompt(TypedDict):
@@ -65,6 +83,9 @@ class TextTokensPrompt(TypedDict):
prompt_token_ids: List[int]
+RequestPrompt = Union[List[int], str, TextTokensPrompt]
+
+
class OpenAIServing:
def __init__(
@@ -246,7 +267,8 @@ def _validate_input(
token_num = len(input_ids)
# Note: EmbeddingRequest doesn't have max_tokens
- if isinstance(request, EmbeddingRequest):
+ if isinstance(request,
+ (EmbeddingChatRequest, EmbeddingCompletionRequest)):
if token_num > self.max_model_len:
raise ValueError(
f"This model's maximum context length is "
@@ -373,10 +395,115 @@ def _tokenize_prompt_input_or_inputs(
truncate_prompt_tokens=truncate_prompt_tokens,
)
+ def _preprocess_completion(
+ self,
+ request: CompletionLikeRequest,
+ tokenizer: AnyTokenizer,
+ input_or_inputs: Union[str, List[str], List[int], List[List[int]]],
+ truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None,
+ add_special_tokens: bool = True,
+ ) -> Tuple[Sequence[TextTokensPrompt], List[TokensPrompt]]:
+ request_prompts = [
+ request_prompt
+ for request_prompt in self._tokenize_prompt_input_or_inputs(
+ request,
+ tokenizer,
+ input_or_inputs,
+ truncate_prompt_tokens=truncate_prompt_tokens,
+ add_special_tokens=add_special_tokens,
+ )
+ ]
+
+ engine_prompts = [
+ TokensPrompt(prompt_token_ids=request_prompt["prompt_token_ids"])
+ for request_prompt in request_prompts
+ ]
+
+ return request_prompts, engine_prompts
+
+ async def _preprocess_chat(
+ self,
+ request: ChatLikeRequest,
+ tokenizer: AnyTokenizer,
+ messages: List[ChatCompletionMessageParam],
+ chat_template: Optional[str] = None,
+ add_generation_prompt: bool = True,
+ continue_final_message: bool = False,
+ tool_dicts: Optional[List[Dict[str, Any]]] = None,
+ documents: Optional[List[Dict[str, str]]] = None,
+ chat_template_kwargs: Optional[Dict[str, Any]] = None,
+ tool_parser: Optional[Callable[[AnyTokenizer], ToolParser]] = None,
+ truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None,
+ add_special_tokens: bool = False,
+ ) -> Tuple[List[ConversationMessage], Sequence[RequestPrompt],
+ List[TokensPrompt]]:
+ conversation, mm_data_future = parse_chat_messages_futures(
+ messages,
+ self.model_config,
+ tokenizer,
+ )
+
+ request_prompt: Union[str, List[int]]
+ is_mistral_tokenizer = isinstance(tokenizer, MistralTokenizer)
+ if is_mistral_tokenizer:
+ request_prompt = apply_mistral_chat_template(
+ tokenizer,
+ messages=messages,
+ chat_template=chat_template,
+ add_generation_prompt=add_generation_prompt,
+ continue_final_message=continue_final_message,
+ tools=tool_dicts,
+ documents=documents,
+ **(chat_template_kwargs or {}),
+ )
+ else:
+ request_prompt = apply_hf_chat_template(
+ tokenizer,
+ conversation=conversation,
+ chat_template=chat_template,
+ add_generation_prompt=add_generation_prompt,
+ continue_final_message=continue_final_message,
+ tools=tool_dicts,
+ documents=documents,
+ **(chat_template_kwargs or {}),
+ )
+
+ mm_data = await mm_data_future
+
+ if tool_parser is not None:
+ if not isinstance(request, ChatCompletionRequest):
+ msg = "Tool usage is only supported for Chat Completions API"
+ raise NotImplementedError(msg)
+
+ request = tool_parser(tokenizer).adjust_request(request=request)
+
+ if isinstance(request_prompt, str):
+ prompt_inputs = self._tokenize_prompt_input(
+ request,
+ tokenizer,
+ request_prompt,
+ truncate_prompt_tokens=truncate_prompt_tokens,
+ add_special_tokens=add_special_tokens,
+ )
+ else:
+ # For MistralTokenizer
+ assert is_list_of(request_prompt, int), (
+ "Prompt has to be either a string or a list of token ids")
+ prompt_inputs = TextTokensPrompt(
+ prompt=tokenizer.decode(request_prompt),
+ prompt_token_ids=request_prompt)
+
+ engine_prompt = TokensPrompt(
+ prompt_token_ids=prompt_inputs["prompt_token_ids"])
+ if mm_data is not None:
+ engine_prompt["multi_modal_data"] = mm_data
+
+ return conversation, [request_prompt], [engine_prompt]
+
def _log_inputs(
self,
request_id: str,
- inputs: Union[str, List[int], TextTokensPrompt],
+ inputs: RequestPrompt,
params: Optional[Union[SamplingParams, PoolingParams,
BeamSearchParams]],
lora_request: Optional[LoRARequest],
@@ -404,6 +531,20 @@ def _log_inputs(
prompt_adapter_request=prompt_adapter_request,
)
+ async def _get_trace_headers(
+ self,
+ headers: Headers,
+ ) -> Optional[Mapping[str, str]]:
+ is_tracing_enabled = await self.engine_client.is_tracing_enabled()
+
+ if is_tracing_enabled:
+ return extract_trace_headers(headers)
+
+ if contains_trace_headers(headers):
+ log_tracing_disabled_warning()
+
+ return None
+
@staticmethod
def _get_decoded_token(logprob: Logprob,
token_id: int,
diff --git a/vllm/entrypoints/openai/serving_tokenization.py b/vllm/entrypoints/openai/serving_tokenization.py
index a269c94c7ec0d..1fd82304f7a4d 100644
--- a/vllm/entrypoints/openai/serving_tokenization.py
+++ b/vllm/entrypoints/openai/serving_tokenization.py
@@ -2,10 +2,7 @@
from vllm.config import ModelConfig
from vllm.engine.protocol import EngineClient
-from vllm.entrypoints.chat_utils import (apply_hf_chat_template,
- apply_mistral_chat_template,
- load_chat_template,
- parse_chat_messages_futures)
+from vllm.entrypoints.chat_utils import load_chat_template
from vllm.entrypoints.logger import RequestLogger
# yapf conflicts with isort for this block
# yapf: disable
@@ -20,7 +17,6 @@
LoRAModulePath,
OpenAIServing)
from vllm.logger import init_logger
-from vllm.transformers_utils.tokenizer import MistralTokenizer
from vllm.utils import random_uuid
logger = init_logger(__name__)
@@ -62,59 +58,51 @@ async def create_tokenize(
request_id = f"tokn-{random_uuid()}"
- (
- lora_request,
- prompt_adapter_request,
- ) = self._maybe_get_adapters(request)
-
- tokenizer = await self.engine_client.get_tokenizer(lora_request)
-
- prompt: Union[str, List[int]]
- if isinstance(request, TokenizeChatRequest):
- model_config = self.model_config
-
- conversation, mm_data_future = parse_chat_messages_futures(
- request.messages, model_config, tokenizer)
-
- mm_data = await mm_data_future
- if mm_data:
- logger.warning(
- "Multi-modal inputs are ignored during tokenization")
-
- if isinstance(tokenizer, MistralTokenizer):
- prompt = apply_mistral_chat_template(
+ try:
+ (
+ lora_request,
+ prompt_adapter_request,
+ ) = self._maybe_get_adapters(request)
+
+ tokenizer = await self.engine_client.get_tokenizer(lora_request)
+
+ if isinstance(request, TokenizeChatRequest):
+ (
+ _,
+ request_prompts,
+ engine_prompts,
+ ) = await self._preprocess_chat(
+ request,
tokenizer,
- messages=request.messages,
+ request.messages,
chat_template=self.chat_template,
add_generation_prompt=request.add_generation_prompt,
continue_final_message=request.continue_final_message,
+ add_special_tokens=request.add_special_tokens,
)
else:
- prompt = apply_hf_chat_template(
+ request_prompts, engine_prompts = self._preprocess_completion(
+ request,
tokenizer,
- conversation=conversation,
- chat_template=self.chat_template,
- add_generation_prompt=request.add_generation_prompt,
- continue_final_message=request.continue_final_message,
+ request.prompt,
+ add_special_tokens=request.add_special_tokens,
)
- else:
- prompt = request.prompt
+ except ValueError as e:
+ logger.exception("Error in preprocessing prompt inputs")
+ return self.create_error_response(str(e))
- self._log_inputs(request_id,
- prompt,
- params=None,
- lora_request=lora_request,
- prompt_adapter_request=prompt_adapter_request)
+ input_ids: List[int] = []
+ for i, engine_prompt in enumerate(engine_prompts):
+ self._log_inputs(request_id,
+ request_prompts[i],
+ params=None,
+ lora_request=lora_request,
+ prompt_adapter_request=prompt_adapter_request)
- # Silently ignore prompt adapter since it does not affect tokenization
+ # Silently ignore prompt adapter since it does not affect
+ # tokenization (Unlike in Embeddings API where an error is raised)
- prompt_input = self._tokenize_prompt_input(
- request,
- tokenizer,
- prompt,
- add_special_tokens=request.add_special_tokens,
- )
- input_ids = prompt_input["prompt_token_ids"]
+ input_ids.extend(engine_prompt["prompt_token_ids"])
return TokenizeResponse(tokens=input_ids,
count=len(input_ids),
@@ -143,9 +131,8 @@ async def create_detokenize(
lora_request=lora_request,
prompt_adapter_request=prompt_adapter_request)
- if prompt_adapter_request is not None:
- raise NotImplementedError("Prompt adapter is not supported "
- "for tokenization")
+ # Silently ignore prompt adapter since it does not affect tokenization
+ # (Unlike in Embeddings API where an error is raised)
prompt_input = self._tokenize_prompt_input(
request,
diff --git a/vllm/pooling_params.py b/vllm/pooling_params.py
index 7461fb51989c6..2635c0bccd1c4 100644
--- a/vllm/pooling_params.py
+++ b/vllm/pooling_params.py
@@ -7,7 +7,7 @@ class PoolingParams(
msgspec.Struct,
omit_defaults=True, # type: ignore[call-arg]
array_like=True): # type: ignore[call-arg]
- """Pooling parameters for pooling.
+ """Pooling parameters for embeddings API.
Attributes:
additional_data: Any additional data needed for pooling.
@@ -16,7 +16,7 @@ class PoolingParams(
def clone(self) -> "PoolingParams":
"""Returns a deep copy of the PoolingParams instance."""
- return PoolingParams(additional_data=self.additional_data, )
+ return PoolingParams(additional_data=self.additional_data)
def __repr__(self) -> str:
return (f"PoolingParams("
From 30a2e8074246e11a1452ab5e84a7be65ecac6119 Mon Sep 17 00:00:00 2001
From: Michael Goin
Date: Fri, 1 Nov 2024 09:55:29 -0400
Subject: [PATCH 078/113] [CI/Build] Add Model Tests for PixtralHF (#9813)
---
tests/models/decoder_only/vision_language/test_models.py | 9 +++++++++
1 file changed, 9 insertions(+)
diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py
index d738647c91b66..e49ea6f98324d 100644
--- a/tests/models/decoder_only/vision_language/test_models.py
+++ b/tests/models/decoder_only/vision_language/test_models.py
@@ -291,6 +291,15 @@
# vllm_output_post_proc=model_utils.phi3v_vllm_to_hf_output,
# num_logprobs=10,
# ),
+ "pixtral_hf": VLMTestInfo(
+ models=["nm-testing/pixtral-12b-FP8-dynamic"],
+ test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
+ prompt_formatter=lambda img_prompt: f"[INST]{img_prompt}[/INST]",
+ img_idx_to_prompt=lambda idx: "[IMG]",
+ max_model_len=8192,
+ max_num_seqs=2,
+ auto_cls=AutoModelForVision2Seq,
+ ),
"qwen": VLMTestInfo(
models=["Qwen/Qwen-VL"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
From ba0d8920742597269745f3551eb97b1b19f5e582 Mon Sep 17 00:00:00 2001
From: Cyrus Leung
Date: Fri, 1 Nov 2024 22:09:07 +0800
Subject: [PATCH 079/113] [Frontend] Use a proper chat template for VLM2Vec
(#9912)
---
docs/source/models/vlm.rst | 14 +++++---
..._chat_completion_client_for_multimodal.py} | 0
...ai_chat_embedding_client_for_multimodal.py | 33 +++++++++++++++++++
examples/template_vlm2vec.jinja | 16 +++++++++
.../openai/test_vision_embedding.py | 11 +++++--
vllm/entrypoints/chat_utils.py | 15 ++++++---
6 files changed, 78 insertions(+), 11 deletions(-)
rename examples/{openai_api_client_for_multimodal.py => openai_chat_completion_client_for_multimodal.py} (100%)
create mode 100644 examples/openai_chat_embedding_client_for_multimodal.py
create mode 100644 examples/template_vlm2vec.jinja
diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst
index ac6405b9807a8..3377502a6db28 100644
--- a/docs/source/models/vlm.rst
+++ b/docs/source/models/vlm.rst
@@ -240,8 +240,7 @@ To consume the server, you can use the OpenAI client like in the example below:
)
print("Chat completion output:", chat_response.choices[0].message.content)
-
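-A full code example can be found in `examples/openai_api_client_for_multimodal.py <https://github.com/vllm-project/vllm/blob/main/examples/openai_api_client_for_multimodal.py>`_.
+A full code example can be found in `examples/openai_chat_completion_client_for_multimodal.py <https://github.com/vllm-project/vllm/blob/main/examples/openai_chat_completion_client_for_multimodal.py>`_.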
.. tip::
There is no need to place image placeholders in the text content of the API request - they are already represented by the image content.
@@ -269,14 +268,19 @@ In this example, we will serve the ``TIGER-Lab/VLM2Vec-Full`` model.
.. code-block:: bash
vllm serve TIGER-Lab/VLM2Vec-Full --task embedding \
- --trust-remote-code --max-model-len 4096
+ --trust-remote-code --max-model-len 4096 --chat-template examples/template_vlm2vec.jinja
.. important::
Since VLM2Vec has the same model architecture as Phi-3.5-Vision, we have to explicitly pass ``--task embedding``
to run this model in embedding mode instead of text generation mode.
-Since this schema is not defined by OpenAI client, we post a request to the server using the lower-level ``requests`` library:
+.. important::
+
+ VLM2Vec does not expect chat-based input. We use a `custom chat template <https://github.com/vllm-project/vllm/blob/main/examples/template_vlm2vec.jinja>`_
+ to combine the text and images together.
+
+Since the request schema is not defined by OpenAI client, we post a request to the server using the lower-level ``requests`` library:
.. code-block:: python
@@ -301,3 +305,5 @@ Since this schema is not defined by OpenAI client, we post a request to the serv
response.raise_for_status()
response_json = response.json()
print("Embedding output:", response_json["data"][0]["embedding"])
+
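+A full code example can be found in `examples/openai_chat_embedding_client_for_multimodal.py <https://github.com/vllm-project/vllm/blob/main/examples/openai_chat_embedding_client_for_multimodal.py>`_.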
diff --git a/examples/openai_api_client_for_multimodal.py b/examples/openai_chat_completion_client_for_multimodal.py
similarity index 100%
rename from examples/openai_api_client_for_multimodal.py
rename to examples/openai_chat_completion_client_for_multimodal.py
diff --git a/examples/openai_chat_embedding_client_for_multimodal.py b/examples/openai_chat_embedding_client_for_multimodal.py
new file mode 100644
index 0000000000000..effb588e1387f
--- /dev/null
+++ b/examples/openai_chat_embedding_client_for_multimodal.py
@@ -0,0 +1,33 @@
+import requests
+
+image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
+
+response = requests.post(
+ "http://localhost:8000/v1/embeddings",
+ json={
+ "model":
+ "TIGER-Lab/VLM2Vec-Full",
+ "messages": [{
+ "role":
+ "user",
+ "content": [
+ {
+ "type": "image_url",
+ "image_url": {
+ "url": image_url
+ }
+ },
+ {
+ "type": "text",
+ "text": "Represent the given image."
+ },
+ ],
+ }],
+ "encoding_format":
+ "float",
+ },
+)
+response.raise_for_status()
+response_json = response.json()
+
+print("Embedding output:", response_json["data"][0]["embedding"])
diff --git a/examples/template_vlm2vec.jinja b/examples/template_vlm2vec.jinja
new file mode 100644
index 0000000000000..489b99604af38
--- /dev/null
+++ b/examples/template_vlm2vec.jinja
@@ -0,0 +1,16 @@
+{%- if messages | length > 1 -%}
+ {{ raise_exception('Embedding models should only embed one message at a time') }}
+{%- endif -%}
+
+{% set vars = namespace(parts=[], next_image_id=1) %}
+{%- for message in messages -%}
+ {%- for content in message['content'] -%}
+ {%- if content['type'] == 'text' -%}
+ {%- set vars.parts = vars.parts + [content['text']] %}
+ {%- elif content['type'] == 'image' -%}
+ {%- set vars.parts = vars.parts + ['<|image_{i:d}|>'.format(i=vars.next_image_id)] %}
+ {%- set vars.next_image_id = vars.next_image_id + 1 %}
+ {%- endif -%}
+ {%- endfor -%}
+{%- endfor -%}
+{{ vars.parts | join(' ') }}
diff --git a/tests/entrypoints/openai/test_vision_embedding.py b/tests/entrypoints/openai/test_vision_embedding.py
index 73a69da32e434..d0c43b47bf0af 100644
--- a/tests/entrypoints/openai/test_vision_embedding.py
+++ b/tests/entrypoints/openai/test_vision_embedding.py
@@ -6,11 +6,14 @@
from vllm.multimodal.utils import encode_image_base64, fetch_image
-from ...utils import RemoteOpenAIServer
+from ...utils import VLLM_PATH, RemoteOpenAIServer
MODEL_NAME = "TIGER-Lab/VLM2Vec-Full"
MAXIMUM_IMAGES = 2
+vlm2vec_jinja_path = VLLM_PATH / "examples/template_vlm2vec.jinja"
+assert vlm2vec_jinja_path.exists()
+
# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
TEST_IMAGE_URLS = [
"https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
@@ -35,6 +38,8 @@ def server():
"--trust-remote-code",
"--limit-mm-per-prompt",
f"image={MAXIMUM_IMAGES}",
+ "--chat-template",
+ str(vlm2vec_jinja_path),
]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
@@ -90,5 +95,5 @@ async def test_image_embedding(server: RemoteOpenAIServer, model_name: str,
assert len(embeddings["data"]) == 1
assert len(embeddings["data"][0]["embedding"]) == 3072
assert embeddings["usage"]["completion_tokens"] == 0
- assert embeddings["usage"]["prompt_tokens"] == 771
- assert embeddings["usage"]["total_tokens"] == 771
+ assert embeddings["usage"]["prompt_tokens"] == 762
+ assert embeddings["usage"]["total_tokens"] == 762
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index ce36f20760f4c..bc2de2d162473 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -156,6 +156,10 @@ def __init__(self, model_config: ModelConfig, tokenizer: AnyTokenizer):
self._items: List[_T] = []
+ @property
+ def model_config(self) -> ModelConfig:
+ return self._model_config
+
@staticmethod
@lru_cache(maxsize=None)
def _cached_token_str(tokenizer: AnyTokenizer, token_index: int) -> str:
@@ -491,10 +495,13 @@ def _parse_chat_message_content_parts(
content: List[Union[str, Dict[str, str]]] = []
mm_parser = mm_tracker.create_parser()
- wrap_dicts = \
- mm_tracker._model_config.hf_config.model_type in \
- MODEL_KEEP_MULTI_MODAL_CONTENT or \
- (chat_template_text_format == "openai")
+ model_config = mm_tracker.model_config
+
+ wrap_dicts = (chat_template_text_format == "openai"
+ or (model_config.task == "embedding"
+ and model_config.is_multimodal_model)
+ or (model_config.hf_config.model_type
+ in MODEL_KEEP_MULTI_MODAL_CONTENT))
for part in parts:
parse_res = _parse_chat_message_content_part(
From 1dd4cb2935fc3fff9c156b5772d18e0a0d1861f0 Mon Sep 17 00:00:00 2001
From: Travis Johnson
Date: Fri, 1 Nov 2024 11:33:15 -0600
Subject: [PATCH 080/113] [Bugfix] Fix edge cases for MistralTokenizer (#9625)
Signed-off-by: Travis Johnson
Signed-off-by: Prashant Gupta
Co-authored-by: Prashant Gupta
Co-authored-by: Patrick von Platen
---
tests/tokenization/test_detokenize.py | 80 +++++++++++++++----
vllm/transformers_utils/tokenizers/mistral.py | 64 ++++++++++-----
2 files changed, 105 insertions(+), 39 deletions(-)
diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py
index f4551ed42efb8..1d07885349409 100644
--- a/tests/tokenization/test_detokenize.py
+++ b/tests/tokenization/test_detokenize.py
@@ -1,4 +1,4 @@
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, Generator, List, Optional
import pytest
from transformers import AutoTokenizer
@@ -7,11 +7,17 @@
from vllm.transformers_utils.detokenizer import (Detokenizer,
detokenize_incrementally)
from vllm.transformers_utils.tokenizer_group import get_tokenizer_group
+from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer
TRUTH = [
"Hello here, this is a simple test",
"vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. It is designed to be used in production environments, where inference and serving", # noqa
- "我很感谢你的热情"
+ "我很感谢你的热情",
+ # Burmese text triggers an edge-case for Mistral's V3-Tekken tokenizer (eg.
+ # for mistralai/Pixtral-12B-2409) where tokens may map to bytes with
+ # incomplete UTF-8 characters
+ # see https://github.com/vllm-project/vllm/pull/9625
+ "ပုံပြင်လေးပြောပြပါ်",
]
TOKENIZERS = [
"facebook/opt-125m",
@@ -24,6 +30,7 @@
"tiiuae/falcon-7b",
"meta-llama/Llama-2-7b-hf",
"codellama/CodeLlama-7b-hf",
+ "mistralai/Pixtral-12B-2409",
]
@@ -49,15 +56,55 @@ def _run_incremental_decode(tokenizer, all_input_ids,
return decoded_text
+@pytest.fixture
+def tokenizer(tokenizer_name):
+ return (MistralTokenizer.from_pretrained(tokenizer_name)
+ if "mistral" in tokenizer_name else
+ AutoTokenizer.from_pretrained(tokenizer_name))
+
+
+@pytest.mark.parametrize("tokenizer_name", ["mistralai/Pixtral-12B-2409"])
+@pytest.mark.parametrize(
+ "truth",
+ [
+ # Burmese text triggers an edge-case where tokens may map to bytes with
+ # incomplete UTF-8 characters
+ "ပုံပြင်လေးပြောပြပါ",
+ # Using "URGENCY" since "CY" has token id 130282
+ "URGENCY🌶️",
+ ])
+def test_mistral_edge_case(tokenizer, truth):
+ """Test for specific edge cases with V3-Tekken MistralTokenizer.
+
+ See https://github.com/vllm-project/vllm/pull/9625
+ """
+ starting_index = 0
+ all_input_ids = tokenizer(truth, add_special_tokens=False).input_ids
+
+ decoded_text = _run_incremental_decode(tokenizer,
+ all_input_ids,
+ skip_special_tokens=True,
+ starting_index=starting_index)
+ assert decoded_text == truth
+
+
+@pytest.fixture
+def skip_special_tokens(request, tokenizer_name) -> Generator[bool, Any, None]:
+ if "mistral" in tokenizer_name:
+ yield (
+ bool(True) if request.param else
+ pytest.skip("mistral doesn't support skip_special_tokens=False"))
+ else:
+ yield bool(True) if request.param else bool(False)
+
+
@pytest.mark.parametrize("truth", TRUTH)
@pytest.mark.parametrize("with_prompt", [True, False])
-@pytest.mark.parametrize("tokenizer_id", TOKENIZERS)
-@pytest.mark.parametrize("skip_special_tokens", (True, False))
-def test_decode_streaming(tokenizer_id, truth, with_prompt,
- skip_special_tokens):
- tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
+@pytest.mark.parametrize("tokenizer_name", TOKENIZERS)
+@pytest.mark.parametrize("skip_special_tokens", (True, False), indirect=True)
+def test_decode_streaming(tokenizer, truth, with_prompt, skip_special_tokens):
if with_prompt:
- truth_tokens = tokenizer(truth, add_special_tokens=False)["input_ids"]
+ truth_tokens = tokenizer(truth, add_special_tokens=False).input_ids
prompt_input_ids = truth_tokens[:len(truth) // 2]
generated_input_ids = truth_tokens[len(truth) // 2:]
all_input_ids = prompt_input_ids + generated_input_ids
@@ -68,7 +115,7 @@ def test_decode_streaming(tokenizer_id, truth, with_prompt,
else:
generated = truth
starting_index = 0
- all_input_ids = tokenizer(truth, add_special_tokens=False)["input_ids"]
+ all_input_ids = tokenizer(truth, add_special_tokens=False).input_ids
if skip_special_tokens:
if tokenizer.bos_token_id is not None:
all_input_ids = [tokenizer.bos_token_id] + all_input_ids
@@ -98,7 +145,7 @@ def detokenizer(tokenizer_name: str) -> Detokenizer:
enable_lora=False,
max_num_seqs=100,
max_input_length=None,
- tokenizer_mode="auto",
+ tokenizer_mode="mistral" if "mistral" in tokenizer_name else "auto",
trust_remote_code=False,
revision=None,
)
@@ -113,9 +160,8 @@ def detokenizer(tokenizer_name: str) -> Detokenizer:
@pytest.fixture(name="complete_sequence_token_ids")
def create_complete_sequence_token_ids(complete_sequence: str,
- tokenizer_name: str) -> List[int]:
- tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
- complete_sequence_token_ids = tokenizer(complete_sequence)["input_ids"]
+ tokenizer) -> List[int]:
+ complete_sequence_token_ids = tokenizer(complete_sequence).input_ids
return complete_sequence_token_ids
@@ -150,7 +196,7 @@ def create_dummy_prompt_logprobs(
@pytest.mark.parametrize("complete_sequence", TRUTH)
@pytest.mark.parametrize("tokenizer_name", TOKENIZERS)
-@pytest.mark.parametrize("skip_special_tokens", [True, False])
+@pytest.mark.parametrize("skip_special_tokens", [True, False], indirect=True)
def test_decode_sequence_logprobs(complete_sequence: str,
complete_sequence_token_ids: List[int],
detokenizer: Detokenizer,
@@ -208,9 +254,9 @@ def test_decode_prompt_logprobs(complete_sequence_token_ids: List[int],
# decoded_prompt_logprobs doesn't contain the first token.
token_ids = complete_sequence_token_ids
- tokenzier = detokenizer.get_tokenizer_for_seq(seq)
- text_full = tokenzier.decode(token_ids, skip_special_tokens=True)
- text_first = tokenzier.decode(token_ids[0], skip_special_tokens=True)
+ tokenizer = detokenizer.get_tokenizer_for_seq(seq)
+ text_full = tokenizer.decode(token_ids, skip_special_tokens=True)
+ text_first = tokenizer.decode(token_ids[0], skip_special_tokens=True)
text = text_full[len(text_first):]
# Text for logprobs for the chosen token should be the same as the
diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py
index 80e21c2d32ecc..896f70bc1dafd 100644
--- a/vllm/transformers_utils/tokenizers/mistral.py
+++ b/vllm/transformers_utils/tokenizers/mistral.py
@@ -16,9 +16,13 @@
from mistral_common.tokens.tokenizers.tekken import (SpecialTokenPolicy,
Tekkenizer)
+from vllm.logger import init_logger
+
if TYPE_CHECKING:
from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
+logger = init_logger(__name__)
+
@dataclass
class Encoding:
@@ -72,20 +76,21 @@ def __init__(self, tokenizer: PublicMistralTokenizer) -> None:
# Make sure special tokens will not raise
tokenizer_.special_token_policy = SpecialTokenPolicy.IGNORE
- self._vocab = {
- token: idx
- for idx, token in enumerate(tokenizer_.vocab())
- }
elif isinstance(tokenizer_, SentencePieceTokenizer):
- self._vocab = {
- token: idx
- for idx, token in enumerate(tokenizer_.vocab())
- }
+ pass
else:
raise TypeError(f"Unsupported tokenizer: {type(tokenizer_)}")
+ self._vocab = tokenizer_.vocab()
+ # Convert to a Dict[str, int] to match protocol, but this is a lossy
+ # conversion. There may be multiple token ids that decode to the same
+ # string due to partial UTF-8 byte sequences being converted to �
+ self._vocab_dict = {
+ token: idx
+ for idx, token in enumerate(self._vocab)
+ }
self.tokenizer = tokenizer_
- self._max_token_id = max(self._vocab.values())
+ self._max_token_id = self.vocab_size - 1
@classmethod
def from_pretrained(cls,
@@ -182,7 +187,9 @@ def __call__(
return Encoding(input_ids=input_ids)
def get_vocab(self) -> Dict[str, int]:
- return self._vocab
+ # NB: the dictionary form of the vocabulary collapses token ids that map
+ # to the same string but have different bytes
+ return self._vocab_dict
def get_added_vocab(self) -> Dict[str, int]:
# Mistral tokenizers have no added vocabulary
@@ -220,14 +227,20 @@ def convert_tokens_to_string(self, tokens: List[str]) -> str:
if any(isinstance(t, bytes) for t in tokens):
# we need to encode and decode all tokens again
shift = self.tokenizer.num_special_tokens
- byte_tokens = [
- t.encode("utf-8") if not isinstance(t, bytes) else t
- for t in tokens
- ]
- ids = [
- self.tokenizer._tekken_token2id_nospecial[t] + shift
- for t in byte_tokens
- ]
+
+ def _token_to_id(t: str):
+ t_bytes = t.encode("utf-8") \
+ if not isinstance(t, bytes) else t
+ try:
+ return shift + \
+ self.tokenizer._tekken_token2id_nospecial[t_bytes]
+ except KeyError:
+ logger.warning(
+ "Failed to convert token %s to id,"
+ " replacing with <unk>", t_bytes)
+ return self.tokenizer.unk_id
+
+ ids = [_token_to_id(t) for t in tokens]
decoded = self.tokenizer.decode(ids)
else:
decoded = "".join(tokens)
@@ -236,7 +249,13 @@ def convert_tokens_to_string(self, tokens: List[str]) -> str:
return decoded
- def decode(self, ids: Union[List[int], int]) -> str:
+ def decode(self,
+ ids: Union[List[int], int],
+ skip_special_tokens: bool = True) -> str:
+ assert (
+ skip_special_tokens
+ ), "Skipping special tokens is not supported for Mistral tokenizers."
+
if isinstance(ids, int):
ids = [ids]
return self.tokenizer.decode(ids)
@@ -257,10 +276,11 @@ def convert_ids_to_tokens(
tokens = [self.tokenizer.id_to_piece(id) for id in ids]
- if any(t.strip() == "�" for t in tokens):
- # if any stripped decoded token is undefined
- # because it's invalid unicode then pass bytes
+ if any("�" in t for t in tokens):
+ # if a decoded token contains the replacement character, then the
+ # token has an incomplete UTF-8 character so we must use bytes
# See: https://github.com/vllm-project/vllm/pull/8640
+ # https://github.com/vllm-project/vllm/pull/9625
tokens = [self.tokenizer.id_to_byte_piece(id) for id in ids]
return tokens
From 4581d2cc02f655e76233f9cb129f07c6b65d39f4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andr=C3=A9=20Jonasson?=
Date: Fri, 1 Nov 2024 19:41:38 +0100
Subject: [PATCH 081/113] [Core] Refactor: Clean up unused argument in
Scheduler._preempt (#9696)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Signed-off-by: André Jonasson
---
vllm/core/scheduler.py | 11 +++--------
1 file changed, 3 insertions(+), 8 deletions(-)
diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py
index 88733b8f53b86..e35c05f4fe7f7 100644
--- a/vllm/core/scheduler.py
+++ b/vllm/core/scheduler.py
@@ -828,8 +828,7 @@ def _schedule_priority_preemption(
num_running_seqs)
#Preempt out the victim sequence group
- self._preempt(vseq_group, blocks_to_swap_out,
- PreemptionMode.RECOMPUTE)
+ self._preempt(vseq_group, blocks_to_swap_out)
waiting_queue.appendleft(vseq_group)
force_preemption_count += 1
#Put the sequence back into the waiting queue
@@ -1451,12 +1450,8 @@ def _append_slots(self,
if len(cows) > 0:
blocks_to_copy.extend(cows)
- def _preempt(
- self,
- seq_group: SequenceGroup,
- blocks_to_swap_out: List[Tuple[int, int]],
- preemption_mode: Optional[PreemptionMode] = None,
- ) -> PreemptionMode:
+ def _preempt(self, seq_group: SequenceGroup,
+ blocks_to_swap_out: List[Tuple[int, int]]) -> PreemptionMode:
# If preemption mode is not specified, we determine the mode as follows:
# We use recomputation by default since it incurs lower overhead than
# swapping. However, when the sequence group has multiple sequences
From aff1fd81881bf29f82ad6ba55b301828764cd120 Mon Sep 17 00:00:00 2001
From: youkaichao
Date: Fri, 1 Nov 2024 11:50:37 -0700
Subject: [PATCH 082/113] [torch.compile] use interpreter with stable api from
pytorch (#9889)
Signed-off-by: youkaichao
---
vllm/compilation/backends.py | 165 +++++++++++++++++++----------------
1 file changed, 89 insertions(+), 76 deletions(-)
diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
index 10cf49e19eccc..96ddcba467c5b 100644
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -243,6 +243,65 @@ def split_graph(graph: fx.GraphModule,
return split_gm, outputs
+# we share the global graph pool among all the backends
+global_graph_pool = None
+
+
+class PiecewiseCompileInterpreter(torch.fx.Interpreter):
+ """Code adapted from `torch.fx.passes.shape_prop.ShapeProp`.
+ It runs the given graph with fake inputs, and compiles the
+ submodules specified by `compile_submod_names` with the given
+ compilation configs.
+ """
+
+ def __init__(self, module: torch.fx.GraphModule,
+ compile_submod_names: List[str],
+ compilation_configs: CompilationConfig, graph_pool):
+ super().__init__(module)
+ from torch._guards import detect_fake_mode
+ self.fake_mode = detect_fake_mode()
+ self.compile_submod_names = compile_submod_names
+ self.compilation_configs = compilation_configs
+ self.graph_pool = graph_pool
+ self.have_seen_first_graph = False
+
+ def run(self, *args):
+ fake_args = [
+ self.fake_mode.from_tensor(t) if isinstance(t, torch.Tensor) else t
+ for t in args
+ ]
+ return super().run(*fake_args)
+
+ def call_module(self, target: torch.fx.node.Target,
+ args: Tuple[torch.fx.node.Argument,
+ ...], kwargs: Dict[str, Any]) -> Any:
+ assert isinstance(target, str)
+ output = super().call_module(target, args, kwargs)
+
+ if target in self.compile_submod_names:
+ submod = self.fetch_attr(target)
+ sym_shape_indices = [
+ i for i, x in enumerate(args) if isinstance(x, torch.SymInt)
+ ]
+ compiled_graph_for_general_shape = wrap_inductor(
+ submod,
+ args,
+ self.compilation_configs.inductor_compile_config,
+ runtime_shape=None,
+ do_logging=not self.have_seen_first_graph,
+ use_inductor=self.compilation_configs.use_inductor)
+
+ self.module.__dict__[target] = PiecewiseBackend(
+ submod, self.compilation_configs, self.graph_pool,
+ not self.have_seen_first_graph, sym_shape_indices,
+ compiled_graph_for_general_shape)
+
+ self.have_seen_first_graph = True
+ compilation_counter.num_piecewise_capturable_graphs_seen += 1
+
+ return output
+
+
class VllmBackend:
"""The compilation backend for `torch.compile` with VLLM.
It is used for compilation level of `CompilationLevel.PIECEWISE`,
@@ -263,8 +322,14 @@ class VllmBackend:
returned_callable: Callable
def __init__(self, ):
- # every instance of VllmBackend has its own graph pool
- self.graph_pool = torch.cuda.graph_pool_handle()
+ global global_graph_pool
+ if global_graph_pool is None:
+ global_graph_pool = torch.cuda.graph_pool_handle()
+
+ # TODO: in the future, if we want to use multiple
+ # streams, it might not be safe to share a global pool.
+ # only investigate this when we use multiple streams
+ self.graph_pool = global_graph_pool
# `torch.compile` is JIT compiled, so we don't need to
# do anything here
@@ -286,55 +351,26 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable:
self.split_gm, self.piecewise_graphs = split_graph(
graph, self.compilation_configs.non_cudagraph_ops)
- returned_callable: Callable # type: ignore
+ from torch._dynamo.utils import lazy_format_graph_code
+ logger.debug("%s",
+ lazy_format_graph_code("stiching module", self.split_gm))
- if len(self.piecewise_graphs) == 0:
- compilation_counter.num_piecewise_graphs_seen += 1
- compilation_counter.num_piecewise_capturable_graphs_seen += 1
- returned_callable = PiecewiseBackend(graph,
- self.compilation_configs,
- self.graph_pool,
- is_first_graph=True)
- else:
- from torch._dynamo.utils import lazy_format_graph_code
- logger.debug(
- "%s", lazy_format_graph_code("stiching module", self.split_gm))
-
- is_first_graph = True
-
- for item in self.piecewise_graphs:
- compilation_counter.num_piecewise_graphs_seen += 1
- compilation_counter.num_piecewise_capturable_graphs_seen += not item.is_splitting_graph # noqa
- if not item.is_splitting_graph:
- # cannot setattr to a module, so we need to set
- # the attribute in the __dict__
- self.split_gm.__dict__[
- item.submod_name] = PiecewiseBackend(
- item.graph, self.compilation_configs,
- self.graph_pool, is_first_graph)
- is_first_graph = False
- returned_callable = self.split_gm
-
- self.returned_callable = returned_callable
- # trigger the first compilation
- # code borrowed from https://github.com/pytorch/pytorch/blob/4e3e08b71171fa34172b2362ff668553fac75f27/torch/_dynamo/backends/distributed.py#L206 # noqa
- # to turn the inputs into fake tensors
- import torch._guards
- from torch._guards import detect_fake_mode
- fake_mode = detect_fake_mode(example_inputs)
- fake_args = []
- for arg in example_inputs:
- if isinstance(arg, torch.Tensor) and not isinstance(
- arg, torch._subclasses.FakeTensor):
- fake_args.append(
- torch._dynamo.utils.to_fake_tensor(arg, fake_mode))
- else:
- fake_args.append(arg)
- self.returned_callable(*fake_args)
+ compilation_counter.num_piecewise_graphs_seen += len(
+ self.piecewise_graphs)
+ submod_names_to_compile = [
+ item.submod_name for item in self.piecewise_graphs
+ if not item.is_splitting_graph
+ ]
+
+ # propagate the split graph to the piecewise backend,
+ # compile submodules with symbolic shapes
+ PiecewiseCompileInterpreter(self.split_gm, submod_names_to_compile,
+ self.compilation_configs,
+ self.graph_pool).run(*example_inputs)
self._called = True
- return self.returned_callable
+ return self.split_gm
@dataclasses.dataclass
@@ -352,11 +388,10 @@ class ConcreteSizeEntry:
class PiecewiseBackend:
- def __init__(self,
- graph: fx.GraphModule,
- compilation_configs: CompilationConfig,
- graph_pool: Any,
- is_first_graph: bool = False):
+ def __init__(self, graph: fx.GraphModule,
+ compilation_configs: CompilationConfig, graph_pool: Any,
+ is_first_graph: bool, sym_shape_indices: List[int],
+ compiled_graph_for_general_shape: Callable):
"""
The backend for piecewise compilation.
It mainly handles the compilation and cudagraph capturing.
@@ -381,12 +416,11 @@ def __init__(self,
self.compilation_configs.capture_sizes
) if self.compilation_configs.use_cudagraph else set()
- self.compile_finished = False
self.first_run_finished = False
- self.compiled_graph_for_general_shape: Callable = None # type: ignore
+ self.compiled_graph_for_general_shape = compiled_graph_for_general_shape # noqa
- self.sym_shape_indices: List[int] = []
+ self.sym_shape_indices = sym_shape_indices
# the entries for different shapes that we need to either
# compile or capture cudagraph
@@ -399,27 +433,6 @@ def __init__(self,
)
def __call__(self, *args) -> Any:
-
- if not self.compile_finished:
- self.compile_finished = True
-
- # this is the first compilation, we will compile a graph with
- # dynamic shape, as the caller will mark first dimension as dynamic
-
- self.sym_shape_indices = [
- i for i, x in enumerate(args) if isinstance(x, torch.SymInt)
- ]
-
- self.compiled_graph_for_general_shape = wrap_inductor(
- self.graph,
- args,
- self.compilation_configs.inductor_compile_config,
- runtime_shape=None,
- do_logging=self.is_first_graph,
- use_inductor=self.compilation_configs.use_inductor)
-
- return self.graph(*args)
-
if not self.first_run_finished:
self.first_run_finished = True
return self.compiled_graph_for_general_shape(*args)
From 598b6d7b070149aae5884aa8b17a0c91c93172f5 Mon Sep 17 00:00:00 2001
From: Pavani Majety
Date: Fri, 1 Nov 2024 12:15:05 -0700
Subject: [PATCH 083/113] [Bugfix/Core] Flashinfer k_scale and v_scale (#9861)
---
tests/kernels/test_cache.py | 21 ++++++++++++-------
vllm/attention/backends/flashinfer.py | 9 +++++---
.../layers/quantization/modelopt.py | 7 +++++--
3 files changed, 25 insertions(+), 12 deletions(-)
diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py
index 5b8311a33c361..e2b4778b94b9e 100644
--- a/tests/kernels/test_cache.py
+++ b/tests/kernels/test_cache.py
@@ -258,19 +258,20 @@ def test_reshape_and_cache_flash(
del key_caches
del value_caches
+ k_scale = key.amax().item() / 256
+ v_scale = value.amax().item() / 256
+
# Clone the KV caches.
if kv_cache_dtype == "fp8":
cloned_key_cache = torch.empty_like(key_cache, dtype=torch.float16)
- ops.convert_fp8(cloned_key_cache, key_cache)
+ ops.convert_fp8(cloned_key_cache, key_cache, k_scale, kv_cache_dtype)
cloned_value_cache = torch.empty_like(value_cache, dtype=torch.float16)
- ops.convert_fp8(cloned_value_cache, value_cache)
+ ops.convert_fp8(cloned_value_cache, value_cache, v_scale,
+ kv_cache_dtype)
else:
cloned_key_cache = key_cache.clone()
cloned_value_cache = value_cache.clone()
- # Using default kv_scale
- k_scale = v_scale = 1.0
-
# Call the reshape_and_cache kernel.
opcheck(torch.ops._C_cache_ops.reshape_and_cache_flash,
(key, value, key_cache, value_cache, slot_mapping, kv_cache_dtype,
@@ -281,9 +282,15 @@ def test_reshape_and_cache_flash(
if kv_cache_dtype == "fp8":
result_key_cache = torch.empty_like(key_cache, dtype=torch.float16)
- ops.convert_fp8(result_key_cache, key_cache)
+ ops.convert_fp8(result_key_cache,
+ key_cache,
+ k_scale,
+ kv_dtype=kv_cache_dtype)
result_value_cache = torch.empty_like(value_cache, dtype=torch.float16)
- ops.convert_fp8(result_value_cache, value_cache)
+ ops.convert_fp8(result_value_cache,
+ value_cache,
+ v_scale,
+ kv_dtype=kv_cache_dtype)
# Run the reference implementation.
block_indicies = torch.div(slot_mapping, block_size, rounding_mode="floor")
diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py
index 234c87d5c4edb..658805d35be0a 100644
--- a/vllm/attention/backends/flashinfer.py
+++ b/vllm/attention/backends/flashinfer.py
@@ -759,8 +759,6 @@ def forward(
v_scale: float = 1.0,
attn_type: AttentionType = AttentionType.DECODER,
) -> torch.Tensor:
- assert k_scale == 1.0 and v_scale == 1.0, (
- "key/v_scale is not supported in FlashInfer.")
if attn_type != AttentionType.DECODER:
raise NotImplementedError("Encoder self-attention and "
"encoder/decoder cross-attention "
@@ -874,7 +872,12 @@ def unified_flash_infer(
assert prefill_meta is not None
assert prefill_meta.prefill_wrapper is not None
prefill_output = prefill_meta.prefill_wrapper.forward(
- query, kv_cache, logits_soft_cap=logits_soft_cap, causal=True)
+ query,
+ kv_cache,
+ logits_soft_cap=logits_soft_cap,
+ causal=True,
+ k_scale=k_scale,
+ v_scale=v_scale)
if decode_meta := attn_metadata.decode_metadata:
assert attn_metadata.decode_metadata is not None
assert attn_metadata.decode_metadata.decode_wrapper is not None
diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py
index dc5f47eb9b0fb..9694f2b8208e2 100644
--- a/vllm/model_executor/layers/quantization/modelopt.py
+++ b/vllm/model_executor/layers/quantization/modelopt.py
@@ -141,8 +141,11 @@ def create_weights(
layer.register_parameter("input_scale", scale)
def process_weights_after_loading(self, layer: Module) -> None:
- max_w_scale, weight = requantize_with_max_scale(
- layer.weight, layer.weight_scale, layer.logical_widths)
+ weight = layer.weight
+ max_w_scale = layer.weight_scale.max()
+ if not (layer.weight_scale == layer.weight_scale[0]).all():
+ max_w_scale, weight = requantize_with_max_scale(
+ layer.weight, layer.weight_scale, layer.logical_widths)
layer.weight = Parameter(weight.t(), requires_grad=False)
layer.weight_scale = Parameter(max_w_scale, requires_grad=False)
layer.input_scale = Parameter(layer.input_scale.max(),
From 18bd7587b78b3b9868fea29d59ae8c3600c3e5a5 Mon Sep 17 00:00:00 2001
From: youkaichao
Date: Fri, 1 Nov 2024 13:51:57 -0700
Subject: [PATCH 084/113] [1/N] pass the complete config from engine to
executor (#9933)
Signed-off-by: youkaichao
---
vllm/engine/async_llm_engine.py | 2 +-
vllm/engine/llm_engine.py | 50 +++++++++------------
vllm/engine/multiprocessing/engine.py | 7 +--
vllm/executor/executor_base.py | 37 ++++++----------
vllm/executor/xpu_executor.py | 44 ++++---------------
vllm/v1/engine/llm_engine.py | 62 +++++++++------------------
6 files changed, 65 insertions(+), 137 deletions(-)
diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index 5198467a6ac40..6aeaf484a22b4 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -680,7 +680,7 @@ def from_engine_args(
# Create the async LLM engine.
engine = cls(
- **engine_config.to_dict(),
+ vllm_config=engine_config,
executor_class=executor_class,
log_requests=not engine_args.disable_log_requests,
log_stats=not engine_args.disable_log_stats,
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index edef1f30a9e91..e6fe1effb8287 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -13,11 +13,8 @@
from typing_extensions import TypeIs, TypeVar
import vllm.envs as envs
-from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig,
- EngineConfig, LoadConfig, LoRAConfig, ModelConfig,
- ObservabilityConfig, ParallelConfig,
- PromptAdapterConfig, SchedulerConfig,
- SpeculativeConfig)
+from vllm.config import (DecodingConfig, EngineConfig, LoRAConfig, ModelConfig,
+ ObservabilityConfig, ParallelConfig, SchedulerConfig)
from vllm.core.scheduler import (ScheduledSequenceGroup, Scheduler,
SchedulerOutputs)
from vllm.engine.arg_utils import EngineArgs
@@ -222,17 +219,7 @@ def validate_outputs(
def __init__(
self,
- model_config: ModelConfig,
- cache_config: CacheConfig,
- parallel_config: ParallelConfig,
- scheduler_config: SchedulerConfig,
- device_config: DeviceConfig,
- load_config: LoadConfig,
- lora_config: Optional[LoRAConfig],
- speculative_config: Optional[SpeculativeConfig],
- decoding_config: Optional[DecodingConfig],
- observability_config: Optional[ObservabilityConfig],
- prompt_adapter_config: Optional[PromptAdapterConfig],
+ vllm_config: EngineConfig,
executor_class: Type[ExecutorBase],
log_stats: bool,
usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
@@ -240,6 +227,22 @@ def __init__(
input_registry: InputRegistry = INPUT_REGISTRY,
use_cached_outputs: bool = False,
) -> None:
+
+ # TODO: remove the local variables and use self.* throughout the class.
+ model_config = self.model_config = vllm_config.model_config
+ cache_config = self.cache_config = vllm_config.cache_config
+ lora_config = self.lora_config = vllm_config.lora_config
+ parallel_config = self.parallel_config = vllm_config.parallel_config
+ scheduler_config = self.scheduler_config = vllm_config.scheduler_config
+ device_config = self.device_config = vllm_config.device_config
+ speculative_config = self.speculative_config = vllm_config.speculative_config # noqa
+ load_config = self.load_config = vllm_config.load_config
+ decoding_config = self.decoding_config = vllm_config.decoding_config or DecodingConfig( # noqa
+ )
+ prompt_adapter_config = self.prompt_adapter_config = vllm_config.prompt_adapter_config # noqa
+ observability_config = self.observability_config = vllm_config.observability_config or ObservabilityConfig( # noqa
+ )
+
logger.info(
"Initializing an LLM engine (v%s) with config: "
"model=%r, speculative_config=%r, tokenizer=%r, "
@@ -340,18 +343,7 @@ def get_tokenizer_for_seq(sequence: Sequence) -> AnyTokenizer:
self.input_processor = input_registry.create_input_processor(
model_config)
- self.model_executor = executor_class(
- model_config=model_config,
- cache_config=cache_config,
- parallel_config=parallel_config,
- scheduler_config=scheduler_config,
- device_config=device_config,
- lora_config=lora_config,
- speculative_config=speculative_config,
- load_config=load_config,
- prompt_adapter_config=prompt_adapter_config,
- observability_config=self.observability_config,
- )
+ self.model_executor = executor_class(vllm_config=vllm_config, )
if self.model_config.task != "embedding":
self._initialize_kv_caches()
@@ -582,7 +574,7 @@ def from_engine_args(
executor_class = cls._get_executor_cls(engine_config)
# Create the LLM engine.
engine = cls(
- **engine_config.to_dict(),
+ vllm_config=engine_config,
executor_class=executor_class,
log_stats=not engine_args.disable_log_stats,
usage_context=usage_context,
diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py
index 0a7f430eca488..eb1512ca17822 100644
--- a/vllm/engine/multiprocessing/engine.py
+++ b/vllm/engine/multiprocessing/engine.py
@@ -7,8 +7,6 @@
import zmq
from vllm import AsyncEngineArgs, SamplingParams
-from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig,
- ParallelConfig, SchedulerConfig)
# yapf conflicts with isort for this block
# yapf: disable
from vllm.engine.multiprocessing import (ENGINE_DEAD_ERROR, IPC_DATA_EXT,
@@ -30,9 +28,6 @@
else:
from vllm.engine.llm_engine import LLMEngine
-CONFIG_TYPE = Union[ModelConfig, DecodingConfig, ParallelConfig,
- SchedulerConfig, LoRAConfig]
-
logger = init_logger(__name__)
POLLING_TIMEOUT_MS = 10000
@@ -130,7 +125,7 @@ def from_engine_args(cls, engine_args: AsyncEngineArgs,
return cls(ipc_path=ipc_path,
use_async_sockets=use_async_sockets,
- **engine_config.to_dict(),
+ vllm_config=engine_config,
executor_class=executor_class,
log_requests=not engine_args.disable_log_requests,
log_stats=not engine_args.disable_log_stats,
diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py
index c96cb0f2c2981..2248eecd1849f 100644
--- a/vllm/executor/executor_base.py
+++ b/vllm/executor/executor_base.py
@@ -1,10 +1,7 @@
from abc import ABC, abstractmethod
from typing import List, Optional, Set, Tuple
-from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
- ModelConfig, ObservabilityConfig, ParallelConfig,
- PromptAdapterConfig, SchedulerConfig,
- SpeculativeConfig)
+from vllm.config import EngineConfig
from vllm.lora.request import LoRARequest
from vllm.model_executor.layers.sampler import SamplerOutput
from vllm.prompt_adapter.request import PromptAdapterRequest
@@ -23,27 +20,19 @@ class ExecutorBase(ABC):
def __init__(
self,
- model_config: ModelConfig,
- cache_config: CacheConfig,
- parallel_config: ParallelConfig,
- scheduler_config: SchedulerConfig,
- device_config: DeviceConfig,
- load_config: LoadConfig,
- lora_config: Optional[LoRAConfig],
- speculative_config: Optional[SpeculativeConfig],
- prompt_adapter_config: Optional[PromptAdapterConfig],
- observability_config: Optional[ObservabilityConfig],
+ vllm_config: EngineConfig,
) -> None:
- self.model_config = model_config
- self.cache_config = cache_config
- self.lora_config = lora_config
- self.load_config = load_config
- self.parallel_config = parallel_config
- self.scheduler_config = scheduler_config
- self.device_config = device_config
- self.speculative_config = speculative_config
- self.prompt_adapter_config = prompt_adapter_config
- self.observability_config = observability_config
+ self.vllm_config = vllm_config
+ self.model_config = vllm_config.model_config
+ self.cache_config = vllm_config.cache_config
+ self.lora_config = vllm_config.lora_config
+ self.load_config = vllm_config.load_config
+ self.parallel_config = vllm_config.parallel_config
+ self.scheduler_config = vllm_config.scheduler_config
+ self.device_config = vllm_config.device_config
+ self.speculative_config = vllm_config.speculative_config
+ self.prompt_adapter_config = vllm_config.prompt_adapter_config
+ self.observability_config = vllm_config.observability_config
self._init_executor()
@abstractmethod
diff --git a/vllm/executor/xpu_executor.py b/vllm/executor/xpu_executor.py
index 5f78993ddc4b4..36b7e2265efab 100644
--- a/vllm/executor/xpu_executor.py
+++ b/vllm/executor/xpu_executor.py
@@ -2,10 +2,7 @@
import torch
-from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
- ModelConfig, ObservabilityConfig, ParallelConfig,
- PromptAdapterConfig, SchedulerConfig,
- SpeculativeConfig)
+from vllm.config import ModelConfig, ParallelConfig
from vllm.executor.executor_base import ExecutorAsyncBase
from vllm.executor.gpu_executor import GPUExecutor
from vllm.logger import init_logger
@@ -21,38 +18,13 @@ class XPUExecutor(GPUExecutor):
uses_ray: bool = False
- def __init__(
- self,
- model_config: ModelConfig,
- cache_config: CacheConfig,
- parallel_config: ParallelConfig,
- scheduler_config: SchedulerConfig,
- device_config: DeviceConfig,
- load_config: LoadConfig,
- lora_config: Optional[LoRAConfig],
- prompt_adapter_config: Optional[PromptAdapterConfig],
- speculative_config: Optional[SpeculativeConfig],
- observability_config: Optional[ObservabilityConfig],
- ) -> None:
- assert device_config.device_type == "xpu"
- assert (not speculative_config
- ), "Speculative decoding not yet supported for XPU backend"
-
- model_config = _verify_and_get_model_config(model_config)
-
- self.model_config = model_config
- self.cache_config = cache_config
- self.load_config = load_config
- self.lora_config = lora_config
- self.parallel_config = _verify_and_get_parallel_config(parallel_config)
- self.scheduler_config = scheduler_config
- self.device_config = device_config
- self.prompt_adapter_config = prompt_adapter_config
- self.speculative_config = None
- self.observability_config = observability_config
-
- # Instantiate the worker and load the model to GPU.
- self._init_executor()
+ def _init_executor(self) -> None:
+ assert self.device_config.device_type == "xpu"
+ assert self.speculative_config is None, (
+ "Speculative decoding not yet supported for XPU backend")
+
+ self.model_config = _verify_and_get_model_config(self.model_config)
+ GPUExecutor._init_executor(self)
def _get_worker_module_and_class(
self) -> Tuple[str, str, Optional[Callable[[], Type[WorkerBase]]]]:
diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py
index 072e52bcd686a..febabd2f31036 100644
--- a/vllm/v1/engine/llm_engine.py
+++ b/vllm/v1/engine/llm_engine.py
@@ -2,11 +2,8 @@
from typing import (Any, Dict, Iterable, List, Mapping, Optional, Tuple, Type,
Union)
-from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig,
- EngineConfig, LoadConfig, LoRAConfig, ModelConfig,
- ObservabilityConfig, ParallelConfig,
- PromptAdapterConfig, SchedulerConfig,
- SpeculativeConfig)
+from vllm.config import (DecodingConfig, EngineConfig, LoRAConfig, ModelConfig,
+ ObservabilityConfig, ParallelConfig, SchedulerConfig)
from vllm.engine.arg_utils import EngineArgs
from vllm.engine.metrics_types import StatLoggerBase
from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs,
@@ -35,17 +32,7 @@ class LLMEngine:
def __init__(
self,
- model_config: ModelConfig,
- cache_config: CacheConfig,
- parallel_config: ParallelConfig,
- scheduler_config: SchedulerConfig,
- device_config: DeviceConfig,
- load_config: LoadConfig,
- lora_config: Optional[LoRAConfig],
- speculative_config: Optional[SpeculativeConfig],
- decoding_config: Optional[DecodingConfig],
- observability_config: Optional[ObservabilityConfig],
- prompt_adapter_config: Optional[PromptAdapterConfig],
+ vllm_config: EngineConfig,
executor_class: Type[GPUExecutor],
log_stats: bool,
usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
@@ -53,6 +40,22 @@ def __init__(
input_registry: InputRegistry = INPUT_REGISTRY,
use_cached_outputs: bool = False,
) -> None:
+
+ # TODO: remove the local variables and use self.* throughout the class.
+ model_config = self.model_config = vllm_config.model_config
+ cache_config = self.cache_config = vllm_config.cache_config
+ lora_config = self.lora_config = vllm_config.lora_config
+ parallel_config = self.parallel_config = vllm_config.parallel_config
+ scheduler_config = self.scheduler_config = vllm_config.scheduler_config
+ device_config = self.device_config = vllm_config.device_config
+ speculative_config = self.speculative_config = vllm_config.speculative_config # noqa
+ load_config = self.load_config = vllm_config.load_config
+ decoding_config = self.decoding_config = vllm_config.decoding_config or DecodingConfig( # noqa
+ )
+ prompt_adapter_config = self.prompt_adapter_config = vllm_config.prompt_adapter_config # noqa
+ observability_config = self.observability_config = vllm_config.observability_config or ObservabilityConfig( # noqa
+ )
+
# Override the configs for V1.
# FIXME
if usage_context == UsageContext.LLM_CLASS:
@@ -112,18 +115,6 @@ def __init__(
model_config.mm_processor_kwargs,
)
- self.model_config = model_config
- self.cache_config = cache_config
- self.lora_config = lora_config
- self.parallel_config = parallel_config
- self.scheduler_config = scheduler_config
- self.device_config = device_config
- self.speculative_config = speculative_config
- self.load_config = load_config
- self.decoding_config = decoding_config or DecodingConfig()
- self.prompt_adapter_config = prompt_adapter_config
- self.observability_config = observability_config or ObservabilityConfig(
- )
self.log_stats = log_stats
assert not self.model_config.skip_tokenizer_init
@@ -154,18 +145,7 @@ def __init__(
# Request id -> RequestOutput
self.request_outputs: Dict[str, RequestOutput] = {}
- self.model_executor = executor_class(
- model_config=model_config,
- cache_config=cache_config,
- parallel_config=parallel_config,
- scheduler_config=scheduler_config,
- device_config=device_config,
- lora_config=lora_config,
- speculative_config=speculative_config,
- load_config=load_config,
- prompt_adapter_config=prompt_adapter_config,
- observability_config=self.observability_config,
- )
+ self.model_executor = executor_class(vllm_config=vllm_config)
assert self.model_config.task != "embedding"
self._initialize_kv_caches()
@@ -203,7 +183,7 @@ def from_engine_args(
executor_class = cls._get_executor_cls(engine_config)
# Create the LLM engine.
engine = cls(
- **engine_config.to_dict(),
+ vllm_config=engine_config,
executor_class=executor_class,
log_stats=not engine_args.disable_log_stats,
usage_context=usage_context,
From 27cd36e6e2e808464c8343066b03db5db2d15413 Mon Sep 17 00:00:00 2001
From: Gene Der Su
Date: Fri, 1 Nov 2024 15:08:23 -0700
Subject: [PATCH 085/113] [Bugfix] PicklingError on RayTaskError (#9934)
Signed-off-by: Gene Su
---
vllm/engine/multiprocessing/engine.py | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py
index eb1512ca17822..a73b4c825b11c 100644
--- a/vllm/engine/multiprocessing/engine.py
+++ b/vllm/engine/multiprocessing/engine.py
@@ -5,6 +5,7 @@
import cloudpickle
import zmq
+from ray.exceptions import RayTaskError
from vllm import AsyncEngineArgs, SamplingParams
# yapf conflicts with isort for this block
@@ -305,6 +306,11 @@ def _health_check(self):
def _send_outputs(self, outputs: REQUEST_OUTPUTS_T):
"""Send List of RequestOutput to RPCClient."""
if outputs:
+ # RayTaskError might not be picklable here. We need to unpack the
+ # underlying exception and use it as the real exception in the output.
+ if (isinstance(outputs, RPCError)
+ and isinstance(outputs.exception, RayTaskError)):
+ outputs.exception = outputs.exception.cause
output_bytes = pickle.dumps(outputs)
self.output_socket.send_multipart((output_bytes, ), copy=False)
From d151fde8341d34592e1e5e14d2152d067421cf63 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Fri, 1 Nov 2024 23:04:42 +0000
Subject: [PATCH 086/113] [ci/build] Bump the patch-update group with 10
updates (#9897)
Signed-off-by: dependabot[bot]
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Kevin H. Luu
---
requirements-lint.txt | 2 +-
requirements-test.in | 2 +-
requirements-test.txt | 12 ++++++------
3 files changed, 8 insertions(+), 8 deletions(-)
diff --git a/requirements-lint.txt b/requirements-lint.txt
index 07f738873e1a8..f9132bbf96437 100644
--- a/requirements-lint.txt
+++ b/requirements-lint.txt
@@ -1,7 +1,7 @@
# formatting
yapf==0.32.0
toml==0.10.2
-tomli==2.0.1
+tomli==2.0.2
ruff==0.6.5
codespell==2.3.0
isort==5.13.2
diff --git a/requirements-test.in b/requirements-test.in
index 3881f2566b556..5d44664c082a6 100644
--- a/requirements-test.in
+++ b/requirements-test.in
@@ -32,6 +32,6 @@ aiohttp
# quantization
bitsandbytes>=0.44.0
-buildkite-test-collector==0.1.8
+buildkite-test-collector==0.1.9
numpy < 2.0.0
diff --git a/requirements-test.txt b/requirements-test.txt
index c474c2ec34b22..7477b7c3a79cd 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -36,20 +36,20 @@ attrs==24.2.0
# referencing
audioread==3.0.1
# via librosa
-awscli==1.35.16
+awscli==1.35.19
# via -r requirements-test.in
bitsandbytes==0.44.1
# via -r requirements-test.in
black==24.10.0
# via datamodel-code-generator
-boto3==1.35.50
+boto3==1.35.53
# via tensorizer
-botocore==1.35.50
+botocore==1.35.53
# via
# awscli
# boto3
# s3transfer
-buildkite-test-collector==0.1.8
+buildkite-test-collector==0.1.9
# via -r requirements-test.in
certifi==2024.8.30
# via
@@ -426,7 +426,7 @@ requests==2.32.3
# transformers
rouge-score==0.1.2
# via lm-eval
-rpds-py==0.20.0
+rpds-py==0.20.1
# via
# jsonschema
# referencing
@@ -552,7 +552,7 @@ xxhash==3.5.0
# via
# datasets
# evaluate
-yarl==1.17.0
+yarl==1.17.1
# via aiohttp
zstandard==0.23.0
# via lm-eval
From 6c0b7f548d80b5f61bfa472ad1497597c922dbc2 Mon Sep 17 00:00:00 2001
From: Peter Salas
Date: Fri, 1 Nov 2024 16:21:10 -0700
Subject: [PATCH 087/113] [Core][VLM] Add precise multi-modal placeholder
tracking (#8346)
Signed-off-by: Peter Salas
---
examples/offline_inference_audio_language.py | 6 +-
tests/kernels/utils.py | 2 +
.../audio_language/test_ultravox.py | 91 ++++++--
tests/multimodal/test_processor_kwargs.py | 14 +-
tests/multimodal/test_utils.py | 57 ++++-
tests/worker/test_model_input.py | 3 +
vllm/attention/backends/abstract.py | 11 +
vllm/attention/backends/blocksparse_attn.py | 3 +
vllm/attention/backends/flash_attn.py | 20 ++
vllm/attention/backends/flashinfer.py | 18 ++
vllm/attention/backends/placeholder_attn.py | 22 +-
vllm/attention/backends/rocm_flash_attn.py | 3 +
vllm/attention/backends/utils.py | 18 ++
vllm/attention/backends/xformers.py | 3 +
vllm/core/scheduler.py | 2 +
vllm/inputs/__init__.py | 3 +-
vllm/inputs/data.py | 11 +-
vllm/inputs/registry.py | 40 ++--
vllm/model_executor/models/blip.py | 10 +-
vllm/model_executor/models/blip2.py | 15 +-
vllm/model_executor/models/chameleon.py | 22 +-
vllm/model_executor/models/clip.py | 32 ++-
vllm/model_executor/models/fuyu.py | 31 ++-
vllm/model_executor/models/internvl.py | 8 +-
vllm/model_executor/models/llava.py | 15 +-
vllm/model_executor/models/llava_next.py | 11 +-
.../model_executor/models/llava_next_video.py | 25 +-
vllm/model_executor/models/llava_onevision.py | 21 +-
vllm/model_executor/models/minicpmv.py | 6 +-
vllm/model_executor/models/mllama.py | 7 +-
vllm/model_executor/models/paligemma.py | 8 +-
vllm/model_executor/models/phi3v.py | 8 +-
vllm/model_executor/models/pixtral.py | 34 ++-
vllm/model_executor/models/qwen.py | 10 +-
vllm/model_executor/models/qwen2_audio.py | 15 +-
vllm/model_executor/models/qwen2_vl.py | 11 +-
vllm/model_executor/models/siglip.py | 24 +-
vllm/model_executor/models/ultravox.py | 60 ++---
vllm/model_executor/models/utils.py | 18 +-
vllm/multimodal/__init__.py | 7 +-
vllm/multimodal/base.py | 214 +++++++++++++++++-
vllm/multimodal/image.py | 8 +-
vllm/multimodal/registry.py | 18 +-
vllm/multimodal/utils.py | 21 +-
vllm/multimodal/video.py | 14 +-
vllm/sequence.py | 17 +-
vllm/worker/cpu_model_runner.py | 38 +++-
vllm/worker/enc_dec_model_runner.py | 30 +--
vllm/worker/model_runner.py | 21 +-
vllm/worker/model_runner_base.py | 5 +-
vllm/worker/openvino_model_runner.py | 43 +++-
vllm/worker/tpu_model_runner.py | 4 +
vllm/worker/xpu_model_runner.py | 38 +++-
53 files changed, 914 insertions(+), 282 deletions(-)
diff --git a/examples/offline_inference_audio_language.py b/examples/offline_inference_audio_language.py
index 37ec667d96a77..050b791b62adb 100644
--- a/examples/offline_inference_audio_language.py
+++ b/examples/offline_inference_audio_language.py
@@ -34,11 +34,7 @@ def run_ultravox(question: str, audio_count: int):
tokenize=False,
add_generation_prompt=True)
- llm = LLM(model=model_name,
- enforce_eager=True,
- enable_chunked_prefill=False,
- max_model_len=8192,
- limit_mm_per_prompt={"audio": audio_count})
+ llm = LLM(model=model_name, limit_mm_per_prompt={"audio": audio_count})
stop_token_ids = None
return llm, prompt, stop_token_ids
diff --git a/tests/kernels/utils.py b/tests/kernels/utils.py
index a2d414f636e13..c3d5252edc2a3 100644
--- a/tests/kernels/utils.py
+++ b/tests/kernels/utils.py
@@ -869,6 +869,7 @@ def make_test_metadata(
return attn_backend.make_metadata(
num_prefills=num_prefills,
slot_mapping=(None if kv_mmap is None else kv_mmap.slot_mapping),
+ multi_modal_placeholder_index_maps=None,
num_prefill_tokens=num_prefill_tokens,
num_decode_tokens=num_decode_tokens,
seq_lens=seq_lens,
@@ -914,6 +915,7 @@ def make_test_metadata(
return attn_backend.make_metadata(
num_prefills=num_prefills,
slot_mapping=kv_mmap.slot_mapping,
+ multi_modal_placeholder_index_maps=None,
num_prefill_tokens=num_prefill_tokens,
num_decode_tokens=num_decode_tokens,
seq_lens=seq_lens,
diff --git a/tests/models/decoder_only/audio_language/test_ultravox.py b/tests/models/decoder_only/audio_language/test_ultravox.py
index b9089e75ffab8..d14e88b4e5b26 100644
--- a/tests/models/decoder_only/audio_language/test_ultravox.py
+++ b/tests/models/decoder_only/audio_language/test_ultravox.py
@@ -2,8 +2,10 @@
import numpy as np
import pytest
+import pytest_asyncio
from transformers import AutoModel, AutoTokenizer, BatchEncoding
+from tests.utils import RemoteOpenAIServer
from vllm.sequence import SampleLogprobs
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
@@ -17,6 +19,13 @@
VLLM_PLACEHOLDER = "<|reserved_special_token_0|>"
HF_PLACEHOLDER = "<|audio|>"
+CHUNKED_PREFILL_KWARGS = {
+ "enable_chunked_prefill": True,
+ "max_num_seqs": 2,
+ # Use a very small limit to exercise chunked prefill.
+ "max_num_batched_tokens": 16
+}
+
@pytest.fixture(scope="session")
def audio_assets():
@@ -30,6 +39,26 @@ def audio(request):
return AudioAsset(request.param)
+@pytest.fixture(params=({}, CHUNKED_PREFILL_KWARGS))
+def server(request, audio_assets):
+ args = [
+ "--dtype=bfloat16", "--max-model-len=4096", "--enforce-eager",
+ f"--limit-mm-per-prompt=audio={len(audio_assets)}"
+ ] + [
+ f"--{key.replace('_','-')}={value}"
+ for key, value in request.param.items()
+ ]
+
+ with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+ yield remote_server
+
+
+@pytest_asyncio.fixture
+async def client(server):
+ async with server.get_async_client() as async_client:
+ yield async_client
+
+
def _get_prompt(audio_count, question, placeholder):
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
placeholder = f"{placeholder}\n" * audio_count
@@ -68,8 +97,7 @@ def run_test(
dtype: str,
max_tokens: int,
num_logprobs: int,
- tensor_parallel_size: int,
- distributed_executor_backend: Optional[str] = None,
+ **kwargs,
):
"""Inference result should be the same between hf and vllm."""
torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]
@@ -79,11 +107,8 @@ def run_test(
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method).
- with vllm_runner(model,
- dtype=dtype,
- tensor_parallel_size=tensor_parallel_size,
- distributed_executor_backend=distributed_executor_backend,
- enforce_eager=True) as vllm_model:
+ with vllm_runner(model, dtype=dtype, enforce_eager=True,
+ **kwargs) as vllm_model:
vllm_outputs_per_audio = [
vllm_model.generate_greedy_logprobs([vllm_prompt],
max_tokens,
@@ -135,18 +160,16 @@ def run_multi_audio_test(
dtype: str,
max_tokens: int,
num_logprobs: int,
- tensor_parallel_size: int,
- distributed_executor_backend: Optional[str] = None,
+ **kwargs,
):
with vllm_runner(model,
dtype=dtype,
- tensor_parallel_size=tensor_parallel_size,
- distributed_executor_backend=distributed_executor_backend,
enforce_eager=True,
limit_mm_per_prompt={
"audio":
max((len(audio) for _, audio in prompts_and_audios))
- }) as vllm_model:
+ },
+ **kwargs) as vllm_model:
vllm_outputs = vllm_model.generate_greedy_logprobs(
[prompt for prompt, _ in prompts_and_audios],
max_tokens,
@@ -162,8 +185,9 @@ def run_multi_audio_test(
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
+@pytest.mark.parametrize("vllm_kwargs", [{}, CHUNKED_PREFILL_KWARGS])
def test_models(hf_runner, vllm_runner, audio, dtype: str, max_tokens: int,
- num_logprobs: int) -> None:
+ num_logprobs: int, vllm_kwargs: dict) -> None:
vllm_prompt = _get_prompt(1, "Describe the audio above.", VLLM_PLACEHOLDER)
hf_prompt = _get_prompt(1, "Describe the audio above.", HF_PLACEHOLDER)
@@ -175,7 +199,7 @@ def test_models(hf_runner, vllm_runner, audio, dtype: str, max_tokens: int,
dtype=dtype,
max_tokens=max_tokens,
num_logprobs=num_logprobs,
- tensor_parallel_size=1,
+ **vllm_kwargs,
)
@@ -183,9 +207,10 @@ def test_models(hf_runner, vllm_runner, audio, dtype: str, max_tokens: int,
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
+@pytest.mark.parametrize("vllm_kwargs", [{}, CHUNKED_PREFILL_KWARGS])
def test_models_with_multiple_audios(vllm_runner, audio_assets, dtype: str,
- max_tokens: int,
- num_logprobs: int) -> None:
+ max_tokens: int, num_logprobs: int,
+ vllm_kwargs: dict) -> None:
vllm_prompt = _get_prompt(len(audio_assets),
"Describe each of the audios above.",
@@ -198,5 +223,37 @@ def test_models_with_multiple_audios(vllm_runner, audio_assets, dtype: str,
dtype=dtype,
max_tokens=max_tokens,
num_logprobs=num_logprobs,
- tensor_parallel_size=1,
+ **vllm_kwargs,
)
+
+
+@pytest.mark.asyncio
+async def test_online_inference(client, audio_assets):
+ """Exercises online inference with/without chunked prefill enabled."""
+
+ messages = [{
+ "role":
+ "user",
+ "content": [
+ *[{
+ "type": "audio_url",
+ "audio_url": {
+ "url": audio.url
+ }
+ } for audio in audio_assets],
+ {
+ "type":
+ "text",
+ "text":
+ f"What's happening in these {len(audio_assets)} audio clips?"
+ },
+ ],
+ }]
+
+ chat_completion = await client.chat.completions.create(model=MODEL_NAME,
+ messages=messages,
+ max_tokens=10)
+
+ assert len(chat_completion.choices) == 1
+ choice = chat_completion.choices[0]
+ assert choice.finish_reason == "length"
diff --git a/tests/multimodal/test_processor_kwargs.py b/tests/multimodal/test_processor_kwargs.py
index 5044740c3e734..4d3bbd805c152 100644
--- a/tests/multimodal/test_processor_kwargs.py
+++ b/tests/multimodal/test_processor_kwargs.py
@@ -5,8 +5,8 @@
import pytest
import torch
-from vllm.inputs import DecoderOnlyInputs, InputContext, token_inputs
-from vllm.inputs.registry import InputRegistry
+from vllm.inputs import (DecoderOnlyInputs, DummyData, InputContext,
+ InputRegistry, token_inputs)
from vllm.multimodal import MultiModalRegistry
from vllm.sequence import VLLM_TOKEN_ID_ARRAY_TYPE, SequenceData
@@ -56,7 +56,7 @@ def custom_dummy_data_factory(self,
num_crops=DEFAULT_NUM_CROPS):
seq_data = SequenceData(
array(VLLM_TOKEN_ID_ARRAY_TYPE, [0] * num_crops))
- return seq_data, None
+ return DummyData(seq_data, None)
with patch(
"vllm.inputs.registry.InputRegistry._default_dummy_data_factory",
@@ -177,9 +177,9 @@ def test_dummy_data_kwarg_overrides(use_dummy_data_mock, num_crops):
# NOTE: seq_len is thrown away here since this will leverage the
# default dummy data factory that we have patched in, whose seq
# len is solely dependent on the value of the mm_processor_kwargs.
- seq_data, _ = dummy_registry.dummy_data_for_profiling(
+ dummy_data = dummy_registry.dummy_data_for_profiling(
ctx.model_config, seq_len=-1, mm_registry=mm_registry)
- assert len(seq_data.prompt_token_ids) == expected_seq_count
+ assert len(dummy_data.seq_data.prompt_token_ids) == expected_seq_count
@pytest.mark.parametrize(
@@ -206,9 +206,9 @@ def test_dummy_data_with_sad_kwarg_overrides(use_dummy_data_mock,
# NOTE: seq_len is thrown away here since this will leverage the
# default dummy data factory that we have patched in, whose seq
# len is solely dependent on the value of the mm_processor_kwargs.
- seq_data, _ = dummy_registry.dummy_data_for_profiling(
+ dummy_data = dummy_registry.dummy_data_for_profiling(
ctx.model_config, seq_len=-1, mm_registry=mm_registry)
- assert len(seq_data.prompt_token_ids) == DEFAULT_NUM_CROPS
+ assert len(dummy_data.seq_data.prompt_token_ids) == DEFAULT_NUM_CROPS
### Test overrides for the max token count per multimodal instance
diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py
index 38cd48629f903..69f04f0a69c0b 100644
--- a/tests/multimodal/test_utils.py
+++ b/tests/multimodal/test_utils.py
@@ -92,18 +92,50 @@ def test_repeat_and_pad_placeholder_tokens(model):
tokenizer = AutoTokenizer.from_pretrained(model)
test_cases = [
- ("", 2, "", [32000, 32000]),
- ("", 2, "", [32000, 32000, 32000]),
- ("", [3, 2], "",
- [32000, 32000, 32000, 32000, 32000]),
- ("Image:Image:!", [3, 2],
- "Image:Image:!",
- [9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918]),
- ("", [3, 2], "", [32000, 32000, 32000]),
- ]
-
- for prompt, repeat_count, expected_prompt, expected_token_ids in test_cases:
- new_prompt, new_token_ids = repeat_and_pad_placeholder_tokens(
+ (
+ "",
+ 2,
+ "",
+ [32000, 32000],
+ [{ "offset": 0, "length": 2 }],
+ ),
+ (
+ "",
+ 2,
+ "",
+ [32000, 32000, 32000],
+ [{ "offset": 0, "length": 2 }]),
+ (
+ "",
+ [3, 2],
+ "",
+ [32000, 32000, 32000, 32000, 32000],
+ [{ "offset": 0, "length": 3 }, { "offset": 3, "length": 2 }],
+ ),
+ (
+ "Image:Image:!",
+ [3, 2],
+ "Image:Image:!",
+ [9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918],
+ [{ "offset": 2, "length": 3 }, { "offset": 7, "length": 2 }],
+ ),
+ (
+ "",
+ [3, 2],
+ "",
+ [32000, 32000, 32000],
+ [{ "offset": 0, "length": 3 }],
+ ),
+ ] # yapf: disable
+
+ for (
+ prompt,
+ repeat_count,
+ expected_prompt,
+ expected_token_ids,
+ expected_ranges,
+ ) in test_cases:
+ new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens(
tokenizer=tokenizer,
prompt=prompt,
prompt_token_ids=tokenizer.encode(prompt,
@@ -113,3 +145,4 @@ def test_repeat_and_pad_placeholder_tokens(model):
)
assert new_prompt == expected_prompt
assert new_token_ids == expected_token_ids
+ assert ranges == expected_ranges
diff --git a/tests/worker/test_model_input.py b/tests/worker/test_model_input.py
index 1e7f560fc68cc..b36e8bfe73ff3 100644
--- a/tests/worker/test_model_input.py
+++ b/tests/worker/test_model_input.py
@@ -73,6 +73,7 @@ def test_model_runner_input():
num_prefill_tokens=2,
num_decode_tokens=3,
slot_mapping=torch.zeros(1),
+ multi_modal_placeholder_index_maps=None,
)
model_input = ModelInputForGPUWithSamplingMetadata(
input_tokens=torch.ones(10),
@@ -124,6 +125,7 @@ def test_embedding_model_runner_input():
num_prefill_tokens=2,
num_decode_tokens=3,
slot_mapping=torch.zeros(1),
+ multi_modal_placeholder_index_maps=None,
)
model_input = ModelInputForGPUWithPoolingMetadata(
input_tokens=torch.ones(10),
@@ -174,6 +176,7 @@ def test_multi_step_model_runner_input():
num_prefill_tokens=2,
num_decode_tokens=3,
slot_mapping=torch.zeros(1),
+ multi_modal_placeholder_index_maps=None,
)
frozen_model_input = ModelInputForGPUWithSamplingMetadata(
input_tokens=torch.ones(10),
diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py
index 9ea89eca01f5b..a504cb1f7e318 100644
--- a/vllm/attention/backends/abstract.py
+++ b/vllm/attention/backends/abstract.py
@@ -7,6 +7,8 @@
import torch
+from vllm.multimodal import MultiModalPlaceholderMap
+
if TYPE_CHECKING:
from vllm.worker.model_runner_base import (ModelRunnerBase,
ModelRunnerInputBase,
@@ -108,6 +110,15 @@ class AttentionMetadata:
# in block 0, and 1st slot in block 1, respectively.
slot_mapping: torch.Tensor
+ # The index maps that relate multi-modal embeddings to the corresponding
+ # placeholders.
+ #
+ # N.B. These aren't really related to attention and don't belong on this
+ # type -- this is just a temporary solution to make them available to
+ # `model_executable`.
+ multi_modal_placeholder_index_maps: Optional[Dict[
+ str, MultiModalPlaceholderMap.IndexMap]]
+
@property
@abstractmethod
def prefill_metadata(self) -> Optional["AttentionMetadata"]:
diff --git a/vllm/attention/backends/blocksparse_attn.py b/vllm/attention/backends/blocksparse_attn.py
index c216d195c9e7e..409a42187f46c 100644
--- a/vllm/attention/backends/blocksparse_attn.py
+++ b/vllm/attention/backends/blocksparse_attn.py
@@ -215,6 +215,8 @@ def prefill_metadata(
num_prefill_tokens=self.num_prefill_tokens,
num_decode_tokens=0,
slot_mapping=self.slot_mapping[:self.num_prefill_tokens],
+ multi_modal_placeholder_index_maps=self.
+ multi_modal_placeholder_index_maps,
seq_lens=self.seq_lens[:self.num_prefills],
seq_lens_tensor=self.seq_lens_tensor[:self.num_prefills],
max_query_len=self.max_query_len,
@@ -243,6 +245,7 @@ def decode_metadata(self) -> Optional["BlocksparseFlashAttentionMetadata"]:
num_prefill_tokens=0,
num_decode_tokens=self.num_decode_tokens,
slot_mapping=self.slot_mapping[self.num_prefill_tokens:],
+ multi_modal_placeholder_index_maps=None,
seq_lens=None,
seq_lens_tensor=self.seq_lens_tensor[self.num_prefills:],
max_query_len=None,
diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py
index c294fcf7f08fe..ab363ac78b028 100644
--- a/vllm/attention/backends/flash_attn.py
+++ b/vllm/attention/backends/flash_attn.py
@@ -1,4 +1,5 @@
"""Attention layer with FlashAttention."""
+from collections import defaultdict
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type
@@ -14,6 +15,7 @@
compute_slot_mapping_start_idx,
is_block_tables_empty)
from vllm.forward_context import get_forward_context
+from vllm.multimodal import MultiModalPlaceholderMap
from vllm.utils import (async_tensor_h2d, direct_register_custom_op,
make_tensor_with_pad)
@@ -169,6 +171,8 @@ def prefill_metadata(self) -> Optional["FlashAttentionMetadata"]:
num_prefill_tokens=self.num_prefill_tokens,
num_decode_tokens=0,
slot_mapping=self.slot_mapping[:self.num_prefill_tokens],
+ multi_modal_placeholder_index_maps=self.
+ multi_modal_placeholder_index_maps,
seq_lens=self.seq_lens[:self.num_prefills],
seq_lens_tensor=self.seq_lens_tensor[:self.num_prefills],
max_query_len=self.max_query_len,
@@ -198,6 +202,7 @@ def decode_metadata(self) -> Optional["FlashAttentionMetadata"]:
num_prefill_tokens=0,
num_decode_tokens=self.num_decode_tokens,
slot_mapping=self.slot_mapping[self.num_prefill_tokens:],
+ multi_modal_placeholder_index_maps=None,
seq_lens=None,
seq_lens_tensor=self.seq_lens_tensor[self.num_prefills:],
max_decode_query_len=self.max_decode_query_len,
@@ -297,6 +302,9 @@ def __init__(self, input_builder: "ModelInputForGPUBuilder"):
self.context_lens: List[int] = []
self.block_tables: List[List[int]] = []
self.curr_seq_lens: List[int] = []
+ self.multimodal_placeholder_maps: Dict[
+ str,
+ MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap)
self.num_prefills = 0
self.num_prefill_tokens = 0
self.num_decode_tokens = 0
@@ -327,6 +335,12 @@ def _add_seq_group(
self.context_lens.append(context_len)
if is_prompt:
+ mm_maps = inter_data.multi_modal_placeholder_maps
+ if mm_maps:
+ for modality, placeholders in mm_maps.items():
+ self.multimodal_placeholder_maps[modality].extend(
+ placeholders)
+
self.num_prefills += 1
self.num_prefill_tokens += token_len
self.prefill_seq_lens.append(seq_len)
@@ -449,6 +463,11 @@ def build(self, seq_lens: List[int], query_lens: List[int],
seq_start_loc = torch.zeros(seq_lens_tensor.shape[0] + 1,
dtype=torch.int32,
device=device)
+ placeholder_index_maps = {
+ modality: placeholder_map.index_map()
+ for modality, placeholder_map in
+ self.multimodal_placeholder_maps.items()
+ }
torch.cumsum(seq_lens_tensor,
dim=0,
dtype=seq_start_loc.dtype,
@@ -464,6 +483,7 @@ def build(self, seq_lens: List[int], query_lens: List[int],
num_prefill_tokens=self.num_prefill_tokens,
num_decode_tokens=num_decode_tokens,
seq_lens=seq_lens,
+ multi_modal_placeholder_index_maps=placeholder_index_maps,
seq_lens_tensor=seq_lens_tensor,
max_query_len=max_query_len,
max_decode_query_len=max_decode_query_len,
diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py
index 658805d35be0a..107e3bbf79666 100644
--- a/vllm/attention/backends/flashinfer.py
+++ b/vllm/attention/backends/flashinfer.py
@@ -1,7 +1,10 @@
+from collections import defaultdict
from contextlib import contextmanager
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple, Type
+from vllm.multimodal import MultiModalPlaceholderMap
+
try:
from flashinfer import BatchDecodeWithPagedKVCacheWrapper
from flashinfer.decode import CUDAGraphBatchDecodeWithPagedKVCacheWrapper
@@ -215,6 +218,7 @@ def graph_capture_get_metadata_for_batch(
attn_metadata = self.runner.attn_backend.make_metadata(
num_prefills=0,
slot_mapping=self._graph_slot_mapping[:batch_size],
+ multi_modal_placeholder_index_maps=None,
num_prefill_tokens=0,
num_decode_tokens=batch_size,
max_prefill_seq_len=0,
@@ -470,6 +474,9 @@ def __init__(self, input_builder: "ModelInputForGPUBuilder"):
self.context_lens: List[int] = []
self.block_tables: List[List[int]] = []
self.curr_seq_lens: List[int] = []
+ self.multimodal_placeholder_maps: Dict[
+ str,
+ MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap)
self.num_prefills = 0
self.num_prefill_tokens = 0
self.num_decode_tokens = 0
@@ -519,6 +526,11 @@ def _add_seq_group(
inter_data.curr_sliding_window_blocks):
self.context_lens.append(context_len)
if is_prompt:
+ mm_maps = inter_data.multi_modal_placeholder_maps
+ if mm_maps:
+ for modality, placeholders in mm_maps.items():
+ self.multimodal_placeholder_maps[modality].extend(
+ placeholders)
self.num_prefills += 1
self.num_prefill_tokens += token_len
self.prefill_seq_lens.append(seq_len)
@@ -651,6 +663,11 @@ def build(self, seq_lens: List[int], query_lens: List[int],
seq_start_loc = torch.zeros(seq_lens_tensor.shape[0] + 1,
dtype=torch.int32,
device=device)
+ placeholder_index_maps = {
+ modality: placeholder_map.index_map()
+ for modality, placeholder_map in
+ self.multimodal_placeholder_maps.items()
+ }
torch.cumsum(seq_lens_tensor,
dim=0,
dtype=seq_start_loc.dtype,
@@ -694,6 +711,7 @@ def build(self, seq_lens: List[int], query_lens: List[int],
decode_query_len=decode_query_len,
num_prefills=self.num_prefills,
slot_mapping=slot_mapping_tensor,
+ multi_modal_placeholder_index_maps=placeholder_index_maps,
num_prefill_tokens=self.num_prefill_tokens,
num_decode_tokens=num_decode_tokens,
max_prefill_seq_len=max_prefill_seq_len,
diff --git a/vllm/attention/backends/placeholder_attn.py b/vllm/attention/backends/placeholder_attn.py
index 4116fbf00020c..888adbffb8578 100644
--- a/vllm/attention/backends/placeholder_attn.py
+++ b/vllm/attention/backends/placeholder_attn.py
@@ -1,5 +1,6 @@
+from collections import defaultdict
from dataclasses import dataclass
-from typing import TYPE_CHECKING, List, Optional, Tuple, Type
+from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Type
import torch
@@ -7,6 +8,7 @@
AttentionMetadata,
AttentionMetadataBuilder)
from vllm.attention.backends.utils import CommonAttentionState
+from vllm.multimodal import MultiModalPlaceholderMap
if TYPE_CHECKING:
from vllm.worker.model_runner import ModelInputForGPUBuilder
@@ -135,6 +137,8 @@ def prefill_metadata(self) -> Optional["PlaceholderAttentionMetadata"]:
num_prefill_tokens=self.num_prefill_tokens,
num_decode_tokens=0,
slot_mapping=slot_mapping,
+ multi_modal_placeholder_index_maps=self.
+ multi_modal_placeholder_index_maps,
seq_lens=self.seq_lens[:self.num_prefills],
seq_lens_tensor=self.seq_lens_tensor[:self.num_prefills],
max_decode_query_len=0,
@@ -167,6 +171,7 @@ def decode_metadata(self) -> Optional["PlaceholderAttentionMetadata"]:
num_prefill_tokens=0,
num_decode_tokens=self.num_decode_tokens,
slot_mapping=slot_mapping,
+ multi_modal_placeholder_index_maps=None,
seq_lens=None,
seq_lens_tensor=self.seq_lens_tensor[self.num_prefills:],
max_decode_query_len=self.max_decode_query_len,
@@ -189,6 +194,9 @@ def __init__(self, input_builder: "ModelInputForGPUBuilder"):
self.prefill_seq_lens: List[int] = []
self.context_lens: List[int] = []
self.curr_seq_lens: List[int] = []
+ self.multimodal_placeholder_maps: Dict[
+ str,
+ MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap)
self.num_prefills = 0
self.num_prefill_tokens = 0
self.num_decode_tokens = 0
@@ -213,6 +221,12 @@ def _add_seq_group(
self.context_lens.append(context_len)
if is_prompt:
+ mm_maps = inter_data.multi_modal_placeholder_maps
+ if mm_maps:
+ for modality, placeholders in mm_maps.items():
+ self.multimodal_placeholder_maps[modality].extend(
+ placeholders)
+
self.num_prefills += 1
self.num_prefill_tokens += token_len
self.prefill_seq_lens.append(seq_len)
@@ -280,6 +294,11 @@ def build(self, seq_lens: List[int], query_lens: List[int],
seq_start_loc = torch.zeros(seq_lens_tensor.shape[0] + 1,
dtype=torch.int32,
device=device)
+ placeholder_index_maps = {
+ modality: placeholder_map.index_map()
+ for modality, placeholder_map in
+ self.multimodal_placeholder_maps.items()
+ }
torch.cumsum(seq_lens_tensor,
dim=0,
dtype=seq_start_loc.dtype,
@@ -296,6 +315,7 @@ def build(self, seq_lens: List[int], query_lens: List[int],
return PlaceholderAttentionMetadata(
num_prefills=self.num_prefills,
slot_mapping=slot_mapping,
+ multi_modal_placeholder_index_maps=placeholder_index_maps,
num_prefill_tokens=self.num_prefill_tokens,
num_decode_tokens=num_decode_tokens,
seq_lens=seq_lens,
diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py
index 30859dfa60634..b129d0d992f2f 100644
--- a/vllm/attention/backends/rocm_flash_attn.py
+++ b/vllm/attention/backends/rocm_flash_attn.py
@@ -150,6 +150,8 @@ def prefill_metadata(self) -> Optional["ROCmFlashAttentionMetadata"]:
num_prefill_tokens=self.num_prefill_tokens,
num_decode_tokens=0,
slot_mapping=self.slot_mapping[:self.num_prefill_tokens],
+ multi_modal_placeholder_index_maps=self.
+ multi_modal_placeholder_index_maps,
seq_lens=self.seq_lens[:self.num_prefills],
seq_lens_tensor=self.seq_lens_tensor[:self.num_prefills],
max_query_len=self.max_query_len,
@@ -178,6 +180,7 @@ def decode_metadata(self) -> Optional["ROCmFlashAttentionMetadata"]:
num_prefill_tokens=0,
num_decode_tokens=self.num_decode_tokens,
slot_mapping=self.slot_mapping[self.num_prefill_tokens:],
+ multi_modal_placeholder_index_maps=None,
seq_lens=None,
seq_lens_tensor=self.seq_lens_tensor[self.num_prefills:],
max_query_len=None,
diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py
index 32fccd0dfb496..55293bbb06e1d 100644
--- a/vllm/attention/backends/utils.py
+++ b/vllm/attention/backends/utils.py
@@ -1,4 +1,5 @@
"""Attention backend utils"""
+from collections import defaultdict
from contextlib import contextmanager
from typing import TYPE_CHECKING, Any, Dict, List, Type, TypeVar, Union
@@ -7,6 +8,7 @@
from vllm.attention import (AttentionMetadata, AttentionMetadataBuilder,
AttentionState)
+from vllm.multimodal import MultiModalPlaceholderMap
from vllm.utils import async_tensor_h2d, make_tensor_with_pad
if TYPE_CHECKING:
@@ -123,6 +125,9 @@ def __init__(self, input_builder: "ModelInputForGPUBuilder"):
self.context_lens: List[int] = []
self.block_tables: List[List[int]] = []
self.curr_seq_lens: List[int] = []
+ self.multimodal_placeholder_maps: Dict[
+ str,
+ MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap)
self.num_prefills = 0
self.num_prefill_tokens = 0
self.num_decode_tokens = 0
@@ -147,6 +152,12 @@ def _add_seq_group(
inter_data.curr_sliding_window_blocks):
self.context_lens.append(context_len)
if is_prompt:
+ mm_maps = inter_data.multi_modal_placeholder_maps
+ if mm_maps:
+ for modality, placeholders in mm_maps.items():
+ self.multimodal_placeholder_maps[modality].extend(
+ placeholders)
+
self.num_prefills += 1
self.num_prefill_tokens += token_len
self.prefill_seq_lens.append(seq_len)
@@ -242,6 +253,11 @@ def build(self, seq_lens: List[int], query_lens: List[int],
seq_start_loc = torch.zeros(seq_lens_tensor.shape[0] + 1,
dtype=torch.int32,
device=device)
+ placeholder_index_maps = {
+ modality: placeholder_map.index_map()
+ for modality, placeholder_map in
+ self.multimodal_placeholder_maps.items()
+ }
torch.cumsum(seq_lens_tensor,
dim=0,
dtype=seq_start_loc.dtype,
@@ -254,6 +270,7 @@ def build(self, seq_lens: List[int], query_lens: List[int],
return self._metadata_cls( # type: ignore
num_prefills=self.num_prefills,
slot_mapping=slot_mapping_tensor,
+ multi_modal_placeholder_index_maps=placeholder_index_maps,
num_prefill_tokens=self.num_prefill_tokens,
num_decode_tokens=num_decode_tokens,
seq_lens=seq_lens,
@@ -305,6 +322,7 @@ def graph_capture_get_metadata_for_batch(
num_prefill_tokens=0,
num_decode_tokens=batch_size,
slot_mapping=self._graph_slot_mapping[:batch_size],
+ multi_modal_placeholder_index_maps=None,
seq_lens=None,
seq_lens_tensor=self._graph_seq_lens[:batch_size],
max_query_len=1,
diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py
index 5aaf13d8ea744..21877f2dded0e 100644
--- a/vllm/attention/backends/xformers.py
+++ b/vllm/attention/backends/xformers.py
@@ -212,6 +212,8 @@ def prefill_metadata(self) -> Optional["XFormersMetadata"]:
num_prefill_tokens=self.num_prefill_tokens,
num_decode_tokens=0,
slot_mapping=slot_mapping,
+ multi_modal_placeholder_index_maps=self.
+ multi_modal_placeholder_index_maps,
seq_lens=seq_lens,
seq_lens_tensor=seq_lens_tensor,
max_query_len=self.max_query_len,
@@ -255,6 +257,7 @@ def decode_metadata(self) -> Optional["XFormersMetadata"]:
num_prefill_tokens=0,
num_decode_tokens=self.num_decode_tokens,
slot_mapping=slot_mapping,
+ multi_modal_placeholder_index_maps=None,
seq_lens_tensor=seq_lens_tensor,
max_prefill_seq_len=0,
max_decode_seq_len=self.max_decode_seq_len,
diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py
index e35c05f4fe7f7..e56d5cddce424 100644
--- a/vllm/core/scheduler.py
+++ b/vllm/core/scheduler.py
@@ -1308,6 +1308,8 @@ def schedule(
# `multi_modal_data` will be None.
multi_modal_data=seq_group.multi_modal_data
if scheduler_outputs.num_prefill_groups > 0 else None,
+ multi_modal_placeholders=seq_group.multi_modal_placeholders
+ if scheduler_outputs.num_prefill_groups > 0 else None,
mm_processor_kwargs=seq_group.mm_processor_kwargs,
prompt_adapter_request=seq_group.prompt_adapter_request,
)
diff --git a/vllm/inputs/__init__.py b/vllm/inputs/__init__.py
index 7b73922ddd2c5..ac7b3ca28b406 100644
--- a/vllm/inputs/__init__.py
+++ b/vllm/inputs/__init__.py
@@ -3,7 +3,7 @@
SingletonPrompt, TextPrompt, TokenInputs, TokensPrompt,
build_explicit_enc_dec_prompt, to_enc_dec_tuple_list,
token_inputs, zip_enc_dec_prompts)
-from .registry import InputContext, InputRegistry
+from .registry import DummyData, InputContext, InputRegistry
INPUT_REGISTRY = InputRegistry()
"""
@@ -29,6 +29,7 @@
"to_enc_dec_tuple_list",
"zip_enc_dec_prompts",
"INPUT_REGISTRY",
+ "DummyData",
"InputContext",
"InputRegistry",
]
diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py
index 9a094191eda38..ba393cbcce4eb 100644
--- a/vllm/inputs/data.py
+++ b/vllm/inputs/data.py
@@ -4,7 +4,7 @@
from typing_extensions import NotRequired, TypedDict, TypeVar
if TYPE_CHECKING:
- from vllm.multimodal import MultiModalDataDict
+ from vllm.multimodal import MultiModalDataDict, MultiModalPlaceholderDict
class TextPrompt(TypedDict):
@@ -136,6 +136,12 @@ class TokenInputs(TypedDict):
if the model supports it.
"""
+ multi_modal_placeholders: NotRequired[
+ Optional["MultiModalPlaceholderDict"]]
+ """
+ Placeholder ranges for the multi-modal data.
+ """
+
mm_processor_kwargs: NotRequired[Optional[Dict[str, Any]]]
"""
Optional multi-modal processor kwargs to be forwarded to the
@@ -149,6 +155,7 @@ def token_inputs(
prompt_token_ids: List[int],
prompt: Optional[str] = None,
multi_modal_data: Optional["MultiModalDataDict"] = None,
+ multi_modal_placeholders: Optional["MultiModalPlaceholderDict"] = None,
mm_processor_kwargs: Optional[Dict[str, Any]] = None,
) -> TokenInputs:
"""Construct :class:`TokenInputs` from optional values."""
@@ -158,6 +165,8 @@ def token_inputs(
inputs["prompt"] = prompt
if multi_modal_data is not None:
inputs["multi_modal_data"] = multi_modal_data
+ if multi_modal_placeholders is not None:
+ inputs["multi_modal_placeholders"] = multi_modal_placeholders
if mm_processor_kwargs is not None:
inputs["mm_processor_kwargs"] = mm_processor_kwargs
diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py
index 4cebc91ce715c..fbf912a212568 100644
--- a/vllm/inputs/registry.py
+++ b/vllm/inputs/registry.py
@@ -1,8 +1,8 @@
import functools
from collections import UserDict
from dataclasses import dataclass
-from typing import (TYPE_CHECKING, Any, Callable, Dict, Mapping, Optional,
- Protocol, Tuple, Type)
+from typing import (TYPE_CHECKING, Any, Callable, Dict, Mapping, NamedTuple,
+ Optional, Protocol, Type)
from torch import nn
from transformers import PretrainedConfig
@@ -16,7 +16,8 @@
if TYPE_CHECKING:
from vllm.config import ModelConfig
- from vllm.multimodal import MultiModalDataDict, MultiModalRegistry
+ from vllm.multimodal import (MultiModalDataDict, MultiModalPlaceholderDict,
+ MultiModalRegistry)
from vllm.sequence import SequenceData
logger = init_logger(__name__)
@@ -63,6 +64,14 @@ def get_hf_image_processor_config(self) -> Dict[str, Any]:
N = TypeVar("N", bound=Type[nn.Module])
+class DummyData(NamedTuple):
+ """Dummy data used for profiling.
+
+ Returned by dummy data factories and by ``dummy_data_for_profiling``.
+ """
+
+ # Dummy token sequence.
+ seq_data: "SequenceData"
+ # Dummy multi-modal inputs; defaults to None.
+ multi_modal_data: Optional["MultiModalDataDict"] = None
+ # Placeholder ranges for the multi-modal data; defaults to None.
+ multi_modal_placeholders: Optional["MultiModalPlaceholderDict"] = None
+
+
class DummyDataFactory(Protocol):
def __call__(
@@ -71,7 +80,7 @@ def __call__(
seq_len: int,
mm_counts: Mapping[str, int],
**mm_processor_kwargs: Any,
- ) -> Tuple["SequenceData", Optional["MultiModalDataDict"]]:
+ ) -> DummyData:
"""
Create dummy data to be inputted into the model.
@@ -123,7 +132,7 @@ def _default_dummy_data_factory(
ctx: InputContext,
seq_len: int,
mm_counts: Mapping[str, int],
- ) -> Tuple["SequenceData", Optional["MultiModalDataDict"]]:
+ ) -> DummyData:
"""
The default dummy data factory represents the longest possible text
that can be inputted to the model.
@@ -134,10 +143,7 @@ def _default_dummy_data_factory(
# Avoid circular import
from vllm.sequence import SequenceData
- dummy_seq_data = SequenceData.from_prompt_token_counts((0, seq_len))
- dummy_multi_modal_data = None
-
- return dummy_seq_data, dummy_multi_modal_data
+ return DummyData(SequenceData.from_prompt_token_counts((0, seq_len)))
def register_dummy_data(self, factory: DummyDataFactory):
"""
@@ -195,7 +201,7 @@ def dummy_data_for_profiling(
seq_len: int,
mm_registry: "MultiModalRegistry",
is_encoder_data: bool = False,
- ) -> Tuple["SequenceData", Optional["MultiModalDataDict"]]:
+ ) -> DummyData:
"""
Create dummy data for profiling the memory usage of a model.
@@ -220,12 +226,12 @@ def dummy_data_for_profiling(
mm_processor_kwargs = get_allowed_kwarg_only_overrides(
dummy_factory, overrides=model_config.mm_processor_kwargs)
- seq_data, mm_data = dummy_factory(InputContext(model_config), seq_len,
- _MultiModalCounts(mm_counts),
- **mm_processor_kwargs)
+ dummy_data = dummy_factory(InputContext(model_config), seq_len,
+ _MultiModalCounts(mm_counts),
+ **mm_processor_kwargs)
# Having more tokens is over-conservative but otherwise fine
- num_tokens = seq_data.prompt_token_ids
+ num_tokens = dummy_data.seq_data.prompt_token_ids
if len(num_tokens) < seq_len:
if is_encoder_data:
print_warning_once(
@@ -235,15 +241,15 @@ def dummy_data_for_profiling(
raise AssertionError(
f"Expected at least {seq_len} dummy tokens for profiling, "
f"but found {len(num_tokens)} tokens instead.")
- if mm_data is not None:
- for k, v in mm_data.items():
+ if dummy_data.multi_modal_data is not None:
+ for k, v in dummy_data.multi_modal_data.items():
num_items = len(v) if isinstance(v, list) else 1
num_expected = mm_counts[k]
assert num_items >= num_expected, (
f"Expected at least {num_expected} dummy '{k}' instances "
f"for profiling, but found {num_items} instances instead.")
- return seq_data, mm_data
+ return dummy_data
def _default_input_processor(
self,
diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py
index 1f2d7384076ed..e612010677364 100644
--- a/vllm/model_executor/models/blip.py
+++ b/vllm/model_executor/models/blip.py
@@ -98,6 +98,11 @@ def input_processor_for_blip(
if multi_modal_data is None or "image" not in multi_modal_data:
return inputs
+ if "multi_modal_placeholders" in inputs and "image" in inputs[
+ "multi_modal_placeholders"]:
+ # The inputs already have placeholders.
+ return inputs
+
tokenizer = cached_get_tokenizer(model_config.tokenizer)
if image_feature_size_override is None:
@@ -105,7 +110,7 @@ def input_processor_for_blip(
else:
image_feature_size = image_feature_size_override
- new_prompt, new_token_ids = repeat_and_pad_placeholder_tokens(
+ new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens(
tokenizer,
inputs.get("prompt"),
inputs["prompt_token_ids"],
@@ -116,7 +121,8 @@ def input_processor_for_blip(
# NOTE: Create a defensive copy of the original inputs
return token_inputs(prompt_token_ids=new_token_ids,
prompt=new_prompt,
- multi_modal_data=multi_modal_data)
+ multi_modal_data=multi_modal_data,
+ multi_modal_placeholders={"image": ranges})
# Adapted from https://github.com/huggingface/transformers/blob/v4.39.0/src/transformers/models/blip/modeling_blip.py#L164 # noqa
diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py
index c3b3cc8a4ddb6..db1f92649bd49 100644
--- a/vllm/model_executor/models/blip2.py
+++ b/vllm/model_executor/models/blip2.py
@@ -9,13 +9,14 @@
from vllm.attention import AttentionMetadata
from vllm.config import CacheConfig, MultiModalConfig
-from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext,
- token_inputs)
+from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
+ InputContext, token_inputs)
from vllm.model_executor.layers.activation import get_act_fn
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.sampler import Sampler, SamplerOutput
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.utils import consecutive_placeholder_ranges
from vllm.sequence import IntermediateTensors, SequenceData
from .blip import (BlipVisionModel, dummy_image_for_blip,
@@ -425,7 +426,11 @@ def dummy_seq_data_for_blip2(
return SequenceData.from_prompt_token_counts(
(image_token_id, image_feature_size * num_images),
(0, seq_len - image_feature_size * num_images),
- )
+ ), {
+ "image":
+ consecutive_placeholder_ranges(num_items=num_images,
+ item_size=image_feature_size)
+ }
def dummy_data_for_blip2(ctx: InputContext, seq_len: int,
@@ -434,7 +439,7 @@ def dummy_data_for_blip2(ctx: InputContext, seq_len: int,
vision_config = hf_config.vision_config
num_images = mm_counts["image"]
- seq_data = dummy_seq_data_for_blip2(
+ seq_data, ranges = dummy_seq_data_for_blip2(
hf_config,
seq_len,
num_images,
@@ -444,7 +449,7 @@ def dummy_data_for_blip2(ctx: InputContext, seq_len: int,
if isinstance(vision_config, Blip2VisionConfig):
mm_data = dummy_image_for_blip(vision_config, num_images)
- return seq_data, mm_data
+ return DummyData(seq_data, mm_data, ranges)
msg = f"Unsupported vision config: {type(vision_config)}"
raise NotImplementedError(msg)
diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py
index aaf559ca386cc..9f6c6786c0fa4 100644
--- a/vllm/model_executor/models/chameleon.py
+++ b/vllm/model_executor/models/chameleon.py
@@ -11,8 +11,8 @@
from vllm.attention import Attention, AttentionMetadata
from vllm.config import CacheConfig, MultiModalConfig
from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
-from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext,
- token_inputs)
+from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
+ InputContext, token_inputs)
from vllm.model_executor.layers.activation import SiluAndMul
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
@@ -30,6 +30,7 @@
from vllm.model_executor.utils import set_weight_attrs
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.utils import (cached_get_tokenizer,
+ consecutive_placeholder_ranges,
repeat_and_pad_placeholder_tokens)
from vllm.sequence import IntermediateTensors, SequenceData
from vllm.utils import print_warning_once
@@ -73,7 +74,11 @@ def dummy_seq_data_for_chameleon(
return SequenceData.from_prompt_token_counts(
(image_token_id, image_feature_size * num_images),
(0, seq_len - image_feature_size * num_images),
- )
+ ), {
+ "image":
+ consecutive_placeholder_ranges(num_items=num_images,
+ item_size=image_feature_size)
+ }
def dummy_image_for_chameleon(
@@ -97,14 +102,14 @@ def dummy_data_for_chameleon(ctx: InputContext, seq_len: int,
mm_counts: Mapping[str, int]):
num_images = mm_counts["image"]
- seq_data = dummy_seq_data_for_chameleon(
+ seq_data, ranges = dummy_seq_data_for_chameleon(
seq_len,
num_images,
image_token_id=CHAMELEON_IMAGE_TOKEN_ID,
)
mm_data = dummy_image_for_chameleon(num_images)
- return seq_data, mm_data
+ return DummyData(seq_data, mm_data, ranges)
def input_processor_for_chameleon(ctx: InputContext,
@@ -120,9 +125,14 @@ def input_processor_for_chameleon(ctx: InputContext,
if multi_modal_data is None or "image" not in multi_modal_data:
return inputs
+ if "multi_modal_placeholders" in inputs and "image" in inputs[
+ "multi_modal_placeholders"]:
+ # The inputs already have placeholders.
+ return inputs
+
model_config = ctx.model_config
tokenizer = cached_get_tokenizer(model_config.tokenizer)
- new_prompt, new_token_ids = repeat_and_pad_placeholder_tokens(
+ new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens(
tokenizer,
inputs.get("prompt"),
inputs["prompt_token_ids"],
diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py
index a3293020c042e..2d81b9266826b 100644
--- a/vllm/model_executor/models/clip.py
+++ b/vllm/model_executor/models/clip.py
@@ -19,6 +19,7 @@
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.multimodal.utils import (cached_get_tokenizer,
+ consecutive_placeholder_ranges,
repeat_and_pad_placeholder_tokens)
from vllm.sequence import SequenceData
@@ -49,14 +50,13 @@ def get_max_clip_image_tokens(hf_config: CLIPVisionConfig) -> int:
return get_clip_image_feature_size(hf_config)
-def dummy_seq_data_for_clip(
- hf_config: CLIPVisionConfig,
- seq_len: int,
- num_images: int,
- *,
- image_token_id: int,
- image_feature_size_override: Optional[int] = None,
-):
+def dummy_seq_data_for_clip(hf_config: CLIPVisionConfig,
+ seq_len: int,
+ num_images: int,
+ *,
+ image_token_id: int,
+ image_feature_size_override: Optional[int] = None,
+ mm_key: str = "image"):
if image_feature_size_override is None:
image_feature_size = get_clip_image_feature_size(hf_config)
else:
@@ -65,7 +65,11 @@ def dummy_seq_data_for_clip(
return SequenceData.from_prompt_token_counts(
(image_token_id, image_feature_size * num_images),
(0, seq_len - image_feature_size * num_images),
- )
+ ), {
+ mm_key:
+ consecutive_placeholder_ranges(num_items=num_images,
+ item_size=image_feature_size)
+ }
def dummy_image_for_clip(
@@ -117,6 +121,11 @@ def input_processor_for_clip(
if multi_modal_data is None or "image" not in multi_modal_data:
return inputs
+ if "multi_modal_placeholders" in inputs and "image" in inputs[
+ "multi_modal_placeholders"]:
+ # The inputs already have placeholders.
+ return inputs
+
tokenizer = cached_get_tokenizer(model_config.tokenizer)
if image_feature_size_override is None:
@@ -130,7 +139,7 @@ def input_processor_for_clip(
else:
image_feature_size = image_feature_size_override
- new_prompt, new_token_ids = repeat_and_pad_placeholder_tokens(
+ new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens(
tokenizer,
inputs.get("prompt"),
inputs["prompt_token_ids"],
@@ -141,7 +150,8 @@ def input_processor_for_clip(
# NOTE: Create a defensive copy of the original inputs
return token_inputs(prompt_token_ids=new_token_ids,
prompt=new_prompt,
- multi_modal_data=multi_modal_data)
+ multi_modal_data=multi_modal_data,
+ multi_modal_placeholders={"image": ranges})
# Adapted from https://github.com/huggingface/transformers/blob/v4.39.0/src/transformers/models/clip/modeling_clip.py#L164 # noqa
diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py
index 358d1dd288c49..0de590d1d8372 100644
--- a/vllm/model_executor/models/fuyu.py
+++ b/vllm/model_executor/models/fuyu.py
@@ -27,8 +27,8 @@
from vllm.attention import AttentionMetadata
from vllm.config import CacheConfig, MultiModalConfig
-from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext,
- token_inputs)
+from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
+ InputContext, token_inputs)
from vllm.model_executor.layers.linear import ColumnParallelLinear
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.sampler import SamplerOutput
@@ -37,9 +37,11 @@
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.base import MultiModalInputs
from vllm.multimodal.image import cached_get_image_processor
-from vllm.multimodal.utils import cached_get_tokenizer
+from vllm.multimodal.utils import (cached_get_tokenizer,
+ consecutive_placeholder_ranges)
from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors,
SequenceData)
+from vllm.utils import is_list_of
from .interfaces import SupportsMultiModal, SupportsPP
from .utils import AutoWeightsLoader, flatten_bn, merge_multimodal_embeddings
@@ -103,7 +105,11 @@ def dummy_seq_data_for_fuyu(ctx: InputContext, seq_len: int, num_images: int):
token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE, image_token_ids) * num_images
token_ids += array(VLLM_TOKEN_ID_ARRAY_TYPE,
[0]) * (seq_len - image_feature_size * num_images)
- return SequenceData(token_ids)
+ return SequenceData(token_ids), {
+ "image":
+ consecutive_placeholder_ranges(num_items=num_images,
+ item_size=image_feature_size)
+ }
def dummy_image_for_fuyu(
@@ -119,15 +125,15 @@ def dummy_image_for_fuyu(
def dummy_data_for_fuyu(ctx: InputContext, seq_len: int,
mm_counts: Mapping[str, int]):
num_images = mm_counts["image"]
- seq_data = dummy_seq_data_for_fuyu(ctx, seq_len, num_images)
+ seq_data, ranges = dummy_seq_data_for_fuyu(ctx, seq_len, num_images)
mm_data = dummy_image_for_fuyu(num_images,
image_width=MAX_IMAGE_FEATURE_SIZE_WIDTH,
image_height=MAX_IMAGE_FEATURE_SIZE_HEIGHT)
- return seq_data, mm_data
+ return DummyData(seq_data, mm_data, ranges)
def _fuyu_image_preprocess(image_processor: FuyuImageProcessor,
- data: Image.Image):
+ data: List[Image.Image]):
image_encoding = image_processor.preprocess(data, return_tensors="pt")
batch_images = torch.stack([img[0] for img in image_encoding["images"]
]).unsqueeze(1)
@@ -158,8 +164,10 @@ def input_processor_for_fuyu(ctx: InputContext, inputs: DecoderOnlyInputs):
model_config = ctx.model_config
image_data = multi_modal_data["image"]
new_multi_modal_data = {}
+ image_list = image_data if isinstance(image_data, list) else [image_data]
+
# process image data
- if isinstance(image_data, Image.Image):
+ if is_list_of(image_list, Image.Image):
# Fuyu's image_processor can also finish token padding
image_processor: FuyuImageProcessor = cached_get_image_processor(
model_config.model)
@@ -171,7 +179,7 @@ def input_processor_for_fuyu(ctx: InputContext, inputs: DecoderOnlyInputs):
])
new_multi_modal_data["image"] = image_patches
- elif isinstance(image_data, torch.Tensor):
+ elif is_list_of(image_list, torch.Tensor):
raise NotImplementedError("Embeddings input is not supported yet")
else:
raise TypeError(f"Invalid image type: {type(image_data)}")
@@ -198,12 +206,13 @@ def input_processor_for_fuyu(ctx: InputContext, inputs: DecoderOnlyInputs):
def input_mapper_for_fuyu(ctx: InputContext, data: object):
model_config = ctx.model_config
- if isinstance(data, Image.Image):
+ data_list = data if isinstance(data, list) else [data]
+ if is_list_of(data_list, Image.Image):
# Fuyu's image_processor can also finish token padding
image_processor: FuyuImageProcessor = cached_get_image_processor(
model_config.model)
- model_image_input = _fuyu_image_preprocess(image_processor, data)
+ model_image_input = _fuyu_image_preprocess(image_processor, data_list)
data = torch.stack([
image_patch[0]
for image_patch in model_image_input["image_patches"]
diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py
index 1c1fde5b30983..d2ec0ff6e74c6 100644
--- a/vllm/model_executor/models/internvl.py
+++ b/vllm/model_executor/models/internvl.py
@@ -17,8 +17,8 @@
from vllm.attention import AttentionMetadata
from vllm.config import CacheConfig, MultiModalConfig
-from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext,
- token_inputs)
+from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
+ InputContext, token_inputs)
from vllm.model_executor.layers.quantization import (AWQConfig,
QuantizationConfig)
from vllm.model_executor.layers.sampler import Sampler, SamplerOutput
@@ -379,7 +379,7 @@ def dummy_data(
model_config.tokenizer,
trust_remote_code=model_config.trust_remote_code)
- seq_data = dummy_seq_data_for_clip(
+ seq_data, ranges = dummy_seq_data_for_clip(
hf_config.vision_config,
seq_len,
num_images,
@@ -398,7 +398,7 @@ def dummy_data(
image_height_override=max_image_height,
)
- return seq_data, mm_data
+ return DummyData(seq_data, mm_data, ranges)
input_pipeline = InternVLInputPipeline(IMG_START, IMG_END, IMG_CONTEXT)
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index 27055e7ced865..7fbd59ebd98fd 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -10,7 +10,8 @@
from vllm.attention import AttentionMetadata
from vllm.config import CacheConfig, MultiModalConfig
-from vllm.inputs import INPUT_REGISTRY, DecoderOnlyInputs, InputContext
+from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
+ InputContext)
from vllm.model_executor.layers.activation import get_act_fn
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.sampler import Sampler, SamplerOutput
@@ -111,7 +112,7 @@ def dummy_data_for_llava(ctx: InputContext, seq_len: int,
image_feature_size = get_max_llava_image_tokens(ctx)
if isinstance(vision_config, CLIPVisionConfig):
- seq_data = dummy_seq_data_for_clip(
+ seq_data, ranges = dummy_seq_data_for_clip(
vision_config,
seq_len,
num_images,
@@ -120,9 +121,9 @@ def dummy_data_for_llava(ctx: InputContext, seq_len: int,
)
mm_data = dummy_image_for_clip(vision_config, num_images)
- return seq_data, mm_data
+ return DummyData(seq_data, mm_data, ranges)
elif isinstance(vision_config, SiglipVisionConfig):
- seq_data = dummy_seq_data_for_siglip(
+ seq_data, ranges = dummy_seq_data_for_siglip(
vision_config,
seq_len,
num_images,
@@ -131,9 +132,9 @@ def dummy_data_for_llava(ctx: InputContext, seq_len: int,
)
mm_data = dummy_image_for_siglip(vision_config, num_images)
- return seq_data, mm_data
+ return DummyData(seq_data, mm_data, ranges)
elif isinstance(vision_config, PixtralVisionConfig):
- seq_data = dummy_seq_data_for_pixtral_hf(
+ seq_data, ranges = dummy_seq_data_for_pixtral_hf(
vision_config,
seq_len,
num_images,
@@ -142,7 +143,7 @@ def dummy_data_for_llava(ctx: InputContext, seq_len: int,
)
mm_data = dummy_image_for_pixtral_hf(vision_config, num_images)
- return seq_data, mm_data
+ return DummyData(seq_data, mm_data, ranges)
msg = f"Unsupported vision config: {type(vision_config)}"
raise NotImplementedError(msg)
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index e8540d85ff565..e8c5786066170 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -12,7 +12,8 @@
from vllm.attention import AttentionMetadata
from vllm.config import CacheConfig, MultiModalConfig, PoolerConfig
-from vllm.inputs import INPUT_REGISTRY, DecoderOnlyInputs, InputContext
+from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
+ InputContext)
from vllm.model_executor.layers.pooler import Pooler, PoolingType
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.sampler import Sampler, SamplerOutput
@@ -180,7 +181,7 @@ def dummy_data_for_llava_next(ctx: InputContext, seq_len: int,
max_feat_height, max_feat_width = pinpoint
if isinstance(vision_config, CLIPVisionConfig):
- seq_data = dummy_seq_data_for_clip(
+ seq_data, ranges = dummy_seq_data_for_clip(
vision_config,
seq_len,
num_images,
@@ -195,9 +196,9 @@ def dummy_data_for_llava_next(ctx: InputContext, seq_len: int,
image_height_override=max_feat_height,
)
- return seq_data, mm_data
+ return DummyData(seq_data, mm_data, ranges)
elif isinstance(vision_config, SiglipVisionConfig):
- seq_data = dummy_seq_data_for_siglip(
+ seq_data, ranges = dummy_seq_data_for_siglip(
vision_config,
seq_len,
num_images,
@@ -212,7 +213,7 @@ def dummy_data_for_llava_next(ctx: InputContext, seq_len: int,
image_height_override=max_feat_height,
)
- return seq_data, mm_data
+ return DummyData(seq_data, mm_data, ranges)
msg = f"Unsupported vision config: {type(vision_config)}"
raise NotImplementedError(msg)
diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py
index b8051d5fc6ae2..b755e2347f6ed 100644
--- a/vllm/model_executor/models/llava_next_video.py
+++ b/vllm/model_executor/models/llava_next_video.py
@@ -11,8 +11,8 @@
from vllm.attention import AttentionMetadata
from vllm.config import CacheConfig, MultiModalConfig
-from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext,
- token_inputs)
+from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
+ InputContext, token_inputs)
from vllm.model_executor.layers.activation import get_act_fn
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.sampler import Sampler, SamplerOutput
@@ -108,33 +108,35 @@ def dummy_data_for_llava_next_video(ctx: InputContext, seq_len: int,
video_feature_size = frames_per_video * tokens_per_frame
if isinstance(vision_config, CLIPVisionConfig):
- seq_data = dummy_seq_data_for_clip(
+ seq_data, ranges = dummy_seq_data_for_clip(
vision_config,
seq_len,
num_videos,
image_token_id=hf_config.video_token_index,
image_feature_size_override=video_feature_size,
+ mm_key="video",
)
pil_frame = dummy_image_for_clip(vision_config, num_images=1)
np_frame = np.array(pil_frame["image"])
mm_data_per_video = np.repeat([np_frame], frames_per_video, axis=0)
mm_data = {"video": mm_data_per_video}
- return seq_data, mm_data
+ return DummyData(seq_data, mm_data, ranges)
elif isinstance(vision_config, SiglipVisionConfig):
- seq_data = dummy_seq_data_for_siglip(
+ seq_data, ranges = dummy_seq_data_for_siglip(
vision_config,
seq_len,
num_videos,
image_token_id=hf_config.video_token_index,
image_feature_size_override=video_feature_size,
+ mm_key="video",
)
pil_frame = dummy_image_for_siglip(vision_config, num_images=1)
np_frame = np.array(pil_frame["image"])
mm_data_per_video = np.repeat([np_frame], frames_per_video, axis=0)
mm_data = {"video": mm_data_per_video}
- return seq_data, mm_data
+ return DummyData(seq_data, mm_data, ranges)
msg = f"Unsupported vision config: {type(vision_config)}"
raise NotImplementedError(msg)
@@ -145,6 +147,12 @@ def input_processor_for_llava_next_video(ctx: InputContext,
multi_modal_data = inputs.get("multi_modal_data")
if multi_modal_data is None or "video" not in multi_modal_data:
return inputs
+
+ if "multi_modal_placeholders" in inputs and "video" in inputs[
+ "multi_modal_placeholders"]:
+ # The inputs already have placeholders.
+ return inputs
+
video_data = multi_modal_data["video"]
model_config = ctx.model_config
@@ -160,7 +168,7 @@ def input_processor_for_llava_next_video(ctx: InputContext,
tokenizer = cached_get_tokenizer(model_config.tokenizer)
- new_prompt, new_token_ids = repeat_and_pad_placeholder_tokens(
+ new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens(
tokenizer,
inputs.get("prompt"),
inputs["prompt_token_ids"],
@@ -170,7 +178,8 @@ def input_processor_for_llava_next_video(ctx: InputContext,
return token_inputs(prompt_token_ids=new_token_ids,
prompt=new_prompt,
- multi_modal_data=multi_modal_data)
+ multi_modal_data=multi_modal_data,
+ multi_modal_placeholders={"video": ranges})
elif is_list_of(video_data, np.ndarray):
raise NotImplementedError(
diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py
index a0cf208a65f36..f410d64577a77 100644
--- a/vllm/model_executor/models/llava_onevision.py
+++ b/vllm/model_executor/models/llava_onevision.py
@@ -15,8 +15,8 @@
from vllm.attention import AttentionMetadata
from vllm.config import CacheConfig, MultiModalConfig
-from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext,
- token_inputs)
+from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
+ InputContext, token_inputs)
from vllm.model_executor.layers.activation import get_act_fn
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.sampler import Sampler, SamplerOutput
@@ -218,31 +218,31 @@ def dummy_data_for_llava_onevision(ctx: InputContext, seq_len: int,
video_feature_size = get_llava_onevision_video_tokens(ctx, num_frames)
if isinstance(vision_config, CLIPVisionConfig):
- seq_data = dummy_seq_data_for_clip(
+ seq_data, ranges = dummy_seq_data_for_clip(
vision_config,
seq_len,
num_videos,
image_token_id=hf_config.video_token_index,
image_feature_size_override=video_feature_size,
- )
+ mm_key="video")
mm_data = dummy_video_for_clip(vision_config,
num_frames=num_frames,
num_videos=num_videos)
- return seq_data, mm_data
+ return DummyData(seq_data, mm_data, ranges)
elif isinstance(vision_config, SiglipVisionConfig):
- seq_data = dummy_seq_data_for_siglip(
+ seq_data, ranges = dummy_seq_data_for_siglip(
vision_config,
seq_len,
num_videos,
image_token_id=hf_config.video_token_index,
image_feature_size_override=video_feature_size,
- )
+ mm_key="video")
mm_data = dummy_video_for_siglip(vision_config,
num_frames=num_frames,
num_videos=num_videos)
- return seq_data, mm_data
+ return DummyData(seq_data, mm_data, ranges)
msg = f"Unsupported vision config: {type(vision_config)}"
raise NotImplementedError(msg)
@@ -320,7 +320,7 @@ def input_processor_when_multimodal_input_video(ctx: InputContext,
video_feature_size = get_llava_onevision_video_tokens(ctx, num_frames)
tokenizer = cached_get_tokenizer(model_config.tokenizer)
- new_prompt, new_token_ids = repeat_and_pad_placeholder_tokens(
+ new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens(
tokenizer,
inputs.get("prompt"),
inputs["prompt_token_ids"],
@@ -330,7 +330,8 @@ def input_processor_when_multimodal_input_video(ctx: InputContext,
return token_inputs(prompt_token_ids=new_token_ids,
prompt=new_prompt,
- multi_modal_data=multi_modal_data)
+ multi_modal_data=multi_modal_data,
+ multi_modal_placeholders={"video": ranges})
elif is_list_of(video_data, np.ndarray):
video_feature_size = []
diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py
index 4917c33136069..a526a5dccd398 100644
--- a/vllm/model_executor/models/minicpmv.py
+++ b/vllm/model_executor/models/minicpmv.py
@@ -36,8 +36,8 @@
from vllm.attention import AttentionMetadata
from vllm.config import CacheConfig, LoRAConfig, MultiModalConfig
-from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext,
- token_inputs)
+from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
+ InputContext, token_inputs)
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.resampler import (BaseResampler, Resampler2,
@@ -277,7 +277,7 @@ def dummy_data_for_minicpmv(ctx: InputContext, seq_len: int,
seq_data = dummy_seq_data_for_minicpmv(seq_len, num_images)
mm_data = dummy_image_for_minicpmv(ctx, hf_config, num_images)
- return seq_data, mm_data
+ return DummyData(seq_data, mm_data)
def input_processor_for_minicpmv(ctx: InputContext, inputs: DecoderOnlyInputs):
diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py
index 5cf5272cae878..19c3827e43703 100644
--- a/vllm/model_executor/models/mllama.py
+++ b/vllm/model_executor/models/mllama.py
@@ -36,7 +36,7 @@
from vllm.attention.ops.paged_attn import PagedAttention
from vllm.config import CacheConfig, MultiModalConfig
from vllm.distributed import get_tensor_model_parallel_world_size
-from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs,
+from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
EncoderDecoderInputs, InputContext)
from vllm.logger import init_logger
from vllm.model_executor.layers.layernorm import RMSNorm
@@ -176,13 +176,14 @@ def dummy_image(num_images: int, ):
def dummy_decoder_data_for_mllama(ctx: InputContext, seq_len: int,
mm_counts: Mapping[str, int]):
num_images = mm_counts["image"]
- return dummy_decoder_seq_data(seq_len, num_images), None
+ return DummyData(dummy_decoder_seq_data(seq_len, num_images))
def dummy_encoder_data_for_mllama(ctx: InputContext, seq_len: int,
mm_counts: Mapping[str, int]):
num_images = mm_counts["image"]
- return dummy_encoder_seq_data(ctx, num_images), dummy_image(num_images)
+ return DummyData(dummy_encoder_seq_data(ctx, num_images),
+ dummy_image(num_images))
def _prepare_aspect_ratio_attention_mask(
diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py
index 8e29c6079b994..4b6061e113cb2 100644
--- a/vllm/model_executor/models/paligemma.py
+++ b/vllm/model_executor/models/paligemma.py
@@ -7,8 +7,8 @@
from vllm.attention import AttentionMetadata
from vllm.config import CacheConfig, MultiModalConfig
-from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext,
- token_inputs)
+from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
+ InputContext, token_inputs)
from vllm.logger import init_logger
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.sampler import SamplerOutput
@@ -58,7 +58,7 @@ def dummy_data_for_paligemma(ctx: InputContext, seq_len: int,
vision_config = hf_config.vision_config
num_images = mm_counts["image"]
- seq_data = dummy_seq_data_for_siglip(
+ seq_data, ranges = dummy_seq_data_for_siglip(
vision_config,
seq_len,
num_images,
@@ -66,7 +66,7 @@ def dummy_data_for_paligemma(ctx: InputContext, seq_len: int,
)
mm_data = dummy_image_for_siglip(vision_config, num_images)
- return seq_data, mm_data
+ return DummyData(seq_data, mm_data, ranges)
def input_processor_for_paligemma(ctx: InputContext,
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index 4928e447d5b9e..5b477a8ed5f49 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -28,8 +28,8 @@
from vllm.attention import AttentionMetadata
from vllm.config import (CacheConfig, ModelConfig, MultiModalConfig,
PoolerConfig)
-from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext,
- token_inputs)
+from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
+ InputContext, token_inputs)
from vllm.logger import init_logger
from vllm.model_executor.layers.pooler import Pooler, PoolingType
from vllm.model_executor.layers.quantization import QuantizationConfig
@@ -380,7 +380,7 @@ def dummy_data_for_phi3v(ctx: InputContext,
image_feature_size = get_max_phi3v_image_tokens(ctx, num_crops=num_crops)
- seq_data = dummy_seq_data_for_clip(
+ seq_data, ranges = dummy_seq_data_for_clip(
CLIP_VIT_LARGE_PATCH14_336_CONFIG,
seq_len,
num_images,
@@ -394,7 +394,7 @@ def dummy_data_for_phi3v(ctx: InputContext,
image_height_override=MAX_IMAGE_FEATURE_SIZE_HEIGHT,
)
- return seq_data, mm_data
+ return DummyData(seq_data, mm_data, ranges)
@lru_cache
diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py
index 6b53bf5660096..051454c49bff8 100644
--- a/vllm/model_executor/models/pixtral.py
+++ b/vllm/model_executor/models/pixtral.py
@@ -17,8 +17,8 @@
from vllm.attention import AttentionMetadata
from vllm.config import CacheConfig, ModelConfig, MultiModalConfig
-from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext,
- token_inputs)
+from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
+ InputContext, token_inputs)
from vllm.model_executor.layers.activation import get_act_fn
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.quantization import QuantizationConfig
@@ -28,7 +28,8 @@
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.base import MultiModalInputs
-from vllm.multimodal.utils import cached_get_tokenizer
+from vllm.multimodal.utils import (cached_get_tokenizer,
+ consecutive_placeholder_ranges)
from vllm.sequence import IntermediateTensors, SequenceData
from vllm.transformers_utils.processor import cached_get_processor
from vllm.utils import is_list_of
@@ -81,7 +82,12 @@ def dummy_data_for_pixtral(ctx: InputContext, seq_len: int,
)
mm_data = {"image": num_images * [image]}
- return seq_data, mm_data
+ mm_placeholders = {
+ "image":
+ consecutive_placeholder_ranges(num_items=num_images,
+ item_size=image_feature_size)
+ }
+ return DummyData(seq_data, mm_data, mm_placeholders)
def input_mapper_for_pixtral(ctx: InputContext,
@@ -630,13 +636,13 @@ def get_max_pixtral_hf_image_tokens(hf_config: PixtralVisionConfig) -> int:
def dummy_seq_data_for_pixtral_hf(
- hf_config: PixtralVisionConfig,
- seq_len: int,
- num_images: int,
- *,
- image_token_id: int,
- image_feature_size_override: Optional[int] = None,
-):
+ hf_config: PixtralVisionConfig,
+ seq_len: int,
+ num_images: int,
+ *,
+ image_token_id: int,
+ image_feature_size_override: Optional[int] = None,
+ mm_key: str = "image"):
if image_feature_size_override is None:
image_feature_size = get_max_pixtral_hf_image_feature_size(hf_config)
else:
@@ -645,7 +651,11 @@ def dummy_seq_data_for_pixtral_hf(
return SequenceData.from_prompt_token_counts(
(image_token_id, image_feature_size * num_images),
(0, seq_len - image_feature_size * num_images),
- )
+ ), {
+ mm_key:
+ consecutive_placeholder_ranges(num_items=num_images,
+ item_size=image_feature_size)
+ }
def dummy_image_for_pixtral_hf(
diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py
index 61665768eacf5..b2b5c70182135 100644
--- a/vllm/model_executor/models/qwen.py
+++ b/vllm/model_executor/models/qwen.py
@@ -23,8 +23,8 @@
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, LoRAConfig, MultiModalConfig
from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
-from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext,
- token_inputs)
+from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
+ InputContext, token_inputs)
from vllm.logger import init_logger
from vllm.model_executor.layers.activation import SiluAndMul, get_act_fn
from vllm.model_executor.layers.layernorm import RMSNorm
@@ -810,7 +810,7 @@ def dummy_data_for_qwen(
ctx: InputContext,
seq_len: int,
mm_counts: Mapping[str, int],
-) -> Tuple[SequenceData, Optional[Dict]]:
+) -> DummyData:
"""Build dummy data for warming up Qwen models; this will only contain text
matching the defaults for VLLM unless the model has a visual config.
@@ -829,7 +829,7 @@ def dummy_data_for_qwen(
if not hasattr(hf_config, "visual"):
seq_data = SequenceData.from_prompt_token_counts((0, seq_len))
mm_data = None
- return seq_data, mm_data
+ return DummyData(seq_data, mm_data)
# We have a visual component - use images to warm up
num_images = mm_counts["image"]
@@ -861,7 +861,7 @@ def dummy_data_for_qwen(
# the data will get resized and the # of tokens per image is constant
image = Image.new("RGB", (224, 224), color=0)
mm_data = {"image": image if num_images == 1 else [image] * num_images}
- return seq_data, mm_data
+ return DummyData(seq_data, mm_data)
class QWenBaseModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py
index 3d049eeb920b7..6114548bda42c 100644
--- a/vllm/model_executor/models/qwen2_audio.py
+++ b/vllm/model_executor/models/qwen2_audio.py
@@ -31,8 +31,8 @@
from vllm.attention import AttentionMetadata
from vllm.config import CacheConfig, MultiModalConfig
-from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext,
- token_inputs)
+from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
+ InputContext, token_inputs)
from vllm.logger import init_logger
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.quantization.base_config import (
@@ -44,6 +44,7 @@
from vllm.model_executor.models.qwen2 import Qwen2Model
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalInputs
+from vllm.multimodal.utils import consecutive_placeholder_ranges
from vllm.sequence import IntermediateTensors, SequenceData
from .interfaces import SupportsMultiModal, SupportsPP
@@ -85,7 +86,8 @@ def forward(self, audio_features):
def dummy_data_for_qwen2_audio(ctx: InputContext, seq_len: int,
mm_counts: Mapping[str, int]):
num_audios = mm_counts["audio"]
- max_llm_audio_tokens = get_max_qwen2_audio_audio_tokens(ctx) * num_audios
+ max_tokens_per_audio = get_max_qwen2_audio_audio_tokens(ctx)
+ max_llm_audio_tokens = max_tokens_per_audio * num_audios
if seq_len - max_llm_audio_tokens - 2 < 0:
raise RuntimeError(
f"Qwen2-Audio cannot process {num_audios} audios in a prompt, "
@@ -99,7 +101,12 @@ def dummy_data_for_qwen2_audio(ctx: InputContext, seq_len: int,
(0, seq_len - max_llm_audio_tokens),
)
dummy_audio = np.full((max_llm_audio_tokens * 2 * 2 * 160, ), 0.)
- return dummy_seqdata, {"audio": [(dummy_audio, 16000)] * num_audios}
+ return DummyData(
+ dummy_seqdata, {"audio": [(dummy_audio, 16000)] * num_audios}, {
+ "audio":
+ consecutive_placeholder_ranges(num_items=num_audios,
+ item_size=max_tokens_per_audio)
+ })
def get_processor(
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index 1e12c2332b65e..d801903f8f9fe 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -44,8 +44,8 @@
from vllm.config import CacheConfig, MultiModalConfig
from vllm.distributed import get_pp_group, parallel_state
from vllm.distributed import utils as dist_utils
-from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext,
- token_inputs)
+from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
+ InputContext, token_inputs)
from vllm.logger import init_logger
from vllm.model_executor import SamplingMetadata
from vllm.model_executor.layers.activation import QuickGELU
@@ -744,9 +744,10 @@ def dummy_data_for_qwen2_vl(
dummy_image = Image.new("RGB", (max_resized_width, max_resized_height),
color=0)
- return dummy_seqdata, {
- "image": dummy_image if num_images == 1 else [dummy_image] * num_images
- }
+ return DummyData(dummy_seqdata, {
+ "image":
+ dummy_image if num_images == 1 else [dummy_image] * num_images
+ })
def _get_llm_num_vision_tokens(
diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py
index 2e7ae32055aaf..acaf4afdecfe5 100644
--- a/vllm/model_executor/models/siglip.py
+++ b/vllm/model_executor/models/siglip.py
@@ -23,6 +23,7 @@
VocabParallelEmbedding)
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.multimodal.utils import (cached_get_tokenizer,
+ consecutive_placeholder_ranges,
repeat_and_pad_placeholder_tokens)
from vllm.sequence import SequenceData
@@ -61,6 +62,7 @@ def dummy_seq_data_for_siglip(
*,
image_token_id: int,
image_feature_size_override: Optional[int] = None,
+ mm_key: str = "image",
):
if image_feature_size_override is None:
image_feature_size = get_siglip_image_feature_size(hf_config)
@@ -70,7 +72,11 @@ def dummy_seq_data_for_siglip(
return SequenceData.from_prompt_token_counts(
(image_token_id, image_feature_size * num_images),
(0, seq_len - image_feature_size * num_images),
- )
+ ), {
+ mm_key:
+ consecutive_placeholder_ranges(num_items=num_images,
+ item_size=image_feature_size)
+ }
def dummy_image_for_siglip(
@@ -122,6 +128,11 @@ def input_processor_for_siglip(
if multi_modal_data is None or "image" not in multi_modal_data:
return inputs
+ if "multi_modal_placeholders" in inputs and "image" in inputs[
+ "multi_modal_placeholders"]:
+ # The inputs already have placeholders.
+ return inputs
+
tokenizer = cached_get_tokenizer(model_config.tokenizer)
if image_feature_size_override is None:
@@ -135,7 +146,7 @@ def input_processor_for_siglip(
else:
image_feature_size = image_feature_size_override
- new_prompt, new_token_ids = repeat_and_pad_placeholder_tokens(
+ new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens(
tokenizer,
inputs.get("prompt"),
inputs["prompt_token_ids"],
@@ -144,11 +155,10 @@ def input_processor_for_siglip(
)
# NOTE: Create a defensive copy of the original inputs
- return token_inputs(
- prompt_token_ids=new_token_ids,
- prompt=new_prompt,
- multi_modal_data=multi_modal_data,
- )
+ return token_inputs(prompt_token_ids=new_token_ids,
+ prompt=new_prompt,
+ multi_modal_data=multi_modal_data,
+ multi_modal_placeholders={"image": ranges})
# Adapted from https://github.com/huggingface/transformers/blob/v4.43.3/src/transformers/models/siglip/modeling_siglip.py#L249 # noqa
diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py
index f08e4aa355086..749750fc9c16e 100644
--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -2,7 +2,6 @@
"""PyTorch Ultravox model."""
import math
-from array import array
from functools import cached_property, lru_cache
from typing import (Iterable, List, Literal, Mapping, Optional, Tuple,
TypedDict, Union, cast)
@@ -17,27 +16,27 @@
from vllm.attention import AttentionMetadata
from vllm.config import CacheConfig, MultiModalConfig
-from vllm.inputs import INPUT_REGISTRY
-from vllm.inputs.data import DecoderOnlyInputs, token_inputs
-from vllm.inputs.registry import InputContext
+from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
+ InputContext, token_inputs)
from vllm.model_executor.layers.activation import SiluAndMul, get_act_fn
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.sampler import Sampler, SamplerOutput
from vllm.model_executor.model_loader.loader import DefaultModelLoader
from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.base import MultiModalInputs, NestedTensors
+from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalInputs,
+ NestedTensors)
from vllm.multimodal.utils import (cached_get_tokenizer,
+ consecutive_placeholder_ranges,
repeat_and_pad_placeholder_tokens)
-from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors,
- SequenceData)
+from vllm.sequence import IntermediateTensors, SequenceData
from vllm.transformers_utils.configs.ultravox import UltravoxConfig
from vllm.utils import is_list_of
from .interfaces import SupportsMultiModal, SupportsPP
from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,
- init_vllm_registered_model, merge_multimodal_embeddings)
+ init_vllm_registered_model,
+ merge_multimodal_embeddings_from_map)
_AUDIO_PLACEHOLDER_TOKEN = 128002
_AUDIO_TOKENS_PER_SECOND = 6.25
@@ -46,13 +45,13 @@
class UltravoxAudioFeatureInputs(TypedDict):
type: Literal["audio_features"]
data: NestedTensors
- """Shape: `(batch_size, num_audios, 80, M)"""
+ """Shape: `(batch_size, num_audios, 80, M)`"""
class UltravoxAudioEmbeddingInputs(TypedDict):
type: Literal["audio_embeds"]
data: NestedTensors
- """Shape: `(batch_size, num_audios, audio_feature_size, hidden_size)"""
+ """Shape: `(batch_size, num_audios, audio_feature_size, hidden_size)`"""
UltravoxAudioInputs = Union[UltravoxAudioFeatureInputs,
@@ -79,17 +78,16 @@ def dummy_seq_data_for_ultravox(
seq_len: int,
audio_count: int,
):
- audio_placeholder = array(
- VLLM_TOKEN_ID_ARRAY_TYPE,
- [_AUDIO_PLACEHOLDER_TOKEN]) * get_ultravox_max_audio_tokens(ctx)
+ audio_length = min(get_ultravox_max_audio_tokens(ctx),
+ seq_len // audio_count)
- # Add a separator between each chunk.
- audio_token_ids = (audio_placeholder +
- array(VLLM_TOKEN_ID_ARRAY_TYPE, [0])) * audio_count
- other_token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE,
- [0]) * (seq_len - len(audio_token_ids))
-
- return SequenceData(audio_token_ids + other_token_ids)
+ return SequenceData.from_prompt_token_counts(
+ (_AUDIO_PLACEHOLDER_TOKEN, audio_length * audio_count),
+ (0, seq_len - audio_length * audio_count)), {
+ "audio":
+ consecutive_placeholder_ranges(num_items=audio_count,
+ item_size=audio_length)
+ }
def dummy_audio_for_ultravox(
@@ -107,10 +105,10 @@ def dummy_data_for_ultravox(
mm_counts: Mapping[str, int],
):
audio_count = mm_counts["audio"]
- seq_data = dummy_seq_data_for_ultravox(ctx, seq_len, audio_count)
+ seq_data, ranges = dummy_seq_data_for_ultravox(ctx, seq_len, audio_count)
mm_dict = dummy_audio_for_ultravox(ctx, audio_count)
- return (seq_data, mm_dict)
+ return DummyData(seq_data, mm_dict, ranges)
def input_mapper_for_ultravox(ctx: InputContext, data: object):
@@ -164,6 +162,11 @@ def input_processor_for_ultravox(ctx: InputContext, inputs: DecoderOnlyInputs):
if multi_modal_data is None or "audio" not in multi_modal_data:
return inputs
+ if "multi_modal_placeholders" in inputs and "audio" in inputs[
+ "multi_modal_placeholders"]:
+ # The inputs already have placeholders.
+ return inputs
+
feature_extractor = whisper_feature_extractor(ctx)
audios = multi_modal_data["audio"]
if not isinstance(audios, list):
@@ -197,7 +200,7 @@ def input_processor_for_ultravox(ctx: InputContext, inputs: DecoderOnlyInputs):
tokenizer = cached_get_tokenizer(ctx.model_config.tokenizer)
- new_prompt, new_token_ids = repeat_and_pad_placeholder_tokens(
+ new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens(
tokenizer,
inputs.get("prompt"),
inputs["prompt_token_ids"],
@@ -208,7 +211,8 @@ def input_processor_for_ultravox(ctx: InputContext, inputs: DecoderOnlyInputs):
# NOTE: Create a defensive copy of the original inputs
return token_inputs(prompt_token_ids=new_token_ids,
prompt=new_prompt,
- multi_modal_data=multi_modal_data)
+ multi_modal_data=multi_modal_data,
+ multi_modal_placeholders={"audio": ranges})
class StackAudioFrames(nn.Module):
@@ -472,9 +476,9 @@ def forward(self, input_ids: torch.Tensor, positions: torch.Tensor,
inputs_embeds = self.language_model.model.get_input_embeddings(
input_ids)
- inputs_embeds = merge_multimodal_embeddings(
- input_ids, inputs_embeds, audio_embeddings,
- _AUDIO_PLACEHOLDER_TOKEN)
+ merge_multimodal_embeddings_from_map(
+ inputs_embeds, audio_embeddings,
+ attn_metadata.multi_modal_placeholder_index_maps["audio"])
input_ids = None
else:
inputs_embeds = None
diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py
index 0aecb5d151a45..c6ec1769fc5d1 100644
--- a/vllm/model_executor/models/utils.py
+++ b/vllm/model_executor/models/utils.py
@@ -18,7 +18,7 @@
from vllm.model_executor.model_loader.loader import build_model
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.models import ModelRegistry
-from vllm.multimodal.base import NestedTensors
+from vllm.multimodal.base import MultiModalPlaceholderMap, NestedTensors
from vllm.platforms import current_platform
from vllm.sequence import IntermediateTensors
from vllm.utils import is_pin_memory_available
@@ -326,6 +326,22 @@ def _embedding_count_expression(embeddings: NestedTensors) -> str:
_embedding_count_expression(inner) for inner in embeddings)
+def merge_multimodal_embeddings_from_map(
+ inputs_embeds: torch.Tensor, multimodal_embeddings: NestedTensors,
+ placeholder_map: MultiModalPlaceholderMap.IndexMap) -> torch.Tensor:
+ """
+ Merge ``multimodal_embeddings`` into ``inputs_embeds`` using the provided
+ placeholder map.
+
+ Note:
+ This updates ``inputs_embeds`` in place.
+ """
+ flattened_embeddings = _flatten_embeddings(multimodal_embeddings)
+ inputs_embeds[placeholder_map.dest] = flattened_embeddings[
+ placeholder_map.src]
+ return inputs_embeds
+
+
def _merge_multimodal_embeddings(
inputs_embeds: torch.Tensor,
is_multimodal: torch.Tensor,
diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py
index 489e1e51f05cb..53da2badb9b98 100644
--- a/vllm/multimodal/__init__.py
+++ b/vllm/multimodal/__init__.py
@@ -1,6 +1,7 @@
from .base import (BatchedTensorInputs, MultiModalDataBuiltins,
- MultiModalDataDict, MultiModalInputs, MultiModalPlugin,
- NestedTensors)
+ MultiModalDataDict, MultiModalInputs,
+ MultiModalPlaceholderDict, MultiModalPlaceholderMap,
+ MultiModalPlugin, NestedTensors)
from .registry import MultiModalRegistry
MULTIMODAL_REGISTRY = MultiModalRegistry()
@@ -17,6 +18,8 @@
"MultiModalDataBuiltins",
"MultiModalDataDict",
"MultiModalInputs",
+ "MultiModalPlaceholderDict",
+ "MultiModalPlaceholderMap",
"MultiModalPlugin",
"NestedTensors",
"MULTIMODAL_REGISTRY",
diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py
index 84e71cbf60df7..6b10d0c609f13 100644
--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
@@ -1,8 +1,9 @@
import sys
from abc import ABC, abstractmethod
from collections import UserDict, defaultdict
-from typing import (Any, Callable, Dict, List, Mapping, Optional, Tuple, Type,
- TypedDict, TypeVar, Union, cast, final)
+from typing import (TYPE_CHECKING, Any, Callable, Dict, List, Mapping,
+ NamedTuple, Optional, Tuple, Type, TypedDict, TypeVar,
+ Union, cast, final)
import numpy as np
import torch
@@ -11,12 +12,15 @@
from torch import nn
from typing_extensions import TypeAlias
-from vllm.config import ModelConfig
from vllm.inputs import InputContext
from vllm.logger import init_logger
from vllm.utils import (JSONTree, get_allowed_kwarg_only_overrides, is_list_of,
json_map_leaves, resolve_mm_processor_kwargs)
+if TYPE_CHECKING:
+ from vllm.config import ModelConfig
+ from vllm.sequence import SequenceGroupMetadata
+
logger = init_logger(__name__)
NestedTensors = Union[List["NestedTensors"], List[torch.Tensor], torch.Tensor]
@@ -151,6 +155,30 @@ class MultiModalDataBuiltins(TypedDict, total=False):
Read more on that :ref:`here <adding_multimodal_plugin>`.
"""
+
+class PlaceholderRange(TypedDict):
+ """
+ Placeholder location information for multi-modal data.
+
+ For example:
+ Prompt: AAAA BBBB What is in these images?
+ Images A and B will have:
+ A: { "offset": 0, "length": 4 }
+ B: { "offset": 5, "length": 4 }
+ """
+
+ offset: int
+ """The start index of the placeholder in the prompt."""
+
+ length: int
+ """The length of the placeholder."""
+
+
+MultiModalPlaceholderDict = Mapping[str, List[PlaceholderRange]]
+"""
+A dictionary containing placeholder ranges.
+"""
+
MultiModalInputMapper = Callable[[InputContext, MultiModalData[object]],
MultiModalInputs]
"""
@@ -243,7 +271,7 @@ def wrapper(model_cls: N) -> N:
return wrapper
- def map_input(self, model_config: ModelConfig,
+ def map_input(self, model_config: "ModelConfig",
data: MultiModalData[object],
mm_processor_kwargs: Dict[str, Any]) -> MultiModalInputs:
"""
@@ -332,7 +360,7 @@ def wrapper(model_cls: N) -> N:
return wrapper
- def get_max_multimodal_tokens(self, model_config: ModelConfig) -> int:
+ def get_max_multimodal_tokens(self, model_config: "ModelConfig") -> int:
"""
Get the maximum number of multi-modal tokens
for profiling the memory usage of a model.
@@ -366,3 +394,179 @@ def get_max_multimodal_tokens(self, model_config: ModelConfig) -> int:
self._validate_max_multimodal_tokens(max_mm_tokens)
return max_mm_tokens
+
+
+class MultiModalPlaceholderMap:
+ """
+ Relates multi-modal embeddings to their corresponding placeholders.
+ """
+
+ class IndexMap(NamedTuple):
+ src: List[int]
+ dest: List[int]
+
+ src_ranges: List[range]
+ """
+ The indices of the multi-modal embeddings that will replace the
+ corresponding placeholder embeddings pointed to by ``dest_ranges``.
+ """
+
+ src_len: int
+ """
+ The total number of flattened multi-modal embeddings.
+ """
+
+ dest_ranges: List[range]
+ """
+ The indices of the placeholder embeddings that will be replaced by the
+ multimodal embeddings.
+ """
+
+ dest_len: int
+ """
+ The total number of embeddings in the destination tensor.
+ """
+
+ def __init__(self):
+ self.src_ranges = []
+ self.src_len = 0
+ self.dest_ranges = []
+ self.dest_len = 0
+
+ @classmethod
+ def from_seq_group(
+ cls, seq_group: "SequenceGroupMetadata", positions: range
+ ) -> Tuple[Optional[MultiModalDataDict], Dict[str,
+ "MultiModalPlaceholderMap"]]:
+ """
+ Returns the multi-modal items that intersect with the portion of a
+ prompt (``seq_group``) represented by ``positions``, as well as a
+ ``MultiModalPlaceholderMap`` that relates the multi-modal embedding
+ vectors to their corresponding placeholders.
+
+ Consider the following scenarios:
+
+ Prompt: |AAAA BBBB What's in these images?|
+ Positions: |.................................|
+
+ images = [A, B]
+ src_ranges = [(0, 4), (4, 8)]
+ dest_ranges = [(0, 4), (5, 9)]
+
+ Prompt: |AAAA BBBB What's in these images?|
+ Positions: | ..... |
+
+ images = [A, B]
+ src_ranges = [(2, 4), (4, 6)]
+ dest_ranges = [(0, 2), (3, 5)]
+
+ Prompt: |AAAA BBBB What's in these images?|
+ Positions: | ......... |
+
+ images = [B]
+ src_ranges = [(0, 4)]
+ dest_ranges = [(0, 4)]
+
+ Prompt: |AAAA BBBB What's in these images?|
+ Positions: | .......................|
+
+ images = []
+ src_ranges = []
+ dest_ranges = []
+ """
+ if (not seq_group.multi_modal_data
+ or not seq_group.multi_modal_placeholders):
+ return seq_group.multi_modal_data, {}
+
+ mm_data = {**seq_group.multi_modal_data}
+ placeholder_maps: Dict[str, MultiModalPlaceholderMap] = defaultdict(
+ MultiModalPlaceholderMap)
+
+ for modality, placeholders in seq_group.multi_modal_placeholders.items(
+ ):
+ mm_items = mm_data.pop(modality)
+ if not isinstance(mm_items, list):
+ mm_items = [mm_items]
+
+ if positions:
+ intersecting_items = placeholder_maps[
+ modality].append_items_from_seq_group(
+ positions, mm_items, placeholders)
+
+ if intersecting_items:
+ mm_data[modality] = intersecting_items
+
+ return mm_data, placeholder_maps
+
+ def append_items_from_seq_group(
+ self, positions: range, multi_modal_items: List[_T],
+ multi_modal_placeholders: List[PlaceholderRange]) -> List[_T]:
+ """
+ Adds the multi-modal items that intersect ``positions`` to this
+ placeholder map and returns the intersecting items.
+ """
+ intersecting_items = []
+
+ if len(multi_modal_items) != len(multi_modal_placeholders):
+ raise ValueError(
+ "Multi-modal placeholders and items must have the same length."
+ )
+ for placeholder_dict, mm_item in zip(multi_modal_placeholders,
+ multi_modal_items):
+ placeholder = range(
+ placeholder_dict["offset"],
+ placeholder_dict["offset"] + placeholder_dict["length"])
+ intersection = range(max(positions.start, placeholder.start),
+ min(positions.stop, placeholder.stop))
+
+ if not intersection:
+ # Skip this multi-modal item.
+ continue
+
+ token_embedding_range = range(intersection.start - positions.start,
+ intersection.stop - positions.start)
+
+ multimodal_embedding_range = range(
+ intersection.start - placeholder.start + self.src_len,
+ intersection.stop - placeholder.start + self.src_len)
+
+ intersecting_items.append(mm_item)
+ self.dest_ranges.append(token_embedding_range)
+ self.src_ranges.append(multimodal_embedding_range)
+ self.src_len += len(placeholder)
+
+ self.dest_len += len(positions)
+ return intersecting_items
+
+ def extend(self, other: "MultiModalPlaceholderMap"):
+ """
+ Adds the placeholders from another ``MultiModalPlaceholderMap`` to this
+ instance based on the source and destination tensors being
+ concatenated.
+ """
+
+ self.src_ranges.extend(
+ range(self.src_len + r.start, self.src_len + r.stop)
+ for r in other.src_ranges)
+ self.src_len += other.src_len
+ self.dest_ranges.extend(
+ range(self.dest_len + r.start, self.dest_len + r.stop)
+ for r in other.dest_ranges)
+ self.dest_len += other.dest_len
+
+ def index_map(self) -> "IndexMap":
+ """
+ Finalizes the placeholder map into lists of indices that can be used to
+ index the source and destination tensors.
+ """
+
+ src_indices = [i for r in self.src_ranges for i in r]
+ dest_indices = [i for r in self.dest_ranges for i in r]
+
+ if len(src_indices) != len(dest_indices):
+ raise ValueError(
+ f"The number of source ({len(src_indices)}) and destination "
+ f"indices ({len(dest_indices)}) must be the same.")
+
+ return MultiModalPlaceholderMap.IndexMap(src=src_indices,
+ dest=dest_indices)
diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py
index 5f74bcea65ce2..3f6bb6c8338d2 100644
--- a/vllm/multimodal/image.py
+++ b/vllm/multimodal/image.py
@@ -1,11 +1,10 @@
from functools import lru_cache
-from typing import Any, Dict, Optional
+from typing import TYPE_CHECKING, Any, Dict, Optional
import torch
from PIL import Image
from transformers.image_processing_base import BatchFeature
-from vllm.config import ModelConfig
from vllm.inputs.registry import InputContext
from vllm.logger import init_logger
from vllm.transformers_utils.processor import get_image_processor
@@ -13,6 +12,9 @@
from .base import MultiModalData, MultiModalInputs, MultiModalPlugin
+if TYPE_CHECKING:
+ from vllm.config import ModelConfig
+
logger = init_logger(__name__)
cached_get_image_processor = lru_cache(get_image_processor)
@@ -26,7 +28,7 @@ def get_data_key(self) -> str:
def _get_hf_image_processor(
self,
- model_config: ModelConfig,
+ model_config: "ModelConfig",
mm_processor_kwargs: Optional[Dict[str, Any]] = None,
):
if mm_processor_kwargs is None:
diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index 5e9b8bd518de3..bce2f4c6abe5b 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -1,8 +1,7 @@
import functools
from collections import UserDict
-from typing import Any, Dict, Mapping, Optional, Sequence
+from typing import TYPE_CHECKING, Any, Dict, Mapping, Optional, Sequence
-from vllm.config import ModelConfig
from vllm.logger import init_logger
from .audio import AudioPlugin
@@ -11,6 +10,9 @@
from .image import ImagePlugin
from .video import VideoPlugin
+if TYPE_CHECKING:
+ from vllm.config import ModelConfig
+
logger = init_logger(__name__)
@@ -20,7 +22,7 @@ class _MultiModalLimits(UserDict):
when attempting to access a model that does not exist.
"""
- def __getitem__(self, key: ModelConfig) -> Dict[str, int]:
+ def __getitem__(self, key: "ModelConfig") -> Dict[str, int]:
try:
return super().__getitem__(key)
except KeyError as exc:
@@ -98,7 +100,7 @@ def register_image_input_mapper(
def map_input(
self,
- model_config: ModelConfig,
+ model_config: "ModelConfig",
data: MultiModalDataDict,
mm_processor_kwargs: Optional[Dict[str, Any]] = None,
) -> MultiModalInputs:
@@ -139,7 +141,7 @@ def map_input(
return MultiModalInputs(merged_dict)
- def create_input_mapper(self, model_config: ModelConfig):
+ def create_input_mapper(self, model_config: "ModelConfig"):
"""
Create an input mapper (see :meth:`map_input`) for a specific model.
"""
@@ -177,7 +179,7 @@ def register_max_image_tokens(
"""
return self.register_max_multimodal_tokens("image", max_mm_tokens)
- def get_max_multimodal_tokens(self, model_config: ModelConfig) -> int:
+ def get_max_multimodal_tokens(self, model_config: "ModelConfig") -> int:
"""
Get the maximum number of multi-modal tokens
for profiling the memory usage of a model.
@@ -195,7 +197,7 @@ def get_max_multimodal_tokens(self, model_config: ModelConfig) -> int:
def init_mm_limits_per_prompt(
self,
- model_config: ModelConfig,
+ model_config: "ModelConfig",
) -> None:
"""
Initialize the maximum number of multi-modal input instances for each
@@ -231,7 +233,7 @@ def init_mm_limits_per_prompt(
def get_mm_limits_per_prompt(
self,
- model_config: ModelConfig,
+ model_config: "ModelConfig",
) -> Mapping[str, int]:
"""
Get the maximum number of multi-modal input instances for each modality
diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py
index 3c801464383ad..c5ff552e06099 100644
--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -10,7 +10,7 @@
from vllm.connections import global_http_connection
from vllm.envs import VLLM_AUDIO_FETCH_TIMEOUT, VLLM_IMAGE_FETCH_TIMEOUT
from vllm.logger import init_logger
-from vllm.multimodal.base import MultiModalDataDict
+from vllm.multimodal.base import MultiModalDataDict, PlaceholderRange
from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer
logger = init_logger(__name__)
@@ -258,7 +258,7 @@ def repeat_and_pad_placeholder_tokens(
repeat_count: Union[int, List[int]],
pad_token_left: Optional[int] = None,
pad_token_right: Optional[int] = None,
-) -> Tuple[Optional[str], List[int]]:
+) -> Tuple[Optional[str], List[int], List[PlaceholderRange]]:
if isinstance(repeat_count, int):
repeat_count = [repeat_count]
@@ -301,6 +301,7 @@ def repeat_and_pad_placeholder_tokens(
new_prompt += prompt_parts[-1]
new_token_ids: List[int] = []
+ placeholder_ranges: List[PlaceholderRange] = []
placeholder_token_idx = 0
for i, token in enumerate(prompt_token_ids):
if token == placeholder_token_id:
@@ -310,6 +311,10 @@ def repeat_and_pad_placeholder_tokens(
pad_token_left=pad_token_left,
pad_token_right=pad_token_right,
)
+ placeholder_ranges.append({
+ "offset": len(new_token_ids),
+ "length": len(replacement_ids)
+ })
new_token_ids.extend(replacement_ids)
placeholder_token_idx += 1
@@ -320,4 +325,14 @@ def repeat_and_pad_placeholder_tokens(
else:
new_token_ids.append(token)
- return new_prompt, new_token_ids
+ return new_prompt, new_token_ids, placeholder_ranges
+
+
+def consecutive_placeholder_ranges(num_items: int,
+ item_size: int) -> List[PlaceholderRange]:
+ """Returns a list of consecutive PlaceholderRanges of a fixed size"""
+
+ return [
+ PlaceholderRange(offset=i * item_size, length=item_size)
+ for i in range(num_items)
+ ]
diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py
index c3235c4acb6fd..6c2c6720f4276 100644
--- a/vllm/multimodal/video.py
+++ b/vllm/multimodal/video.py
@@ -1,18 +1,19 @@
from functools import lru_cache
-from typing import Any, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
import numpy as np
-from vllm.config import ModelConfig
from vllm.inputs.registry import InputContext
from vllm.logger import init_logger
from vllm.transformers_utils.processor import get_video_processor
from vllm.transformers_utils.tokenizer import get_tokenizer
-from vllm.utils import is_list_of
from .base import MultiModalData, MultiModalInputs
from .image import ImagePlugin
+if TYPE_CHECKING:
+ from vllm.config import ModelConfig
+
logger = init_logger(__name__)
cached_get_video_processor = lru_cache(get_video_processor)
@@ -38,7 +39,7 @@ def get_data_key(self) -> str:
def _get_hf_video_processor(
self,
- model_config: ModelConfig,
+ model_config: "ModelConfig",
mm_processor_kwargs: Optional[Dict[str, Any]] = None,
):
if mm_processor_kwargs is None:
@@ -56,7 +57,10 @@ def _default_input_mapper(
) -> MultiModalInputs:
model_config = ctx.model_config
- if isinstance(data, np.ndarray) or is_list_of(data, np.ndarray):
+ if isinstance(data, list) and len(data) == 1:
+ data = data[0]
+
+ if isinstance(data, np.ndarray):
video_processor = self._get_hf_video_processor(
model_config,
mm_processor_kwargs,
diff --git a/vllm/sequence.py b/vllm/sequence.py
index ff59f333f00b4..ee547dde45394 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -15,13 +15,13 @@
from vllm.inputs.parse import is_encoder_decoder_inputs
from vllm.lora.request import LoRARequest
+from vllm.multimodal import MultiModalDataDict, MultiModalPlaceholderDict
from vllm.pooling_params import PoolingParams
from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.sampling_params import RequestOutputKind, SamplingParams
if TYPE_CHECKING:
from vllm.inputs import SingletonInputs
- from vllm.multimodal.base import MultiModalDataDict
VLLM_TOKEN_ID_ARRAY_TYPE = "l"
@@ -485,7 +485,7 @@ def prompt_token_ids(self) -> List[int]:
return cast(List[int], self.inputs.get(prompt_token_ids_key))
@property
- def multi_modal_data(self) -> "MultiModalDataDict":
+ def multi_modal_data(self) -> MultiModalDataDict:
inputs = self.inputs
if (inputs.get("multi_modal_data")
@@ -495,11 +495,15 @@ def multi_modal_data(self) -> "MultiModalDataDict":
)
return cast(
- "MultiModalDataDict",
+ MultiModalDataDict,
(inputs.get("multi_modal_data")
or inputs.get("encoder_multi_modal_data") or {}),
)
+ @property
+ def multi_modal_placeholders(self) -> MultiModalPlaceholderDict:
+ return self.inputs.get("multi_modal_placeholders") or {}
+
@property
def mm_processor_kwargs(self) -> Dict[str, Any]:
return self.inputs.get("mm_processor_kwargs") or {}
@@ -728,9 +732,13 @@ def encoder_prompt_token_ids(self) -> Optional[List[int]]:
if self.encoder_seq is not None else None)
@property
- def multi_modal_data(self) -> "MultiModalDataDict":
+ def multi_modal_data(self) -> MultiModalDataDict:
return self.first_seq.multi_modal_data
+ @property
+ def multi_modal_placeholders(self) -> MultiModalPlaceholderDict:
+ return self.first_seq.multi_modal_placeholders
+
@property
def mm_processor_kwargs(self) -> Dict[str, Any]:
return self.first_seq.mm_processor_kwargs
@@ -946,6 +954,7 @@ class SequenceGroupMetadata(
# "MultiModalDataDict" types. We have to use Any due to msgspec
# doesn't allow to have union of 2 different dicts.
multi_modal_data: Optional[Any] = None
+ multi_modal_placeholders: Optional[MultiModalPlaceholderDict] = None
mm_processor_kwargs: Optional[Dict[str, Any]] = None
encoder_seq_data: Optional[SequenceData] = None
cross_block_table: Optional[List[int]] = None
diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py
index 5032896600b3b..0c6fcdf03ba9e 100644
--- a/vllm/worker/cpu_model_runner.py
+++ b/vllm/worker/cpu_model_runner.py
@@ -1,5 +1,6 @@
import dataclasses
import weakref
+from collections import defaultdict
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, Union
@@ -16,7 +17,7 @@
from vllm.model_executor.layers.sampler import SamplerOutput
from vllm.model_executor.model_loader import get_model
from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs,
- MultiModalInputs)
+ MultiModalInputs, MultiModalPlaceholderMap)
from vllm.sequence import (IntermediateTensors, SequenceData,
SequenceGroupMetadata)
from vllm.transformers_utils.config import uses_mrope
@@ -148,9 +149,18 @@ def build(self) -> ModelInputForCPU:
query_lens=seq_lens,
)
- def _compute_multi_modal_input(self, seq_data: SequenceData, mm_data,
- computed_len: int,
+ def _compute_multi_modal_input(self, seq_group: SequenceGroupMetadata,
+ seq_data: SequenceData, computed_len: int,
mm_processor_kwargs: Dict[str, Any]):
+
+ # NOTE: mm_data only includes the subset of multi-modal items that
+ # intersect with the current prefill positions.
+ mm_data, placeholder_maps = MultiModalPlaceholderMap.from_seq_group(
+ seq_group, range(computed_len, len(seq_data.get_token_ids())))
+
+ if not mm_data:
+ return
+
mm_kwargs = self.multi_modal_input_mapper(mm_data, mm_processor_kwargs)
# special processing for mrope position deltas.
@@ -179,7 +189,7 @@ def _compute_multi_modal_input(self, seq_data: SequenceData, mm_data,
context_len=computed_len,
)
seq_data.mrope_position_delta = mrope_position_delta
- return mm_kwargs, mrope_positions
+ return mm_kwargs, placeholder_maps, mrope_positions
def _prepare_prompt(
self,
@@ -194,6 +204,9 @@ def _prepare_prompt(
slot_mapping: List[int] = []
seq_lens: List[int] = []
multi_modal_inputs_list: List[MultiModalInputs] = []
+ multi_modal_placeholder_maps: Dict[
+ str,
+ MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap)
for seq_group_metadata in seq_group_metadata_list:
assert seq_group_metadata.is_prompt
@@ -210,11 +223,15 @@ def _prepare_prompt(
input_tokens.extend(prompt_tokens) # Token ids
mrope_positions = None
- if (mm_data := seq_group_metadata.multi_modal_data):
- mm_kwargs, mrope_positions = self._compute_multi_modal_input(
- seq_data, mm_data, computed_len,
+ if seq_group_metadata.multi_modal_data:
+ mm_kwargs, placeholder_maps, mrope_positions = self \
+ ._compute_multi_modal_input(
+ seq_group_metadata, seq_data, computed_len,
seq_group_metadata.mm_processor_kwargs)
multi_modal_inputs_list.append(mm_kwargs)
+ for modality, placeholder_map in placeholder_maps.items():
+ multi_modal_placeholder_maps[modality].extend(
+ placeholder_map)
# Token position ids
# NOTE(woosuk): Here we assume that the first token in the prompt
@@ -264,6 +281,11 @@ def _prepare_prompt(
slot_mapping = torch.tensor(slot_mapping,
dtype=torch.long,
device=self.device) # type: ignore
+ placeholder_index_maps = {
+ modality: placeholder_map.index_map()
+ for modality, placeholder_map in
+ multi_modal_placeholder_maps.items()
+ }
attn_metadata = self.attn_backend.make_metadata(
is_prompt=True,
@@ -275,6 +297,7 @@ def _prepare_prompt(
num_decode_tokens=0,
block_tables=torch.tensor([]),
slot_mapping=slot_mapping,
+ multi_modal_placeholder_index_maps=placeholder_index_maps,
)
multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list)
@@ -366,6 +389,7 @@ def _prepare_decode(
attn_metadata = self.attn_backend.make_metadata(
is_prompt=False,
slot_mapping=slot_mapping,
+ multi_modal_placeholder_index_maps=None,
seq_lens=seq_lens,
seq_lens_tensor=seq_lens_tensor,
max_decode_seq_len=max_decode_seq_len,
diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py
index 6a00444f5098b..a4b665d71f28a 100644
--- a/vllm/worker/enc_dec_model_runner.py
+++ b/vllm/worker/enc_dec_model_runner.py
@@ -306,13 +306,12 @@ def profile_run(self) -> None:
(group_id < max_num_batched_tokens % max_num_seqs))
batch_size += seq_len
- decoder_seq_data, decoder_dummy_multi_modal_data \
- = self.input_registry.dummy_data_for_profiling(
- self.model_config,
+ decoder_dummy_data = self.input_registry \
+ .dummy_data_for_profiling(self.model_config,
seq_len,
self.mm_registry,
is_encoder_data=False)
- encoder_seq_data, encoder_dummy_multi_modal_data \
+ encoder_dummy_data \
= self.input_registry.dummy_data_for_profiling(
self.model_config,
seq_len,
@@ -320,26 +319,31 @@ def profile_run(self) -> None:
is_encoder_data=True)
# Having more tokens is over-conservative but otherwise fine
- assert len(decoder_seq_data.prompt_token_ids) >= seq_len, (
+ assert len(
+ decoder_dummy_data.seq_data.prompt_token_ids
+ ) >= seq_len, (
f"Expected at least {seq_len} dummy tokens for profiling, "
- f"but got: {len(decoder_seq_data.prompt_token_ids)}")
+ f"but got: {len(decoder_dummy_data.seq_data.prompt_token_ids)}"
+ )
- assert decoder_dummy_multi_modal_data is None or \
- encoder_dummy_multi_modal_data is None, (
+ assert decoder_dummy_data.multi_modal_data is None or \
+ encoder_dummy_data.multi_modal_data is None, (
"Multi-modal data can't be provided in both encoder and decoder"
)
seq = SequenceGroupMetadata(
request_id=str(group_id),
is_prompt=True,
- seq_data={group_id: decoder_seq_data},
+ seq_data={group_id: decoder_dummy_data.seq_data},
sampling_params=sampling_params,
block_tables=None,
- encoder_seq_data=encoder_seq_data,
+ encoder_seq_data=encoder_dummy_data.seq_data,
cross_block_table=None,
- multi_modal_data=decoder_dummy_multi_modal_data
- or encoder_dummy_multi_modal_data,
- )
+ multi_modal_data=decoder_dummy_data.multi_modal_data
+ or encoder_dummy_data.multi_modal_data,
+ multi_modal_placeholders=decoder_dummy_data.
+ multi_modal_placeholders
+ or encoder_dummy_data.multi_modal_placeholders)
seqs.append(seq)
# Run the model with the dummy inputs.
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 891637dafbb14..f2123c64c3274 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -40,7 +40,8 @@
from vllm.model_executor.models import supports_lora, supports_multimodal
from vllm.model_executor.models.utils import set_cpu_offload_max_bytes
from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs,
- MultiModalInputs, MultiModalRegistry)
+ MultiModalInputs, MultiModalPlaceholderMap,
+ MultiModalRegistry)
from vllm.platforms import current_platform
from vllm.prompt_adapter.layers import PromptAdapterMapping
from vllm.prompt_adapter.request import PromptAdapterRequest
@@ -242,6 +243,8 @@ def __init__(
# Multi-modal inputs.
multi_modal_inputs: Optional[MultiModalInputs] = None,
+ multi_modal_placeholder_maps: Optional[Dict[
+ str, MultiModalPlaceholderMap]] = None,
# Whether the prefix cache is hit (prefill only).
prefix_cache_hit: bool = False,
@@ -361,6 +364,7 @@ def __init__(
self.prompt_adapter_request = prompt_adapter_request
self.multi_modal_inputs = multi_modal_inputs
+ self.multi_modal_placeholder_maps = multi_modal_placeholder_maps
self.prefix_cache_hit = prefix_cache_hit
self.n_seqs = len(self.seq_ids)
@@ -635,7 +639,12 @@ def _compute_prompt_adapter_input(
def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup,
seq_group_metadata: SequenceGroupMetadata):
"""If multi-modal data is given, add it to the input."""
- mm_data = seq_group_metadata.multi_modal_data
+ # NOTE: mm_data only includes the subset of multi-modal items that
+ # intersect with the current prefill positions.
+ positions = inter_data.input_positions[0]
+ mm_data, placeholder_maps = MultiModalPlaceholderMap.from_seq_group(
+ seq_group_metadata,
+ range(positions[0], positions[0] + len(positions)))
if not mm_data:
return
@@ -643,6 +652,7 @@ def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup,
mm_data,
mm_processor_kwargs=seq_group_metadata.mm_processor_kwargs)
inter_data.multi_modal_inputs = mm_kwargs
+ inter_data.multi_modal_placeholder_maps = placeholder_maps
# special processing for mrope position deltas.
if self.runner.model_is_mrope:
@@ -1255,7 +1265,7 @@ def profile_run(self) -> None:
(group_id < max_num_batched_tokens % max_num_seqs))
batch_size += seq_len
- seq_data, dummy_multi_modal_data = self.input_registry \
+ dummy_data = self.input_registry \
.dummy_data_for_profiling(self.model_config,
seq_len,
self.mm_registry)
@@ -1263,12 +1273,13 @@ def profile_run(self) -> None:
seq = SequenceGroupMetadata(
request_id=str(group_id),
is_prompt=True,
- seq_data={group_id: seq_data},
+ seq_data={group_id: dummy_data.seq_data},
sampling_params=sampling_params,
block_tables=None,
lora_request=dummy_lora_requests_per_seq[group_id]
if dummy_lora_requests_per_seq else None,
- multi_modal_data=dummy_multi_modal_data,
+ multi_modal_data=dummy_data.multi_modal_data,
+ multi_modal_placeholders=dummy_data.multi_modal_placeholders,
)
seqs.append(seq)
diff --git a/vllm/worker/model_runner_base.py b/vllm/worker/model_runner_base.py
index 86883cf152449..89d7addb5a8d9 100644
--- a/vllm/worker/model_runner_base.py
+++ b/vllm/worker/model_runner_base.py
@@ -46,9 +46,8 @@ def _init_attn_metadata_from_tensor_dict(
# Extract the fields used to create AttentionMetadata.
valid_attn_kwargs = {}
for field in dataclasses.fields(attn_backend.get_metadata_cls()):
- val = tensor_dict.pop(field.name, None)
- if val is not None:
- valid_attn_kwargs[field.name] = val
+ if field.name in tensor_dict:
+ valid_attn_kwargs[field.name] = tensor_dict.pop(field.name)
attn_metadata = attn_backend.make_metadata(**valid_attn_kwargs)
tensor_dict["attn_metadata"] = attn_metadata
diff --git a/vllm/worker/openvino_model_runner.py b/vllm/worker/openvino_model_runner.py
index a164fbe3393c4..3da738636a59d 100644
--- a/vllm/worker/openvino_model_runner.py
+++ b/vllm/worker/openvino_model_runner.py
@@ -1,4 +1,5 @@
-from typing import List, NamedTuple, Optional, Tuple
+from collections import defaultdict
+from typing import Dict, List, NamedTuple, Optional, Tuple
import openvino as ov
import torch
@@ -14,7 +15,7 @@
from vllm.model_executor.layers.sampler import SamplerOutput
from vllm.model_executor.model_loader.openvino import get_model
from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs,
- MultiModalInputs)
+ MultiModalInputs, MultiModalPlaceholderMap)
from vllm.sequence import SequenceGroupMetadata
logger = init_logger(__name__)
@@ -115,6 +116,9 @@ def _prepare_model_input(
past_lens: List[int] = []
query_lens: List[int] = []
multi_modal_inputs_list: List[MultiModalInputs] = []
+ multi_modal_placeholder_maps: Dict[
+ str,
+ MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap)
subsequence_begins: List[int] = []
block_indices: List[int] = []
@@ -168,15 +172,6 @@ def _prepare_model_input(
and self.sliding_window is None
and is_prompt)
- mm_data = seq_group_metadata.multi_modal_data
- if mm_data:
- mm_kwargs = self.multi_modal_input_mapper(
- mm_data,
- mm_processor_kwargs=seq_group_metadata.
- mm_processor_kwargs,
- )
- multi_modal_inputs_list.append(mm_kwargs)
-
block_table = seq_group_metadata.block_tables[seq_id]
# TODO(sang): Combine chunked prefill and prefix caching by
# only allowing multiple of block_size chunk size.
@@ -220,7 +215,8 @@ def _prepare_model_input(
query_lens.append(query_len)
input_tokens.extend(tokens)
- input_positions.extend(list(range(computed_len, seq_len)))
+ positions_range = range(computed_len, seq_len)
+ input_positions.extend(list(positions_range))
past_lens.append(computed_len)
subsequence_begins.append(subsequence_begins[-1] + query_len)
@@ -233,6 +229,22 @@ def _prepare_model_input(
), "seq_len: {}, computed_len: {}, query_len: {}".format(
seq_len, computed_len, query_len)
+ if seq_group_metadata.multi_modal_data:
+ # NOTE: mm_data only includes the subset of multi-modal
+ # items that intersect with the current prefill positions.
+ mm_data, placeholder_maps = MultiModalPlaceholderMap \
+ .from_seq_group(seq_group_metadata, positions_range)
+
+ mm_kwargs = self.multi_modal_input_mapper(
+ mm_data,
+ mm_processor_kwargs=seq_group_metadata.
+ mm_processor_kwargs)
+ multi_modal_inputs_list.append(mm_kwargs)
+
+ for modality, placeholder_map in placeholder_maps.items():
+ multi_modal_placeholder_maps[modality].extend(
+ placeholder_map, )
+
max_query_len = max(query_lens)
assert max_query_len > 0, "query_lens: {}".format(query_lens)
@@ -261,12 +273,19 @@ def _prepare_model_input(
max_context_len, dtype=torch.int32,
device=self.device) # type: ignore
+ placeholder_index_maps = {
+ modality: placeholder_map.index_map()
+ for modality, placeholder_map in
+ multi_modal_placeholder_maps.items()
+ }
+
attn_metadata = self.attn_backend.make_openvino_metadata(
past_lens=past_lens_tensor,
subsequence_begins=subsequence_begins_tensor,
block_indices=block_indices_tensor,
block_indices_begins=block_indices_begins_tensor,
max_context_len=max_context_len_tensor,
+ multi_modal_placeholder_index_maps=placeholder_index_maps,
)
multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list)
diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py
index 87ced7818a676..3792cbc0f730f 100644
--- a/vllm/worker/tpu_model_runner.py
+++ b/vllm/worker/tpu_model_runner.py
@@ -184,6 +184,7 @@ def _dummy_run(
num_prefill_tokens=batch_size * seq_len,
num_decode_tokens=0,
slot_mapping=slot_mapping,
+ multi_modal_placeholder_index_maps=None,
block_tables=None,
context_lens=None,
)
@@ -216,6 +217,7 @@ def _dummy_run(
num_prefill_tokens=0,
num_decode_tokens=batch_size * seq_len,
slot_mapping=slot_mapping,
+ multi_modal_placeholder_index_maps=None,
block_tables=block_tables,
context_lens=context_lens,
)
@@ -360,6 +362,7 @@ def _prepare_prompt(
num_prefill_tokens=0, # NOTE: This is not used.
num_decode_tokens=0,
slot_mapping=slot_mapping,
+ multi_modal_placeholder_index_maps=None,
block_tables=None,
context_lens=None,
)
@@ -429,6 +432,7 @@ def _prepare_decode(
num_prefill_tokens=0,
num_decode_tokens=batch_size,
slot_mapping=slot_mapping,
+ multi_modal_placeholder_index_maps=None,
block_tables=block_tables,
context_lens=context_lens,
)
diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py
index 75a6de3b24ba4..739fe1b3d2c4f 100644
--- a/vllm/worker/xpu_model_runner.py
+++ b/vllm/worker/xpu_model_runner.py
@@ -1,6 +1,7 @@
import dataclasses
import time
import weakref
+from collections import defaultdict
from dataclasses import dataclass
from typing import (TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple,
Type, TypeVar)
@@ -19,7 +20,8 @@
from vllm.model_executor.layers.sampler import SamplerOutput
from vllm.model_executor.model_loader import get_model
from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs,
- MultiModalInputs, MultiModalRegistry)
+ MultiModalInputs, MultiModalPlaceholderMap,
+ MultiModalRegistry)
from vllm.sampling_params import SamplingParams
from vllm.sequence import IntermediateTensors, SequenceGroupMetadata
from vllm.utils import DeviceMemoryProfiler, make_tensor_with_pad
@@ -161,6 +163,9 @@ def _prepare_prompt(
slot_mapping: List[int] = []
seq_lens: List[int] = []
multi_modal_inputs_list: List[MultiModalInputs] = []
+ multi_modal_placeholder_maps: Dict[
+ str,
+ MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap)
for seq_group_metadata in seq_group_metadata_list:
assert seq_group_metadata.is_prompt
@@ -179,7 +184,21 @@ def _prepare_prompt(
# Token position ids
# NOTE(woosuk): Here we assume that the first token in the prompt
# is always the first token in the sequence.
- input_positions.extend(list(range(computed_len, seq_len)))
+ positions_range = range(computed_len, seq_len)
+ input_positions.extend(list(positions_range))
+
+ if seq_group_metadata.multi_modal_data:
+ # NOTE: mm_data only includes the subset of multi-modal items
+ # that intersect with the current prefill positions.
+ mm_data, placeholder_maps = MultiModalPlaceholderMap \
+ .from_seq_group(seq_group_metadata, positions_range)
+
+ mm_kwargs = self.runner.multi_modal_input_mapper(mm_data)
+ multi_modal_inputs_list.append(mm_kwargs)
+
+ for modality, placeholder_map in placeholder_maps.items():
+ multi_modal_placeholder_maps[modality].extend(
+ placeholder_map)
if seq_group_metadata.block_tables is None:
# During memory profiling, the block tables are not initialized
@@ -220,6 +239,11 @@ def _prepare_prompt(
slot_mapping = torch.tensor(slot_mapping,
dtype=torch.long,
device=self.device) # type: ignore
+ placeholder_index_maps = {
+ modality: placeholder_map.index_map()
+ for modality, placeholder_map in
+ multi_modal_placeholder_maps.items()
+ }
max_seqlen = max(seq_lens)
tmp = [0]
@@ -230,6 +254,7 @@ def _prepare_prompt(
attn_metadata = self.attn_backend.make_metadata(
is_prompt=True,
slot_mapping=slot_mapping,
+ multi_modal_placeholder_index_maps=placeholder_index_maps,
seq_lens=seq_lens,
seqlen_q=seqlen_q,
max_seqlen=max_seqlen,
@@ -313,6 +338,7 @@ def _prepare_decode(
attn_metadata = self.attn_backend.make_metadata(
is_prompt=False,
slot_mapping=slot_mapping,
+ multi_modal_placeholder_index_maps=None,
seq_lens=seq_lens,
seqlen_q=torch.tensor([]),
max_seqlen=0,
@@ -450,7 +476,7 @@ def profile_run(self) -> None:
(group_id < max_num_batched_tokens % max_num_seqs))
batch_size += seq_len
- seq_data, dummy_multi_modal_data = self.input_registry \
+ dummy_data = self.input_registry \
.dummy_data_for_profiling(self.model_config,
seq_len,
self.mm_registry)
@@ -458,12 +484,12 @@ def profile_run(self) -> None:
seq = SequenceGroupMetadata(
request_id=str(group_id),
is_prompt=True,
- seq_data={group_id: seq_data},
+ seq_data={group_id: dummy_data.seq_data},
sampling_params=sampling_params,
block_tables=None,
lora_request=None,
- multi_modal_data=dummy_multi_modal_data,
- )
+ multi_modal_data=dummy_data.multi_modal_data,
+ multi_modal_placeholders=dummy_data.multi_modal_placeholders)
seqs.append(seq)
# Run the model with the dummy inputs.
From d522034c85e8f994bbd193514393056232edd247 Mon Sep 17 00:00:00 2001
From: "Kevin H. Luu"
Date: Fri, 1 Nov 2024 13:56:13 -1000
Subject: [PATCH 088/113] [ci/build] Have dependabot ignore pinned dependencies
(#9935)
Signed-off-by: kevin
---
.github/dependabot.yml | 9 +++++++++
1 file changed, 9 insertions(+)
diff --git a/.github/dependabot.yml b/.github/dependabot.yml
index a21acd9671eeb..4f54eea564ecb 100644
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -14,6 +14,15 @@ updates:
reviewers: ["khluu", "simon-mo"]
allow:
- dependency-type: "all"
+ ignore:
+ - dependency-name: "torch"
+ - dependency-name: "torchvision"
+ - dependency-name: "xformers"
+ - dependency-name: "lm-format-enforcer"
+ - dependency-name: "gguf"
+ - dependency-name: "compressed-tensors"
+ - dependency-name: "ray[adag]"
+ - dependency-name: "lm-eval"
groups:
patch-update:
applies-to: version-updates
From a78dd3303efac284afc6785eddba5f175285863b Mon Sep 17 00:00:00 2001
From: sroy745 <142070531+sroy745@users.noreply.github.com>
Date: Fri, 1 Nov 2024 23:22:49 -0700
Subject: [PATCH 089/113] [Encoder Decoder] Add flash_attn kernel support for
encoder-decoder models (#9559)
---
tests/encoder_decoder/test_e2e_correctness.py | 88 +++--
tests/kernels/test_encoder_decoder_attn.py | 156 ++++++--
tests/kernels/utils.py | 90 ++++-
.../vision_language/test_florence2.py | 2 +-
vllm/attention/backends/flash_attn.py | 364 +++++++++++++-----
vllm/attention/backends/utils.py | 159 +++++++-
vllm/attention/backends/xformers.py | 131 ++-----
vllm/attention/selector.py | 2 +-
vllm/model_executor/models/bart.py | 2 -
vllm/utils.py | 4 +-
vllm/worker/enc_dec_model_runner.py | 35 +-
11 files changed, 716 insertions(+), 317 deletions(-)
diff --git a/tests/encoder_decoder/test_e2e_correctness.py b/tests/encoder_decoder/test_e2e_correctness.py
index bef0c515b9073..f2d7e9fd78cf3 100644
--- a/tests/encoder_decoder/test_e2e_correctness.py
+++ b/tests/encoder_decoder/test_e2e_correctness.py
@@ -7,12 +7,18 @@
import pytest
from transformers import AutoModelForSeq2SeqLM
+from vllm.attention.selector import (_Backend,
+ global_force_attn_backend_context_manager)
from vllm.platforms import current_platform
from vllm.sequence import SampleLogprobs
from ..conftest import DecoderPromptType
from ..models.utils import check_logprobs_close
+LIST_ENC_DEC_SUPPORTED_BACKENDS = [
+ _Backend.XFORMERS, _Backend.FLASH_ATTN, None
+]
+
def vllm_to_hf_output(
vllm_output: Tuple[List[int], str, Optional[SampleLogprobs]],
@@ -29,7 +35,8 @@ def vllm_to_hf_output(
@pytest.mark.parametrize("model", ["facebook/bart-large-cnn"])
-@pytest.mark.parametrize("dtype", ["bfloat16"])
+@pytest.mark.parametrize("dtype", ["float"])
+@pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS)
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType))
@@ -48,6 +55,7 @@ def test_encoder_decoder_e2e(
num_logprobs: int,
decoder_prompt_type: DecoderPromptType,
enforce_eager: bool,
+ attn_backend: _Backend,
) -> None:
'''
End-to-End (E2E) test for the encoder-decoder framework.
@@ -56,43 +64,49 @@ def test_encoder_decoder_e2e(
implementations to ensure that both implementations produce consistent
and correct results.
'''
- test_case_prompts = example_encoder_decoder_prompts[decoder_prompt_type]
+ with global_force_attn_backend_context_manager(attn_backend):
+ if attn_backend == _Backend.FLASH_ATTN:
+ # Flash Attention does not support float32; use bfloat16 instead
+ dtype = 'bfloat16'
+ test_case_prompts = example_encoder_decoder_prompts[
+ decoder_prompt_type]
- # Configuration settings for HF baseline
- hf_kwargs = {
- "top_k": None,
- "num_beams": 1,
- "repetition_penalty": 1.0,
- "top_p": 1.0,
- "length_penalty": 1.0,
- "early_stopping": False,
- "no_repeat_ngram_size": None,
- "min_length": 0
- }
+ # Configuration settings for HF baseline
+ hf_kwargs = {
+ "top_k": None,
+ "num_beams": 1,
+ "repetition_penalty": 1.0,
+ "top_p": 1.0,
+ "length_penalty": 1.0,
+ "early_stopping": False,
+ "no_repeat_ngram_size": None,
+ "min_length": 0
+ }
- with hf_runner(model, dtype=dtype,
- auto_cls=AutoModelForSeq2SeqLM) as hf_model:
- hf_outputs = (hf_model.generate_encoder_decoder_greedy_logprobs_limit(
- test_case_prompts,
- max_tokens,
- num_logprobs,
- **hf_kwargs,
- ))
- with vllm_runner(model, dtype=dtype,
- enforce_eager=enforce_eager) as vllm_model:
- vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs(
- test_case_prompts, max_tokens, num_logprobs)
+ with hf_runner(model, dtype=dtype,
+ auto_cls=AutoModelForSeq2SeqLM) as hf_model:
+ hf_outputs = (
+ hf_model.generate_encoder_decoder_greedy_logprobs_limit(
+ test_case_prompts,
+ max_tokens,
+ num_logprobs,
+ **hf_kwargs,
+ ))
+ with vllm_runner(model, dtype=dtype,
+ enforce_eager=enforce_eager) as vllm_model:
+ vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs(
+ test_case_prompts, max_tokens, num_logprobs)
- hf_skip_tokens = (1
- if decoder_prompt_type == DecoderPromptType.NONE else 0)
+ hf_skip_tokens = (1 if decoder_prompt_type == DecoderPromptType.NONE
+ else 0)
- check_logprobs_close(
- outputs_0_lst=hf_outputs,
- outputs_1_lst=[
- vllm_to_hf_output(vllm_output, decoder_prompt_type)
- for vllm_output in vllm_outputs
- ],
- name_0="hf",
- name_1="vllm",
- num_outputs_0_skip_tokens=hf_skip_tokens,
- )
+ check_logprobs_close(
+ outputs_0_lst=hf_outputs,
+ outputs_1_lst=[
+ vllm_to_hf_output(vllm_output, decoder_prompt_type)
+ for vllm_output in vllm_outputs
+ ],
+ name_0="hf",
+ name_1="vllm",
+ num_outputs_0_skip_tokens=hf_skip_tokens,
+ )
diff --git a/tests/kernels/test_encoder_decoder_attn.py b/tests/kernels/test_encoder_decoder_attn.py
index bc99c5559d388..a1dd5eeeaa398 100644
--- a/tests/kernels/test_encoder_decoder_attn.py
+++ b/tests/kernels/test_encoder_decoder_attn.py
@@ -16,13 +16,13 @@
from vllm.attention import (Attention, AttentionBackend, AttentionMetadata,
AttentionType)
from vllm.attention.backends.utils import STR_NOT_IMPL_ENC_DEC_ROCM_HIP
-from vllm.attention.selector import (_Backend,
+from vllm.attention.selector import (_Backend, get_attn_backend,
global_force_attn_backend_context_manager)
+from vllm.forward_context import set_forward_context
from vllm.platforms import current_platform
# List of support backends for encoder/decoder models
-LIST_ENC_DEC_SUPPORTED_BACKENDS = [_Backend.XFORMERS]
-
+LIST_ENC_DEC_SUPPORTED_BACKENDS = [_Backend.XFORMERS, _Backend.FLASH_ATTN]
HEAD_SIZES = [64, 256]
NUM_HEADS = [1, 16]
@@ -145,7 +145,8 @@ class that Attention will automatically select when it is constructed.
test_pt.num_heads,
test_pt.head_size,
test_pt.block_size,
- device=CUDA_DEVICE)
+ device=CUDA_DEVICE,
+ backend=test_pt.backend_name)
return TestResources(scale, attn_backend, attn, kv_cache)
@@ -592,6 +593,7 @@ def _run_encoder_attention_test(
attn: Attention,
encoder_test_params: PhaseTestParameters,
attn_metadata: AttentionMetadata,
+ test_pt: TestPoint,
) -> torch.Tensor:
'''
Run encoder attention.
@@ -610,6 +612,8 @@ def _run_encoder_attention_test(
(number_of_tokens x num_heads x head_size)
query/key/value fields
* attn_metadata: attention metadata for encoder/decoder-self attention
+ * test_pt: The TestPoint object containing test details like number of
+ model heads, head size, name of the backend being used etc.
Returns:
* Attention.forward() applied to packed {query,key,value} and
@@ -619,20 +623,31 @@ def _run_encoder_attention_test(
attn_type = AttentionType.ENCODER
packed_qkv = encoder_test_params.packed_qkvo.packed_qkv
assert packed_qkv is not None
- return attn.forward(packed_qkv.query,
- packed_qkv.key,
- packed_qkv.value,
- torch.tensor([],
- dtype=torch.float32,
- device=packed_qkv.query.device),
- attn_metadata,
- attn_type=attn_type)
+ with set_forward_context(attn_metadata):
+ # In the test setup the shape of the query is
+ # [batch_size, seq_len, num_heads, head_size]. However
+ # the attention backend expects the shape to be
+ # [num_tokens, hidden_size]. Hence reshape the query before
+ # invoking the forward method.
+ # TODO - Update the way we construct the query so that it
+ # is shaped as [num_tokens, hidden_size] and we can skip the reshape.
+ reshaped_query = packed_qkv.query.view(
+ -1, test_pt.num_heads * test_pt.head_size)
+ return attn.forward(reshaped_query,
+ packed_qkv.key,
+ packed_qkv.value,
+ torch.tensor([],
+ dtype=torch.float32,
+ device=packed_qkv.query.device),
+ attn_metadata,
+ attn_type=attn_type)
def _run_decoder_self_attention_test(
test_rsrcs: TestResources,
decoder_test_params: PhaseTestParameters,
attn_metadata: AttentionMetadata,
+ test_pt: TestPoint,
) -> torch.Tensor:
'''
Run decoder self-attention test.
@@ -650,6 +665,8 @@ def _run_decoder_self_attention_test(
query/key/value fields
* attn_metadata: attention metadata for decoder-self attention
(contains KV cache memory-mapping)
+ * test_pt: The TestPoint object containing test details like number of
+ model heads, head size, name of the backend being used etc.
Returns:
* Attention.forward() applied to packed_{query,key,value}, kv_cache
@@ -660,12 +677,22 @@ def _run_decoder_self_attention_test(
kv_cache = test_rsrcs.kv_cache
packed_qkv = decoder_test_params.packed_qkvo.packed_qkv
assert packed_qkv is not None
- return attn.forward(packed_qkv.query,
- packed_qkv.key,
- packed_qkv.value,
- kv_cache,
- attn_metadata,
- attn_type=attn_type)
+ with set_forward_context(attn_metadata):
+ # In the test setup the shape of the query is
+ # [batch_size, seq_len, num_heads, head_size]. However
+ # the attention backend expects the shape to be
+ # [num_tokens, hidden_size]. Hence reshape the query before
+ # invoking the forward method.
+ # TODO - Update the way we construct the query so that it
+ # is shaped as [num_tokens, hidden_size] and we can skip the reshape.
+ reshaped_query = packed_qkv.query.view(
+ -1, test_pt.num_heads * test_pt.head_size)
+ return attn.forward(reshaped_query,
+ packed_qkv.key,
+ packed_qkv.value,
+ kv_cache,
+ attn_metadata,
+ attn_type=attn_type)
def _run_encoder_decoder_cross_attention_test(
@@ -673,6 +700,7 @@ def _run_encoder_decoder_cross_attention_test(
decoder_test_params: PhaseTestParameters,
cross_test_params: Optional[PhaseTestParameters],
attn_metadata: AttentionMetadata,
+ test_pt: TestPoint,
) -> torch.Tensor:
'''
Run encoder/decoder cross-attention test.
@@ -701,6 +729,8 @@ def _run_encoder_decoder_cross_attention_test(
(number_of_tokens x num_heads x head_size)
key/value fields
* attn_metadata: attention metadata for encoder/decoder-self attention
+ * test_pt: The TestPoint object containing test details like number of
+ model heads, head size, name of the backend being used etc.
Returns:
* Attention.forward() applied to packed_{query,key,value}, kv_cache
@@ -718,12 +748,37 @@ def _run_encoder_decoder_cross_attention_test(
cross_pckd_qkv = cross_test_params.packed_qkvo.packed_qkv
key = (None if cross_pckd_qkv is None else cross_pckd_qkv.key)
value = (None if cross_pckd_qkv is None else cross_pckd_qkv.value)
- return attn.forward(decoder_test_params.packed_qkvo.packed_qkv.query,
- key,
- value,
- kv_cache,
- attn_metadata,
- attn_type=attn_type)
+ with set_forward_context(attn_metadata):
+ # In the test setup the shape of the query is
+ # [batch_size, seq_len, num_heads, head_size]. However
+ # the attention backend expects the shape to be
+ # [num_tokens, hidden_size]. Hence reshape the query before
+ # invoking the forward method.
+ # TODO - Update the way we construct the query so that it
+ # is shaped as [num_tokens, hidden_size] and we can skip the reshape.
+ reshaped_query = decoder_test_params.packed_qkvo.packed_qkv.query.view(
+ -1, test_pt.num_heads * test_pt.head_size)
+ return attn.forward(reshaped_query,
+ key,
+ value,
+ kv_cache,
+ attn_metadata,
+ attn_type=attn_type)
+
+
+@pytest.fixture(autouse=True)
+def set_reset_environment(attn_backend):
+ # Set the default torch datatype to bfloat16 to enable
+ # testing of the Flash Attention backend. Also clear the
+ # cached value of the backend.
+ default_dtype = torch.get_default_dtype()
+ if attn_backend.name == 'FLASH_ATTN':
+ torch.set_default_dtype(torch.bfloat16)
+ get_attn_backend.cache_clear()
+ yield
+ # Reset the torch datatype to what it was before the test
+ # so as not to impact the remaining tests.
+ torch.set_default_dtype(default_dtype)
@pytest.mark.skipif(current_platform.is_rocm(),
@@ -773,10 +828,8 @@ def test_encoder_only(
* max_dec_seq_len: max length of decoder input sequences
* max_enc_seq_len: max length of encoder input sequences
'''
-
# Force Attention wrapper backend
with global_force_attn_backend_context_manager(attn_backend):
-
# Note: KV cache size of 4096 is arbitrary & chosen intentionally
# to be more than necessary, since exceeding the kv cache size
# is not part of this test
@@ -807,10 +860,14 @@ def test_encoder_only(
# PREFILL: encoder attention
enc_pckd_act_out: torch.Tensor = (_run_encoder_attention_test(
- test_rsrcs.attn, enc_test_params, prephase_attn_metadata))
+ test_rsrcs.attn,
+ enc_test_params,
+ prephase_attn_metadata,
+ test_pt=test_pt))
# - Is encoder attention result correct?
- assert_actual_matches_ideal(enc_test_params, enc_pckd_act_out)
+ assert_actual_matches_ideal(enc_test_params, enc_pckd_act_out,
+ attn_backend.name)
@pytest.mark.skipif(current_platform.is_rocm(),
@@ -892,10 +949,8 @@ def test_e2e_enc_dec_attn(
* max_dec_seq_len: max length of decoder input sequences
* max_enc_seq_len: max length of encoder input sequences
'''
-
# Force Attention wrapper backend
with global_force_attn_backend_context_manager(attn_backend):
-
# Note: KV cache size of 4096 is arbitrary & chosen intentionally
# to be more than necessary, since exceeding the kv cache size
# is not part of this test
@@ -955,29 +1010,39 @@ def test_e2e_enc_dec_attn(
enc_pckd_act_out = _run_encoder_attention_test(test_rsrcs.attn,
enc_test_params,
- prephase_attn_metadata)
+ prephase_attn_metadata,
+ test_pt=test_pt)
# - Is encoder attention result correct?
- assert_actual_matches_ideal(enc_test_params, enc_pckd_act_out)
+ assert_actual_matches_ideal(enc_test_params, enc_pckd_act_out,
+ attn_backend.name)
# PREFILL: decoder self-attention test
prephase_dec_pckd_act_out = _run_decoder_self_attention_test(
- test_rsrcs, prephase_dec_test_params, prephase_attn_metadata)
+ test_rsrcs,
+ prephase_dec_test_params,
+ prephase_attn_metadata,
+ test_pt=test_pt)
# - Is prefill decoder self-attention correct?
assert_actual_matches_ideal(prephase_dec_test_params,
- prephase_dec_pckd_act_out)
+ prephase_dec_pckd_act_out,
+ attn_backend.name)
# PREFILL: encoder/decoder cross-attention test
prephase_cross_pckd_act_out = _run_encoder_decoder_cross_attention_test(
- test_rsrcs, prephase_dec_test_params, prephase_cross_test_params,
- prephase_attn_metadata)
+ test_rsrcs,
+ prephase_dec_test_params,
+ prephase_cross_test_params,
+ prephase_attn_metadata,
+ test_pt=test_pt)
# - Is prefill encoder/decoder cross-attention correct?
assert_actual_matches_ideal(prephase_cross_test_params,
- prephase_cross_pckd_act_out)
+ prephase_cross_pckd_act_out,
+ attn_backend.name)
# DECODE: build decode-phase attention metadata
@@ -993,17 +1058,26 @@ def test_e2e_enc_dec_attn(
# DECODE: decoder self-attention test
decphase_dec_pckd_act_out = _run_decoder_self_attention_test(
- test_rsrcs, decphase_dec_test_params, decphase_attn_metadata)
+ test_rsrcs,
+ decphase_dec_test_params,
+ decphase_attn_metadata,
+ test_pt=test_pt)
# - Is decode-phase decoder self-attention correct?
assert_actual_matches_ideal(decphase_dec_test_params,
- decphase_dec_pckd_act_out)
+ decphase_dec_pckd_act_out,
+ attn_backend.name)
# DECODE: encoder/decoder cross-attention test
decphase_cross_pckd_act_out = _run_encoder_decoder_cross_attention_test(
- test_rsrcs, decphase_dec_test_params, None, decphase_attn_metadata)
+ test_rsrcs,
+ decphase_dec_test_params,
+ None,
+ decphase_attn_metadata,
+ test_pt=test_pt)
# - Is decode-phase encoder/decoder cross-attention correct?
assert_actual_matches_ideal(decphase_cross_test_params,
- decphase_cross_pckd_act_out)
+ decphase_cross_pckd_act_out,
+ attn_backend.name)
diff --git a/tests/kernels/utils.py b/tests/kernels/utils.py
index c3d5252edc2a3..e7865fb2500ef 100644
--- a/tests/kernels/utils.py
+++ b/tests/kernels/utils.py
@@ -13,8 +13,8 @@
from vllm.attention import AttentionBackend, AttentionMetadata, AttentionType
from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.utils import (STR_BACKEND_ENV_VAR, STR_XFORMERS_ATTN_VAL,
- make_tensor_with_pad)
+from vllm.utils import (STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL,
+ STR_XFORMERS_ATTN_VAL, make_tensor_with_pad)
# For now, disable "test_aot_dispatch_dynamic" since there are some
# bugs related to this test in PyTorch 2.4.
@@ -525,17 +525,22 @@ def make_backend(backend_name: str) -> AttentionBackend:
if backend_name == STR_XFORMERS_ATTN_VAL:
# NOTE: xFormers backend cannot be imported for CPU and AMD GPUs.
from vllm.attention.backends.xformers import XFormersBackend
-
return XFormersBackend()
+ elif backend_name == STR_FLASH_ATTN_VAL:
+ from vllm.attention.backends.flash_attn import FlashAttentionBackend
+ return FlashAttentionBackend()
+
raise AssertionError(
f"Unrecognized backend_name {backend_name} for unit test")
def _make_metadata_tensors(
- seq_lens: Optional[List[int]], context_lens: Optional[List[int]],
- encoder_seq_lens: Optional[List[int]], device: Union[torch.device, str]
-) -> Tuple[torch.Tensor, torch.Tensor, Any, Any, Optional[List[int]],
- torch.Tensor, Optional[int]]:
+ seq_lens: Optional[List[int]],
+ context_lens: Optional[List[int]],
+ encoder_seq_lens: Optional[List[int]],
+ device: Union[torch.device, str],
+) -> Tuple[torch.Tensor, torch.Tensor, Any, Any, Optional[torch.Tensor],
+ torch.Tensor, torch.Tensor, Optional[int]]:
'''
Build scalar & tensor values required to build attention metadata structure.
@@ -553,6 +558,8 @@ def _make_metadata_tensors(
* max_context_len: max(context_lens)
* max_seq_len: max(seq_lens)
* seq_start_loc: start idx of each sequence
+ * encoder_seq_lens_tensor: encoder seq_lens list, as tensor
+ * encoder_seq_start_loc: start idx of each encoder sequence
* max_encoder_seq_len: encoder seq_lens list, as tensor
'''
seq_lens_tensor = maybe_make_int_tensor(seq_lens, device)
@@ -566,8 +573,26 @@ def _make_metadata_tensors(
seq_start_loc = None
+ if seq_lens_tensor is not None:
+ seq_start_loc = torch.zeros(seq_lens_tensor.shape[0] + 1,
+ dtype=torch.int32,
+ device=seq_lens_tensor.device)
+ torch.cumsum(seq_lens_tensor,
+ dim=0,
+ dtype=seq_start_loc.dtype,
+ out=seq_start_loc[1:])
+
+ encoder_seq_start_loc = torch.zeros(encoder_seq_lens_tensor.shape[0] + 1,
+ dtype=torch.int32,
+ device=encoder_seq_lens_tensor.device)
+ torch.cumsum(encoder_seq_lens_tensor,
+ dim=0,
+ dtype=encoder_seq_start_loc.dtype,
+ out=encoder_seq_start_loc[1:])
+
return (seq_lens_tensor, context_lens_tensor, max_context_len, max_seq_len,
- seq_start_loc, encoder_seq_lens_tensor, max_encoder_seq_len)
+ seq_start_loc, encoder_seq_lens_tensor, encoder_seq_start_loc,
+ max_encoder_seq_len)
def make_kv_cache(num_blocks: int,
@@ -575,6 +600,7 @@ def make_kv_cache(num_blocks: int,
head_size: int,
block_size: int,
device: Union[torch.device, str],
+ backend: str,
default_val: float = 0.0) -> torch.Tensor:
'''
Create a fake KV cache.
@@ -591,10 +617,20 @@ def make_kv_cache(num_blocks: int,
Returns:
* kv_cache: 2 x num_blocks x (block_size * num_heads * head_size)
+ * for backend 'XFORMERS'
+ * kv_cache: 2 x num_blocks x block_size x num_heads x head_size
+ * for backend 'FLASH_ATTN'
'''
-
- kv_cache = torch.rand(
- (2, num_blocks, block_size * num_heads * head_size)).to(device)
+ if backend == 'XFORMERS':
+ kv_cache = torch.rand(
+ (2, num_blocks, block_size * num_heads * head_size)).to(device)
+ elif backend == 'FLASH_ATTN':
+ kv_cache = torch.rand(
+ (2, num_blocks, block_size, num_heads, head_size)).to(device)
+ else:
+ raise ValueError(
+ f"Unknown backend value: '{backend}'. Expected 'XFORMERS' or "
+ f"'FLASH_ATTN'.")
if default_val is not None:
kv_cache[:, :, :] = default_val
return kv_cache
@@ -858,8 +894,9 @@ def make_test_metadata(
context_lens_tensor,
_,
_,
- _,
+ seq_start_loc,
encoder_seq_lens_tensor,
+ encoder_seq_start_loc,
max_encoder_seq_len,
) = _make_metadata_tensors(seq_lens,
context_lens,
@@ -874,6 +911,7 @@ def make_test_metadata(
num_decode_tokens=num_decode_tokens,
seq_lens=seq_lens,
seq_lens_tensor=seq_lens_tensor,
+ seq_start_loc=seq_start_loc,
max_prefill_seq_len=None if seq_lens is None else max(seq_lens),
max_decode_seq_len=0,
context_lens_tensor=context_lens_tensor,
@@ -882,6 +920,7 @@ def make_test_metadata(
num_encoder_tokens=num_encoder_tokens,
encoder_seq_lens=encoder_seq_lens,
encoder_seq_lens_tensor=encoder_seq_lens_tensor,
+ encoder_seq_start_loc=encoder_seq_start_loc,
max_encoder_seq_len=max_encoder_seq_len,
cross_slot_mapping=(None if cross_kv_mmap is None else
cross_kv_mmap.slot_mapping),
@@ -904,8 +943,9 @@ def make_test_metadata(
context_lens_tensor,
_,
_,
- _,
+ seq_start_loc,
encoder_seq_lens_tensor,
+ encoder_seq_start_loc,
max_encoder_seq_len,
) = _make_metadata_tensors(seq_lens,
context_lens,
@@ -920,14 +960,17 @@ def make_test_metadata(
num_decode_tokens=num_decode_tokens,
seq_lens=seq_lens,
seq_lens_tensor=seq_lens_tensor,
+ seq_start_loc=seq_start_loc,
max_prefill_seq_len=0,
max_decode_seq_len=max(seq_lens),
+ max_decode_query_len=1,
context_lens_tensor=context_lens_tensor,
block_tables=kv_mmap.block_tables,
use_cuda_graph=False,
num_encoder_tokens=num_encoder_tokens,
encoder_seq_lens=encoder_seq_lens,
encoder_seq_lens_tensor=encoder_seq_lens_tensor,
+ encoder_seq_start_loc=encoder_seq_start_loc,
max_encoder_seq_len=max_encoder_seq_len,
cross_slot_mapping=(None if cross_kv_mmap is None else
cross_kv_mmap.slot_mapping),
@@ -936,7 +979,8 @@ def make_test_metadata(
def assert_actual_matches_ideal(test_params: PhaseTestParameters,
- output_under_test: torch.Tensor) -> None:
+ output_under_test: torch.Tensor,
+ backend: str) -> None:
'''
Assert that observed output matches the ideal output
contained in the test parameters data structure.
@@ -947,8 +991,22 @@ def assert_actual_matches_ideal(test_params: PhaseTestParameters,
* output_under_test: actually observed output value
'''
ideal_output = test_params.packed_qkvo.ideal_output
- torch.testing.assert_close(ideal_output,
- output_under_test.view_as(ideal_output))
+ if backend == 'XFORMERS':
+ torch.testing.assert_close(ideal_output,
+ output_under_test.view_as(ideal_output))
+
+ elif backend == 'FLASH_ATTN':
+ # For FlashAttention override the accuracy thresholds to non-default
+ # values since we notice a higher difference between the ideal and
+ # actual output.
+ torch.testing.assert_close(ideal_output,
+ output_under_test.view_as(ideal_output),
+ atol=0.01,
+ rtol=0.016)
+ else:
+ raise ValueError(
+ f"Unknown backend value: '{backend}'. Expected 'XFORMERS' or "
+ f"'FLASH_ATTN'.")
# Copied/modified from torch._refs.__init__.py
diff --git a/tests/models/encoder_decoder/vision_language/test_florence2.py b/tests/models/encoder_decoder/vision_language/test_florence2.py
index 483773f069133..d686f1da3fa17 100644
--- a/tests/models/encoder_decoder/vision_language/test_florence2.py
+++ b/tests/models/encoder_decoder/vision_language/test_florence2.py
@@ -85,7 +85,7 @@ def run_test(
@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["float"])
+@pytest.mark.parametrize("dtype", ["float", "bfloat16"])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(hf_runner, vllm_runner, model, dtype, max_tokens,
diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py
index ab363ac78b028..2975a41797e9f 100644
--- a/vllm/attention/backends/flash_attn.py
+++ b/vllm/attention/backends/flash_attn.py
@@ -10,10 +10,11 @@
AttentionMetadata,
AttentionMetadataBuilder,
AttentionType)
-from vllm.attention.backends.utils import (PAD_SLOT_ID, CommonAttentionState,
- compute_slot_mapping,
- compute_slot_mapping_start_idx,
- is_block_tables_empty)
+from vllm.attention.backends.utils import (
+ PAD_SLOT_ID, CommonAttentionState, compute_slot_mapping,
+ compute_slot_mapping_start_idx, get_num_prefill_decode_query_kv_tokens,
+ get_seq_len_block_table_args, is_all_cross_attn_metadata_set,
+ is_all_encoder_attn_metadata_set, is_block_tables_empty)
from vllm.forward_context import get_forward_context
from vllm.multimodal import MultiModalPlaceholderMap
from vllm.utils import (async_tensor_h2d, direct_register_custom_op,
@@ -73,7 +74,6 @@ def swap_blocks(
src_key_cache = src_kv_cache[0]
dst_key_cache = dst_kv_cache[0]
ops.swap_blocks(src_key_cache, dst_key_cache, src_to_dst)
-
src_value_cache = src_kv_cache[1]
dst_value_cache = dst_kv_cache[1]
ops.swap_blocks(src_value_cache, dst_value_cache, src_to_dst)
@@ -85,6 +85,7 @@ def copy_blocks(
) -> None:
key_caches = [kv_cache[0] for kv_cache in kv_caches]
value_caches = [kv_cache[1] for kv_cache in kv_caches]
+
ops.copy_blocks(key_caches, value_caches, src_to_dists)
@@ -111,26 +112,12 @@ class FlashAttentionMetadata(AttentionMetadata):
# |-------------------- seq_len ---------------------|
# |-- query_len ---|
- # Maximum query length in the batch.
- max_query_len: Optional[int]
-
- # Max number of query tokens among request in the batch.
- max_decode_query_len: Optional[int]
-
# Maximum sequence length among prefill batch. 0 if there are decoding
# requests only.
max_prefill_seq_len: int
# Maximum sequence length among decode batch. 0 if there are prefill
# requests only.
max_decode_seq_len: int
- # (batch_size + 1,). The cumulative subquery lengths of the sequences in
- # the batch, used to index into subquery. E.g., if the subquery length
- # is [4, 6], it is [0, 4, 10].
- query_start_loc: Optional[torch.Tensor]
- # (batch_size + 1,). The cumulative sequence lengths of the sequences in
- # the batch, used to index into sequence. E.g., if the sequence length is
- # [4, 6], it is [0, 4, 10].
- seq_start_loc: Optional[torch.Tensor]
# (batch_size,) A tensor of context lengths (tokens that are computed
# so far).
context_lens_tensor: Optional[torch.Tensor]
@@ -146,11 +133,62 @@ class FlashAttentionMetadata(AttentionMetadata):
# Whether or not if cuda graph is enabled.
# Cuda-graph is currently enabled for decoding only.
# TODO(woosuk): Move `use_cuda_graph` out since it's unrelated to attention.
+
use_cuda_graph: bool
+ # Maximum query length in the batch.
+ max_query_len: Optional[int] = None
+
+ # Max number of query tokens among requests in the batch.
+ max_decode_query_len: Optional[int] = None
+
+ # (batch_size + 1,). The cumulative subquery lengths of the sequences in
+ # the batch, used to index into subquery. E.g., if the subquery length
+ # is [4, 6], it is [0, 4, 10].
+ query_start_loc: Optional[torch.Tensor] = None
+ # (batch_size + 1,). The cumulative sequence lengths of the sequences in
+ # the batch, used to index into sequence. E.g., if the sequence length is
+ # [4, 6], it is [0, 4, 10].
+ seq_start_loc: Optional[torch.Tensor] = None
+
_cached_prefill_metadata: Optional["FlashAttentionMetadata"] = None
_cached_decode_metadata: Optional["FlashAttentionMetadata"] = None
+ # Begin encoder attn & enc/dec cross-attn fields...
+
+ # Encoder sequence lengths representation
+ encoder_seq_lens: Optional[List[int]] = None
+ encoder_seq_lens_tensor: Optional[torch.Tensor] = None
+ # (batch_size + 1,). The cumulative sequence lengths of the sequences in
+ # the batch, used to index into sequence. E.g., if the sequence length is
+ # [4, 6], it is [0, 4, 10].
+ encoder_seq_start_loc: Optional[torch.Tensor] = None
+ # Maximum sequence length among encoder sequences
+ max_encoder_seq_len: Optional[int] = None
+ # Number of tokens input to encoder
+ num_encoder_tokens: Optional[int] = None
+
+ # Cross-attention memory-mapping data structures: slot mapping
+ # and block tables
+ cross_slot_mapping: Optional[torch.Tensor] = None
+ cross_block_tables: Optional[torch.Tensor] = None
+
+ @property
+ def is_all_encoder_attn_metadata_set(self):
+ '''
+ All attention metadata required for encoder attention is set.
+ '''
+ return is_all_encoder_attn_metadata_set(self)
+
+ @property
+ def is_all_cross_attn_metadata_set(self):
+ '''
+ All attention metadata required for enc/dec cross-attention is set.
+
+ Superset of encoder attention required metadata.
+ '''
+ return is_all_cross_attn_metadata_set(self)
+
@property
def prefill_metadata(self) -> Optional["FlashAttentionMetadata"]:
if self.num_prefills == 0:
@@ -159,32 +197,52 @@ def prefill_metadata(self) -> Optional["FlashAttentionMetadata"]:
if self._cached_prefill_metadata is not None:
return self._cached_prefill_metadata
- assert self.seq_lens is not None
- assert self.seq_lens_tensor is not None
- assert self.query_start_loc is not None
- assert self.context_lens_tensor is not None
- assert self.block_tables is not None
- assert self.seq_start_loc is not None
+ assert ((self.seq_lens is not None)
+ or (self.encoder_seq_lens is not None))
+ assert ((self.seq_lens_tensor is not None)
+ or (self.encoder_seq_lens_tensor is not None))
+
+ # Compute some attn_metadata fields which default to None
+ query_start_loc = (None if self.query_start_loc is None else
+ self.query_start_loc[:self.num_prefills + 1])
+ slot_mapping = (None if self.slot_mapping is None else
+ self.slot_mapping[:self.num_prefill_tokens])
+ seq_lens = (None if self.seq_lens is None else
+ self.seq_lens[:self.num_prefills])
+ seq_lens_tensor = (None if self.seq_lens_tensor is None else
+ self.seq_lens_tensor[:self.num_prefills])
+ seq_start_loc = (None if self.seq_start_loc is None else
+ self.seq_start_loc[:self.num_prefills + 1])
+ context_lens_tensor = (None if self.context_lens_tensor is None else
+ self.context_lens_tensor[:self.num_prefills])
+ block_tables = (None if self.block_tables is None else
+ self.block_tables[:self.num_prefills])
self._cached_prefill_metadata = FlashAttentionMetadata(
num_prefills=self.num_prefills,
num_prefill_tokens=self.num_prefill_tokens,
num_decode_tokens=0,
- slot_mapping=self.slot_mapping[:self.num_prefill_tokens],
+ slot_mapping=slot_mapping,
multi_modal_placeholder_index_maps=self.
multi_modal_placeholder_index_maps,
- seq_lens=self.seq_lens[:self.num_prefills],
- seq_lens_tensor=self.seq_lens_tensor[:self.num_prefills],
+ seq_lens=seq_lens,
+ seq_lens_tensor=seq_lens_tensor,
max_query_len=self.max_query_len,
max_prefill_seq_len=self.max_prefill_seq_len,
max_decode_query_len=0,
max_decode_seq_len=0,
- query_start_loc=self.query_start_loc[:self.num_prefills + 1],
- seq_start_loc=self.seq_start_loc[:self.num_prefills + 1],
- context_lens_tensor=self.context_lens_tensor[:self.num_prefills],
- block_tables=self.block_tables[:self.num_prefills],
+ query_start_loc=query_start_loc,
+ seq_start_loc=seq_start_loc,
+ context_lens_tensor=context_lens_tensor,
+ block_tables=block_tables,
use_cuda_graph=False,
- )
+ # Begin encoder & cross attn fields below...
+ encoder_seq_lens=self.encoder_seq_lens,
+ encoder_seq_lens_tensor=self.encoder_seq_lens_tensor,
+ encoder_seq_start_loc=self.encoder_seq_start_loc,
+ max_encoder_seq_len=self.max_encoder_seq_len,
+ cross_slot_mapping=self.cross_slot_mapping,
+ cross_block_tables=self.cross_block_tables)
return self._cached_prefill_metadata
@property
@@ -194,17 +252,25 @@ def decode_metadata(self) -> Optional["FlashAttentionMetadata"]:
if self._cached_decode_metadata is not None:
return self._cached_decode_metadata
- assert self.block_tables is not None
- assert self.seq_lens_tensor is not None
+ assert ((self.seq_lens_tensor is not None)
+ or (self.encoder_seq_lens_tensor is not None))
+
+ # Compute some attn_metadata fields which default to None
+ slot_mapping = (None if self.slot_mapping is None else
+ self.slot_mapping[self.num_prefill_tokens:])
+ seq_lens_tensor = (None if self.seq_lens_tensor is None else
+ self.seq_lens_tensor[self.num_prefills:])
+ block_tables = (None if self.block_tables is None else
+ self.block_tables[self.num_prefills:])
self._cached_decode_metadata = FlashAttentionMetadata(
num_prefills=0,
num_prefill_tokens=0,
num_decode_tokens=self.num_decode_tokens,
- slot_mapping=self.slot_mapping[self.num_prefill_tokens:],
+ slot_mapping=slot_mapping,
multi_modal_placeholder_index_maps=None,
seq_lens=None,
- seq_lens_tensor=self.seq_lens_tensor[self.num_prefills:],
+ seq_lens_tensor=seq_lens_tensor,
max_decode_query_len=self.max_decode_query_len,
max_query_len=self.max_query_len,
max_prefill_seq_len=0,
@@ -214,9 +280,15 @@ def decode_metadata(self) -> Optional["FlashAttentionMetadata"]:
seq_start_loc=self.seq_start_loc[self.num_prefills:]
if self.seq_start_loc is not None else None,
context_lens_tensor=None,
- block_tables=self.block_tables[self.num_prefills:],
+ block_tables=block_tables,
use_cuda_graph=self.use_cuda_graph,
- )
+ # Begin encoder & cross attn fields below...
+ encoder_seq_lens=self.encoder_seq_lens,
+ encoder_seq_lens_tensor=self.encoder_seq_lens_tensor,
+ encoder_seq_start_loc=self.encoder_seq_start_loc,
+ max_encoder_seq_len=self.max_encoder_seq_len,
+ cross_slot_mapping=self.cross_slot_mapping,
+ cross_block_tables=self.cross_block_tables)
return self._cached_decode_metadata
def advance_step(self,
@@ -586,16 +658,20 @@ def forward(
Returns:
shape = [num_tokens, num_heads * head_size]
"""
- if attn_type != AttentionType.DECODER:
- raise NotImplementedError("Encoder self-attention and "
- "encoder/decoder cross-attention "
- "are not implemented for "
- "FlashAttentionImpl")
-
# NOTE(woosuk): FlashAttention does not support FP8 KV cache.
assert k_scale == 1.0 and v_scale == 1.0, (
"key/v_scale is not supported in FlashAttention.")
+ if (attn_type == AttentionType.ENCODER
+ and (not attn_metadata.is_all_encoder_attn_metadata_set)):
+ raise AttributeError("Encoder attention requires setting "
+ "encoder metadata attributes.")
+ elif (attn_type == AttentionType.ENCODER_DECODER
+ and (not attn_metadata.is_all_cross_attn_metadata_set)):
+ raise AttributeError("Encoder/decoder cross-attention "
+ "requires setting cross-attention "
+ "metadata attributes.")
+
output = torch.ops.vllm.unified_flash_attention(
query,
key,
@@ -608,6 +684,7 @@ def forward(
k_scale,
v_scale,
self.scale,
+ attn_type.value,
self.sliding_window,
self.alibi_slopes,
self.logits_soft_cap,
@@ -616,6 +693,89 @@ def forward(
return output
+def _get_query_key_seq_metadata(
+ attn_metadata,
+ is_prompt: bool,
+ attn_type: AttentionType,
+) -> tuple:
+ """
+ Returns sequence metadata for key and query based on the specified
+ attention type and whether input is a prompt.
+
+ This function computes the starting locations and maximum sequence lengths
+ for key and query sequences for different attention types.
+
+ Args:
+ attn_metadata: The attention metadata object
+ is_prompt (bool): A flag indicating if the input is a prompt
+ attn_type (AttentionType): The type of attention being used.
+
+ Returns:
+ tuple: A tuple containing four elements:
+ - Cumulative start locations of the query sequences.
+ - Maximum sequence length for the query sequences.
+ - Cumulative start locations of the key sequences.
+ - Maximum sequence length for the key sequences.
+
+ Raises:
+ AttributeError: If an invalid attention type is provided.
+ """
+ if attn_type == AttentionType.DECODER:
+ # Decoder self-attention
+ # Choose max_seq_len based on whether we are in prompt_run
+ if is_prompt:
+ max_seq_len = attn_metadata.max_prefill_seq_len
+ else:
+ max_seq_len = attn_metadata.max_decode_seq_len
+ return (attn_metadata.seq_start_loc, max_seq_len,
+ attn_metadata.seq_start_loc, max_seq_len)
+
+ elif attn_type == AttentionType.ENCODER_DECODER:
+ # This is cross-attention, where the key
+ # is the precomputed encoder attention and the query
+ # is the input sequence.
+ # Choose query max length based on whether it is prompt
+ # or not.
+ if is_prompt:
+ max_seq_len = attn_metadata.max_prefill_seq_len
+ else:
+ max_seq_len = attn_metadata.max_decode_seq_len
+ return (attn_metadata.seq_start_loc, max_seq_len,
+ attn_metadata.encoder_seq_start_loc,
+ attn_metadata.max_encoder_seq_len)
+ elif attn_type == AttentionType.ENCODER:
+ # For encoder attention both the query and the key are same i.e the
+ # encoder sequence.
+ return (attn_metadata.encoder_seq_start_loc,
+ attn_metadata.max_encoder_seq_len,
+ attn_metadata.encoder_seq_start_loc,
+ attn_metadata.max_encoder_seq_len)
+ elif attn_type == AttentionType.ENCODER_ONLY:
+ assert is_prompt, "Should not have decode for encoder only model."
+ return (attn_metadata.seq_start_loc, attn_metadata.max_prefill_seq_len,
+ attn_metadata.seq_start_loc, attn_metadata.max_prefill_seq_len)
+ else:
+ raise AttributeError(f"Invalid attention type {str(attn_type)}")
+
+
+def _get_causal_option(attn_type: AttentionType) -> bool:
+ """
+ Determine whether the given attention type is suitable for causal
+ attention mechanisms.
+
+ Args:
+ attn_type (AttentionType): The type of attention being evaluated
+
+ Returns:
+ bool: Returns `True` if the attention type is suitable for causal
+ attention (i.e., not encoder, encoder-only, or encoder-decoder),
+ otherwise returns `False`.
+ """
+ return not (attn_type == AttentionType.ENCODER
+ or attn_type == AttentionType.ENCODER_ONLY
+ or attn_type == AttentionType.ENCODER_DECODER)
+
+
def unified_flash_attention(
query: torch.Tensor,
key: torch.Tensor,
@@ -628,60 +788,76 @@ def unified_flash_attention(
k_scale: float,
v_scale: float,
softmax_scale: float,
+ attn_type_int_val: int,
window_size: Optional[List[int]] = None,
alibi_slopes: Optional[torch.Tensor] = None,
logits_soft_cap: Optional[float] = None,
) -> torch.Tensor:
+ # Convert integer attn_type to enum
+ try:
+ attn_type = AttentionType(attn_type_int_val)
+ except ValueError as err:
+ raise AttributeError(
+ f"Invalid attention type {str(attn_type_int_val)}") from err
+
current_metadata = get_forward_context()
assert current_metadata is not None
assert isinstance(current_metadata, FlashAttentionMetadata)
attn_metadata: FlashAttentionMetadata = current_metadata
num_tokens, hidden_size = query.shape
+
# Reshape the query, key, and value tensors.
query = query.view(-1, num_heads, head_size)
- key = key.view(-1, num_kv_heads, head_size)
- value = value.view(-1, num_kv_heads, head_size)
+ if (key is not None) and (value is not None):
+ key = key.view(-1, num_kv_heads, head_size)
+ value = value.view(-1, num_kv_heads, head_size)
if kv_cache.numel() > 0:
key_cache = kv_cache[0]
value_cache = kv_cache[1]
+ # We skip updating the KV cache under two conditions:
+ # a. When the Attention Type is ENCODER. In this phase, we compute
+ # only the encoder attention without updating the cache.
+ # b. When both Key and Value are None. This occurs during
+ # cross-attention computation in the decoding phase, where the KV
+ # cache is already populated with the cross-attention tensor.
+ # Thus, we skip cache updates during this time.
+ if (attn_type != AttentionType.ENCODER) and (key is not None) and (
+ value is not None):
+ if attn_type == AttentionType.ENCODER_DECODER:
+ # Update cross-attention KV cache (prefill-only)
+ updated_slot_mapping = attn_metadata.cross_slot_mapping
+ else:
+ # Update self-attention KV cache (prefill/decode)
+ updated_slot_mapping = attn_metadata.slot_mapping
+
+ # Reshape the input keys and values and store them in the cache.
+ # If kv_cache is not provided, the new key and value tensors are
+ # not cached. This happens during the initial memory profiling run.
+ torch.ops._C_cache_ops.reshape_and_cache_flash(
+ key,
+ value,
+ kv_cache[0],
+ kv_cache[1],
+ updated_slot_mapping.flatten(), # type: ignore[union-attr]
+ kv_cache_dtype,
+ k_scale,
+ v_scale,
+ )
- # Reshape the input keys and values and store them in the cache.
- # If kv_cache is not provided, the new key and value tensors are
- # not cached. This happens during the initial memory profiling run.
- torch.ops._C_cache_ops.reshape_and_cache_flash(
- key,
- value,
- kv_cache[0],
- kv_cache[1],
- attn_metadata.slot_mapping.flatten(),
- kv_cache_dtype,
- k_scale,
- v_scale,
- )
-
- num_prefill_tokens = attn_metadata.num_prefill_tokens
- num_decode_tokens = attn_metadata.num_decode_tokens
- assert key.shape[0] == num_prefill_tokens + num_decode_tokens, \
- f"key : {key.shape} : #prefill tokens {num_prefill_tokens} : #decode tokens {num_decode_tokens}" # noqa
- assert value.shape[0] == num_prefill_tokens + num_decode_tokens, \
- f"value : {value.shape} : #prefill toks {num_prefill_tokens} : #decode toks {num_decode_tokens}" # noqa
-
- # Query for decode. KV is not needed because it is already cached.
- decode_query = query[num_prefill_tokens:]
+ (num_prefill_query_tokens, num_prefill_kv_tokens,
+ num_decode_query_tokens) = \
+ get_num_prefill_decode_query_kv_tokens(attn_metadata, attn_type)
+ decode_query = query[num_prefill_query_tokens:]
# QKV for prefill.
- query = query[:num_prefill_tokens]
- key = key[:num_prefill_tokens]
- value = value[:num_prefill_tokens]
-
- assert query.shape[0] == num_prefill_tokens
- assert decode_query.shape[0] == num_decode_tokens
+ query = query[:num_prefill_query_tokens]
+ assert query.shape[0] == num_prefill_query_tokens
+ assert decode_query.shape[0] == num_decode_query_tokens
prefill_output: Optional[torch.Tensor] = None
decode_output: Optional[torch.Tensor] = None
-
if prefill_meta := attn_metadata.prefill_metadata:
# Prompt run.
if (kv_cache.numel() == 0 or prefill_meta.block_tables is None
@@ -689,22 +865,30 @@ def unified_flash_attention(
# normal attention
# When block_tables are not filled, it means q and k are the
# prompt, and they have the same length.
+ q_seq_start_loc, q_seq_len, k_seq_start_loc, k_seq_len = \
+ _get_query_key_seq_metadata(prefill_meta, True, attn_type)
+
+ key = key[:num_prefill_kv_tokens]
+ value = value[:num_prefill_kv_tokens]
+
prefill_output = flash_attn_varlen_func(
q=query,
k=key,
v=value,
- cu_seqlens_q=prefill_meta.seq_start_loc,
- cu_seqlens_k=prefill_meta.seq_start_loc,
- max_seqlen_q=prefill_meta.max_prefill_seq_len,
- max_seqlen_k=prefill_meta.max_prefill_seq_len,
+ cu_seqlens_q=q_seq_start_loc,
+ cu_seqlens_k=k_seq_start_loc,
+ max_seqlen_q=q_seq_len,
+ max_seqlen_k=k_seq_len,
softmax_scale=softmax_scale,
- causal=True,
+ causal=_get_causal_option(attn_type),
window_size=window_size,
alibi_slopes=alibi_slopes,
softcap=logits_soft_cap,
)
else:
# prefix-enabled attention
+ assert attn_type == AttentionType.DECODER, (
+ "Only decoder-only models support prefix caching")
assert prefill_meta.seq_lens is not None
max_seq_len = max(prefill_meta.seq_lens)
prefill_output = flash_attn_varlen_func( # noqa
@@ -729,6 +913,8 @@ def unified_flash_attention(
# because different queries might have different lengths.
assert decode_meta.max_decode_query_len is not None
if decode_meta.max_decode_query_len > 1:
+ assert attn_type == AttentionType.DECODER, (
+ "Only decoder-only models support max_decode_query_len > 1")
decode_output = flash_attn_varlen_func(
q=decode_query,
k=key_cache,
@@ -746,12 +932,17 @@ def unified_flash_attention(
)
else:
# Use flash_attn_with_kvcache for normal decoding.
+ (
+ seq_lens_arg,
+ _,
+ block_tables_arg,
+ ) = get_seq_len_block_table_args(decode_meta, False, attn_type)
decode_output = flash_attn_with_kvcache(
q=decode_query.unsqueeze(1),
k_cache=key_cache,
v_cache=value_cache,
- block_table=decode_meta.block_tables,
- cache_seqlens=decode_meta.seq_lens_tensor,
+ block_table=block_tables_arg,
+ cache_seqlens=seq_lens_arg,
softmax_scale=softmax_scale,
causal=True,
window_size=window_size,
@@ -761,10 +952,10 @@ def unified_flash_attention(
if prefill_output is None:
assert decode_output is not None
- return decode_output.view(num_decode_tokens, hidden_size)
+ return decode_output.view(num_decode_query_tokens, hidden_size)
if decode_output is None:
assert prefill_output is not None
- return prefill_output.view(num_prefill_tokens, hidden_size)
+ return prefill_output.view(num_prefill_query_tokens, hidden_size)
# Chunked prefill does not work with speculative decoding.
# Therefore, the query length for decode should be 1 in chunked prefill.
@@ -786,6 +977,7 @@ def unified_flash_attention_fake(
k_scale: float,
v_scale: float,
softmax_scale: float,
+ attn_type_int_val: int,
window_size: Optional[List[int]] = None,
alibi_slopes: Optional[torch.Tensor] = None,
logits_soft_cap: Optional[float] = None,
diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py
index 55293bbb06e1d..096c920c4833a 100644
--- a/vllm/attention/backends/utils.py
+++ b/vllm/attention/backends/utils.py
@@ -1,13 +1,14 @@
"""Attention backend utils"""
from collections import defaultdict
from contextlib import contextmanager
-from typing import TYPE_CHECKING, Any, Dict, List, Type, TypeVar, Union
+from typing import TYPE_CHECKING, Any, Dict, List, Tuple, Type, TypeVar, Union
import numpy as np
import torch
from vllm.attention import (AttentionMetadata, AttentionMetadataBuilder,
AttentionState)
+from vllm.attention.backends.abstract import AttentionType
from vllm.multimodal import MultiModalPlaceholderMap
from vllm.utils import async_tensor_h2d, make_tensor_with_pad
@@ -336,11 +337,13 @@ def graph_capture_get_metadata_for_batch(
use_cuda_graph=True,
)
if is_encoder_decoder_model:
- # The encoder decoder model works only with XFormers backend.
- # Assert the same.
- assert self.runner.attn_backend.get_name() == "XFORMERS", \
- f"Expected attn_backend name to be 'XFORMERS', but "\
- f" got '{self.runner.attn_backend.get_name()}'"
+ # The encoder decoder model works only with the XFormers and
+ # Flash Attention backends. Assert the same.
+ assert self.runner.attn_backend.get_name() in\
+ ["XFORMERS", "FLASH_ATTN"], \
+ f"Expected attn_backend name to be either 'XFORMERS' or " \
+ f"'FLASH_ATTN', but "\
+ f"got '{self.runner.attn_backend.get_name()}'"
self._update_captured_metadata_for_enc_dec_model(
batch_size=batch_size, attn_metadata=attn_metadata)
@@ -356,11 +359,13 @@ def get_graph_input_buffers(
"block_tables": attn_metadata.decode_metadata.block_tables,
}
if is_encoder_decoder_model:
- # The encoder decoder model works only with XFormers backend.
- # Assert the same.
- assert self.runner.attn_backend.get_name() == "XFORMERS", \
- f"Expected attn_backend name to be 'XFORMERS', but "\
- f" got '{self.runner.attn_backend.get_name()}'"
+ # The encoder decoder model works only with the XFormers and
+ # Flash Attention backends. Assert the same.
+ assert self.runner.attn_backend.get_name() in\
+ ["XFORMERS", "FLASH_ATTN"], \
+ f"Expected attn_backend name to be either 'XFORMERS' or "\
+ f"'FLASH_ATTN', but "\
+ f"got '{self.runner.attn_backend.get_name()}'"
self._add_additonal_input_buffers_for_enc_dec_model(
attn_metadata=attn_metadata, input_buffers=input_buffers)
return input_buffers
@@ -375,11 +380,13 @@ def prepare_graph_input_buffers(
input_buffers["block_tables"].copy_(
attn_metadata.decode_metadata.block_tables, non_blocking=True)
if is_encoder_decoder_model:
- # The encoder decoder model works only with XFormers backend.
- # Assert the same.
- assert self.runner.attn_backend.get_name() == "XFORMERS", \
- f"Expected attn_backend name to be 'XFORMERS', but "\
- f" got '{self.runner.attn_backend.get_name()}'"
+ # The encoder decoder model works only with the XFormers and
+ # Flash Attention backends. Assert the same.
+ assert self.runner.attn_backend.get_name() in\
+ ["XFORMERS", "FLASH_ATTN"], \
+ f"Expected attn_backend name to be either 'XFORMERS' or "\
+ f"'FLASH_ATTN', but "\
+ f"got '{self.runner.attn_backend.get_name()}'"
self._prepare_input_buffers_for_enc_dec_model(
attn_metadata, input_buffers)
@@ -411,6 +418,7 @@ def _update_captured_metadata_for_enc_dec_model(self, batch_size: int,
attn_metadata.encoder_seq_lens_tensor = torch.full(
(batch_size, ), 1, dtype=torch.int).cuda()
attn_metadata.max_encoder_seq_len = self.runner.max_seq_len_to_capture
+ attn_metadata.num_encoder_tokens = 0
def _add_additonal_input_buffers_for_enc_dec_model(
self, attn_metadata, input_buffers: Dict[str, Any]):
@@ -453,3 +461,122 @@ def _prepare_input_buffers_for_enc_dec_model(self, attn_metadata,
input_buffers["cross_block_tables"].copy_(
attn_metadata.decode_metadata.cross_block_tables,
non_blocking=True)
+
+
+def is_all_encoder_attn_metadata_set(attn_metadata):
+ '''
+ All attention metadata required for encoder attention is set.
+ '''
+ return ((attn_metadata.encoder_seq_lens is not None)
+ and (attn_metadata.encoder_seq_lens_tensor is not None)
+ and (attn_metadata.max_encoder_seq_len is not None))
+
+
+def is_all_cross_attn_metadata_set(attn_metadata):
+ '''
+ All attention metadata required for enc/dec cross-attention is set.
+
+ Superset of encoder attention required metadata.
+ '''
+ return (attn_metadata.is_all_encoder_attn_metadata_set
+ and (attn_metadata.cross_slot_mapping is not None)
+ and (attn_metadata.cross_block_tables is not None))
+
+
+def get_seq_len_block_table_args(
+ attn_metadata,
+ is_prompt: bool,
+ attn_type: AttentionType,
+) -> tuple:
+ '''
+ The particular choice of sequence-length- and block-table-related
+ attributes which should be extracted from attn_metadata is dependent
+ on the type of attention operation.
+
+ Decoder attn -> select entirely decoder self-attention-related fields
+ Encoder/decoder cross-attn -> select encoder sequence lengths &
+ cross-attn block-tables fields
+ Encoder attn -> select encoder sequence lengths fields & no block tables
+
+ Arguments:
+
+ * attn_metadata: Attention metadata structure associated with attention op
+ * is_prompt: True if prefill, False otherwise
+ * attn_type: encoder attention, decoder self-attention,
+ encoder/decoder cross-attention
+
+ Returns:
+
+ * Appropriate sequence-lengths tensor
+ * Appropriate max sequence-length scalar
+ * Appropriate block tables (or None)
+ '''
+
+ if attn_type == AttentionType.DECODER:
+ # Decoder self-attention
+ # Choose max_seq_len based on whether we are in prompt_run
+ if is_prompt:
+ max_seq_len = attn_metadata.max_prefill_seq_len
+ else:
+ max_seq_len = attn_metadata.max_decode_seq_len
+ return (attn_metadata.seq_lens_tensor, max_seq_len,
+ attn_metadata.block_tables)
+ elif attn_type == AttentionType.ENCODER_DECODER:
+ # Enc/dec cross-attention KVs match encoder sequence length;
+ # cross-attention utilizes special "cross" block tables
+ return (attn_metadata.encoder_seq_lens_tensor,
+ attn_metadata.max_encoder_seq_len,
+ attn_metadata.cross_block_tables)
+ elif attn_type == AttentionType.ENCODER:
+ # No block tables associated with encoder attention
+ return (attn_metadata.encoder_seq_lens_tensor,
+ attn_metadata.max_encoder_seq_len, None)
+ else:
+ raise AttributeError(f"Invalid attention type {str(attn_type)}")
+
+
+def get_num_prefill_decode_query_kv_tokens(
+ attn_metadata,
+ attn_type: AttentionType,
+) -> Tuple[int, int, int]:
+ """
+ Calculate the number of prefill and decode tokens for query, key/value
+ based on the attention metadata and the specified attention type.
+
+ Args:
+ attn_metadata (AttentionMetadata): Backend attention metadata object.
+ attn_type (AttentionType): The type of attention being used.
+ Returns:
+ Tuple[int, int, int]: A tuple containing three integers:
+ - The number of prefill query tokens.
+ - The number of prefill key/value tokens.
+ - The number of decode query tokens.
+
+ Raises:
+ AssertionError: If the number of encoder tokens in `attn_metadata`
+ is `None` when required for the calculations.
+ """
+ num_prefill_query_tokens = 0
+ num_decode_query_tokens = 0
+ num_prefill_kv_tokens = 0
+ if attn_type == AttentionType.ENCODER:
+ # Encoder attention is only invoked during prefill phase.
+ # The same input serves as both query and key.
+ assert attn_metadata.num_encoder_tokens is not None
+ num_prefill_query_tokens = attn_metadata.num_encoder_tokens
+ num_prefill_kv_tokens = attn_metadata.num_encoder_tokens
+ num_decode_query_tokens = 0
+ elif attn_type == AttentionType.ENCODER_DECODER:
+ assert attn_metadata.num_encoder_tokens is not None
+ num_prefill_query_tokens = attn_metadata.num_prefill_tokens
+ # The key is the encoder/cross-attention.
+ num_prefill_kv_tokens = attn_metadata.num_encoder_tokens
+ num_decode_query_tokens = attn_metadata.num_decode_tokens
+ else: # attn_type == AttentionType.DECODER or
+ # attn_type == AttentionType.ENCODER_ONLY
+ num_prefill_query_tokens = attn_metadata.num_prefill_tokens
+ num_prefill_kv_tokens = attn_metadata.num_prefill_tokens
+ num_decode_query_tokens = attn_metadata.num_decode_tokens
+
+ return (num_prefill_query_tokens, num_prefill_kv_tokens,
+ num_decode_query_tokens)
diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py
index 21877f2dded0e..4725413baade7 100644
--- a/vllm/attention/backends/xformers.py
+++ b/vllm/attention/backends/xformers.py
@@ -11,8 +11,10 @@
from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
AttentionMetadata, AttentionType)
-from vllm.attention.backends.utils import (CommonAttentionState,
- CommonMetadataBuilder)
+from vllm.attention.backends.utils import (
+ CommonAttentionState, CommonMetadataBuilder,
+ get_num_prefill_decode_query_kv_tokens, get_seq_len_block_table_args,
+ is_all_cross_attn_metadata_set, is_all_encoder_attn_metadata_set)
from vllm.attention.ops.paged_attn import (PagedAttention,
PagedAttentionMetadata)
from vllm.logger import init_logger
@@ -135,6 +137,11 @@ class XFormersMetadata(AttentionMetadata, PagedAttentionMetadata):
# Encoder sequence lengths representation
encoder_seq_lens: Optional[List[int]] = None
encoder_seq_lens_tensor: Optional[torch.Tensor] = None
+ # FIXME: It is for flash attn.
+ # (batch_size + 1,). The cumulative sequence lengths of the sequences in
+ # the batch, used to index into sequence. E.g., if the sequence length is
+ # [4, 6], it is [0, 4, 10].
+ encoder_seq_start_loc: Optional[torch.Tensor] = None
# Maximum sequence length among encoder sequences
max_encoder_seq_len: Optional[int] = None
@@ -162,9 +169,7 @@ def is_all_encoder_attn_metadata_set(self):
'''
All attention metadata required for encoder attention is set.
'''
- return ((self.encoder_seq_lens is not None)
- and (self.encoder_seq_lens_tensor is not None)
- and (self.max_encoder_seq_len is not None))
+ return is_all_encoder_attn_metadata_set(self)
@property
def is_all_cross_attn_metadata_set(self):
@@ -173,9 +178,7 @@ def is_all_cross_attn_metadata_set(self):
Superset of encoder attention required metadata.
'''
- return (self.is_all_encoder_attn_metadata_set
- and (self.cross_slot_mapping is not None)
- and (self.cross_block_tables is not None))
+ return is_all_cross_attn_metadata_set(self)
@property
def prefill_metadata(self) -> Optional["XFormersMetadata"]:
@@ -329,64 +332,6 @@ def _set_attn_bias(
raise AttributeError(f"Invalid attention type {str(attn_type)}")
-def _get_seq_len_block_table_args(
- attn_metadata: XFormersMetadata,
- is_prompt: bool,
- attn_type: AttentionType,
-) -> tuple:
- '''
- The particular choice of sequence-length- and block-table-related
- attributes which should be extracted from attn_metadata is dependent
- on the type of attention operation.
-
- Decoder attn -> select entirely decoder self-attention-related fields
- Encoder/decoder cross-attn -> select encoder sequence lengths &
- cross-attn block-tables fields
- Encoder attn -> select encoder sequence lengths fields & no block tables
-
- Arguments:
-
- * attn_metadata: Attention metadata structure associated with attention op
- * is_prompt: True if prefill, False otherwise
- * attn_type: encoder attention, decoder self-attention,
- encoder/decoder cross-attention
-
- Returns:
-
- * Appropriate sequence-lengths tensor
- * Appropriate max sequence-length scalar
- * Appropriate block tables (or None)
- '''
-
- if attn_type == AttentionType.DECODER:
- # Decoder self-attention
- # Choose max_seq_len based on whether we are in prompt_run
- if is_prompt:
- max_seq_len = attn_metadata.max_prefill_seq_len
- else:
- max_seq_len = attn_metadata.max_decode_seq_len
- return (attn_metadata.seq_lens_tensor, max_seq_len,
- attn_metadata.block_tables)
- elif attn_type == AttentionType.ENCODER_DECODER:
- # Enc/dec cross-attention KVs match encoder sequence length;
- # cross-attention utilizes special "cross" block tables
- return (attn_metadata.encoder_seq_lens_tensor,
- attn_metadata.max_encoder_seq_len,
- attn_metadata.cross_block_tables)
- elif attn_type == AttentionType.ENCODER:
- # No block tables associated with encoder attention
- return (attn_metadata.encoder_seq_lens_tensor,
- attn_metadata.max_encoder_seq_len, None)
- elif attn_type == AttentionType.ENCODER_ONLY:
- assert is_prompt, "Should not have decode for encoder only model."
-
- # No block tables associated with encoder attention
- return (attn_metadata.seq_lens_tensor,
- attn_metadata.max_prefill_seq_len, None)
- else:
- raise AttributeError(f"Invalid attention type {str(attn_type)}")
-
-
class XFormersMetadataBuilder(CommonMetadataBuilder[XFormersMetadata]):
_metadata_cls = XFormersMetadata
@@ -574,45 +519,21 @@ def forward(
updated_slot_mapping,
self.kv_cache_dtype,
k_scale, v_scale)
-
- if attn_type == AttentionType.ENCODER:
- # Encoder attention - chunked prefill is not applicable;
- # derive token-count from query shape & and treat them
- # as 100% prefill tokens
- assert attn_metadata.num_encoder_tokens is not None
- num_prefill_tokens = attn_metadata.num_encoder_tokens
- num_encoder_tokens = attn_metadata.num_encoder_tokens
- num_decode_tokens = 0
- elif attn_type == AttentionType.DECODER:
- # Decoder self-attention supports chunked prefill.
- num_prefill_tokens = attn_metadata.num_prefill_tokens
- num_encoder_tokens = attn_metadata.num_prefill_tokens
- num_decode_tokens = attn_metadata.num_decode_tokens
- # Only enforce this shape-constraint for decoder
- # self-attention
- assert key.shape[0] == num_prefill_tokens + num_decode_tokens
- assert value.shape[0] == num_prefill_tokens + num_decode_tokens
- else: # attn_type == AttentionType.ENCODER_DECODER
- # Encoder/decoder cross-attention requires no chunked
- # prefill (100% prefill or 100% decode tokens, no mix)
- num_prefill_tokens = attn_metadata.num_prefill_tokens
- if attn_metadata.num_encoder_tokens is not None:
- num_encoder_tokens = attn_metadata.num_encoder_tokens
- else:
- num_encoder_tokens = attn_metadata.num_prefill_tokens
- num_decode_tokens = attn_metadata.num_decode_tokens
+ (num_prefill_query_tokens, num_prefill_kv_tokens,
+ num_decode_query_tokens) = \
+ get_num_prefill_decode_query_kv_tokens(attn_metadata, attn_type)
output = torch.empty_like(query)
# Query for decode. KV is not needed because it is already cached.
- decode_query = query[num_prefill_tokens:]
+ decode_query = query[num_prefill_query_tokens:]
# QKV for prefill.
- query = query[:num_prefill_tokens]
+ query = query[:num_prefill_query_tokens]
if key is not None and value is not None:
- key = key[:num_encoder_tokens]
- value = value[:num_encoder_tokens]
+ key = key[:num_prefill_kv_tokens]
+ value = value[:num_prefill_kv_tokens]
- assert query.shape[0] == num_prefill_tokens
- assert decode_query.shape[0] == num_decode_tokens
+ assert query.shape[0] == num_prefill_query_tokens
+ assert decode_query.shape[0] == num_decode_query_tokens
if prefill_meta := attn_metadata.prefill_metadata:
# Prompt run.
@@ -622,8 +543,8 @@ def forward(
# prefix.
out = self._run_memory_efficient_xformers_forward(
query, key, value, prefill_meta, attn_type=attn_type)
- assert out.shape == output[:num_prefill_tokens].shape
- output[:num_prefill_tokens] = out
+ assert out.shape == output[:num_prefill_query_tokens].shape
+ output[:num_prefill_query_tokens] = out
else:
assert attn_type != AttentionType.ENCODER_ONLY, (
"Encoder-only models should not have prefix attention.")
@@ -652,8 +573,8 @@ def forward(
k_scale,
v_scale,
)
- assert output[:num_prefill_tokens].shape == out.shape
- output[:num_prefill_tokens] = out
+ assert output[:num_prefill_query_tokens].shape == out.shape
+ output[:num_prefill_query_tokens] = out
if decode_meta := attn_metadata.decode_metadata:
assert attn_type != AttentionType.ENCODER_ONLY, (
@@ -663,9 +584,9 @@ def forward(
seq_lens_arg,
max_seq_len_arg,
block_tables_arg,
- ) = _get_seq_len_block_table_args(decode_meta, False, attn_type)
+ ) = get_seq_len_block_table_args(decode_meta, False, attn_type)
- output[num_prefill_tokens:] = PagedAttention.forward_decode(
+ output[num_prefill_query_tokens:] = PagedAttention.forward_decode(
decode_query,
key_cache,
value_cache,
diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py
index 376b3136f0fb8..8a59cf41a689e 100644
--- a/vllm/attention/selector.py
+++ b/vllm/attention/selector.py
@@ -98,7 +98,6 @@ def get_attn_backend(
is_blocksparse: bool = False,
) -> Type[AttentionBackend]:
"""Selects which attention backend to use and lazily imports it."""
-
if is_blocksparse:
logger.info("Using BlocksparseFlashAttention backend.")
from vllm.attention.backends.blocksparse_attn import (
@@ -108,6 +107,7 @@ def get_attn_backend(
backend = which_attn_to_use(head_size, dtype, kv_cache_dtype, block_size,
is_attention_free)
if backend == _Backend.FLASH_ATTN:
+ logger.info("Using Flash Attention backend.")
from vllm.attention.backends.flash_attn import ( # noqa: F401
FlashAttentionBackend)
return FlashAttentionBackend
diff --git a/vllm/model_executor/models/bart.py b/vllm/model_executor/models/bart.py
index cbdacf779b089..0543ca978b7dd 100644
--- a/vllm/model_executor/models/bart.py
+++ b/vllm/model_executor/models/bart.py
@@ -624,8 +624,6 @@ def forward(self, input_ids: torch.Tensor, positions: torch.Tensor,
Decoder output torch.Tensor
"""
# retrieve input_ids and inputs_embeds
-
- input_ids = input_ids.view(-1, input_ids.shape[-1])
inputs_embeds = self.embed_tokens(input_ids)
embed_pos = self.embed_positions(
diff --git a/vllm/utils.py b/vllm/utils.py
index 5488719cc99b0..1041120a24b3f 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -80,8 +80,8 @@
"currently supported with encoder/"
"decoder models.")
-STR_NOT_IMPL_ENC_DEC_BACKEND = ("XFormers is the only backend "
- "currently supported with encoder/"
+STR_NOT_IMPL_ENC_DEC_BACKEND = ("XFormers and Flash-Attention are the only "
+ "backends currently supported with encoder/"
"decoder models.")
STR_NOT_IMPL_ENC_DEC_PROMPT_ADAPTER = ("Prompt adapters are not "
diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py
index a4b665d71f28a..2ea314f8608ee 100644
--- a/vllm/worker/enc_dec_model_runner.py
+++ b/vllm/worker/enc_dec_model_runner.py
@@ -19,6 +19,7 @@
from vllm.logger import init_logger
from vllm.model_executor import SamplingMetadata
from vllm.model_executor.layers.sampler import SamplerOutput
+from vllm.model_executor.model_loader.utils import get_architecture_class_name
from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalInputs,
MultiModalRegistry)
from vllm.sampling_params import SamplingParams
@@ -36,6 +37,11 @@
logger = init_logger(__name__)
+# The Mllama model contains PagedAttention-specific logic, so it can
+# currently only be run with the XFORMERS backend.
+# TODO: Make the Mllama model work with the Flash Attention backend.
+_XFORMERS_ONLY_ENCODER_DECODER_ARCHS = ["MllamaForConditionalGeneration"]
+
@dataclasses.dataclass(frozen=True)
class EncoderDecoderModelInput(ModelInputForGPUWithSamplingMetadata):
@@ -101,9 +107,7 @@ def __init__(
models) but these arguments are present here for compatibility with
the base-class constructor.
'''
-
- self._maybe_force_supported_attention_backend()
-
+ self._maybe_force_supported_attention_backend(model_config)
super().__init__(
model_config,
parallel_config,
@@ -119,7 +123,12 @@ def __init__(
# Crash for unsupported encoder/scenarios
assert_enc_dec_mr_supported_scenario(self)
- def _maybe_force_supported_attention_backend(self):
+ def _is_xformers_only_encoder_decoder_model(self,
+ model: ModelConfig) -> bool:
+ return get_architecture_class_name(
+ model) in _XFORMERS_ONLY_ENCODER_DECODER_ARCHS
+
+ def _maybe_force_supported_attention_backend(self, model: ModelConfig):
'''
Force vLLM to use the XFormers attention backend,
which is currently the only supported option.
@@ -135,22 +144,26 @@ def raise_backend_err():
is_forced_by_global = maybe_global_forced_backend is not None
is_forced_by_env_var = maybe_env_var_forced_backend is not None
- if not (is_forced_by_global or is_forced_by_env_var):
+ if not (is_forced_by_global or is_forced_by_env_var) \
+ and self._is_xformers_only_encoder_decoder_model(model):
# The user has not already specified an attention backend
# override
- logger.info("EncoderDecoderModelRunner requires "
- "XFormers backend; overriding backend "
- "auto-selection and forcing XFormers.")
+ logger.info(
+ "Encoder-Decoder Model Architecture %s requires XFormers "
+ "backend; overriding backend auto-selection and "
+ "forcing XFormers.", get_architecture_class_name(model))
global_force_attn_backend(_Backend.XFORMERS)
elif is_forced_by_global:
# Backend override enforced by global variable takes
# precedence over vLLM backend environment variable.
- if maybe_global_forced_backend != _Backend.XFORMERS:
+ if maybe_global_forced_backend not in\
+ [_Backend.XFORMERS, _Backend.FLASH_ATTN]:
raise_backend_err()
elif is_forced_by_env_var:
# Backend override enforced by vLLM backend
# environment variable
- if maybe_env_var_forced_backend != _Backend.XFORMERS:
+ if maybe_env_var_forced_backend not in\
+ [_Backend.XFORMERS, _Backend.FLASH_ATTN]:
raise_backend_err()
def _list_to_int32_tensor(
@@ -532,6 +545,7 @@ def _prepare_encoder_model_input_tensors(
attn_metadata.encoder_seq_lens,
attn_metadata.encoder_seq_lens_tensor,
attn_metadata.max_encoder_seq_len,
+ attn_metadata.encoder_seq_start_loc,
attn_metadata.cross_slot_mapping,
attn_metadata.cross_block_tables,
) = (
@@ -539,6 +553,7 @@ def _prepare_encoder_model_input_tensors(
encoder_seq_lens,
encoder_seq_lens_tensor,
max_encoder_seq_len,
+ encoder_seq_start_loc,
cross_slot_mapping_tensor,
cross_block_tables,
)
From af7380d83b0d67726a4a6c7a86766423bed6a7a8 Mon Sep 17 00:00:00 2001
From: youkaichao
Date: Fri, 1 Nov 2024 23:35:47 -0700
Subject: [PATCH 090/113] [torch.compile] fix cpu broken code (#9947)
Signed-off-by: youkaichao